Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/fid.c | 4
-rw-r--r--  fs/9p/vfs_addr.c | 4
-rw-r--r--  fs/9p/vfs_dentry.c | 8
-rw-r--r--  fs/9p/vfs_dir.c | 4
-rw-r--r--  fs/9p/vfs_file.c | 8
-rw-r--r--  fs/9p/vfs_inode.c | 32
-rw-r--r--  fs/9p/vfs_inode_dotl.c | 8
-rw-r--r--  fs/Kconfig | 6
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/adfs/adfs.h | 1
-rw-r--r--  fs/adfs/dir.c | 2
-rw-r--r--  fs/adfs/dir_fplus.c | 9
-rw-r--r--  fs/affs/amigaffs.c | 2
-rw-r--r--  fs/affs/file.c | 39
-rw-r--r--  fs/affs/inode.c | 3
-rw-r--r--  fs/affs/super.c | 6
-rw-r--r--  fs/afs/dir.c | 5
-rw-r--r--  fs/afs/vlocation.c | 1
-rw-r--r--  fs/aio.c | 184
-rw-r--r--  fs/autofs4/autofs_i.h | 69
-rw-r--r--  fs/autofs4/dev-ioctl.c | 2
-rw-r--r--  fs/autofs4/expire.c | 217
-rw-r--r--  fs/autofs4/root.c | 72
-rw-r--r--  fs/bad_inode.c | 7
-rw-r--r--  fs/befs/btree.c | 53
-rw-r--r--  fs/befs/linuxvfs.c | 8
-rw-r--r--  fs/bfs/bfs.h | 1
-rw-r--r--  fs/bfs/dir.c | 4
-rw-r--r--  fs/bfs/inode.c | 8
-rw-r--r--  fs/binfmt_aout.c | 25
-rw-r--r--  fs/binfmt_elf.c | 25
-rw-r--r--  fs/binfmt_elf_fdpic.c | 24
-rw-r--r--  fs/binfmt_misc.c | 23
-rw-r--r--  fs/block_dev.c | 41
-rw-r--r--  fs/btrfs/async-thread.c | 55
-rw-r--r--  fs/btrfs/async-thread.h | 29
-rw-r--r--  fs/btrfs/backref.c | 137
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 46
-rw-r--r--  fs/btrfs/check-integrity.c | 18
-rw-r--r--  fs/btrfs/compression.c | 21
-rw-r--r--  fs/btrfs/ctree.c | 126
-rw-r--r--  fs/btrfs/ctree.h | 97
-rw-r--r--  fs/btrfs/delayed-inode.c | 12
-rw-r--r--  fs/btrfs/dev-replace.c | 82
-rw-r--r--  fs/btrfs/dir-item.c | 12
-rw-r--r--  fs/btrfs/disk-io.c | 378
-rw-r--r--  fs/btrfs/disk-io.h | 16
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 567
-rw-r--r--  fs/btrfs/extent_io.c | 498
-rw-r--r--  fs/btrfs/extent_io.h | 60
-rw-r--r--  fs/btrfs/file-item.c | 32
-rw-r--r--  fs/btrfs/file.c | 182
-rw-r--r--  fs/btrfs/free-space-cache.c | 157
-rw-r--r--  fs/btrfs/hash.c | 20
-rw-r--r--  fs/btrfs/inode-item.c | 12
-rw-r--r--  fs/btrfs/inode-map.c | 68
-rw-r--r--  fs/btrfs/inode.c | 977
-rw-r--r--  fs/btrfs/ioctl.c | 100
-rw-r--r--  fs/btrfs/lzo.c | 3
-rw-r--r--  fs/btrfs/ordered-data.c | 124
-rw-r--r--  fs/btrfs/ordered-data.h | 5
-rw-r--r--  fs/btrfs/orphan.c | 4
-rw-r--r--  fs/btrfs/print-tree.c | 3
-rw-r--r--  fs/btrfs/qgroup.c | 200
-rw-r--r--  fs/btrfs/qgroup.h | 1
-rw-r--r--  fs/btrfs/raid56.c | 17
-rw-r--r--  fs/btrfs/reada.c | 5
-rw-r--r--  fs/btrfs/relocation.c | 142
-rw-r--r--  fs/btrfs/scrub.c | 92
-rw-r--r--  fs/btrfs/send.c | 47
-rw-r--r--  fs/btrfs/super.c | 197
-rw-r--r--  fs/btrfs/sysfs.c | 43
-rw-r--r--  fs/btrfs/sysfs.h | 16
-rw-r--r--  fs/btrfs/tests/free-space-tests.c | 516
-rw-r--r--  fs/btrfs/transaction.c | 85
-rw-r--r--  fs/btrfs/transaction.h | 3
-rw-r--r--  fs/btrfs/tree-log.c | 334
-rw-r--r--  fs/btrfs/tree-log.h | 4
-rw-r--r--  fs/btrfs/ulist.h | 15
-rw-r--r--  fs/btrfs/uuid-tree.c | 1
-rw-r--r--  fs/btrfs/volumes.c | 722
-rw-r--r--  fs/btrfs/volumes.h | 166
-rw-r--r--  fs/btrfs/xattr.c | 4
-rw-r--r--  fs/btrfs/zlib.c | 141
-rw-r--r--  fs/buffer.c | 54
-rw-r--r--  fs/cachefiles/bind.c | 8
-rw-r--r--  fs/cachefiles/daemon.c | 30
-rw-r--r--  fs/cachefiles/interface.c | 33
-rw-r--r--  fs/cachefiles/internal.h | 2
-rw-r--r--  fs/cachefiles/main.c | 2
-rw-r--r--  fs/cachefiles/namei.c | 19
-rw-r--r--  fs/cachefiles/rdwr.c | 54
-rw-r--r--  fs/cachefiles/xattr.c | 10
-rw-r--r--  fs/ceph/acl.c | 113
-rw-r--r--  fs/ceph/addr.c | 9
-rw-r--r--  fs/ceph/caps.c | 39
-rw-r--r--  fs/ceph/debugfs.c | 46
-rw-r--r--  fs/ceph/dir.c | 42
-rw-r--r--  fs/ceph/file.c | 57
-rw-r--r--  fs/ceph/inode.c | 16
-rw-r--r--  fs/ceph/ioctl.c | 6
-rw-r--r--  fs/ceph/mds_client.c | 152
-rw-r--r--  fs/ceph/mds_client.h | 6
-rw-r--r--  fs/ceph/super.c | 2
-rw-r--r--  fs/ceph/super.h | 27
-rw-r--r--  fs/ceph/xattr.c | 85
-rw-r--r--  fs/cifs/Kconfig | 35
-rw-r--r--  fs/cifs/cifs_debug.c | 2
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 6
-rw-r--r--  fs/cifs/cifs_fs_sb.h | 1
-rw-r--r--  fs/cifs/cifs_spnego.c | 1
-rw-r--r--  fs/cifs/cifs_unicode.c | 203
-rw-r--r--  fs/cifs/cifs_unicode.h | 31
-rw-r--r--  fs/cifs/cifsacl.c | 1
-rw-r--r--  fs/cifs/cifsencrypt.c | 2
-rw-r--r--  fs/cifs/cifsfs.c | 33
-rw-r--r--  fs/cifs/cifsfs.h | 6
-rw-r--r--  fs/cifs/cifsglob.h | 35
-rw-r--r--  fs/cifs/cifspdu.h | 23
-rw-r--r--  fs/cifs/cifsproto.h | 4
-rw-r--r--  fs/cifs/cifssmb.c | 135
-rw-r--r--  fs/cifs/connect.c | 70
-rw-r--r--  fs/cifs/dir.c | 44
-rw-r--r--  fs/cifs/file.c | 903
-rw-r--r--  fs/cifs/inode.c | 100
-rw-r--r--  fs/cifs/link.c | 157
-rw-r--r--  fs/cifs/misc.c | 22
-rw-r--r--  fs/cifs/netmisc.c | 20
-rw-r--r--  fs/cifs/readdir.c | 20
-rw-r--r--  fs/cifs/sess.c | 1182
-rw-r--r--  fs/cifs/smb1ops.c | 50
-rw-r--r--  fs/cifs/smb2file.c | 2
-rw-r--r--  fs/cifs/smb2inode.c | 4
-rw-r--r--  fs/cifs/smb2maperror.c | 8
-rw-r--r--  fs/cifs/smb2misc.c | 35
-rw-r--r--  fs/cifs/smb2ops.c | 272
-rw-r--r--  fs/cifs/smb2pdu.c | 119
-rw-r--r--  fs/cifs/smb2pdu.h | 8
-rw-r--r--  fs/cifs/smb2proto.h | 10
-rw-r--r--  fs/cifs/smb2transport.c | 5
-rw-r--r--  fs/cifs/smbencrypt.c | 1
-rw-r--r--  fs/cifs/smbfsctl.h | 2
-rw-r--r--  fs/cifs/transport.c | 25
-rw-r--r--  fs/cifs/xattr.c | 32
-rw-r--r--  fs/coda/cache.c | 2
-rw-r--r--  fs/coda/coda_linux.c | 2
-rw-r--r--  fs/coda/dir.c | 3
-rw-r--r--  fs/coda/file.c | 2
-rw-r--r--  fs/coda/inode.c | 4
-rw-r--r--  fs/coda/pioctl.c | 2
-rw-r--r--  fs/coda/psdev.c | 2
-rw-r--r--  fs/coda/upcall.c | 2
-rw-r--r--  fs/compat.c | 28
-rw-r--r--  fs/compat_ioctl.c | 2
-rw-r--r--  fs/coredump.c | 8
-rw-r--r--  fs/cramfs/inode.c | 45
-rw-r--r--  fs/cramfs/uncompress.c | 10
-rw-r--r--  fs/dcache.c | 528
-rw-r--r--  fs/debugfs/file.c | 2
-rw-r--r--  fs/debugfs/inode.c | 39
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/dlm/debug_fs.c | 15
-rw-r--r--  fs/dlm/plock.c | 8
-rw-r--r--  fs/dlm/rcom.c | 2
-rw-r--r--  fs/ecryptfs/file.c | 6
-rw-r--r--  fs/ecryptfs/inode.c | 25
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/messaging.c | 3
-rw-r--r--  fs/efs/namei.c | 11
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/exec.c | 27
-rw-r--r--  fs/exofs/ore_raid.c | 2
-rw-r--r--  fs/ext2/super.c | 8
-rw-r--r--  fs/ext3/ext3.h | 12
-rw-r--r--  fs/ext3/super.c | 25
-rw-r--r--  fs/ext4/balloc.c | 1
-rw-r--r--  fs/ext4/dir.c | 25
-rw-r--r--  fs/ext4/ext4.h | 32
-rw-r--r--  fs/ext4/extents.c | 102
-rw-r--r--  fs/ext4/file.c | 4
-rw-r--r--  fs/ext4/indirect.c | 281
-rw-r--r--  fs/ext4/inline.c | 18
-rw-r--r--  fs/ext4/inode.c | 174
-rw-r--r--  fs/ext4/mballoc.c | 48
-rw-r--r--  fs/ext4/migrate.c | 7
-rw-r--r--  fs/ext4/move_extent.c | 3
-rw-r--r--  fs/ext4/namei.c | 59
-rw-r--r--  fs/ext4/resize.c | 2
-rw-r--r--  fs/ext4/super.c | 95
-rw-r--r--  fs/f2fs/Kconfig | 4
-rw-r--r--  fs/f2fs/acl.c | 6
-rw-r--r--  fs/f2fs/checkpoint.c | 347
-rw-r--r--  fs/f2fs/data.c | 139
-rw-r--r--  fs/f2fs/debug.c | 43
-rw-r--r--  fs/f2fs/dir.c | 104
-rw-r--r--  fs/f2fs/f2fs.h | 233
-rw-r--r--  fs/f2fs/file.c | 342
-rw-r--r--  fs/f2fs/gc.c | 39
-rw-r--r--  fs/f2fs/gc.h | 2
-rw-r--r--  fs/f2fs/hash.c | 9
-rw-r--r--  fs/f2fs/inline.c | 59
-rw-r--r--  fs/f2fs/inode.c | 49
-rw-r--r--  fs/f2fs/namei.c | 280
-rw-r--r--  fs/f2fs/node.c | 501
-rw-r--r--  fs/f2fs/node.h | 63
-rw-r--r--  fs/f2fs/recovery.c | 237
-rw-r--r--  fs/f2fs/segment.c | 605
-rw-r--r--  fs/f2fs/segment.h | 166
-rw-r--r--  fs/f2fs/super.c | 98
-rw-r--r--  fs/f2fs/xattr.c | 10
-rw-r--r--  fs/fat/misc.c | 2
-rw-r--r--  fs/fcntl.c | 26
-rw-r--r--  fs/file.c | 3
-rw-r--r--  fs/file_table.c | 14
-rw-r--r--  fs/fs-writeback.c | 3
-rw-r--r--  fs/fs_pin.c | 78
-rw-r--r--  fs/fscache/cookie.c | 7
-rw-r--r--  fs/fscache/internal.h | 2
-rw-r--r--  fs/fscache/main.c | 22
-rw-r--r--  fs/fscache/object-list.c | 16
-rw-r--r--  fs/fscache/object.c | 1
-rw-r--r--  fs/fscache/page.c | 29
-rw-r--r--  fs/fuse/dir.c | 14
-rw-r--r--  fs/fuse/file.c | 5
-rw-r--r--  fs/gfs2/bmap.c | 9
-rw-r--r--  fs/gfs2/dentry.c | 3
-rw-r--r--  fs/gfs2/dir.c | 9
-rw-r--r--  fs/gfs2/dir.h | 1
-rw-r--r--  fs/gfs2/file.c | 37
-rw-r--r--  fs/gfs2/glock.c | 29
-rw-r--r--  fs/gfs2/glops.c | 2
-rw-r--r--  fs/gfs2/incore.h | 7
-rw-r--r--  fs/gfs2/inode.c | 21
-rw-r--r--  fs/gfs2/lock_dlm.c | 8
-rw-r--r--  fs/gfs2/ops_fstype.c | 11
-rw-r--r--  fs/gfs2/recovery.c | 8
-rw-r--r--  fs/gfs2/rgrp.c | 30
-rw-r--r--  fs/gfs2/rgrp.h | 1
-rw-r--r--  fs/gfs2/super.c | 28
-rw-r--r--  fs/gfs2/trans.c | 2
-rw-r--r--  fs/hfs/hfs_fs.h | 2
-rw-r--r--  fs/hostfs/hostfs.h | 1
-rw-r--r--  fs/hostfs/hostfs_kern.c | 30
-rw-r--r--  fs/hostfs/hostfs_user.c | 28
-rw-r--r--  fs/hpfs/dnode.c | 17
-rw-r--r--  fs/inode.c | 8
-rw-r--r--  fs/internal.h | 14
-rw-r--r--  fs/isofs/compress.c | 4
-rw-r--r--  fs/isofs/inode.c | 17
-rw-r--r--  fs/isofs/isofs.h | 23
-rw-r--r--  fs/isofs/rock.c | 39
-rw-r--r--  fs/jbd2/commit.c | 21
-rw-r--r--  fs/jbd2/journal.c | 56
-rw-r--r--  fs/jbd2/recovery.c | 33
-rw-r--r--  fs/jbd2/revoke.c | 6
-rw-r--r--  fs/jbd2/transaction.c | 10
-rw-r--r--  fs/jffs2/acl.c | 3
-rw-r--r--  fs/jffs2/compr_zlib.c | 7
-rw-r--r--  fs/jffs2/jffs2_fs_sb.h | 2
-rw-r--r--  fs/jffs2/wbuf.c | 17
-rw-r--r--  fs/jffs2/xattr.c | 3
-rw-r--r--  fs/jfs/jfs_logmgr.c | 2
-rw-r--r--  fs/jfs/jfs_txnmgr.c | 3
-rw-r--r--  fs/jfs/super.c | 2
-rw-r--r--  fs/kernfs/dir.c | 11
-rw-r--r--  fs/kernfs/file.c | 2
-rw-r--r--  fs/libfs.c | 18
-rw-r--r--  fs/lockd/Makefile | 3
-rw-r--r--  fs/lockd/mon.c | 10
-rw-r--r--  fs/lockd/netns.h | 1
-rw-r--r--  fs/lockd/procfs.c | 92
-rw-r--r--  fs/lockd/procfs.h | 28
-rw-r--r--  fs/lockd/svc.c | 20
-rw-r--r--  fs/lockd/svclock.c | 68
-rw-r--r--  fs/locks.c | 542
-rw-r--r--  fs/logfs/readwrite.c | 15
-rw-r--r--  fs/minix/bitmap.c | 2
-rw-r--r--  fs/minix/inode.c | 4
-rw-r--r--  fs/mount.h | 27
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 157
-rw-r--r--  fs/namespace.c | 346
-rw-r--r--  fs/ncpfs/dir.c | 9
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 14
-rw-r--r--  fs/nfs/Makefile | 1
-rw-r--r--  fs/nfs/blocklayout/Makefile | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 1435
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 213
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdev.c | 384
-rw-r--r--  fs/nfs/blocklayout/blocklayoutdm.c | 108
-rw-r--r--  fs/nfs/blocklayout/dev.c | 363
-rw-r--r--  fs/nfs/blocklayout/extent_tree.c | 602
-rw-r--r--  fs/nfs/blocklayout/extents.c | 908
-rw-r--r--  fs/nfs/blocklayout/rpc_pipefs.c | 284
-rw-r--r--  fs/nfs/callback.c | 16
-rw-r--r--  fs/nfs/callback_proc.c | 23
-rw-r--r--  fs/nfs/client.c | 111
-rw-r--r--  fs/nfs/delegation.c | 34
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 215
-rw-r--r--  fs/nfs/direct.c | 59
-rw-r--r--  fs/nfs/file.c | 73
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 331
-rw-r--r--  fs/nfs/filelayout/filelayout.h | 7
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 112
-rw-r--r--  fs/nfs/fscache-index.c | 3
-rw-r--r--  fs/nfs/getroot.c | 2
-rw-r--r--  fs/nfs/idmap.c | 12
-rw-r--r--  fs/nfs/inode.c | 24
-rw-r--r--  fs/nfs/internal.h | 30
-rw-r--r--  fs/nfs/netns.h | 3
-rw-r--r--  fs/nfs/nfs3_fs.h | 34
-rw-r--r--  fs/nfs/nfs3acl.c | 8
-rw-r--r--  fs/nfs/nfs3client.c | 1
-rw-r--r--  fs/nfs/nfs3proc.c | 22
-rw-r--r--  fs/nfs/nfs3super.c | 1
-rw-r--r--  fs/nfs/nfs42.h | 14
-rw-r--r--  fs/nfs/nfs42proc.c | 69
-rw-r--r--  fs/nfs/nfs42xdr.c | 98
-rw-r--r--  fs/nfs/nfs4_fs.h | 24
-rw-r--r--  fs/nfs/nfs4client.c | 43
-rw-r--r--  fs/nfs/nfs4file.c | 27
-rw-r--r--  fs/nfs/nfs4proc.c | 430
-rw-r--r--  fs/nfs/nfs4renewd.c | 12
-rw-r--r--  fs/nfs/nfs4state.c | 67
-rw-r--r--  fs/nfs/nfs4trace.h | 28
-rw-r--r--  fs/nfs/nfs4xdr.c | 188
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 137
-rw-r--r--  fs/nfs/objlayout/objlayout.c | 151
-rw-r--r--  fs/nfs/objlayout/objlayout.h | 13
-rw-r--r--  fs/nfs/pagelist.c | 338
-rw-r--r--  fs/nfs/pnfs.c | 306
-rw-r--r--  fs/nfs/pnfs.h | 92
-rw-r--r--  fs/nfs/pnfs_dev.c | 150
-rw-r--r--  fs/nfs/proc.c | 27
-rw-r--r--  fs/nfs/read.c | 54
-rw-r--r--  fs/nfs/super.c | 23
-rw-r--r--  fs/nfs/write.c | 251
-rw-r--r--  fs/nfs_common/Makefile | 3
-rw-r--r--  fs/nfs_common/grace.c (renamed from fs/lockd/grace.c) | 68
-rw-r--r--  fs/nfs_common/nfsacl.c | 5
-rw-r--r--  fs/nfsd/Kconfig | 4
-rw-r--r--  fs/nfsd/acl.h | 2
-rw-r--r--  fs/nfsd/auth.c | 2
-rw-r--r--  fs/nfsd/cache.h | 1
-rw-r--r--  fs/nfsd/export.c | 7
-rw-r--r--  fs/nfsd/export.h | 3
-rw-r--r--  fs/nfsd/fault_inject.c | 138
-rw-r--r--  fs/nfsd/netns.h | 23
-rw-r--r--  fs/nfsd/nfs2acl.c | 8
-rw-r--r--  fs/nfsd/nfs3acl.c | 8
-rw-r--r--  fs/nfsd/nfs3proc.c | 22
-rw-r--r--  fs/nfsd/nfs3xdr.c | 30
-rw-r--r--  fs/nfsd/nfs4acl.c | 39
-rw-r--r--  fs/nfsd/nfs4callback.c | 142
-rw-r--r--  fs/nfsd/nfs4idmap.c | 20
-rw-r--r--  fs/nfsd/nfs4proc.c | 102
-rw-r--r--  fs/nfsd/nfs4recover.c | 206
-rw-r--r--  fs/nfsd/nfs4state.c | 3239
-rw-r--r--  fs/nfsd/nfs4xdr.c | 220
-rw-r--r--  fs/nfsd/nfscache.c | 205
-rw-r--r--  fs/nfsd/nfsctl.c | 96
-rw-r--r--  fs/nfsd/nfsd.h | 2
-rw-r--r--  fs/nfsd/nfsfh.c | 18
-rw-r--r--  fs/nfsd/nfsfh.h | 15
-rw-r--r--  fs/nfsd/nfsproc.c | 13
-rw-r--r--  fs/nfsd/nfssvc.c | 21
-rw-r--r--  fs/nfsd/nfsxdr.c | 14
-rw-r--r--  fs/nfsd/state.h | 248
-rw-r--r--  fs/nfsd/vfs.c | 85
-rw-r--r--  fs/nfsd/vfs.h | 8
-rw-r--r--  fs/nfsd/xdr4.h | 44
-rw-r--r--  fs/nilfs2/Makefile | 2
-rw-r--r--  fs/nilfs2/file.c | 8
-rw-r--r--  fs/nilfs2/inode.c | 20
-rw-r--r--  fs/nilfs2/ioctl.c | 8
-rw-r--r--  fs/nilfs2/nilfs.h | 22
-rw-r--r--  fs/nilfs2/segment.c | 7
-rw-r--r--  fs/nilfs2/super.c | 17
-rw-r--r--  fs/nilfs2/sysfs.c | 1137
-rw-r--r--  fs/nilfs2/sysfs.h | 176
-rw-r--r--  fs/nilfs2/the_nilfs.c | 17
-rw-r--r--  fs/nilfs2/the_nilfs.h | 42
-rw-r--r--  fs/notify/dnotify/dnotify.c | 8
-rw-r--r--  fs/notify/fanotify/fanotify.c | 11
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 16
-rw-r--r--  fs/notify/fdinfo.c | 4
-rw-r--r--  fs/notify/fsnotify.h | 3
-rw-r--r--  fs/notify/group.c | 2
-rw-r--r--  fs/notify/inode_mark.c | 2
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 8
-rw-r--r--  fs/notify/inotify/inotify_user.c | 4
-rw-r--r--  fs/notify/notification.c | 37
-rw-r--r--  fs/notify/vfsmount_mark.c | 2
-rw-r--r--  fs/ntfs/Makefile | 2
-rw-r--r--  fs/ntfs/aops.c | 163
-rw-r--r--  fs/ntfs/debug.c | 2
-rw-r--r--  fs/ntfs/file.c | 8
-rw-r--r--  fs/ntfs/inode.c | 19
-rw-r--r--  fs/ntfs/ntfs.h | 8
-rw-r--r--  fs/ntfs/super.c | 2
-rw-r--r--  fs/ocfs2/alloc.c | 15
-rw-r--r--  fs/ocfs2/aops.c | 15
-rw-r--r--  fs/ocfs2/cluster/heartbeat.c | 21
-rw-r--r--  fs/ocfs2/cluster/heartbeat.h | 1
-rw-r--r--  fs/ocfs2/cluster/masklog.c | 6
-rw-r--r--  fs/ocfs2/cluster/netdebug.c | 78
-rw-r--r--  fs/ocfs2/cluster/quorum.c | 13
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 88
-rw-r--r--  fs/ocfs2/cluster/tcp.h | 1
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 39
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 49
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 29
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 7
-rw-r--r--  fs/ocfs2/dlmglue.c | 23
-rw-r--r--  fs/ocfs2/file.c | 49
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/ioctl.c | 129
-rw-r--r--  fs/ocfs2/move_extents.c | 4
-rw-r--r--  fs/ocfs2/quota.h | 5
-rw-r--r--  fs/ocfs2/quota_global.c | 4
-rw-r--r--  fs/ocfs2/quota_local.c | 33
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/slot_map.c | 2
-rw-r--r--  fs/ocfs2/stack_user.c | 2
-rw-r--r--  fs/ocfs2/super.c | 31
-rw-r--r--  fs/omfs/inode.c | 12
-rw-r--r--  fs/omfs/omfs_fs.h | 1
-rw-r--r--  fs/pnode.c | 1
-rw-r--r--  fs/proc/Makefile | 1
-rw-r--r--  fs/proc/array.c | 18
-rw-r--r--  fs/proc/base.c | 284
-rw-r--r--  fs/proc/fd.c | 4
-rw-r--r--  fs/proc/generic.c | 32
-rw-r--r--  fs/proc/inode.c | 7
-rw-r--r--  fs/proc/internal.h | 20
-rw-r--r--  fs/proc/kcore.c | 6
-rw-r--r--  fs/proc/meminfo.c | 2
-rw-r--r--  fs/proc/page.c | 3
-rw-r--r--  fs/proc/proc_net.c | 4
-rw-r--r--  fs/proc/proc_sysctl.c | 2
-rw-r--r--  fs/proc/proc_tty.c | 4
-rw-r--r--  fs/proc/root.c | 7
-rw-r--r--  fs/proc/task_mmu.c | 385
-rw-r--r--  fs/proc/task_nommu.c | 88
-rw-r--r--  fs/proc/thread_self.c | 85
-rw-r--r--  fs/proc/vmcore.c | 82
-rw-r--r--  fs/proc_namespace.c | 8
-rw-r--r--  fs/pstore/inode.c | 4
-rw-r--r--  fs/pstore/ram_core.c | 2
-rw-r--r--  fs/qnx6/Makefile | 1
-rw-r--r--  fs/qnx6/dir.c | 26
-rw-r--r--  fs/qnx6/inode.c | 99
-rw-r--r--  fs/qnx6/namei.c | 6
-rw-r--r--  fs/qnx6/qnx6.h | 12
-rw-r--r--  fs/qnx6/super_mmi.c | 22
-rw-r--r--  fs/quota/dquot.c | 182
-rw-r--r--  fs/quota/kqid.c | 2
-rw-r--r--  fs/quota/netlink.c | 3
-rw-r--r--  fs/quota/quota.c | 6
-rw-r--r--  fs/ramfs/file-nommu.c | 2
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/reiserfs/dir.c | 2
-rw-r--r--  fs/reiserfs/do_balan.c | 113
-rw-r--r--  fs/reiserfs/file.c | 2
-rw-r--r--  fs/reiserfs/ibalance.c | 2
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/reiserfs/ioctl.c | 2
-rw-r--r--  fs/reiserfs/item_ops.c | 4
-rw-r--r--  fs/reiserfs/journal.c | 30
-rw-r--r--  fs/reiserfs/lbalance.c | 7
-rw-r--r--  fs/reiserfs/prints.c | 4
-rw-r--r--  fs/reiserfs/procfs.c | 2
-rw-r--r--  fs/reiserfs/reiserfs.h | 14
-rw-r--r--  fs/reiserfs/stree.c | 2
-rw-r--r--  fs/reiserfs/super.c | 31
-rw-r--r--  fs/reiserfs/xattr.c | 22
-rw-r--r--  fs/reiserfs/xattr.h | 1
-rw-r--r--  fs/reiserfs/xattr_acl.c | 2
-rw-r--r--  fs/reiserfs/xattr_security.c | 2
-rw-r--r--  fs/reiserfs/xattr_trusted.c | 2
-rw-r--r--  fs/reiserfs/xattr_user.c | 2
-rw-r--r--  fs/romfs/super.c | 23
-rw-r--r--  fs/squashfs/file_direct.c | 2
-rw-r--r--  fs/squashfs/super.c | 5
-rw-r--r--  fs/stack.c | 2
-rw-r--r--  fs/super.c | 25
-rw-r--r--  fs/sync.c | 2
-rw-r--r--  fs/timerfd.c | 76
-rw-r--r--  fs/ubifs/commit.c | 10
-rw-r--r--  fs/ubifs/debug.c | 6
-rw-r--r--  fs/ubifs/io.c | 2
-rw-r--r--  fs/ubifs/journal.c | 7
-rw-r--r--  fs/ubifs/log.c | 31
-rw-r--r--  fs/ubifs/lpt.c | 5
-rw-r--r--  fs/ubifs/lpt_commit.c | 7
-rw-r--r--  fs/ubifs/master.c | 7
-rw-r--r--  fs/ubifs/orphan.c | 1
-rw-r--r--  fs/ubifs/recovery.c | 5
-rw-r--r--  fs/ubifs/sb.c | 4
-rw-r--r--  fs/ubifs/scan.c | 14
-rw-r--r--  fs/ubifs/super.c | 19
-rw-r--r--  fs/ubifs/tnc.c | 1
-rw-r--r--  fs/ubifs/tnc_commit.c | 1
-rw-r--r--  fs/ubifs/ubifs.h | 4
-rw-r--r--  fs/udf/file.c | 31
-rw-r--r--  fs/udf/ialloc.c | 28
-rw-r--r--  fs/udf/inode.c | 165
-rw-r--r--  fs/udf/lowlevel.c | 2
-rw-r--r--  fs/udf/namei.c | 156
-rw-r--r--  fs/udf/super.c | 73
-rw-r--r--  fs/udf/symlink.c | 2
-rw-r--r--  fs/udf/udfdecl.h | 16
-rw-r--r--  fs/udf/udftime.c | 2
-rw-r--r--  fs/udf/unicode.c | 9
-rw-r--r--  fs/ufs/Makefile | 1
-rw-r--r--  fs/ufs/balloc.c | 3
-rw-r--r--  fs/ufs/ialloc.c | 6
-rw-r--r--  fs/ufs/inode.c | 39
-rw-r--r--  fs/ufs/namei.c | 18
-rw-r--r--  fs/ufs/super.c | 304
-rw-r--r--  fs/ufs/ufs.h | 10
-rw-r--r--  fs/xattr.c | 116
-rw-r--r--  fs/xfs/Kconfig | 1
-rw-r--r--  fs/xfs/Makefile | 71
-rw-r--r--  fs/xfs/kmem.c | 1
-rw-r--r--  fs/xfs/libxfs/xfs_ag.h (renamed from fs/xfs/xfs_ag.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c (renamed from fs/xfs/xfs_alloc.c) | 24
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.h (renamed from fs/xfs/xfs_alloc.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.c (renamed from fs/xfs/xfs_alloc_btree.c) | 6
-rw-r--r--  fs/xfs/libxfs/xfs_alloc_btree.h (renamed from fs/xfs/xfs_alloc_btree.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_attr.c (renamed from fs/xfs/xfs_attr.c) | 92
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c (renamed from fs/xfs/xfs_attr_leaf.c) | 78
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.h (renamed from fs/xfs/xfs_attr_leaf.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.c (renamed from fs/xfs/xfs_attr_remote.c) | 22
-rw-r--r--  fs/xfs/libxfs/xfs_attr_remote.h (renamed from fs/xfs/xfs_attr_remote.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_attr_sf.h (renamed from fs/xfs/xfs_attr_sf.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_bit.h (renamed from fs/xfs/xfs_bit.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c (renamed from fs/xfs/xfs_bmap.c) | 427
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h (renamed from fs/xfs/xfs_bmap.h) | 7
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.c (renamed from fs/xfs/xfs_bmap_btree.c) | 99
-rw-r--r--  fs/xfs/libxfs/xfs_bmap_btree.h (renamed from fs/xfs/xfs_bmap_btree.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_btree.c (renamed from fs/xfs/xfs_btree.c) | 46
-rw-r--r--  fs/xfs/libxfs/xfs_btree.h (renamed from fs/xfs/xfs_btree.h) | 2
-rw-r--r--  fs/xfs/libxfs/xfs_cksum.h (renamed from fs/xfs/xfs_cksum.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.c (renamed from fs/xfs/xfs_da_btree.c) | 115
-rw-r--r--  fs/xfs/libxfs/xfs_da_btree.h (renamed from fs/xfs/xfs_da_btree.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.c (renamed from fs/xfs/xfs_da_format.c) | 1
-rw-r--r--  fs/xfs/libxfs/xfs_da_format.h (renamed from fs/xfs/xfs_da_format.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_dinode.h (renamed from fs/xfs/xfs_dinode.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.c (renamed from fs/xfs/xfs_dir2.c) | 89
-rw-r--r--  fs/xfs/libxfs/xfs_dir2.h (renamed from fs/xfs/xfs_dir2.h) | 2
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_block.c (renamed from fs/xfs/xfs_dir2_block.c) | 18
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_data.c (renamed from fs/xfs/xfs_dir2_data.c) | 10
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_leaf.c (renamed from fs/xfs/xfs_dir2_leaf.c) | 24
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_node.c (renamed from fs/xfs/xfs_dir2_node.c) | 40
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_priv.h (renamed from fs/xfs/xfs_dir2_priv.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_dir2_sf.c (renamed from fs/xfs/xfs_dir2_sf.c) | 75
-rw-r--r--  fs/xfs/libxfs/xfs_dquot_buf.c (renamed from fs/xfs/xfs_dquot_buf.c) | 6
-rw-r--r--  fs/xfs/libxfs/xfs_format.h (renamed from fs/xfs/xfs_format.h) | 14
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.c (renamed from fs/xfs/xfs_ialloc.c) | 41
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc.h (renamed from fs/xfs/xfs_ialloc.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.c (renamed from fs/xfs/xfs_ialloc_btree.c) | 6
-rw-r--r--  fs/xfs/libxfs/xfs_ialloc_btree.h (renamed from fs/xfs/xfs_ialloc_btree.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.c (renamed from fs/xfs/xfs_inode_buf.c) | 10
-rw-r--r--  fs/xfs/libxfs/xfs_inode_buf.h (renamed from fs/xfs/xfs_inode_buf.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.c (renamed from fs/xfs/xfs_inode_fork.c) | 36
-rw-r--r--  fs/xfs/libxfs/xfs_inode_fork.h (renamed from fs/xfs/xfs_inode_fork.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_inum.h (renamed from fs/xfs/xfs_inum.h) | 4
-rw-r--r--  fs/xfs/libxfs/xfs_log_format.h (renamed from fs/xfs/xfs_log_format.h) | 4
-rw-r--r--  fs/xfs/libxfs/xfs_log_recover.h (renamed from fs/xfs/xfs_log_recover.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_log_rlimit.c (renamed from fs/xfs/xfs_log_rlimit.c) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_quota_defs.h (renamed from fs/xfs/xfs_quota_defs.h) | 2
-rw-r--r--  fs/xfs/libxfs/xfs_rtbitmap.c (renamed from fs/xfs/xfs_rtbitmap.c) | 49
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c (renamed from fs/xfs/xfs_sb.c) | 63
-rw-r--r--  fs/xfs/libxfs/xfs_sb.h (renamed from fs/xfs/xfs_sb.h) | 8
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h (renamed from fs/xfs/xfs_shared.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c (renamed from fs/xfs/xfs_symlink_remote.c) | 6
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c (renamed from fs/xfs/xfs_trans_resv.c) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h (renamed from fs/xfs/xfs_trans_resv.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.h (renamed from fs/xfs/xfs_trans_space.h) | 0
-rw-r--r--  fs/xfs/time.h | 36
-rw-r--r--  fs/xfs/xfs_acl.c | 8
-rw-r--r--  fs/xfs/xfs_aops.c | 102
-rw-r--r--  fs/xfs/xfs_attr_inactive.c | 22
-rw-r--r--  fs/xfs/xfs_attr_list.c | 38
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 284
-rw-r--r--  fs/xfs/xfs_buf.c | 387
-rw-r--r--  fs/xfs/xfs_buf.h | 17
-rw-r--r--  fs/xfs/xfs_buf_item.c | 14
-rw-r--r--  fs/xfs/xfs_dir2_readdir.c | 4
-rw-r--r--  fs/xfs/xfs_discard.c | 18
-rw-r--r--  fs/xfs/xfs_dquot.c | 41
-rw-r--r--  fs/xfs/xfs_dquot.h | 15
-rw-r--r--  fs/xfs/xfs_error.c | 25
-rw-r--r--  fs/xfs/xfs_error.h | 13
-rw-r--r--  fs/xfs/xfs_export.c | 10
-rw-r--r--  fs/xfs/xfs_extfree_item.c | 2
-rw-r--r--  fs/xfs/xfs_file.c | 272
-rw-r--r--  fs/xfs/xfs_filestream.c | 4
-rw-r--r--  fs/xfs/xfs_fs.h | 7
-rw-r--r--  fs/xfs/xfs_fsops.c | 51
-rw-r--r--  fs/xfs/xfs_globals.c | 4
-rw-r--r--  fs/xfs/xfs_icache.c | 147
-rw-r--r--  fs/xfs/xfs_icache.h | 13
-rw-r--r--  fs/xfs/xfs_inode.c | 102
-rw-r--r--  fs/xfs/xfs_inode.h | 12
-rw-r--r--  fs/xfs/xfs_inode_item.c | 4
-rw-r--r--  fs/xfs/xfs_ioctl.c | 294
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 113
-rw-r--r--  fs/xfs/xfs_ioctl32.h | 3
-rw-r--r--  fs/xfs/xfs_iomap.c | 58
-rw-r--r--  fs/xfs/xfs_iops.c | 102
-rw-r--r--  fs/xfs/xfs_itable.c | 580
-rw-r--r--  fs/xfs/xfs_itable.h | 23
-rw-r--r--  fs/xfs/xfs_linux.h | 33
-rw-r--r--  fs/xfs/xfs_log.c | 128
-rw-r--r--  fs/xfs/xfs_log_cil.c | 55
-rw-r--r--  fs/xfs/xfs_log_priv.h | 2
-rw-r--r--  fs/xfs/xfs_log_recover.c | 959
-rw-r--r--  fs/xfs/xfs_mount.c | 141
-rw-r--r--  fs/xfs/xfs_mount.h | 1
-rw-r--r--  fs/xfs/xfs_mru_cache.c | 17
-rw-r--r--  fs/xfs/xfs_qm.c | 230
-rw-r--r--  fs/xfs/xfs_qm.h | 1
-rw-r--r--  fs/xfs/xfs_qm_bhv.c | 2
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 46
-rw-r--r--  fs/xfs/xfs_quotaops.c | 20
-rw-r--r--  fs/xfs/xfs_rtalloc.c | 105
-rw-r--r--  fs/xfs/xfs_rtalloc.h | 6
-rw-r--r--  fs/xfs/xfs_super.c | 167
-rw-r--r--  fs/xfs/xfs_super.h | 15
-rw-r--r--  fs/xfs/xfs_symlink.c | 38
-rw-r--r--  fs/xfs/xfs_sysctl.h | 5
-rw-r--r--  fs/xfs/xfs_sysfs.c | 239
-rw-r--r--  fs/xfs/xfs_sysfs.h | 60
-rw-r--r--  fs/xfs/xfs_trace.h | 3
-rw-r--r--  fs/xfs/xfs_trans.c | 10
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 4
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 53
-rw-r--r--  fs/xfs/xfs_trans_dquot.c | 4
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 2
-rw-r--r--  fs/xfs/xfs_types.h | 29
-rw-r--r--  fs/xfs/xfs_vnode.h | 46
-rw-r--r--  fs/xfs/xfs_xattr.c | 6
647 files changed, 26811 insertions(+), 17377 deletions(-)
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index d51ec9fafcc8..47db55aee7f2 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -65,8 +65,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
 {
 	struct p9_fid *fid, *ret;
 
-	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n",
-		 dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid),
+	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p) uid %d any %d\n",
+		 dentry, dentry, from_kuid(&init_user_ns, uid),
 		 any);
 	ret = NULL;
 	/* we'll recheck under lock if there's anything to look in */
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index cc1cfae726b3..eb14e055ea83 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -266,8 +266,8 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 	 * Now that we do caching with cache mode enabled, We need
 	 * to support direct IO
 	 */
-	p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n",
-		 iocb->ki_filp->f_path.dentry->d_name.name,
+	p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%pD) off/no(%lld/%lu) EINVAL\n",
+		 iocb->ki_filp,
 		 (long long)pos, iter->nr_segs);
 
 	return -EINVAL;
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index b03dd23feda8..a345b2d659cc 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -49,8 +49,8 @@
  */
 static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 {
-	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-		 dentry->d_name.name, dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+		 dentry, dentry);
 
 	/* Don't cache negative dentries */
 	if (!dentry->d_inode)
@@ -67,8 +67,8 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry)
 static void v9fs_dentry_release(struct dentry *dentry)
 {
 	struct hlist_node *p, *n;
-	p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
-		 dentry->d_name.name, dentry);
+	p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n",
+		 dentry, dentry);
 	hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata)
 		p9_client_clunk(hlist_entry(p, struct p9_fid, dlist));
 	dentry->d_fsdata = NULL;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0b3bfa303dda..4f1151088ebe 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -116,7 +116,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 	int reclen = 0;
 	struct p9_rdir *rdir;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
 	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
@@ -172,7 +172,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 	struct p9_rdir *rdir;
 	struct p9_dirent curdirent;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %pD\n", file);
 	fid = file->private_data;
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 520c11c2dcca..5594505e6e73 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -301,8 +301,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 	struct inode *inode = file_inode(filp);
 	int ret = -ENOLCK;
 
-	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
-		 filp, cmd, fl, filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+		 filp, cmd, fl, filp);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
@@ -337,8 +337,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 	struct inode *inode = file_inode(filp);
 	int ret = -ENOLCK;
 
-	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n",
-		 filp, cmd, fl, filp->f_path.dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
+		 filp, cmd, fl, filp);
 
 	/* No mandatory locks */
 	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 7fa4f7a7653d..296482fc77a9 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -648,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	struct p9_fid *dfid, *ofid, *fid;
 	struct inode *inode;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
 
 	err = 0;
 	ofid = NULL;
@@ -755,7 +755,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 	struct p9_fid *fid;
 	struct v9fs_session_info *v9ses;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
 	err = 0;
 	v9ses = v9fs_inode2v9ses(dir);
 	perm = unixmode2p9mode(v9ses, mode | S_IFDIR);
@@ -791,8 +791,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	char *name;
 
-	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n",
-		 dir, dentry->d_name.name, dentry, flags);
+	p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n",
+		 dir, dentry, dentry, flags);
 
 	if (dentry->d_name.len > NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -1239,7 +1239,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	struct p9_fid *fid;
 	struct p9_wstat *st;
 
-	p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, " %pd\n", dentry);
 	retval = -EPERM;
 	v9ses = v9fs_dentry2v9ses(dentry);
 	fid = v9fs_fid_lookup(dentry);
@@ -1262,8 +1262,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
 	retval = min(strlen(st->extension)+1, (size_t)buflen);
 	memcpy(buffer, st->extension, retval);
 
-	p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n",
-		 dentry->d_name.name, st->extension, buflen, buffer);
+	p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n",
+		 dentry, st->extension, buflen, buffer);
 
 done:
 	p9stat_free(st);
@@ -1283,7 +1283,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 	int len = 0;
 	char *link = __getname();
 
-	p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
 	if (!link)
 		link = ERR_PTR(-ENOMEM);
@@ -1314,8 +1314,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
 {
 	char *s = nd_get_link(nd);
 
-	p9_debug(P9_DEBUG_VFS, " %s %s\n",
-		 dentry->d_name.name, IS_ERR(s) ? "<error>" : s);
+	p9_debug(P9_DEBUG_VFS, " %pd %s\n",
+		 dentry, IS_ERR(s) ? "<error>" : s);
 	if (!IS_ERR(s))
 		__putname(s);
 }
@@ -1364,8 +1364,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
 static int
 v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 {
-	p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
-		 dir->i_ino, dentry->d_name.name, symname);
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n",
+		 dir->i_ino, dentry, symname);
 
 	return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname);
 }
@@ -1386,8 +1386,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
 	char *name;
 	struct p9_fid *oldfid;
 
-	p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n",
-		 dir->i_ino, dentry->d_name.name, old_dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n",
+		 dir->i_ino, dentry, old_dentry);
 
 	oldfid = v9fs_fid_clone(old_dentry);
 	if (IS_ERR(oldfid))
@@ -1428,8 +1428,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
 	char *name;
 	u32 perm;
 
-	p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n",
-		 dir->i_ino, dentry->d_name.name, mode,
+	p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
+		 dir->i_ino, dentry, mode,
 		 MAJOR(rdev), MINOR(rdev));
 
 	if (!new_valid_dev(rdev))
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1fa85aae24df..02b64f4e576a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -393,7 +393,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
 	struct dentry *dir_dentry;
 	struct posix_acl *dacl = NULL, *pacl = NULL;
 
-	p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
 	err = 0;
 	v9ses = v9fs_inode2v9ses(dir);
 
@@ -767,8 +767,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
 	struct p9_fid *dfid, *oldfid;
 	struct v9fs_session_info *v9ses;
 
-	p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
-		 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n",
+		 dir->i_ino, old_dentry, dentry);
 
 	v9ses = v9fs_inode2v9ses(dir);
 	dir_dentry = dentry->d_parent;
@@ -917,7 +917,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
 	char *link = __getname();
 	char *target;
 
-	p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name);
+	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
 	if (!link) {
 		link = ERR_PTR(-ENOMEM);
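Every fs/9p hunk above makes the same substitution: instead of dereferencing dentry->d_name.name (or filp->f_path.dentry->d_name.name) by hand, the format string now uses the kernel's vsprintf() extensions %pd, which prints a dentry's name, and %pD, which prints the name of a file's dentry. A minimal sketch of the idiom follows; the function and variables are hypothetical, not part of the patch:

static void demo_debug(struct dentry *dentry, struct file *filp)
{
	/* old style: raw d_name.name dereference, racy across renames */
	pr_debug("name %s\n", dentry->d_name.name);

	/* new style: hand vsprintf() the object and let it print the name */
	pr_debug("dentry: %pd\n", dentry);	/* dentry name */
	pr_debug("file: %pD\n", filp);		/* file's dentry name */
}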
diff --git a/fs/Kconfig b/fs/Kconfig
index 312393f32948..db5dc1598716 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -233,9 +233,13 @@ if NETWORK_FILESYSTEMS
 source "fs/nfs/Kconfig"
 source "fs/nfsd/Kconfig"
 
+config GRACE_PERIOD
+	tristate
+
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select GRACE_PERIOD
 
 config LOCKD_V4
 	bool
@@ -249,7 +253,7 @@ config NFS_ACL_SUPPORT
 
 config NFS_COMMON
 	bool
-	depends on NFSD || NFS_FS
+	depends on NFSD || NFS_FS || LOCKD
 	default y
 
 source "net/sunrpc/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 4030cbfbc9af..90c88529892b 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
 	attr.o bad_inode.o file.o filesystems.o namespace.o \
 	seq_file.o xattr.o libfs.o fs-writeback.o \
 	pnode.o splice.o sync.o utimes.o \
-	stack.o fs_struct.o statfs.o
+	stack.o fs_struct.o statfs.o fs_pin.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y	+= buffer.o block_dev.o direct-io.o mpage.o
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index c770337c4b45..24575d9d882d 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -153,6 +153,7 @@ extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigne
 extern unsigned int adfs_map_free(struct super_block *sb);
 
 /* Misc */
+__printf(3, 4)
 void __adfs_error(struct super_block *sb, const char *function,
 		  const char *fmt, ...);
 #define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt)
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 0d138c0de293..51c279a29845 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -138,7 +138,7 @@ adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_inf
 		goto out;
 
 	if (ADFS_I(inode)->parent_id != dir.parent_id) {
-		adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n",
+		adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n",
 			   ADFS_I(inode)->parent_id, dir.parent_id);
 		ret = -EIO;
 		goto free_out;
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index d9e3bee4e653..f2ba88ab4aed 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -55,10 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
 	}
 
 	size >>= sb->s_blocksize_bits;
-	if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
+	if (size > ARRAY_SIZE(dir->bh)) {
 		/* this directory is too big for fixed bh set, must allocate */
 		struct buffer_head **bh_fplus =
-			kzalloc(size * sizeof(struct buffer_head *),
+			kcalloc(size, sizeof(struct buffer_head *),
 				GFP_KERNEL);
 		if (!bh_fplus) {
 			adfs_error(sb, "not enough memory for"
@@ -79,9 +79,8 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
 
 		dir->bh_fplus[blk] = sb_bread(sb, block);
 		if (!dir->bh_fplus[blk]) {
-			adfs_error(sb, "dir object %X failed read for"
-				" offset %d, mapped block %X",
-				id, blk, block);
+			adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX",
+				   id, blk, block);
 			goto out;
 		}
 
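The allocation change in the first hunk is worth spelling out: kcalloc(n, size, flags) both zeroes the memory and returns NULL if n * size would overflow, whereas the open-coded kzalloc(size * sizeof(...)) silently wraps the multiplication and under-allocates; ARRAY_SIZE(dir->bh) likewise replaces the equivalent sizeof arithmetic. A hedged sketch of the failure mode, with a deliberately absurd count and illustrative names only:

	/* would wrap: n * sizeof(*bh) overflows size_t */
	size_t n = SIZE_MAX / sizeof(struct buffer_head *) + 2;
	struct buffer_head **bh;

	bh = kcalloc(n, sizeof(*bh), GFP_KERNEL);
	/* bh == NULL here: kcalloc detects the overflow and refuses,
	 * instead of handing back a too-small buffer */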
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 406b29836b19..abc853968fed 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -10,8 +10,6 @@
 
 #include "affs.h"
 
-extern struct timezone sys_tz;
-
 static char ErrorBuffer[256];
 
 /*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a7fe57d2cd9a..1ed590aafecf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -584,11 +584,14 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 	bh->b_state &= ~(1UL << BH_New);
 	mark_buffer_dirty_inode(bh, inode);
 	if (prev_bh) {
-		u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
-		if (tmp)
-			affs_warning(sb, "extent_file_ofs", "next block already set for %d (%d)", bidx, tmp);
+		u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+		if (tmp_next)
+			affs_warning(sb, "extent_file_ofs",
+				     "next block already set for %d (%d)",
+				     bidx, tmp_next);
 		AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
-		affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+		affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
 		mark_buffer_dirty_inode(prev_bh, inode);
 		affs_brelse(prev_bh);
 	}
@@ -727,11 +730,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 	AFFS_DATA_HEAD(bh)->next = 0;
 	bh->b_state &= ~(1UL << BH_New);
 	if (prev_bh) {
-		u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
-		if (tmp)
-			affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp);
+		u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+		if (tmp_next)
+			affs_warning(sb, "commit_write_ofs",
+				     "next block already set for %d (%d)",
+				     bidx, tmp_next);
 		AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
-		affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+		affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
 		mark_buffer_dirty_inode(prev_bh, inode);
 	}
 }
@@ -758,11 +764,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		AFFS_DATA_HEAD(bh)->next = 0;
 		bh->b_state &= ~(1UL << BH_New);
 		if (prev_bh) {
-			u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
-			if (tmp)
-				affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp);
+			u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
+
+			if (tmp_next)
+				affs_warning(sb, "commit_write_ofs",
+					     "next block already set for %d (%d)",
+					     bidx, tmp_next);
 			AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
-			affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp);
+			affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
 			mark_buffer_dirty_inode(prev_bh, inode);
 		}
 	} else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp)
@@ -842,12 +851,12 @@ affs_truncate(struct inode *inode)
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
 	void *fsdata;
-	loff_t size = inode->i_size;
+	loff_t isize = inode->i_size;
 	int res;
 
-	res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata);
+	res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata);
 	if (!res)
-		res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
+		res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata);
 	else
 		inode->i_size = AFFS_I(inode)->mmu_private;
 	mark_inode_dirty(inode);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index bec2d1a0c91c..e217c511459b 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -14,13 +14,11 @@
 #include "affs.h"
 
 extern const struct inode_operations affs_symlink_inode_operations;
-extern struct timezone sys_tz;
 
 struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	struct buffer_head *bh;
-	struct affs_head *head;
 	struct affs_tail *tail;
 	struct inode *inode;
 	u32 block;
@@ -49,7 +47,6 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		goto bad_inode;
 	}
 
-	head = AFFS_HEAD(bh);
 	tail = AFFS_TAIL(sb, bh);
 	prot = be32_to_cpu(tail->protect);
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 51f1a95bff73..f754ab68a840 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -20,8 +20,6 @@
 #include <linux/writeback.h>
 #include "affs.h"
 
-extern struct timezone sys_tz;
-
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
@@ -308,7 +306,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 	u32 chksum;
 	int num_bm;
 	int i, j;
-	s32 key;
 	kuid_t uid;
 	kgid_t gid;
 	int reserved;
@@ -367,7 +364,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 		i = j = blocksize;
 		size = size / (blocksize / 512);
 	}
-	for (blocksize = i, key = 0; blocksize <= j; blocksize <<= 1, size >>= 1) {
+	for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
 		sbi->s_root_block = root_block;
 		if (root_block < 0)
 			sbi->s_root_block = (reserved + size - 1) / 2;
@@ -399,7 +396,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 		    be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) {
 			sbi->s_hashsize = blocksize / 4 - 56;
 			sbi->s_root_block += num_bm;
-			key = 1;
 			goto got_root;
 		}
 		affs_brelse(root_bh);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 529300327f45..a1645b88fe8a 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -669,7 +669,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
 
 out_valid:
 	dentry->d_fsdata = dir_version;
-out_skip:
 	dput(parent);
 	key_put(key);
 	_leave(" = 1 [valid]");
@@ -682,10 +681,6 @@ not_found:
 	spin_unlock(&dentry->d_lock);
 
 out_bad:
-	/* don't unhash if we have submounts */
-	if (check_submounts_and_drop(dentry) != 0)
-		goto out_skip;
-
 	_debug("dropping dentry %s/%s",
 	       parent->d_name.name, dentry->d_name.name);
 	dput(parent);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index b6df2e83809f..52976785a32c 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
 			/* second+ BUSY - sleep a little bit */
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			schedule_timeout(1);
-			__set_current_state(TASK_RUNNING);
 		}
 		continue;
 	}
diff --git a/fs/aio.c b/fs/aio.c
index 1c9c5f0a9e2b..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -141,6 +141,7 @@ struct kioctx {
 
 	struct {
 		unsigned	tail;
+		unsigned	completed_events;
 		spinlock_t	completion_lock;
 	} ____cacheline_aligned_in_smp;
 
@@ -192,7 +193,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 	}
 
 	file->f_flags = O_RDWR;
-	file->private_data = ctx;
 	return file;
 }
 
@@ -202,7 +202,7 @@ static struct dentry *aio_mount(struct file_system_type *fs_type,
 	static const struct dentry_operations ops = {
 		.d_dname	= simple_dname,
 	};
-	return mount_pseudo(fs_type, "aio:", NULL, &ops, 0xa10a10a1);
+	return mount_pseudo(fs_type, "aio:", NULL, &ops, AIO_RING_MAGIC);
 }
 
 /* aio_setup
@@ -506,6 +506,8 @@ static void free_ioctx(struct work_struct *work)
 
 	aio_free_ring(ctx);
 	free_percpu(ctx->cpu);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -554,8 +556,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 	struct aio_ring *ring;
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
+	table = rcu_dereference_raw(mm->ioctx_table);
 
 	while (1) {
 		if (table)
@@ -563,7 +564,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 			if (!table->table[i]) {
 				ctx->id = i;
 				table->table[i] = ctx;
-				rcu_read_unlock();
 				spin_unlock(&mm->ioctx_lock);
 
 				/* While kioctx setup is in progress,
@@ -577,8 +577,6 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		}
 
 		new_nr = (table ? table->nr : 1) * 4;
-
-		rcu_read_unlock();
 		spin_unlock(&mm->ioctx_lock);
 
 		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
@@ -589,8 +587,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
 		table->nr = new_nr;
 
 		spin_lock(&mm->ioctx_lock);
-		rcu_read_lock();
-		old = rcu_dereference(mm->ioctx_table);
+		old = rcu_dereference_raw(mm->ioctx_table);
 
 		if (!old) {
 			rcu_assign_pointer(mm->ioctx_table, table);
@@ -664,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users))
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;
 
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
@@ -715,8 +712,8 @@ err_ctx:
 err:
 	mutex_unlock(&ctx->ring_lock);
 	free_percpu(ctx->cpu);
-	free_percpu(ctx->reqs.pcpu_count);
-	free_percpu(ctx->users.pcpu_count);
+	percpu_ref_exit(&ctx->reqs);
+	percpu_ref_exit(&ctx->users);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
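Both percpu_ref hunks above track one API change: percpu_ref_init() now takes flags and a GFP mask, and the internal percpu counters are released with percpu_ref_exit() rather than by freeing ->pcpu_count by hand. A hedged sketch of the resulting lifecycle; demo_release() and the calling context are hypothetical:

static void demo_release(struct percpu_ref *ref)
{
	/* runs once the last reference has been dropped */
}

static int demo(struct percpu_ref *ref)
{
	int err = percpu_ref_init(ref, demo_release, 0, GFP_KERNEL);

	if (err)
		return err;		/* typically -ENOMEM */
	/* ... take and drop references with percpu_ref_get()/put() ... */
	percpu_ref_exit(ref);		/* frees the percpu counters */
	return 0;
}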
@@ -737,12 +734,9 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 
 
 	spin_lock(&mm->ioctx_lock);
-	rcu_read_lock();
-	table = rcu_dereference(mm->ioctx_table);
-
+	table = rcu_dereference_raw(mm->ioctx_table);
 	WARN_ON(ctx != table->table[ctx->id]);
 	table->table[ctx->id] = NULL;
-	rcu_read_unlock();
 	spin_unlock(&mm->ioctx_lock);
 
 	/* percpu_ref_kill() will do the necessary call_rcu() */
@@ -791,40 +785,35 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
-	struct kioctx_table *table;
-	struct kioctx *ctx;
-	unsigned i = 0;
-
-	while (1) {
-		rcu_read_lock();
-		table = rcu_dereference(mm->ioctx_table);
-
-		do {
-			if (!table || i >= table->nr) {
-				rcu_read_unlock();
-				rcu_assign_pointer(mm->ioctx_table, NULL);
-				if (table)
-					kfree(table);
-				return;
-			}
+	struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+	int i;
 
-			ctx = table->table[i++];
-		} while (!ctx);
+	if (!table)
+		return;
 
-		rcu_read_unlock();
+	for (i = 0; i < table->nr; ++i) {
+		struct kioctx *ctx = table->table[i];
+		struct completion requests_done =
+			COMPLETION_INITIALIZER_ONSTACK(requests_done);
 
+		if (!ctx)
+			continue;
 		/*
-		 * We don't need to bother with munmap() here -
-		 * exit_mmap(mm) is coming and it'll unmap everything.
-		 * Since aio_free_ring() uses non-zero ->mmap_size
-		 * as indicator that it needs to unmap the area,
-		 * just set it to 0; aio_free_ring() is the only
-		 * place that uses ->mmap_size, so it's safe.
+		 * We don't need to bother with munmap() here - exit_mmap(mm)
+		 * is coming and it'll unmap everything. And we simply can't,
+		 * this is not necessarily our ->mm.
+		 * Since kill_ioctx() uses non-zero ->mmap_size as indicator
+		 * that it needs to unmap the area, just set it to 0.
		 */
 		ctx->mmap_size = 0;
+		kill_ioctx(mm, ctx, &requests_done);
 
-		kill_ioctx(mm, ctx, NULL);
+		/* Wait until all IO for the context are done. */
+		wait_for_completion(&requests_done);
 	}
+
+	RCU_INIT_POINTER(mm->ioctx_table, NULL);
+	kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
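The rewritten exit_aio() changes behaviour as well as locking: it now hands an on-stack completion to kill_ioctx() and blocks until each context's outstanding requests have drained. A minimal sketch of that wait pattern, assuming a hypothetical worker that signals the completion when finished:

	struct completion done = COMPLETION_INITIALIZER_ONSTACK(done);

	start_async_work(&done);	/* hypothetical; ends with complete(&done) */
	wait_for_completion(&done);	/* sleep until the worker signals */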
@@ -832,10 +821,8 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	struct kioctx_cpu *kcpu;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	kcpu->reqs_available += nr;
 
 	while (kcpu->reqs_available >= ctx->req_batch * 2) {
@@ -844,7 +831,6 @@ static void put_reqs_available(struct kioctx *ctx, unsigned nr)
 	}
 
 	local_irq_restore(flags);
-	preempt_enable();
 }
 
 static bool get_reqs_available(struct kioctx *ctx)
@@ -853,10 +839,8 @@ static bool get_reqs_available(struct kioctx *ctx)
 	bool ret = false;
 	unsigned long flags;
 
-	preempt_disable();
-	kcpu = this_cpu_ptr(ctx->cpu);
-
 	local_irq_save(flags);
+	kcpu = this_cpu_ptr(ctx->cpu);
 	if (!kcpu->reqs_available) {
 		int old, avail = atomic_read(&ctx->reqs_available);
 
@@ -876,10 +860,71 @@ static bool get_reqs_available(struct kioctx *ctx)
876 kcpu->reqs_available--; 860 kcpu->reqs_available--;
877out: 861out:
878 local_irq_restore(flags); 862 local_irq_restore(flags);
879 preempt_enable();
880 return ret; 863 return ret;
881} 864}
882 865
866/* refill_reqs_available
867 * Updates the reqs_available reference counts used for tracking the
868 * number of free slots in the completion ring. This can be called
869 * from aio_complete() (to optimistically update reqs_available) or
870 * from aio_get_req() (the we're out of events case). It must be
871 * called holding ctx->completion_lock.
872 */
873static void refill_reqs_available(struct kioctx *ctx, unsigned head,
874 unsigned tail)
875{
876 unsigned events_in_ring, completed;
877
878 /* Clamp head since userland can write to it. */
879 head %= ctx->nr_events;
880 if (head <= tail)
881 events_in_ring = tail - head;
882 else
883 events_in_ring = ctx->nr_events - (head - tail);
884
885 completed = ctx->completed_events;
886 if (events_in_ring < completed)
887 completed -= events_in_ring;
888 else
889 completed = 0;
890
891 if (!completed)
892 return;
893
894 ctx->completed_events -= completed;
895 put_reqs_available(ctx, completed);
896}
897
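
refill_reqs_available() derives the number of occupied ring slots from the head/tail pair and only reclaims completed events beyond that. The wrap-around arithmetic is easy to get wrong, so it can be checked in isolation (the values below are made up):

#include <stdio.h>

/* Slots currently occupied in a ring of nr_events entries. */
static unsigned events_in_ring(unsigned head, unsigned tail,
			       unsigned nr_events)
{
	head %= nr_events;	/* clamp: userland can scribble on head */
	if (head <= tail)
		return tail - head;
	return nr_events - (head - tail);
}

int main(void)
{
	/* empty, partially full, and wrapped cases */
	printf("%u\n", events_in_ring(0, 0, 128));	/* 0 */
	printf("%u\n", events_in_ring(5, 9, 128));	/* 4 */
	printf("%u\n", events_in_ring(120, 3, 128));	/* 11 */
	return 0;
}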
898/* user_refill_reqs_available
899 * Called to refill reqs_available when aio_get_req() encounters an
900 * out-of-space condition in the completion ring.
901 */
902static void user_refill_reqs_available(struct kioctx *ctx)
903{
904 spin_lock_irq(&ctx->completion_lock);
905 if (ctx->completed_events) {
906 struct aio_ring *ring;
907 unsigned head;
908
909 /* Access of ring->head may race with aio_read_events_ring()
910 * here, but that's okay since we may read either the old version
911 * or the new version, and either will be valid. The important
912 * part is that head cannot pass tail since we prevent
913 * aio_complete() from updating tail by holding
914 * ctx->completion_lock. Even if head is invalid, the check
915 * against ctx->completed_events below will make sure we do the
916 * safe/right thing.
917 */
918 ring = kmap_atomic(ctx->ring_pages[0]);
919 head = ring->head;
920 kunmap_atomic(ring);
921
922 refill_reqs_available(ctx, head, ctx->tail);
923 }
924
925 spin_unlock_irq(&ctx->completion_lock);
926}
927
883/* aio_get_req 928/* aio_get_req
884 * Allocate a slot for an aio request. 929 * Allocate a slot for an aio request.
885 * Returns NULL if no requests are free. 930 * Returns NULL if no requests are free.
@@ -888,8 +933,11 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
888{ 933{
889 struct kiocb *req; 934 struct kiocb *req;
890 935
891 if (!get_reqs_available(ctx)) 936 if (!get_reqs_available(ctx)) {
892 return NULL; 937 user_refill_reqs_available(ctx);
938 if (!get_reqs_available(ctx))
939 return NULL;
940 }
893 941
894 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); 942 req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
895 if (unlikely(!req)) 943 if (unlikely(!req))
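
aio_get_req() now has a slow path: when the fast-path cache is empty it harvests completed-but-unreturned slots via user_refill_reqs_available() and retries once before giving up. The shape of that two-phase acquisition, with invented stand-ins for the slot pools:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static unsigned cached_slots;		/* fast path */
static unsigned reclaimable_slots = 3;	/* completed but not yet returned */

static bool get_slot(void)
{
	if (!cached_slots)
		return false;
	cached_slots--;
	return true;
}

/* Model of user_refill_reqs_available(): harvest finished requests. */
static void refill_slots(void)
{
	cached_slots += reclaimable_slots;
	reclaimable_slots = 0;
}

static void *alloc_req(void)
{
	if (!get_slot()) {
		refill_slots();		/* slow path */
		if (!get_slot())
			return NULL;	/* genuinely out of slots */
	}
	return &cached_slots;		/* placeholder for a real object */
}

int main(void)
{
	printf("req=%p\n", alloc_req());	/* succeeds via the refill */
	return 0;
}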
@@ -948,8 +996,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
948 struct kioctx *ctx = iocb->ki_ctx; 996 struct kioctx *ctx = iocb->ki_ctx;
949 struct aio_ring *ring; 997 struct aio_ring *ring;
950 struct io_event *ev_page, *event; 998 struct io_event *ev_page, *event;
999 unsigned tail, pos, head;
951 unsigned long flags; 1000 unsigned long flags;
952 unsigned tail, pos;
953 1001
954 /* 1002 /*
955 * Special case handling for sync iocbs: 1003 * Special case handling for sync iocbs:
@@ -1010,10 +1058,14 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1010 ctx->tail = tail; 1058 ctx->tail = tail;
1011 1059
1012 ring = kmap_atomic(ctx->ring_pages[0]); 1060 ring = kmap_atomic(ctx->ring_pages[0]);
1061 head = ring->head;
1013 ring->tail = tail; 1062 ring->tail = tail;
1014 kunmap_atomic(ring); 1063 kunmap_atomic(ring);
1015 flush_dcache_page(ctx->ring_pages[0]); 1064 flush_dcache_page(ctx->ring_pages[0]);
1016 1065
1066 ctx->completed_events++;
1067 if (ctx->completed_events > 1)
1068 refill_reqs_available(ctx, head, tail);
1017 spin_unlock_irqrestore(&ctx->completion_lock, flags); 1069 spin_unlock_irqrestore(&ctx->completion_lock, flags);
1018 1070
1019 pr_debug("added to ring %p at [%u]\n", iocb, tail); 1071 pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1028,7 +1080,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1028 1080
1029 /* everything turned out well, dispose of the aiocb. */ 1081 /* everything turned out well, dispose of the aiocb. */
1030 kiocb_free(iocb); 1082 kiocb_free(iocb);
1031 put_reqs_available(ctx, 1);
1032 1083
1033 /* 1084 /*
1034 * We have to order our ring_info tail store above and test 1085 * We have to order our ring_info tail store above and test
@@ -1045,7 +1096,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1045} 1096}
1046EXPORT_SYMBOL(aio_complete); 1097EXPORT_SYMBOL(aio_complete);
1047 1098
1048/* aio_read_events 1099/* aio_read_events_ring
1049 * Pull an event off of the ioctx's event ring. Returns the number of 1100 * Pull an event off of the ioctx's event ring. Returns the number of
1050 * events fetched 1101 * events fetched
1051 */ 1102 */
@@ -1065,6 +1116,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
1065 tail = ring->tail; 1116 tail = ring->tail;
1066 kunmap_atomic(ring); 1117 kunmap_atomic(ring);
1067 1118
1119 /*
1120 * Ensure that once we've read the current tail pointer, that
1121 * we also see the events that were stored up to the tail.
1122 */
1123 smp_rmb();
1124
1068 pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); 1125 pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
1069 1126
1070 if (head == tail) 1127 if (head == tail)
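
The new smp_rmb() conceptually pairs with the producer side in aio_complete(), which stores the events before publishing the new tail; a reader that observes the updated tail must therefore also observe the stored events. In C11 atomics the same guarantee falls out of a release store paired with an acquire load (an analogue of the barrier pairing, not the kernel primitives themselves):

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static int events[8];
static atomic_uint tail;

static void *producer(void *arg)
{
	(void)arg;
	events[0] = 42;		/* store the event first... */
	atomic_store_explicit(&tail, 1, memory_order_release);	/* ...then publish */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, producer, NULL);

	/* the acquire load plays the role of reading tail + smp_rmb() */
	while (atomic_load_explicit(&tail, memory_order_acquire) == 0)
		;
	printf("event: %d\n", events[0]);	/* guaranteed to see 42 */
	pthread_join(t, NULL);
	return 0;
}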
@@ -1268,12 +1325,12 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
1268 if (compat) 1325 if (compat)
1269 ret = compat_rw_copy_check_uvector(rw, 1326 ret = compat_rw_copy_check_uvector(rw,
1270 (struct compat_iovec __user *)buf, 1327 (struct compat_iovec __user *)buf,
1271 *nr_segs, 1, *iovec, iovec); 1328 *nr_segs, UIO_FASTIOV, *iovec, iovec);
1272 else 1329 else
1273#endif 1330#endif
1274 ret = rw_copy_check_uvector(rw, 1331 ret = rw_copy_check_uvector(rw,
1275 (struct iovec __user *)buf, 1332 (struct iovec __user *)buf,
1276 *nr_segs, 1, *iovec, iovec); 1333 *nr_segs, UIO_FASTIOV, *iovec, iovec);
1277 if (ret < 0) 1334 if (ret < 0)
1278 return ret; 1335 return ret;
1279 1336
@@ -1297,9 +1354,8 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
1297} 1354}
1298 1355
1299/* 1356/*
1300 * aio_setup_iocb: 1357 * aio_run_iocb:
1301 * Performs the initial checks and aio retry method 1358 * Performs the initial checks and io submission.
1302 * setup for the kiocb at the time of io submission.
1303 */ 1359 */
1304static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, 1360static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1305 char __user *buf, bool compat) 1361 char __user *buf, bool compat)
@@ -1311,7 +1367,7 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
1311 fmode_t mode; 1367 fmode_t mode;
1312 aio_rw_op *rw_op; 1368 aio_rw_op *rw_op;
1313 rw_iter_op *iter_op; 1369 rw_iter_op *iter_op;
1314 struct iovec inline_vec, *iovec = &inline_vec; 1370 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
1315 struct iov_iter iter; 1371 struct iov_iter iter;
1316 1372
1317 switch (opcode) { 1373 switch (opcode) {
@@ -1346,7 +1402,7 @@ rw_common:
1346 if (!ret) 1402 if (!ret)
1347 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); 1403 ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
1348 if (ret < 0) { 1404 if (ret < 0) {
1349 if (iovec != &inline_vec) 1405 if (iovec != inline_vecs)
1350 kfree(iovec); 1406 kfree(iovec);
1351 return ret; 1407 return ret;
1352 } 1408 }
@@ -1393,7 +1449,7 @@ rw_common:
1393 return -EINVAL; 1449 return -EINVAL;
1394 } 1450 }
1395 1451
1396 if (iovec != &inline_vec) 1452 if (iovec != inline_vecs)
1397 kfree(iovec); 1453 kfree(iovec);
1398 1454
1399 if (ret != -EIOCBQUEUED) { 1455 if (ret != -EIOCBQUEUED) {
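
Switching from one inline iovec to an inline_vecs[UIO_FASTIOV] array lets small vectored requests skip the heap allocation entirely; only vectors longer than UIO_FASTIOV (8 in the kernel's uio.h) fall back to an allocated array, and the iovec != inline_vecs comparison tells the free path which case it is handling. The same small-buffer optimization in plain C:

#include <stdio.h>
#include <stdlib.h>
#include <sys/uio.h>

#define FASTIOV 8	/* mirrors UIO_FASTIOV */

static void process_iov(size_t nr_segs)
{
	struct iovec inline_vecs[FASTIOV], *iov = inline_vecs;

	if (nr_segs > FASTIOV) {
		iov = malloc(nr_segs * sizeof(*iov));
		if (!iov)
			return;
	}

	/* ... fill and use iov[0..nr_segs-1] ... */
	printf("%zu segs, %s allocation\n", nr_segs,
	       iov == inline_vecs ? "stack" : "heap");

	if (iov != inline_vecs)		/* mirrors the kfree(iovec) test */
		free(iov);
}

int main(void)
{
	process_iov(4);		/* stays on the stack */
	process_iov(100);	/* falls back to the heap */
	return 0;
}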
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index acf32054edd8..8e98cf954bab 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -79,6 +79,10 @@ struct autofs_info {
79}; 79};
80 80
81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ 81#define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */
82#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered
83 * for expiry, so RCU_walk is
84 * not permitted
85 */
82#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ 86#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */
83 87
84struct autofs_wait_queue { 88struct autofs_wait_queue {
@@ -143,26 +147,12 @@ static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
143 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; 147 return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
144} 148}
145 149
146/* Does a dentry have some pending activity? */
147static inline int autofs4_ispending(struct dentry *dentry)
148{
149 struct autofs_info *inf = autofs4_dentry_ino(dentry);
150
151 if (inf->flags & AUTOFS_INF_PENDING)
152 return 1;
153
154 if (inf->flags & AUTOFS_INF_EXPIRING)
155 return 1;
156
157 return 0;
158}
159
160struct inode *autofs4_get_inode(struct super_block *, umode_t); 150struct inode *autofs4_get_inode(struct super_block *, umode_t);
161void autofs4_free_ino(struct autofs_info *); 151void autofs4_free_ino(struct autofs_info *);
162 152
163/* Expiration */ 153/* Expiration */
164int is_autofs4_dentry(struct dentry *); 154int is_autofs4_dentry(struct dentry *);
165int autofs4_expire_wait(struct dentry *dentry); 155int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
166int autofs4_expire_run(struct super_block *, struct vfsmount *, 156int autofs4_expire_run(struct super_block *, struct vfsmount *,
167 struct autofs_sb_info *, 157 struct autofs_sb_info *,
168 struct autofs_packet_expire __user *); 158 struct autofs_packet_expire __user *);
@@ -191,55 +181,6 @@ extern const struct file_operations autofs4_root_operations;
191extern const struct dentry_operations autofs4_dentry_operations; 181extern const struct dentry_operations autofs4_dentry_operations;
192 182
193/* VFS automount flags management functions */ 183/* VFS automount flags management functions */
194
195static inline void __managed_dentry_set_automount(struct dentry *dentry)
196{
197 dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
198}
199
200static inline void managed_dentry_set_automount(struct dentry *dentry)
201{
202 spin_lock(&dentry->d_lock);
203 __managed_dentry_set_automount(dentry);
204 spin_unlock(&dentry->d_lock);
205}
206
207static inline void __managed_dentry_clear_automount(struct dentry *dentry)
208{
209 dentry->d_flags &= ~DCACHE_NEED_AUTOMOUNT;
210}
211
212static inline void managed_dentry_clear_automount(struct dentry *dentry)
213{
214 spin_lock(&dentry->d_lock);
215 __managed_dentry_clear_automount(dentry);
216 spin_unlock(&dentry->d_lock);
217}
218
219static inline void __managed_dentry_set_transit(struct dentry *dentry)
220{
221 dentry->d_flags |= DCACHE_MANAGE_TRANSIT;
222}
223
224static inline void managed_dentry_set_transit(struct dentry *dentry)
225{
226 spin_lock(&dentry->d_lock);
227 __managed_dentry_set_transit(dentry);
228 spin_unlock(&dentry->d_lock);
229}
230
231static inline void __managed_dentry_clear_transit(struct dentry *dentry)
232{
233 dentry->d_flags &= ~DCACHE_MANAGE_TRANSIT;
234}
235
236static inline void managed_dentry_clear_transit(struct dentry *dentry)
237{
238 spin_lock(&dentry->d_lock);
239 __managed_dentry_clear_transit(dentry);
240 spin_unlock(&dentry->d_lock);
241}
242
243static inline void __managed_dentry_set_managed(struct dentry *dentry) 184static inline void __managed_dentry_set_managed(struct dentry *dentry)
244{ 185{
245 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); 186 dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
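
The rcu_walk parameter threaded through these autofs4 interfaces follows a single VFS convention: a helper that would have to sleep (waiting on an expire or a pending mount) cannot do so during RCU-walk, so it returns -ECHILD and the VFS restarts the lookup in ref-walk mode, where blocking is allowed. A toy model of that contract (all names invented for illustration):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static bool expire_in_progress = true;

/* Would block; under rcu_walk it must bail out instead. */
static int expire_wait(bool rcu_walk)
{
	if (!expire_in_progress)
		return 0;
	if (rcu_walk)
		return -ECHILD;	/* caller: drop out of RCU mode and retry */
	/* ... sleep until the expire completes ... */
	expire_in_progress = false;
	return 0;
}

static int lookup(void)
{
	int err = expire_wait(true);		/* optimistic RCU-walk first */
	if (err == -ECHILD)
		err = expire_wait(false);	/* ref-walk retry may sleep */
	return err;
}

int main(void)
{
	printf("lookup: %d\n", lookup());
	return 0;
}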
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 5b570b6efa28..aaf96cb25452 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -450,7 +450,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
450 ino = autofs4_dentry_ino(path.dentry); 450 ino = autofs4_dentry_ino(path.dentry);
451 if (ino) { 451 if (ino) {
452 err = 0; 452 err = 0;
453 autofs4_expire_wait(path.dentry); 453 autofs4_expire_wait(path.dentry, 0);
454 spin_lock(&sbi->fs_lock); 454 spin_lock(&sbi->fs_lock);
455 param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); 455 param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
456 param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); 456 param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 394e90b02c5e..683a5b9ce22a 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -30,12 +30,6 @@ static inline int autofs4_can_expire(struct dentry *dentry,
30 /* Too young to die */ 30 /* Too young to die */
31 if (!timeout || time_after(ino->last_used + timeout, now)) 31 if (!timeout || time_after(ino->last_used + timeout, now))
32 return 0; 32 return 0;
33
34 /* update last_used here :-
35 - obviously makes sense if it is in use now
36 - less obviously, prevents rapid-fire expire
37 attempts if expire fails the first time */
38 ino->last_used = now;
39 } 33 }
40 return 1; 34 return 1;
41} 35}
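
autofs4_can_expire() relies on time_after() for the timeout comparison, which stays correct across jiffies wrap-around by doing the subtraction in signed arithmetic. The macro below matches the kernel's definition in spirit (minus its type checking); the values are contrived to straddle a wrap:

#include <stdio.h>

/* Wrap-safe "a is after b" for free-running unsigned counters. */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long last_used = (unsigned long)-6;	/* ULONG_MAX - 5 */
	unsigned long timeout = 3;
	unsigned long now = 2;	/* counter wrapped; really 8 ticks later */

	/* a naive comparison is fooled by the wrap... */
	printf("naive 'too young': %d\n", last_used + timeout > now);	/* 1, wrong */
	/* ...time_after() is not: the entry is 8 ticks old, timeout is 3 */
	printf("time_after 'too young': %d\n",
	       time_after(last_used + timeout, now));			/* 0, right */
	return 0;
}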
@@ -255,12 +249,6 @@ static int autofs4_tree_busy(struct vfsmount *mnt,
255 struct autofs_info *ino = autofs4_dentry_ino(p); 249 struct autofs_info *ino = autofs4_dentry_ino(p);
256 unsigned int ino_count = atomic_read(&ino->count); 250 unsigned int ino_count = atomic_read(&ino->count);
257 251
258 /*
259 * Clean stale dentries below that have not been
260 * invalidated after a mount fail during lookup
261 */
262 d_invalidate(p);
263
264 /* allow for dget above and top is already dgot */ 252 /* allow for dget above and top is already dgot */
265 if (p == top) 253 if (p == top)
266 ino_count += 2; 254 ino_count += 2;
@@ -333,11 +321,19 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
333 if (ino->flags & AUTOFS_INF_PENDING) 321 if (ino->flags & AUTOFS_INF_PENDING)
334 goto out; 322 goto out;
335 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 323 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
-		struct autofs_info *ino = autofs4_dentry_ino(root);
-		ino->flags |= AUTOFS_INF_EXPIRING;
-		init_completion(&ino->expire_complete);
+		ino->flags |= AUTOFS_INF_NO_RCU;
 		spin_unlock(&sbi->fs_lock);
-		return root;
+		synchronize_rcu();
+		spin_lock(&sbi->fs_lock);
+		if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+			ino->flags |= AUTOFS_INF_EXPIRING;
+			smp_mb();
+			ino->flags &= ~AUTOFS_INF_NO_RCU;
+			init_completion(&ino->expire_complete);
+			spin_unlock(&sbi->fs_lock);
+			return root;
+		}
+		ino->flags &= ~AUTOFS_INF_NO_RCU;
 	}
342out: 338out:
343 spin_unlock(&sbi->fs_lock); 339 spin_unlock(&sbi->fs_lock);
@@ -346,6 +342,89 @@ out:
346 return NULL; 342 return NULL;
347} 343}
348 344
345/* Check if 'dentry' should expire, or return a nearby
346 * dentry that is suitable.
347 * If the returned dentry is different from the argument dentry,
348 * then a dget() reference was taken on it; otherwise it was not.
349 */
350static struct dentry *should_expire(struct dentry *dentry,
351 struct vfsmount *mnt,
352 unsigned long timeout,
353 int how)
354{
355 int do_now = how & AUTOFS_EXP_IMMEDIATE;
356 int exp_leaves = how & AUTOFS_EXP_LEAVES;
357 struct autofs_info *ino = autofs4_dentry_ino(dentry);
358 unsigned int ino_count;
359
360 /* No point expiring a pending mount */
361 if (ino->flags & AUTOFS_INF_PENDING)
362 return NULL;
363
364 /*
365 * Case 1: (i) indirect mount or top level pseudo direct mount
366 * (autofs-4.1).
367 * (ii) indirect mount with offset mount, check the "/"
368 * offset (autofs-5.0+).
369 */
370 if (d_mountpoint(dentry)) {
371 DPRINTK("checking mountpoint %p %.*s",
372 dentry, (int)dentry->d_name.len, dentry->d_name.name);
373
374 /* Can we umount this guy */
375 if (autofs4_mount_busy(mnt, dentry))
376 return NULL;
377
378 /* Can we expire this guy */
379 if (autofs4_can_expire(dentry, timeout, do_now))
380 return dentry;
381 return NULL;
382 }
383
384 if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
385 DPRINTK("checking symlink %p %.*s",
386 dentry, (int)dentry->d_name.len, dentry->d_name.name);
387 /*
388 * A symlink can't be "busy" in the usual sense so
389 * just check last used for expire timeout.
390 */
391 if (autofs4_can_expire(dentry, timeout, do_now))
392 return dentry;
393 return NULL;
394 }
395
396 if (simple_empty(dentry))
397 return NULL;
398
399 /* Case 2: tree mount, expire iff entire tree is not busy */
400 if (!exp_leaves) {
401 /* Path walk currently on this dentry? */
402 ino_count = atomic_read(&ino->count) + 1;
403 if (d_count(dentry) > ino_count)
404 return NULL;
405
406 if (!autofs4_tree_busy(mnt, dentry, timeout, do_now))
407 return dentry;
408 /*
409 * Case 3: pseudo direct mount, expire individual leaves
410 * (autofs-4.1).
411 */
412 } else {
413 /* Path walk currently on this dentry? */
414 struct dentry *expired;
415 ino_count = atomic_read(&ino->count) + 1;
416 if (d_count(dentry) > ino_count)
417 return NULL;
418
419 expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
420 if (expired) {
421 if (expired == dentry)
422 dput(dentry);
423 return expired;
424 }
425 }
426 return NULL;
427}
349/* 428/*
350 * Find an eligible tree to time-out 429 * Find an eligible tree to time-out
351 * A tree is eligible if :- 430 * A tree is eligible if :-
@@ -360,11 +439,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
360 unsigned long timeout; 439 unsigned long timeout;
361 struct dentry *root = sb->s_root; 440 struct dentry *root = sb->s_root;
362 struct dentry *dentry; 441 struct dentry *dentry;
363 struct dentry *expired = NULL; 442 struct dentry *expired;
364 int do_now = how & AUTOFS_EXP_IMMEDIATE;
365 int exp_leaves = how & AUTOFS_EXP_LEAVES;
366 struct autofs_info *ino; 443 struct autofs_info *ino;
367 unsigned int ino_count;
368 444
369 if (!root) 445 if (!root)
370 return NULL; 446 return NULL;
@@ -376,77 +452,28 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
376 while ((dentry = get_next_positive_subdir(dentry, root))) { 452 while ((dentry = get_next_positive_subdir(dentry, root))) {
377 spin_lock(&sbi->fs_lock); 453 spin_lock(&sbi->fs_lock);
378 ino = autofs4_dentry_ino(dentry); 454 ino = autofs4_dentry_ino(dentry);
-		/* No point expiring a pending mount */
-		if (ino->flags & AUTOFS_INF_PENDING)
-			goto next;
-
-		/*
-		 * Case 1: (i) indirect mount or top level pseudo direct mount
-		 *	   (autofs-4.1).
-		 *	   (ii) indirect mount with offset mount, check the "/"
-		 *	   offset (autofs-5.0+).
-		 */
-		if (d_mountpoint(dentry)) {
-			DPRINTK("checking mountpoint %p %.*s",
-				dentry, (int)dentry->d_name.len, dentry->d_name.name);
-
-			/* Can we umount this guy */
-			if (autofs4_mount_busy(mnt, dentry))
-				goto next;
-
-			/* Can we expire this guy */
-			if (autofs4_can_expire(dentry, timeout, do_now)) {
-				expired = dentry;
-				goto found;
-			}
-			goto next;
-		}
-
-		if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
-			DPRINTK("checking symlink %p %.*s",
-				dentry, (int)dentry->d_name.len, dentry->d_name.name);
-			/*
-			 * A symlink can't be "busy" in the usual sense so
-			 * just check last used for expire timeout.
-			 */
-			if (autofs4_can_expire(dentry, timeout, do_now)) {
-				expired = dentry;
-				goto found;
-			}
-			goto next;
-		}
-
-		if (simple_empty(dentry))
-			goto next;
-
-		/* Case 2: tree mount, expire iff entire tree is not busy */
-		if (!exp_leaves) {
-			/* Path walk currently on this dentry? */
-			ino_count = atomic_read(&ino->count) + 1;
-			if (d_count(dentry) > ino_count)
-				goto next;
-
-			if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) {
-				expired = dentry;
-				goto found;
-			}
-		/*
-		 * Case 3: pseudo direct mount, expire individual leaves
-		 *	   (autofs-4.1).
-		 */
-		} else {
-			/* Path walk currently on this dentry? */
-			ino_count = atomic_read(&ino->count) + 1;
-			if (d_count(dentry) > ino_count)
-				goto next;
-
-			expired = autofs4_check_leaves(mnt, dentry, timeout, do_now);
-			if (expired) {
-				dput(dentry);
-				goto found;
-			}
-		}
-next:
+		if (ino->flags & AUTOFS_INF_NO_RCU)
+			expired = NULL;
+		else
+			expired = should_expire(dentry, mnt, timeout, how);
+		if (!expired) {
+			spin_unlock(&sbi->fs_lock);
+			continue;
+		}
+		ino = autofs4_dentry_ino(expired);
+		ino->flags |= AUTOFS_INF_NO_RCU;
+		spin_unlock(&sbi->fs_lock);
+		synchronize_rcu();
+		spin_lock(&sbi->fs_lock);
+		if (should_expire(expired, mnt, timeout, how)) {
+			if (expired != dentry)
+				dput(dentry);
+			goto found;
+		}
+
+		ino->flags &= ~AUTOFS_INF_NO_RCU;
+		if (expired != dentry)
+			dput(expired);
 		spin_unlock(&sbi->fs_lock);
 	}
452 return NULL; 479 return NULL;
@@ -454,8 +481,9 @@ next:
454found: 481found:
455 DPRINTK("returning %p %.*s", 482 DPRINTK("returning %p %.*s",
456 expired, (int)expired->d_name.len, expired->d_name.name); 483 expired, (int)expired->d_name.len, expired->d_name.name);
457 ino = autofs4_dentry_ino(expired);
458 ino->flags |= AUTOFS_INF_EXPIRING; 484 ino->flags |= AUTOFS_INF_EXPIRING;
485 smp_mb();
486 ino->flags &= ~AUTOFS_INF_NO_RCU;
459 init_completion(&ino->expire_complete); 487 init_completion(&ino->expire_complete);
460 spin_unlock(&sbi->fs_lock); 488 spin_unlock(&sbi->fs_lock);
461 spin_lock(&sbi->lookup_lock); 489 spin_lock(&sbi->lookup_lock);
@@ -468,13 +496,18 @@ found:
468 return expired; 496 return expired;
469} 497}
470 498
471int autofs4_expire_wait(struct dentry *dentry) 499int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
472{ 500{
473 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 501 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
474 struct autofs_info *ino = autofs4_dentry_ino(dentry); 502 struct autofs_info *ino = autofs4_dentry_ino(dentry);
475 int status; 503 int status;
476 504
477 /* Block on any pending expire */ 505 /* Block on any pending expire */
506 if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)))
507 return 0;
508 if (rcu_walk)
509 return -ECHILD;
510
478 spin_lock(&sbi->fs_lock); 511 spin_lock(&sbi->fs_lock);
479 if (ino->flags & AUTOFS_INF_EXPIRING) { 512 if (ino->flags & AUTOFS_INF_EXPIRING) {
480 spin_unlock(&sbi->fs_lock); 513 spin_unlock(&sbi->fs_lock);
@@ -526,6 +559,8 @@ int autofs4_expire_run(struct super_block *sb,
526 559
527 spin_lock(&sbi->fs_lock); 560 spin_lock(&sbi->fs_lock);
528 ino = autofs4_dentry_ino(dentry); 561 ino = autofs4_dentry_ino(dentry);
562 /* avoid rapid-fire expire attempts if expiry fails */
563 ino->last_used = now;
529 ino->flags &= ~AUTOFS_INF_EXPIRING; 564 ino->flags &= ~AUTOFS_INF_EXPIRING;
530 complete_all(&ino->expire_complete); 565 complete_all(&ino->expire_complete);
531 spin_unlock(&sbi->fs_lock); 566 spin_unlock(&sbi->fs_lock);
@@ -552,6 +587,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
552 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); 587 ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
553 588
554 spin_lock(&sbi->fs_lock); 589 spin_lock(&sbi->fs_lock);
590 /* avoid rapid-fire expire attempts if expiry fails */
591 ino->last_used = now;
555 ino->flags &= ~AUTOFS_INF_EXPIRING; 592 ino->flags &= ~AUTOFS_INF_EXPIRING;
556 complete_all(&ino->expire_complete); 593 complete_all(&ino->expire_complete);
557 spin_unlock(&sbi->fs_lock); 594 spin_unlock(&sbi->fs_lock);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index cc87c1abac97..d76d083f2f06 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -166,8 +166,10 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
166 const unsigned char *str = name->name; 166 const unsigned char *str = name->name;
167 struct list_head *p, *head; 167 struct list_head *p, *head;
168 168
169 spin_lock(&sbi->lookup_lock);
170 head = &sbi->active_list; 169 head = &sbi->active_list;
170 if (list_empty(head))
171 return NULL;
172 spin_lock(&sbi->lookup_lock);
171 list_for_each(p, head) { 173 list_for_each(p, head) {
172 struct autofs_info *ino; 174 struct autofs_info *ino;
173 struct dentry *active; 175 struct dentry *active;
@@ -208,7 +210,8 @@ next:
208 return NULL; 210 return NULL;
209} 211}
210 212
211static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) 213static struct dentry *autofs4_lookup_expiring(struct dentry *dentry,
214 bool rcu_walk)
212{ 215{
213 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 216 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
214 struct dentry *parent = dentry->d_parent; 217 struct dentry *parent = dentry->d_parent;
@@ -218,13 +221,20 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
218 const unsigned char *str = name->name; 221 const unsigned char *str = name->name;
219 struct list_head *p, *head; 222 struct list_head *p, *head;
220 223
221 spin_lock(&sbi->lookup_lock);
222 head = &sbi->expiring_list; 224 head = &sbi->expiring_list;
225 if (list_empty(head))
226 return NULL;
227 spin_lock(&sbi->lookup_lock);
223 list_for_each(p, head) { 228 list_for_each(p, head) {
224 struct autofs_info *ino; 229 struct autofs_info *ino;
225 struct dentry *expiring; 230 struct dentry *expiring;
226 struct qstr *qstr; 231 struct qstr *qstr;
227 232
233 if (rcu_walk) {
234 spin_unlock(&sbi->lookup_lock);
235 return ERR_PTR(-ECHILD);
236 }
237
228 ino = list_entry(p, struct autofs_info, expiring); 238 ino = list_entry(p, struct autofs_info, expiring);
229 expiring = ino->dentry; 239 expiring = ino->dentry;
230 240
@@ -260,13 +270,15 @@ next:
260 return NULL; 270 return NULL;
261} 271}
262 272
263static int autofs4_mount_wait(struct dentry *dentry) 273static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
264{ 274{
265 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 275 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
266 struct autofs_info *ino = autofs4_dentry_ino(dentry); 276 struct autofs_info *ino = autofs4_dentry_ino(dentry);
267 int status = 0; 277 int status = 0;
268 278
269 if (ino->flags & AUTOFS_INF_PENDING) { 279 if (ino->flags & AUTOFS_INF_PENDING) {
280 if (rcu_walk)
281 return -ECHILD;
270 DPRINTK("waiting for mount name=%.*s", 282 DPRINTK("waiting for mount name=%.*s",
271 dentry->d_name.len, dentry->d_name.name); 283 dentry->d_name.len, dentry->d_name.name);
272 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 284 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
@@ -276,20 +288,22 @@ static int autofs4_mount_wait(struct dentry *dentry)
276 return status; 288 return status;
277} 289}
278 290
279static int do_expire_wait(struct dentry *dentry) 291static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
280{ 292{
281 struct dentry *expiring; 293 struct dentry *expiring;
282 294
283 expiring = autofs4_lookup_expiring(dentry); 295 expiring = autofs4_lookup_expiring(dentry, rcu_walk);
296 if (IS_ERR(expiring))
297 return PTR_ERR(expiring);
284 if (!expiring) 298 if (!expiring)
285 return autofs4_expire_wait(dentry); 299 return autofs4_expire_wait(dentry, rcu_walk);
286 else { 300 else {
287 /* 301 /*
288 * If we are racing with expire the request might not 302 * If we are racing with expire the request might not
289 * be quite complete, but the directory has been removed 303 * be quite complete, but the directory has been removed
290 * so it must have been successful, just wait for it. 304 * so it must have been successful, just wait for it.
291 */ 305 */
292 autofs4_expire_wait(expiring); 306 autofs4_expire_wait(expiring, 0);
293 autofs4_del_expiring(expiring); 307 autofs4_del_expiring(expiring);
294 dput(expiring); 308 dput(expiring);
295 } 309 }
@@ -341,7 +355,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
341 * and the directory was removed, so just go ahead and try 355 * and the directory was removed, so just go ahead and try
342 * the mount. 356 * the mount.
343 */ 357 */
344 status = do_expire_wait(dentry); 358 status = do_expire_wait(dentry, 0);
345 if (status && status != -EAGAIN) 359 if (status && status != -EAGAIN)
346 return NULL; 360 return NULL;
347 361
@@ -349,7 +363,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
349 spin_lock(&sbi->fs_lock); 363 spin_lock(&sbi->fs_lock);
350 if (ino->flags & AUTOFS_INF_PENDING) { 364 if (ino->flags & AUTOFS_INF_PENDING) {
351 spin_unlock(&sbi->fs_lock); 365 spin_unlock(&sbi->fs_lock);
352 status = autofs4_mount_wait(dentry); 366 status = autofs4_mount_wait(dentry, 0);
353 if (status) 367 if (status)
354 return ERR_PTR(status); 368 return ERR_PTR(status);
355 goto done; 369 goto done;
@@ -373,7 +387,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
373 * this because the leaves of the directory tree under the 387 * this because the leaves of the directory tree under the
374 * mount never trigger mounts themselves (they have an autofs 388 * mount never trigger mounts themselves (they have an autofs
375 * trigger mount mounted on them). But v4 pseudo direct mounts 389 * trigger mount mounted on them). But v4 pseudo direct mounts
376 * do need the leaves to to trigger mounts. In this case we 390 * do need the leaves to trigger mounts. In this case we
377 * have no choice but to use the list_empty() check and 391 * have no choice but to use the list_empty() check and
378 * require user space to behave. 392 * require user space to behave.
379 */ 393 */
@@ -390,7 +404,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
390 } 404 }
391 ino->flags |= AUTOFS_INF_PENDING; 405 ino->flags |= AUTOFS_INF_PENDING;
392 spin_unlock(&sbi->fs_lock); 406 spin_unlock(&sbi->fs_lock);
393 status = autofs4_mount_wait(dentry); 407 status = autofs4_mount_wait(dentry, 0);
394 spin_lock(&sbi->fs_lock); 408 spin_lock(&sbi->fs_lock);
395 ino->flags &= ~AUTOFS_INF_PENDING; 409 ino->flags &= ~AUTOFS_INF_PENDING;
396 if (status) { 410 if (status) {
@@ -419,28 +433,46 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
419 433
420 /* The daemon never waits. */ 434 /* The daemon never waits. */
421 if (autofs4_oz_mode(sbi)) { 435 if (autofs4_oz_mode(sbi)) {
422 if (rcu_walk)
423 return 0;
424 if (!d_mountpoint(dentry)) 436 if (!d_mountpoint(dentry))
425 return -EISDIR; 437 return -EISDIR;
426 return 0; 438 return 0;
427 } 439 }
428 440
429 /* We need to sleep, so we need pathwalk to be in ref-mode */
430 if (rcu_walk)
431 return -ECHILD;
432
433 /* Wait for pending expires */ 441 /* Wait for pending expires */
434 do_expire_wait(dentry); 442 if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
443 return -ECHILD;
435 444
436 /* 445 /*
437 * This dentry may be under construction so wait on mount 446 * This dentry may be under construction so wait on mount
438 * completion. 447 * completion.
439 */ 448 */
440 status = autofs4_mount_wait(dentry); 449 status = autofs4_mount_wait(dentry, rcu_walk);
441 if (status) 450 if (status)
442 return status; 451 return status;
443 452
453 if (rcu_walk) {
454 /* We don't need fs_lock in rcu_walk mode,
455 * just testing 'AUTOFS_INF_NO_RCU' is enough.
456 * simple_empty() takes a spinlock, so leave it
457 * to last.
458 * We only return -EISDIR when certain this isn't
459 * a mount-trap.
460 */
461 struct inode *inode;
462 if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
463 return 0;
464 if (d_mountpoint(dentry))
465 return 0;
466 inode = ACCESS_ONCE(dentry->d_inode);
467 if (inode && S_ISLNK(inode->i_mode))
468 return -EISDIR;
469 if (list_empty(&dentry->d_subdirs))
470 return 0;
471 if (!simple_empty(dentry))
472 return -EISDIR;
473 return 0;
474 }
475
444 spin_lock(&sbi->fs_lock); 476 spin_lock(&sbi->fs_lock);
445 /* 477 /*
446 * If the dentry has been selected for expire while we slept 478 * If the dentry has been selected for expire while we slept
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 7c93953030fb..afd2b4408adf 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -218,8 +218,9 @@ static int bad_inode_mknod (struct inode *dir, struct dentry *dentry,
218 return -EIO; 218 return -EIO;
219} 219}
220 220
221static int bad_inode_rename (struct inode *old_dir, struct dentry *old_dentry, 221static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry,
222 struct inode *new_dir, struct dentry *new_dentry) 222 struct inode *new_dir, struct dentry *new_dentry,
223 unsigned int flags)
223{ 224{
224 return -EIO; 225 return -EIO;
225} 226}
@@ -279,7 +280,7 @@ static const struct inode_operations bad_inode_ops =
279 .mkdir = bad_inode_mkdir, 280 .mkdir = bad_inode_mkdir,
280 .rmdir = bad_inode_rmdir, 281 .rmdir = bad_inode_rmdir,
281 .mknod = bad_inode_mknod, 282 .mknod = bad_inode_mknod,
282 .rename = bad_inode_rename, 283 .rename2 = bad_inode_rename2,
283 .readlink = bad_inode_readlink, 284 .readlink = bad_inode_readlink,
284 /* follow_link must be no-op, otherwise unmounting this inode 285 /* follow_link must be no-op, otherwise unmounting this inode
285 won't work */ 286 won't work */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 9c7faa8a9288..0826e91dacda 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -78,11 +78,11 @@
78/* 78/*
79 * In memory structure of each btree node 79 * In memory structure of each btree node
80 */ 80 */
81typedef struct { 81struct befs_btree_node {
82 befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */ 82 befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */
83 struct buffer_head *bh; 83 struct buffer_head *bh;
84 befs_btree_nodehead *od_node; /* on disk node */ 84 befs_btree_nodehead *od_node; /* on disk node */
85} befs_btree_node; 85};
86 86
87/* local constants */ 87/* local constants */
88static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; 88static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
@@ -90,27 +90,30 @@ static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL;
90/* local functions */ 90/* local functions */
91static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, 91static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
92 befs_btree_super * bt_super, 92 befs_btree_super * bt_super,
93 befs_btree_node * this_node, 93 struct befs_btree_node *this_node,
94 befs_off_t * node_off); 94 befs_off_t * node_off);
95 95
96static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, 96static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
97 befs_btree_super * sup); 97 befs_btree_super * sup);
98 98
99static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, 99static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
100 befs_btree_node * node, befs_off_t node_off); 100 struct befs_btree_node *node,
101 befs_off_t node_off);
101 102
102static int befs_leafnode(befs_btree_node * node); 103static int befs_leafnode(struct befs_btree_node *node);
103 104
104static fs16 *befs_bt_keylen_index(befs_btree_node * node); 105static fs16 *befs_bt_keylen_index(struct befs_btree_node *node);
105 106
106static fs64 *befs_bt_valarray(befs_btree_node * node); 107static fs64 *befs_bt_valarray(struct befs_btree_node *node);
107 108
108static char *befs_bt_keydata(befs_btree_node * node); 109static char *befs_bt_keydata(struct befs_btree_node *node);
109 110
110static int befs_find_key(struct super_block *sb, befs_btree_node * node, 111static int befs_find_key(struct super_block *sb,
112 struct befs_btree_node *node,
111 const char *findkey, befs_off_t * value); 113 const char *findkey, befs_off_t * value);
112 114
113static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node, 115static char *befs_bt_get_key(struct super_block *sb,
116 struct befs_btree_node *node,
114 int index, u16 * keylen); 117 int index, u16 * keylen);
115 118
116static int befs_compare_strings(const void *key1, int keylen1, 119static int befs_compare_strings(const void *key1, int keylen1,
@@ -191,7 +194,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds,
191 194
192static int 195static int
193befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, 196befs_bt_read_node(struct super_block *sb, befs_data_stream * ds,
194 befs_btree_node * node, befs_off_t node_off) 197 struct befs_btree_node *node, befs_off_t node_off)
195{ 198{
196 uint off = 0; 199 uint off = 0;
197 200
@@ -247,7 +250,7 @@ int
247befs_btree_find(struct super_block *sb, befs_data_stream * ds, 250befs_btree_find(struct super_block *sb, befs_data_stream * ds,
248 const char *key, befs_off_t * value) 251 const char *key, befs_off_t * value)
249{ 252{
250 befs_btree_node *this_node = NULL; 253 struct befs_btree_node *this_node = NULL;
251 befs_btree_super bt_super; 254 befs_btree_super bt_super;
252 befs_off_t node_off; 255 befs_off_t node_off;
253 int res; 256 int res;
@@ -260,11 +263,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
260 goto error; 263 goto error;
261 } 264 }
262 265
263 this_node = kmalloc(sizeof (befs_btree_node), 266 this_node = kmalloc(sizeof(struct befs_btree_node),
264 GFP_NOFS); 267 GFP_NOFS);
265 if (!this_node) { 268 if (!this_node) {
266 befs_error(sb, "befs_btree_find() failed to allocate %zu " 269 befs_error(sb, "befs_btree_find() failed to allocate %zu "
267 "bytes of memory", sizeof (befs_btree_node)); 270 "bytes of memory", sizeof(struct befs_btree_node));
268 goto error; 271 goto error;
269 } 272 }
270 273
@@ -333,7 +336,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds,
333 * Use binary search instead of a linear. 336 * Use binary search instead of a linear.
334 */ 337 */
335static int 338static int
336befs_find_key(struct super_block *sb, befs_btree_node * node, 339befs_find_key(struct super_block *sb, struct befs_btree_node *node,
337 const char *findkey, befs_off_t * value) 340 const char *findkey, befs_off_t * value)
338{ 341{
339 int first, last, mid; 342 int first, last, mid;
@@ -417,7 +420,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
417 loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, 420 loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize,
418 befs_off_t * value) 421 befs_off_t * value)
419{ 422{
420 befs_btree_node *this_node; 423 struct befs_btree_node *this_node;
421 befs_btree_super bt_super; 424 befs_btree_super bt_super;
422 befs_off_t node_off = 0; 425 befs_off_t node_off = 0;
423 int cur_key; 426 int cur_key;
@@ -436,9 +439,10 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
436 goto error; 439 goto error;
437 } 440 }
438 441
439 if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { 442 this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS);
443 if (this_node == NULL) {
440 befs_error(sb, "befs_btree_read() failed to allocate %zu " 444 befs_error(sb, "befs_btree_read() failed to allocate %zu "
441 "bytes of memory", sizeof (befs_btree_node)); 445 "bytes of memory", sizeof(struct befs_btree_node));
442 goto error; 446 goto error;
443 } 447 }
444 448
@@ -545,7 +549,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds,
545 */ 549 */
546static int 550static int
547befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, 551befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
548 befs_btree_super * bt_super, befs_btree_node * this_node, 552 befs_btree_super *bt_super,
553 struct befs_btree_node *this_node,
549 befs_off_t * node_off) 554 befs_off_t * node_off)
550{ 555{
551 556
@@ -600,7 +605,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds,
600 * Return 1 if leaf, 0 if interior 605 * Return 1 if leaf, 0 if interior
601 */ 606 */
602static int 607static int
603befs_leafnode(befs_btree_node * node) 608befs_leafnode(struct befs_btree_node *node)
604{ 609{
605 /* all interior nodes (and only interior nodes) have an overflow node */ 610 /* all interior nodes (and only interior nodes) have an overflow node */
606 if (node->head.overflow == befs_bt_inval) 611 if (node->head.overflow == befs_bt_inval)
@@ -623,7 +628,7 @@ befs_leafnode(befs_btree_node * node)
623 * Except that rounding up to 8 works, and rounding up to 4 doesn't. 628 * Except that rounding up to 8 works, and rounding up to 4 doesn't.
624 */ 629 */
625static fs16 * 630static fs16 *
626befs_bt_keylen_index(befs_btree_node * node) 631befs_bt_keylen_index(struct befs_btree_node *node)
627{ 632{
628 const int keylen_align = 8; 633 const int keylen_align = 8;
629 unsigned long int off = 634 unsigned long int off =
@@ -644,7 +649,7 @@ befs_bt_keylen_index(befs_btree_node * node)
644 * of the node pointed to by the node header 649 * of the node pointed to by the node header
645 */ 650 */
646static fs64 * 651static fs64 *
647befs_bt_valarray(befs_btree_node * node) 652befs_bt_valarray(struct befs_btree_node *node)
648{ 653{
649 void *keylen_index_start = (void *) befs_bt_keylen_index(node); 654 void *keylen_index_start = (void *) befs_bt_keylen_index(node);
650 size_t keylen_index_size = node->head.all_key_count * sizeof (fs16); 655 size_t keylen_index_size = node->head.all_key_count * sizeof (fs16);
@@ -660,7 +665,7 @@ befs_bt_valarray(befs_btree_node * node)
660 * of the node pointed to by the node header 665 * of the node pointed to by the node header
661 */ 666 */
662static char * 667static char *
663befs_bt_keydata(befs_btree_node * node) 668befs_bt_keydata(struct befs_btree_node *node)
664{ 669{
665 return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead)); 670 return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead));
666} 671}
@@ -676,7 +681,7 @@ befs_bt_keydata(befs_btree_node * node)
676 * Returns NULL on failure (bad input) and sets *@keylen = 0 681 * Returns NULL on failure (bad input) and sets *@keylen = 0
677 */ 682 */
678static char * 683static char *
679befs_bt_get_key(struct super_block *sb, befs_btree_node * node, 684befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
680 int index, u16 * keylen) 685 int index, u16 * keylen)
681{ 686{
682 int prev_key_end; 687 int prev_key_end;
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a16fbd4e8241..4cf61ec6b7a8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -799,13 +799,11 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
799 799
800 befs_debug(sb, "---> %s", __func__); 800 befs_debug(sb, "---> %s", __func__);
801 801
802#ifndef CONFIG_BEFS_RW
803 if (!(sb->s_flags & MS_RDONLY)) { 802 if (!(sb->s_flags & MS_RDONLY)) {
804 befs_warning(sb, 803 befs_warning(sb,
805 "No write support. Marking filesystem read-only"); 804 "No write support. Marking filesystem read-only");
806 sb->s_flags |= MS_RDONLY; 805 sb->s_flags |= MS_RDONLY;
807 } 806 }
808#endif /* CONFIG_BEFS_RW */
809 807
810 /* 808 /*
811 * Set dummy blocksize to read super block. 809 * Set dummy blocksize to read super block.
@@ -834,16 +832,14 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
834 (befs_super_block *) ((void *) bh->b_data + x86_sb_off); 832 (befs_super_block *) ((void *) bh->b_data + x86_sb_off);
835 } 833 }
836 834
837 if (befs_load_sb(sb, disk_sb) != BEFS_OK) 835 if ((befs_load_sb(sb, disk_sb) != BEFS_OK) ||
836 (befs_check_sb(sb) != BEFS_OK))
838 goto unacquire_bh; 837 goto unacquire_bh;
839 838
840 befs_dump_super_block(sb, disk_sb); 839 befs_dump_super_block(sb, disk_sb);
841 840
842 brelse(bh); 841 brelse(bh);
843 842
844 if (befs_check_sb(sb) != BEFS_OK)
845 goto unacquire_priv_sbp;
846
847 if( befs_sb->num_blocks > ~((sector_t)0) ) { 843 if( befs_sb->num_blocks > ~((sector_t)0) ) {
848 befs_error(sb, "blocks count: %llu " 844 befs_error(sb, "blocks count: %llu "
849 "is larger than the host can use", 845 "is larger than the host can use",
diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h
index f7f87e233dd9..f40006db36df 100644
--- a/fs/bfs/bfs.h
+++ b/fs/bfs/bfs.h
@@ -46,6 +46,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode)
46 46
47/* inode.c */ 47/* inode.c */
48extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); 48extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino);
49extern void bfs_dump_imap(const char *, struct super_block *);
49 50
50/* file.c */ 51/* file.c */
51extern const struct inode_operations bfs_file_inops; 52extern const struct inode_operations bfs_file_inops;
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index a399e6d9dc74..08063ae0a17c 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -75,8 +75,6 @@ const struct file_operations bfs_dir_operations = {
75 .llseek = generic_file_llseek, 75 .llseek = generic_file_llseek,
76}; 76};
77 77
78extern void dump_imap(const char *, struct super_block *);
79
80static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 78static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
81 bool excl) 79 bool excl)
82{ 80{
@@ -110,7 +108,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
110 BFS_I(inode)->i_eblock = 0; 108 BFS_I(inode)->i_eblock = 0;
111 insert_inode_hash(inode); 109 insert_inode_hash(inode);
112 mark_inode_dirty(inode); 110 mark_inode_dirty(inode);
113 dump_imap("create", s); 111 bfs_dump_imap("create", s);
114 112
115 err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, 113 err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len,
116 inode->i_ino); 114 inode->i_ino);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 7041ac35ace8..90bc079d9982 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,8 +30,6 @@ MODULE_LICENSE("GPL");
30#define dprintf(x...) 30#define dprintf(x...)
31#endif 31#endif
32 32
33void dump_imap(const char *prefix, struct super_block *s);
34
35struct inode *bfs_iget(struct super_block *sb, unsigned long ino) 33struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
36{ 34{
37 struct bfs_inode *di; 35 struct bfs_inode *di;
@@ -194,7 +192,7 @@ static void bfs_evict_inode(struct inode *inode)
194 info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; 192 info->si_freeb += bi->i_eblock + 1 - bi->i_sblock;
195 info->si_freei++; 193 info->si_freei++;
196 clear_bit(ino, info->si_imap); 194 clear_bit(ino, info->si_imap);
197 dump_imap("delete_inode", s); 195 bfs_dump_imap("delete_inode", s);
198 } 196 }
199 197
200 /* 198 /*
@@ -297,7 +295,7 @@ static const struct super_operations bfs_sops = {
297 .statfs = bfs_statfs, 295 .statfs = bfs_statfs,
298}; 296};
299 297
300void dump_imap(const char *prefix, struct super_block *s) 298void bfs_dump_imap(const char *prefix, struct super_block *s)
301{ 299{
302#ifdef DEBUG 300#ifdef DEBUG
303 int i; 301 int i;
@@ -443,7 +441,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent)
443 } 441 }
444 brelse(bh); 442 brelse(bh);
445 brelse(sbh); 443 brelse(sbh);
446 dump_imap("read_super", s); 444 bfs_dump_imap("read_super", s);
447 return 0; 445 return 0;
448 446
449out3: 447out3:
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ca0ba15a7306..929dec08c348 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -256,11 +256,8 @@ static int load_aout_binary(struct linux_binprm * bprm)
256 (current->mm->start_brk = N_BSSADDR(ex)); 256 (current->mm->start_brk = N_BSSADDR(ex));
257 257
258 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); 258 retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
259 if (retval < 0) { 259 if (retval < 0)
260 /* Someone check-me: is this error path enough? */
261 send_sig(SIGKILL, current, 0);
262 return retval; 260 return retval;
263 }
264 261
265 install_exec_creds(bprm); 262 install_exec_creds(bprm);
266 263
@@ -278,17 +275,13 @@ static int load_aout_binary(struct linux_binprm * bprm)
278 map_size = ex.a_text+ex.a_data; 275 map_size = ex.a_text+ex.a_data;
279#endif 276#endif
280 error = vm_brk(text_addr & PAGE_MASK, map_size); 277 error = vm_brk(text_addr & PAGE_MASK, map_size);
281 if (error != (text_addr & PAGE_MASK)) { 278 if (error != (text_addr & PAGE_MASK))
282 send_sig(SIGKILL, current, 0);
283 return error; 279 return error;
284 }
285 280
286 error = read_code(bprm->file, text_addr, pos, 281 error = read_code(bprm->file, text_addr, pos,
287 ex.a_text+ex.a_data); 282 ex.a_text+ex.a_data);
288 if ((signed long)error < 0) { 283 if ((signed long)error < 0)
289 send_sig(SIGKILL, current, 0);
290 return error; 284 return error;
291 }
292 } else { 285 } else {
293 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && 286 if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
294 (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) 287 (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
@@ -315,28 +308,22 @@ static int load_aout_binary(struct linux_binprm * bprm)
315 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, 308 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
316 fd_offset); 309 fd_offset);
317 310
318 if (error != N_TXTADDR(ex)) { 311 if (error != N_TXTADDR(ex))
319 send_sig(SIGKILL, current, 0);
320 return error; 312 return error;
321 }
322 313
323 error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, 314 error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
324 PROT_READ | PROT_WRITE | PROT_EXEC, 315 PROT_READ | PROT_WRITE | PROT_EXEC,
325 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, 316 MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
326 fd_offset + ex.a_text); 317 fd_offset + ex.a_text);
327 if (error != N_DATADDR(ex)) { 318 if (error != N_DATADDR(ex))
328 send_sig(SIGKILL, current, 0);
329 return error; 319 return error;
330 }
331 } 320 }
332beyond_if: 321beyond_if:
333 set_binfmt(&aout_format); 322 set_binfmt(&aout_format);
334 323
335 retval = set_brk(current->mm->start_brk, current->mm->brk); 324 retval = set_brk(current->mm->start_brk, current->mm->brk);
336 if (retval < 0) { 325 if (retval < 0)
337 send_sig(SIGKILL, current, 0);
338 return retval; 326 return retval;
339 }
340 327
341 current->mm->start_stack = 328 current->mm->start_stack =
342 (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); 329 (unsigned long) create_aout_tables((char __user *) bprm->p, bprm);
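
All of the send_sig(SIGKILL) removals in this and the following binfmt loaders rest on the same contract: once a load_*_binary() handler fails past the point of no return, the exec core is responsible for terminating the task, so each loader can simply return the error. A compressed sketch of that division of labor (userspace stand-ins; in the kernel the fatal signal is delivered from fs/exec.c, not by the loader):

#include <errno.h>
#include <stdio.h>

/* Loader: just report failure, no signalling of its own. */
static int load_binary(int fail)
{
	if (fail)
		return -ENOEXEC;	/* was: send_sig(SIGKILL) + return */
	return 0;
}

/* Exec core: the single place that decides what failure means. */
static void search_binary_handler(int fail)
{
	int retval = load_binary(fail);

	if (retval < 0)
		printf("exec core kills the task: %d\n", retval);
	else
		printf("exec succeeded\n");
}

int main(void)
{
	search_binary_handler(0);
	search_binary_handler(1);
	return 0;
}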
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3892c1a23241..d8fc0605b9d2 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -738,10 +738,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
738 change some of these later */ 738 change some of these later */
739 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), 739 retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
740 executable_stack); 740 executable_stack);
741 if (retval < 0) { 741 if (retval < 0)
742 send_sig(SIGKILL, current, 0);
743 goto out_free_dentry; 742 goto out_free_dentry;
744 }
745 743
746 current->mm->start_stack = bprm->p; 744 current->mm->start_stack = bprm->p;
747 745
@@ -763,10 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
763 and clear the area. */ 761 and clear the area. */
764 retval = set_brk(elf_bss + load_bias, 762 retval = set_brk(elf_bss + load_bias,
765 elf_brk + load_bias); 763 elf_brk + load_bias);
766 if (retval) { 764 if (retval)
767 send_sig(SIGKILL, current, 0);
768 goto out_free_dentry; 765 goto out_free_dentry;
769 }
770 nbyte = ELF_PAGEOFFSET(elf_bss); 766 nbyte = ELF_PAGEOFFSET(elf_bss);
771 if (nbyte) { 767 if (nbyte) {
772 nbyte = ELF_MIN_ALIGN - nbyte; 768 nbyte = ELF_MIN_ALIGN - nbyte;
@@ -820,7 +816,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
820 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, 816 error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
821 elf_prot, elf_flags, 0); 817 elf_prot, elf_flags, 0);
822 if (BAD_ADDR(error)) { 818 if (BAD_ADDR(error)) {
823 send_sig(SIGKILL, current, 0);
824 retval = IS_ERR((void *)error) ? 819 retval = IS_ERR((void *)error) ?
825 PTR_ERR((void*)error) : -EINVAL; 820 PTR_ERR((void*)error) : -EINVAL;
826 goto out_free_dentry; 821 goto out_free_dentry;
@@ -851,7 +846,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
851 elf_ppnt->p_memsz > TASK_SIZE || 846 elf_ppnt->p_memsz > TASK_SIZE ||
852 TASK_SIZE - elf_ppnt->p_memsz < k) { 847 TASK_SIZE - elf_ppnt->p_memsz < k) {
853 /* set_brk can never work. Avoid overflows. */ 848 /* set_brk can never work. Avoid overflows. */
854 send_sig(SIGKILL, current, 0);
855 retval = -EINVAL; 849 retval = -EINVAL;
856 goto out_free_dentry; 850 goto out_free_dentry;
857 } 851 }
@@ -883,12 +877,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
883 * up getting placed where the bss needs to go. 877 * up getting placed where the bss needs to go.
884 */ 878 */
885 retval = set_brk(elf_bss, elf_brk); 879 retval = set_brk(elf_bss, elf_brk);
886 if (retval) { 880 if (retval)
887 send_sig(SIGKILL, current, 0);
888 goto out_free_dentry; 881 goto out_free_dentry;
889 }
890 if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { 882 if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) {
891 send_sig(SIGSEGV, current, 0);
892 retval = -EFAULT; /* Nobody gets to see this, but.. */ 883 retval = -EFAULT; /* Nobody gets to see this, but.. */
893 goto out_free_dentry; 884 goto out_free_dentry;
894 } 885 }
@@ -909,7 +900,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
909 elf_entry += loc->interp_elf_ex.e_entry; 900 elf_entry += loc->interp_elf_ex.e_entry;
910 } 901 }
911 if (BAD_ADDR(elf_entry)) { 902 if (BAD_ADDR(elf_entry)) {
912 force_sig(SIGSEGV, current);
913 retval = IS_ERR((void *)elf_entry) ? 903 retval = IS_ERR((void *)elf_entry) ?
914 (int)elf_entry : -EINVAL; 904 (int)elf_entry : -EINVAL;
915 goto out_free_dentry; 905 goto out_free_dentry;
@@ -922,7 +912,6 @@ static int load_elf_binary(struct linux_binprm *bprm)
922 } else { 912 } else {
923 elf_entry = loc->elf_ex.e_entry; 913 elf_entry = loc->elf_ex.e_entry;
924 if (BAD_ADDR(elf_entry)) { 914 if (BAD_ADDR(elf_entry)) {
925 force_sig(SIGSEGV, current);
926 retval = -EINVAL; 915 retval = -EINVAL;
927 goto out_free_dentry; 916 goto out_free_dentry;
928 } 917 }
@@ -934,19 +923,15 @@ static int load_elf_binary(struct linux_binprm *bprm)
934 923
935#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES 924#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
936 retval = arch_setup_additional_pages(bprm, !!elf_interpreter); 925 retval = arch_setup_additional_pages(bprm, !!elf_interpreter);
937 if (retval < 0) { 926 if (retval < 0)
938 send_sig(SIGKILL, current, 0);
939 goto out; 927 goto out;
940 }
941#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ 928#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
942 929
943 install_exec_creds(bprm); 930 install_exec_creds(bprm);
944 retval = create_elf_tables(bprm, &loc->elf_ex, 931 retval = create_elf_tables(bprm, &loc->elf_ex,
945 load_addr, interp_load_addr); 932 load_addr, interp_load_addr);
946 if (retval < 0) { 933 if (retval < 0)
947 send_sig(SIGKILL, current, 0);
948 goto out; 934 goto out;
949 }
950 /* N.B. passed_fileno might not be initialized? */ 935 /* N.B. passed_fileno might not be initialized? */
951 current->mm->end_code = end_code; 936 current->mm->end_code = end_code;
952 current->mm->start_code = start_code; 937 current->mm->start_code = start_code;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fe2a643ee005..d3634bfb7fe1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -317,8 +317,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 		goto error;
 
 	/* there's now no turning back... the old userspace image is dead,
-	 * defunct, deceased, etc. after this point we have to exit via
-	 * error_kill */
+	 * defunct, deceased, etc.
+	 */
 	set_personality(PER_LINUX_FDPIC);
 	if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
@@ -343,24 +343,22 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 
 	retval = setup_arg_pages(bprm, current->mm->start_stack,
 				 executable_stack);
-	if (retval < 0) {
-		send_sig(SIGKILL, current, 0);
-		goto error_kill;
-	}
+	if (retval < 0)
+		goto error;
 #endif
 
 	/* load the executable and interpreter into memory */
 	retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm,
 				    "executable");
 	if (retval < 0)
-		goto error_kill;
+		goto error;
 
 	if (interpreter_name) {
 		retval = elf_fdpic_map_file(&interp_params, interpreter,
 					    current->mm, "interpreter");
 		if (retval < 0) {
 			printk(KERN_ERR "Unable to load interpreter\n");
-			goto error_kill;
+			goto error;
 		}
 
 		allow_write_access(interpreter);
@@ -397,7 +395,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	if (IS_ERR_VALUE(current->mm->start_brk)) {
 		retval = current->mm->start_brk;
 		current->mm->start_brk = 0;
-		goto error_kill;
+		goto error;
 	}
 
 	current->mm->brk = current->mm->start_brk;
@@ -410,7 +408,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	install_exec_creds(bprm);
 	if (create_elf_fdpic_tables(bprm, current->mm,
 				    &exec_params, &interp_params) < 0)
-		goto error_kill;
+		goto error;
 
 	kdebug("- start_code %lx", current->mm->start_code);
 	kdebug("- end_code %lx", current->mm->end_code);
@@ -449,12 +447,6 @@ error:
 	kfree(interp_params.phdrs);
 	kfree(interp_params.loadmap);
 	return retval;
-
-	/* unrecoverable error - kill the process */
-error_kill:
-	send_sig(SIGSEGV, current, 0);
-	goto error;
-
 }
 
 /*****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index b60500300dd7..fd8beb9657a2 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -62,7 +62,22 @@ static struct file_system_type bm_fs_type;
 static struct vfsmount *bm_mnt;
 static int entry_count;
 
 /*
+ * Max length of the register string. Determined by:
+ * - 7 delimiters
+ * - name: ~50 bytes
+ * - type: 1 byte
+ * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
+ * - magic: 128 bytes (512 in escaped form)
+ * - mask: 128 bytes (512 in escaped form)
+ * - interp: ~50 bytes
+ * - flags: 5 bytes
+ * Round that up a bit, and then back off to hold the internal data
+ * (like struct Node).
+ */
+#define MAX_REGISTER_LENGTH 1920
+
+/*
  * Check if we support the binfmt
  * if we do, return the node, else NULL
  * locking is done in load_misc_binary
@@ -279,7 +294,7 @@ static Node *create_entry(const char __user *buffer, size_t count)
 
 	/* some sanity checks */
 	err = -EINVAL;
-	if ((count < 11) || (count > 256))
+	if ((count < 11) || (count > MAX_REGISTER_LENGTH))
 		goto out;
 
 	err = -ENOMEM;
@@ -396,12 +411,12 @@ static int parse_command(const char __user *buffer, size_t count)
 {
 	char s[4];
 
-	if (!count)
-		return 0;
 	if (count > 3)
 		return -EINVAL;
 	if (copy_from_user(s, buffer, count))
 		return -EFAULT;
+	if (!count)
+		return 0;
 	if (s[count-1] == '\n')
 		count--;
 	if (count == 1 && s[0] == '0')
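The new comment itemises where the 1920-byte cap comes from. Summing the listed worst cases gives roughly 1140 bytes, and 1920 reads as 2048 (a round allocation size) minus headroom for the in-kernel bookkeeping the comment mentions; the 2048 - 128 split below is an assumption of this note, not stated in the patch:

    #include <stdio.h>

    int main(void)
    {
        int worst = 7      /* delimiters */
                  + 50     /* name */
                  + 1      /* type */
                  + 3      /* offset */
                  + 512    /* magic, escaped */
                  + 512    /* mask, escaped */
                  + 50     /* interp */
                  + 5;     /* flags */

        printf("itemised worst case: %d bytes\n", worst);  /* 1140 */
        printf("cap: %d bytes\n", 2048 - 128);             /* 1920 */
        return 0;
    }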
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6d7274619bf9..cc9d4114cda0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,32 +50,22 @@ inline struct block_device *I_BDEV(struct inode *inode)
 EXPORT_SYMBOL(I_BDEV);
 
 /*
- * Move the inode from its current bdi to a new bdi. If the inode is dirty we
- * need to move it onto the dirty list of @dst so that the inode is always on
- * the right list.
+ * Move the inode from its current bdi to a new bdi. Make sure the inode
+ * is clean before moving so that it doesn't linger on the old bdi.
  */
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	struct backing_dev_info *old = inode->i_data.backing_dev_info;
-	bool wakeup_bdi = false;
-
-	if (unlikely(dst == old))		/* deadlock avoidance */
-		return;
-	bdi_lock_two(&old->wb, &dst->wb);
-	spin_lock(&inode->i_lock);
-	inode->i_data.backing_dev_info = dst;
-	if (inode->i_state & I_DIRTY) {
-		if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb))
-			wakeup_bdi = true;
-		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
+	while (true) {
+		spin_lock(&inode->i_lock);
+		if (!(inode->i_state & I_DIRTY)) {
+			inode->i_data.backing_dev_info = dst;
+			spin_unlock(&inode->i_lock);
+			return;
+		}
+		spin_unlock(&inode->i_lock);
+		WARN_ON_ONCE(write_inode_now(inode, true));
 	}
-	spin_unlock(&inode->i_lock);
-	spin_unlock(&old->wb.list_lock);
-	spin_unlock(&dst->wb.list_lock);
-
-	if (wakeup_bdi)
-		bdi_wakeup_thread_delayed(dst);
 }
 
 /* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -304,6 +294,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1173,8 +1169,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
 				bdi = blk_get_backing_dev_info(bdev);
-				if (bdi == NULL)
-					bdi = &default_backing_dev_info;
 				bdev_inode_switch_bdi(bdev->bd_inode, bdi);
 			}
 
@@ -1622,6 +1616,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
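The rewritten bdev_inode_switch_bdi() is an instance of a common shape: the condition ("inode is clean") must be tested under a spinlock, but making it true (writeback) can sleep, so the lock is dropped, the slow step runs, and the loop retries. A self-contained sketch of that shape, with a pthread mutex standing in for i_lock and a flag for I_DIRTY (all names hypothetical):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t lock;
        bool dirty;
        int owner;
    };

    /* stand-in for write_inode_now(): slow, must not run under the lock */
    static void make_clean(struct obj *o)
    {
        pthread_mutex_lock(&o->lock);
        o->dirty = false;
        pthread_mutex_unlock(&o->lock);
    }

    static void switch_owner(struct obj *o, int new_owner)
    {
        while (true) {
            pthread_mutex_lock(&o->lock);
            if (!o->dirty) {               /* switch only while clean */
                o->owner = new_owner;
                pthread_mutex_unlock(&o->lock);
                return;
            }
            pthread_mutex_unlock(&o->lock);
            make_clean(o);                 /* slow path, lock dropped */
        }
    }

    int main(void)
    {
        struct obj o = { PTHREAD_MUTEX_INITIALIZER, true, 1 };
        switch_owner(&o, 2);
        printf("owner=%d dirty=%d\n", o.owner, o.dirty);
        return 0;
    }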
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5a201d81049c..4dabeb893b7c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -22,7 +22,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/freezer.h>
-#include <linux/workqueue.h>
 #include "async-thread.h"
 #include "ctree.h"
 
@@ -55,13 +54,45 @@ struct btrfs_workqueue {
 	struct __btrfs_workqueue *high;
 };
 
-static inline struct __btrfs_workqueue
-*__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
+static void normal_work_helper(struct btrfs_work *work);
+
+#define BTRFS_WORK_HELPER(name)						\
+void btrfs_##name(struct work_struct *arg)				\
+{									\
+	struct btrfs_work *work = container_of(arg, struct btrfs_work,	\
+					       normal_work);		\
+	normal_work_helper(work);					\
+}
+
+BTRFS_WORK_HELPER(worker_helper);
+BTRFS_WORK_HELPER(delalloc_helper);
+BTRFS_WORK_HELPER(flush_delalloc_helper);
+BTRFS_WORK_HELPER(cache_helper);
+BTRFS_WORK_HELPER(submit_helper);
+BTRFS_WORK_HELPER(fixup_helper);
+BTRFS_WORK_HELPER(endio_helper);
+BTRFS_WORK_HELPER(endio_meta_helper);
+BTRFS_WORK_HELPER(endio_meta_write_helper);
+BTRFS_WORK_HELPER(endio_raid56_helper);
+BTRFS_WORK_HELPER(endio_repair_helper);
+BTRFS_WORK_HELPER(rmw_helper);
+BTRFS_WORK_HELPER(endio_write_helper);
+BTRFS_WORK_HELPER(freespace_write_helper);
+BTRFS_WORK_HELPER(delayed_meta_helper);
+BTRFS_WORK_HELPER(readahead_helper);
+BTRFS_WORK_HELPER(qgroup_rescan_helper);
+BTRFS_WORK_HELPER(extent_refs_helper);
+BTRFS_WORK_HELPER(scrub_helper);
+BTRFS_WORK_HELPER(scrubwrc_helper);
+BTRFS_WORK_HELPER(scrubnc_helper);
+
+static struct __btrfs_workqueue *
+__btrfs_alloc_workqueue(const char *name, int flags, int max_active,
 			 int thresh)
 {
 	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->max_active = max_active;
@@ -85,7 +116,7 @@ static inline struct __btrfs_workqueue
 	ret->normal_wq = alloc_workqueue("%s-%s", flags,
 					 ret->max_active, "btrfs",
 					 name);
-	if (unlikely(!ret->normal_wq)) {
+	if (!ret->normal_wq) {
 		kfree(ret);
 		return NULL;
 	}
@@ -107,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 {
 	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
 
-	if (unlikely(!ret))
+	if (!ret)
 		return NULL;
 
 	ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
 					      max_active, thresh);
-	if (unlikely(!ret->normal)) {
+	if (!ret->normal) {
 		kfree(ret);
 		return NULL;
 	}
@@ -120,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 	if (flags & WQ_HIGHPRI) {
 		ret->high = __btrfs_alloc_workqueue(name, flags, max_active,
 						    thresh);
-		if (unlikely(!ret->high)) {
+		if (!ret->high) {
 			__btrfs_destroy_workqueue(ret->normal);
 			kfree(ret);
 			return NULL;
@@ -232,13 +263,11 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
 	spin_unlock_irqrestore(lock, flags);
 }
 
-static void normal_work_helper(struct work_struct *arg)
+static void normal_work_helper(struct btrfs_work *work)
 {
-	struct btrfs_work *work;
 	struct __btrfs_workqueue *wq;
 	int need_order = 0;
 
-	work = container_of(arg, struct btrfs_work, normal_work);
 	/*
 	 * We should not touch things inside work in the following cases:
 	 * 1) after work->func() if it has no ordered_free
@@ -262,7 +291,7 @@ static void normal_work_helper(struct work_struct *arg)
 	trace_btrfs_all_work_done(work);
 }
 
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
 		     btrfs_func_t func,
 		     btrfs_func_t ordered_func,
 		     btrfs_func_t ordered_free)
@@ -270,7 +299,7 @@ void btrfs_init_work(struct btrfs_work *work,
 	work->func = func;
 	work->ordered_func = ordered_func;
 	work->ordered_free = ordered_free;
-	INIT_WORK(&work->normal_work, normal_work_helper);
+	INIT_WORK(&work->normal_work, uniq_func);
 	INIT_LIST_HEAD(&work->ordered_list);
 	work->flags = 0;
 }
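BTRFS_WORK_HELPER stamps out one trivial wrapper per queue, so each work type gets a distinct function address while every wrapper funnels into the shared normal_work_helper() via container_of(). The apparent benefit is that anything keyed on the work function pointer can now tell the queues apart. A standalone illustration of the trick, using demo types rather than the kernel structures:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work_struct { int pending; };

    struct demo_work {
        const char *name;
        struct work_struct normal_work;
    };

    static void normal_work_helper(struct demo_work *work)
    {
        printf("running %s work\n", work->name);
    }

    /* one wrapper per work type, all sharing the same body */
    #define WORK_HELPER(helper)                                   \
    static void helper(struct work_struct *arg)                   \
    {                                                             \
        struct demo_work *work =                                  \
            container_of(arg, struct demo_work, normal_work);     \
        normal_work_helper(work);                                 \
    }

    WORK_HELPER(endio_helper)
    WORK_HELPER(delalloc_helper)

    int main(void)
    {
        struct demo_work e = { .name = "endio" };
        struct demo_work d = { .name = "delalloc" };

        endio_helper(&e.normal_work);     /* distinct address ... */
        delalloc_helper(&d.normal_work);  /* ... per helper */
        return 0;
    }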
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 9c6b66d15fb0..e386c29ef1f6 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -19,12 +19,14 @@
 
 #ifndef __BTRFS_ASYNC_THREAD_
 #define __BTRFS_ASYNC_THREAD_
+#include <linux/workqueue.h>
 
 struct btrfs_workqueue;
 /* Internal use only */
 struct __btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
+typedef void (*btrfs_work_func_t)(struct work_struct *arg);
 
 struct btrfs_work {
 	btrfs_func_t func;
@@ -38,11 +40,36 @@ struct btrfs_work {
 	unsigned long flags;
 };
 
+#define BTRFS_WORK_HELPER_PROTO(name)					\
+void btrfs_##name(struct work_struct *arg)
+
+BTRFS_WORK_HELPER_PROTO(worker_helper);
+BTRFS_WORK_HELPER_PROTO(delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
+BTRFS_WORK_HELPER_PROTO(cache_helper);
+BTRFS_WORK_HELPER_PROTO(submit_helper);
+BTRFS_WORK_HELPER_PROTO(fixup_helper);
+BTRFS_WORK_HELPER_PROTO(endio_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
+BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
+BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
+BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
+BTRFS_WORK_HELPER_PROTO(rmw_helper);
+BTRFS_WORK_HELPER_PROTO(endio_write_helper);
+BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
+BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
+BTRFS_WORK_HELPER_PROTO(readahead_helper);
+BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
+BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
+BTRFS_WORK_HELPER_PROTO(scrub_helper);
+BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
+BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
+
 struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 					      int flags,
 					      int max_active,
 					      int thresh);
-void btrfs_init_work(struct btrfs_work *work,
+void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
 		     btrfs_func_t func,
 		     btrfs_func_t ordered_func,
 		     btrfs_func_t ordered_free);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e25564bfcb46..2d3e32ebfd15 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -25,6 +25,9 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+/* Just an arbitrary number so we can be sure this happened */
+#define BACKREF_FOUND_SHARED 6
+
 struct extent_inode_elem {
 	u64 inum;
 	u64 offset;
@@ -276,9 +279,8 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 		}
 		if (ret > 0)
 			goto next;
-		ret = ulist_add_merge(parents, eb->start,
-				      (uintptr_t)eie,
-				      (u64 *)&old, GFP_NOFS);
+		ret = ulist_add_merge_ptr(parents, eb->start,
+					  eie, (void **)&old, GFP_NOFS);
 		if (ret < 0)
 			break;
 		if (!ret && extent_item_pos) {
@@ -378,7 +380,8 @@ out:
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
-				   const u64 *extent_item_pos, u64 total_refs)
+				   const u64 *extent_item_pos, u64 total_refs,
+				   u64 root_objectid)
 {
 	int err;
 	int ret = 0;
@@ -403,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		if (ref->count == 0)
 			continue;
+		if (root_objectid && ref->root_id != root_objectid) {
+			ret = BACKREF_FOUND_SHARED;
+			goto out;
+		}
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
 					     parents, extent_item_pos,
 					     total_refs);
@@ -483,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 			continue;
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
-				     fs_info->tree_root->leafsize, 0);
+				     0);
 		if (!eb || !extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
@@ -562,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode)
 * smaller or equal that seq to the list
 */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-			      struct list_head *prefs, u64 *total_refs)
+			      struct list_head *prefs, u64 *total_refs,
+			      u64 inum)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	struct rb_node *n = &head->node.rb_node;
@@ -626,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			key.objectid = ref->objectid;
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = ref->offset;
+
+			/*
+			 * Found a inum that doesn't match our known inum, we
+			 * know it's shared.
+			 */
+			if (inum && ref->objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
@@ -660,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
 			     int *info_level, struct list_head *prefs,
-			     u64 *total_refs)
+			     u64 *total_refs, u64 inum)
 {
 	int ret = 0;
 	int slot;
@@ -745,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 							      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -766,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
-			    int info_level, struct list_head *prefs)
+			    int info_level, struct list_head *prefs, u64 inum)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
@@ -828,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 							      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+
+			if (inum && key.objectid != inum) {
+				ret = BACKREF_FOUND_SHARED;
+				break;
+			}
+
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
@@ -855,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
-			     struct ulist *roots, const u64 *extent_item_pos)
+			     struct ulist *roots, const u64 *extent_item_pos,
+			     u64 root_objectid, u64 inum)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
@@ -930,7 +961,8 @@ again:
 			}
 			spin_unlock(&delayed_refs->lock);
 			ret = __add_delayed_refs(head, time_seq,
-						 &prefs_delayed, &total_refs);
+						 &prefs_delayed, &total_refs,
+						 inum);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
@@ -952,11 +984,11 @@ again:
 		    key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs,
-						&total_refs);
+						&total_refs, inum);
 			if (ret)
 				goto out;
 			ret = __add_keyed_refs(fs_info, path, bytenr,
-					       info_level, &prefs);
+					       info_level, &prefs, inum);
 			if (ret)
 				goto out;
 		}
@@ -972,7 +1004,8 @@ again:
 	__merge_refs(&prefs, 1);
 
 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
-				      extent_item_pos, total_refs);
+				      extent_item_pos, total_refs,
+				      root_objectid);
 	if (ret)
 		goto out;
 
@@ -982,6 +1015,11 @@ again:
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
+			if (root_objectid && ref->root_id != root_objectid) {
+				ret = BACKREF_FOUND_SHARED;
+				goto out;
+			}
+
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
@@ -990,27 +1028,28 @@ again:
 		if (ref->count && ref->parent) {
 			if (extent_item_pos && !ref->inode_list &&
 			    ref->level == 0) {
-				u32 bsz;
 				struct extent_buffer *eb;
-				bsz = btrfs_level_size(fs_info->extent_root,
-						       ref->level);
+
 				eb = read_tree_block(fs_info->extent_root,
-						     ref->parent, bsz, 0);
+						     ref->parent, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
 					goto out;
 				}
+				btrfs_tree_read_lock(eb);
+				btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie);
+				btrfs_tree_read_unlock_blocking(eb);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
 				ref->inode_list = eie;
 			}
-			ret = ulist_add_merge(refs, ref->parent,
-					      (uintptr_t)ref->inode_list,
-					      (u64 *)&eie, GFP_NOFS);
+			ret = ulist_add_merge_ptr(refs, ref->parent,
+						  ref->inode_list,
+						  (void **)&eie, GFP_NOFS);
 			if (ret < 0)
 				goto out;
 			if (!ret && extent_item_pos) {
@@ -1085,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	ret = find_parent_nodes(trans, fs_info, bytenr,
-				time_seq, *leafs, NULL, extent_item_pos);
+				time_seq, *leafs, NULL, extent_item_pos, 0, 0);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		return ret;
@@ -1128,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		ret = find_parent_nodes(trans, fs_info, bytenr,
-					time_seq, tmp, *roots, NULL);
+					time_seq, tmp, *roots, NULL, 0, 0);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
@@ -1159,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr)
+{
+	struct ulist *tmp = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_iterator uiter;
+	struct ulist_node *node;
+	struct seq_list elem = {};
+	int ret = 0;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	roots = ulist_alloc(GFP_NOFS);
+	if (!tmp || !roots) {
+		ulist_free(tmp);
+		ulist_free(roots);
+		return -ENOMEM;
+	}
+
+	if (trans)
+		btrfs_get_tree_mod_seq(fs_info, &elem);
+	else
+		down_read(&fs_info->commit_root_sem);
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
+					roots, NULL, root_objectid, inum);
+		if (ret == BACKREF_FOUND_SHARED) {
+			ret = 1;
+			break;
+		}
+		if (ret < 0 && ret != -ENOENT)
+			break;
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+		cond_resched();
+	}
+	if (trans)
+		btrfs_put_tree_mod_seq(fs_info, &elem);
+	else
+		up_read(&fs_info->commit_root_sem);
+	ulist_free(tmp);
+	ulist_free(roots);
+	return ret;
+}
+
 /*
 * this makes the path point to (inum INODE_ITEM ioff)
 */
@@ -1191,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 	unsigned long ptr;
 
 	key.objectid = inode_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = start_off;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1231,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 		ret = -ENOENT;
 		if (found_key.objectid != inode_objectid)
 			break;
-		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+		if (found_key.type != BTRFS_INODE_EXTREF_KEY)
 			break;
 
 		ret = 0;
@@ -1364,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	}
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
 	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
-		size = fs_info->extent_root->leafsize;
+		size = fs_info->extent_root->nodesize;
 	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
 		size = found_key->offset;
 
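Two things stand out in the backref.c hunks. First, the switch from ulist_add_merge() to ulist_add_merge_ptr() removes casts like (u64 *)&old, which read as a 32-bit safety fix: storing a u64 through that cast writes past a 4-byte pointer slot. A tiny demonstration of the size mismatch, in plain C with nothing btrfs-specific:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        void *slot = NULL;

        printf("sizeof(void *)=%zu sizeof(uint64_t)=%zu\n",
               sizeof(void *), sizeof(uint64_t));

        /* the old pattern was morally: *(uint64_t *)&slot = value;
         * on a 32-bit build that writes 8 bytes into a 4-byte object */
        if (sizeof(void *) < sizeof(uint64_t))
            puts("(u64 *)&ptr would overflow the pointer slot here");
        else
            puts("sizes match on this build, but not on 32-bit ones");

        (void)slot;
        return 0;
    }

Second, the new btrfs_check_shared() walks the backref graph and bails out with BACKREF_FOUND_SHARED (returned to callers as 1) as soon as any reference belongs to a different root or inode, so a caller can flag an extent as shared without resolving every reference.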
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 86fc20fec282..2a1ac6bfc724 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
+int btrfs_check_shared(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 root_objectid,
+		       u64 inum, u64 bytenr);
 
 int __init btrfs_prelim_ref_init(void);
 void btrfs_prelim_ref_exit(void);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4794923c410c..4aadadcfab20 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,17 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS			11
+/*
+ * The following 3 bits are meant only for the btree inode.
+ * When any of them is set, it means an error happened while writing an
+ * extent buffer belonging to:
+ * 1) a non-log btree
+ * 2) a log btree and first log sub-transaction
+ * 3) a log btree and second log sub-transaction
+ */
+#define BTRFS_INODE_BTREE_ERR			12
+#define BTRFS_INODE_BTREE_LOG1_ERR		13
+#define BTRFS_INODE_BTREE_LOG2_ERR		14
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -84,12 +95,6 @@ struct btrfs_inode {
 	 */
 	struct list_head delalloc_inodes;
 
-	/*
-	 * list for tracking inodes that must be sent to disk before a
-	 * rename or truncate commit
-	 */
-	struct list_head ordered_operations;
-
 	/* node for the red-black tree that links inodes in subvolume root */
 	struct rb_node rb_node;
 
@@ -127,6 +132,12 @@ struct btrfs_inode {
 	u64 delalloc_bytes;
 
 	/*
+	 * total number of bytes pending defrag, used by stat to check whether
+	 * it needs COW.
+	 */
+	u64 defrag_bytes;
+
+	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
 	 * because not all the blocks are written yet.
@@ -240,13 +251,25 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
 	    BTRFS_I(inode)->last_sub_trans <=
-	    BTRFS_I(inode)->root->last_log_commit)
-		return 1;
+	    BTRFS_I(inode)->root->last_log_commit) {
+		/*
+		 * After a ranged fsync we might have left some extent maps
+		 * (that fall outside the fsync's range). So return false
+		 * here if the list isn't empty, to make sure btrfs_log_inode()
+		 * will be called and process those extent maps.
+		 */
+		smp_mb();
+		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+			return 1;
+	}
 	return 0;
 }
 
+#define BTRFS_DIO_ORIG_BIO_SUBMITTED	0x1
+
 struct btrfs_dio_private {
 	struct inode *inode;
+	unsigned long flags;
 	u64 logical_offset;
 	u64 disk_bytenr;
 	u64 bytes;
@@ -263,7 +286,12 @@ struct btrfs_dio_private {
 
 	/* dio_bio came from fs/direct-io.c */
 	struct bio *dio_bio;
-	u8 csum[0];
+
+	/*
+	 * The original bio may be splited to several sub-bios, this is
+	 * done during endio of sub-bios
+	 */
+	int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int);
 };
 
 /*
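How the three new btree-inode bits get selected is only implied by the comment's three cases; the call sites live elsewhere in this series. A sketch of the likely dispatch, where the log_index convention (-1 for non-log buffers, 0/1 for the two log sub-transactions) is an assumption of this note:

    #include <stdio.h>

    enum { BTREE_ERR = 12, BTREE_LOG1_ERR = 13, BTREE_LOG2_ERR = 14 };

    /* assumed convention: -1 = not a log buffer, 0/1 = log sub-transaction */
    static int writeback_error_bit(int log_index)
    {
        switch (log_index) {
        case -1: return BTREE_ERR;
        case 0:  return BTREE_LOG1_ERR;
        default: return BTREE_LOG2_ERR;
        }
    }

    int main(void)
    {
        printf("%d %d %d\n", writeback_error_bit(-1),
               writeback_error_bit(0), writeback_error_bit(1));
        return 0;
    }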
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index ce92ae30250f..cb7f3fe9c9f6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
 	/* super block bytenr is always the unmapped device bytenr */
 	dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+	if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes)
 		return -1;
 	bh = __bread(superblock_bdev, dev_bytenr / 4096,
 		     BTRFS_SUPER_INFO_SIZE);
@@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror(
 	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
-	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
 	    btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
 		brelse(bh);
 		return 0;
@@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data(
 
 	while (len > 0) {
 		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
-		BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
-			    PAGE_CACHE_SHIFT);
+		BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE));
 		kaddr = block_ctx->datav[i];
 		memcpy(dst, kaddr + offset_in_page, cur);
 
@@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root,
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
 
-	if (root->nodesize != root->leafsize) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
-		       root->nodesize, root->leafsize);
-		return -1;
-	}
 	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
 		       root->nodesize, PAGE_CACHE_SIZE);
 		return -1;
 	}
-	if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
-		printk(KERN_INFO
-		       "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->leafsize, PAGE_CACHE_SIZE);
-		return -1;
-	}
 	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1daea0b47187..d3220d31d3cb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	return sizeof(struct compressed_bio) +
-		((disk_size + root->sectorsize - 1) / root->sectorsize) *
-		csum_size;
+		(DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size;
 }
 
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
@@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			 * freed before we're done setting it up
 			 */
 			atomic_inc(&cb->pending_bios);
-			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+			ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
 
 			if (!skip_sum) {
@@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	}
 	bio_get(bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
@@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	cb->compress_type = extent_compress_type(bio_flags);
 	cb->orig_bio = bio;
 
-	nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
-				 PAGE_CACHE_SIZE;
+	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE);
 	cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
 				       GFP_NOFS);
 	if (!cb->compressed_pages)
@@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		    PAGE_CACHE_SIZE) {
 			bio_get(comp_bio);
 
-			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+						  BTRFS_WQ_ENDIO_DATA);
 			BUG_ON(ret); /* -ENOMEM */
 
 			/*
@@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 						     comp_bio, sums);
 				BUG_ON(ret); /* -ENOMEM */
 			}
-			sums += (comp_bio->bi_iter.bi_size +
-				 root->sectorsize - 1) / root->sectorsize;
+			sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
+					     root->sectorsize);
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
@@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	}
 	bio_get(comp_bio);
 
-	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
+	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio,
+				  BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index aeab453b8e24..19bc6162fb8e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
258 else 258 else
259 btrfs_node_key(buf, &disk_key, 0); 259 btrfs_node_key(buf, &disk_key, 0);
260 260
261 cow = btrfs_alloc_free_block(trans, root, buf->len, 0, 261 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
262 new_root_objectid, &disk_key, level, 262 &disk_key, level, buf->start, 0);
263 buf->start, 0);
264 if (IS_ERR(cow)) 263 if (IS_ERR(cow))
265 return PTR_ERR(cow); 264 return PTR_ERR(cow);
266 265
@@ -280,9 +279,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
280 279
281 WARN_ON(btrfs_header_generation(buf) > trans->transid); 280 WARN_ON(btrfs_header_generation(buf) > trans->transid);
282 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 281 if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
283 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 282 ret = btrfs_inc_ref(trans, root, cow, 1);
284 else 283 else
285 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 284 ret = btrfs_inc_ref(trans, root, cow, 0);
286 285
287 if (ret) 286 if (ret)
288 return ret; 287 return ret;
@@ -1035,14 +1034,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1035 if ((owner == root->root_key.objectid || 1034 if ((owner == root->root_key.objectid ||
1036 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && 1035 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
1037 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 1036 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
1038 ret = btrfs_inc_ref(trans, root, buf, 1, 1); 1037 ret = btrfs_inc_ref(trans, root, buf, 1);
1039 BUG_ON(ret); /* -ENOMEM */ 1038 BUG_ON(ret); /* -ENOMEM */
1040 1039
1041 if (root->root_key.objectid == 1040 if (root->root_key.objectid ==
1042 BTRFS_TREE_RELOC_OBJECTID) { 1041 BTRFS_TREE_RELOC_OBJECTID) {
1043 ret = btrfs_dec_ref(trans, root, buf, 0, 1); 1042 ret = btrfs_dec_ref(trans, root, buf, 0);
1044 BUG_ON(ret); /* -ENOMEM */ 1043 BUG_ON(ret); /* -ENOMEM */
1045 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1044 ret = btrfs_inc_ref(trans, root, cow, 1);
1046 BUG_ON(ret); /* -ENOMEM */ 1045 BUG_ON(ret); /* -ENOMEM */
1047 } 1046 }
1048 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; 1047 new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -1050,9 +1049,9 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1050 1049
1051 if (root->root_key.objectid == 1050 if (root->root_key.objectid ==
1052 BTRFS_TREE_RELOC_OBJECTID) 1051 BTRFS_TREE_RELOC_OBJECTID)
1053 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1052 ret = btrfs_inc_ref(trans, root, cow, 1);
1054 else 1053 else
1055 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1054 ret = btrfs_inc_ref(trans, root, cow, 0);
1056 BUG_ON(ret); /* -ENOMEM */ 1055 BUG_ON(ret); /* -ENOMEM */
1057 } 1056 }
1058 if (new_flags != 0) { 1057 if (new_flags != 0) {
@@ -1069,11 +1068,11 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1069 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 1068 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
1070 if (root->root_key.objectid == 1069 if (root->root_key.objectid ==
1071 BTRFS_TREE_RELOC_OBJECTID) 1070 BTRFS_TREE_RELOC_OBJECTID)
1072 ret = btrfs_inc_ref(trans, root, cow, 1, 1); 1071 ret = btrfs_inc_ref(trans, root, cow, 1);
1073 else 1072 else
1074 ret = btrfs_inc_ref(trans, root, cow, 0, 1); 1073 ret = btrfs_inc_ref(trans, root, cow, 0);
1075 BUG_ON(ret); /* -ENOMEM */ 1074 BUG_ON(ret); /* -ENOMEM */
1076 ret = btrfs_dec_ref(trans, root, buf, 1, 1); 1075 ret = btrfs_dec_ref(trans, root, buf, 1);
1077 BUG_ON(ret); /* -ENOMEM */ 1076 BUG_ON(ret); /* -ENOMEM */
1078 } 1077 }
1079 clean_tree_block(trans, root, buf); 1078 clean_tree_block(trans, root, buf);
@@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1133 } else 1132 } else
1134 parent_start = 0; 1133 parent_start = 0;
1135 1134
1136 cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, 1135 cow = btrfs_alloc_tree_block(trans, root, parent_start,
1137 root->root_key.objectid, &disk_key, 1136 root->root_key.objectid, &disk_key, level,
1138 level, search_start, empty_size); 1137 search_start, empty_size);
1139 if (IS_ERR(cow)) 1138 if (IS_ERR(cow))
1140 return PTR_ERR(cow); 1139 return PTR_ERR(cow);
1141 1140
@@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1425 struct tree_mod_root *old_root = NULL; 1424 struct tree_mod_root *old_root = NULL;
1426 u64 old_generation = 0; 1425 u64 old_generation = 0;
1427 u64 logical; 1426 u64 logical;
1428 u32 blocksize;
1429 1427
1430 eb_root = btrfs_read_lock_root_node(root); 1428 eb_root = btrfs_read_lock_root_node(root);
1431 tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); 1429 tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
@@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1444 if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1442 if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1445 btrfs_tree_read_unlock(eb_root); 1443 btrfs_tree_read_unlock(eb_root);
1446 free_extent_buffer(eb_root); 1444 free_extent_buffer(eb_root);
1447 blocksize = btrfs_level_size(root, old_root->level); 1445 old = read_tree_block(root, logical, 0);
1448 old = read_tree_block(root, logical, blocksize, 0);
1449 if (WARN_ON(!old || !extent_buffer_uptodate(old))) { 1446 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1450 free_extent_buffer(old); 1447 free_extent_buffer(old);
1451 btrfs_warn(root->fs_info, 1448 btrfs_warn(root->fs_info,
@@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
1506 struct btrfs_root *root, 1503 struct btrfs_root *root,
1507 struct extent_buffer *buf) 1504 struct extent_buffer *buf)
1508{ 1505{
1509#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1506 if (btrfs_test_is_dummy_root(root))
1510 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1511 return 0; 1507 return 0;
1512#endif 1508
1513 /* ensure we can see the force_cow */ 1509 /* ensure we can see the force_cow */
1514 smp_rmb(); 1510 smp_rmb();
1515 1511
@@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1651 WARN_ON(trans->transid != root->fs_info->generation); 1647 WARN_ON(trans->transid != root->fs_info->generation);
1652 1648
1653 parent_nritems = btrfs_header_nritems(parent); 1649 parent_nritems = btrfs_header_nritems(parent);
1654 blocksize = btrfs_level_size(root, parent_level - 1); 1650 blocksize = root->nodesize;
1655 end_slot = parent_nritems; 1651 end_slot = parent_nritems;
1656 1652
1657 if (parent_nritems == 1) 1653 if (parent_nritems == 1)
@@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1685 continue; 1681 continue;
1686 } 1682 }
1687 1683
1688 cur = btrfs_find_tree_block(root, blocknr, blocksize); 1684 cur = btrfs_find_tree_block(root, blocknr);
1689 if (cur) 1685 if (cur)
1690 uptodate = btrfs_buffer_uptodate(cur, gen, 0); 1686 uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1691 else 1687 else
1692 uptodate = 0; 1688 uptodate = 0;
1693 if (!cur || !uptodate) { 1689 if (!cur || !uptodate) {
1694 if (!cur) { 1690 if (!cur) {
1695 cur = read_tree_block(root, blocknr, 1691 cur = read_tree_block(root, blocknr, gen);
1696 blocksize, gen);
1697 if (!cur || !extent_buffer_uptodate(cur)) { 1692 if (!cur || !extent_buffer_uptodate(cur)) {
1698 free_extent_buffer(cur); 1693 free_extent_buffer(cur);
1699 return -EIO; 1694 return -EIO;
@@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1872 BUG_ON(level == 0); 1867 BUG_ON(level == 0);
1873 1868
1874 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), 1869 eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1875 btrfs_level_size(root, level - 1),
1876 btrfs_node_ptr_generation(parent, slot)); 1870 btrfs_node_ptr_generation(parent, slot));
1877 if (eb && !extent_buffer_uptodate(eb)) { 1871 if (eb && !extent_buffer_uptodate(eb)) {
1878 free_extent_buffer(eb); 1872 free_extent_buffer(eb);
@@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root,
2267 node = path->nodes[level]; 2261 node = path->nodes[level];
2268 2262
2269 search = btrfs_node_blockptr(node, slot); 2263 search = btrfs_node_blockptr(node, slot);
2270 blocksize = btrfs_level_size(root, level - 1); 2264 blocksize = root->nodesize;
2271 eb = btrfs_find_tree_block(root, search, blocksize); 2265 eb = btrfs_find_tree_block(root, search);
2272 if (eb) { 2266 if (eb) {
2273 free_extent_buffer(eb); 2267 free_extent_buffer(eb);
2274 return; 2268 return;
@@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root,
2298 if ((search <= target && target - search <= 65536) || 2292 if ((search <= target && target - search <= 65536) ||
2299 (search > target && search - target <= 65536)) { 2293 (search > target && search - target <= 65536)) {
2300 gen = btrfs_node_ptr_generation(node, nr); 2294 gen = btrfs_node_ptr_generation(node, nr);
2301 readahead_tree_block(root, search, blocksize, gen); 2295 readahead_tree_block(root, search, blocksize);
2302 nread += blocksize; 2296 nread += blocksize;
2303 } 2297 }
2304 nscan++; 2298 nscan++;
@@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2325 2319
2326 nritems = btrfs_header_nritems(parent); 2320 nritems = btrfs_header_nritems(parent);
2327 slot = path->slots[level + 1]; 2321 slot = path->slots[level + 1];
2328 blocksize = btrfs_level_size(root, level); 2322 blocksize = root->nodesize;
2329 2323
2330 if (slot > 0) { 2324 if (slot > 0) {
2331 block1 = btrfs_node_blockptr(parent, slot - 1); 2325 block1 = btrfs_node_blockptr(parent, slot - 1);
2332 gen = btrfs_node_ptr_generation(parent, slot - 1); 2326 gen = btrfs_node_ptr_generation(parent, slot - 1);
2333 eb = btrfs_find_tree_block(root, block1, blocksize); 2327 eb = btrfs_find_tree_block(root, block1);
2334 /* 2328 /*
2335 * if we get -eagain from btrfs_buffer_uptodate, we 2329 * if we get -eagain from btrfs_buffer_uptodate, we
2336 * don't want to return eagain here. That will loop 2330 * don't want to return eagain here. That will loop
@@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2343 if (slot + 1 < nritems) { 2337 if (slot + 1 < nritems) {
2344 block2 = btrfs_node_blockptr(parent, slot + 1); 2338 block2 = btrfs_node_blockptr(parent, slot + 1);
2345 gen = btrfs_node_ptr_generation(parent, slot + 1); 2339 gen = btrfs_node_ptr_generation(parent, slot + 1);
2346 eb = btrfs_find_tree_block(root, block2, blocksize); 2340 eb = btrfs_find_tree_block(root, block2);
2347 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) 2341 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2348 block2 = 0; 2342 block2 = 0;
2349 free_extent_buffer(eb); 2343 free_extent_buffer(eb);
2350 } 2344 }
2351 2345
2352 if (block1) 2346 if (block1)
2353 readahead_tree_block(root, block1, blocksize, 0); 2347 readahead_tree_block(root, block1, blocksize);
2354 if (block2) 2348 if (block2)
2355 readahead_tree_block(root, block2, blocksize, 0); 2349 readahead_tree_block(root, block2, blocksize);
2356} 2350}
2357 2351
2358 2352
@@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2454{ 2448{
2455 u64 blocknr; 2449 u64 blocknr;
2456 u64 gen; 2450 u64 gen;
2457 u32 blocksize;
2458 struct extent_buffer *b = *eb_ret; 2451 struct extent_buffer *b = *eb_ret;
2459 struct extent_buffer *tmp; 2452 struct extent_buffer *tmp;
2460 int ret; 2453 int ret;
2461 2454
2462 blocknr = btrfs_node_blockptr(b, slot); 2455 blocknr = btrfs_node_blockptr(b, slot);
2463 gen = btrfs_node_ptr_generation(b, slot); 2456 gen = btrfs_node_ptr_generation(b, slot);
2464 blocksize = btrfs_level_size(root, level - 1);
2465 2457
2466 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 2458 tmp = btrfs_find_tree_block(root, blocknr);
2467 if (tmp) { 2459 if (tmp) {
2468 /* first we do an atomic uptodate check */ 2460 /* first we do an atomic uptodate check */
2469 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { 2461 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
@@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
2507 btrfs_release_path(p); 2499 btrfs_release_path(p);
2508 2500
2509 ret = -EAGAIN; 2501 ret = -EAGAIN;
2510 tmp = read_tree_block(root, blocknr, blocksize, 0); 2502 tmp = read_tree_block(root, blocknr, 0);
2511 if (tmp) { 2503 if (tmp) {
2512 /* 2504 /*
2513 * If the read above didn't mark this buffer up to date, 2505 * If the read above didn't mark this buffer up to date,
@@ -2792,8 +2784,6 @@ again:
2792 if (!should_cow_block(trans, root, b)) 2784 if (!should_cow_block(trans, root, b))
2793 goto cow_done; 2785 goto cow_done;
2794 2786
2795 btrfs_set_path_blocking(p);
2796
2797 /* 2787 /*
2798 * must have write locks on this node and the 2788 * must have write locks on this node and the
2799 * parent 2789 * parent
@@ -2807,6 +2797,7 @@ again:
2807 goto again; 2797 goto again;
2808 } 2798 }
2809 2799
2800 btrfs_set_path_blocking(p);
2810 err = btrfs_cow_block(trans, root, b, 2801 err = btrfs_cow_block(trans, root, b,
2811 p->nodes[level + 1], 2802 p->nodes[level + 1],
2812 p->slots[level + 1], &b); 2803 p->slots[level + 1], &b);
@@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3362 else 3353 else
3363 btrfs_node_key(lower, &lower_key, 0); 3354 btrfs_node_key(lower, &lower_key, 0);
3364 3355
3365 c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3356 c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3366 root->root_key.objectid, &lower_key, 3357 &lower_key, level, root->node->start, 0);
3367 level, root->node->start, 0);
3368 if (IS_ERR(c)) 3358 if (IS_ERR(c))
3369 return PTR_ERR(c); 3359 return PTR_ERR(c);
3370 3360
@@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
3502 mid = (c_nritems + 1) / 2; 3492 mid = (c_nritems + 1) / 2;
3503 btrfs_node_key(c, &disk_key, mid); 3493 btrfs_node_key(c, &disk_key, mid);
3504 3494
3505 split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, 3495 split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3506 root->root_key.objectid, 3496 &disk_key, level, c->start, 0);
3507 &disk_key, level, c->start, 0);
3508 if (IS_ERR(split)) 3497 if (IS_ERR(split))
3509 return PTR_ERR(split); 3498 return PTR_ERR(split);
3510 3499
@@ -4282,13 +4271,12 @@ again:
4282 else 4271 else
4283 btrfs_item_key(l, &disk_key, mid); 4272 btrfs_item_key(l, &disk_key, mid);
4284 4273
4285 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 4274 right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
4286 root->root_key.objectid, 4275 &disk_key, 0, l->start, 0);
4287 &disk_key, 0, l->start, 0);
4288 if (IS_ERR(right)) 4276 if (IS_ERR(right))
4289 return PTR_ERR(right); 4277 return PTR_ERR(right);
4290 4278
4291 root_add_used(root, root->leafsize); 4279 root_add_used(root, root->nodesize);
4292 4280
4293 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 4281 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
4294 btrfs_set_header_bytenr(right, right->start); 4282 btrfs_set_header_bytenr(right, right->start);
@@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
4626 ptr = btrfs_item_ptr_offset(leaf, slot); 4614 ptr = btrfs_item_ptr_offset(leaf, slot);
4627 memmove_extent_buffer(leaf, ptr, 4615 memmove_extent_buffer(leaf, ptr,
4628 (unsigned long)fi, 4616 (unsigned long)fi,
4629 offsetof(struct btrfs_file_extent_item, 4617 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4630 disk_bytenr));
4631 } 4618 }
4632 } 4619 }
4633 4620
@@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4738 int slot; 4725 int slot;
4739 struct btrfs_map_token token; 4726 struct btrfs_map_token token;
4740 4727
4728 if (path->slots[0] == 0) {
4729 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4730 fixup_low_keys(root, path, &disk_key, 1);
4731 }
4732 btrfs_unlock_up_safe(path, 1);
4733
4741 btrfs_init_map_token(&token); 4734 btrfs_init_map_token(&token);
4742 4735
4743 leaf = path->nodes[0]; 4736 leaf = path->nodes[0];
@@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4798 } 4791 }
4799 4792
4800 btrfs_set_header_nritems(leaf, nritems + nr); 4793 btrfs_set_header_nritems(leaf, nritems + nr);
4801
4802 if (slot == 0) {
4803 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4804 fixup_low_keys(root, path, &disk_key, 1);
4805 }
4806 btrfs_unlock_up_safe(path, 1);
4807 btrfs_mark_buffer_dirty(leaf); 4794 btrfs_mark_buffer_dirty(leaf);
4808 4795
4809 if (btrfs_leaf_free_space(root, leaf) < 0) { 4796 if (btrfs_leaf_free_space(root, leaf) < 0) {
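Taken together, the two setup_items_for_insert() hunks move the slot-0 key fixup and the btrfs_unlock_up_safe() call from the tail of the function to its head: the parent keys are corrected before any item data is shuffled, and the upper locks are released that much earlier. The resulting order, sketched with the item-copying body elided:

	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(root, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);
	/* ... shift existing items, stamp in the new keys and sizes ... */
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);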
@@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5145 u32 nritems; 5132 u32 nritems;
5146 int level; 5133 int level;
5147 int ret = 1; 5134 int ret = 1;
5135 int keep_locks = path->keep_locks;
5148 5136
5149 WARN_ON(!path->keep_locks); 5137 path->keep_locks = 1;
5150again: 5138again:
5151 cur = btrfs_read_lock_root_node(root); 5139 cur = btrfs_read_lock_root_node(root);
5152 level = btrfs_header_level(cur); 5140 level = btrfs_header_level(cur);
@@ -5210,7 +5198,6 @@ find_next_key:
5210 path->slots[level] = slot; 5198 path->slots[level] = slot;
5211 if (level == path->lowest_level) { 5199 if (level == path->lowest_level) {
5212 ret = 0; 5200 ret = 0;
5213 unlock_up(path, level, 1, 0, NULL);
5214 goto out; 5201 goto out;
5215 } 5202 }
5216 btrfs_set_path_blocking(path); 5203 btrfs_set_path_blocking(path);
@@ -5225,9 +5212,12 @@ find_next_key:
5225 btrfs_clear_path_blocking(path, NULL, 0); 5212 btrfs_clear_path_blocking(path, NULL, 0);
5226 } 5213 }
5227out: 5214out:
5228 if (ret == 0) 5215 path->keep_locks = keep_locks;
5216 if (ret == 0) {
5217 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5218 btrfs_set_path_blocking(path);
5229 memcpy(min_key, &found_key, sizeof(found_key)); 5219 memcpy(min_key, &found_key, sizeof(found_key));
5230 btrfs_set_path_blocking(path); 5220 }
5231 return ret; 5221 return ret;
5232} 5222}
5233 5223
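btrfs_search_forward() previously WARNed unless the caller had set path->keep_locks; after this pair of hunks it saves the caller's flag, forces keep_locks on for the duration of the walk, restores it at out:, and on success unlocks everything above lowest_level itself. A hypothetical caller under the new contract:

	struct btrfs_path *path = btrfs_alloc_path();
	struct btrfs_key min_key = { .objectid = 256, .type = 0, .offset = 0 };
	int ret;

	if (!path)
		return -ENOMEM;
	/* no path->keep_locks = 1 needed any more */
	ret = btrfs_search_forward(root, &min_key, path, 0);
	if (ret == 0) {
		/* only the node at path->lowest_level is still locked here */
	}
	btrfs_free_path(path);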
@@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5375 goto out; 5365 goto out;
5376 } 5366 }
5377 5367
5378 tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); 5368 tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5379 if (!tmp_buf) { 5369 if (!tmp_buf) {
5380 ret = -ENOMEM; 5370 ret = -ENOMEM;
5381 goto out; 5371 goto out;
@@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
5520 goto out; 5510 goto out;
5521 advance_right = ADVANCE; 5511 advance_right = ADVANCE;
5522 } else { 5512 } else {
5523 enum btrfs_compare_tree_result cmp; 5513 enum btrfs_compare_tree_result result;
5524 5514
5525 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); 5515 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5526 ret = tree_compare_item(left_root, left_path, 5516 ret = tree_compare_item(left_root, left_path,
5527 right_path, tmp_buf); 5517 right_path, tmp_buf);
5528 if (ret) 5518 if (ret)
5529 cmp = BTRFS_COMPARE_TREE_CHANGED; 5519 result = BTRFS_COMPARE_TREE_CHANGED;
5530 else 5520 else
5531 cmp = BTRFS_COMPARE_TREE_SAME; 5521 result = BTRFS_COMPARE_TREE_SAME;
5532 ret = changed_cb(left_root, right_root, 5522 ret = changed_cb(left_root, right_root,
5533 left_path, right_path, 5523 left_path, right_path,
5534 &left_key, cmp, ctx); 5524 &left_key, result, ctx);
5535 if (ret < 0) 5525 if (ret < 0)
5536 goto out; 5526 goto out;
5537 advance_left = ADVANCE; 5527 advance_left = ADVANCE;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be91397f4e92..d557264ee974 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34#include <linux/pagemap.h> 34#include <linux/pagemap.h>
35#include <linux/btrfs.h> 35#include <linux/btrfs.h>
36#include <linux/workqueue.h> 36#include <linux/workqueue.h>
37#include <linux/security.h>
37#include "extent_io.h" 38#include "extent_io.h"
38#include "extent_map.h" 39#include "extent_map.h"
39#include "async-thread.h" 40#include "async-thread.h"
@@ -62,13 +63,6 @@ struct btrfs_ordered_sum;
62 63
63#define BTRFS_COMPAT_EXTENT_TREE_V0 64#define BTRFS_COMPAT_EXTENT_TREE_V0
64 65
65/*
66 * files bigger than this get some pre-flushing when they are added
67 * to the ordered operations list. That way we limit the total
68 * work done by the commit
69 */
70#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
71
72/* holds pointers to all of the tree roots */ 66/* holds pointers to all of the tree roots */
73#define BTRFS_ROOT_TREE_OBJECTID 1ULL 67#define BTRFS_ROOT_TREE_OBJECTID 1ULL
74 68
@@ -391,10 +385,12 @@ struct btrfs_header {
391 sizeof(struct btrfs_header)) / \ 385 sizeof(struct btrfs_header)) / \
392 sizeof(struct btrfs_key_ptr)) 386 sizeof(struct btrfs_key_ptr))
393#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) 387#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
394#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) 388#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
389#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
390 (offsetof(struct btrfs_file_extent_item, disk_bytenr))
395#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 391#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
396 sizeof(struct btrfs_item) - \ 392 sizeof(struct btrfs_item) - \
397 sizeof(struct btrfs_file_extent_item)) 393 BTRFS_FILE_EXTENT_INLINE_DATA_START)
398#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ 394#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
399 sizeof(struct btrfs_item) -\ 395 sizeof(struct btrfs_item) -\
400 sizeof(struct btrfs_dir_item)) 396 sizeof(struct btrfs_dir_item))
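BTRFS_FILE_EXTENT_INLINE_DATA_START gives a name to the offsetof trick used throughout: inline file data begins where disk_bytenr would otherwise sit. To make the macro chain concrete, here is the arithmetic for a hypothetical 16K-node filesystem (struct sizes as on x86-64; illustrative only):

	/* sizeof(struct btrfs_header) = 101, sizeof(struct btrfs_item) = 25,
	 * offsetof(struct btrfs_file_extent_item, disk_bytenr) = 21      */
	leaf_data  = 16384 - 101;		/* __BTRFS_LEAF_DATA_SIZE = 16283 */
	max_inline = leaf_data - 25 - 21;	/* = 16237 bytes of inline data   */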
@@ -474,7 +470,7 @@ struct btrfs_super_block {
474 __le64 num_devices; 470 __le64 num_devices;
475 __le32 sectorsize; 471 __le32 sectorsize;
476 __le32 nodesize; 472 __le32 nodesize;
477 __le32 leafsize; 473 __le32 __unused_leafsize;
478 __le32 stripesize; 474 __le32 stripesize;
479 __le32 sys_chunk_array_size; 475 __le32 sys_chunk_array_size;
480 __le64 chunk_root_generation; 476 __le64 chunk_root_generation;
@@ -903,6 +899,8 @@ struct btrfs_file_extent_item {
903 /* 899 /*
904 * disk space consumed by the extent, checksum blocks are included 900 * disk space consumed by the extent, checksum blocks are included
905 * in these numbers 901 * in these numbers
902 *
903 * At this offset in the structure, the inline extent data start.
906 */ 904 */
907 __le64 disk_bytenr; 905 __le64 disk_bytenr;
908 __le64 disk_num_bytes; 906 __le64 disk_num_bytes;
@@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache {
1305 */ 1303 */
1306 struct list_head cluster_list; 1304 struct list_head cluster_list;
1307 1305
1308 /* For delayed block group creation */ 1306 /* For delayed block group creation or deletion of empty block groups */
1309 struct list_head new_bg_list; 1307 struct list_head bg_list;
1310}; 1308};
1311 1309
1312/* delayed seq elem */ 1310/* delayed seq elem */
@@ -1545,6 +1543,7 @@ struct btrfs_fs_info {
1545 struct btrfs_workqueue *endio_workers; 1543 struct btrfs_workqueue *endio_workers;
1546 struct btrfs_workqueue *endio_meta_workers; 1544 struct btrfs_workqueue *endio_meta_workers;
1547 struct btrfs_workqueue *endio_raid56_workers; 1545 struct btrfs_workqueue *endio_raid56_workers;
1546 struct btrfs_workqueue *endio_repair_workers;
1548 struct btrfs_workqueue *rmw_workers; 1547 struct btrfs_workqueue *rmw_workers;
1549 struct btrfs_workqueue *endio_meta_write_workers; 1548 struct btrfs_workqueue *endio_meta_write_workers;
1550 struct btrfs_workqueue *endio_write_workers; 1549 struct btrfs_workqueue *endio_write_workers;
@@ -1574,6 +1573,7 @@ struct btrfs_fs_info {
1574 int do_barriers; 1573 int do_barriers;
1575 int closing; 1574 int closing;
1576 int log_root_recovering; 1575 int log_root_recovering;
1576 int open;
1577 1577
1578 u64 total_pinned; 1578 u64 total_pinned;
1579 1579
@@ -1723,6 +1723,12 @@ struct btrfs_fs_info {
1723 1723
1724 /* Used to reclaim the metadata space in the background. */ 1724 /* Used to reclaim the metadata space in the background. */
1725 struct work_struct async_reclaim_work; 1725 struct work_struct async_reclaim_work;
1726
1727 spinlock_t unused_bgs_lock;
1728 struct list_head unused_bgs;
1729
1730 /* For btrfs to record security options */
1731 struct security_mnt_opts security_opts;
1726}; 1732};
1727 1733
1728struct btrfs_subvolume_writers { 1734struct btrfs_subvolume_writers {
@@ -1776,12 +1782,12 @@ struct btrfs_root {
1776 1782
1777 /* free ino cache stuff */ 1783 /* free ino cache stuff */
1778 struct btrfs_free_space_ctl *free_ino_ctl; 1784 struct btrfs_free_space_ctl *free_ino_ctl;
1779 enum btrfs_caching_type cached; 1785 enum btrfs_caching_type ino_cache_state;
1780 spinlock_t cache_lock; 1786 spinlock_t ino_cache_lock;
1781 wait_queue_head_t cache_wait; 1787 wait_queue_head_t ino_cache_wait;
1782 struct btrfs_free_space_ctl *free_ino_pinned; 1788 struct btrfs_free_space_ctl *free_ino_pinned;
1783 u64 cache_progress; 1789 u64 ino_cache_progress;
1784 struct inode *cache_inode; 1790 struct inode *ino_cache_inode;
1785 1791
1786 struct mutex log_mutex; 1792 struct mutex log_mutex;
1787 wait_queue_head_t log_writer_wait; 1793 wait_queue_head_t log_writer_wait;
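The free-inode-cache members of struct btrfs_root pick up an ino_cache_ prefix so they can no longer be confused with the block-group caching fields of the same names. The rename is purely mechanical:

	/* cached         -> ino_cache_state    */
	/* cache_lock     -> ino_cache_lock     */
	/* cache_wait     -> ino_cache_wait     */
	/* cache_progress -> ino_cache_progress */
	/* cache_inode    -> ino_cache_inode    */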
@@ -1806,18 +1812,14 @@ struct btrfs_root {
1806 /* node allocations are done in nodesize units */ 1812 /* node allocations are done in nodesize units */
1807 u32 nodesize; 1813 u32 nodesize;
1808 1814
1809 /* leaf allocations are done in leafsize units */
1810 u32 leafsize;
1811
1812 u32 stripesize; 1815 u32 stripesize;
1813 1816
1814 u32 type; 1817 u32 type;
1815 1818
1816 u64 highest_objectid; 1819 u64 highest_objectid;
1817 1820
1818#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1821 /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
1819 u64 alloc_bytenr; 1822 u64 alloc_bytenr;
1820#endif
1821 1823
1822 u64 defrag_trans_start; 1824 u64 defrag_trans_start;
1823 struct btrfs_key defrag_progress; 1825 struct btrfs_key defrag_progress;
@@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args {
2094#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) 2096#define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24)
2095 2097
2096#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) 2098#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
2099#define BTRFS_DEFAULT_MAX_INLINE (8192)
2097 2100
2098#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 2101#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
2099#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 2102#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block,
2995 sectorsize, 32); 2998 sectorsize, 32);
2996BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, 2999BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block,
2997 nodesize, 32); 3000 nodesize, 32);
2998BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block,
2999 leafsize, 32);
3000BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, 3001BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block,
3001 stripesize, 32); 3002 stripesize, 32);
3002BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, 3003BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block,
@@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression,
3049static inline unsigned long 3050static inline unsigned long
3050btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) 3051btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
3051{ 3052{
3052 unsigned long offset = (unsigned long)e; 3053 return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
3053 offset += offsetof(struct btrfs_file_extent_item, disk_bytenr);
3054 return offset;
3055} 3054}
3056 3055
3057static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) 3056static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
3058{ 3057{
3059 return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; 3058 return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
3060} 3059}
3061 3060
3062BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, 3061BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item,
@@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item,
3086static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, 3085static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
3087 struct btrfs_item *e) 3086 struct btrfs_item *e)
3088{ 3087{
3089 unsigned long offset; 3088 return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
3090 offset = offsetof(struct btrfs_file_extent_item, disk_bytenr);
3091 return btrfs_item_size(eb, e) - offset;
3092} 3089}
3093 3090
3094/* this returns the number of file bytes represented by the inline item. 3091/* this returns the number of file bytes represented by the inline item.
@@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
3232 return sb->s_fs_info; 3229 return sb->s_fs_info;
3233} 3230}
3234 3231
3235static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
3236{
3237 if (level == 0)
3238 return root->leafsize;
3239 return root->nodesize;
3240}
3241
3242/* helper function to cast into the data area of the leaf. */ 3232/* helper function to cast into the data area of the leaf. */
3243#define btrfs_item_ptr(leaf, slot, type) \ 3233#define btrfs_item_ptr(leaf, slot, type) \
3244 ((type *)(btrfs_leaf_data(leaf) + \ 3234 ((type *)(btrfs_leaf_data(leaf) + \
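Deleting btrfs_level_size() is the pivot of the whole series: with leafsize gone, the helper could only ever return root->nodesize regardless of level, so its call sites substitute the field directly, as in the blocksize line removed near the top of this patch:

	/* before */ blocksize = btrfs_level_size(root, level - 1);
	/* after  */ blocksize = root->nodesize;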
@@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
3263static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 3253static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3264 unsigned num_items) 3254 unsigned num_items)
3265{ 3255{
3266 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3256 return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3267 2 * num_items; 3257 2 * num_items;
3268} 3258}
3269 3259
@@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
3274static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, 3264static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
3275 unsigned num_items) 3265 unsigned num_items)
3276{ 3266{
3277 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3267 return root->nodesize * BTRFS_MAX_LEVEL * num_items;
3278 num_items;
3279} 3268}
3280 3269
3281int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, 3270int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
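btrfs_calc_trans_metadata_size() keeps its shape (the leafsize term simply becomes another nodesize), while btrfs_calc_trunc_metadata_size() collapses to nodesize * BTRFS_MAX_LEVEL * num_items, dropping the CoW doubling that truncation does not need. Worked numbers for a hypothetical 16K-node filesystem and one item (BTRFS_MAX_LEVEL is 8):

	trans_bytes = (16384 + 16384 * 7) * 2;	/* = 262144, both CoW copies   */
	trunc_bytes = 16384 * 8;		/* = 131072, one path of the tree */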
@@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group(
3305 u64 bytenr); 3294 u64 bytenr);
3306void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 3295void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3307int get_block_group_index(struct btrfs_block_group_cache *cache); 3296int get_block_group_index(struct btrfs_block_group_cache *cache);
3308struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 3297struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
3309 struct btrfs_root *root, u32 blocksize, 3298 struct btrfs_root *root, u64 parent,
3310 u64 parent, u64 root_objectid, 3299 u64 root_objectid,
3311 struct btrfs_disk_key *key, int level, 3300 struct btrfs_disk_key *key, int level,
3312 u64 hint, u64 empty_size); 3301 u64 hint, u64 empty_size);
3313void btrfs_free_tree_block(struct btrfs_trans_handle *trans, 3302void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
@@ -3326,9 +3315,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
3326 u64 min_alloc_size, u64 empty_size, u64 hint_byte, 3315 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
3327 struct btrfs_key *ins, int is_data, int delalloc); 3316 struct btrfs_key *ins, int is_data, int delalloc);
3328int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3317int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3329 struct extent_buffer *buf, int full_backref, int no_quota); 3318 struct extent_buffer *buf, int full_backref);
3330int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, 3319int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3331 struct extent_buffer *buf, int full_backref, int no_quota); 3320 struct extent_buffer *buf, int full_backref);
3332int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, 3321int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
3333 struct btrfs_root *root, 3322 struct btrfs_root *root,
3334 u64 bytenr, u64 num_bytes, u64 flags, 3323 u64 bytenr, u64 num_bytes, u64 flags,
@@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
3363 u64 size); 3352 u64 size);
3364int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 3353int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
3365 struct btrfs_root *root, u64 group_start); 3354 struct btrfs_root *root, u64 group_start);
3355void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
3366void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, 3356void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
3367 struct btrfs_root *root); 3357 struct btrfs_root *root);
3368u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); 3358u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
@@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
3604 kfree(fs_info->uuid_root); 3594 kfree(fs_info->uuid_root);
3605 kfree(fs_info->super_copy); 3595 kfree(fs_info->super_copy);
3606 kfree(fs_info->super_for_commit); 3596 kfree(fs_info->super_for_commit);
3597 security_free_mnt_opts(&fs_info->security_opts);
3607 kfree(fs_info); 3598 kfree(fs_info);
3608} 3599}
3609 3600
@@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
3739int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 3730int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
3740 struct bio *bio, u32 *dst); 3731 struct bio *bio, u32 *dst);
3741int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 3732int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
3742 struct btrfs_dio_private *dip, struct bio *bio, 3733 struct bio *bio, u64 logical_offset);
3743 u64 logical_offset);
3744int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 3734int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
3745 struct btrfs_root *root, 3735 struct btrfs_root *root,
3746 u64 objectid, u64 pos, 3736 u64 objectid, u64 pos,
@@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
4141/* Sanity test specific functions */ 4131/* Sanity test specific functions */
4142#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4132#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4143void btrfs_test_destroy_inode(struct inode *inode); 4133void btrfs_test_destroy_inode(struct inode *inode);
4144int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
4145 u64 rfer, u64 excl);
4146#endif 4134#endif
4147 4135
4136static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
4137{
4138#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4139 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
4140 return 1;
4141#endif
4142 return 0;
4143}
4144
4148#endif 4145#endif
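btrfs_test_is_dummy_root() centralises the sanity-test check that disk-io.c used to open-code under #ifdef; when the tests are compiled out it constant-folds to 0, so callers stay unconditional. As the btrfs_find_create_tree_block hunk later in this patch shows, usage reduces to:

	if (btrfs_test_is_dummy_root(root))
		return alloc_test_extent_buffer(root->fs_info, bytenr, blocksize);
	return alloc_extent_buffer(root->fs_info, bytenr, blocksize);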
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index da775bfdebc9..054577bddaf2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
1042 int ret; 1042 int ret;
1043 1043
1044 key.objectid = node->inode_id; 1044 key.objectid = node->inode_id;
1045 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 1045 key.type = BTRFS_INODE_ITEM_KEY;
1046 key.offset = 0; 1046 key.offset = 0;
1047 1047
1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) 1048 if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
@@ -1099,7 +1099,7 @@ err_out:
1099search: 1099search:
1100 btrfs_release_path(path); 1100 btrfs_release_path(path);
1101 1101
1102 btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); 1102 key.type = BTRFS_INODE_EXTREF_KEY;
1103 key.offset = -1; 1103 key.offset = -1;
1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1104 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1105 if (ret < 0) 1105 if (ret < 0)
@@ -1395,8 +1395,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1395 return -ENOMEM; 1395 return -ENOMEM;
1396 1396
1397 async_work->delayed_root = delayed_root; 1397 async_work->delayed_root = delayed_root;
1398 btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, 1398 btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
1399 NULL, NULL); 1399 btrfs_async_run_delayed_root, NULL, NULL);
1400 async_work->nr = nr; 1400 async_work->nr = nr;
1401 1401
1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); 1402 btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work);
@@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1473 } 1473 }
1474 1474
1475 delayed_item->key.objectid = btrfs_ino(dir); 1475 delayed_item->key.objectid = btrfs_ino(dir);
1476 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); 1476 delayed_item->key.type = BTRFS_DIR_INDEX_KEY;
1477 delayed_item->key.offset = index; 1477 delayed_item->key.offset = index;
1478 1478
1479 dir_item = (struct btrfs_dir_item *)delayed_item->data; 1479 dir_item = (struct btrfs_dir_item *)delayed_item->data;
@@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1542 return PTR_ERR(node); 1542 return PTR_ERR(node);
1543 1543
1544 item_key.objectid = btrfs_ino(dir); 1544 item_key.objectid = btrfs_ino(dir);
1545 btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); 1545 item_key.type = BTRFS_DIR_INDEX_KEY;
1546 item_key.offset = index; 1546 item_key.offset = index;
1547 1547
1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); 1548 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
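Every delayed-inode hunk above (and the dir-item ones below) is the same substitution: btrfs_set_key_type() wrapped a CPU-order struct btrfs_key whose type member is a plain u8 with no endianness to hide, so the wrapper gives way to direct assignment:

	struct btrfs_key key;

	key.objectid = btrfs_ino(dir);
	key.type = BTRFS_DIR_INDEX_KEY;	/* was btrfs_set_key_type(&key, ...) */
	key.offset = index;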
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index eea26e1b2fda..6f662b34ba0e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found:
168 dev_replace->srcdev->total_bytes; 168 dev_replace->srcdev->total_bytes;
169 dev_replace->tgtdev->disk_total_bytes = 169 dev_replace->tgtdev->disk_total_bytes =
170 dev_replace->srcdev->disk_total_bytes; 170 dev_replace->srcdev->disk_total_bytes;
171 dev_replace->tgtdev->commit_total_bytes =
172 dev_replace->srcdev->commit_total_bytes;
171 dev_replace->tgtdev->bytes_used = 173 dev_replace->tgtdev->bytes_used =
172 dev_replace->srcdev->bytes_used; 174 dev_replace->srcdev->bytes_used;
175 dev_replace->tgtdev->commit_bytes_used =
176 dev_replace->srcdev->commit_bytes_used;
173 } 177 }
174 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; 178 dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
175 btrfs_init_dev_replace_tgtdev_for_resume(fs_info, 179 btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
@@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
329 args->start.tgtdev_name[0] == '\0') 333 args->start.tgtdev_name[0] == '\0')
330 return -EINVAL; 334 return -EINVAL;
331 335
332 mutex_lock(&fs_info->volume_mutex); 336 /*
333 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 337 * Here we commit the transaction to make sure commit_total_bytes
334 &tgt_device); 338 * of all the devices are updated.
335 if (ret) { 339 */
336 btrfs_err(fs_info, "target device %s is invalid!", 340 trans = btrfs_attach_transaction(root);
337 args->start.tgtdev_name); 341 if (!IS_ERR(trans)) {
338 mutex_unlock(&fs_info->volume_mutex); 342 ret = btrfs_commit_transaction(trans, root);
339 return -EINVAL; 343 if (ret)
344 return ret;
345 } else if (PTR_ERR(trans) != -ENOENT) {
346 return PTR_ERR(trans);
340 } 347 }
341 348
349 /* the disk copy procedure reuses the scrub code */
350 mutex_lock(&fs_info->volume_mutex);
342 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 351 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
343 args->start.srcdev_name, 352 args->start.srcdev_name,
344 &src_device); 353 &src_device);
345 mutex_unlock(&fs_info->volume_mutex);
346 if (ret) { 354 if (ret) {
347 ret = -EINVAL; 355 mutex_unlock(&fs_info->volume_mutex);
348 goto leave_no_lock; 356 return ret;
349 } 357 }
350 358
351 if (tgt_device->total_bytes < src_device->total_bytes) { 359 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
352 btrfs_err(fs_info, "target device is smaller than source device!"); 360 src_device, &tgt_device);
353 ret = -EINVAL; 361 mutex_unlock(&fs_info->volume_mutex);
354 goto leave_no_lock; 362 if (ret)
355 } 363 return ret;
356 364
357 btrfs_dev_replace_lock(dev_replace); 365 btrfs_dev_replace_lock(dev_replace);
358 switch (dev_replace->replace_state) { 366 switch (dev_replace->replace_state) {
@@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
380 src_device->devid, 388 src_device->devid,
381 rcu_str_deref(tgt_device->name)); 389 rcu_str_deref(tgt_device->name));
382 390
383 tgt_device->total_bytes = src_device->total_bytes;
384 tgt_device->disk_total_bytes = src_device->disk_total_bytes;
385 tgt_device->bytes_used = src_device->bytes_used;
386
387 /* 391 /*
388 * from now on, the writes to the srcdev are all duplicated to 392 * from now on, the writes to the srcdev are all duplicated to
389 * go to the tgtdev as well (refer to btrfs_map_block()). 393 * go to the tgtdev as well (refer to btrfs_map_block()).
@@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
414 418
415 /* the disk copy procedure reuses the scrub code */ 419 /* the disk copy procedure reuses the scrub code */
416 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 420 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
417 src_device->total_bytes, 421 btrfs_device_get_total_bytes(src_device),
418 &dev_replace->scrub_progress, 0, 1); 422 &dev_replace->scrub_progress, 0, 1);
419 423
420 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 424 ret = btrfs_dev_replace_finishing(root->fs_info, ret);
@@ -426,9 +430,7 @@ leave:
426 dev_replace->srcdev = NULL; 430 dev_replace->srcdev = NULL;
427 dev_replace->tgtdev = NULL; 431 dev_replace->tgtdev = NULL;
428 btrfs_dev_replace_unlock(dev_replace); 432 btrfs_dev_replace_unlock(dev_replace);
429leave_no_lock: 433 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
430 if (tgt_device)
431 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
432 return ret; 434 return ret;
433} 435}
434 436
@@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
507 ret = btrfs_commit_transaction(trans, root); 509 ret = btrfs_commit_transaction(trans, root);
508 WARN_ON(ret); 510 WARN_ON(ret);
509 511
512 mutex_lock(&uuid_mutex);
510 /* keep away write_all_supers() during the finishing procedure */ 513 /* keep away write_all_supers() during the finishing procedure */
511 mutex_lock(&root->fs_info->chunk_mutex);
512 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 514 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
515 mutex_lock(&root->fs_info->chunk_mutex);
513 btrfs_dev_replace_lock(dev_replace); 516 btrfs_dev_replace_lock(dev_replace);
514 dev_replace->replace_state = 517 dev_replace->replace_state =
515 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 518 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
@@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
532 src_device->devid, 535 src_device->devid,
533 rcu_str_deref(tgt_device->name), scrub_ret); 536 rcu_str_deref(tgt_device->name), scrub_ret);
534 btrfs_dev_replace_unlock(dev_replace); 537 btrfs_dev_replace_unlock(dev_replace);
535 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
536 mutex_unlock(&root->fs_info->chunk_mutex); 538 mutex_unlock(&root->fs_info->chunk_mutex);
539 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
540 mutex_unlock(&uuid_mutex);
537 if (tgt_device) 541 if (tgt_device)
538 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); 542 btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
539 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 543 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
@@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
542 } 546 }
543 547
544 printk_in_rcu(KERN_INFO 548 printk_in_rcu(KERN_INFO
545 "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", 549 "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
546 src_device->missing ? "<missing disk>" : 550 src_device->missing ? "<missing disk>" :
547 rcu_str_deref(src_device->name), 551 rcu_str_deref(src_device->name),
548 src_device->devid, 552 src_device->devid,
@@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
550 tgt_device->is_tgtdev_for_dev_replace = 0; 554 tgt_device->is_tgtdev_for_dev_replace = 0;
551 tgt_device->devid = src_device->devid; 555 tgt_device->devid = src_device->devid;
552 src_device->devid = BTRFS_DEV_REPLACE_DEVID; 556 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
553 tgt_device->bytes_used = src_device->bytes_used;
554 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); 557 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
555 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); 558 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
556 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); 559 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
557 tgt_device->total_bytes = src_device->total_bytes; 560 btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
558 tgt_device->disk_total_bytes = src_device->disk_total_bytes; 561 btrfs_device_set_disk_total_bytes(tgt_device,
559 tgt_device->bytes_used = src_device->bytes_used; 562 src_device->disk_total_bytes);
563 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
564 ASSERT(list_empty(&src_device->resized_list));
565 tgt_device->commit_total_bytes = src_device->commit_total_bytes;
566 tgt_device->commit_bytes_used = src_device->bytes_used;
560 if (fs_info->sb->s_bdev == src_device->bdev) 567 if (fs_info->sb->s_bdev == src_device->bdev)
561 fs_info->sb->s_bdev = tgt_device->bdev; 568 fs_info->sb->s_bdev = tgt_device->bdev;
562 if (fs_info->fs_devices->latest_bdev == src_device->bdev) 569 if (fs_info->fs_devices->latest_bdev == src_device->bdev)
563 fs_info->fs_devices->latest_bdev = tgt_device->bdev; 570 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
564 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 571 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
572 fs_info->fs_devices->rw_devices++;
565 573
566 /* replace the sysfs entry */ 574 /* replace the sysfs entry */
567 btrfs_kobj_rm_device(fs_info, src_device); 575 btrfs_kobj_rm_device(fs_info, src_device);
568 btrfs_kobj_add_device(fs_info, tgt_device); 576 btrfs_kobj_add_device(fs_info, tgt_device);
569 577
578 btrfs_dev_replace_unlock(dev_replace);
579
570 btrfs_rm_dev_replace_blocked(fs_info); 580 btrfs_rm_dev_replace_blocked(fs_info);
571 581
572 btrfs_rm_dev_replace_srcdev(fs_info, src_device); 582 btrfs_rm_dev_replace_srcdev(fs_info, src_device);
@@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
580 * superblock is scratched out so that it is no longer marked to 590 * superblock is scratched out so that it is no longer marked to
581 * belong to this filesystem. 591 * belong to this filesystem.
582 */ 592 */
583 btrfs_dev_replace_unlock(dev_replace);
584 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
585 mutex_unlock(&root->fs_info->chunk_mutex); 593 mutex_unlock(&root->fs_info->chunk_mutex);
594 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
595 mutex_unlock(&uuid_mutex);
586 596
587 /* write back the superblocks */ 597 /* write back the superblocks */
588 trans = btrfs_start_transaction(root, 0); 598 trans = btrfs_start_transaction(root, 0);
@@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
643 struct btrfs_ioctl_dev_replace_args *args) 653 struct btrfs_ioctl_dev_replace_args *args)
644{ 654{
645 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 655 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
656 struct btrfs_device *srcdev;
646 657
647 btrfs_dev_replace_lock(dev_replace); 658 btrfs_dev_replace_lock(dev_replace);
648 /* even if !dev_replace_is_valid, the values are good enough for 659 /* even if !dev_replace_is_valid, the values are good enough for
@@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
665 break; 676 break;
666 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 677 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
667 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 678 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
679 srcdev = dev_replace->srcdev;
668 args->status.progress_1000 = div64_u64(dev_replace->cursor_left, 680 args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
669 div64_u64(dev_replace->srcdev->total_bytes, 1000)); 681 div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
670 break; 682 break;
671 } 683 }
672 btrfs_dev_replace_unlock(dev_replace); 684 btrfs_dev_replace_unlock(dev_replace);
@@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
825 837
826 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 838 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
827 dev_replace->committed_cursor_left, 839 dev_replace->committed_cursor_left,
828 dev_replace->srcdev->total_bytes, 840 btrfs_device_get_total_bytes(dev_replace->srcdev),
829 &dev_replace->scrub_progress, 0, 1); 841 &dev_replace->scrub_progress, 0, 1);
830 ret = btrfs_dev_replace_finishing(fs_info, ret); 842 ret = btrfs_dev_replace_finishing(fs_info, ret);
831 WARN_ON(ret); 843 WARN_ON(ret);
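Two threads run through the dev-replace changes: the start path now commits any live transaction first so that every device's commit_total_bytes is settled before being copied to the target, and the finishing path widens its critical section to uuid_mutex, then device_list_mutex, then chunk_mutex, released in mirror order. The post-patch locking shape, sketched:

	mutex_lock(&uuid_mutex);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	btrfs_dev_replace_lock(dev_replace);
	/* ... swap devid, uuid and byte counters over to the target ... */
	btrfs_dev_replace_unlock(dev_replace);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	mutex_unlock(&uuid_mutex);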
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a0691df5dcea..fc8df866e919 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); 86 BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root));
87 87
88 key.objectid = objectid; 88 key.objectid = objectid;
89 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 89 key.type = BTRFS_XATTR_ITEM_KEY;
90 key.offset = btrfs_name_hash(name, name_len); 90 key.offset = btrfs_name_hash(name, name_len);
91 91
92 data_size = sizeof(*dir_item) + name_len + data_len; 92 data_size = sizeof(*dir_item) + name_len + data_len;
@@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
137 u32 data_size; 137 u32 data_size;
138 138
139 key.objectid = btrfs_ino(dir); 139 key.objectid = btrfs_ino(dir);
140 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 140 key.type = BTRFS_DIR_ITEM_KEY;
141 key.offset = btrfs_name_hash(name, name_len); 141 key.offset = btrfs_name_hash(name, name_len);
142 142
143 path = btrfs_alloc_path(); 143 path = btrfs_alloc_path();
@@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
204 int cow = mod != 0; 204 int cow = mod != 0;
205 205
206 key.objectid = dir; 206 key.objectid = dir;
207 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 207 key.type = BTRFS_DIR_ITEM_KEY;
208 208
209 key.offset = btrfs_name_hash(name, name_len); 209 key.offset = btrfs_name_hash(name, name_len);
210 210
@@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
234 return -ENOMEM; 234 return -ENOMEM;
235 235
236 key.objectid = dir; 236 key.objectid = dir;
237 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 237 key.type = BTRFS_DIR_ITEM_KEY;
238 key.offset = btrfs_name_hash(name, name_len); 238 key.offset = btrfs_name_hash(name, name_len);
239 239
240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 240 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
297 int cow = mod != 0; 297 int cow = mod != 0;
298 298
299 key.objectid = dir; 299 key.objectid = dir;
300 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 300 key.type = BTRFS_DIR_INDEX_KEY;
301 key.offset = objectid; 301 key.offset = objectid;
302 302
303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 303 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
@@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
367 int cow = mod != 0; 367 int cow = mod != 0;
368 368
369 key.objectid = dir; 369 key.objectid = dir;
370 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 370 key.type = BTRFS_XATTR_ITEM_KEY;
371 key.offset = btrfs_name_hash(name, name_len); 371 key.offset = btrfs_name_hash(name, name_len);
372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 372 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
373 if (ret < 0) 373 if (ret < 0)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 08e65e9cf2aa..1ad0f47ac850 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -39,7 +39,6 @@
39#include "btrfs_inode.h" 39#include "btrfs_inode.h"
40#include "volumes.h" 40#include "volumes.h"
41#include "print-tree.h" 41#include "print-tree.h"
42#include "async-thread.h"
43#include "locking.h" 42#include "locking.h"
44#include "tree-log.h" 43#include "tree-log.h"
45#include "free-space-cache.h" 44#include "free-space-cache.h"
@@ -60,8 +59,6 @@ static void end_workqueue_fn(struct btrfs_work *work);
60static void free_fs_root(struct btrfs_root *root); 59static void free_fs_root(struct btrfs_root *root);
61static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, 60static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
62 int read_only); 61 int read_only);
63static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
64 struct btrfs_root *root);
65static void btrfs_destroy_ordered_extents(struct btrfs_root *root); 62static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
66static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, 63static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
67 struct btrfs_root *root); 64 struct btrfs_root *root);
@@ -75,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root);
75static void btrfs_error_commit_super(struct btrfs_root *root); 72static void btrfs_error_commit_super(struct btrfs_root *root);
76 73
77/* 74/*
78 * end_io_wq structs are used to do processing in task context when an IO is 75 * btrfs_end_io_wq structs are used to do processing in task context when an IO
79 * complete. This is used during reads to verify checksums, and it is used 76 * is complete. This is used during reads to verify checksums, and it is used
80 * by writes to insert metadata for new file extents after IO is complete. 77 * by writes to insert metadata for new file extents after IO is complete.
81 */ 78 */
82struct end_io_wq { 79struct btrfs_end_io_wq {
83 struct bio *bio; 80 struct bio *bio;
84 bio_end_io_t *end_io; 81 bio_end_io_t *end_io;
85 void *private; 82 void *private;
86 struct btrfs_fs_info *info; 83 struct btrfs_fs_info *info;
87 int error; 84 int error;
88 int metadata; 85 enum btrfs_wq_endio_type metadata;
89 struct list_head list; 86 struct list_head list;
90 struct btrfs_work work; 87 struct btrfs_work work;
91}; 88};
92 89
90static struct kmem_cache *btrfs_end_io_wq_cache;
91
92int __init btrfs_end_io_wq_init(void)
93{
94 btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
95 sizeof(struct btrfs_end_io_wq),
96 0,
97 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
98 NULL);
99 if (!btrfs_end_io_wq_cache)
100 return -ENOMEM;
101 return 0;
102}
103
104void btrfs_end_io_wq_exit(void)
105{
106 if (btrfs_end_io_wq_cache)
107 kmem_cache_destroy(btrfs_end_io_wq_cache);
108}
109
93/* 110/*
94 * async submit bios are used to offload expensive checksumming 111 * async submit bios are used to offload expensive checksumming
95 * onto the worker threads. They checksum file and metadata bios 112 * onto the worker threads. They checksum file and metadata bios
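Since a btrfs_end_io_wq is allocated for every bio completion routed through a workqueue, the structure gets its own slab cache, created and torn down by the init/exit pair above (hooked into module load and unload elsewhere in the series). The allocation pattern that replaces kmalloc/kfree, in sketch form:

	struct btrfs_end_io_wq *end_io_wq;

	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
	if (!end_io_wq)
		return -ENOMEM;
	/* ... fill in bio, end_io, private, info, metadata ... */
	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);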
@@ -330,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
330{ 347{
331 struct extent_state *cached_state = NULL; 348 struct extent_state *cached_state = NULL;
332 int ret; 349 int ret;
333 bool need_lock = (current->journal_info == 350 bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
334 (void *)BTRFS_SEND_TRANS_STUB);
335 351
336 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 352 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
337 return 0; 353 return 0;
@@ -351,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
351 ret = 0; 367 ret = 0;
352 goto out; 368 goto out;
353 } 369 }
354 printk_ratelimited("parent transid verify failed on %llu wanted %llu " 370 printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
355 "found %llu\n", 371 eb->fs_info->sb->s_id, eb->start,
356 eb->start, parent_transid, btrfs_header_generation(eb)); 372 parent_transid, btrfs_header_generation(eb));
357 ret = 1; 373 ret = 1;
358 374
359 /* 375 /*
@@ -610,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
610 goto err; 626 goto err;
611 627
612 eb->read_mirror = mirror; 628 eb->read_mirror = mirror;
613 if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { 629 if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
614 ret = -EIO; 630 ret = -EIO;
615 goto err; 631 goto err;
616 } 632 }
617 633
618 found_start = btrfs_header_bytenr(eb); 634 found_start = btrfs_header_bytenr(eb);
619 if (found_start != eb->start) { 635 if (found_start != eb->start) {
620 printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " 636 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
621 "%llu %llu\n", 637 "%llu %llu\n",
622 found_start, eb->start); 638 eb->fs_info->sb->s_id, found_start, eb->start);
623 ret = -EIO; 639 ret = -EIO;
624 goto err; 640 goto err;
625 } 641 }
626 if (check_tree_block_fsid(root, eb)) { 642 if (check_tree_block_fsid(root, eb)) {
627 printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", 643 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
628 eb->start); 644 eb->fs_info->sb->s_id, eb->start);
629 ret = -EIO; 645 ret = -EIO;
630 goto err; 646 goto err;
631 } 647 }
@@ -683,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
683 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; 699 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
684 700
685 eb = (struct extent_buffer *)page->private; 701 eb = (struct extent_buffer *)page->private;
686 set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 702 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
687 eb->read_mirror = failed_mirror; 703 eb->read_mirror = failed_mirror;
688 atomic_dec(&eb->io_pages); 704 atomic_dec(&eb->io_pages);
689 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) 705 if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
@@ -693,52 +709,55 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
693 709
694static void end_workqueue_bio(struct bio *bio, int err) 710static void end_workqueue_bio(struct bio *bio, int err)
695{ 711{
696 struct end_io_wq *end_io_wq = bio->bi_private; 712 struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
697 struct btrfs_fs_info *fs_info; 713 struct btrfs_fs_info *fs_info;
714 struct btrfs_workqueue *wq;
715 btrfs_work_func_t func;
698 716
699 fs_info = end_io_wq->info; 717 fs_info = end_io_wq->info;
700 end_io_wq->error = err; 718 end_io_wq->error = err;
701 btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
702 719
703 if (bio->bi_rw & REQ_WRITE) { 720 if (bio->bi_rw & REQ_WRITE) {
704 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) 721 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
705 btrfs_queue_work(fs_info->endio_meta_write_workers, 722 wq = fs_info->endio_meta_write_workers;
706 &end_io_wq->work); 723 func = btrfs_endio_meta_write_helper;
707 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) 724 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
708 btrfs_queue_work(fs_info->endio_freespace_worker, 725 wq = fs_info->endio_freespace_worker;
709 &end_io_wq->work); 726 func = btrfs_freespace_write_helper;
710 else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 727 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
711 btrfs_queue_work(fs_info->endio_raid56_workers, 728 wq = fs_info->endio_raid56_workers;
712 &end_io_wq->work); 729 func = btrfs_endio_raid56_helper;
713 else 730 } else {
714 btrfs_queue_work(fs_info->endio_write_workers, 731 wq = fs_info->endio_write_workers;
715 &end_io_wq->work); 732 func = btrfs_endio_write_helper;
733 }
716 } else { 734 } else {
717 if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) 735 if (unlikely(end_io_wq->metadata ==
718 btrfs_queue_work(fs_info->endio_raid56_workers, 736 BTRFS_WQ_ENDIO_DIO_REPAIR)) {
719 &end_io_wq->work); 737 wq = fs_info->endio_repair_workers;
720 else if (end_io_wq->metadata) 738 func = btrfs_endio_repair_helper;
721 btrfs_queue_work(fs_info->endio_meta_workers, 739 } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
722 &end_io_wq->work); 740 wq = fs_info->endio_raid56_workers;
723 else 741 func = btrfs_endio_raid56_helper;
724 btrfs_queue_work(fs_info->endio_workers, 742 } else if (end_io_wq->metadata) {
725 &end_io_wq->work); 743 wq = fs_info->endio_meta_workers;
744 func = btrfs_endio_meta_helper;
745 } else {
746 wq = fs_info->endio_workers;
747 func = btrfs_endio_helper;
748 }
726 } 749 }
750
751 btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
752 btrfs_queue_work(wq, &end_io_wq->work);
727} 753}
728 754
729/*
730 * For the metadata arg you want
731 *
732 * 0 - if data
733 * 1 - if normal metadata
734 * 2 - if writing to the free space cache area
735 * 3 - raid parity work
736 */
737int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 755int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
738 int metadata) 756 enum btrfs_wq_endio_type metadata)
739{ 757{
740 struct end_io_wq *end_io_wq; 758 struct btrfs_end_io_wq *end_io_wq;
741 end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); 759
760 end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
742 if (!end_io_wq) 761 if (!end_io_wq)
743 return -ENOMEM; 762 return -ENOMEM;
744 763
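The 0/1/2/3 convention spelled out by the deleted comment is replaced by enum btrfs_wq_endio_type, which also gains BTRFS_WQ_ENDIO_DIO_REPAIR, routed to the new endio_repair_workers queue in end_workqueue_bio() above. Submitters now read, for example:

	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_METADATA);
	if (ret)
		goto out_w_error;	/* instead of passing a bare 1 */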
@@ -830,7 +849,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
830 async->submit_bio_start = submit_bio_start; 849 async->submit_bio_start = submit_bio_start;
831 async->submit_bio_done = submit_bio_done; 850 async->submit_bio_done = submit_bio_done;
832 851
833 btrfs_init_work(&async->work, run_one_async_start, 852 btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
834 run_one_async_done, run_one_async_free); 853 run_one_async_done, run_one_async_free);
835 854
836 async->bio_flags = bio_flags; 855 async->bio_flags = bio_flags;
@@ -922,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
922 * can happen in the async kernel threads 941 * can happen in the async kernel threads
923 */ 942 */
924 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, 943 ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
925 bio, 1); 944 bio, BTRFS_WQ_ENDIO_METADATA);
926 if (ret) 945 if (ret)
927 goto out_w_error; 946 goto out_w_error;
928 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, 947 ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
@@ -1054,20 +1073,17 @@ static const struct address_space_operations btree_aops = {
1054 .set_page_dirty = btree_set_page_dirty, 1073 .set_page_dirty = btree_set_page_dirty,
1055}; 1074};
1056 1075
1057int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1076void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
1058 u64 parent_transid)
1059{ 1077{
1060 struct extent_buffer *buf = NULL; 1078 struct extent_buffer *buf = NULL;
1061 struct inode *btree_inode = root->fs_info->btree_inode; 1079 struct inode *btree_inode = root->fs_info->btree_inode;
1062 int ret = 0;
1063 1080
1064 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1081 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1065 if (!buf) 1082 if (!buf)
1066 return 0; 1083 return;
1067 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1084 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
1068 buf, 0, WAIT_NONE, btree_get_extent, 0); 1085 buf, 0, WAIT_NONE, btree_get_extent, 0);
1069 free_extent_buffer(buf); 1086 free_extent_buffer(buf);
1070 return ret;
1071} 1087}
1072 1088
1073int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1089int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1103,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1103} 1119}
1104 1120
1105struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1121struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1106 u64 bytenr, u32 blocksize) 1122 u64 bytenr)
1107{ 1123{
1108 return find_extent_buffer(root->fs_info, bytenr); 1124 return find_extent_buffer(root->fs_info, bytenr);
1109} 1125}
@@ -1111,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1111struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1127struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1112 u64 bytenr, u32 blocksize) 1128 u64 bytenr, u32 blocksize)
1113{ 1129{
1114#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 1130 if (btrfs_test_is_dummy_root(root))
1115 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
1116 return alloc_test_extent_buffer(root->fs_info, bytenr, 1131 return alloc_test_extent_buffer(root->fs_info, bytenr,
1117 blocksize); 1132 blocksize);
1118#endif
1119 return alloc_extent_buffer(root->fs_info, bytenr, blocksize); 1133 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1120} 1134}
1121 1135
@@ -1133,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
1133} 1147}
1134 1148
1135struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 1149struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1136 u32 blocksize, u64 parent_transid) 1150 u64 parent_transid)
1137{ 1151{
1138 struct extent_buffer *buf = NULL; 1152 struct extent_buffer *buf = NULL;
1139 int ret; 1153 int ret;
1140 1154
1141 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1155 buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
1142 if (!buf) 1156 if (!buf)
1143 return NULL; 1157 return NULL;
1144 1158
@@ -1180,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1180 if (!writers) 1194 if (!writers)
1181 return ERR_PTR(-ENOMEM); 1195 return ERR_PTR(-ENOMEM);
1182 1196
1183 ret = percpu_counter_init(&writers->counter, 0); 1197 ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
1184 if (ret < 0) { 1198 if (ret < 0) {
1185 kfree(writers); 1199 kfree(writers);
1186 return ERR_PTR(ret); 1200 return ERR_PTR(ret);
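percpu_counter_init() grew a gfp_t parameter in this kernel cycle, so callers must state their allocation context; the subvolume-writers counter is set up from a sleepable path, hence GFP_KERNEL. The general pattern, as a sketch:

	struct percpu_counter counter;
	int ret;

	ret = percpu_counter_init(&counter, 0, GFP_KERNEL);
	if (ret)
		return ret;
	/* ... use the counter ... */
	percpu_counter_destroy(&counter);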
@@ -1197,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
1197 kfree(writers); 1211 kfree(writers);
1198} 1212}
1199 1213
1200static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, 1214static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1201 u32 stripesize, struct btrfs_root *root, 1215 struct btrfs_root *root, struct btrfs_fs_info *fs_info,
1202 struct btrfs_fs_info *fs_info,
1203 u64 objectid) 1216 u64 objectid)
1204{ 1217{
1205 root->node = NULL; 1218 root->node = NULL;
1206 root->commit_root = NULL; 1219 root->commit_root = NULL;
1207 root->sectorsize = sectorsize; 1220 root->sectorsize = sectorsize;
1208 root->nodesize = nodesize; 1221 root->nodesize = nodesize;
1209 root->leafsize = leafsize;
1210 root->stripesize = stripesize; 1222 root->stripesize = stripesize;
1211 root->state = 0; 1223 root->state = 0;
1212 root->orphan_cleanup_state = 0; 1224 root->orphan_cleanup_state = 0;
@@ -1292,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void)
 	root = btrfs_alloc_root(NULL);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
-	__setup_root(4096, 4096, 4096, 4096, root, NULL, 1);
+	__setup_root(4096, 4096, 4096, root, NULL, 1);
 	set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
 	root->alloc_bytenr = 0;
 
@@ -1315,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, objectid);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info, objectid);
 	root->root_key.objectid = objectid;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root->root_key.offset = 0;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      0, objectid, NULL, 0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		leaf = NULL;
@@ -1393,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, BTRFS_TREE_LOG_OBJECTID);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info,
+		     BTRFS_TREE_LOG_OBJECTID);
 
 	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
 	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1410,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 * updated (along with back refs to the log tree).
 	 */
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL,
-				      0, 0, 0);
+	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
+				      NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
@@ -1462,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_inode_generation(inode_item, 1);
 	btrfs_set_stack_inode_size(inode_item, 3);
 	btrfs_set_stack_inode_nlink(inode_item, 1);
-	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
 	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 
 	btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1482,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
 	u64 generation;
-	u32 blocksize;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1495,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 		goto alloc_fail;
 	}
 
-	__setup_root(tree_root->nodesize, tree_root->leafsize,
-		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, key->objectid);
+	__setup_root(tree_root->nodesize, tree_root->sectorsize,
+		     tree_root->stripesize, root, fs_info, key->objectid);
 
 	ret = btrfs_find_root(tree_root, key, path,
 			      &root->root_item, &root->root_key);
@@ -1508,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
 	}
 
 	generation = btrfs_root_generation(&root->root_item);
-	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, generation);
+				     generation);
 	if (!root->node) {
 		ret = -ENOMEM;
 		goto find_fail;
@@ -1570,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
 	root->subv_writers = writers;
 
 	btrfs_init_free_ino_ctl(root);
-	spin_lock_init(&root->cache_lock);
-	init_waitqueue_head(&root->cache_wait);
+	spin_lock_init(&root->ino_cache_lock);
+	init_waitqueue_head(&root->ino_cache_wait);
 
 	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
@@ -1696,7 +1702,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 		if (!device->bdev)
 			continue;
 		bdi = blk_get_backing_dev_info(device->bdev);
-		if (bdi && bdi_congested(bdi, bdi_bits)) {
+		if (bdi_congested(bdi, bdi_bits)) {
 			ret = 1;
 			break;
 		}
@@ -1705,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
 	return ret;
 }
 
-/*
- * If this fails, caller must call bdi_destroy() to get rid of the
- * bdi again.
- */
 static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
@@ -1731,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 static void end_workqueue_fn(struct btrfs_work *work)
 {
 	struct bio *bio;
-	struct end_io_wq *end_io_wq;
+	struct btrfs_end_io_wq *end_io_wq;
 	int error;
 
-	end_io_wq = container_of(work, struct end_io_wq, work);
+	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
 	bio = end_io_wq->bio;
 
 	error = end_io_wq->error;
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
-	kfree(end_io_wq);
+	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
 	bio_endio_nodec(bio, error);
 }
 
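[Note] end_workqueue_fn now returns the end-io tracking struct to a dedicated slab cache (btrfs_end_io_wq_cache) instead of kfree()ing a kmalloc'd object. The cache itself is set up with the usual kmem_cache lifecycle; a minimal sketch of that lifecycle, matching the btrfs_end_io_wq_init()/btrfs_end_io_wq_exit() declarations this patch adds to disk-io.h. The SLAB_MEM_SPREAD flag and the alloc_end_io_wq() helper name are illustrative assumptions, not copied from the patch:

/* sketch of the dedicated-cache pattern; flags and helper name assumed */
static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
			sizeof(struct btrfs_end_io_wq), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

void btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}

/* allocation side; pairs with the kmem_cache_free() in end_workqueue_fn */
static struct btrfs_end_io_wq *alloc_end_io_wq(void)
{
	return kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
}

A per-type cache keeps these short-lived per-bio objects off the generic kmalloc slabs, which helps both allocation locality and slab debugging.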
@@ -1769,6 +1771,7 @@ static int cleaner_kthread(void *arg)
 		}
 
 		btrfs_run_delayed_iputs(root);
+		btrfs_delete_unused_bgs(root->fs_info);
 		again = btrfs_clean_one_deleted_snapshot(root);
 		mutex_unlock(&root->fs_info->cleaner_mutex);
 
@@ -2060,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->endio_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
 	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
+	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
 	btrfs_destroy_workqueue(fs_info->rmw_workers);
 	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
 	btrfs_destroy_workqueue(fs_info->endio_write_workers);
@@ -2140,8 +2144,6 @@ int open_ctree(struct super_block *sb,
 {
 	u32 sectorsize;
 	u32 nodesize;
-	u32 leafsize;
-	u32 blocksize;
 	u32 stripesize;
 	u64 generation;
 	u64 features;
@@ -2185,7 +2187,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_srcu;
 	}
 
-	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_bdi;
@@ -2193,13 +2195,13 @@ int open_ctree(struct super_block *sb,
 	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
 					(1 + ilog2(nr_cpu_ids));
 
-	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_dirty_metadata_bytes;
 	}
 
-	ret = percpu_counter_init(&fs_info->bio_counter, 0);
+	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
 	if (ret) {
 		err = ret;
 		goto fail_delalloc_bytes;
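[Note] percpu_counter_init() grew a gfp_t parameter in this cycle, since the per-cpu backing storage is allocated inside the call; open_ctree() passes GFP_KERNEL and unwinds each counter through the goto chain on failure. A condensed sketch of that init/teardown ordering; the helper name and shortened labels are illustrative, the fail paths in open_ctree() itself are longer:

/* condensed model of the counter setup above; labels are illustrative */
static int init_counters(struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;
	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
	if (ret)
		goto free_dirty_metadata_bytes;
	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
	if (ret)
		goto free_delalloc_bytes;
	return 0;

free_delalloc_bytes:
	percpu_counter_destroy(&fs_info->delalloc_bytes);
free_dirty_metadata_bytes:
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	return ret;
}

Counters are destroyed in reverse order of initialization so each label only tears down what is already live.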
@@ -2230,6 +2232,7 @@ int open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->super_lock);
 	spin_lock_init(&fs_info->qgroup_op_lock);
 	spin_lock_init(&fs_info->buffer_lock);
+	spin_lock_init(&fs_info->unused_bgs_lock);
 	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
 	mutex_init(&fs_info->delalloc_root_mutex);
@@ -2239,6 +2242,7 @@ int open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
+	INIT_LIST_HEAD(&fs_info->unused_bgs);
 	btrfs_mapping_init(&fs_info->mapping_tree);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
@@ -2257,7 +2261,7 @@ int open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->qgroup_op_seq, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
-	fs_info->max_inline = 8192 * 1024;
+	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->free_chunk_space = 0;
@@ -2386,7 +2390,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	__setup_root(4096, 4096, 4096, 4096, tree_root,
+	__setup_root(4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	invalidate_bdev(fs_devices->latest_bdev);
@@ -2466,19 +2470,22 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	if (btrfs_super_leafsize(disk_super) !=
+	/*
+	 * Leafsize and nodesize were always equal, this is only a sanity check.
+	 */
+	if (le32_to_cpu(disk_super->__unused_leafsize) !=
 	    btrfs_super_nodesize(disk_super)) {
 		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
 		       "blocksizes don't match. node %d leaf %d\n",
 		       btrfs_super_nodesize(disk_super),
-		       btrfs_super_leafsize(disk_super));
+		       le32_to_cpu(disk_super->__unused_leafsize));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
-	if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+	if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
 		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
 		       "blocksize (%d) was too large\n",
-		       btrfs_super_leafsize(disk_super));
+		       btrfs_super_nodesize(disk_super));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2495,17 +2502,16 @@ int open_ctree(struct super_block *sb,
 	 * flag our filesystem as having big metadata blocks if
 	 * they are bigger than the page size
 	 */
-	if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+	if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) {
 		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
 			printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n");
 		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
 	}
 
 	nodesize = btrfs_super_nodesize(disk_super);
-	leafsize = btrfs_super_leafsize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
 	stripesize = btrfs_super_stripesize(disk_super);
-	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
 	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
 
 	/*
@@ -2513,7 +2519,7 @@ int open_ctree(struct super_block *sb,
 	 * extent buffers for the same range. It leads to corruptions
 	 */
 	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
-	    (sectorsize != leafsize)) {
+	    (sectorsize != nodesize)) {
 		printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
 			"are not allowed for mixed block groups on %s\n",
 			sb->s_id);
@@ -2576,6 +2582,8 @@ int open_ctree(struct super_block *sb,
 		btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
 	fs_info->endio_raid56_workers =
 		btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+	fs_info->endio_repair_workers =
+		btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
 	fs_info->rmw_workers =
 		btrfs_alloc_workqueue("rmw", flags, max_active, 2);
 	fs_info->endio_write_workers =
@@ -2597,11 +2605,12 @@ int open_ctree(struct super_block *sb,
 	      fs_info->submit_workers && fs_info->flush_workers &&
 	      fs_info->endio_workers && fs_info->endio_meta_workers &&
 	      fs_info->endio_meta_write_workers &&
+	      fs_info->endio_repair_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
 	      fs_info->caching_workers && fs_info->readahead_workers &&
 	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->fixup_workers && fs_info->extent_workers &&
+	      fs_info->extent_workers &&
 	      fs_info->qgroup_rescan_workers)) {
 		err = -ENOMEM;
 		goto fail_sb_buffer;
@@ -2612,7 +2621,6 @@ int open_ctree(struct super_block *sb,
 			    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
 	tree_root->nodesize = nodesize;
-	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
 	tree_root->stripesize = stripesize;
 
@@ -2639,16 +2647,14 @@ int open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	blocksize = btrfs_level_size(tree_root,
-				     btrfs_super_chunk_root_level(disk_super));
 	generation = btrfs_super_chunk_root_generation(disk_super);
 
-	__setup_root(nodesize, leafsize, sectorsize, stripesize,
-		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+	__setup_root(nodesize, sectorsize, stripesize, chunk_root,
+		     fs_info, BTRFS_CHUNK_TREE_OBJECTID);
 
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
-					   blocksize, generation);
+					   generation);
 	if (!chunk_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
@@ -2681,13 +2687,11 @@ int open_ctree(struct super_block *sb,
 	}
 
retry_root_backup:
-	blocksize = btrfs_level_size(tree_root,
-				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
 
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
-					  blocksize, generation);
+					  generation);
 	if (!tree_root->node ||
 	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
@@ -2856,9 +2860,6 @@ retry_root_backup:
 			err = -EIO;
 			goto fail_qgroup;
 		}
-		blocksize =
-			btrfs_level_size(tree_root,
-					 btrfs_super_log_root_level(disk_super));
 
 		log_tree_root = btrfs_alloc_root(fs_info);
 		if (!log_tree_root) {
@@ -2866,11 +2867,10 @@ retry_root_backup:
 			goto fail_qgroup;
 		}
 
-		__setup_root(nodesize, leafsize, sectorsize, stripesize,
+		__setup_root(nodesize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
 
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
-						      blocksize,
 						      generation + 1);
 		if (!log_tree_root->node ||
 		    !extent_buffer_uptodate(log_tree_root->node)) {
@@ -2977,6 +2977,8 @@ retry_root_backup:
 		fs_info->update_uuid_tree_gen = 1;
 	}
 
+	fs_info->open = 1;
+
 	return 0;
 
 fail_qgroup:
@@ -3136,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device,
 
 	for (i = 0; i < max_mirrors; i++) {
 		bytenr = btrfs_sb_offset(i);
-		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+		    device->commit_total_bytes)
 			break;
 
 		if (wait) {
@@ -3452,8 +3455,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		btrfs_set_stack_device_generation(dev_item, 0);
 		btrfs_set_stack_device_type(dev_item, dev->type);
 		btrfs_set_stack_device_id(dev_item, dev->devid);
-		btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes);
-		btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used);
+		btrfs_set_stack_device_total_bytes(dev_item,
+						   dev->commit_total_bytes);
+		btrfs_set_stack_device_bytes_used(dev_item,
+						  dev->commit_bytes_used);
 		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
 		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
 		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
@@ -3528,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
 
 static void free_fs_root(struct btrfs_root *root)
 {
-	iput(root->cache_inode);
+	iput(root->ino_cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
@@ -3619,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root)
 	return btrfs_commit_transaction(trans, root);
 }
 
-int close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -3685,6 +3690,7 @@ int close_ctree(struct btrfs_root *root)
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	btrfs_stop_all_workers(fs_info);
 
+	fs_info->open = 0;
 	free_root_pointers(fs_info, 1);
 
 	iput(fs_info->btree_inode);
@@ -3707,8 +3713,6 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_block_rsv(root, root->orphan_block_rsv);
 	root->orphan_block_rsv = NULL;
-
-	return 0;
 }
 
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -3810,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	int ret = 0;
+
+	if (sb->root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n",
+				sb->root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (sb->chunk_root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n",
+				sb->chunk_root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+	if (sb->log_root_level > BTRFS_MAX_LEVEL) {
+		printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n",
+				sb->log_root_level, BTRFS_MAX_LEVEL);
+		ret = -EINVAL;
+	}
+
 	/*
-	 * Placeholder for checks
+	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
+	 * items yet, they'll be verified later. Issue just a warning.
 	 */
-	return 0;
+	if (!IS_ALIGNED(sb->root, 4096))
+		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
+				sb->root);
+	if (!IS_ALIGNED(sb->chunk_root, 4096))
+		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
+				sb->chunk_root);
+	if (!IS_ALIGNED(sb->log_root, 4096))
+		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+				sb->log_root);
+
+	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
+		printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
+				fs_info->fsid, sb->dev_item.fsid);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
+	 * done later
+	 */
+	if (sb->num_devices > (1UL << 31))
+		printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
+				sb->num_devices);
+
+	if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) {
+		printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
+				sb->bytenr, BTRFS_SUPER_INFO_OFFSET);
+		ret = -EINVAL;
+	}
+
+	/*
+	 * The generation is a global counter, we'll trust it more than the others
+	 * but it's still possible that it's the one that's wrong.
+	 */
+	if (sb->generation < sb->chunk_root_generation)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n",
+			sb->generation, sb->chunk_root_generation);
+	if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1)
+		printk(KERN_WARNING
+			"BTRFS: suspicious: generation < cache_generation: %llu < %llu\n",
+			sb->generation, sb->cache_generation);
+
+	return ret;
 }
 
 static void btrfs_error_commit_super(struct btrfs_root *root)
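[Note] These new checks read the superblock fields raw; sb->root_level, sb->num_devices and friends are little-endian on disk, so the kernel normally goes through the endian-safe ctree.h accessors. A sketch of the same level checks written through those accessors; the >= bound here is an editorial assumption (valid levels run 0 to BTRFS_MAX_LEVEL - 1), not something this hunk enforces:

	/* endian-safe variant of the level checks, via the ctree.h helpers */
	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
		printk(KERN_ERR "BTRFS: chunk_root level too big: %d >= %d\n",
				btrfs_super_chunk_root_level(sb),
				BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}

The warning-only checks deliberately do not fail the mount: alignment and device-count oddities are hints of corruption, while the level, UUID and offset mismatches are hard errors.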
@@ -3829,34 +3896,6 @@ static void btrfs_error_commit_super(struct btrfs_root *root)
 	btrfs_cleanup_transaction(root);
 }
 
-static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
-					     struct btrfs_root *root)
-{
-	struct btrfs_inode *btrfs_inode;
-	struct list_head splice;
-
-	INIT_LIST_HEAD(&splice);
-
-	mutex_lock(&root->fs_info->ordered_operations_mutex);
-	spin_lock(&root->fs_info->ordered_root_lock);
-
-	list_splice_init(&t->ordered_operations, &splice);
-	while (!list_empty(&splice)) {
-		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-					 ordered_operations);
-
-		list_del_init(&btrfs_inode->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_root_lock);
-
-		btrfs_invalidate_inodes(btrfs_inode->root);
-
-		spin_lock(&root->fs_info->ordered_root_lock);
-	}
-
-	spin_unlock(&root->fs_info->ordered_root_lock);
-	mutex_unlock(&root->fs_info->ordered_operations_mutex);
-}
-
 static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
 {
 	struct btrfs_ordered_extent *ordered;
@@ -4033,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 
 		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
-			eb = btrfs_find_tree_block(root, start,
-						   root->leafsize);
-			start += root->leafsize;
+			eb = btrfs_find_tree_block(root, start);
+			start += root->nodesize;
 			if (!eb)
 				continue;
 			wait_on_extent_buffer_writeback(eb);
@@ -4093,8 +4131,6 @@ again:
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_root *root)
 {
-	btrfs_destroy_ordered_operations(cur_trans, root);
-
 	btrfs_destroy_delayed_refs(cur_trans, root);
 
 	cur_trans->state = TRANS_STATE_COMMIT_START;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 23ce3ceba0a9..414651821fb3 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,11 +25,12 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
-enum {
+enum btrfs_wq_endio_type {
 	BTRFS_WQ_ENDIO_DATA = 0,
 	BTRFS_WQ_ENDIO_METADATA = 1,
 	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
 	BTRFS_WQ_ENDIO_RAID56 = 3,
+	BTRFS_WQ_ENDIO_DIO_REPAIR = 4,
 };
 
 static inline u64 btrfs_sb_offset(int mirror)
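[Note] Naming the previously anonymous enum lets the btrfs_bio_wq_end_io() prototype further down say which values are legal instead of taking a bare int. C's enum checking is weak, so this is mostly self-documentation plus help for tools such as sparse, but the difference at the call site is clear; the first call below is a hypothetical misuse, not taken from the tree:

	/* old prototype: any integer compiled silently */
	btrfs_bio_wq_end_io(info, bio, 42);
	/* new prototype: the parameter type names the valid constants */
	btrfs_bio_wq_end_io(info, bio, BTRFS_WQ_ENDIO_RAID56);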
@@ -44,9 +45,8 @@ struct btrfs_device;
 struct btrfs_fs_devices;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u32 blocksize, u64 parent_transid);
-int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
-			 u64 parent_transid);
+				      u64 parent_transid);
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
 int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
@@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans,
 int open_ctree(struct super_block *sb,
 	       struct btrfs_fs_devices *fs_devices,
 	       char *options);
-int close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize);
+					    u64 bytenr);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
 				      struct btrfs_key *location);
 int btrfs_init_fs_root(struct btrfs_root *root);
@@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
-			int metadata);
+			enum btrfs_wq_endio_type metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags, u64 bio_offset,
@@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data,
 			 void (*flush_fn)(void *));
 int btrfs_calc_num_tolerated_disk_barrier_failures(
 	struct btrfs_fs_info *fs_info);
+int __init btrfs_end_io_wq_init(void);
+void btrfs_end_io_wq_exit(void);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 41422a3de8ed..37d164540c3a 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 		return ERR_PTR(-ESTALE);
 
 	key.objectid = root_objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	}
 
 	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
 	inode = btrfs_iget(sb, &key, root, NULL);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 813537f362f9..d56589571012 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -491,7 +491,7 @@ next:
 						     key.objectid);
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				last = key.objectid +
-					fs_info->tree_root->leafsize;
+					fs_info->tree_root->nodesize;
 			else
 				last = key.objectid + key.offset;
 
@@ -552,7 +552,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	caching_ctl->block_group = cache;
 	caching_ctl->progress = cache->key.objectid;
 	atomic_set(&caching_ctl->count, 1);
-	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
+	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
+			caching_thread, NULL, NULL);
 
 	spin_lock(&cache->lock);
 	/*
@@ -764,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 	 * different
 	 */
 	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
-		offset = root->leafsize;
+		offset = root->nodesize;
 		metadata = 0;
 	}
 
@@ -798,13 +799,13 @@ again:
 					      path->slots[0]);
 			if (key.objectid == bytenr &&
 			    key.type == BTRFS_EXTENT_ITEM_KEY &&
-			    key.offset == root->leafsize)
+			    key.offset == root->nodesize)
 				ret = 0;
 		}
 		if (ret) {
 			key.objectid = bytenr;
 			key.type = BTRFS_EXTENT_ITEM_KEY;
-			key.offset = root->leafsize;
+			key.offset = root->nodesize;
 			btrfs_release_path(path);
 			goto again;
 		}
@@ -2650,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	num_heads = heads_to_leaves(root, num_heads);
 	if (num_heads > 1)
-		num_bytes += (num_heads - 1) * root->leafsize;
+		num_bytes += (num_heads - 1) * root->nodesize;
 	num_bytes <<= 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
@@ -2749,8 +2750,8 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 	async->sync = 0;
 	init_completion(&async->wait);
 
-	btrfs_init_work(&async->work, delayed_ref_async_start,
-			NULL, NULL);
+	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
+			delayed_ref_async_start, NULL, NULL);
 
 	btrfs_queue_work(root->fs_info->extent_workers, &async->work);
 
@@ -3057,7 +3058,7 @@ out:
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
-			   int full_backref, int inc, int no_quota)
+			   int full_backref, int inc)
 {
 	u64 bytenr;
 	u64 num_bytes;
@@ -3072,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
 			    u64, u64, u64, u64, u64, u64, int);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+
+	if (btrfs_test_is_dummy_root(root))
 		return 0;
-#endif
+
 	ref_root = btrfs_header_owner(buf);
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
@@ -3096,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	for (i = 0; i < nritems; i++) {
 		if (level == 0) {
 			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
+			if (key.type != BTRFS_EXTENT_DATA_KEY)
 				continue;
 			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
@@ -3111,15 +3112,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			key.offset -= btrfs_file_extent_offset(buf, fi);
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, key.objectid,
-					   key.offset, no_quota);
+					   key.offset, 1);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			num_bytes = btrfs_level_size(root, level - 1);
+			num_bytes = root->nodesize;
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, level - 1, 0,
-					   no_quota);
+					   1);
 			if (ret)
 				goto fail;
 		}
@@ -3130,15 +3131,15 @@ fail:
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int no_quota)
+		  struct extent_buffer *buf, int full_backref)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, no_quota);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref, int no_quota)
+		  struct extent_buffer *buf, int full_backref)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, no_quota);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -3493,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
 	if (!found)
 		return -ENOMEM;
 
-	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
 	if (ret) {
 		kfree(found);
 		return ret;
@@ -3586,13 +3587,7 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  */
 static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	/*
-	 * we add in the count of missing devices because we want
-	 * to make sure that any RAID levels on a degraded FS
-	 * continue to be honored.
-	 */
-	u64 num_devices = root->fs_info->fs_devices->rw_devices +
-		root->fs_info->fs_devices->missing_devices;
+	u64 num_devices = root->fs_info->fs_devices->rw_devices;
 	u64 target;
 	u64 tmp;
 
@@ -4348,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 }
 
 static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
-				       struct btrfs_fs_info *fs_info)
+				       struct btrfs_fs_info *fs_info,
+				       int flush_state)
 {
 	u64 used;
 
 	spin_lock(&space_info->lock);
+	/*
+	 * We run out of space and have not got any free space via flush_space,
+	 * so don't bother doing async reclaim.
+	 */
+	if (flush_state > COMMIT_TRANS && space_info->full) {
+		spin_unlock(&space_info->lock);
+		return 0;
+	}
+
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly +
 	       space_info->bytes_may_use;
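[Note] The extra flush_state parameter lets the async reclaim worker stop escalating once a full transaction commit has already been tried against a space_info that is marked full: flushing harder cannot produce space that does not exist. A simplified standalone model of that control flow (userspace C; the state names mirror the kernel's but the predicates are invented for illustration):

#include <stdio.h>

enum { FLUSH_DELALLOC = 1, FLUSH_DELAYED, COMMIT_TRANS };

/* stand-ins for the real space_info predicates */
static int space_full = 1;
static int still_short_of_space(void) { return 1; }

static int need_async_reclaim(int flush_state)
{
	/* past COMMIT_TRANS with a full space_info: give up */
	if (flush_state > COMMIT_TRANS && space_full)
		return 0;
	return still_short_of_space();
}

int main(void)
{
	int flush_state = FLUSH_DELALLOC;

	do {
		printf("flush_space(state=%d)\n", flush_state);
		flush_state++;
		if (!need_async_reclaim(flush_state))
			return 0;	/* reclaimed enough, or hopeless */
	} while (flush_state <= COMMIT_TRANS);

	printf("requeue async reclaim work\n");
	return 0;
}

Without the flush_state check the model above would requeue itself forever on a full filesystem, which is exactly the behavior the hunk below avoids.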
@@ -4385,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
 		flush_space(fs_info->fs_root, space_info, to_reclaim,
 			    to_reclaim, flush_state);
 		flush_state++;
-		if (!btrfs_need_do_async_reclaim(space_info, fs_info))
+		if (!btrfs_need_do_async_reclaim(space_info, fs_info,
+						 flush_state))
 			return;
 	} while (flush_state <= COMMIT_TRANS);
 
-	if (btrfs_need_do_async_reclaim(space_info, fs_info))
+	if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state))
 		queue_work(system_unbound_wq, work);
 }
 
@@ -4507,7 +4513,13 @@ again:
 		space_info->flush = 1;
 	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		used += orig_bytes;
-		if (need_do_async_reclaim(space_info, root->fs_info, used) &&
+		/*
+		 * We will do the space reservation dance during log replay,
+		 * which means we won't have fs_info->fs_root set, so don't do
+		 * the async reclaim as we will panic.
+		 */
+		if (!root->fs_info->log_root_recovering &&
+		    need_do_async_reclaim(space_info, root->fs_info, used) &&
 		    !work_busy(&root->fs_info->async_reclaim_work))
 			queue_work(system_unbound_wq,
 				   &root->fs_info->async_reclaim_work);
@@ -4844,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	if (num_bytes * 3 > meta_used)
 		num_bytes = div64_u64(meta_used, 3);
 
-	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+	return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
 }
 
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4993,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 
 	if (root->fs_info->quota_enabled) {
 		/* One for parent inode, two for dir entries */
-		num_bytes = 3 * root->leafsize;
+		num_bytes = 3 * root->nodesize;
 		ret = btrfs_qgroup_reserve(root, num_bytes);
 		if (ret)
 			return ret;
@@ -5181,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 	if (root->fs_info->quota_enabled) {
 		ret = btrfs_qgroup_reserve(root, num_bytes +
-					   nr_extents * root->leafsize);
+					   nr_extents * root->nodesize);
 		if (ret)
 			goto out_fail;
 	}
@@ -5190,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	if (unlikely(ret)) {
 		if (root->fs_info->quota_enabled)
 			btrfs_qgroup_free(root, num_bytes +
-					  nr_extents * root->leafsize);
+					  nr_extents * root->nodesize);
 		goto out_fail;
 	}
 
@@ -5306,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 				      btrfs_ino(inode), to_free, 0);
 	if (root->fs_info->quota_enabled) {
 		btrfs_qgroup_free(root, num_bytes +
-					dropped * root->leafsize);
+					dropped * root->nodesize);
 	}
 
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
@@ -5427,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root,
 			spin_unlock(&cache->space_info->lock);
 		} else {
 			old_val -= num_bytes;
+
+			/*
+			 * No longer have used bytes in this block group, queue
+			 * it for deletion.
+			 */
+			if (old_val == 0) {
+				spin_lock(&info->unused_bgs_lock);
+				if (list_empty(&cache->bg_list)) {
+					btrfs_get_block_group(cache);
+					list_add_tail(&cache->bg_list,
+						      &info->unused_bgs);
+				}
+				spin_unlock(&info->unused_bgs_lock);
+			}
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->pinned += num_bytes;
 			cache->space_info->bytes_pinned += num_bytes;
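[Note] update_block_group() queues a now-empty block group for deletion at most once by using the group's embedded list node as the "already queued" flag: an initialized-empty list_head means not on the unused_bgs list, and the extra reference pins the group while it waits for the cleaner thread (the btrfs_delete_unused_bgs() call added above). A self-contained userspace rendering of that idiom, with minimal list macros standing in for <linux/list.h>:

#include <stdio.h>

/* minimal doubly linked list, after the kernel's <linux/list.h> */
struct list_head { struct list_head *next, *prev; };
#define LIST_HEAD_INIT(n) { &(n), &(n) }
static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }
static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

struct block_group { int refs; struct list_head bg_list; };
static struct list_head unused_bgs = LIST_HEAD_INIT(unused_bgs);

/* queue at most once; hold a reference while the group sits on the list */
static void maybe_queue_unused(struct block_group *bg)
{
	/* the kernel version takes unused_bgs_lock around this test */
	if (list_empty(&bg->bg_list)) {
		bg->refs++;
		list_add_tail(&bg->bg_list, &unused_bgs);
	}
}

int main(void)
{
	struct block_group bg = { .refs = 1 };

	INIT_LIST_HEAD(&bg.bg_list);
	maybe_queue_unused(&bg);
	maybe_queue_unused(&bg);	/* second call is a no-op */
	printf("refs=%d queued=%d\n", bg.refs, !list_empty(&bg.bg_list));
	return 0;
}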
@@ -6238,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+	if (btrfs_test_is_dummy_root(root))
 		return 0;
-#endif
+
 	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
 
 	/*
@@ -6268,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root,
-			struct btrfs_block_group_cache *cache,
-			u64 val, u64 num_bytes)
-{
-	u64 ret = ALIGN(val, root->stripesize);
-	return ret;
-}
-
 /*
  * when we wait for progress in the block group caching, it's because
  * our allocation attempt failed at least once.  So, we must sleep
@@ -6469,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
 	bool have_caching_bg = false;
 
 	WARN_ON(num_bytes < root->sectorsize);
-	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+	ins->type = BTRFS_EXTENT_ITEM_KEY;
 	ins->objectid = 0;
 	ins->offset = 0;
 
@@ -6756,8 +6773,7 @@ unclustered_alloc:
 		goto loop;
 	}
checks:
-	search_start = stripe_align(root, block_group,
-				    offset, num_bytes);
+	search_start = ALIGN(offset, root->stripesize);
 
 	/* move on to the next group */
 	if (search_start + num_bytes >
@@ -7082,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path) {
 		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
-						   root->leafsize);
+						   root->nodesize);
 		return -ENOMEM;
 	}
 
@@ -7091,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 					 ins, size);
 	if (ret) {
 		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
-						   root->leafsize);
+						   root->nodesize);
 		btrfs_free_path(path);
 		return ret;
 	}
@@ -7106,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 
 	if (skinny_metadata) {
 		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
-		num_bytes = root->leafsize;
+		num_bytes = root->nodesize;
 	} else {
 		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
 		btrfs_set_tree_block_key(leaf, block_info, key);
@@ -7136,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 		return ret;
 	}
 
-	ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+	ret = update_block_group(root, ins->objectid, root->nodesize, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
 			  ins->objectid, ins->offset);
 		BUG();
 	}
 
-	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize);
+	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
 	return ret;
 }
 
@@ -7218,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	btrfs_set_buffer_uptodate(buf);
 
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+		buf->log_index = root->log_transid % 2;
 		/*
 		 * we allow two log transactions at a time, use different
 		 * EXTENT bit to differentiate dirty pages.
 		 */
-		if (root->log_transid % 2 == 0)
+		if (buf->log_index == 0)
 			set_extent_dirty(&root->dirty_log_pages, buf->start,
 					buf->start + buf->len - 1, GFP_NOFS);
 		else
 			set_extent_new(&root->dirty_log_pages, buf->start,
 					buf->start + buf->len - 1, GFP_NOFS);
 	} else {
+		buf->log_index = -1;
 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
 				 buf->start + buf->len - 1, GFP_NOFS);
 	}
@@ -7305,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
  *
  * returns the tree buffer or NULL.
  */
-struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root, u32 blocksize,
+struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
 					     u64 parent, u64 root_objectid,
 					     struct btrfs_disk_key *key, int level,
 					     u64 hint, u64 empty_size)
@@ -7316,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	struct extent_buffer *buf;
 	u64 flags = 0;
 	int ret;
+	u32 blocksize = root->nodesize;
 	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
 						 SKINNY_METADATA);
 
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-	if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) {
+	if (btrfs_test_is_dummy_root(root)) {
 		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
 					    blocksize, level);
 		if (!IS_ERR(buf))
 			root->alloc_bytenr += blocksize;
 		return buf;
 	}
-#endif
+
 	block_rsv = use_block_rsv(trans, root, blocksize);
 	if (IS_ERR(block_rsv))
 		return ERR_CAST(block_rsv);
@@ -7422,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 
 	eb = path->nodes[wc->level];
 	nritems = btrfs_header_nritems(eb);
-	blocksize = btrfs_level_size(root, wc->level - 1);
+	blocksize = root->nodesize;
 
 	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
 		if (nread >= wc->reada_count)
@@ -7469,15 +7487,224 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
 			continue;
 		}
 reada:
-		ret = readahead_tree_block(root, bytenr, blocksize,
-					   generation);
-		if (ret)
-			break;
+		readahead_tree_block(root, bytenr, blocksize);
 		nread++;
 	}
 	wc->reada_slot = slot;
 }

+static int account_leaf_items(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct extent_buffer *eb)
+{
+	int nr = btrfs_header_nritems(eb);
+	int i, extent_type, ret;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	u64 bytenr, num_bytes;
+
+	for (i = 0; i < nr; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		/* filter out non qgroup-accountable extents */
+		extent_type = btrfs_file_extent_type(eb, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+
+		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (!bytenr)
+			continue;
+
+		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+		ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+					      root->objectid,
+					      bytenr, num_bytes,
+					      BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have its slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root nodes slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+				struct btrfs_path *path, int root_level)
+{
+	int level = 0;
+	int nr, slot;
+	struct extent_buffer *eb;
+
+	if (root_level == 0)
+		return 1;
+
+	while (level <= root_level) {
+		eb = path->nodes[level];
+		nr = btrfs_header_nritems(eb);
+		path->slots[level]++;
+		slot = path->slots[level];
+		if (slot >= nr || level == 0) {
+			/*
+			 * Don't free the root - we will detect this
+			 * condition after our loop and return a
+			 * positive value for caller to stop walking the tree.
+			 */
+			if (level != root_level) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+
+				free_extent_buffer(eb);
+				path->nodes[level] = NULL;
+				path->slots[level] = 0;
+			}
+		} else {
+			/*
+			 * We have a valid slot to walk back down
+			 * from. Stop here so caller can process these
+			 * new nodes.
+			 */
+			break;
+		}
+
+		level++;
+	}
+
+	eb = path->nodes[root_level];
+	if (path->slots[root_level] >= btrfs_header_nritems(eb))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * root_eb is the subtree root and is locked before this function is called.
+ */
+static int account_shared_subtree(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct extent_buffer *root_eb,
+				  u64 root_gen,
+				  int root_level)
+{
+	int ret = 0;
+	int level;
+	struct extent_buffer *eb = root_eb;
+	struct btrfs_path *path = NULL;
+
+	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
+	BUG_ON(root_eb == NULL);
+
+	if (!root->fs_info->quota_enabled)
+		return 0;
+
+	if (!extent_buffer_uptodate(root_eb)) {
+		ret = btrfs_read_buffer(root_eb, root_gen);
+		if (ret)
+			goto out;
+	}
+
+	if (root_level == 0) {
+		ret = account_leaf_items(trans, root, root_eb);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Walk down the tree. Missing extent blocks are filled in as
+	 * we go. Metadata is accounted every time we read a new
+	 * extent block.
+	 *
+	 * When we reach a leaf, we account for file extent items in it,
+	 * walk back up the tree (adjusting slot pointers as we go)
+	 * and restart the search process.
+	 */
+	extent_buffer_get(root_eb); /* For path */
+	path->nodes[root_level] = root_eb;
+	path->slots[root_level] = 0;
+	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
+walk_down:
+	level = root_level;
+	while (level >= 0) {
+		if (path->nodes[level] == NULL) {
+			int parent_slot;
+			u64 child_gen;
+			u64 child_bytenr;
+
+			/* We need to get child blockptr/gen from
+			 * parent before we can read it. */
+			eb = path->nodes[level + 1];
+			parent_slot = path->slots[level + 1];
+			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+			child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+
+			eb = read_tree_block(root, child_bytenr, child_gen);
+			if (!eb || !extent_buffer_uptodate(eb)) {
+				ret = -EIO;
+				goto out;
+			}
+
+			path->nodes[level] = eb;
+			path->slots[level] = 0;
+
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
+
+			ret = btrfs_qgroup_record_ref(trans, root->fs_info,
+						root->objectid,
+						child_bytenr,
+						root->nodesize,
+						BTRFS_QGROUP_OPER_SUB_SUBTREE,
+						0);
+			if (ret)
+				goto out;
+
+		}
+
+		if (level == 0) {
+			ret = account_leaf_items(trans, root, path->nodes[level]);
+			if (ret)
+				goto out;
+
+			/* Nonzero return here means we completed our search */
+			ret = adjust_slots_upwards(root, path, root_level);
+			if (ret)
+				break;
+
+			/* Restart search with new slots */
+			goto walk_down;
+		}
+
+		level--;
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 /*
  * helper to process tree block while walking down the tree.
  *
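The slot-adjustment walk added above is self-contained enough to model outside the kernel. A minimal user-space sketch of the same idea, assuming a toy array-backed node type (struct toy_node and the function name are illustrative, not btrfs types):

#include <stddef.h>

struct toy_node {
	int nritems;			/* number of occupied slots */
	struct toy_node *child[16];	/* illustrative fan-out */
};

/* Returns 1 when the root's slot runs past its items (search complete). */
static int toy_adjust_slots_upwards(struct toy_node **nodes, int *slots,
				    int root_level)
{
	int level = 0;

	if (root_level == 0)
		return 1;

	while (level <= root_level) {
		slots[level]++;
		if (slots[level] >= nodes[level]->nritems || level == 0) {
			if (level != root_level) {
				nodes[level] = NULL;	/* force a re-read */
				slots[level] = 0;
			}
		} else {
			break;	/* valid slot to walk back down from */
		}
		level++;
	}
	return slots[root_level] >= nodes[root_level]->nritems;
}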
@@ -7532,9 +7759,9 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 	/* wc->stage == UPDATE_BACKREF */
 	if (!(wc->flags[level] & flag)) {
 		BUG_ON(!path->locks[level]);
-		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+		ret = btrfs_inc_ref(trans, root, eb, 1);
 		BUG_ON(ret); /* -ENOMEM */
-		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+		ret = btrfs_dec_ref(trans, root, eb, 0);
 		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
 						  eb->len, flag,
@@ -7581,6 +7808,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	int level = wc->level;
 	int reada = 0;
 	int ret = 0;
+	bool need_account = false;

 	generation = btrfs_node_ptr_generation(path->nodes[level],
 					       path->slots[level]);
@@ -7596,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	}

 	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
-	blocksize = btrfs_level_size(root, level - 1);
+	blocksize = root->nodesize;

-	next = btrfs_find_tree_block(root, bytenr, blocksize);
+	next = btrfs_find_tree_block(root, bytenr);
 	if (!next) {
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 		if (!next)
@@ -7626,6 +7854,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,

 	if (wc->stage == DROP_REFERENCE) {
 		if (wc->refs[level - 1] > 1) {
+			need_account = true;
 			if (level == 1 &&
 			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
 				goto skip;
@@ -7659,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 	if (!next) {
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
-		next = read_tree_block(root, bytenr, blocksize, generation);
+		next = read_tree_block(root, bytenr, generation);
 		if (!next || !extent_buffer_uptodate(next)) {
 			free_extent_buffer(next);
 			return -EIO;
@@ -7689,6 +7918,16 @@ skip:
 		parent = 0;
 	}

+	if (need_account) {
+		ret = account_shared_subtree(trans, root, next,
+					     generation, level - 1);
+		if (ret) {
+			printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+				"%d accounting shared subtree. Quota "
+				"is out of sync, rescan required.\n",
+				root->fs_info->sb->s_id, ret);
+		}
+	}
 	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
 				root->root_key.objectid, level - 1, 0, 0);
 	BUG_ON(ret); /* -ENOMEM */
@@ -7769,12 +8008,17 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 	if (wc->refs[level] == 1) {
 		if (level == 0) {
 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-				ret = btrfs_dec_ref(trans, root, eb, 1,
-						    wc->for_reloc);
+				ret = btrfs_dec_ref(trans, root, eb, 1);
 			else
-				ret = btrfs_dec_ref(trans, root, eb, 0,
-						    wc->for_reloc);
+				ret = btrfs_dec_ref(trans, root, eb, 0);
 			BUG_ON(ret); /* -ENOMEM */
+			ret = account_leaf_items(trans, root, eb);
+			if (ret) {
+				printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+					"%d accounting leaf items. Quota "
+					"is out of sync, rescan required.\n",
+					root->fs_info->sb->s_id, ret);
+			}
 		}
 		/* make block locked assertion in clean_tree_block happy */
 		if (!path->locks[level] &&
@@ -7900,6 +8144,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	int level;
 	bool root_dropped = false;

+	btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		err = -ENOMEM;
@@ -8025,6 +8271,24 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 			goto out_end_trans;
 		}

+		/*
+		 * Qgroup update accounting is run from
+		 * delayed ref handling. This usually works
+		 * out because delayed refs are normally the
+		 * only way qgroup updates are added. However,
+		 * we may have added updates during our tree
+		 * walk so run qgroups here to make sure we
+		 * don't lose any updates.
+		 */
+		ret = btrfs_delayed_qgroup_accounting(trans,
+						      root->fs_info);
+		if (ret)
+			printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+					   "running qgroup updates "
+					   "during snapshot delete. "
+					   "Quota is out of sync, "
+					   "rescan required.\n", ret);
+
 		btrfs_end_transaction_throttle(trans, tree_root);
 		if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
 			pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8078,6 +8342,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
 	}
 	root_dropped = true;
 out_end_trans:
+	ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
+	if (ret)
+		printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
+				   "running qgroup updates "
+				   "during snapshot delete. "
+				   "Quota is out of sync, "
+				   "rescan required.\n", ret);
+
 	btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
 	kfree(wc);
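Both exit paths above repeat the same flush-and-warn step. Its shape, pulled out as a hedged sketch (the helper name is illustrative; the patch itself open-codes this twice rather than factoring it):

static void qgroup_flush_or_warn(struct btrfs_trans_handle *trans,
				 struct btrfs_fs_info *fs_info)
{
	int ret = btrfs_delayed_qgroup_accounting(trans, fs_info);

	/* Quota errors here are survivable: warn and require a rescan. */
	if (ret)
		printk_ratelimited(KERN_ERR
			"BTRFS: Failure %d running qgroup updates, rescan required.\n",
			ret);
}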
@@ -8181,13 +8453,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 	if (stripped)
 		return extended_to_chunk(stripped);

-	/*
-	 * we add in the count of missing devices because we want
-	 * to make sure that any RAID levels on a degraded FS
-	 * continue to be honored.
-	 */
-	num_devices = root->fs_info->fs_devices->rw_devices +
-		root->fs_info->fs_devices->missing_devices;
+	num_devices = root->fs_info->fs_devices->rw_devices;

 	stripped = BTRFS_BLOCK_GROUP_RAID0 |
 		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
@@ -8605,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 	}
 	up_write(&info->commit_root_sem);

+	spin_lock(&info->unused_bgs_lock);
+	while (!list_empty(&info->unused_bgs)) {
+		block_group = list_first_entry(&info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		list_del_init(&block_group->bg_list);
+		btrfs_put_block_group(block_group);
+	}
+	spin_unlock(&info->unused_bgs_lock);
+
 	spin_lock(&info->block_group_cache_lock);
 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
 		block_group = rb_entry(n, struct btrfs_block_group_cache,
@@ -8739,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	init_rwsem(&cache->data_rwsem);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->new_bg_list);
+	INIT_LIST_HEAD(&cache->bg_list);
 	btrfs_init_free_space_ctl(cache);

 	return cache;
@@ -8761,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	root = info->extent_root;
 	key.objectid = 0;
 	key.offset = 0;
-	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
+	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -8880,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		__link_block_group(space_info, cache);

 		set_avail_alloc_bits(root->fs_info, cache->flags);
-		if (btrfs_chunk_readonly(root, cache->key.objectid))
+		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
 			set_block_group_ro(cache, 1);
+		} else if (btrfs_block_group_used(&cache->item) == 0) {
+			spin_lock(&info->unused_bgs_lock);
+			/* Should always be true but just in case. */
+			if (list_empty(&cache->bg_list)) {
+				btrfs_get_block_group(cache);
+				list_add_tail(&cache->bg_list,
+					      &info->unused_bgs);
+			}
+			spin_unlock(&info->unused_bgs_lock);
+		}
 	}

 	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -8922,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret = 0;

-	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
-				 new_bg_list) {
-		list_del_init(&block_group->new_bg_list);
-
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+		list_del_init(&block_group->bg_list);
 		if (ret)
 			continue;

@@ -9011,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,

 	__link_block_group(cache->space_info, cache);

-	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
+	list_add_tail(&cache->bg_list, &trans->new_bgs);

 	set_avail_alloc_bits(extent_root->fs_info, type);

@@ -9165,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,

 	memcpy(&key, &block_group->key, sizeof(key));

-	btrfs_clear_space_info_full(root->fs_info);
-
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);

@@ -9182,6 +9464,101 @@ out:
 	return ret;
 }

+/*
+ * Process the unused_bgs list and remove any that don't have any allocated
+ * space inside of them.
+ */
+void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_space_info *space_info;
+	struct btrfs_root *root = fs_info->extent_root;
+	struct btrfs_trans_handle *trans;
+	int ret = 0;
+
+	if (!fs_info->open)
+		return;
+
+	spin_lock(&fs_info->unused_bgs_lock);
+	while (!list_empty(&fs_info->unused_bgs)) {
+		u64 start, end;
+
+		block_group = list_first_entry(&fs_info->unused_bgs,
+					       struct btrfs_block_group_cache,
+					       bg_list);
+		space_info = block_group->space_info;
+		list_del_init(&block_group->bg_list);
+		if (ret || btrfs_mixed_space_info(space_info)) {
+			btrfs_put_block_group(block_group);
+			continue;
+		}
+		spin_unlock(&fs_info->unused_bgs_lock);
+
+		/* Don't want to race with allocators so take the groups_sem */
+		down_write(&space_info->groups_sem);
+		spin_lock(&block_group->lock);
+		if (block_group->reserved ||
+		    btrfs_block_group_used(&block_group->item) ||
+		    block_group->ro) {
+			/*
+			 * We want to bail if we made new allocations or have
+			 * outstanding allocations in this block group.  We do
+			 * the ro check in case balance is currently acting on
+			 * this block group.
+			 */
+			spin_unlock(&block_group->lock);
+			up_write(&space_info->groups_sem);
+			goto next;
+		}
+		spin_unlock(&block_group->lock);
+
+		/* We don't want to force the issue, only flip if it's ok. */
+		ret = set_block_group_ro(block_group, 0);
+		up_write(&space_info->groups_sem);
+		if (ret < 0) {
+			ret = 0;
+			goto next;
+		}
+
+		/*
+		 * Want to do this before we do anything else so we can recover
+		 * properly if we fail to join the transaction.
+		 */
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			btrfs_set_block_group_rw(root, block_group);
+			ret = PTR_ERR(trans);
+			goto next;
+		}
+
+		/*
+		 * We could have pending pinned extents for this block group,
+		 * just delete them, we don't care about them anymore.
+		 */
+		start = block_group->key.objectid;
+		end = start + block_group->key.offset - 1;
+		clear_extent_bits(&fs_info->freed_extents[0], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+		clear_extent_bits(&fs_info->freed_extents[1], start, end,
+				  EXTENT_DIRTY, GFP_NOFS);
+
+		/* Reset pinned so btrfs_put_block_group doesn't complain */
+		block_group->pinned = 0;
+
+		/*
+		 * Btrfs_remove_chunk will abort the transaction if things go
+		 * horribly wrong.
+		 */
+		ret = btrfs_remove_chunk(trans, root,
+					 block_group->key.objectid);
+		btrfs_end_transaction(trans, root);
+next:
+		btrfs_put_block_group(block_group);
+		spin_lock(&fs_info->unused_bgs_lock);
+	}
+	spin_unlock(&fs_info->unused_bgs_lock);
+}
+
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_space_info *space_info;
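btrfs_delete_unused_bgs() uses the standard drain-a-list-with-blocking-work shape: pop one entry under the spinlock, drop the lock for anything that can sleep, then re-take it before testing the list again. Reduced to its skeleton (do_blocking_work() is an illustrative stand-in for the transaction join and chunk removal):

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		bg = list_first_entry(&fs_info->unused_bgs,
				      struct btrfs_block_group_cache, bg_list);
		list_del_init(&bg->bg_list);
		spin_unlock(&fs_info->unused_bgs_lock);	/* may sleep below */

		do_blocking_work(bg);		/* join transaction, remove chunk */
		btrfs_put_block_group(bg);	/* drop the list's reference */

		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);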
@@ -9313,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)

 int btrfs_start_nocow_write(struct btrfs_root *root)
 {
-	if (unlikely(atomic_read(&root->will_be_snapshoted)))
+	if (atomic_read(&root->will_be_snapshoted))
 		return 0;

 	percpu_counter_inc(&root->subv_writers->counter);
@@ -9321,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
 	 * Make sure counter is updated before we check for snapshot creation.
 	 */
 	smp_mb();
-	if (unlikely(atomic_read(&root->will_be_snapshoted))) {
+	if (atomic_read(&root->will_be_snapshoted)) {
 		btrfs_end_nocow_write(root);
 		return 0;
 	}
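The increment-then-recheck in btrfs_start_nocow_write() is the classic store/load ordering pattern: the writer publishes its counter before re-reading the snapshot flag, while the snapshot side sets the flag before reading the counter, so at least one side is guaranteed to observe the other. The writer half in isolation (a sketch; it assumes a matching barrier on the snapshot side, which this hunk does not show):

	if (atomic_read(&root->will_be_snapshoted))
		return 0;				/* take the slow path */

	percpu_counter_inc(&root->subv_writers->counter);
	smp_mb();	/* counter update visible before the re-check */
	if (atomic_read(&root->will_be_snapshoted)) {
		btrfs_end_nocow_write(root);		/* undo the count */
		return 0;
	}
	return 1;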
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a389820d158b..bf3f424e0013 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
 static struct bio_set *btrfs_bioset;

+static inline bool extent_state_in_tree(const struct extent_state *state)
+{
+	return !RB_EMPTY_NODE(&state->rb_node);
+}
+
 #ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
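extent_state_in_tree() leans on the rbtree convention from <linux/rbtree.h> that RB_CLEAR_NODE() marks a node as detached and RB_EMPTY_NODE() tests for that, which is what lets the rest of this patch delete the state->tree back-pointer. The lifecycle, as the following hunks apply it:

	RB_CLEAR_NODE(&state->rb_node);		/* at allocation: detached */

	rb_erase(&state->rb_node, &tree->state);/* on removal from the tree... */
	RB_CLEAR_NODE(&state->rb_node);		/* ...re-mark as detached */

	if (extent_state_in_tree(state)) {
		/* still linked: rb_erase() and friends are safe */
	}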
@@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void)

 	while (!list_empty(&states)) {
 		state = list_entry(states.next, struct extent_state, leak_list);
-		printk(KERN_ERR "BTRFS: state leak: start %llu end %llu "
-		       "state %lu in tree %p refs %d\n",
-		       state->start, state->end, state->state, state->tree,
+		pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+		       state->start, state->end, state->state,
+		       extent_state_in_tree(state),
 		       atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
@@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 		return state;
 	state->state = 0;
 	state->private = 0;
-	state->tree = NULL;
+	RB_CLEAR_NODE(&state->rb_node);
 	btrfs_leak_debug_add(&state->leak_list, &states);
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
@@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state)
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-		WARN_ON(state->tree);
+		WARN_ON(extent_state_in_tree(state));
 		btrfs_leak_debug_del(&state->leak_list);
 		trace_free_extent_state(state, _RET_IP_);
 		kmem_cache_free(extent_state_cache, state);
@@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			merge_cb(tree, state, other);
 			state->start = other->start;
-			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
 			free_extent_state(other);
 		}
 	}
@@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree,
 		    other->state == state->state) {
 			merge_cb(tree, state, other);
 			state->end = other->end;
-			other->tree = NULL;
 			rb_erase(&other->rb_node, &tree->state);
+			RB_CLEAR_NODE(&other->rb_node);
 			free_extent_state(other);
 		}
 	}
@@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree,
 		       found->start, found->end, start, end);
 		return -EEXIST;
 	}
-	state->tree = tree;
 	merge_state(tree, state);
 	return 0;
 }
@@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 		free_extent_state(prealloc);
 		return -EEXIST;
 	}
-	prealloc->tree = tree;
 	return 0;
 }

@@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 		wake_up(&state->wq);
 	if (state->state == 0) {
 		next = next_state(state);
-		if (state->tree) {
+		if (extent_state_in_tree(state)) {
 			rb_erase(&state->rb_node, &tree->state);
-			state->tree = NULL;
+			RB_CLEAR_NODE(&state->rb_node);
 			free_extent_state(state);
 		} else {
 			WARN_ON(1);
@@ -606,8 +609,8 @@ again:
 		cached_state = NULL;
 	}

-	if (cached && cached->tree && cached->start <= start &&
-	    cached->end > start) {
+	if (cached && extent_state_in_tree(cached) &&
+	    cached->start <= start && cached->end > start) {
 		if (clear)
 			atomic_dec(&cached->refs);
 		state = cached;
@@ -843,7 +846,7 @@ again:
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->start <= start && state->end > start &&
-		    state->tree) {
+		    extent_state_in_tree(state)) {
 			node = &state->rb_node;
 			goto hit_next;
 		}
@@ -1069,7 +1072,7 @@ again:
 	if (cached_state && *cached_state) {
 		state = *cached_state;
 		if (state->start <= start && state->end > start &&
-		    state->tree) {
+		    extent_state_in_tree(state)) {
 			node = &state->rb_node;
 			goto hit_next;
 		}
@@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 	spin_lock(&tree->lock);
 	if (cached_state && *cached_state) {
 		state = *cached_state;
-		if (state->end == start - 1 && state->tree) {
+		if (state->end == start - 1 && extent_state_in_tree(state)) {
 			n = rb_next(&state->rb_node);
 			while (n) {
 				state = rb_entry(n, struct extent_state,
@@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	int bitset = 0;

 	spin_lock(&tree->lock);
-	if (cached && cached->tree && cached->start <= start &&
+	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
 	    cached->end > start)
 		node = &cached->rb_node;
 	else
@@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 	SetPageUptodate(page);
 }

-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int this_mirror;
-	int failed_mirror;
-	int in_validation;
-};
-
-static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
-			   int did_repair)
+int free_io_failure(struct inode *inode, struct io_failure_record *rec)
 {
 	int ret;
 	int err = 0;
@@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
 * currently, there can be no more than two copies of every data bit. thus,
 * exactly one rewrite is required.
 */
-int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
-		      u64 length, u64 logical, struct page *page,
-		      int mirror_num)
+int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
+		      struct page *page, unsigned int pg_offset, int mirror_num)
 {
+	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct bio *bio;
 	struct btrfs_device *dev;
 	u64 map_length = 0;
@@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio_add_page(bio, page, length, start - page_offset(page));
+	bio_add_page(bio, page, length, pg_offset);

 	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
 		/* try to remap that extent elsewhere? */
@@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	}

 	printk_ratelimited_in_rcu(KERN_INFO
-				  "BTRFS: read error corrected: ino %lu off %llu "
-				  "(dev %s sector %llu)\n", page->mapping->host->i_ino,
-				  start, rcu_str_deref(dev->name), sector);
-
+				  "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+				  btrfs_ino(inode), start,
+				  rcu_str_deref(dev->name), sector);
 	bio_put(bio);
 	return 0;
 }
@@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 		return -EROFS;

 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
-		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
-					start, p, mirror_num);
+		struct page *p = eb->pages[i];
+
+		ret = repair_io_failure(root->fs_info->btree_inode, start,
+					PAGE_CACHE_SIZE, start, p,
+					start - page_offset(p), mirror_num);
 		if (ret)
 			break;
 		start += PAGE_CACHE_SIZE;
@@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 * each time an IO finishes, we do a fast check in the IO failure tree
 * to see if we need to process or clean up an io_failure_record
 */
-static int clean_io_failure(u64 start, struct page *page)
+int clean_io_failure(struct inode *inode, u64 start, struct page *page,
+		     unsigned int pg_offset)
 {
 	u64 private;
 	u64 private_failure;
 	struct io_failure_record *failrec;
-	struct inode *inode = page->mapping->host;
 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
 	struct extent_state *state;
 	int num_copies;
-	int did_repair = 0;
 	int ret;

 	private = 0;
@@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page)
 		/* there was no real error, just free the record */
 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
 			 failrec->start);
-		did_repair = 1;
 		goto out;
 	}
 	if (fs_info->sb->s_flags & MS_RDONLY)
@@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page)
 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
 					      failrec->len);
 		if (num_copies > 1) {
-			ret = repair_io_failure(fs_info, start, failrec->len,
-						failrec->logical, page,
-						failrec->failed_mirror);
-			did_repair = !ret;
+			repair_io_failure(inode, start, failrec->len,
+					  failrec->logical, page,
+					  pg_offset, failrec->failed_mirror);
 		}
-		ret = 0;
 	}

 out:
-	if (!ret)
-		ret = free_io_failure(inode, failrec, did_repair);
+	free_io_failure(inode, failrec);

-	return ret;
+	return 0;
 }

 /*
- * this is a generic handler for readpage errors (default
- * readpage_io_failed_hook). if other copies exist, read those and write back
- * good data to the failed position. does not investigate in remapping the
- * failed extent elsewhere, hoping the device will be smart enough to do this as
- * needed
+ * Can be called when
+ * - hold extent lock
+ * - under ordered extent
+ * - the inode is freeing
  */
+void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
+{
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct io_failure_record *failrec;
+	struct extent_state *state, *next;

-static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
-			      struct page *page, u64 start, u64 end,
-			      int failed_mirror)
+	if (RB_EMPTY_ROOT(&failure_tree->state))
+		return;
+
+	spin_lock(&failure_tree->lock);
+	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
+	while (state) {
+		if (state->start > end)
+			break;
+
+		ASSERT(state->end <= end);
+
+		next = next_state(state);
+
+		failrec = (struct io_failure_record *)state->private;
+		free_extent_state(state);
+		kfree(failrec);
+
+		state = next;
+	}
+	spin_unlock(&failure_tree->lock);
+}
+
+int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
+				struct io_failure_record **failrec_ret)
 {
-	struct io_failure_record *failrec = NULL;
+	struct io_failure_record *failrec;
 	u64 private;
 	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	struct btrfs_io_bio *btrfs_failed_bio;
-	struct btrfs_io_bio *btrfs_bio;
-	int num_copies;
 	int ret;
-	int read_mode;
 	u64 logical;

-	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
-
 	ret = get_state_private(failure_tree, start, &private);
 	if (ret) {
 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
 		if (!failrec)
 			return -ENOMEM;
+
 		failrec->start = start;
 		failrec->len = end - start + 1;
 		failrec->this_mirror = 0;
@@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			em = NULL;
 		}
 		read_unlock(&em_tree->lock);
-
 		if (!em) {
 			kfree(failrec);
 			return -EIO;
 		}
+
 		logical = start - em->start;
 		logical = em->block_start + logical;
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
@@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 			extent_set_compress_type(&failrec->bio_flags,
 						 em->compress_type);
 		}
-		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
-			 "len=%llu\n", logical, start, failrec->len);
+
+		pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n",
+			 logical, start, failrec->len);
+
 		failrec->logical = logical;
 		free_extent_map(em);

@@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		}
 	} else {
 		failrec = (struct io_failure_record *)(unsigned long)private;
-		pr_debug("bio_readpage_error: (found) logical=%llu, "
-			 "start=%llu, len=%llu, validation=%d\n",
+		pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n",
 			 failrec->logical, failrec->start, failrec->len,
 			 failrec->in_validation);
 		/*
@@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		 * clean_io_failure() clean all those errors at once.
 		 */
 	}
+
+	*failrec_ret = failrec;
+
+	return 0;
+}
+
+int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
+			   struct io_failure_record *failrec, int failed_mirror)
+{
+	int num_copies;
+
 	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
 				      failrec->logical, failrec->len);
 	if (num_copies == 1) {
@@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		 * all the retry and error correction code that follows. no
 		 * matter what the error is, it is very likely to persist.
 		 */
-		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+		pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
+		return 0;
 	}

 	/*
@@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		BUG_ON(failrec->in_validation);
 		failrec->in_validation = 1;
 		failrec->this_mirror = failed_mirror;
-		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
 	} else {
 		/*
 		 * we're ready to fulfill a) and b) alongside. get a good copy
@@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 		failrec->this_mirror++;
 		if (failrec->this_mirror == failed_mirror)
 			failrec->this_mirror++;
-		read_mode = READ_SYNC;
 	}

 	if (failrec->this_mirror > num_copies) {
-		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+		pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
+		return 0;
 	}

+	return 1;
+}
+
+
+struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
+				    struct io_failure_record *failrec,
+				    struct page *page, int pg_offset, int icsum,
+				    bio_end_io_t *endio_func, void *data)
+{
+	struct bio *bio;
+	struct btrfs_io_bio *btrfs_failed_bio;
+	struct btrfs_io_bio *btrfs_bio;
+
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
-	if (!bio) {
-		free_io_failure(inode, failrec, 0);
-		return -EIO;
-	}
-	bio->bi_end_io = failed_bio->bi_end_io;
+	if (!bio)
+		return NULL;
+
+	bio->bi_end_io = endio_func;
 	bio->bi_iter.bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_iter.bi_size = 0;
+	bio->bi_private = data;

 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
 	if (btrfs_failed_bio->csum) {
@@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,

 		btrfs_bio = btrfs_io_bio(bio);
 		btrfs_bio->csum = btrfs_bio->csum_inline;
-		phy_offset >>= inode->i_sb->s_blocksize_bits;
-		phy_offset *= csum_size;
-		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+		icsum *= csum_size;
+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
 		       csum_size);
 	}

-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+	bio_add_page(bio, page, failrec->len, pg_offset);
+
+	return bio;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+			      struct page *page, u64 start, u64 end,
+			      int failed_mirror)
+{
+	struct io_failure_record *failrec;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct bio *bio;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      start - page_offset(page),
+				      (int)phy_offset, failed_bio->bi_end_io,
+				      NULL);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}

-	pr_debug("bio_readpage_error: submitting new read[%#x] to "
-		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
-		 failrec->this_mirror, num_copies, failrec->in_validation);
+	pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n",
+		 read_mode, failrec->this_mirror, failrec->in_validation);

 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
 					 failrec->this_mirror,
 					 failrec->bio_flags, 0);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
 	return ret;
 }

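Splitting the old monolithic handler into btrfs_get_io_failure_record(), btrfs_check_repairable() and btrfs_create_repair_bio() lets other read paths compose their own retries with a private completion handler. A hedged sketch of such a caller (my_retry_endio and my_submit are illustrative placeholders, not btrfs functions):

static int retry_one_range(struct inode *inode, struct bio *failed_bio,
			   struct page *page, u64 start, u64 end,
			   int failed_mirror)
{
	struct io_failure_record *failrec;
	struct bio *bio;
	int ret;

	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
	if (ret)
		return ret;

	if (!btrfs_check_repairable(inode, failed_bio, failrec,
				    failed_mirror)) {
		free_io_failure(inode, failrec);
		return -EIO;
	}

	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
				      start - page_offset(page), 0 /* icsum */,
				      my_retry_endio, NULL);
	if (!bio) {
		free_io_failure(inode, failrec);
		return -EIO;
	}
	return my_submit(READ_SYNC, bio, failrec->this_mirror,
			 failrec->bio_flags);
}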
@@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct inode *inode = page->mapping->host;

 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err,
+			 "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
 			 io_bio->mirror_num);
 		tree = &BTRFS_I(inode)->io_tree;

@@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			if (ret)
 				uptodate = 0;
 			else
-				clean_io_failure(start, page);
+				clean_io_failure(inode, start, page, 0);
 		}

 		if (likely(uptodate))
@@ -2532,6 +2602,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
+				offset += len;
 				continue;
 			}
 		}
@@ -2539,12 +2610,12 @@ readpage_ok:
 		if (likely(uptodate)) {
 			loff_t i_size = i_size_read(inode);
 			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-			unsigned offset;
+			unsigned off;

 			/* Zero out the end if this page straddles i_size */
-			offset = i_size & (PAGE_CACHE_SIZE-1);
-			if (page->index == end_index && offset)
-				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+			off = i_size & (PAGE_CACHE_SIZE-1);
+			if (page->index == end_index && off)
+				zero_user_segment(page, off, PAGE_CACHE_SIZE);
 			SetPageUptodate(page);
 		} else {
 			ClearPageUptodate(page);
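Renaming the inner variable to off matters because the enclosing loop's offset is live: the earlier hunk now advances it on the continue path, and an inner declaration of the same name would shadow it. The hazard in miniature (illustrative, not btrfs code):

	u64 offset = 0;			/* position across bio segments */

	for (i = 0; i < n; i++) {
		if (skip[i]) {
			offset += len[i];	/* outer variable, as intended */
			continue;
		}
		{
			unsigned off = i_size & (PAGE_SIZE - 1);
			/* using 'off' here cannot clobber the outer 'offset' */
		}
		offset += len[i];
	}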
@@ -2617,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,

 struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
-	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
-}
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *new;

+	new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+	if (new) {
+		btrfs_bio = btrfs_io_bio(new);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return new;
+}

 /* this also allocates from the btrfs_bioset */
 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
@@ -3437,16 +3517,10 @@ done_unlocked:
 	return 0;
 }

-static int eb_wait(void *word)
-{
-	io_schedule();
-	return 0;
-}
-
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
 {
-	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
-		    TASK_UNINTERRUPTIBLE);
+	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
+		       TASK_UNINTERRUPTIBLE);
 }

 static noinline_for_stack int
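wait_on_bit_io() (a then-new <linux/wait.h> helper for bit waits that always io_schedule()) subsumes the old wait_on_bit() action callback, which is why eb_wait() can be deleted outright. Before and after, side by side:

	/* old: caller supplies the scheduling policy via a callback */
	static int eb_wait(void *word)
	{
		io_schedule();
		return 0;
	}
	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
		    TASK_UNINTERRUPTIBLE);

	/* new: the _io variant schedules for us */
	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
		       TASK_UNINTERRUPTIBLE);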
@@ -3506,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,

 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];

 		if (!trylock_page(p)) {
 			if (!flush) {
@@ -3527,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb)
 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
 }

+static void set_btree_ioerr(struct page *page)
+{
+	struct extent_buffer *eb = (struct extent_buffer *)page->private;
+	struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode);
+
+	SetPageError(page);
+	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
+		return;
+
+	/*
+	 * If writeback for a btree extent that doesn't belong to a log tree
+	 * failed, increment the counter transaction->eb_write_errors.
+	 * We do this because while the transaction is running and before it's
+	 * committing (when we call filemap_fdata[write|wait]_range against
+	 * the btree inode), we might have
+	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
+	 * returns an error or an error happens during writeback, when we're
+	 * committing the transaction we wouldn't know about it, since the pages
+	 * can be no longer dirty nor marked anymore for writeback (if a
+	 * subsequent modification to the extent buffer didn't happen before the
+	 * transaction commit), which makes filemap_fdata[write|wait]_range not
+	 * able to find the pages tagged with SetPageError at transaction
+	 * commit time. So if this happens we must abort the transaction,
+	 * otherwise we commit a super block with btree roots that point to
+	 * btree nodes/leafs whose content on disk is invalid - either garbage
+	 * or the content of some node/leaf from a past generation that got
+	 * cowed or deleted and is no longer valid.
+	 *
+	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
+	 * not be enough - we need to distinguish between log tree extents vs
+	 * non-log tree extents, and the next filemap_fdatawait_range() call
+	 * will catch and clear such errors in the mapping - and that call might
+	 * be from a log sync and not from a transaction commit. Also, checking
+	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
+	 * not done and would not be reliable - the eb might have been released
+	 * from memory and reading it back again means that flag would not be
+	 * set (since it's a runtime flag, not persisted on disk).
+	 *
+	 * Using the flags below in the btree inode also makes us achieve the
+	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
+	 * writeback for all dirty pages and before filemap_fdatawait_range()
+	 * is called, the writeback for all dirty pages had already finished
+	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
+	 * filemap_fdatawait_range() would return success, as it could not know
+	 * that writeback errors happened (the pages were no longer tagged for
+	 * writeback).
+	 */
+	switch (eb->log_index) {
+	case -1:
+		set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags);
+		break;
+	case 0:
+		set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
+		break;
+	case 1:
+		set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
+		break;
+	default:
+		BUG(); /* unexpected, logic error */
+	}
+}
+
 static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 {
 	struct bio_vec *bvec;
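set_btree_ioerr() above records failures in per-inode runtime flags precisely so a later transaction commit or log sync can find them even after the pages drop their writeback tags. A hedged sketch of what a consumer of these bits looks like (the actual consumer lives in the transaction/log code, which this hunk does not show; the local names are illustrative):

	struct btrfs_inode *btree_ino = BTRFS_I(fs_info->btree_inode);

	if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
			       &btree_ino->runtime_flags))
		ret = -EIO;	/* fail the commit: a btree write was lost */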
@@ -3540,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
 		BUG_ON(!eb);
 		done = atomic_dec_and_test(&eb->io_pages);

-		if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
-			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+		if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
 			ClearPageUptodate(page);
-			SetPageError(page);
+			set_btree_ioerr(page);
 		}

 		end_page_writeback(page);
@@ -3570,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
 	int ret = 0;

-	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
 	atomic_set(&eb->io_pages, num_pages);
 	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
 		bio_flags = EXTENT_BIO_TREE_LOG;

 	for (i = 0; i < num_pages; i++) {
-		struct page *p = extent_buffer_page(eb, i);
+		struct page *p = eb->pages[i];

 		clear_page_dirty_for_io(p);
 		set_page_writeback(p);
@@ -3587,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 				 0, epd->bio_flags, bio_flags);
 		epd->bio_flags = bio_flags;
 		if (ret) {
-			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-			SetPageError(p);
+			set_btree_ioerr(p);
+			end_page_writeback(p);
 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
 				end_extent_buffer_writeback(eb);
 			ret = -EIO;
@@ -3601,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
3601 3736
3602 if (unlikely(ret)) { 3737 if (unlikely(ret)) {
3603 for (; i < num_pages; i++) { 3738 for (; i < num_pages; i++) {
3604 struct page *p = extent_buffer_page(eb, i); 3739 struct page *p = eb->pages[i];
3740 clear_page_dirty_for_io(p);
3605 unlock_page(p); 3741 unlock_page(p);
3606 } 3742 }
3607 } 3743 }
@@ -4171,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
4171 return NULL; 4307 return NULL;
4172} 4308}
4173 4309
4174static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx)
4175{
4176 unsigned long cnt = *((unsigned long *)ctx);
4177
4178 cnt++;
4179 *((unsigned long *)ctx) = cnt;
4180
4181 /* Now we're sure that the extent is shared. */
4182 if (cnt > 1)
4183 return 1;
4184 return 0;
4185}
4186
4187int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 4310int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4188 __u64 start, __u64 len, get_extent_t *get_extent) 4311 __u64 start, __u64 len, get_extent_t *get_extent)
4189{ 4312{
@@ -4200,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4200 struct extent_map *em = NULL; 4323 struct extent_map *em = NULL;
4201 struct extent_state *cached_state = NULL; 4324 struct extent_state *cached_state = NULL;
4202 struct btrfs_path *path; 4325 struct btrfs_path *path;
4326 struct btrfs_root *root = BTRFS_I(inode)->root;
4203 int end = 0; 4327 int end = 0;
4204 u64 em_start = 0; 4328 u64 em_start = 0;
4205 u64 em_len = 0; 4329 u64 em_len = 0;
@@ -4213,15 +4337,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4213 return -ENOMEM; 4337 return -ENOMEM;
4214 path->leave_spinning = 1; 4338 path->leave_spinning = 1;
4215 4339
4216 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize); 4340 start = round_down(start, BTRFS_I(inode)->root->sectorsize);
4217 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize); 4341 len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start;
4218 4342
4219 /* 4343 /*
4220 * lookup the last file extent. We're not using i_size here 4344 * lookup the last file extent. We're not using i_size here
4221 * because there might be preallocation past i_size 4345 * because there might be preallocation past i_size
4222 */ 4346 */
4223 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 4347 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
4224 path, btrfs_ino(inode), -1, 0); 4348 0);
4225 if (ret < 0) { 4349 if (ret < 0) {
4226 btrfs_free_path(path); 4350 btrfs_free_path(path);
4227 return ret; 4351 return ret;
@@ -4229,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4229 WARN_ON(!ret); 4353 WARN_ON(!ret);
4230 path->slots[0]--; 4354 path->slots[0]--;
4231 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 4355 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
4232 found_type = btrfs_key_type(&found_key); 4356 found_type = found_key.type;
4233 4357
4234 /* No extents, but there might be delalloc bits */ 4358 /* No extents, but there might be delalloc bits */
4235 if (found_key.objectid != btrfs_ino(inode) || 4359 if (found_key.objectid != btrfs_ino(inode) ||
@@ -4314,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4314 } else if (em->block_start == EXTENT_MAP_DELALLOC) { 4438 } else if (em->block_start == EXTENT_MAP_DELALLOC) {
4315 flags |= (FIEMAP_EXTENT_DELALLOC | 4439 flags |= (FIEMAP_EXTENT_DELALLOC |
4316 FIEMAP_EXTENT_UNKNOWN); 4440 FIEMAP_EXTENT_UNKNOWN);
4317 } else { 4441 } else if (fieinfo->fi_extents_max) {
4318 unsigned long ref_cnt = 0; 4442 u64 bytenr = em->block_start -
4443 (em->start - em->orig_start);
4319 4444
4320 disko = em->block_start + offset_in_extent; 4445 disko = em->block_start + offset_in_extent;
4321 4446
4322 /* 4447 /*
4323 * As btrfs supports shared space, this information 4448 * As btrfs supports shared space, this information
4324 * can be exported to userspace tools via 4449 * can be exported to userspace tools via
4325 * flag FIEMAP_EXTENT_SHARED. 4450 * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
4451 * then we're just getting a count and we can skip the
4452 * lookup stuff.
4326 */ 4453 */
4327 ret = iterate_inodes_from_logical( 4454 ret = btrfs_check_shared(NULL, root->fs_info,
4328 em->block_start, 4455 root->objectid,
4329 BTRFS_I(inode)->root->fs_info, 4456 btrfs_ino(inode), bytenr);
4330 path, count_ext_ref, &ref_cnt); 4457 if (ret < 0)
4331 if (ret < 0 && ret != -ENOENT)
4332 goto out_free; 4458 goto out_free;
4333 4459 if (ret)
4334 if (ref_cnt > 1)
4335 flags |= FIEMAP_EXTENT_SHARED; 4460 flags |= FIEMAP_EXTENT_SHARED;
4461 ret = 0;
4336 } 4462 }
4337 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 4463 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4338 flags |= FIEMAP_EXTENT_ENCODED; 4464 flags |= FIEMAP_EXTENT_ENCODED;
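For context, fi_extents_max is driven by fm_extent_count in the userspace struct fiemap; a sketch of the two-pass pattern this optimizes (during pass 1 the kernel only counts extents, so the shared-extent lookup above is skipped):

        #include <fcntl.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
        #include <sys/ioctl.h>
        #include <linux/fs.h>
        #include <linux/fiemap.h>

        int main(int argc, char **argv)
        {
                struct fiemap probe, *fm;
                int fd;

                if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
                        return 1;

                /* Pass 1: fm_extent_count == 0, the kernel only counts. */
                memset(&probe, 0, sizeof(probe));
                probe.fm_length = ~0ULL;
                if (ioctl(fd, FS_IOC_FIEMAP, &probe) < 0)
                        return 1;

                /* Pass 2: fetch the mappings, now with FIEMAP_EXTENT_SHARED
                 * set on extents that btrfs_check_shared() reports shared. */
                fm = calloc(1, sizeof(*fm) + probe.fm_mapped_extents *
                            sizeof(struct fiemap_extent));
                if (!fm)
                        return 1;
                fm->fm_length = ~0ULL;
                fm->fm_extent_count = probe.fm_mapped_extents;
                if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
                        return 1;
                printf("%u extents\n", fm->fm_mapped_extents);
                return 0;
        }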
@@ -4386,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb)
4386/* 4512/*
4387 * Helper for releasing extent buffer page. 4513 * Helper for releasing extent buffer page.
4388 */ 4514 */
4389static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, 4515static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
4390 unsigned long start_idx)
4391{ 4516{
4392 unsigned long index; 4517 unsigned long index;
4393 unsigned long num_pages;
4394 struct page *page; 4518 struct page *page;
4395 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); 4519 int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4396 4520
4397 BUG_ON(extent_buffer_under_io(eb)); 4521 BUG_ON(extent_buffer_under_io(eb));
4398 4522
4399 num_pages = num_extent_pages(eb->start, eb->len); 4523 index = num_extent_pages(eb->start, eb->len);
4400 index = start_idx + num_pages; 4524 if (index == 0)
4401 if (start_idx >= index)
4402 return; 4525 return;
4403 4526
4404 do { 4527 do {
4405 index--; 4528 index--;
4406 page = extent_buffer_page(eb, index); 4529 page = eb->pages[index];
4407 if (page && mapped) { 4530 if (page && mapped) {
4408 spin_lock(&page->mapping->private_lock); 4531 spin_lock(&page->mapping->private_lock);
4409 /* 4532 /*
@@ -4434,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4434 /* One for when we alloced the page */ 4557 /* One for when we alloced the page */
4435 page_cache_release(page); 4558 page_cache_release(page);
4436 } 4559 }
4437 } while (index != start_idx); 4560 } while (index != 0);
4438} 4561}
4439 4562
4440/* 4563/*
@@ -4442,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4442 */ 4565 */
4443static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) 4566static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4444{ 4567{
4445 btrfs_release_extent_buffer_page(eb, 0); 4568 btrfs_release_extent_buffer_page(eb);
4446 __free_extent_buffer(eb); 4569 __free_extent_buffer(eb);
4447} 4570}
4448 4571
@@ -4585,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb,
4585 4708
4586 num_pages = num_extent_pages(eb->start, eb->len); 4709 num_pages = num_extent_pages(eb->start, eb->len);
4587 for (i = 0; i < num_pages; i++) { 4710 for (i = 0; i < num_pages; i++) {
4588 struct page *p = extent_buffer_page(eb, i); 4711 struct page *p = eb->pages[i];
4712
4589 if (p != accessed) 4713 if (p != accessed)
4590 mark_page_accessed(p); 4714 mark_page_accessed(p);
4591 } 4715 }
@@ -4754,7 +4878,7 @@ again:
4754 */ 4878 */
4755 SetPageChecked(eb->pages[0]); 4879 SetPageChecked(eb->pages[0]);
4756 for (i = 1; i < num_pages; i++) { 4880 for (i = 1; i < num_pages; i++) {
4757 p = extent_buffer_page(eb, i); 4881 p = eb->pages[i];
4758 ClearPageChecked(p); 4882 ClearPageChecked(p);
4759 unlock_page(p); 4883 unlock_page(p);
4760 } 4884 }
@@ -4799,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb)
4799 } 4923 }
4800 4924
4801 /* Should be safe to release our pages at this point */ 4925 /* Should be safe to release our pages at this point */
4802 btrfs_release_extent_buffer_page(eb, 0); 4926 btrfs_release_extent_buffer_page(eb);
4803 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); 4927 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4804 return 1; 4928 return 1;
4805 } 4929 }
@@ -4865,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
4865 num_pages = num_extent_pages(eb->start, eb->len); 4989 num_pages = num_extent_pages(eb->start, eb->len);
4866 4990
4867 for (i = 0; i < num_pages; i++) { 4991 for (i = 0; i < num_pages; i++) {
4868 page = extent_buffer_page(eb, i); 4992 page = eb->pages[i];
4869 if (!PageDirty(page)) 4993 if (!PageDirty(page))
4870 continue; 4994 continue;
4871 4995
@@ -4901,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
4901 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); 5025 WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4902 5026
4903 for (i = 0; i < num_pages; i++) 5027 for (i = 0; i < num_pages; i++)
4904 set_page_dirty(extent_buffer_page(eb, i)); 5028 set_page_dirty(eb->pages[i]);
4905 return was_dirty; 5029 return was_dirty;
4906} 5030}
4907 5031
@@ -4914,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4914 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5038 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4915 num_pages = num_extent_pages(eb->start, eb->len); 5039 num_pages = num_extent_pages(eb->start, eb->len);
4916 for (i = 0; i < num_pages; i++) { 5040 for (i = 0; i < num_pages; i++) {
4917 page = extent_buffer_page(eb, i); 5041 page = eb->pages[i];
4918 if (page) 5042 if (page)
4919 ClearPageUptodate(page); 5043 ClearPageUptodate(page);
4920 } 5044 }
@@ -4930,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
4930 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 5054 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4931 num_pages = num_extent_pages(eb->start, eb->len); 5055 num_pages = num_extent_pages(eb->start, eb->len);
4932 for (i = 0; i < num_pages; i++) { 5056 for (i = 0; i < num_pages; i++) {
4933 page = extent_buffer_page(eb, i); 5057 page = eb->pages[i];
4934 SetPageUptodate(page); 5058 SetPageUptodate(page);
4935 } 5059 }
4936 return 0; 5060 return 0;
@@ -4970,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
4970 5094
4971 num_pages = num_extent_pages(eb->start, eb->len); 5095 num_pages = num_extent_pages(eb->start, eb->len);
4972 for (i = start_i; i < num_pages; i++) { 5096 for (i = start_i; i < num_pages; i++) {
4973 page = extent_buffer_page(eb, i); 5097 page = eb->pages[i];
4974 if (wait == WAIT_NONE) { 5098 if (wait == WAIT_NONE) {
4975 if (!trylock_page(page)) 5099 if (!trylock_page(page))
4976 goto unlock_exit; 5100 goto unlock_exit;
@@ -4989,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
4989 goto unlock_exit; 5113 goto unlock_exit;
4990 } 5114 }
4991 5115
4992 clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); 5116 clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
4993 eb->read_mirror = 0; 5117 eb->read_mirror = 0;
4994 atomic_set(&eb->io_pages, num_reads); 5118 atomic_set(&eb->io_pages, num_reads);
4995 for (i = start_i; i < num_pages; i++) { 5119 for (i = start_i; i < num_pages; i++) {
4996 page = extent_buffer_page(eb, i); 5120 page = eb->pages[i];
4997 if (!PageUptodate(page)) { 5121 if (!PageUptodate(page)) {
4998 ClearPageError(page); 5122 ClearPageError(page);
4999 err = __extent_read_full_page(tree, page, 5123 err = __extent_read_full_page(tree, page,
@@ -5018,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5018 return ret; 5142 return ret;
5019 5143
5020 for (i = start_i; i < num_pages; i++) { 5144 for (i = start_i; i < num_pages; i++) {
5021 page = extent_buffer_page(eb, i); 5145 page = eb->pages[i];
5022 wait_on_page_locked(page); 5146 wait_on_page_locked(page);
5023 if (!PageUptodate(page)) 5147 if (!PageUptodate(page))
5024 ret = -EIO; 5148 ret = -EIO;
@@ -5029,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
5029unlock_exit: 5153unlock_exit:
5030 i = start_i; 5154 i = start_i;
5031 while (locked_pages > 0) { 5155 while (locked_pages > 0) {
5032 page = extent_buffer_page(eb, i); 5156 page = eb->pages[i];
5033 i++; 5157 i++;
5034 unlock_page(page); 5158 unlock_page(page);
5035 locked_pages--; 5159 locked_pages--;
@@ -5055,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
5055 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5179 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5056 5180
5057 while (len > 0) { 5181 while (len > 0) {
5058 page = extent_buffer_page(eb, i); 5182 page = eb->pages[i];
5059 5183
5060 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5184 cur = min(len, (PAGE_CACHE_SIZE - offset));
5061 kaddr = page_address(page); 5185 kaddr = page_address(page);
@@ -5087,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv,
5087 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5211 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5088 5212
5089 while (len > 0) { 5213 while (len > 0) {
5090 page = extent_buffer_page(eb, i); 5214 page = eb->pages[i];
5091 5215
5092 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5216 cur = min(len, (PAGE_CACHE_SIZE - offset));
5093 kaddr = page_address(page); 5217 kaddr = page_address(page);
@@ -5136,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
5136 return -EINVAL; 5260 return -EINVAL;
5137 } 5261 }
5138 5262
5139 p = extent_buffer_page(eb, i); 5263 p = eb->pages[i];
5140 kaddr = page_address(p); 5264 kaddr = page_address(p);
5141 *map = kaddr + offset; 5265 *map = kaddr + offset;
5142 *map_len = PAGE_CACHE_SIZE - offset; 5266 *map_len = PAGE_CACHE_SIZE - offset;
@@ -5162,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
5162 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5286 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5163 5287
5164 while (len > 0) { 5288 while (len > 0) {
5165 page = extent_buffer_page(eb, i); 5289 page = eb->pages[i];
5166 5290
5167 cur = min(len, (PAGE_CACHE_SIZE - offset)); 5291 cur = min(len, (PAGE_CACHE_SIZE - offset));
5168 5292
@@ -5196,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
5196 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5320 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5197 5321
5198 while (len > 0) { 5322 while (len > 0) {
5199 page = extent_buffer_page(eb, i); 5323 page = eb->pages[i];
5200 WARN_ON(!PageUptodate(page)); 5324 WARN_ON(!PageUptodate(page));
5201 5325
5202 cur = min(len, PAGE_CACHE_SIZE - offset); 5326 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5226,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
5226 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); 5350 offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
5227 5351
5228 while (len > 0) { 5352 while (len > 0) {
5229 page = extent_buffer_page(eb, i); 5353 page = eb->pages[i];
5230 WARN_ON(!PageUptodate(page)); 5354 WARN_ON(!PageUptodate(page));
5231 5355
5232 cur = min(len, PAGE_CACHE_SIZE - offset); 5356 cur = min(len, PAGE_CACHE_SIZE - offset);
@@ -5257,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
5257 (PAGE_CACHE_SIZE - 1); 5381 (PAGE_CACHE_SIZE - 1);
5258 5382
5259 while (len > 0) { 5383 while (len > 0) {
5260 page = extent_buffer_page(dst, i); 5384 page = dst->pages[i];
5261 WARN_ON(!PageUptodate(page)); 5385 WARN_ON(!PageUptodate(page));
5262 5386
5263 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 5387 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
@@ -5335,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5335 cur = min_t(unsigned long, cur, 5459 cur = min_t(unsigned long, cur,
5336 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 5460 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
5337 5461
5338 copy_pages(extent_buffer_page(dst, dst_i), 5462 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5339 extent_buffer_page(dst, src_i),
5340 dst_off_in_page, src_off_in_page, cur); 5463 dst_off_in_page, src_off_in_page, cur);
5341 5464
5342 src_offset += cur; 5465 src_offset += cur;
@@ -5382,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5382 5505
5383 cur = min_t(unsigned long, len, src_off_in_page + 1); 5506 cur = min_t(unsigned long, len, src_off_in_page + 1);
5384 cur = min(cur, dst_off_in_page + 1); 5507 cur = min(cur, dst_off_in_page + 1);
5385 copy_pages(extent_buffer_page(dst, dst_i), 5508 copy_pages(dst->pages[dst_i], dst->pages[src_i],
5386 extent_buffer_page(dst, src_i),
5387 dst_off_in_page - cur + 1, 5509 dst_off_in_page - cur + 1,
5388 src_off_in_page - cur + 1, cur); 5510 src_off_in_page - cur + 1, cur);
5389 5511
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ccc264e7bde1..6d4b938be986 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -11,8 +11,6 @@
11#define EXTENT_NEW (1 << 4) 11#define EXTENT_NEW (1 << 4)
12#define EXTENT_DELALLOC (1 << 5) 12#define EXTENT_DELALLOC (1 << 5)
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1 << 6)
14#define EXTENT_DEFRAG_DONE (1 << 7)
15#define EXTENT_BUFFER_FILLED (1 << 8)
16#define EXTENT_BOUNDARY (1 << 9) 14#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 15#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 16#define EXTENT_DO_ACCOUNTING (1 << 11)
@@ -34,16 +32,16 @@
34 32
35/* these are bit numbers for test/set bit */ 33/* these are bit numbers for test/set bit */
36#define EXTENT_BUFFER_UPTODATE 0 34#define EXTENT_BUFFER_UPTODATE 0
37#define EXTENT_BUFFER_BLOCKING 1
38#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
39#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
40#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ 37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
41#define EXTENT_BUFFER_TREE_REF 5 38#define EXTENT_BUFFER_TREE_REF 5
42#define EXTENT_BUFFER_STALE 6 39#define EXTENT_BUFFER_STALE 6
43#define EXTENT_BUFFER_WRITEBACK 7 40#define EXTENT_BUFFER_WRITEBACK 7
44#define EXTENT_BUFFER_IOERR 8 41#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */
45#define EXTENT_BUFFER_DUMMY 9 42#define EXTENT_BUFFER_DUMMY 9
46#define EXTENT_BUFFER_IN_TREE 10 43#define EXTENT_BUFFER_IN_TREE 10
44#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
47 45
48/* these are flags for extent_clear_unlock_delalloc */ 46/* these are flags for extent_clear_unlock_delalloc */
49#define PAGE_UNLOCK (1 << 0) 47#define PAGE_UNLOCK (1 << 0)
@@ -57,7 +55,6 @@
57 * map has page->private set to one. 55 * map has page->private set to one.
58 */ 56 */
59#define EXTENT_PAGE_PRIVATE 1 57#define EXTENT_PAGE_PRIVATE 1
60#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
61 58
62struct extent_state; 59struct extent_state;
63struct btrfs_root; 60struct btrfs_root;
@@ -108,7 +105,6 @@ struct extent_state {
108 struct rb_node rb_node; 105 struct rb_node rb_node;
109 106
110 /* ADD NEW ELEMENTS AFTER THIS */ 107 /* ADD NEW ELEMENTS AFTER THIS */
111 struct extent_io_tree *tree;
112 wait_queue_head_t wq; 108 wait_queue_head_t wq;
113 atomic_t refs; 109 atomic_t refs;
114 unsigned long state; 110 unsigned long state;
@@ -126,8 +122,6 @@ struct extent_state {
126struct extent_buffer { 122struct extent_buffer {
127 u64 start; 123 u64 start;
128 unsigned long len; 124 unsigned long len;
129 unsigned long map_start;
130 unsigned long map_len;
131 unsigned long bflags; 125 unsigned long bflags;
132 struct btrfs_fs_info *fs_info; 126 struct btrfs_fs_info *fs_info;
133 spinlock_t refs_lock; 127 spinlock_t refs_lock;
@@ -144,7 +138,9 @@ struct extent_buffer {
144 atomic_t blocking_readers; 138 atomic_t blocking_readers;
145 atomic_t spinning_readers; 139 atomic_t spinning_readers;
146 atomic_t spinning_writers; 140 atomic_t spinning_writers;
147 int lock_nested; 141 short lock_nested;
142 /* >= 0 if eb belongs to a log tree, -1 otherwise */
143 short log_index;
148 144
149 /* protects write locks */ 145 /* protects write locks */
150 rwlock_t lock; 146 rwlock_t lock;
@@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len)
286 (start >> PAGE_CACHE_SHIFT); 282 (start >> PAGE_CACHE_SHIFT);
287} 283}
288 284
289static inline struct page *extent_buffer_page(struct extent_buffer *eb,
290 unsigned long i)
291{
292 return eb->pages[i];
293}
294
295static inline void extent_buffer_get(struct extent_buffer *eb) 285static inline void extent_buffer_get(struct extent_buffer *eb)
296{ 286{
297 atomic_inc(&eb->refs); 287 atomic_inc(&eb->refs);
@@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
341 331
342struct btrfs_fs_info; 332struct btrfs_fs_info;
343 333
344int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, 334int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
345 u64 length, u64 logical, struct page *page, 335 struct page *page, unsigned int pg_offset,
346 int mirror_num); 336 int mirror_num);
337int clean_io_failure(struct inode *inode, u64 start, struct page *page,
338 unsigned int pg_offset);
347int end_extent_writepage(struct page *page, int err, u64 start, u64 end); 339int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
348int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, 340int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
349 int mirror_num); 341 int mirror_num);
342
343/*
344 * When IO fails, either with EIO or csum verification fails, we
345 * try other mirrors that might have a good copy of the data. This
346 * io_failure_record is used to record state as we go through all the
347 * mirrors. If another mirror has good data, the page is set up to date
348 * and things continue. If a good mirror can't be found, the original
349 * bio end_io callback is called to indicate things have failed.
350 */
351struct io_failure_record {
352 struct page *page;
353 u64 start;
354 u64 len;
355 u64 logical;
356 unsigned long bio_flags;
357 int this_mirror;
358 int failed_mirror;
359 int in_validation;
360};
361
362void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end);
363int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
364 struct io_failure_record **failrec_ret);
365int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio,
366 struct io_failure_record *failrec, int fail_mirror);
367struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio,
368 struct io_failure_record *failrec,
369 struct page *page, int pg_offset, int icsum,
370 bio_end_io_t *endio_func, void *data);
371int free_io_failure(struct inode *inode, struct io_failure_record *rec);
350#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 372#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
351noinline u64 find_lock_delalloc_range(struct inode *inode, 373noinline u64 find_lock_delalloc_range(struct inode *inode,
352 struct extent_io_tree *tree, 374 struct extent_io_tree *tree,
353 struct page *locked_page, u64 *start, 375 struct page *locked_page, u64 *start,
354 u64 *end, u64 max_bytes); 376 u64 *end, u64 max_bytes);
377#endif
355struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 378struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
356 u64 start, unsigned long len); 379 u64 start, unsigned long len);
357#endif 380#endif
358#endif
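A hedged sketch of how a read end_io path is expected to chain the io_failure_record helpers declared above (control flow only; the caller name is an assumption, and locking, csum and bio-submission details are elided):

        /* Hypothetical caller wiring the declarations above together. */
        static int sketch_repair_one_sector(struct inode *inode,
                                            struct bio *failed_bio,
                                            struct page *page, u64 start,
                                            u64 end, int failed_mirror)
        {
                struct io_failure_record *failrec;
                struct bio *bio;

                if (btrfs_get_io_failure_record(inode, start, end, &failrec))
                        return -EIO;

                /* Non-zero means another mirror is worth trying. */
                if (!btrfs_check_repairable(inode, failed_bio, failrec,
                                            failed_mirror)) {
                        free_io_failure(inode, failrec);
                        return -EIO;
                }

                bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
                                              0 /* pg_offset */, 0 /* icsum */,
                                              failed_bio->bi_end_io, NULL);
                if (!bio) {
                        free_io_failure(inode, failrec);
                        return -EIO;
                }
                /* ... submit bio against failrec->this_mirror ... */
                return 0;
        }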
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f46cfe45d686..783a94355efd 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
55 return -ENOMEM; 55 return -ENOMEM;
56 file_key.objectid = objectid; 56 file_key.objectid = objectid;
57 file_key.offset = pos; 57 file_key.offset = pos;
58 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 58 file_key.type = BTRFS_EXTENT_DATA_KEY;
59 59
60 path->leave_spinning = 1; 60 path->leave_spinning = 1;
61 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 61 ret = btrfs_insert_empty_item(trans, root, path, &file_key,
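The btrfs_set_key_type()/btrfs_key_type() conversions throughout this series are behavior-neutral: the helpers being retired were thin wrappers over the CPU key's type member, roughly:

        /* reference definitions from fs/btrfs/ctree.h */
        static inline u8 btrfs_key_type(struct btrfs_key *key)
        {
                return key->type;
        }

        static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val)
        {
                key->type = val;
        }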
@@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
100 100
101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 101 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
102 file_key.offset = bytenr; 102 file_key.offset = bytenr;
103 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 103 file_key.type = BTRFS_EXTENT_CSUM_KEY;
104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); 104 ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow);
105 if (ret < 0) 105 if (ret < 0)
106 goto fail; 106 goto fail;
@@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans,
111 goto fail; 111 goto fail;
112 path->slots[0]--; 112 path->slots[0]--;
113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 113 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
114 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) 114 if (found_key.type != BTRFS_EXTENT_CSUM_KEY)
115 goto fail; 115 goto fail;
116 116
117 csum_offset = (bytenr - found_key.offset) >> 117 csum_offset = (bytenr - found_key.offset) >>
@@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148 148
149 file_key.objectid = objectid; 149 file_key.objectid = objectid;
150 file_key.offset = offset; 150 file_key.offset = offset;
151 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 151 file_key.type = BTRFS_EXTENT_DATA_KEY;
152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); 152 ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow);
153 return ret; 153 return ret;
154} 154}
@@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
299} 299}
300 300
301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, 301int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
302 struct btrfs_dio_private *dip, struct bio *bio, 302 struct bio *bio, u64 offset)
303 u64 offset)
304{ 303{
305 int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; 304 return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
306 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
307 int ret;
308
309 len >>= inode->i_sb->s_blocksize_bits;
310 len *= csum_size;
311
312 ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
313 (u32 *)(dip->csum + len), 1);
314 return ret;
315} 305}
316 306
317int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 307int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
329 u64 csum_end; 319 u64 csum_end;
330 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); 320 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
331 321
332 ASSERT(start == ALIGN(start, root->sectorsize) && 322 ASSERT(IS_ALIGNED(start, root->sectorsize) &&
333 (end + 1) == ALIGN(end + 1, root->sectorsize)); 323 IS_ALIGNED(end + 1, root->sectorsize));
334 324
335 path = btrfs_alloc_path(); 325 path = btrfs_alloc_path();
336 if (!path) 326 if (!path)
@@ -720,7 +710,7 @@ again:
720 bytenr = sums->bytenr + total_bytes; 710 bytenr = sums->bytenr + total_bytes;
721 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 711 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
722 file_key.offset = bytenr; 712 file_key.offset = bytenr;
723 btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); 713 file_key.type = BTRFS_EXTENT_CSUM_KEY;
724 714
725 item = btrfs_lookup_csum(trans, root, path, bytenr, 1); 715 item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
726 if (!IS_ERR(item)) { 716 if (!IS_ERR(item)) {
@@ -756,7 +746,7 @@ again:
756 found_next = 1; 746 found_next = 1;
757 if (ret != 0) 747 if (ret != 0)
758 goto insert; 748 goto insert;
759 slot = 0; 749 slot = path->slots[0];
760 } 750 }
761 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); 751 btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
762 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 752 if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
@@ -790,7 +780,7 @@ again:
790 csum_offset = (bytenr - found_key.offset) >> 780 csum_offset = (bytenr - found_key.offset) >>
791 root->fs_info->sb->s_blocksize_bits; 781 root->fs_info->sb->s_blocksize_bits;
792 782
793 if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || 783 if (found_key.type != BTRFS_EXTENT_CSUM_KEY ||
794 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || 784 found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
795 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { 785 csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) {
796 goto insert; 786 goto insert;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1f2b99cb55ea..a18ceabd99a8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
299 299
300 /* get the inode */ 300 /* get the inode */
301 key.objectid = defrag->root; 301 key.objectid = defrag->root;
302 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 302 key.type = BTRFS_ROOT_ITEM_KEY;
303 key.offset = (u64)-1; 303 key.offset = (u64)-1;
304 304
305 index = srcu_read_lock(&fs_info->subvol_srcu); 305 index = srcu_read_lock(&fs_info->subvol_srcu);
@@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
311 } 311 }
312 312
313 key.objectid = defrag->ino; 313 key.objectid = defrag->ino;
314 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 314 key.type = BTRFS_INODE_ITEM_KEY;
315 key.offset = 0; 315 key.offset = 0;
316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); 316 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
317 if (IS_ERR(inode)) { 317 if (IS_ERR(inode)) {
@@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
452 if (unlikely(copied == 0)) 452 if (unlikely(copied == 0))
453 break; 453 break;
454 454
455 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 455 if (copied < PAGE_CACHE_SIZE - offset) {
456 offset += copied; 456 offset += copied;
457 } else { 457 } else {
458 pg++; 458 pg++;
@@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1481 bool force_page_uptodate = false; 1481 bool force_page_uptodate = false;
1482 bool need_unlock; 1482 bool need_unlock;
1483 1483
1484 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1484 nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE),
1485 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1485 PAGE_CACHE_SIZE / (sizeof(struct page *)));
1486 (sizeof(struct page *)));
1487 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); 1486 nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1488 nrptrs = max(nrptrs, 8); 1487 nrptrs = max(nrptrs, 8);
1489 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 1488 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
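The conversions in this file replace open-coded round-up arithmetic with DIV_ROUND_UP(); for reference, the macro (from include/linux/kernel.h) is the division form of the shift expression being removed:

        #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

        /* e.g. a write covering one byte past a page boundary needs two
         * pages: DIV_ROUND_UP(PAGE_CACHE_SIZE + 1, PAGE_CACHE_SIZE) == 2 */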
@@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1497 size_t write_bytes = min(iov_iter_count(i), 1496 size_t write_bytes = min(iov_iter_count(i),
1498 nrptrs * (size_t)PAGE_CACHE_SIZE - 1497 nrptrs * (size_t)PAGE_CACHE_SIZE -
1499 offset); 1498 offset);
1500 size_t num_pages = (write_bytes + offset + 1499 size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1501 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1500 PAGE_CACHE_SIZE);
1502 size_t reserve_bytes; 1501 size_t reserve_bytes;
1503 size_t dirty_pages; 1502 size_t dirty_pages;
1504 size_t copied; 1503 size_t copied;
@@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1526 * our prealloc extent may be smaller than 1525 * our prealloc extent may be smaller than
1527 * write_bytes, so scale down. 1526 * write_bytes, so scale down.
1528 */ 1527 */
1529 num_pages = (write_bytes + offset + 1528 num_pages = DIV_ROUND_UP(write_bytes + offset,
1530 PAGE_CACHE_SIZE - 1) >> 1529 PAGE_CACHE_SIZE);
1531 PAGE_CACHE_SHIFT;
1532 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1530 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1533 ret = 0; 1531 ret = 0;
1534 } else { 1532 } else {
@@ -1590,9 +1588,8 @@ again:
1590 dirty_pages = 0; 1588 dirty_pages = 0;
1591 } else { 1589 } else {
1592 force_page_uptodate = false; 1590 force_page_uptodate = false;
1593 dirty_pages = (copied + offset + 1591 dirty_pages = DIV_ROUND_UP(copied + offset,
1594 PAGE_CACHE_SIZE - 1) >> 1592 PAGE_CACHE_SIZE);
1595 PAGE_CACHE_SHIFT;
1596 } 1593 }
1597 1594
1598 /* 1595 /*
@@ -1653,7 +1650,7 @@ again:
1653 cond_resched(); 1650 cond_resched();
1654 1651
1655 balance_dirty_pages_ratelimited(inode->i_mapping); 1652 balance_dirty_pages_ratelimited(inode->i_mapping);
1656 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1653 if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1)
1657 btrfs_btree_balance_dirty(root); 1654 btrfs_btree_balance_dirty(root);
1658 1655
1659 pos += copied; 1656 pos += copied;
@@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1795 if (sync) 1792 if (sync)
1796 atomic_inc(&BTRFS_I(inode)->sync_writers); 1793 atomic_inc(&BTRFS_I(inode)->sync_writers);
1797 1794
1798 if (unlikely(file->f_flags & O_DIRECT)) { 1795 if (file->f_flags & O_DIRECT) {
1799 num_written = __btrfs_direct_write(iocb, from, pos); 1796 num_written = __btrfs_direct_write(iocb, from, pos);
1800 } else { 1797 } else {
1801 num_written = __btrfs_buffered_write(file, from, pos); 1798 num_written = __btrfs_buffered_write(file, from, pos);
@@ -1838,6 +1835,8 @@ out:
1838 1835
1839int btrfs_release_file(struct inode *inode, struct file *filp) 1836int btrfs_release_file(struct inode *inode, struct file *filp)
1840{ 1837{
1838 if (filp->private_data)
1839 btrfs_ioctl_trans_end(filp);
1841 /* 1840 /*
1842 * ordered_data_close is set by setattr when we are about to truncate 1841 * ordered_data_close is set by setattr when we are about to truncate
1843 * a file from a non-zero size to a zero size. This tries to 1842 * a file from a non-zero size to a zero size. This tries to
@@ -1845,29 +1844,25 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1845 * application were using truncate to replace a file in place. 1844 * application were using truncate to replace a file in place.
1846 */ 1845 */
1847 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, 1846 if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
1848 &BTRFS_I(inode)->runtime_flags)) { 1847 &BTRFS_I(inode)->runtime_flags))
1849 struct btrfs_trans_handle *trans;
1850 struct btrfs_root *root = BTRFS_I(inode)->root;
1851
1852 /*
1853 * We need to block on a committing transaction to keep us from
1854 * throwing a ordered operation on to the list and causing
1855 * something like sync to deadlock trying to flush out this
1856 * inode.
1857 */
1858 trans = btrfs_start_transaction(root, 0);
1859 if (IS_ERR(trans))
1860 return PTR_ERR(trans);
1861 btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
1862 btrfs_end_transaction(trans, root);
1863 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
1864 filemap_flush(inode->i_mapping); 1848 filemap_flush(inode->i_mapping);
1865 }
1866 if (filp->private_data)
1867 btrfs_ioctl_trans_end(filp);
1868 return 0; 1849 return 0;
1869} 1850}
1870 1851
1852static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1853{
1854 int ret;
1855
1856 atomic_inc(&BTRFS_I(inode)->sync_writers);
1857 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1858 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1859 &BTRFS_I(inode)->runtime_flags))
1860 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1861 atomic_dec(&BTRFS_I(inode)->sync_writers);
1862
1863 return ret;
1864}
1865
1871/* 1866/*
1872 * fsync call for both files and directories. This logs the inode into 1867 * fsync call for both files and directories. This logs the inode into
1873 * the tree log instead of forcing full commits whenever possible. 1868 * the tree log instead of forcing full commits whenever possible.
@@ -1897,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1897 * multi-task, and improve the performance. See 1892 * multi-task, and improve the performance. See
1898 * btrfs_wait_ordered_range for an explanation of the ASYNC check. 1893 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1899 */ 1894 */
1900 atomic_inc(&BTRFS_I(inode)->sync_writers); 1895 ret = start_ordered_ops(inode, start, end);
1901 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1902 if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1903 &BTRFS_I(inode)->runtime_flags))
1904 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
1905 atomic_dec(&BTRFS_I(inode)->sync_writers);
1906 if (ret) 1896 if (ret)
1907 return ret; 1897 return ret;
1908 1898
1909 mutex_lock(&inode->i_mutex); 1899 mutex_lock(&inode->i_mutex);
1910
1911 /*
1912 * We flush the dirty pages again to avoid some dirty pages in the
1913 * range being left.
1914 */
1915 atomic_inc(&root->log_batch); 1900 atomic_inc(&root->log_batch);
1916 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 1901 full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1917 &BTRFS_I(inode)->runtime_flags); 1902 &BTRFS_I(inode)->runtime_flags);
1903 /*
1904 * We might have had more pages made dirty after calling
1905 * start_ordered_ops and before acquiring the inode's i_mutex.
1906 */
1918 if (full_sync) { 1907 if (full_sync) {
1908 /*
1909 * For a full sync, we need to make sure any ordered operations
1910 * start and finish before we start logging the inode, so that
1911 * all extents are persisted and the respective file extent
1912 * items are in the fs/subvol btree.
1913 */
1919 ret = btrfs_wait_ordered_range(inode, start, end - start + 1); 1914 ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
1920 if (ret) { 1915 } else {
1921 mutex_unlock(&inode->i_mutex); 1916 /*
1922 goto out; 1917 * Start any new ordered operations before starting to log the
1923 } 1918 * inode. We will wait for them to finish in btrfs_sync_log().
1919 *
1920 * Right before acquiring the inode's mutex, we might have new
1921 * writes dirtying pages, which won't immediately start the
1922 * respective ordered operations - that is done through the
1923 * fill_delalloc callbacks invoked from the writepage and
1924 * writepages address space operations. So make sure we start
1925 * all ordered operations before starting to log our inode. Not
1926 * doing this means that while logging the inode, writeback
1927 * could start and invoke writepage/writepages, which would call
1928 * the fill_delalloc callbacks (cow_file_range,
1929 * submit_compressed_extents). These callbacks add first an
1930 * extent map to the modified list of extents and then create
1931 * the respective ordered operation, which means in
1932 * tree-log.c:btrfs_log_inode() we might capture all existing
1933 * ordered operations (with btrfs_get_logged_extents()) before
1934 * the fill_delalloc callback adds its ordered operation, and by
1935 * the time we visit the modified list of extent maps (with
1936 * btrfs_log_changed_extents()), we see and process the extent
1937 * map they created. We then use the extent map to construct a
1938 * file extent item for logging without waiting for the
1939 * respective ordered operation to finish - this file extent
1940 * item points to a disk location that might not have yet been
1941 * written to, containing random data - so after a crash a log
1942 * replay will make our inode have file extent items that point
1943 * to disk locations containing invalid data, as we returned
1944 * success to userspace without waiting for the respective
1945 * ordered operation to finish, because it wasn't captured by
1946 * btrfs_get_logged_extents().
1947 */
1948 ret = start_ordered_ops(inode, start, end);
1949 }
1950 if (ret) {
1951 mutex_unlock(&inode->i_mutex);
1952 goto out;
1924 } 1953 }
1925 atomic_inc(&root->log_batch); 1954 atomic_inc(&root->log_batch);
1926 1955
@@ -1982,7 +2011,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1982 2011
1983 btrfs_init_log_ctx(&ctx); 2012 btrfs_init_log_ctx(&ctx);
1984 2013
1985 ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); 2014 ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
1986 if (ret < 0) { 2015 if (ret < 0) {
1987 /* Fallthrough and commit/free transaction. */ 2016 /* Fallthrough and commit/free transaction. */
1988 ret = 1; 2017 ret = 1;
@@ -2000,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2000 */ 2029 */
2001 mutex_unlock(&inode->i_mutex); 2030 mutex_unlock(&inode->i_mutex);
2002 2031
2032 /*
2033 * If any of the ordered extents had an error, just return it to user
2034 * space, so that the application knows some writes didn't succeed and
2035 * can take proper action (e.g. retry). Blindly committing the
2036 * transaction in this case, would fool userspace that everything was
2037 * successful. And we also want to make sure our log doesn't contain
2038 * file extent items pointing to extents that weren't fully written to -
2039 * just like in the non fast fsync path, where we check for the ordered
2040 * operation's error flag before writing to the log tree and return -EIO
2041 * if any of them had this flag set (btrfs_wait_ordered_range) -
2042 * therefore we need to check for errors in the ordered operations,
2043 * which are indicated by ctx.io_err.
2044 */
2045 if (ctx.io_err) {
2046 btrfs_end_transaction(trans, root);
2047 ret = ctx.io_err;
2048 goto out;
2049 }
2050
2003 if (ret != BTRFS_NO_LOG_SYNC) { 2051 if (ret != BTRFS_NO_LOG_SYNC) {
2004 if (!ret) { 2052 if (!ret) {
2005 ret = btrfs_sync_log(trans, root, &ctx); 2053 ret = btrfs_sync_log(trans, root, &ctx);
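The userspace contract this preserves, sketched for clarity:

        #include <stdio.h>
        #include <unistd.h>

        /* If any ordered extent failed, fsync() now reports the error
         * instead of silently committing; the caller can retry the writes
         * or surface the failure. */
        static int checked_fsync(int fd)
        {
                if (fsync(fd) != 0) {
                        perror("fsync");        /* e.g. EIO or ENOSPC */
                        return -1;
                }
                return 0;
        }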
@@ -2112,10 +2160,9 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
2112 goto out; 2160 goto out;
2113 } 2161 }
2114 2162
2115 if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) { 2163 if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2116 u64 num_bytes; 2164 u64 num_bytes;
2117 2165
2118 path->slots[0]++;
2119 key.offset = offset; 2166 key.offset = offset;
2120 btrfs_set_item_key_safe(root, path, &key); 2167 btrfs_set_item_key_safe(root, path, &key);
2121 fi = btrfs_item_ptr(leaf, path->slots[0], 2168 fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -2240,7 +2287,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2240 goto out_only_mutex; 2287 goto out_only_mutex;
2241 } 2288 }
2242 2289
2243 lockstart = round_up(offset , BTRFS_I(inode)->root->sectorsize); 2290 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
2244 lockend = round_down(offset + len, 2291 lockend = round_down(offset + len,
2245 BTRFS_I(inode)->root->sectorsize) - 1; 2292 BTRFS_I(inode)->root->sectorsize) - 1;
2246 same_page = ((offset >> PAGE_CACHE_SHIFT) == 2293 same_page = ((offset >> PAGE_CACHE_SHIFT) ==
@@ -2301,7 +2348,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2301 tail_start + tail_len, 0, 1); 2348 tail_start + tail_len, 0, 1);
2302 if (ret) 2349 if (ret)
2303 goto out_only_mutex; 2350 goto out_only_mutex;
2304 } 2351 }
2305 } 2352 }
2306 } 2353 }
2307 2354
@@ -2638,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
2638 struct btrfs_root *root = BTRFS_I(inode)->root; 2685 struct btrfs_root *root = BTRFS_I(inode)->root;
2639 struct extent_map *em = NULL; 2686 struct extent_map *em = NULL;
2640 struct extent_state *cached_state = NULL; 2687 struct extent_state *cached_state = NULL;
2641 u64 lockstart = *offset; 2688 u64 lockstart;
2642 u64 lockend = i_size_read(inode); 2689 u64 lockend;
2643 u64 start = *offset; 2690 u64 start;
2644 u64 len = i_size_read(inode); 2691 u64 len;
2645 int ret = 0; 2692 int ret = 0;
2646 2693
2647 lockend = max_t(u64, root->sectorsize, lockend); 2694 if (inode->i_size == 0)
2695 return -ENXIO;
2696
2697 /*
2698 * *offset can be negative, in this case we start finding DATA/HOLE from
2699 * the very start of the file.
2700 */
2701 start = max_t(loff_t, 0, *offset);
2702
2703 lockstart = round_down(start, root->sectorsize);
2704 lockend = round_up(i_size_read(inode), root->sectorsize);
2648 if (lockend <= lockstart) 2705 if (lockend <= lockstart)
2649 lockend = lockstart + root->sectorsize; 2706 lockend = lockstart + root->sectorsize;
2650
2651 lockend--; 2707 lockend--;
2652 len = lockend - lockstart + 1; 2708 len = lockend - lockstart + 1;
2653 2709
2654 len = max_t(u64, len, root->sectorsize);
2655 if (inode->i_size == 0)
2656 return -ENXIO;
2657
2658 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, 2710 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
2659 &cached_state); 2711 &cached_state);
2660 2712
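These offsets arrive via llseek(); a userspace sketch of the SEEK_DATA/SEEK_HOLE interface that find_desired_extent() backs:

        #define _GNU_SOURCE
        #include <errno.h>
        #include <stdio.h>
        #include <unistd.h>

        /* Print the data regions of fd; past the last one, lseek() fails
         * with errno == ENXIO (matching the -ENXIO returns above). */
        static void walk_data_regions(int fd)
        {
                off_t data = lseek(fd, 0, SEEK_DATA);

                while (data != (off_t)-1) {
                        off_t hole = lseek(fd, data, SEEK_HOLE);

                        printf("data: [%lld, %lld)\n", (long long)data,
                               (long long)hole);
                        data = lseek(fd, hole, SEEK_DATA);
                }
                if (errno != ENXIO)
                        perror("lseek");
        }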
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2b0a627cb5f9..33848196550e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
279 int num_pages; 279 int num_pages;
280 int check_crcs = 0; 280 int check_crcs = 0;
281 281
282 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 282 num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE);
283 PAGE_CACHE_SHIFT;
284 283
285 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) 284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
286 check_crcs = 1; 285 check_crcs = 1;
@@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1998 return merged; 1997 return merged;
1999} 1998}
2000 1999
2000static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl,
2001 struct btrfs_free_space *info,
2002 bool update_stat)
2003{
2004 struct btrfs_free_space *bitmap;
2005 unsigned long i;
2006 unsigned long j;
2007 const u64 end = info->offset + info->bytes;
2008 const u64 bitmap_offset = offset_to_bitmap(ctl, end);
2009 u64 bytes;
2010
2011 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2012 if (!bitmap)
2013 return false;
2014
2015 i = offset_to_bit(bitmap->offset, ctl->unit, end);
2016 j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i);
2017 if (j == i)
2018 return false;
2019 bytes = (j - i) * ctl->unit;
2020 info->bytes += bytes;
2021
2022 if (update_stat)
2023 bitmap_clear_bits(ctl, bitmap, end, bytes);
2024 else
2025 __bitmap_clear_bits(ctl, bitmap, end, bytes);
2026
2027 if (!bitmap->bytes)
2028 free_bitmap(ctl, bitmap);
2029
2030 return true;
2031}
2032
2033static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl,
2034 struct btrfs_free_space *info,
2035 bool update_stat)
2036{
2037 struct btrfs_free_space *bitmap;
2038 u64 bitmap_offset;
2039 unsigned long i;
2040 unsigned long j;
2041 unsigned long prev_j;
2042 u64 bytes;
2043
2044 bitmap_offset = offset_to_bitmap(ctl, info->offset);
2045 /* If we're on a boundary, try the previous logical bitmap. */
2046 if (bitmap_offset == info->offset) {
2047 if (info->offset == 0)
2048 return false;
2049 bitmap_offset = offset_to_bitmap(ctl, info->offset - 1);
2050 }
2051
2052 bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0);
2053 if (!bitmap)
2054 return false;
2055
2056 i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1;
2057 j = 0;
2058 prev_j = (unsigned long)-1;
2059 for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) {
2060 if (j > i)
2061 break;
2062 prev_j = j;
2063 }
2064 if (prev_j == i)
2065 return false;
2066
2067 if (prev_j == (unsigned long)-1)
2068 bytes = (i + 1) * ctl->unit;
2069 else
2070 bytes = (i - prev_j) * ctl->unit;
2071
2072 info->offset -= bytes;
2073 info->bytes += bytes;
2074
2075 if (update_stat)
2076 bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2077 else
2078 __bitmap_clear_bits(ctl, bitmap, info->offset, bytes);
2079
2080 if (!bitmap->bytes)
2081 free_bitmap(ctl, bitmap);
2082
2083 return true;
2084}
2085
2086/*
2087 * We always prefer to allocate from extent entries, both for clustered and
2088 * non-clustered allocation requests. So when attempting to add a new extent
2089 * entry, try to see if there's adjacent free space in bitmap entries, and if
2090 * there is, migrate that space from the bitmaps to the extent.
2091 * Like this we get better chances of satisfying space allocation requests
2092 * because we attempt to satisfy them based on a single cache entry, and never
2093 * on 2 or more entries - even if the entries represent a contiguous free space
2094 * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry
2095 * ends).
2096 */
2097static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl,
2098 struct btrfs_free_space *info,
2099 bool update_stat)
2100{
2101 /*
2102 * Only work with disconnected entries, as we can change their offset,
2103 * and they must be extent entries.
2104 */
2105 ASSERT(!info->bitmap);
2106 ASSERT(RB_EMPTY_NODE(&info->offset_index));
2107
2108 if (ctl->total_bitmaps > 0) {
2109 bool stole_end;
2110 bool stole_front = false;
2111
2112 stole_end = steal_from_bitmap_to_end(ctl, info, update_stat);
2113 if (ctl->total_bitmaps > 0)
2114 stole_front = steal_from_bitmap_to_front(ctl, info,
2115 update_stat);
2116
2117 if (stole_end || stole_front)
2118 try_merge_free_space(ctl, info, update_stat);
2119 }
2120}
2121
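As a rough userspace model of the to-end case only (not kernel code: a single word stands in for the free-space bitmap, set bits mean free space, and `unit` is bytes per bit):

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define NBITS 64

        /* Grow the extent [*offset, *offset + *bytes) by consuming the run
         * of free (set) bits that starts right where the extent ends. */
        static bool steal_to_end(uint64_t bitmap, uint64_t unit,
                                 uint64_t bitmap_base, uint64_t *offset,
                                 uint64_t *bytes)
        {
                uint64_t end = *offset + *bytes;
                unsigned i = (end - bitmap_base) / unit;
                unsigned j = i;

                while (j < NBITS && (bitmap & (1ULL << j)))
                        j++;
                if (j == i)
                        return false;   /* no adjacent free bits */
                *bytes += (uint64_t)(j - i) * unit;
                return true;
        }

        int main(void)
        {
                /* extent [0, 8K) followed by 16K of free bitmap space */
                uint64_t offset = 0, bytes = 8192;
                uint64_t bitmap = 0xf << 2;     /* bits 2..5: [8K, 24K) free */

                steal_to_end(bitmap, 4096, 0, &offset, &bytes);
                printf("extent now [%llu, %llu)\n",
                       (unsigned long long)offset,
                       (unsigned long long)(offset + bytes));
                return 0;       /* prints: extent now [0, 24576) */
        }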
2001int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, 2122int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2002 u64 offset, u64 bytes) 2123 u64 offset, u64 bytes)
2003{ 2124{
@@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2010 2131
2011 info->offset = offset; 2132 info->offset = offset;
2012 info->bytes = bytes; 2133 info->bytes = bytes;
2134 RB_CLEAR_NODE(&info->offset_index);
2013 2135
2014 spin_lock(&ctl->tree_lock); 2136 spin_lock(&ctl->tree_lock);
2015 2137
@@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2029 goto out; 2151 goto out;
2030 } 2152 }
2031link: 2153link:
2154 /*
2155 * Only steal free space from adjacent bitmaps if we're sure we're not
2156 * going to add the new free space to existing bitmap entries - because
2157 * that would mean unnecessary work that would be reverted. Therefore
2158 * attempt to steal space from bitmaps if we're adding an extent entry.
2159 */
2160 steal_from_bitmap(ctl, info, true);
2161
2032 ret = link_free_space(ctl, info); 2162 ret = link_free_space(ctl, info);
2033 if (ret) 2163 if (ret)
2034 kmem_cache_free(btrfs_free_space_cachep, info); 2164 kmem_cache_free(btrfs_free_space_cachep, info);
@@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space(
2205 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2335 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2206 node = rb_next(&entry->offset_index); 2336 node = rb_next(&entry->offset_index);
2207 rb_erase(&entry->offset_index, &cluster->root); 2337 rb_erase(&entry->offset_index, &cluster->root);
2338 RB_CLEAR_NODE(&entry->offset_index);
2208 2339
2209 bitmap = (entry->bitmap != NULL); 2340 bitmap = (entry->bitmap != NULL);
2210 if (!bitmap) 2341 if (!bitmap) {
2211 try_merge_free_space(ctl, entry, false); 2342 try_merge_free_space(ctl, entry, false);
2343 steal_from_bitmap(ctl, entry, false);
2344 }
2212 tree_insert_offset(&ctl->free_space_offset, 2345 tree_insert_offset(&ctl->free_space_offset,
2213 entry->offset, &entry->offset_index, bitmap); 2346 entry->offset, &entry->offset_index, bitmap);
2214 } 2347 }
@@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3033{ 3166{
3034 struct inode *inode = NULL; 3167 struct inode *inode = NULL;
3035 3168
3036 spin_lock(&root->cache_lock); 3169 spin_lock(&root->ino_cache_lock);
3037 if (root->cache_inode) 3170 if (root->ino_cache_inode)
3038 inode = igrab(root->cache_inode); 3171 inode = igrab(root->ino_cache_inode);
3039 spin_unlock(&root->cache_lock); 3172 spin_unlock(&root->ino_cache_lock);
3040 if (inode) 3173 if (inode)
3041 return inode; 3174 return inode;
3042 3175
@@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3044 if (IS_ERR(inode)) 3177 if (IS_ERR(inode))
3045 return inode; 3178 return inode;
3046 3179
3047 spin_lock(&root->cache_lock); 3180 spin_lock(&root->ino_cache_lock);
3048 if (!btrfs_fs_closing(root->fs_info)) 3181 if (!btrfs_fs_closing(root->fs_info))
3049 root->cache_inode = igrab(inode); 3182 root->ino_cache_inode = igrab(inode);
3050 spin_unlock(&root->cache_lock); 3183 spin_unlock(&root->ino_cache_lock);
3051 3184
3052 return inode; 3185 return inode;
3053} 3186}
@@ -3176,6 +3309,7 @@ again:
3176 map = NULL; 3309 map = NULL;
3177 add_new_bitmap(ctl, info, offset); 3310 add_new_bitmap(ctl, info, offset);
3178 bitmap_info = info; 3311 bitmap_info = info;
3312 info = NULL;
3179 } 3313 }
3180 3314
3181 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3315 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
@@ -3186,6 +3320,8 @@ again:
3186 if (bytes) 3320 if (bytes)
3187 goto again; 3321 goto again;
3188 3322
3323 if (info)
3324 kmem_cache_free(btrfs_free_space_cachep, info);
3189 if (map) 3325 if (map)
3190 kfree(map); 3326 kfree(map);
3191 return 0; 3327 return 0;
@@ -3260,6 +3396,7 @@ have_info:
3260 goto have_info; 3396 goto have_info;
3261 } 3397 }
3262 3398
3399 ret = 0;
3263 goto out; 3400 goto out;
3264 } 3401 }
3265 3402
diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c
index 85889aa82c62..aae520b2aee5 100644
--- a/fs/btrfs/hash.c
+++ b/fs/btrfs/hash.c
@@ -20,10 +20,8 @@ static struct crypto_shash *tfm;
20int __init btrfs_hash_init(void) 20int __init btrfs_hash_init(void)
21{ 21{
22 tfm = crypto_alloc_shash("crc32c", 0, 0); 22 tfm = crypto_alloc_shash("crc32c", 0, 0);
23 if (IS_ERR(tfm))
24 return PTR_ERR(tfm);
25 23
26 return 0; 24 return PTR_ERR_OR_ZERO(tfm);
27} 25}
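PTR_ERR_OR_ZERO() is the helper form of the removed branch; its definition (include/linux/err.h) is roughly:

        static inline int PTR_ERR_OR_ZERO(__force const void *ptr)
        {
                if (IS_ERR(ptr))
                        return PTR_ERR(ptr);
                return 0;
        }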
28 26
29void btrfs_hash_exit(void) 27void btrfs_hash_exit(void)
@@ -33,18 +31,16 @@ void btrfs_hash_exit(void)
33 31
34u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) 32u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length)
35{ 33{
36 struct { 34 SHASH_DESC_ON_STACK(shash, tfm);
37 struct shash_desc shash; 35 u32 *ctx = (u32 *)shash_desc_ctx(shash);
38 char ctx[crypto_shash_descsize(tfm)];
39 } desc;
40 int err; 36 int err;
41 37
42 desc.shash.tfm = tfm; 38 shash->tfm = tfm;
43 desc.shash.flags = 0; 39 shash->flags = 0;
44 *(u32 *)desc.ctx = crc; 40 *ctx = crc;
45 41
46 err = crypto_shash_update(&desc.shash, address, length); 42 err = crypto_shash_update(shash, address, length);
47 BUG_ON(err); 43 BUG_ON(err);
48 44
49 return *(u32 *)desc.ctx; 45 return *ctx;
50} 46}
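SHASH_DESC_ON_STACK() expands to a correctly sized on-stack shash_desc; a minimal sketch of the same idiom for another caller (assuming tfm was allocated with crypto_alloc_shash("crc32c", 0, 0), and eliding error handling the way btrfs_crc32c() does):

        #include <crypto/hash.h>

        static u32 crc32c_oneshot(struct crypto_shash *tfm, u32 seed,
                                  const void *buf, unsigned int len)
        {
                SHASH_DESC_ON_STACK(desc, tfm); /* ctx sized for this tfm */
                u32 *ctx = (u32 *)shash_desc_ctx(desc);

                desc->tfm = tfm;
                desc->flags = 0;
                *ctx = seed;    /* crc32c's shash state is the running crc */
                crypto_shash_update(desc, buf, len);
                return *ctx;
        }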
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 2be38df703c9..8ffa4783cbf4 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
 	u32 item_size;
 
 	key.objectid = inode_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.type = BTRFS_INODE_EXTREF_KEY;
 	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
 
 	path = btrfs_alloc_path();
@@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 
 	key.objectid = inode_objectid;
 	key.offset = ref_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+	key.type = BTRFS_INODE_REF_KEY;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 
 	key.objectid = inode_objectid;
 	key.offset = ref_objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY);
+	key.type = BTRFS_INODE_REF_KEY;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 	struct btrfs_key key;
 	int ret;
 	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
 
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
@@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct btrfs_key found_key;
 
 	ret = btrfs_search_slot(trans, root, location, path, ins_len, cow);
-	if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY &&
+	if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY &&
 	    location->offset == (u64)-1 && path->slots[0] != 0) {
 		slot = path->slots[0] - 1;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 		if (found_key.objectid == location->objectid &&
-		    btrfs_key_type(&found_key) == btrfs_key_type(location)) {
+		    found_key.type == location->type) {
 			path->slots[0]--;
 			return 0;
 		}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 888fbe19079f..83d646bd2e4b 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -87,7 +87,7 @@ again:
 			 */
 			btrfs_item_key_to_cpu(leaf, &key, 0);
 			btrfs_release_path(path);
-			root->cache_progress = last;
+			root->ino_cache_progress = last;
 			up_read(&fs_info->commit_root_sem);
 			schedule_timeout(1);
 			goto again;
@@ -106,7 +106,7 @@ again:
 		if (last != (u64)-1 && last + 1 != key.objectid) {
 			__btrfs_add_free_space(ctl, last + 1,
 					       key.objectid - last - 1);
-			wake_up(&root->cache_wait);
+			wake_up(&root->ino_cache_wait);
 		}
 
 		last = key.objectid;
@@ -119,14 +119,14 @@ next:
 				       root->highest_objectid - last - 1);
 	}
 
-	spin_lock(&root->cache_lock);
-	root->cached = BTRFS_CACHE_FINISHED;
-	spin_unlock(&root->cache_lock);
+	spin_lock(&root->ino_cache_lock);
+	root->ino_cache_state = BTRFS_CACHE_FINISHED;
+	spin_unlock(&root->ino_cache_lock);
 
-	root->cache_progress = (u64)-1;
+	root->ino_cache_progress = (u64)-1;
 	btrfs_unpin_free_ino(root);
 out:
-	wake_up(&root->cache_wait);
+	wake_up(&root->ino_cache_wait);
 	up_read(&fs_info->commit_root_sem);
 
 	btrfs_free_path(path);
@@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root)
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return;
 
-	spin_lock(&root->cache_lock);
-	if (root->cached != BTRFS_CACHE_NO) {
-		spin_unlock(&root->cache_lock);
+	spin_lock(&root->ino_cache_lock);
+	if (root->ino_cache_state != BTRFS_CACHE_NO) {
+		spin_unlock(&root->ino_cache_lock);
 		return;
 	}
 
-	root->cached = BTRFS_CACHE_STARTED;
-	spin_unlock(&root->cache_lock);
+	root->ino_cache_state = BTRFS_CACHE_STARTED;
+	spin_unlock(&root->ino_cache_lock);
 
 	ret = load_free_ino_cache(root->fs_info, root);
 	if (ret == 1) {
-		spin_lock(&root->cache_lock);
-		root->cached = BTRFS_CACHE_FINISHED;
-		spin_unlock(&root->cache_lock);
+		spin_lock(&root->ino_cache_lock);
+		root->ino_cache_state = BTRFS_CACHE_FINISHED;
+		spin_unlock(&root->ino_cache_lock);
 		return;
 	}
 
@@ -196,11 +196,11 @@ again:
 
 	start_caching(root);
 
-	wait_event(root->cache_wait,
-		   root->cached == BTRFS_CACHE_FINISHED ||
+	wait_event(root->ino_cache_wait,
+		   root->ino_cache_state == BTRFS_CACHE_FINISHED ||
 		   root->free_ino_ctl->free_space > 0);
 
-	if (root->cached == BTRFS_CACHE_FINISHED &&
+	if (root->ino_cache_state == BTRFS_CACHE_FINISHED &&
 	    root->free_ino_ctl->free_space == 0)
 		return -ENOSPC;
 	else
@@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
 		return;
 again:
-	if (root->cached == BTRFS_CACHE_FINISHED) {
+	if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
 		__btrfs_add_free_space(pinned, objectid, 1);
 	} else {
 		down_write(&root->fs_info->commit_root_sem);
-		spin_lock(&root->cache_lock);
-		if (root->cached == BTRFS_CACHE_FINISHED) {
-			spin_unlock(&root->cache_lock);
+		spin_lock(&root->ino_cache_lock);
+		if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
+			spin_unlock(&root->ino_cache_lock);
 			up_write(&root->fs_info->commit_root_sem);
 			goto again;
 		}
-		spin_unlock(&root->cache_lock);
+		spin_unlock(&root->ino_cache_lock);
 
 		start_caching(root);
 
@@ -235,10 +235,10 @@ again:
 }
 
 /*
- * When a transaction is committed, we'll move those inode numbers which
- * are smaller than root->cache_progress from pinned tree to free_ino tree,
- * and others will just be dropped, because the commit root we were
- * searching has changed.
+ * When a transaction is committed, we'll move those inode numbers which are
+ * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and
+ * others will just be dropped, because the commit root we were searching has
+ * changed.
  *
  * Must be called with root->fs_info->commit_root_sem held
  */
@@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
 		BUG_ON(info->bitmap); /* Logic error */
 
-		if (info->offset > root->cache_progress)
+		if (info->offset > root->ino_cache_progress)
 			goto free;
-		else if (info->offset + info->bytes > root->cache_progress)
-			count = root->cache_progress - info->offset + 1;
+		else if (info->offset + info->bytes > root->ino_cache_progress)
+			count = root->ino_cache_progress - info->offset + 1;
 		else
 			count = info->bytes;
 
@@ -462,13 +462,13 @@ again:
 		}
 	}
 
-	spin_lock(&root->cache_lock);
-	if (root->cached != BTRFS_CACHE_FINISHED) {
+	spin_lock(&root->ino_cache_lock);
+	if (root->ino_cache_state != BTRFS_CACHE_FINISHED) {
 		ret = -1;
-		spin_unlock(&root->cache_lock);
+		spin_unlock(&root->ino_cache_lock);
 		goto out_put;
 	}
-	spin_unlock(&root->cache_lock);
+	spin_unlock(&root->ino_cache_lock);
 
 	spin_lock(&ctl->tree_lock);
 	prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3668048e16f8..d23362f4464e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 
 		key.objectid = btrfs_ino(inode);
 		key.offset = start;
-		btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
+		key.type = BTRFS_EXTENT_DATA_KEY;
 
 		datasize = btrfs_file_extent_calc_inline_size(cur_size);
 		path->leave_spinning = 1;
@@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
 		data_len = compressed_size;
 
 	if (start > 0 ||
-	    actual_end >= PAGE_CACHE_SIZE ||
-	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
+	    actual_end > PAGE_CACHE_SIZE ||
+	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) ||
 	    (!compressed_size &&
 	    (actual_end & (root->sectorsize - 1)) == 0) ||
 	    end + 1 < isize ||
@@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow,
 	return 0;
 }
 
+static inline int inode_need_compress(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	/* force compress */
+	if (btrfs_test_opt(root, FORCE_COMPRESS))
+		return 1;
+	/* bad compression ratios */
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+		return 0;
+	if (btrfs_test_opt(root, COMPRESS) ||
+	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
+	    BTRFS_I(inode)->force_compress)
+		return 1;
+	return 0;
+}
+
 /*
  * we create compressed extents in two phases. The first
  * phase compresses a range of pages that have already been
@@ -444,10 +461,7 @@ again:
 	 * inode has not been flagged as nocompress. This flag can
 	 * change at any time if we discover bad compression ratios.
 	 */
-	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    (btrfs_test_opt(root, COMPRESS) ||
-	     (BTRFS_I(inode)->force_compress) ||
-	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
+	if (inode_need_compress(inode)) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 		if (!pages) {
@@ -709,6 +723,18 @@ retry:
 			unlock_extent(io_tree, async_extent->start,
 				      async_extent->start +
 				      async_extent->ram_size - 1);
+
+			/*
+			 * we need to redirty the pages if we decide to
+			 * fallback to uncompressed IO, otherwise we
+			 * will not submit these pages down to lower
+			 * layers.
+			 */
+			extent_range_redirty_for_io(inode,
+					async_extent->start,
+					async_extent->start +
+					async_extent->ram_size - 1);
+
 			goto retry;
 		}
 		goto out_free;
@@ -766,8 +792,12 @@ retry:
 					ins.offset,
 					BTRFS_ORDERED_COMPRESSED,
 					async_extent->compress_type);
-		if (ret)
+		if (ret) {
+			btrfs_drop_extent_cache(inode, async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1, 0);
 			goto out_free_reserve;
+		}
 
 		/*
 		 * clear dirty, set writeback and unlock the pages.
@@ -959,14 +989,14 @@ static noinline int cow_file_range(struct inode *inode,
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
 					       ram_size, cur_alloc_size, 0);
 		if (ret)
-			goto out_reserve;
+			goto out_drop_extent_cache;
 
 		if (root->root_key.objectid ==
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, start,
 						      cur_alloc_size);
 			if (ret)
-				goto out_reserve;
+				goto out_drop_extent_cache;
 		}
 
 		if (disk_num_bytes < cur_alloc_size)
@@ -994,6 +1024,8 @@ static noinline int cow_file_range(struct inode *inode,
 out:
 	return ret;
 
+out_drop_extent_cache:
+	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
 out_reserve:
 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
@@ -1076,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 		async_cow->locked_page = locked_page;
 		async_cow->start = start;
 
-		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
+		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+		    !btrfs_test_opt(root, FORCE_COMPRESS))
 			cur_end = end;
 		else
 			cur_end = min(end, start + 512 * 1024 - 1);
@@ -1084,8 +1117,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 		async_cow->end = cur_end;
 		INIT_LIST_HEAD(&async_cow->extents);
 
-		btrfs_init_work(&async_cow->work, async_cow_start,
-				async_cow_submit, async_cow_free);
+		btrfs_init_work(&async_cow->work,
+				btrfs_delalloc_helper,
+				async_cow_start, async_cow_submit,
+				async_cow_free);
 
 		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
 			PAGE_CACHE_SHIFT;
@@ -1425,6 +1460,26 @@ error:
 	return ret;
 }
 
+static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
+{
+
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
+		return 0;
+
+	/*
+	 * @defrag_bytes is a hint value, no spinlock held here,
+	 * if is not zero, it means the file is defragging.
+	 * Force cow if given extent needs to be defragged.
+	 */
+	if (BTRFS_I(inode)->defrag_bytes &&
+	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
+			   EXTENT_DEFRAG, 0, NULL))
+		return 1;
+
+	return 0;
+}
+
 /*
  * extent_io.c call back to do delayed allocation processing
  */
@@ -1433,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 			      unsigned long *nr_written)
 {
 	int ret;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int force_cow = need_force_cow(inode, start, end);
 
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 1, nr_written);
-	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
+	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	} else if (!btrfs_test_opt(root, COMPRESS) &&
-		   !(BTRFS_I(inode)->force_compress) &&
-		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
+	} else if (!inode_need_compress(inode)) {
 		ret = cow_file_range(inode, locked_page, start, end,
 				     page_started, nr_written, 1);
 	} else {
@@ -1535,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
 			       struct extent_state *state, unsigned long *bits)
 {
 
+	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+		WARN_ON(1);
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testing for the DELALLOC
@@ -1557,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
 				     root->fs_info->delalloc_batch);
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
+		if (*bits & EXTENT_DEFRAG)
+			BTRFS_I(inode)->defrag_bytes += len;
 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
 					 &BTRFS_I(inode)->runtime_flags))
 			btrfs_add_delalloc_inodes(root, inode);
@@ -1571,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 				 struct extent_state *state,
 				 unsigned long *bits)
 {
+	u64 len = state->end + 1 - state->start;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
+		BTRFS_I(inode)->defrag_bytes -= len;
+	spin_unlock(&BTRFS_I(inode)->lock);
+
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testing for the DELALLOC
@@ -1578,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 	 */
 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
-		u64 len = state->end + 1 - state->start;
 		bool do_list = !btrfs_is_free_space_inode(inode);
 
 		if (*bits & EXTENT_FIRST_DELALLOC) {
@@ -1869,7 +1932,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 
 	SetPageChecked(page);
 	page_cache_get(page);
-	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
+	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
+			btrfs_writepage_fixup_worker, NULL, NULL);
 	fixup->page = page;
 	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
 	return -EBUSY;
@@ -2639,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		goto out;
 	}
 
+	btrfs_free_io_failure_record(inode, ordered_extent->file_offset,
+				     ordered_extent->file_offset +
+				     ordered_extent->len - 1);
+
 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
 		truncated = true;
 		logical_len = ordered_extent->truncated_len;
@@ -2810,7 +2878,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	struct inode *inode = page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
-	struct btrfs_workqueue *workers;
+	struct btrfs_workqueue *wq;
+	btrfs_work_func_t func;
 
 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
@@ -2819,15 +2888,53 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 					    end - start + 1, uptodate))
 		return 0;
 
-	btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
+	if (btrfs_is_free_space_inode(inode)) {
+		wq = root->fs_info->endio_freespace_worker;
+		func = btrfs_freespace_write_helper;
+	} else {
+		wq = root->fs_info->endio_write_workers;
+		func = btrfs_endio_write_helper;
+	}
 
-	if (btrfs_is_free_space_inode(inode))
-		workers = root->fs_info->endio_freespace_worker;
-	else
-		workers = root->fs_info->endio_write_workers;
-	btrfs_queue_work(workers, &ordered_extent->work);
+	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
+			NULL);
+	btrfs_queue_work(wq, &ordered_extent->work);
+
+	return 0;
+}
+
+static int __readpage_endio_check(struct inode *inode,
+				  struct btrfs_io_bio *io_bio,
+				  int icsum, struct page *page,
+				  int pgoff, u64 start, size_t len)
+{
+	char *kaddr;
+	u32 csum_expected;
+	u32 csum = ~(u32)0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+
+	csum_expected = *(((u32 *)io_bio->csum) + icsum);
+
+	kaddr = kmap_atomic(page);
+	csum = btrfs_csum_data(kaddr + pgoff, csum, len);
+	btrfs_csum_final(csum, (char *)&csum);
+	if (csum != csum_expected)
+		goto zeroit;
 
+	kunmap_atomic(kaddr);
 	return 0;
+zeroit:
+	if (__ratelimit(&_rs))
+		btrfs_info(BTRFS_I(inode)->root->fs_info,
+			   "csum failed ino %llu off %llu csum %u expected csum %u",
+			   btrfs_ino(inode), start, csum, csum_expected);
+	memset(kaddr + pgoff, 1, len);
+	flush_dcache_page(page);
+	kunmap_atomic(kaddr);
+	if (csum_expected == 0)
+		return 0;
+	return -EIO;
 }
 
 /*
@@ -2842,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	size_t offset = start - page_offset(page);
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	char *kaddr;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u32 csum_expected;
-	u32 csum = ~(u32)0;
-	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-				      DEFAULT_RATELIMIT_BURST);
 
 	if (PageChecked(page)) {
 		ClearPageChecked(page);
-		goto good;
+		return 0;
 	}
 
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
-		goto good;
+		return 0;
 
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -2865,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 	}
 
 	phy_offset >>= inode->i_sb->s_blocksize_bits;
-	csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
-
-	kaddr = kmap_atomic(page);
-	csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
-	btrfs_csum_final(csum, (char *)&csum);
-	if (csum != csum_expected)
-		goto zeroit;
-
-	kunmap_atomic(kaddr);
-good:
-	return 0;
-
-zeroit:
-	if (__ratelimit(&_rs))
-		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-			btrfs_ino(page->mapping->host), start, csum, csum_expected);
-	memset(kaddr + offset, 1, end - start + 1);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr);
-	if (csum_expected == 0)
-		return 0;
-	return -EIO;
+	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
+				      start, (size_t)(end - start + 1));
 }
 
 struct delayed_iput {
@@ -3133,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 	path->reada = -1;
 
 	key.objectid = BTRFS_ORPHAN_OBJECTID;
-	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
+	key.type = BTRFS_ORPHAN_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	while (1) {
@@ -3160,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		/* make sure the item matches what we want */
 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
 			break;
-		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
+		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
 			break;
 
 		/* release the path since we're done with it */
@@ -3636,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	 * without delay
 	 */
 	if (!btrfs_is_free_space_inode(inode)
-	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
+	    && !root->fs_info->log_root_recovering) {
 		btrfs_update_root_times(trans, root);
 
 		ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -4059,7 +4142,7 @@ search_again:
 		fi = NULL;
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		found_type = btrfs_key_type(&found_key);
+		found_type = found_key.type;
 
 		if (found_key.objectid != ino)
 			break;
@@ -4222,7 +4305,8 @@ out:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 error:
-	if (last_size != (u64)-1)
+	if (last_size != (u64)-1 &&
+	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		btrfs_ordered_update_i_size(inode, last_size, NULL);
 	btrfs_free_path(path);
 	return err;
@@ -4662,6 +4746,11 @@ static void evict_inode_truncate_pages(struct inode *inode)
 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
 		remove_extent_mapping(map_tree, em);
 		free_extent_map(em);
+		if (need_resched()) {
+			write_unlock(&map_tree->lock);
+			cond_resched();
+			write_lock(&map_tree->lock);
+		}
 	}
 	write_unlock(&map_tree->lock);
 
@@ -4684,6 +4773,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
 				 &cached_state, GFP_NOFS);
 		free_extent_state(state);
 
+		cond_resched();
 		spin_lock(&io_tree->lock);
 	}
 	spin_unlock(&io_tree->lock);
@@ -4714,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode)
 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
+	btrfs_free_io_failure_record(inode, 0, (u64)-1);
+
 	if (root->fs_info->log_root_recovering) {
 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 				 &BTRFS_I(inode)->runtime_flags));
@@ -5262,7 +5354,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 		btrfs_get_delayed_items(inode, &ins_list, &del_list);
 	}
 
-	btrfs_set_key_type(&key, key_type);
+	key.type = key_type;
 	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);
 
@@ -5287,7 +5379,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
 		if (found_key.objectid != key.objectid)
 			break;
-		if (btrfs_key_type(&found_key) != key_type)
+		if (found_key.type != key_type)
 			break;
 		if (found_key.offset < ctx->pos)
 			goto next;
@@ -5499,7 +5591,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
 	int ret;
 
 	key.objectid = btrfs_ino(inode);
-	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
+	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = (u64)-1;
 
 	path = btrfs_alloc_path();
@@ -5531,7 +5623,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
 	if (found_key.objectid != btrfs_ino(inode) ||
-	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
+	    found_key.type != BTRFS_DIR_INDEX_KEY) {
 		BTRFS_I(inode)->index_cnt = 2;
 		goto out;
 	}
@@ -5565,6 +5657,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
 	return ret;
 }
 
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+	struct btrfs_iget_args args;
+	args.location = &BTRFS_I(inode)->location;
+	args.root = BTRFS_I(inode)->root;
+
+	return insert_inode_locked4(inode,
+		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+		   btrfs_find_actor, &args);
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *dir,
@@ -5594,6 +5697,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	}
 
 	/*
+	 * O_TMPFILE, set link count to 0, so that after this point,
+	 * we fill in an inode item with the correct link count.
+	 */
+	if (!name)
+		set_nlink(inode, 0);
+
+	/*
 	 * we have to initialize this early, so we can reclaim the inode
 	 * number if we fail afterwards in this function.
 	 */
@@ -5631,7 +5741,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
 	key[0].objectid = objectid;
-	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
+	key[0].type = BTRFS_INODE_ITEM_KEY;
 	key[0].offset = 0;
 
 	sizes[0] = sizeof(struct btrfs_inode_item);
@@ -5644,16 +5754,25 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		 * add more hard links than can fit in the ref item.
 		 */
 		key[1].objectid = objectid;
-		btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
+		key[1].type = BTRFS_INODE_REF_KEY;
 		key[1].offset = ref_objectid;
 
 		sizes[1] = name_len + sizeof(*ref);
 	}
 
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->offset = 0;
+	location->type = BTRFS_INODE_ITEM_KEY;
+
+	ret = btrfs_insert_inode_locked(inode);
+	if (ret < 0)
+		goto fail;
+
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
 	if (ret != 0)
-		goto fail;
+		goto fail_unlock;
 
 	inode_init_owner(inode, dir, mode);
 	inode_set_bytes(inode, 0);
@@ -5676,11 +5795,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	location = &BTRFS_I(inode)->location;
-	location->objectid = objectid;
-	location->offset = 0;
-	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-
 	btrfs_inherit_iflags(inode, dir);
 
 	if (S_ISREG(mode)) {
@@ -5691,7 +5805,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				BTRFS_INODE_NODATASUM;
 	}
 
-	btrfs_insert_inode_hash(inode);
 	inode_tree_add(inode);
 
 	trace_btrfs_inode_new(inode);
@@ -5706,6 +5819,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			  btrfs_ino(inode), root->root_key.objectid, ret);
 
 	return inode;
+
+fail_unlock:
+	unlock_new_inode(inode);
 fail:
 	if (dir && name)
 		BTRFS_I(dir)->index_cnt--;
@@ -5739,7 +5855,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
 	} else {
 		key.objectid = ino;
-		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+		key.type = BTRFS_INODE_ITEM_KEY;
 		key.offset = 0;
 	}
 
@@ -5840,28 +5956,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err) {
-		drop_inode = 1;
-		goto out_unlock;
-	}
-
 	/*
 	 * If the active LSM wants to access the inode during
 	 * d_instantiate it needs these. Smack checks to see
 	 * if the filesystem supports xattrs by looking at the
 	 * ops vector.
 	 */
-
 	inode->i_op = &btrfs_special_inode_operations;
-	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	init_special_inode(inode, inode->i_mode, rdev);
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err)
-		drop_inode = 1;
-	else {
-		init_special_inode(inode, inode->i_mode, rdev);
+		goto out_unlock_inode;
+
+	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	if (err) {
+		goto out_unlock_inode;
+	} else {
 		btrfs_update_inode(trans, root, inode);
+		unlock_new_inode(inode);
 		d_instantiate(dentry, inode);
 	}
+
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	btrfs_balance_delayed_items(root);
@@ -5871,6 +5987,12 @@ out_unlock:
 		iput(inode);
 	}
 	return err;
+
+out_unlock_inode:
+	drop_inode = 1;
+	unlock_new_inode(inode);
+	goto out_unlock;
+
 }
 
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5905,15 +6027,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 	drop_inode_on_err = 1;
-
-	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err)
-		goto out_unlock;
-
-	err = btrfs_update_inode(trans, root, inode);
-	if (err)
-		goto out_unlock;
-
 	/*
 	 * If the active LSM wants to access the inode during
 	 * d_instantiate it needs these. Smack checks to see
@@ -5922,14 +6035,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	 */
 	inode->i_fop = &btrfs_file_operations;
 	inode->i_op = &btrfs_file_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock_inode;
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_unlock_inode;
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		goto out_unlock;
+		goto out_unlock_inode;
 
-	inode->i_mapping->a_ops = &btrfs_aops;
-	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 
 out_unlock:
@@ -5941,6 +6063,11 @@ out_unlock:
 	btrfs_balance_delayed_items(root);
 	btrfs_btree_balance_dirty(root);
 	return err;
+
+out_unlock_inode:
+	unlock_new_inode(inode);
+	goto out_unlock;
+
 }
 
 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6048,25 +6175,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	}
 
 	drop_on_err = 1;
+	/* these must be set before we unlock the inode */
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err)
-		goto out_fail;
-
-	inode->i_op = &btrfs_dir_inode_operations;
-	inode->i_fop = &btrfs_dir_file_operations;
+		goto out_fail_inode;
 
 	btrfs_i_size_write(inode, 0);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
-		goto out_fail;
+		goto out_fail_inode;
 
 	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
 			     dentry->d_name.len, 0, index);
 	if (err)
-		goto out_fail;
+		goto out_fail_inode;
 
 	d_instantiate(dentry, inode);
+	/*
+	 * mkdir is special. We're unlocking after we call d_instantiate
+	 * to avoid a race with nfsd calling d_instantiate.
+	 */
+	unlock_new_inode(inode);
 	drop_on_err = 0;
 
 out_fail:
@@ -6076,23 +6208,66 @@ out_fail:
 	btrfs_balance_delayed_items(root);
 	btrfs_btree_balance_dirty(root);
 	return err;
+
+out_fail_inode:
+	unlock_new_inode(inode);
+	goto out_fail;
+}
+
+/* Find next extent map of a given extent map, caller needs to ensure locks */
+static struct extent_map *next_extent_map(struct extent_map *em)
+{
+	struct rb_node *next;
+
+	next = rb_next(&em->rb_node);
+	if (!next)
+		return NULL;
+	return container_of(next, struct extent_map, rb_node);
+}
+
+static struct extent_map *prev_extent_map(struct extent_map *em)
+{
+	struct rb_node *prev;
+
+	prev = rb_prev(&em->rb_node);
+	if (!prev)
+		return NULL;
+	return container_of(prev, struct extent_map, rb_node);
 }
 
 /* helper for btfs_get_extent. Given an existing extent in the tree,
+ * the existing extent is the nearest extent to map_start,
  * and an extent that you want to insert, deal with overlap and insert
- * the new extent into the tree.
+ * the best fitted new extent into the tree.
  */
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
 				struct extent_map *em,
-				u64 map_start, u64 map_len)
+				u64 map_start)
 {
+	struct extent_map *prev;
+	struct extent_map *next;
+	u64 start;
+	u64 end;
 	u64 start_diff;
 
 	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
-	start_diff = map_start - em->start;
-	em->start = map_start;
-	em->len = map_len;
+
+	if (existing->start > map_start) {
+		next = existing;
+		prev = prev_extent_map(next);
+	} else {
+		prev = existing;
+		next = next_extent_map(prev);
+	}
+
+	start = prev ? extent_map_end(prev) : em->start;
+	start = max_t(u64, start, em->start);
+	end = next ? next->start : extent_map_end(em);
+	end = min_t(u64, end, extent_map_end(em));
+	start_diff = start - em->start;
+	em->start = start;
+	em->len = end - start;
 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
 		em->block_start += start_diff;
@@ -6220,7 +6395,7 @@ again:
 				      struct btrfs_file_extent_item);
 		/* are we inside the extent that was found? */
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		found_type = btrfs_key_type(&found_key);
+		found_type = found_key.type;
 		if (found_key.objectid != objectid ||
 		    found_type != BTRFS_EXTENT_DATA_KEY) {
 			/*
@@ -6263,6 +6438,8 @@ next:
 			goto not_found;
 		if (start + len <= found_key.offset)
 			goto not_found;
+		if (start > found_key.offset)
+			goto next;
 		em->start = start;
 		em->orig_start = start;
 		em->len = found_key.offset - start;
@@ -6367,26 +6544,21 @@ insert:
 
 	ret = 0;
 
-	existing = lookup_extent_mapping(em_tree, start, len);
-	if (existing && (existing->start > start ||
-	    existing->start + existing->len <= start)) {
+	existing = search_extent_mapping(em_tree, start, len);
+	/*
+	 * existing will always be non-NULL, since there must be
+	 * extent causing the -EEXIST.
+	 */
+	if (start >= extent_map_end(existing) ||
+	    start <= existing->start) {
+		/*
+		 * The existing extent map is the one nearest to
+		 * the [start, start + len) range which overlaps
+		 */
+		err = merge_extent_mapping(em_tree, existing,
+					   em, start);
 		free_extent_map(existing);
-		existing = NULL;
-	}
-	if (!existing) {
-		existing = lookup_extent_mapping(em_tree, em->start,
-						 em->len);
-		if (existing) {
-			err = merge_extent_mapping(em_tree, existing,
-						   em, start,
-						   root->sectorsize);
-			free_extent_map(existing);
-			if (err) {
-				free_extent_map(em);
-				em = NULL;
-			}
-		} else {
-			err = -EIO;
+		if (err) {
 			free_extent_map(em);
 			em = NULL;
 		}
@@ -6998,8 +7170,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 						  block_start, len,
 						  orig_block_len,
 						  ram_bytes, type);
-		if (IS_ERR(em))
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
 			goto unlock_err;
+		}
 	}
 
 	ret = btrfs_add_ordered_extent_dio(inode, start,
@@ -7074,45 +7248,277 @@ unlock_err:
 	return ret;
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+					int rw, int mirror_num)
 {
-	struct btrfs_dio_private *dip = bio->bi_private;
-	struct bio_vec *bvec;
-	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct bio *dio_bio;
-	u32 *csums = (u32 *)dip->csum;
+	int ret;
+
+	BUG_ON(rw & REQ_WRITE);
+
+	bio_get(bio);
+
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio,
+				  BTRFS_WQ_ENDIO_DIO_REPAIR);
+	if (ret)
+		goto err;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+err:
+	bio_put(bio);
+	return ret;
+}
+
+static int btrfs_check_dio_repairable(struct inode *inode,
+				      struct bio *failed_bio,
+				      struct io_failure_record *failrec,
+				      int failed_mirror)
+{
+	int num_copies;
+
+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+				      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	failrec->failed_mirror = failed_mirror;
+	failrec->this_mirror++;
+	if (failrec->this_mirror == failed_mirror)
+		failrec->this_mirror++;
+
+	if (failrec->this_mirror > num_copies) {
+		pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int dio_read_error(struct inode *inode, struct bio *failed_bio,
+			  struct page *page, u64 start, u64 end,
+			  int failed_mirror, bio_end_io_t *repair_endio,
+			  void *repair_arg)
+{
+	struct io_failure_record *failrec;
+	struct bio *bio;
+	int isector;
+	int read_mode;
+	int ret;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
+	if (ret)
+		return ret;
+
+	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
+					 failed_mirror);
+	if (!ret) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	if (failed_bio->bi_vcnt > 1)
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	else
+		read_mode = READ_SYNC;
+
+	isector = start - btrfs_io_bio(failed_bio)->logical;
+	isector >>= inode->i_sb->s_blocksize_bits;
+	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
+				      0, isector, repair_endio, repair_arg);
+	if (!bio) {
+		free_io_failure(inode, failrec);
+		return -EIO;
+	}
+
+	btrfs_debug(BTRFS_I(inode)->root->fs_info,
+		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
+		    read_mode, failrec->this_mirror, failrec->in_validation);
+
+	ret = submit_dio_repair_bio(inode, bio, read_mode,
+				    failrec->this_mirror);
+	if (ret) {
+		free_io_failure(inode, failrec);
+		bio_put(bio);
+	}
+
+	return ret;
+}
+
+struct btrfs_retry_complete {
+	struct completion done;
+	struct inode *inode;
 	u64 start;
+	int uptodate;
+};
+
+static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+{
+	struct btrfs_retry_complete *done = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	if (err)
+		goto end;
+
+	done->uptodate = 1;
+	bio_for_each_segment_all(bvec, bio, i)
+		clean_io_failure(done->inode, done->start, bvec->bv_page, 0);
+end:
+	complete(&done->done);
+	bio_put(bio);
+}
+
+static int __btrfs_correct_data_nocsum(struct inode *inode,
+				       struct btrfs_io_bio *io_bio)
+{
+	struct bio_vec *bvec;
+	struct btrfs_retry_complete done;
+	u64 start;
+	int i;
+	int ret;
+
+	start = io_bio->logical;
+	done.inode = inode;
+
+	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+try_again:
+		done.uptodate = 0;
+		done.start = start;
+		init_completion(&done.done);
+
+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+				     start + bvec->bv_len - 1,
+				     io_bio->mirror_num,
+				     btrfs_retry_endio_nocsum, &done);
+		if (ret)
+			return ret;
+
+		wait_for_completion(&done.done);
+
+		if (!done.uptodate) {
+			/* We might have another mirror, so try again */
+			goto try_again;
+		}
+
+		start += bvec->bv_len;
+	}
+
+	return 0;
+}
+
+static void btrfs_retry_endio(struct bio *bio, int err)
+{
+	struct btrfs_retry_complete *done = bio->bi_private;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct bio_vec *bvec;
+	int uptodate;
+	int ret;
 	int i;
 
-	start = dip->logical_offset;
+	if (err)
+		goto end;
+
+	uptodate = 1;
 	bio_for_each_segment_all(bvec, bio, i) {
-		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-			struct page *page = bvec->bv_page;
-			char *kaddr;
-			u32 csum = ~(u32)0;
-			unsigned long flags;
-
-			local_irq_save(flags);
-			kaddr = kmap_atomic(page);
-			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
-					       csum, bvec->bv_len);
-			btrfs_csum_final(csum, (char *)&csum);
-			kunmap_atomic(kaddr);
-			local_irq_restore(flags);
-
-			flush_dcache_page(bvec->bv_page);
-			if (csum != csums[i]) {
-				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
-					  btrfs_ino(inode), start, csum,
-					  csums[i]);
-				err = -EIO;
-			}
+		ret = __readpage_endio_check(done->inode, io_bio, i,
+					     bvec->bv_page, 0,
+					     done->start, bvec->bv_len);
+		if (!ret)
+			clean_io_failure(done->inode, done->start,
+					 bvec->bv_page, 0);
+		else
+			uptodate = 0;
+	}
+
+	done->uptodate = uptodate;
+end:
+	complete(&done->done);
+	bio_put(bio);
+}
+
+static int __btrfs_subio_endio_read(struct inode *inode,
+				    struct btrfs_io_bio *io_bio, int err)
+{
+	struct bio_vec *bvec;
+	struct btrfs_retry_complete done;
+	u64 start;
+	u64 offset = 0;
+	int i;
+	int ret;
+
+	err = 0;
+	start = io_bio->logical;
+	done.inode = inode;
+
+	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
+		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
+					     0, start, bvec->bv_len);
+		if (likely(!ret))
+			goto next;
+try_again:
+		done.uptodate = 0;
+		done.start = start;
+		init_completion(&done.done);
+
+		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start,
+				     start + bvec->bv_len - 1,
+				     io_bio->mirror_num,
+				     btrfs_retry_endio, &done);
+		if (ret) {
+			err = ret;
+			goto next;
 		}
 
+		wait_for_completion(&done.done);
+
+		if (!done.uptodate) {
+			/* We might have another mirror, so try again */
+			goto try_again;
+		}
+next:
+		offset += bvec->bv_len;
 		start += bvec->bv_len;
 	}
 
+	return err;
+}
+
+static int btrfs_subio_endio_read(struct inode *inode,
+				  struct btrfs_io_bio *io_bio, int err)
+{
+	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	if (skip_csum) {
+		if (unlikely(err))
+			return __btrfs_correct_data_nocsum(inode, io_bio);
+		else
+			return 0;
+	} else {
+		return __btrfs_subio_endio_read(inode, io_bio, err);
+	}
+}
+
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct inode *inode = dip->inode;
+	struct bio *dio_bio;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+
+	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
+		err = btrfs_subio_endio_read(inode, io_bio, err);
+
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
 		      dip->logical_offset + dip->bytes - 1);
 	dio_bio = dip->dio_bio;
@@ -7123,6 +7529,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 	if (err)
 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
 	dio_end_io(dio_bio, err);
+
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, err);
 	bio_put(bio);
 }
 
@@ -7146,7 +7555,8 @@ again:
7146 if (!ret) 7555 if (!ret)
7147 goto out_test; 7556 goto out_test;
7148 7557
7149 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); 7558 btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7559 finish_ordered_fn, NULL, NULL);
7150 btrfs_queue_work(root->fs_info->endio_write_workers, 7560 btrfs_queue_work(root->fs_info->endio_write_workers,
7151 &ordered->work); 7561 &ordered->work);
7152out_test: 7562out_test:
@@ -7187,12 +7597,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
7187{ 7597{
7188 struct btrfs_dio_private *dip = bio->bi_private; 7598 struct btrfs_dio_private *dip = bio->bi_private;
7189 7599
7600 if (err)
7601 btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7602 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7603 btrfs_ino(dip->inode), bio->bi_rw,
7604 (unsigned long long)bio->bi_iter.bi_sector,
7605 bio->bi_iter.bi_size, err);
7606
7607 if (dip->subio_endio)
7608 err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
7609
7190 if (err) { 7610 if (err) {
7191 btrfs_err(BTRFS_I(dip->inode)->root->fs_info,
7192 "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d",
7193 btrfs_ino(dip->inode), bio->bi_rw,
7194 (unsigned long long)bio->bi_iter.bi_sector,
7195 bio->bi_iter.bi_size, err);
7196 dip->errors = 1; 7611 dip->errors = 1;
7197 7612
7198 /* 7613 /*
@@ -7223,6 +7638,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
7223 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); 7638 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
7224} 7639}
7225 7640
7641static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
7642 struct inode *inode,
7643 struct btrfs_dio_private *dip,
7644 struct bio *bio,
7645 u64 file_offset)
7646{
7647 struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
7648 struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
7649 int ret;
7650
7651 /*
7652 * We load all the csum data we need when we submit
7653 * the first bio, to reduce csum tree searches and
7654 * contention.
7655 */
7656 if (dip->logical_offset == file_offset) {
7657 ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio,
7658 file_offset);
7659 if (ret)
7660 return ret;
7661 }
7662
7663 if (bio == dip->orig_bio)
7664 return 0;
7665
7666 file_offset -= dip->logical_offset;
7667 file_offset >>= inode->i_sb->s_blocksize_bits;
7668 io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
7669
7670 return 0;
7671}
7672
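The pointer arithmetic at the end of btrfs_lookup_and_bind_dio_csum() above deserves a worked example: the csums for the whole original bio are looked up once, and each split bio is simply pointed into that shared array at its block index. A standalone sketch under assumed parameters (4 KiB blocks, 4-byte crc32c sums; names are illustrative):

#include <stdint.h>

/* Convert the sub-bio's byte offset (relative to the dip) into a
 * block count, then index the shared u32 csum array; u32 pointer
 * arithmetic skips 4 csum bytes per block. */
static uint8_t *bind_sub_csum(uint32_t *orig_csums, uint64_t dip_logical,
                              uint64_t file_offset, unsigned blocksize_bits)
{
        uint64_t nblocks = (file_offset - dip_logical) >> blocksize_bits;

        return (uint8_t *)(orig_csums + nblocks);
}

/* e.g. bind_sub_csum(base, 0, 8192, 12) == (uint8_t *)base + 8 */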
7226static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 7673static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7227 int rw, u64 file_offset, int skip_sum, 7674 int rw, u64 file_offset, int skip_sum,
7228 int async_submit) 7675 int async_submit)
@@ -7238,7 +7685,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7238 bio_get(bio); 7685 bio_get(bio);
7239 7686
7240 if (!write) { 7687 if (!write) {
7241 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 7688 ret = btrfs_bio_wq_end_io(root->fs_info, bio,
7689 BTRFS_WQ_ENDIO_DATA);
7242 if (ret) 7690 if (ret)
7243 goto err; 7691 goto err;
7244 } 7692 }
@@ -7261,13 +7709,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
7261 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); 7709 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
7262 if (ret) 7710 if (ret)
7263 goto err; 7711 goto err;
7264 } else if (!skip_sum) { 7712 } else {
7265 ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, 7713 ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio,
7266 file_offset); 7714 file_offset);
7267 if (ret) 7715 if (ret)
7268 goto err; 7716 goto err;
7269 } 7717 }
7270
7271map: 7718map:
7272 ret = btrfs_map_bio(root, rw, bio, 0, async_submit); 7719 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
7273err: 7720err:
@@ -7288,19 +7735,18 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7288 u64 submit_len = 0; 7735 u64 submit_len = 0;
7289 u64 map_length; 7736 u64 map_length;
7290 int nr_pages = 0; 7737 int nr_pages = 0;
7291 int ret = 0; 7738 int ret;
7292 int async_submit = 0; 7739 int async_submit = 0;
7293 7740
7294 map_length = orig_bio->bi_iter.bi_size; 7741 map_length = orig_bio->bi_iter.bi_size;
7295 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, 7742 ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
7296 &map_length, NULL, 0); 7743 &map_length, NULL, 0);
7297 if (ret) { 7744 if (ret)
7298 bio_put(orig_bio);
7299 return -EIO; 7745 return -EIO;
7300 }
7301 7746
7302 if (map_length >= orig_bio->bi_iter.bi_size) { 7747 if (map_length >= orig_bio->bi_iter.bi_size) {
7303 bio = orig_bio; 7748 bio = orig_bio;
7749 dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
7304 goto submit; 7750 goto submit;
7305 } 7751 }
7306 7752
@@ -7314,14 +7760,16 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7314 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 7760 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7315 if (!bio) 7761 if (!bio)
7316 return -ENOMEM; 7762 return -ENOMEM;
7763
7317 bio->bi_private = dip; 7764 bio->bi_private = dip;
7318 bio->bi_end_io = btrfs_end_dio_bio; 7765 bio->bi_end_io = btrfs_end_dio_bio;
7766 btrfs_io_bio(bio)->logical = file_offset;
7319 atomic_inc(&dip->pending_bios); 7767 atomic_inc(&dip->pending_bios);
7320 7768
7321 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 7769 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
7322 if (unlikely(map_length < submit_len + bvec->bv_len || 7770 if (map_length < submit_len + bvec->bv_len ||
7323 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 7771 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7324 bvec->bv_offset) < bvec->bv_len)) { 7772 bvec->bv_offset) < bvec->bv_len) {
7325 /* 7773 /*
7326 * inc the count before we submit the bio so 7774 * inc the count before we submit the bio so
7327 * we know the end IO handler won't happen before 7775 * we know the end IO handler won't happen before
@@ -7350,6 +7798,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7350 goto out_err; 7798 goto out_err;
7351 bio->bi_private = dip; 7799 bio->bi_private = dip;
7352 bio->bi_end_io = btrfs_end_dio_bio; 7800 bio->bi_end_io = btrfs_end_dio_bio;
7801 btrfs_io_bio(bio)->logical = file_offset;
7353 7802
7354 map_length = orig_bio->bi_iter.bi_size; 7803 map_length = orig_bio->bi_iter.bi_size;
7355 ret = btrfs_map_block(root->fs_info, rw, 7804 ret = btrfs_map_block(root->fs_info, rw,
@@ -7393,11 +7842,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7393 struct btrfs_root *root = BTRFS_I(inode)->root; 7842 struct btrfs_root *root = BTRFS_I(inode)->root;
7394 struct btrfs_dio_private *dip; 7843 struct btrfs_dio_private *dip;
7395 struct bio *io_bio; 7844 struct bio *io_bio;
7845 struct btrfs_io_bio *btrfs_bio;
7396 int skip_sum; 7846 int skip_sum;
7397 int sum_len;
7398 int write = rw & REQ_WRITE; 7847 int write = rw & REQ_WRITE;
7399 int ret = 0; 7848 int ret = 0;
7400 u16 csum_size;
7401 7849
7402 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 7850 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7403 7851
@@ -7407,16 +7855,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7407 goto free_ordered; 7855 goto free_ordered;
7408 } 7856 }
7409 7857
7410 if (!skip_sum && !write) { 7858 dip = kzalloc(sizeof(*dip), GFP_NOFS);
7411 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7412 sum_len = dio_bio->bi_iter.bi_size >>
7413 inode->i_sb->s_blocksize_bits;
7414 sum_len *= csum_size;
7415 } else {
7416 sum_len = 0;
7417 }
7418
7419 dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7420 if (!dip) { 7859 if (!dip) {
7421 ret = -ENOMEM; 7860 ret = -ENOMEM;
7422 goto free_io_bio; 7861 goto free_io_bio;
@@ -7428,20 +7867,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7428 dip->bytes = dio_bio->bi_iter.bi_size; 7867 dip->bytes = dio_bio->bi_iter.bi_size;
7429 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; 7868 dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
7430 io_bio->bi_private = dip; 7869 io_bio->bi_private = dip;
7431 dip->errors = 0;
7432 dip->orig_bio = io_bio; 7870 dip->orig_bio = io_bio;
7433 dip->dio_bio = dio_bio; 7871 dip->dio_bio = dio_bio;
7434 atomic_set(&dip->pending_bios, 0); 7872 atomic_set(&dip->pending_bios, 0);
7873 btrfs_bio = btrfs_io_bio(io_bio);
7874 btrfs_bio->logical = file_offset;
7435 7875
7436 if (write) 7876 if (write) {
7437 io_bio->bi_end_io = btrfs_endio_direct_write; 7877 io_bio->bi_end_io = btrfs_endio_direct_write;
7438 else 7878 } else {
7439 io_bio->bi_end_io = btrfs_endio_direct_read; 7879 io_bio->bi_end_io = btrfs_endio_direct_read;
7880 dip->subio_endio = btrfs_subio_endio_read;
7881 }
7440 7882
7441 ret = btrfs_submit_direct_hook(rw, dip, skip_sum); 7883 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7442 if (!ret) 7884 if (!ret)
7443 return; 7885 return;
7444 7886
7887 if (btrfs_bio->end_io)
7888 btrfs_bio->end_io(btrfs_bio, ret);
7445free_io_bio: 7889free_io_bio:
7446 bio_put(io_bio); 7890 bio_put(io_bio);
7447 7891
@@ -7522,7 +7966,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7522 count = iov_iter_count(iter); 7966 count = iov_iter_count(iter);
7523 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, 7967 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7524 &BTRFS_I(inode)->runtime_flags)) 7968 &BTRFS_I(inode)->runtime_flags))
7525 filemap_fdatawrite_range(inode->i_mapping, offset, count); 7969 filemap_fdatawrite_range(inode->i_mapping, offset,
7970 offset + count - 1);
7526 7971
7527 if (rw & WRITE) { 7972 if (rw & WRITE) {
7528 /* 7973 /*
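The filemap_fdatawrite_range() fix above corrects an argument-semantics bug: the last parameter is an inclusive end offset, not a length, so passing bare count shrank (or emptied) the writeback window whenever offset was past count. A standalone illustration of the two windows:

#include <stdio.h>

int main(void)
{
        long long offset = 1 << 20;     /* 1 MiB into the file */
        long long count = 4096;         /* bytes being written */

        /* old call covered [offset, count] - empty here */
        printf("old window: [%lld, %lld]\n", offset, count);
        /* fixed call covers the full write */
        printf("new window: [%lld, %lld]\n", offset, offset + count - 1);
        return 0;
}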
@@ -7537,8 +7982,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7537 ret = btrfs_delalloc_reserve_space(inode, count); 7982 ret = btrfs_delalloc_reserve_space(inode, count);
7538 if (ret) 7983 if (ret)
7539 goto out; 7984 goto out;
7540 } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, 7985 } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7541 &BTRFS_I(inode)->runtime_flags))) { 7986 &BTRFS_I(inode)->runtime_flags)) {
7542 inode_dio_done(inode); 7987 inode_dio_done(inode);
7543 flags = DIO_LOCKING | DIO_SKIP_HOLES; 7988 flags = DIO_LOCKING | DIO_SKIP_HOLES;
7544 wakeup = false; 7989 wakeup = false;
@@ -7939,27 +8384,6 @@ static int btrfs_truncate(struct inode *inode)
7939 BUG_ON(ret); 8384 BUG_ON(ret);
7940 8385
7941 /* 8386 /*
7942 * setattr is responsible for setting the ordered_data_close flag,
7943 * but that is only tested during the last file release. That
7944 * could happen well after the next commit, leaving a great big
7945 * window where new writes may get lost if someone chooses to write
7946 * to this file after truncating to zero
7947 *
7948 * The inode doesn't have any dirty data here, and so if we commit
7949 * this is a noop. If someone immediately starts writing to the inode
7950 * it is very likely we'll catch some of their writes in this
7951 * transaction, and the commit will find this file on the ordered
7952 * data list with good things to send down.
7953 *
7954 * This is a best effort solution, there is still a window where
7955 * using truncate to replace the contents of the file will
7956 * end up with a zero length file after a crash.
7957 */
7958 if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7959 &BTRFS_I(inode)->runtime_flags))
7960 btrfs_add_ordered_operation(trans, root, inode);
7961
7962 /*
7963 * So if we truncate and then write and fsync we normally would just 8387 * So if we truncate and then write and fsync we normally would just
7964 * write the extents that changed, which is a problem if we need to 8388 * write the extents that changed, which is a problem if we need to
7965 * first truncate that entire inode. So set this flag so we write out 8389 * first truncate that entire inode. So set this flag so we write out
@@ -8050,6 +8474,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8050 8474
8051 set_nlink(inode, 1); 8475 set_nlink(inode, 1);
8052 btrfs_i_size_write(inode, 0); 8476 btrfs_i_size_write(inode, 0);
8477 unlock_new_inode(inode);
8053 8478
8054 err = btrfs_subvol_inherit_props(trans, new_root, parent_root); 8479 err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
8055 if (err) 8480 if (err)
@@ -8078,6 +8503,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8078 ei->last_sub_trans = 0; 8503 ei->last_sub_trans = 0;
8079 ei->logged_trans = 0; 8504 ei->logged_trans = 0;
8080 ei->delalloc_bytes = 0; 8505 ei->delalloc_bytes = 0;
8506 ei->defrag_bytes = 0;
8081 ei->disk_i_size = 0; 8507 ei->disk_i_size = 0;
8082 ei->flags = 0; 8508 ei->flags = 0;
8083 ei->csum_bytes = 0; 8509 ei->csum_bytes = 0;
@@ -8106,7 +8532,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8106 mutex_init(&ei->delalloc_mutex); 8532 mutex_init(&ei->delalloc_mutex);
8107 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 8533 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8108 INIT_LIST_HEAD(&ei->delalloc_inodes); 8534 INIT_LIST_HEAD(&ei->delalloc_inodes);
8109 INIT_LIST_HEAD(&ei->ordered_operations);
8110 RB_CLEAR_NODE(&ei->rb_node); 8535 RB_CLEAR_NODE(&ei->rb_node);
8111 8536
8112 return inode; 8537 return inode;
@@ -8137,6 +8562,7 @@ void btrfs_destroy_inode(struct inode *inode)
8137 WARN_ON(BTRFS_I(inode)->reserved_extents); 8562 WARN_ON(BTRFS_I(inode)->reserved_extents);
8138 WARN_ON(BTRFS_I(inode)->delalloc_bytes); 8563 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
8139 WARN_ON(BTRFS_I(inode)->csum_bytes); 8564 WARN_ON(BTRFS_I(inode)->csum_bytes);
8565 WARN_ON(BTRFS_I(inode)->defrag_bytes);
8140 8566
8141 /* 8567 /*
8142 * This can happen where we create an inode, but somebody else also 8568 * This can happen where we create an inode, but somebody else also
@@ -8146,17 +8572,6 @@ void btrfs_destroy_inode(struct inode *inode)
8146 if (!root) 8572 if (!root)
8147 goto free; 8573 goto free;
8148 8574
8149 /*
8150 * Make sure we're properly removed from the ordered operation
8151 * lists.
8152 */
8153 smp_mb();
8154 if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
8155 spin_lock(&root->fs_info->ordered_root_lock);
8156 list_del_init(&BTRFS_I(inode)->ordered_operations);
8157 spin_unlock(&root->fs_info->ordered_root_lock);
8158 }
8159
8160 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, 8575 if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
8161 &BTRFS_I(inode)->runtime_flags)) { 8576 &BTRFS_I(inode)->runtime_flags)) {
8162 btrfs_info(root->fs_info, "inode %llu still on the orphan list", 8577 btrfs_info(root->fs_info, "inode %llu still on the orphan list",
@@ -8338,12 +8753,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8338 ret = 0; 8753 ret = 0;
8339 8754
8340 /* 8755 /*
8341 * we're using rename to replace one file with another. 8756 * we're using rename to replace one file with another. Start IO on it
8342 * and the replacement file is large. Start IO on it now so 8757 * now so we don't add too much work to the end of the transaction
8343 * we don't add too much work to the end of the transaction
8344 */ 8758 */
8345 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && 8759 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8346 old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8347 filemap_flush(old_inode->i_mapping); 8760 filemap_flush(old_inode->i_mapping);
8348 8761
8349 /* close the racy window with snapshot create/destroy ioctl */ 8762 /* close the racy window with snapshot create/destroy ioctl */
@@ -8391,12 +8804,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
8391 */ 8804 */
8392 btrfs_pin_log_trans(root); 8805 btrfs_pin_log_trans(root);
8393 } 8806 }
8394 /*
8395 * make sure the inode gets flushed if it is replacing
8396 * something.
8397 */
8398 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8399 btrfs_add_ordered_operation(trans, root, old_inode);
8400 8807
8401 inode_inc_iversion(old_dir); 8808 inode_inc_iversion(old_dir);
8402 inode_inc_iversion(new_dir); 8809 inode_inc_iversion(new_dir);
@@ -8476,6 +8883,16 @@ out_notrans:
8476 return ret; 8883 return ret;
8477} 8884}
8478 8885
8886static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
8887 struct inode *new_dir, struct dentry *new_dentry,
8888 unsigned int flags)
8889{
8890 if (flags & ~RENAME_NOREPLACE)
8891 return -EINVAL;
8892
8893 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
8894}
8895
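btrfs_rename2() above accepts only RENAME_NOREPLACE and is wired into the .rename2 inode operation further down. A hedged userspace sketch that exercises this path (file names are illustrative; the raw syscall is used so it builds where glibc lacks a renameat2 wrapper, and it needs a v3.15+ kernel):

#define _GNU_SOURCE
#include <fcntl.h>              /* AT_FDCWD */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)
#endif

int main(void)
{
        /* Fails with EEXIST instead of clobbering "new"; any flag
         * other than RENAME_NOREPLACE gets -EINVAL from btrfs. */
        if (syscall(SYS_renameat2, AT_FDCWD, "old", AT_FDCWD, "new",
                    RENAME_NOREPLACE)) {
                perror("renameat2");
                return 1;
        }
        return 0;
}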
8479static void btrfs_run_delalloc_work(struct btrfs_work *work) 8896static void btrfs_run_delalloc_work(struct btrfs_work *work)
8480{ 8897{
8481 struct btrfs_delalloc_work *delalloc_work; 8898 struct btrfs_delalloc_work *delalloc_work;
@@ -8514,7 +8931,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8514 work->inode = inode; 8931 work->inode = inode;
8515 work->wait = wait; 8932 work->wait = wait;
8516 work->delay_iput = delay_iput; 8933 work->delay_iput = delay_iput;
8517 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); 8934 WARN_ON_ONCE(!inode);
8935 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
8936 btrfs_run_delalloc_work, NULL, NULL);
8518 8937
8519 return work; 8938 return work;
8520} 8939}
@@ -8559,7 +8978,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
8559 spin_unlock(&root->delalloc_lock); 8978 spin_unlock(&root->delalloc_lock);
8560 8979
8561 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); 8980 work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8562 if (unlikely(!work)) { 8981 if (!work) {
8563 if (delay_iput) 8982 if (delay_iput)
8564 btrfs_add_delayed_iput(inode); 8983 btrfs_add_delayed_iput(inode);
8565 else 8984 else
@@ -8718,12 +9137,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8718 goto out_unlock; 9137 goto out_unlock;
8719 } 9138 }
8720 9139
8721 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8722 if (err) {
8723 drop_inode = 1;
8724 goto out_unlock;
8725 }
8726
8727 /* 9140 /*
8728 * If the active LSM wants to access the inode during 9141 * If the active LSM wants to access the inode during
8729 * d_instantiate it needs these. Smack checks to see 9142 * d_instantiate it needs these. Smack checks to see
@@ -8732,34 +9145,32 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8732 */ 9145 */
8733 inode->i_fop = &btrfs_file_operations; 9146 inode->i_fop = &btrfs_file_operations;
8734 inode->i_op = &btrfs_file_inode_operations; 9147 inode->i_op = &btrfs_file_inode_operations;
9148 inode->i_mapping->a_ops = &btrfs_aops;
9149 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9150 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9151
9152 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
9153 if (err)
9154 goto out_unlock_inode;
8735 9155
8736 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); 9156 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8737 if (err) 9157 if (err)
8738 drop_inode = 1; 9158 goto out_unlock_inode;
8739 else {
8740 inode->i_mapping->a_ops = &btrfs_aops;
8741 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8742 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8743 }
8744 if (drop_inode)
8745 goto out_unlock;
8746 9159
8747 path = btrfs_alloc_path(); 9160 path = btrfs_alloc_path();
8748 if (!path) { 9161 if (!path) {
8749 err = -ENOMEM; 9162 err = -ENOMEM;
8750 drop_inode = 1; 9163 goto out_unlock_inode;
8751 goto out_unlock;
8752 } 9164 }
8753 key.objectid = btrfs_ino(inode); 9165 key.objectid = btrfs_ino(inode);
8754 key.offset = 0; 9166 key.offset = 0;
8755 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 9167 key.type = BTRFS_EXTENT_DATA_KEY;
8756 datasize = btrfs_file_extent_calc_inline_size(name_len); 9168 datasize = btrfs_file_extent_calc_inline_size(name_len);
8757 err = btrfs_insert_empty_item(trans, root, path, &key, 9169 err = btrfs_insert_empty_item(trans, root, path, &key,
8758 datasize); 9170 datasize);
8759 if (err) { 9171 if (err) {
8760 drop_inode = 1;
8761 btrfs_free_path(path); 9172 btrfs_free_path(path);
8762 goto out_unlock; 9173 goto out_unlock_inode;
8763 } 9174 }
8764 leaf = path->nodes[0]; 9175 leaf = path->nodes[0];
8765 ei = btrfs_item_ptr(leaf, path->slots[0], 9176 ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8783,12 +9194,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8783 inode_set_bytes(inode, name_len); 9194 inode_set_bytes(inode, name_len);
8784 btrfs_i_size_write(inode, name_len); 9195 btrfs_i_size_write(inode, name_len);
8785 err = btrfs_update_inode(trans, root, inode); 9196 err = btrfs_update_inode(trans, root, inode);
8786 if (err) 9197 if (err) {
8787 drop_inode = 1; 9198 drop_inode = 1;
9199 goto out_unlock_inode;
9200 }
9201
9202 unlock_new_inode(inode);
9203 d_instantiate(dentry, inode);
8788 9204
8789out_unlock: 9205out_unlock:
8790 if (!err)
8791 d_instantiate(dentry, inode);
8792 btrfs_end_transaction(trans, root); 9206 btrfs_end_transaction(trans, root);
8793 if (drop_inode) { 9207 if (drop_inode) {
8794 inode_dec_link_count(inode); 9208 inode_dec_link_count(inode);
@@ -8796,6 +9210,11 @@ out_unlock:
8796 } 9210 }
8797 btrfs_btree_balance_dirty(root); 9211 btrfs_btree_balance_dirty(root);
8798 return err; 9212 return err;
9213
9214out_unlock_inode:
9215 drop_inode = 1;
9216 unlock_new_inode(inode);
9217 goto out_unlock;
8799} 9218}
8800 9219
8801static int __btrfs_prealloc_file_range(struct inode *inode, int mode, 9220static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -8979,14 +9398,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8979 goto out; 9398 goto out;
8980 } 9399 }
8981 9400
8982 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
8983 if (ret)
8984 goto out;
8985
8986 ret = btrfs_update_inode(trans, root, inode);
8987 if (ret)
8988 goto out;
8989
8990 inode->i_fop = &btrfs_file_operations; 9401 inode->i_fop = &btrfs_file_operations;
8991 inode->i_op = &btrfs_file_inode_operations; 9402 inode->i_op = &btrfs_file_inode_operations;
8992 9403
@@ -8994,10 +9405,26 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
8994 inode->i_mapping->backing_dev_info = &root->fs_info->bdi; 9405 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8995 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9406 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8996 9407
9408 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
9409 if (ret)
9410 goto out_inode;
9411
9412 ret = btrfs_update_inode(trans, root, inode);
9413 if (ret)
9414 goto out_inode;
8997 ret = btrfs_orphan_add(trans, inode); 9415 ret = btrfs_orphan_add(trans, inode);
8998 if (ret) 9416 if (ret)
8999 goto out; 9417 goto out_inode;
9000 9418
9419 /*
9420 * We set the number of links to 0 in btrfs_new_inode(), and here we set
9421 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
9422 * through:
9423 *
9424 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9425 */
9426 set_nlink(inode, 1);
9427 unlock_new_inode(inode);
9001 d_tmpfile(dentry, inode); 9428 d_tmpfile(dentry, inode);
9002 mark_inode_dirty(inode); 9429 mark_inode_dirty(inode);
9003 9430
@@ -9007,8 +9434,12 @@ out:
9007 iput(inode); 9434 iput(inode);
9008 btrfs_balance_delayed_items(root); 9435 btrfs_balance_delayed_items(root);
9009 btrfs_btree_balance_dirty(root); 9436 btrfs_btree_balance_dirty(root);
9010
9011 return ret; 9437 return ret;
9438
9439out_inode:
9440 unlock_new_inode(inode);
9441 goto out;
9442
9012} 9443}
9013 9444
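For context on the btrfs_tmpfile() changes above, the userspace entry point is an O_TMPFILE open, which creates the unnamed inode that path services. A hedged sketch (the mount point is illustrative; needs _GNU_SOURCE and a kernel with O_TMPFILE support):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/btrfs", O_TMPFILE | O_RDWR, 0600);

        if (fd < 0) {
                perror("open(O_TMPFILE)");
                return 1;
        }
        /* d_tmpfile() decrements the link count of the unnamed file;
         * the fix above sets nlink to 1 first so drop_nlink() has
         * something to drop and does not warn. */
        close(fd);
        return 0;
}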
9014static const struct inode_operations btrfs_dir_inode_operations = { 9445static const struct inode_operations btrfs_dir_inode_operations = {
@@ -9019,7 +9450,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
9019 .link = btrfs_link, 9450 .link = btrfs_link,
9020 .mkdir = btrfs_mkdir, 9451 .mkdir = btrfs_mkdir,
9021 .rmdir = btrfs_rmdir, 9452 .rmdir = btrfs_rmdir,
9022 .rename = btrfs_rename, 9453 .rename2 = btrfs_rename2,
9023 .symlink = btrfs_symlink, 9454 .symlink = btrfs_symlink,
9024 .setattr = btrfs_setattr, 9455 .setattr = btrfs_setattr,
9025 .mknod = btrfs_mknod, 9456 .mknod = btrfs_mknod,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 47aceb494d1d..8d2b76e29d3b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
332 goto out_drop; 332 goto out_drop;
333 333
334 } else { 334 } else {
335 ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
336 if (ret && ret != -ENODATA)
337 goto out_drop;
335 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); 338 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
336 } 339 }
337 340
@@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir,
477 if (ret) 480 if (ret)
478 goto fail; 481 goto fail;
479 482
480 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 483 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
481 0, objectid, NULL, 0, 0, 0);
482 if (IS_ERR(leaf)) { 484 if (IS_ERR(leaf)) {
483 ret = PTR_ERR(leaf); 485 ret = PTR_ERR(leaf);
484 goto fail; 486 goto fail;
@@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir,
503 btrfs_set_stack_inode_generation(inode_item, 1); 505 btrfs_set_stack_inode_generation(inode_item, 1);
504 btrfs_set_stack_inode_size(inode_item, 3); 506 btrfs_set_stack_inode_size(inode_item, 3);
505 btrfs_set_stack_inode_nlink(inode_item, 1); 507 btrfs_set_stack_inode_nlink(inode_item, 1);
506 btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); 508 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
507 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 509 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
508 510
509 btrfs_set_root_flags(&root_item, 0); 511 btrfs_set_root_flags(&root_item, 0);
@@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir,
535 537
536 key.objectid = objectid; 538 key.objectid = objectid;
537 key.offset = 0; 539 key.offset = 0;
538 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 540 key.type = BTRFS_ROOT_ITEM_KEY;
539 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 541 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
540 &root_item); 542 &root_item);
541 if (ret) 543 if (ret)
@@ -915,7 +917,7 @@ out_unlock:
915 * file you want to defrag, we return 0 to let you know to skip this 917 * file you want to defrag, we return 0 to let you know to skip this
916 * part of the file 918 * part of the file
917 */ 919 */
918static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) 920static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
919{ 921{
920 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 922 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
921 struct extent_map *em = NULL; 923 struct extent_map *em = NULL;
@@ -950,7 +952,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
950 */ 952 */
951static int find_new_extents(struct btrfs_root *root, 953static int find_new_extents(struct btrfs_root *root,
952 struct inode *inode, u64 newer_than, 954 struct inode *inode, u64 newer_than,
953 u64 *off, int thresh) 955 u64 *off, u32 thresh)
954{ 956{
955 struct btrfs_path *path; 957 struct btrfs_path *path;
956 struct btrfs_key min_key; 958 struct btrfs_key min_key;
@@ -969,12 +971,9 @@ static int find_new_extents(struct btrfs_root *root,
969 min_key.offset = *off; 971 min_key.offset = *off;
970 972
971 while (1) { 973 while (1) {
972 path->keep_locks = 1;
973 ret = btrfs_search_forward(root, &min_key, path, newer_than); 974 ret = btrfs_search_forward(root, &min_key, path, newer_than);
974 if (ret != 0) 975 if (ret != 0)
975 goto none; 976 goto none;
976 path->keep_locks = 0;
977 btrfs_unlock_up_safe(path, 1);
978process_slot: 977process_slot:
979 if (min_key.objectid != ino) 978 if (min_key.objectid != ino)
980 goto none; 979 goto none;
@@ -1052,15 +1051,17 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
1052 return false; 1051 return false;
1053 1052
1054 next = defrag_lookup_extent(inode, em->start + em->len); 1053 next = defrag_lookup_extent(inode, em->start + em->len);
1055 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || 1054 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
1056 (em->block_start + em->block_len == next->block_start)) 1055 ret = false;
1056 else if ((em->block_start + em->block_len == next->block_start) &&
1057 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
1057 ret = false; 1058 ret = false;
1058 1059
1059 free_extent_map(next); 1060 free_extent_map(next);
1060 return ret; 1061 return ret;
1061} 1062}
1062 1063
1063static int should_defrag_range(struct inode *inode, u64 start, int thresh, 1064static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
1064 u64 *last_len, u64 *skip, u64 *defrag_end, 1065 u64 *last_len, u64 *skip, u64 *defrag_end,
1065 int compress) 1066 int compress)
1066{ 1067{
@@ -1088,7 +1089,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
1088 } 1089 }
1089 1090
1090 next_mergeable = defrag_check_next_extent(inode, em); 1091 next_mergeable = defrag_check_next_extent(inode, em);
1091
1092 /* 1092 /*
1093 * we hit a real extent, if it is big or the next extent is not a 1093 * we hit a real extent, if it is big or the next extent is not a
1094 * real extent, don't bother defragging it 1094 * real extent, don't bother defragging it
@@ -1291,7 +1291,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1291 int ret; 1291 int ret;
1292 int defrag_count = 0; 1292 int defrag_count = 0;
1293 int compress_type = BTRFS_COMPRESS_ZLIB; 1293 int compress_type = BTRFS_COMPRESS_ZLIB;
1294 int extent_thresh = range->extent_thresh; 1294 u32 extent_thresh = range->extent_thresh;
1295 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 1295 unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
1296 unsigned long cluster = max_cluster; 1296 unsigned long cluster = max_cluster;
1297 u64 new_align = ~((u64)128 * 1024 - 1); 1297 u64 new_align = ~((u64)128 * 1024 - 1);
@@ -1367,8 +1367,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1367 inode->i_mapping->writeback_index = i; 1367 inode->i_mapping->writeback_index = i;
1368 1368
1369 while (i <= last_index && defrag_count < max_to_defrag && 1369 while (i <= last_index && defrag_count < max_to_defrag &&
1370 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 1370 (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) {
1371 PAGE_CACHE_SHIFT)) {
1372 /* 1371 /*
1373 * make sure we stop running if someone unmounts 1372 * make sure we stop running if someone unmounts
1374 * the FS 1373 * the FS
@@ -1391,7 +1390,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1391 * the should_defrag function tells us how much to skip 1390 * the should_defrag function tells us how much to skip
1392 * bump our counter by the suggested amount 1391 * bump our counter by the suggested amount
1393 */ 1392 */
1394 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1393 next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE);
1395 i = max(i + 1, next); 1394 i = max(i + 1, next);
1396 continue; 1395 continue;
1397 } 1396 }
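The DIV_ROUND_UP() conversions in this hunk are behavior-preserving: the kernel macro (include/linux/kernel.h) is the same add-then-divide idiom the open-coded shifts were performing.

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* With 4 KiB pages, DIV_ROUND_UP(skip, 4096) equals
 * (skip + 4096 - 1) >> 12; e.g. skip == 8193 gives 3 either way. */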
@@ -1586,7 +1585,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1586 goto out_free; 1585 goto out_free;
1587 } 1586 }
1588 1587
1589 old_size = device->total_bytes; 1588 old_size = btrfs_device_get_total_bytes(device);
1590 1589
1591 if (mod < 0) { 1590 if (mod < 0) {
1592 if (new_size > old_size) { 1591 if (new_size > old_size) {
@@ -1735,7 +1734,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1735 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | 1734 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
1736 BTRFS_SUBVOL_QGROUP_INHERIT)) { 1735 BTRFS_SUBVOL_QGROUP_INHERIT)) {
1737 ret = -EOPNOTSUPP; 1736 ret = -EOPNOTSUPP;
1738 goto out; 1737 goto free_args;
1739 } 1738 }
1740 1739
1741 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) 1740 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1745,27 +1744,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1745 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { 1744 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1746 if (vol_args->size > PAGE_CACHE_SIZE) { 1745 if (vol_args->size > PAGE_CACHE_SIZE) {
1747 ret = -EINVAL; 1746 ret = -EINVAL;
1748 goto out; 1747 goto free_args;
1749 } 1748 }
1750 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); 1749 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1751 if (IS_ERR(inherit)) { 1750 if (IS_ERR(inherit)) {
1752 ret = PTR_ERR(inherit); 1751 ret = PTR_ERR(inherit);
1753 goto out; 1752 goto free_args;
1754 } 1753 }
1755 } 1754 }
1756 1755
1757 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, 1756 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1758 vol_args->fd, subvol, ptr, 1757 vol_args->fd, subvol, ptr,
1759 readonly, inherit); 1758 readonly, inherit);
1759 if (ret)
1760 goto free_inherit;
1760 1761
1761 if (ret == 0 && ptr && 1762 if (ptr && copy_to_user(arg +
1762 copy_to_user(arg + 1763 offsetof(struct btrfs_ioctl_vol_args_v2,
1763 offsetof(struct btrfs_ioctl_vol_args_v2, 1764 transid),
1764 transid), ptr, sizeof(*ptr))) 1765 ptr, sizeof(*ptr)))
1765 ret = -EFAULT; 1766 ret = -EFAULT;
1766out: 1767
1767 kfree(vol_args); 1768free_inherit:
1768 kfree(inherit); 1769 kfree(inherit);
1770free_args:
1771 kfree(vol_args);
1769 return ret; 1772 return ret;
1770} 1773}
1771 1774
@@ -2117,8 +2120,6 @@ static noinline int search_ioctl(struct inode *inode,
2117 key.type = sk->min_type; 2120 key.type = sk->min_type;
2118 key.offset = sk->min_offset; 2121 key.offset = sk->min_offset;
2119 2122
2120 path->keep_locks = 1;
2121
2122 while (1) { 2123 while (1) {
2123 ret = btrfs_search_forward(root, &key, path, sk->min_transid); 2124 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
2124 if (ret != 0) { 2125 if (ret != 0) {
@@ -2451,9 +2452,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2451 goto out_dput; 2452 goto out_dput;
2452 } 2453 }
2453 2454
2454 err = d_invalidate(dentry); 2455 d_invalidate(dentry);
2455 if (err)
2456 goto out_unlock;
2457 2456
2458 down_write(&root->fs_info->subvol_sem); 2457 down_write(&root->fs_info->subvol_sem);
2459 2458
@@ -2538,7 +2537,6 @@ out_release:
2538 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); 2537 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
2539out_up_write: 2538out_up_write:
2540 up_write(&root->fs_info->subvol_sem); 2539 up_write(&root->fs_info->subvol_sem);
2541out_unlock:
2542 if (err) { 2540 if (err) {
2543 spin_lock(&dest->root_item_lock); 2541 spin_lock(&dest->root_item_lock);
2544 root_flags = btrfs_root_flags(&dest->root_item); 2542 root_flags = btrfs_root_flags(&dest->root_item);
@@ -2554,9 +2552,9 @@ out_unlock:
2554 ASSERT(dest->send_in_progress == 0); 2552 ASSERT(dest->send_in_progress == 0);
2555 2553
2556 /* the last ref */ 2554 /* the last ref */
2557 if (dest->cache_inode) { 2555 if (dest->ino_cache_inode) {
2558 iput(dest->cache_inode); 2556 iput(dest->ino_cache_inode);
2559 dest->cache_inode = NULL; 2557 dest->ino_cache_inode = NULL;
2560 } 2558 }
2561 } 2559 }
2562out_dput: 2560out_dput:
@@ -2662,6 +2660,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
2662 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2660 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2663 ret = btrfs_init_new_device(root, vol_args->name); 2661 ret = btrfs_init_new_device(root, vol_args->name);
2664 2662
2663 if (!ret)
2664 btrfs_info(root->fs_info, "disk added %s",vol_args->name);
2665
2665 kfree(vol_args); 2666 kfree(vol_args);
2666out: 2667out:
2667 mutex_unlock(&root->fs_info->volume_mutex); 2668 mutex_unlock(&root->fs_info->volume_mutex);
@@ -2685,7 +2686,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2685 vol_args = memdup_user(arg, sizeof(*vol_args)); 2686 vol_args = memdup_user(arg, sizeof(*vol_args));
2686 if (IS_ERR(vol_args)) { 2687 if (IS_ERR(vol_args)) {
2687 ret = PTR_ERR(vol_args); 2688 ret = PTR_ERR(vol_args);
2688 goto out; 2689 goto err_drop;
2689 } 2690 }
2690 2691
2691 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2692 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2701,8 +2702,12 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2701 mutex_unlock(&root->fs_info->volume_mutex); 2702 mutex_unlock(&root->fs_info->volume_mutex);
2702 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2703 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2703 2704
2705 if (!ret)
2706 btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
2707
2704out: 2708out:
2705 kfree(vol_args); 2709 kfree(vol_args);
2710err_drop:
2706 mnt_drop_write_file(file); 2711 mnt_drop_write_file(file);
2707 return ret; 2712 return ret;
2708} 2713}
@@ -2764,8 +2769,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2764 } 2769 }
2765 2770
2766 di_args->devid = dev->devid; 2771 di_args->devid = dev->devid;
2767 di_args->bytes_used = dev->bytes_used; 2772 di_args->bytes_used = btrfs_device_get_bytes_used(dev);
2768 di_args->total_bytes = dev->total_bytes; 2773 di_args->total_bytes = btrfs_device_get_total_bytes(dev);
2769 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); 2774 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2770 if (dev->name) { 2775 if (dev->name) {
2771 struct rcu_string *name; 2776 struct rcu_string *name;
@@ -3191,7 +3196,7 @@ static void clone_update_extent_map(struct inode *inode,
3191 em->start + em->len - 1, 0); 3196 em->start + em->len - 1, 0);
3192 } 3197 }
3193 3198
3194 if (unlikely(ret)) 3199 if (ret)
3195 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 3200 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3196 &BTRFS_I(inode)->runtime_flags); 3201 &BTRFS_I(inode)->runtime_flags);
3197} 3202}
@@ -3226,7 +3231,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3226 u64 last_dest_end = destoff; 3231 u64 last_dest_end = destoff;
3227 3232
3228 ret = -ENOMEM; 3233 ret = -ENOMEM;
3229 buf = vmalloc(btrfs_level_size(root, 0)); 3234 buf = vmalloc(root->nodesize);
3230 if (!buf) 3235 if (!buf)
3231 return ret; 3236 return ret;
3232 3237
@@ -3279,11 +3284,11 @@ process_slot:
3279 slot = path->slots[0]; 3284 slot = path->slots[0];
3280 3285
3281 btrfs_item_key_to_cpu(leaf, &key, slot); 3286 btrfs_item_key_to_cpu(leaf, &key, slot);
3282 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 3287 if (key.type > BTRFS_EXTENT_DATA_KEY ||
3283 key.objectid != btrfs_ino(src)) 3288 key.objectid != btrfs_ino(src))
3284 break; 3289 break;
3285 3290
3286 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 3291 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3287 struct btrfs_file_extent_item *extent; 3292 struct btrfs_file_extent_item *extent;
3288 int type; 3293 int type;
3289 u32 size; 3294 u32 size;
@@ -3527,7 +3532,8 @@ process_slot:
3527 btrfs_mark_buffer_dirty(leaf); 3532 btrfs_mark_buffer_dirty(leaf);
3528 btrfs_release_path(path); 3533 btrfs_release_path(path);
3529 3534
3530 last_dest_end = new_key.offset + datal; 3535 last_dest_end = ALIGN(new_key.offset + datal,
3536 root->sectorsize);
3531 ret = clone_finish_inode_update(trans, inode, 3537 ret = clone_finish_inode_update(trans, inode,
3532 last_dest_end, 3538 last_dest_end,
3533 destoff, olen); 3539 destoff, olen);
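The ALIGN() change above rounds last_dest_end up to the next sector boundary so the following clone iteration starts sector-aligned. For power-of-two alignments the kernel macro reduces to the mask trick below (simplified sketch; the kernel's version is type-preserving):

#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* e.g. with root->sectorsize == 4096:
 * ALIGN_UP(6144, 4096) == 8192 and ALIGN_UP(8192, 4096) == 8192 */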
@@ -5309,6 +5315,12 @@ long btrfs_ioctl(struct file *file, unsigned int
5309 if (ret) 5315 if (ret)
5310 return ret; 5316 return ret;
5311 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); 5317 ret = btrfs_sync_fs(file->f_dentry->d_sb, 1);
5318 /*
5319 * The transaction thread may want to do more work,
5320 * namely it pokes the cleaner kthread that will start
5321 * processing uncleaned subvols.
5322 */
5323 wake_up_process(root->fs_info->transaction_kthread);
5312 return ret; 5324 return ret;
5313 } 5325 }
5314 case BTRFS_IOC_START_SYNC: 5326 case BTRFS_IOC_START_SYNC:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index dfad8514f0da..78285f30909e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
266 char *data_in; 266 char *data_in;
267 unsigned long page_in_index = 0; 267 unsigned long page_in_index = 0;
268 unsigned long page_out_index = 0; 268 unsigned long page_out_index = 0;
269 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 269 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
270 PAGE_CACHE_SIZE;
271 unsigned long buf_start; 270 unsigned long buf_start;
272 unsigned long buf_offset = 0; 271 unsigned long buf_offset = 0;
273 unsigned long bytes; 272 unsigned long bytes;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 7187b14faa6c..ac734ec4cc20 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -571,18 +571,6 @@ void btrfs_remove_ordered_extent(struct inode *inode,
571 571
572 trace_btrfs_ordered_extent_remove(inode, entry); 572 trace_btrfs_ordered_extent_remove(inode, entry);
573 573
574 /*
575 * we have no more ordered extents for this inode and
576 * no dirty pages. We can safely remove it from the
577 * list of ordered extents
578 */
579 if (RB_EMPTY_ROOT(&tree->tree) &&
580 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
581 spin_lock(&root->fs_info->ordered_root_lock);
582 list_del_init(&BTRFS_I(inode)->ordered_operations);
583 spin_unlock(&root->fs_info->ordered_root_lock);
584 }
585
586 if (!root->nr_ordered_extents) { 574 if (!root->nr_ordered_extents) {
587 spin_lock(&root->fs_info->ordered_root_lock); 575 spin_lock(&root->fs_info->ordered_root_lock);
588 BUG_ON(list_empty(&root->ordered_root)); 576 BUG_ON(list_empty(&root->ordered_root));
@@ -627,6 +615,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
627 spin_unlock(&root->ordered_extent_lock); 615 spin_unlock(&root->ordered_extent_lock);
628 616
629 btrfs_init_work(&ordered->flush_work, 617 btrfs_init_work(&ordered->flush_work,
618 btrfs_flush_delalloc_helper,
630 btrfs_run_ordered_extent_work, NULL, NULL); 619 btrfs_run_ordered_extent_work, NULL, NULL);
631 list_add_tail(&ordered->work_list, &works); 620 list_add_tail(&ordered->work_list, &works);
632 btrfs_queue_work(root->fs_info->flush_workers, 621 btrfs_queue_work(root->fs_info->flush_workers,
@@ -687,81 +676,6 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
687} 676}
688 677
689/* 678/*
690 * this is used during transaction commit to write all the inodes
691 * added to the ordered operation list. These files must be fully on
692 * disk before the transaction commits.
693 *
694 * we have two modes here, one is to just start the IO via filemap_flush
695 * and the other is to wait for all the io. When we wait, we have an
696 * extra check to make sure the ordered operation list really is empty
697 * before we return
698 */
699int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
700 struct btrfs_root *root, int wait)
701{
702 struct btrfs_inode *btrfs_inode;
703 struct inode *inode;
704 struct btrfs_transaction *cur_trans = trans->transaction;
705 struct list_head splice;
706 struct list_head works;
707 struct btrfs_delalloc_work *work, *next;
708 int ret = 0;
709
710 INIT_LIST_HEAD(&splice);
711 INIT_LIST_HEAD(&works);
712
713 mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
714 spin_lock(&root->fs_info->ordered_root_lock);
715 list_splice_init(&cur_trans->ordered_operations, &splice);
716 while (!list_empty(&splice)) {
717 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
718 ordered_operations);
719 inode = &btrfs_inode->vfs_inode;
720
721 list_del_init(&btrfs_inode->ordered_operations);
722
723 /*
724 * the inode may be getting freed (in sys_unlink path).
725 */
726 inode = igrab(inode);
727 if (!inode)
728 continue;
729
730 if (!wait)
731 list_add_tail(&BTRFS_I(inode)->ordered_operations,
732 &cur_trans->ordered_operations);
733 spin_unlock(&root->fs_info->ordered_root_lock);
734
735 work = btrfs_alloc_delalloc_work(inode, wait, 1);
736 if (!work) {
737 spin_lock(&root->fs_info->ordered_root_lock);
738 if (list_empty(&BTRFS_I(inode)->ordered_operations))
739 list_add_tail(&btrfs_inode->ordered_operations,
740 &splice);
741 list_splice_tail(&splice,
742 &cur_trans->ordered_operations);
743 spin_unlock(&root->fs_info->ordered_root_lock);
744 ret = -ENOMEM;
745 goto out;
746 }
747 list_add_tail(&work->list, &works);
748 btrfs_queue_work(root->fs_info->flush_workers,
749 &work->work);
750
751 cond_resched();
752 spin_lock(&root->fs_info->ordered_root_lock);
753 }
754 spin_unlock(&root->fs_info->ordered_root_lock);
755out:
756 list_for_each_entry_safe(work, next, &works, list) {
757 list_del_init(&work->list);
758 btrfs_wait_and_free_delalloc_work(work);
759 }
760 mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
761 return ret;
762}
763
764/*
765 * Used to start IO or wait for a given ordered extent to finish. 679 * Used to start IO or wait for a given ordered extent to finish.
766 * 680 *
767 * If wait is one, this effectively waits on page writeback for all the pages 681 * If wait is one, this effectively waits on page writeback for all the pages
@@ -1120,42 +1034,6 @@ out:
1120 return index; 1034 return index;
1121} 1035}
1122 1036
1123
1124/*
1125 * add a given inode to the list of inodes that must be fully on
1126 * disk before a transaction commit finishes.
1127 *
1128 * This basically gives us the ext3 style data=ordered mode, and it is mostly
1129 * used to make sure renamed files are fully on disk.
1130 *
1131 * It is a noop if the inode is already fully on disk.
1132 *
1133 * If trans is not null, we'll do a friendly check for a transaction that
1134 * is already flushing things and force the IO down ourselves.
1135 */
1136void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
1137 struct btrfs_root *root, struct inode *inode)
1138{
1139 struct btrfs_transaction *cur_trans = trans->transaction;
1140 u64 last_mod;
1141
1142 last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
1143
1144 /*
1145 * if this file hasn't been changed since the last transaction
1146 * commit, we can safely return without doing anything
1147 */
1148 if (last_mod <= root->fs_info->last_trans_committed)
1149 return;
1150
1151 spin_lock(&root->fs_info->ordered_root_lock);
1152 if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
1153 list_add_tail(&BTRFS_I(inode)->ordered_operations,
1154 &cur_trans->ordered_operations);
1155 }
1156 spin_unlock(&root->fs_info->ordered_root_lock);
1157}
1158
1159int __init ordered_data_init(void) 1037int __init ordered_data_init(void)
1160{ 1038{
1161 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent", 1039 btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 246897058efb..d81a274d621e 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -190,11 +190,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
190 struct btrfs_ordered_extent *ordered); 190 struct btrfs_ordered_extent *ordered);
191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 191int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
192 u32 *sum, int len); 192 u32 *sum, int len);
193int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
194 struct btrfs_root *root, int wait);
195void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
196 struct btrfs_root *root,
197 struct inode *inode);
198int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 193int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr);
199void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 194void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr);
200void btrfs_get_logged_extents(struct inode *inode, 195void btrfs_get_logged_extents(struct inode *inode,
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 65793edb38ca..47767d5b8f0b 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
27 int ret = 0; 27 int ret = 0;
28 28
29 key.objectid = BTRFS_ORPHAN_OBJECTID; 29 key.objectid = BTRFS_ORPHAN_OBJECTID;
30 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 30 key.type = BTRFS_ORPHAN_ITEM_KEY;
31 key.offset = offset; 31 key.offset = offset;
32 32
33 path = btrfs_alloc_path(); 33 path = btrfs_alloc_path();
@@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
48 int ret = 0; 48 int ret = 0;
49 49
50 key.objectid = BTRFS_ORPHAN_OBJECTID; 50 key.objectid = BTRFS_ORPHAN_OBJECTID;
51 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 51 key.type = BTRFS_ORPHAN_ITEM_KEY;
52 key.offset = offset; 52 key.offset = offset;
53 53
54 path = btrfs_alloc_path(); 54 path = btrfs_alloc_path();
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9626b4ad3b9a..647ab12fdf5d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
195 for (i = 0 ; i < nr ; i++) { 195 for (i = 0 ; i < nr ; i++) {
196 item = btrfs_item_nr(i); 196 item = btrfs_item_nr(i);
197 btrfs_item_key_to_cpu(l, &key, i); 197 btrfs_item_key_to_cpu(l, &key, i);
198 type = btrfs_key_type(&key); 198 type = key.type;
199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " 199 printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
200 "itemsize %d\n", 200 "itemsize %d\n",
201 i, key.objectid, type, key.offset, 201 i, key.objectid, type, key.offset,
@@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
336 for (i = 0; i < nr; i++) { 336 for (i = 0; i < nr; i++) {
337 struct extent_buffer *next = read_tree_block(root, 337 struct extent_buffer *next = read_tree_block(root,
338 btrfs_node_blockptr(c, i), 338 btrfs_node_blockptr(c, i),
339 btrfs_level_size(root, level - 1),
340 btrfs_node_ptr_generation(c, i)); 339 btrfs_node_ptr_generation(c, i));
341 if (btrfs_is_leaf(next) && 340 if (btrfs_is_leaf(next) &&
342 level != 1) 341 level != 1)
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 98cb6b2630f9..48b60dbf807f 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
539 struct extent_buffer *leaf; 539 struct extent_buffer *leaf;
540 struct btrfs_key key; 540 struct btrfs_key key;
541 541
542#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 542 if (btrfs_test_is_dummy_root(quota_root))
543 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &quota_root->state)))
544 return 0; 543 return 0;
545#endif 544
546 path = btrfs_alloc_path(); 545 path = btrfs_alloc_path();
547 if (!path) 546 if (!path)
548 return -ENOMEM; 547 return -ENOMEM;
@@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
551 key.type = BTRFS_QGROUP_INFO_KEY; 550 key.type = BTRFS_QGROUP_INFO_KEY;
552 key.offset = qgroupid; 551 key.offset = qgroupid;
553 552
553 /*
554 * Avoid a transaction abort by catching -EEXIST here. In that
555 * case, we proceed by re-initializing the existing structure
556 * on disk.
557 */
558
554 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 559 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
555 sizeof(*qgroup_info)); 560 sizeof(*qgroup_info));
556 if (ret) 561 if (ret && ret != -EEXIST)
557 goto out; 562 goto out;
558 563
559 leaf = path->nodes[0]; 564 leaf = path->nodes[0];
@@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
572 key.type = BTRFS_QGROUP_LIMIT_KEY; 577 key.type = BTRFS_QGROUP_LIMIT_KEY;
573 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 578 ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
574 sizeof(*qgroup_limit)); 579 sizeof(*qgroup_limit));
575 if (ret) 580 if (ret && ret != -EEXIST)
576 goto out; 581 goto out;
577 582
578 leaf = path->nodes[0]; 583 leaf = path->nodes[0];
@@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
692 int ret; 697 int ret;
693 int slot; 698 int slot;
694 699
695#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 700 if (btrfs_test_is_dummy_root(root))
696 if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
697 return 0; 701 return 0;
698#endif 702
699 key.objectid = 0; 703 key.objectid = 0;
700 key.type = BTRFS_QGROUP_INFO_KEY; 704 key.type = BTRFS_QGROUP_INFO_KEY;
701 key.offset = qgroup->qgroupid; 705 key.offset = qgroup->qgroupid;
@@ -1201,6 +1205,50 @@ out:
1201 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1205 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1202 return ret; 1206 return ret;
1203} 1207}
1208
1209static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
1210 struct btrfs_qgroup_operation *oper2)
1211{
1212 /*
1213 * Ignore seq and type here, we're looking for any operation
1214 * at all related to this extent on that root.
1215 */
1216 if (oper1->bytenr < oper2->bytenr)
1217 return -1;
1218 if (oper1->bytenr > oper2->bytenr)
1219 return 1;
1220 if (oper1->ref_root < oper2->ref_root)
1221 return -1;
1222 if (oper1->ref_root > oper2->ref_root)
1223 return 1;
1224 return 0;
1225}
1226
1227static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
1228 struct btrfs_qgroup_operation *oper)
1229{
1230 struct rb_node *n;
1231 struct btrfs_qgroup_operation *cur;
1232 int cmp;
1233
1234 spin_lock(&fs_info->qgroup_op_lock);
1235 n = fs_info->qgroup_op_tree.rb_node;
1236 while (n) {
1237 cur = rb_entry(n, struct btrfs_qgroup_operation, n);
1238 cmp = comp_oper_exist(cur, oper);
1239 if (cmp < 0) {
1240 n = n->rb_right;
1241 } else if (cmp) {
1242 n = n->rb_left;
1243 } else {
1244 spin_unlock(&fs_info->qgroup_op_lock);
1245 return -EEXIST;
1246 }
1247 }
1248 spin_unlock(&fs_info->qgroup_op_lock);
1249 return 0;
1250}
1251
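qgroup_oper_exists() above searches the operation rb-tree with a comparator that deliberately ignores seq and type: any queued operation on the same (bytenr, ref_root) pair means the extent is not exclusively owned. A toy version of that match (illustrative types, not kernel code):

#include <stdbool.h>
#include <stdint.h>

struct qop {
        uint64_t bytenr;
        uint64_t ref_root;
};

/* Same rule as comp_oper_exist(): compare bytenr first, then
 * ref_root; seq and type never participate. */
static bool same_extent_and_root(const struct qop *a, const struct qop *b)
{
        return a->bytenr == b->bytenr && a->ref_root == b->ref_root;
}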
1204static int comp_oper(struct btrfs_qgroup_operation *oper1, 1252static int comp_oper(struct btrfs_qgroup_operation *oper1,
1205 struct btrfs_qgroup_operation *oper2) 1253 struct btrfs_qgroup_operation *oper2)
1206{ 1254{
@@ -1290,6 +1338,25 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
1290 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); 1338 oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
1291 INIT_LIST_HEAD(&oper->elem.list); 1339 INIT_LIST_HEAD(&oper->elem.list);
1292 oper->elem.seq = 0; 1340 oper->elem.seq = 0;
1341
1342 trace_btrfs_qgroup_record_ref(oper);
1343
1344 if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
1345 /*
1346 * If any operation for this bytenr/ref_root combo
1347 * exists, then we know it's not exclusively owned and
1348 * shouldn't be queued up.
1349 *
1350 * This also catches the case where we have a cloned
1351 * extent that gets queued up multiple times during
1352 * drop snapshot.
1353 */
1354 if (qgroup_oper_exists(fs_info, oper)) {
1355 kfree(oper);
1356 return 0;
1357 }
1358 }
1359
1293 ret = insert_qgroup_oper(fs_info, oper); 1360 ret = insert_qgroup_oper(fs_info, oper);
1294 if (ret) { 1361 if (ret) {
1295 /* Shouldn't happen so have an assert for developers */ 1362 /* Shouldn't happen so have an assert for developers */
@@ -1884,6 +1951,111 @@ out:
1884} 1951}
1885 1952
1886/* 1953/*
1954 * Process a reference to a shared subtree. This type of operation is
1955 * queued during snapshot removal when we encounter extents which are
1956 * shared between more than one root.
1957 */
1958static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
1959 struct btrfs_fs_info *fs_info,
1960 struct btrfs_qgroup_operation *oper)
1961{
1962 struct ulist *roots = NULL;
1963 struct ulist_node *unode;
1964 struct ulist_iterator uiter;
1965 struct btrfs_qgroup_list *glist;
1966 struct ulist *parents;
1967 int ret = 0;
1968 int err;
1969 struct btrfs_qgroup *qg;
1970 u64 root_obj = 0;
1971 struct seq_list elem = {};
1972
1973 parents = ulist_alloc(GFP_NOFS);
1974 if (!parents)
1975 return -ENOMEM;
1976
1977 btrfs_get_tree_mod_seq(fs_info, &elem);
1978 ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
1979 elem.seq, &roots);
1980 btrfs_put_tree_mod_seq(fs_info, &elem);
1981 if (ret < 0)
1982 goto out;
1983
1984 if (roots->nnodes != 1)
1985 goto out;
1986
1987 ULIST_ITER_INIT(&uiter);
1988 unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
1989 /*
1990 * If we find our ref root then that means all refs
1991 * this extent has to the root have not yet been
1992 * deleted. In that case, we do nothing and let the
1993 * last ref for this bytenr drive our update.
1994 *
1995 * This can happen for example if an extent is
1996 * referenced multiple times in a snapshot (clone,
1997 * etc). If we are in the middle of snapshot removal,
1998 * queued updates for such an extent will find the
1999 * root if we have not yet finished removing the
2000 * snapshot.
2001 */
2002 if (unode->val == oper->ref_root)
2003 goto out;
2004
2005 root_obj = unode->val;
2006 BUG_ON(!root_obj);
2007
2008 spin_lock(&fs_info->qgroup_lock);
2009 qg = find_qgroup_rb(fs_info, root_obj);
2010 if (!qg)
2011 goto out_unlock;
2012
2013 qg->excl += oper->num_bytes;
2014 qg->excl_cmpr += oper->num_bytes;
2015 qgroup_dirty(fs_info, qg);
2016
2017 /*
2018 * Adjust counts for parent groups. First we find all
2019 * parents, then in the 2nd loop we do the adjustment
2020 * while adding parents of the parents to our ulist.
2021 */
2022 list_for_each_entry(glist, &qg->groups, next_group) {
2023 err = ulist_add(parents, glist->group->qgroupid,
2024 ptr_to_u64(glist->group), GFP_ATOMIC);
2025 if (err < 0) {
2026 ret = err;
2027 goto out_unlock;
2028 }
2029 }
2030
2031 ULIST_ITER_INIT(&uiter);
2032 while ((unode = ulist_next(parents, &uiter))) {
2033 qg = u64_to_ptr(unode->aux);
2034 qg->excl += oper->num_bytes;
2035 qg->excl_cmpr += oper->num_bytes;
2036 qgroup_dirty(fs_info, qg);
2037
2038 /* Add any parents of the parents */
2039 list_for_each_entry(glist, &qg->groups, next_group) {
2040 err = ulist_add(parents, glist->group->qgroupid,
2041 ptr_to_u64(glist->group), GFP_ATOMIC);
2042 if (err < 0) {
2043 ret = err;
2044 goto out_unlock;
2045 }
2046 }
2047 }
2048
2049out_unlock:
2050 spin_unlock(&fs_info->qgroup_lock);
2051
2052out:
2053 ulist_free(roots);
2054 ulist_free(parents);
2055 return ret;
2056}
2057
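The parent walk above has a subtle shape: it iterates a ulist while appending to it, so ancestors discovered mid-walk are processed in the same pass and duplicates are dropped. A self-contained userspace sketch under assumed names (worklist_add plays the role of ulist_add; no locking or GFP flags):

#include <stdio.h>

#define MAX_QG 16

struct qgroup {
	int id;
	long long excl;
	struct qgroup *parents[4];	/* NULL-terminated */
};

/* returns 1 if newly added, 0 if already present (mimics ulist_add) */
static int worklist_add(struct qgroup **list, int *n, struct qgroup *qg)
{
	for (int i = 0; i < *n; i++)
		if (list[i] == qg)
			return 0;
	list[(*n)++] = qg;
	return 1;
}

static void account_parents(struct qgroup *qg, long long num_bytes)
{
	struct qgroup *list[MAX_QG];
	int n = 0;

	qg->excl += num_bytes;
	for (struct qgroup **p = qg->parents; *p; p++)
		worklist_add(list, &n, *p);

	/* n may grow while we iterate: that is the point of the loop */
	for (int i = 0; i < n; i++) {
		list[i]->excl += num_bytes;
		for (struct qgroup **p = list[i]->parents; *p; p++)
			worklist_add(list, &n, *p);
	}
}

int main(void)
{
	struct qgroup top = { 3, 0, { NULL } };
	struct qgroup mid1 = { 1, 0, { &top, NULL } };
	struct qgroup mid2 = { 2, 0, { &top, NULL } };
	struct qgroup leaf = { 0, 0, { &mid1, &mid2, NULL } };

	account_parents(&leaf, 4096);
	/* top is reached via both mid1 and mid2 but charged once */
	printf("leaf=%lld mid1=%lld mid2=%lld top=%lld\n",
	       leaf.excl, mid1.excl, mid2.excl, top.excl);
	return 0;
}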
2058/*
1887 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted 2059 * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
1888 * from the fs. First, all roots referencing the extent are searched, and 2060 * from the fs. First, all roots referencing the extent are searched, and
1889 * then the space is accounted accordingly to the different roots. The 2061 * then the space is accounted accordingly to the different roots. The
@@ -1911,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1911 2083
1912 ASSERT(is_fstree(oper->ref_root)); 2084 ASSERT(is_fstree(oper->ref_root));
1913 2085
2086 trace_btrfs_qgroup_account(oper);
2087
1914 switch (oper->type) { 2088 switch (oper->type) {
1915 case BTRFS_QGROUP_OPER_ADD_EXCL: 2089 case BTRFS_QGROUP_OPER_ADD_EXCL:
1916 case BTRFS_QGROUP_OPER_SUB_EXCL: 2090 case BTRFS_QGROUP_OPER_SUB_EXCL:
@@ -1920,6 +2094,9 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
1920 case BTRFS_QGROUP_OPER_SUB_SHARED: 2094 case BTRFS_QGROUP_OPER_SUB_SHARED:
1921 ret = qgroup_shared_accounting(trans, fs_info, oper); 2095 ret = qgroup_shared_accounting(trans, fs_info, oper);
1922 break; 2096 break;
2097 case BTRFS_QGROUP_OPER_SUB_SUBTREE:
2098 ret = qgroup_subtree_accounting(trans, fs_info, oper);
2099 break;
1923 default: 2100 default:
1924 ASSERT(0); 2101 ASSERT(0);
1925 } 2102 }
@@ -2068,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2068 if (srcid) { 2245 if (srcid) {
2069 struct btrfs_root *srcroot; 2246 struct btrfs_root *srcroot;
2070 struct btrfs_key srckey; 2247 struct btrfs_key srckey;
2071 int srcroot_level;
2072 2248
2073 srckey.objectid = srcid; 2249 srckey.objectid = srcid;
2074 srckey.type = BTRFS_ROOT_ITEM_KEY; 2250 srckey.type = BTRFS_ROOT_ITEM_KEY;
@@ -2080,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
2080 } 2256 }
2081 2257
2082 rcu_read_lock(); 2258 rcu_read_lock();
2083 srcroot_level = btrfs_header_level(srcroot->node); 2259 level_size = srcroot->nodesize;
2084 level_size = btrfs_level_size(srcroot, srcroot_level);
2085 rcu_read_unlock(); 2260 rcu_read_unlock();
2086 } 2261 }
2087 2262
@@ -2397,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2397 found.type != BTRFS_METADATA_ITEM_KEY) 2572 found.type != BTRFS_METADATA_ITEM_KEY)
2398 continue; 2573 continue;
2399 if (found.type == BTRFS_METADATA_ITEM_KEY) 2574 if (found.type == BTRFS_METADATA_ITEM_KEY)
2400 num_bytes = fs_info->extent_root->leafsize; 2575 num_bytes = fs_info->extent_root->nodesize;
2401 else 2576 else
2402 num_bytes = found.offset; 2577 num_bytes = found.offset;
2403 2578
@@ -2551,6 +2726,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
2551 memset(&fs_info->qgroup_rescan_work, 0, 2726 memset(&fs_info->qgroup_rescan_work, 0,
2552 sizeof(fs_info->qgroup_rescan_work)); 2727 sizeof(fs_info->qgroup_rescan_work));
2553 btrfs_init_work(&fs_info->qgroup_rescan_work, 2728 btrfs_init_work(&fs_info->qgroup_rescan_work,
2729 btrfs_qgroup_rescan_helper,
2554 btrfs_qgroup_rescan_worker, NULL, NULL); 2730 btrfs_qgroup_rescan_worker, NULL, NULL);
2555 2731
2556 if (ret) { 2732 if (ret) {
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 5952ff1fbd7a..18cc68ca3090 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -44,6 +44,7 @@ enum btrfs_qgroup_operation_type {
44 BTRFS_QGROUP_OPER_ADD_SHARED, 44 BTRFS_QGROUP_OPER_ADD_SHARED,
45 BTRFS_QGROUP_OPER_SUB_EXCL, 45 BTRFS_QGROUP_OPER_SUB_EXCL,
46 BTRFS_QGROUP_OPER_SUB_SHARED, 46 BTRFS_QGROUP_OPER_SUB_SHARED,
47 BTRFS_QGROUP_OPER_SUB_SUBTREE,
47}; 48};
48 49
49struct btrfs_qgroup_operation { 50struct btrfs_qgroup_operation {
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 4a88f073fdd7..6a41631cb959 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
913{ 913{
914 unsigned long nr = stripe_len * nr_stripes; 914 unsigned long nr = stripe_len * nr_stripes;
915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 915 return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
916} 916}
917 917
918/* 918/*
@@ -1416,7 +1416,8 @@ cleanup:
1416 1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{ 1418{
1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); 1419 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1420 rmw_work, NULL, NULL);
1420 1421
1421 btrfs_queue_work(rbio->fs_info->rmw_workers, 1422 btrfs_queue_work(rbio->fs_info->rmw_workers,
1422 &rbio->work); 1423 &rbio->work);
@@ -1424,7 +1425,8 @@ static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1424 1425
1425static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1426{ 1427{
1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); 1428 btrfs_init_work(&rbio->work, btrfs_rmw_helper,
1429 read_rebuild_work, NULL, NULL);
1428 1430
1429 btrfs_queue_work(rbio->fs_info->rmw_workers, 1431 btrfs_queue_work(rbio->fs_info->rmw_workers,
1430 &rbio->work); 1432 &rbio->work);
@@ -1440,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440 struct btrfs_bio *bbio = rbio->bbio; 1442 struct btrfs_bio *bbio = rbio->bbio;
1441 struct bio_list bio_list; 1443 struct bio_list bio_list;
1442 int ret; 1444 int ret;
1443 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1445 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1444 int pagenr; 1446 int pagenr;
1445 int stripe; 1447 int stripe;
1446 struct bio *bio; 1448 struct bio *bio;
@@ -1665,7 +1667,8 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1665 plug = container_of(cb, struct btrfs_plug_cb, cb); 1667 plug = container_of(cb, struct btrfs_plug_cb, cb);
1666 1668
1667 if (from_schedule) { 1669 if (from_schedule) {
1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1670 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1671 unplug_work, NULL, NULL);
1669 btrfs_queue_work(plug->info->rmw_workers, 1672 btrfs_queue_work(plug->info->rmw_workers,
1670 &plug->work); 1673 &plug->work);
1671 return; 1674 return;
@@ -1722,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1722 int pagenr, stripe; 1725 int pagenr, stripe;
1723 void **pointers; 1726 void **pointers;
1724 int faila = -1, failb = -1; 1727 int faila = -1, failb = -1;
1725 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1728 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1726 struct page *page; 1729 struct page *page;
1727 int err; 1730 int err;
1728 int i; 1731 int i;
@@ -1937,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1937 struct btrfs_bio *bbio = rbio->bbio; 1940 struct btrfs_bio *bbio = rbio->bbio;
1938 struct bio_list bio_list; 1941 struct bio_list bio_list;
1939 int ret; 1942 int ret;
1940 int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 1943 int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
1941 int pagenr; 1944 int pagenr;
1942 int stripe; 1945 int stripe;
1943 struct bio *bio; 1946 struct bio *bio;
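The conversions from open-coded rounding to DIV_ROUND_UP() in this file are behavior-preserving because the page size is a power of two. A quick userspace check (assuming 4 KiB pages) that both forms compute ceil(nr / PAGE_SIZE):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_SHIFT 12
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long samples[] = { 0, 1, 4095, 4096, 4097, 65536 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		unsigned long nr = samples[i];
		unsigned long old_way = (nr + PAGE_SIZE - 1) >> PAGE_SHIFT;
		unsigned long new_way = DIV_ROUND_UP(nr, PAGE_SIZE);

		/* the two columns always agree */
		printf("nr=%lu old=%lu new=%lu\n", nr, old_way, new_way);
	}
	return 0;
}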
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 09230cf3a244..b63ae20618fb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
347 if (!re) 347 if (!re)
348 return NULL; 348 return NULL;
349 349
350 blocksize = btrfs_level_size(root, level); 350 blocksize = root->nodesize;
351 re->logical = logical; 351 re->logical = logical;
352 re->blocksize = blocksize; 352 re->blocksize = blocksize;
353 re->top = *top; 353 re->top = *top;
@@ -798,7 +798,8 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
798 /* FIXME we cannot handle this properly right now */ 798 /* FIXME we cannot handle this properly right now */
799 BUG(); 799 BUG();
800 } 800 }
801 btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL); 801 btrfs_init_work(&rmw->work, btrfs_readahead_helper,
802 reada_start_machine_worker, NULL, NULL);
802 rmw->fs_info = fs_info; 803 rmw->fs_info = fs_info;
803 804
804 btrfs_queue_work(fs_info->readahead_workers, &rmw->work); 805 btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 65245a07275b..74257d6436ad 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -736,7 +736,8 @@ again:
736 err = ret; 736 err = ret;
737 goto out; 737 goto out;
738 } 738 }
739 BUG_ON(!ret || !path1->slots[0]); 739 ASSERT(ret);
740 ASSERT(path1->slots[0]);
740 741
741 path1->slots[0]--; 742 path1->slots[0]--;
742 743
@@ -746,10 +747,10 @@ again:
746 * the backref was added previously when processing 747 * the backref was added previously when processing
747 * backref of type BTRFS_TREE_BLOCK_REF_KEY 748 * backref of type BTRFS_TREE_BLOCK_REF_KEY
748 */ 749 */
749 BUG_ON(!list_is_singular(&cur->upper)); 750 ASSERT(list_is_singular(&cur->upper));
750 edge = list_entry(cur->upper.next, struct backref_edge, 751 edge = list_entry(cur->upper.next, struct backref_edge,
751 list[LOWER]); 752 list[LOWER]);
752 BUG_ON(!list_empty(&edge->list[UPPER])); 753 ASSERT(list_empty(&edge->list[UPPER]));
753 exist = edge->node[UPPER]; 754 exist = edge->node[UPPER];
754 /* 755 /*
755 * add the upper level block to pending list if we need 756 * add the upper level block to pending list if we need
@@ -831,7 +832,7 @@ again:
831 cur->cowonly = 1; 832 cur->cowonly = 1;
832 } 833 }
833#else 834#else
834 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 835 ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY);
835 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { 836 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
836#endif 837#endif
837 if (key.objectid == key.offset) { 838 if (key.objectid == key.offset) {
@@ -840,7 +841,7 @@ again:
840 * backref of this type. 841 * backref of this type.
841 */ 842 */
842 root = find_reloc_root(rc, cur->bytenr); 843 root = find_reloc_root(rc, cur->bytenr);
843 BUG_ON(!root); 844 ASSERT(root);
844 cur->root = root; 845 cur->root = root;
845 break; 846 break;
846 } 847 }
@@ -868,7 +869,7 @@ again:
868 } else { 869 } else {
869 upper = rb_entry(rb_node, struct backref_node, 870 upper = rb_entry(rb_node, struct backref_node,
870 rb_node); 871 rb_node);
871 BUG_ON(!upper->checked); 872 ASSERT(upper->checked);
872 INIT_LIST_HEAD(&edge->list[UPPER]); 873 INIT_LIST_HEAD(&edge->list[UPPER]);
873 } 874 }
874 list_add_tail(&edge->list[LOWER], &cur->upper); 875 list_add_tail(&edge->list[LOWER], &cur->upper);
@@ -892,7 +893,7 @@ again:
892 893
893 if (btrfs_root_level(&root->root_item) == cur->level) { 894 if (btrfs_root_level(&root->root_item) == cur->level) {
894 /* tree root */ 895 /* tree root */
895 BUG_ON(btrfs_root_bytenr(&root->root_item) != 896 ASSERT(btrfs_root_bytenr(&root->root_item) ==
896 cur->bytenr); 897 cur->bytenr);
897 if (should_ignore_root(root)) 898 if (should_ignore_root(root))
898 list_add(&cur->list, &useless); 899 list_add(&cur->list, &useless);
@@ -927,7 +928,7 @@ again:
927 need_check = true; 928 need_check = true;
928 for (; level < BTRFS_MAX_LEVEL; level++) { 929 for (; level < BTRFS_MAX_LEVEL; level++) {
929 if (!path2->nodes[level]) { 930 if (!path2->nodes[level]) {
930 BUG_ON(btrfs_root_bytenr(&root->root_item) != 931 ASSERT(btrfs_root_bytenr(&root->root_item) ==
931 lower->bytenr); 932 lower->bytenr);
932 if (should_ignore_root(root)) 933 if (should_ignore_root(root))
933 list_add(&lower->list, &useless); 934 list_add(&lower->list, &useless);
@@ -977,12 +978,15 @@ again:
977 need_check = false; 978 need_check = false;
978 list_add_tail(&edge->list[UPPER], 979 list_add_tail(&edge->list[UPPER],
979 &list); 980 &list);
980 } else 981 } else {
982 if (upper->checked)
983 need_check = true;
981 INIT_LIST_HEAD(&edge->list[UPPER]); 984 INIT_LIST_HEAD(&edge->list[UPPER]);
985 }
982 } else { 986 } else {
983 upper = rb_entry(rb_node, struct backref_node, 987 upper = rb_entry(rb_node, struct backref_node,
984 rb_node); 988 rb_node);
985 BUG_ON(!upper->checked); 989 ASSERT(upper->checked);
986 INIT_LIST_HEAD(&edge->list[UPPER]); 990 INIT_LIST_HEAD(&edge->list[UPPER]);
987 if (!upper->owner) 991 if (!upper->owner)
988 upper->owner = btrfs_header_owner(eb); 992 upper->owner = btrfs_header_owner(eb);
@@ -1026,7 +1030,7 @@ next:
1026 * everything goes well, connect backref nodes and insert backref nodes 1030 * everything goes well, connect backref nodes and insert backref nodes
1027 * into the cache. 1031 * into the cache.
1028 */ 1032 */
1029 BUG_ON(!node->checked); 1033 ASSERT(node->checked);
1030 cowonly = node->cowonly; 1034 cowonly = node->cowonly;
1031 if (!cowonly) { 1035 if (!cowonly) {
1032 rb_node = tree_insert(&cache->rb_root, node->bytenr, 1036 rb_node = tree_insert(&cache->rb_root, node->bytenr,
@@ -1062,8 +1066,21 @@ next:
1062 continue; 1066 continue;
1063 } 1067 }
1064 1068
1065 BUG_ON(!upper->checked); 1069 if (!upper->checked) {
1066 BUG_ON(cowonly != upper->cowonly); 1070 /*
1071 * Still want to blow up for developers since this is a
1072 * logic bug.
1073 */
1074 ASSERT(0);
1075 err = -EINVAL;
1076 goto out;
1077 }
1078 if (cowonly != upper->cowonly) {
1079 ASSERT(0);
1080 err = -EINVAL;
1081 goto out;
1082 }
1083
1067 if (!cowonly) { 1084 if (!cowonly) {
1068 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1085 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1069 &upper->rb_node); 1086 &upper->rb_node);
@@ -1086,7 +1103,7 @@ next:
1086 while (!list_empty(&useless)) { 1103 while (!list_empty(&useless)) {
1087 upper = list_entry(useless.next, struct backref_node, list); 1104 upper = list_entry(useless.next, struct backref_node, list);
1088 list_del_init(&upper->list); 1105 list_del_init(&upper->list);
1089 BUG_ON(!list_empty(&upper->upper)); 1106 ASSERT(list_empty(&upper->upper));
1090 if (upper == node) 1107 if (upper == node)
1091 node = NULL; 1108 node = NULL;
1092 if (upper->lowest) { 1109 if (upper->lowest) {
@@ -1119,29 +1136,45 @@ out:
1119 if (err) { 1136 if (err) {
1120 while (!list_empty(&useless)) { 1137 while (!list_empty(&useless)) {
1121 lower = list_entry(useless.next, 1138 lower = list_entry(useless.next,
1122 struct backref_node, upper); 1139 struct backref_node, list);
1123 list_del_init(&lower->upper); 1140 list_del_init(&lower->list);
1124 } 1141 }
1125 upper = node; 1142 while (!list_empty(&list)) {
1126 INIT_LIST_HEAD(&list); 1143 edge = list_first_entry(&list, struct backref_edge,
1127 while (upper) { 1144 list[UPPER]);
1128 if (RB_EMPTY_NODE(&upper->rb_node)) { 1145 list_del(&edge->list[UPPER]);
1129 list_splice_tail(&upper->upper, &list);
1130 free_backref_node(cache, upper);
1131 }
1132
1133 if (list_empty(&list))
1134 break;
1135
1136 edge = list_entry(list.next, struct backref_edge,
1137 list[LOWER]);
1138 list_del(&edge->list[LOWER]); 1146 list_del(&edge->list[LOWER]);
1147 lower = edge->node[LOWER];
1139 upper = edge->node[UPPER]; 1148 upper = edge->node[UPPER];
1140 free_backref_edge(cache, edge); 1149 free_backref_edge(cache, edge);
1150
1151 /*
1152 * Lower is no longer linked to any upper backref nodes
1153 * and isn't in the cache, so we can free it ourselves.
1154 */
1155 if (list_empty(&lower->upper) &&
1156 RB_EMPTY_NODE(&lower->rb_node))
1157 list_add(&lower->list, &useless);
1158
1159 if (!RB_EMPTY_NODE(&upper->rb_node))
1160 continue;
1161
1162 /* Add this guy's upper edges to the list to process */
1163 list_for_each_entry(edge, &upper->upper, list[LOWER])
1164 list_add_tail(&edge->list[UPPER], &list);
1165 if (list_empty(&upper->upper))
1166 list_add(&upper->list, &useless);
1167 }
1168
1169 while (!list_empty(&useless)) {
1170 lower = list_entry(useless.next,
1171 struct backref_node, list);
1172 list_del_init(&lower->list);
1173 free_backref_node(cache, lower);
1141 } 1174 }
1142 return ERR_PTR(err); 1175 return ERR_PTR(err);
1143 } 1176 }
1144 BUG_ON(node && node->detached); 1177 ASSERT(!node || !node->detached);
1145 return node; 1178 return node;
1146} 1179}
1147 1180
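The reworked error path above has a characteristic shape worth spelling out: pop one edge at a time off a worklist, free the edge immediately, defer each node's free until no edge references it, and feed the upper node's own edges back into the worklist unless the cache still owns that node. A much-simplified, self-contained userspace sketch of that shape (all names illustrative; frees are printed rather than performed since the nodes live on the stack):

#include <stdio.h>

struct node;

struct edge {
	struct node *lower, *upper;
};

struct node {
	int id;
	int refs;		/* incident edges not yet processed */
	int in_cache;		/* cache-owned nodes are never freed here */
	int drained;		/* our own upper edges already queued */
	int nr_up;
	struct edge *up[4];	/* edges towards our upper nodes */
};

static void put_node(struct node *n)
{
	if (--n->refs == 0 && !n->in_cache)
		printf("free node %d\n", n->id);
}

static void drain(struct edge **work, int n)
{
	while (n) {
		struct edge *e = work[--n];
		struct node *upper = e->upper;

		printf("free edge %d->%d\n", e->lower->id, upper->id);
		put_node(e->lower);

		/* queue the upper node's own edges, unless cached */
		if (!upper->in_cache && !upper->drained) {
			upper->drained = 1;
			for (int i = 0; i < upper->nr_up; i++)
				work[n++] = upper->up[i];
		}
		put_node(upper);
	}
}

int main(void)
{
	struct node a = { .id = 0, .refs = 1 };
	struct node b = { .id = 1, .refs = 2 };
	struct node c = { .id = 2, .refs = 1, .in_cache = 1 };
	struct edge e1 = { &a, &b }, e2 = { &b, &c };
	struct edge *work[8] = { &e1 };

	b.up[b.nr_up++] = &e2;
	drain(work, 1);		/* a, a->b, b, b->c freed; c stays cached */
	return 0;
}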
@@ -1787,7 +1820,7 @@ again:
1787 btrfs_node_key_to_cpu(parent, next_key, slot + 1); 1820 btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1788 1821
1789 old_bytenr = btrfs_node_blockptr(parent, slot); 1822 old_bytenr = btrfs_node_blockptr(parent, slot);
1790 blocksize = btrfs_level_size(dest, level - 1); 1823 blocksize = dest->nodesize;
1791 old_ptr_gen = btrfs_node_ptr_generation(parent, slot); 1824 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1792 1825
1793 if (level <= max_level) { 1826 if (level <= max_level) {
@@ -1813,8 +1846,7 @@ again:
1813 break; 1846 break;
1814 } 1847 }
1815 1848
1816 eb = read_tree_block(dest, old_bytenr, blocksize, 1849 eb = read_tree_block(dest, old_bytenr, old_ptr_gen);
1817 old_ptr_gen);
1818 if (!eb || !extent_buffer_uptodate(eb)) { 1850 if (!eb || !extent_buffer_uptodate(eb)) {
1819 ret = (!eb) ? -ENOMEM : -EIO; 1851 ret = (!eb) ? -ENOMEM : -EIO;
1820 free_extent_buffer(eb); 1852 free_extent_buffer(eb);
@@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1944 u64 bytenr; 1976 u64 bytenr;
1945 u64 ptr_gen = 0; 1977 u64 ptr_gen = 0;
1946 u64 last_snapshot; 1978 u64 last_snapshot;
1947 u32 blocksize;
1948 u32 nritems; 1979 u32 nritems;
1949 1980
1950 last_snapshot = btrfs_root_last_snapshot(&root->root_item); 1981 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
@@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1970 } 2001 }
1971 2002
1972 bytenr = btrfs_node_blockptr(eb, path->slots[i]); 2003 bytenr = btrfs_node_blockptr(eb, path->slots[i]);
1973 blocksize = btrfs_level_size(root, i - 1); 2004 eb = read_tree_block(root, bytenr, ptr_gen);
1974 eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
1975 if (!eb || !extent_buffer_uptodate(eb)) { 2005 if (!eb || !extent_buffer_uptodate(eb)) {
1976 free_extent_buffer(eb); 2006 free_extent_buffer(eb);
1977 return -EIO; 2007 return -EIO;
@@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list)
2316} 2346}
2317 2347
2318static noinline_for_stack 2348static noinline_for_stack
2319int merge_reloc_roots(struct reloc_control *rc) 2349void merge_reloc_roots(struct reloc_control *rc)
2320{ 2350{
2321 struct btrfs_root *root; 2351 struct btrfs_root *root;
2322 struct btrfs_root *reloc_root; 2352 struct btrfs_root *reloc_root;
@@ -2397,7 +2427,6 @@ out:
2397 } 2427 }
2398 2428
2399 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2429 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2400 return ret;
2401} 2430}
2402 2431
2403static void free_block_list(struct rb_root *blocks) 2432static void free_block_list(struct rb_root *blocks)
@@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc,
2544 if (next->processed && (reserve || next != node)) 2573 if (next->processed && (reserve || next != node))
2545 break; 2574 break;
2546 2575
2547 num_bytes += btrfs_level_size(rc->extent_root, 2576 num_bytes += rc->extent_root->nodesize;
2548 next->level);
2549 2577
2550 if (list_empty(&next->upper)) 2578 if (list_empty(&next->upper))
2551 break; 2579 break;
@@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2679 goto next; 2707 goto next;
2680 } 2708 }
2681 2709
2682 blocksize = btrfs_level_size(root, node->level); 2710 blocksize = root->nodesize;
2683 generation = btrfs_node_ptr_generation(upper->eb, slot); 2711 generation = btrfs_node_ptr_generation(upper->eb, slot);
2684 eb = read_tree_block(root, bytenr, blocksize, generation); 2712 eb = read_tree_block(root, bytenr, generation);
2685 if (!eb || !extent_buffer_uptodate(eb)) { 2713 if (!eb || !extent_buffer_uptodate(eb)) {
2686 free_extent_buffer(eb); 2714 free_extent_buffer(eb);
2687 err = -EIO; 2715 err = -EIO;
@@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc,
2789 u32 blocksize; 2817 u32 blocksize;
2790 if (node->level == 0 || 2818 if (node->level == 0 ||
2791 in_block_group(node->bytenr, rc->block_group)) { 2819 in_block_group(node->bytenr, rc->block_group)) {
2792 blocksize = btrfs_level_size(rc->extent_root, node->level); 2820 blocksize = rc->extent_root->nodesize;
2793 mark_block_processed(rc, node->bytenr, blocksize); 2821 mark_block_processed(rc, node->bytenr, blocksize);
2794 } 2822 }
2795 node->processed = 1; 2823 node->processed = 1;
@@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2843 2871
2844 BUG_ON(block->key_ready); 2872 BUG_ON(block->key_ready);
2845 eb = read_tree_block(rc->extent_root, block->bytenr, 2873 eb = read_tree_block(rc->extent_root, block->bytenr,
2846 block->key.objectid, block->key.offset); 2874 block->key.offset);
2847 if (!eb || !extent_buffer_uptodate(eb)) { 2875 if (!eb || !extent_buffer_uptodate(eb)) {
2848 free_extent_buffer(eb); 2876 free_extent_buffer(eb);
2849 return -EIO; 2877 return -EIO;
@@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc,
2858 return 0; 2886 return 0;
2859} 2887}
2860 2888
2861static int reada_tree_block(struct reloc_control *rc,
2862 struct tree_block *block)
2863{
2864 BUG_ON(block->key_ready);
2865 if (block->key.type == BTRFS_METADATA_ITEM_KEY)
2866 readahead_tree_block(rc->extent_root, block->bytenr,
2867 block->key.objectid,
2868 rc->extent_root->leafsize);
2869 else
2870 readahead_tree_block(rc->extent_root, block->bytenr,
2871 block->key.objectid, block->key.offset);
2872 return 0;
2873}
2874
2875/* 2889/*
2876 * helper function to relocate a tree block 2890 * helper function to relocate a tree block
2877 */ 2891 */
@@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2951 while (rb_node) { 2965 while (rb_node) {
2952 block = rb_entry(rb_node, struct tree_block, rb_node); 2966 block = rb_entry(rb_node, struct tree_block, rb_node);
2953 if (!block->key_ready) 2967 if (!block->key_ready)
2954 reada_tree_block(rc, block); 2968 readahead_tree_block(rc->extent_root, block->bytenr,
2969 block->key.objectid);
2955 rb_node = rb_next(rb_node); 2970 rb_node = rb_next(rb_node);
2956 } 2971 }
2957 2972
@@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc,
3313 return -ENOMEM; 3328 return -ENOMEM;
3314 3329
3315 block->bytenr = extent_key->objectid; 3330 block->bytenr = extent_key->objectid;
3316 block->key.objectid = rc->extent_root->leafsize; 3331 block->key.objectid = rc->extent_root->nodesize;
3317 block->key.offset = generation; 3332 block->key.offset = generation;
3318 block->level = level; 3333 block->level = level;
3319 block->key_ready = 0; 3334 block->key_ready = 0;
@@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc,
3640 struct btrfs_extent_inline_ref *iref; 3655 struct btrfs_extent_inline_ref *iref;
3641 unsigned long ptr; 3656 unsigned long ptr;
3642 unsigned long end; 3657 unsigned long end;
3643 u32 blocksize = btrfs_level_size(rc->extent_root, 0); 3658 u32 blocksize = rc->extent_root->nodesize;
3644 int ret = 0; 3659 int ret = 0;
3645 int err = 0; 3660 int err = 0;
3646 3661
@@ -3783,7 +3798,7 @@ next:
3783 } 3798 }
3784 3799
3785 if (key.type == BTRFS_METADATA_ITEM_KEY && 3800 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3786 key.objectid + rc->extent_root->leafsize <= 3801 key.objectid + rc->extent_root->nodesize <=
3787 rc->search_start) { 3802 rc->search_start) {
3788 path->slots[0]++; 3803 path->slots[0]++;
3789 goto next; 3804 goto next;
@@ -3801,7 +3816,7 @@ next:
3801 rc->search_start = key.objectid + key.offset; 3816 rc->search_start = key.objectid + key.offset;
3802 else 3817 else
3803 rc->search_start = key.objectid + 3818 rc->search_start = key.objectid +
3804 rc->extent_root->leafsize; 3819 rc->extent_root->nodesize;
3805 memcpy(extent_key, &key, sizeof(key)); 3820 memcpy(extent_key, &key, sizeof(key));
3806 return 0; 3821 return 0;
3807 } 3822 }
@@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
4096 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 4111 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
4097 BTRFS_INODE_PREALLOC); 4112 BTRFS_INODE_PREALLOC);
4098 btrfs_mark_buffer_dirty(leaf); 4113 btrfs_mark_buffer_dirty(leaf);
4099 btrfs_release_path(path);
4100out: 4114out:
4101 btrfs_free_path(path); 4115 btrfs_free_path(path);
4102 return ret; 4116 return ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b6d198f5181e..efa083113827 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -137,7 +137,6 @@ struct scrub_ctx {
137 int pages_per_rd_bio; 137 int pages_per_rd_bio;
138 u32 sectorsize; 138 u32 sectorsize;
139 u32 nodesize; 139 u32 nodesize;
140 u32 leafsize;
141 140
142 int is_dev_replace; 141 int is_dev_replace;
143 struct scrub_wr_ctx wr_ctx; 142 struct scrub_wr_ctx wr_ctx;
@@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx {
178struct scrub_warning { 177struct scrub_warning {
179 struct btrfs_path *path; 178 struct btrfs_path *path;
180 u64 extent_item_size; 179 u64 extent_item_size;
181 char *scratch_buf;
182 char *msg_buf;
183 const char *errstr; 180 const char *errstr;
184 sector_t sector; 181 sector_t sector;
185 u64 logical; 182 u64 logical;
186 struct btrfs_device *dev; 183 struct btrfs_device *dev;
187 int msg_bufsize;
188 int scratch_bufsize;
189}; 184};
190 185
191
192static void scrub_pending_bio_inc(struct scrub_ctx *sctx); 186static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
193static void scrub_pending_bio_dec(struct scrub_ctx *sctx); 187static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
194static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 188static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -428,8 +422,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
428 sbio->index = i; 422 sbio->index = i;
429 sbio->sctx = sctx; 423 sbio->sctx = sctx;
430 sbio->page_count = 0; 424 sbio->page_count = 0;
431 btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, 425 btrfs_init_work(&sbio->work, btrfs_scrub_helper,
432 NULL, NULL); 426 scrub_bio_end_io_worker, NULL, NULL);
433 427
434 if (i != SCRUB_BIOS_PER_SCTX - 1) 428 if (i != SCRUB_BIOS_PER_SCTX - 1)
435 sctx->bios[i]->next_free = i + 1; 429 sctx->bios[i]->next_free = i + 1;
@@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
438 } 432 }
439 sctx->first_free = 0; 433 sctx->first_free = 0;
440 sctx->nodesize = dev->dev_root->nodesize; 434 sctx->nodesize = dev->dev_root->nodesize;
441 sctx->leafsize = dev->dev_root->leafsize;
442 sctx->sectorsize = dev->dev_root->sectorsize; 435 sctx->sectorsize = dev->dev_root->sectorsize;
443 atomic_set(&sctx->bios_in_flight, 0); 436 atomic_set(&sctx->bios_in_flight, 0);
444 atomic_set(&sctx->workers_pending, 0); 437 atomic_set(&sctx->workers_pending, 0);
@@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
553 u64 ref_root; 546 u64 ref_root;
554 u32 item_size; 547 u32 item_size;
555 u8 ref_level; 548 u8 ref_level;
556 const int bufsize = 4096;
557 int ret; 549 int ret;
558 550
559 WARN_ON(sblock->page_count < 1); 551 WARN_ON(sblock->page_count < 1);
@@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
561 fs_info = sblock->sctx->dev_root->fs_info; 553 fs_info = sblock->sctx->dev_root->fs_info;
562 554
563 path = btrfs_alloc_path(); 555 path = btrfs_alloc_path();
556 if (!path)
557 return;
564 558
565 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
566 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
567 swarn.sector = (sblock->pagev[0]->physical) >> 9; 559 swarn.sector = (sblock->pagev[0]->physical) >> 9;
568 swarn.logical = sblock->pagev[0]->logical; 560 swarn.logical = sblock->pagev[0]->logical;
569 swarn.errstr = errstr; 561 swarn.errstr = errstr;
570 swarn.dev = NULL; 562 swarn.dev = NULL;
571 swarn.msg_bufsize = bufsize;
572 swarn.scratch_bufsize = bufsize;
573
574 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
575 goto out;
576 563
577 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, 564 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
578 &flags); 565 &flags);
@@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
613 600
614out: 601out:
615 btrfs_free_path(path); 602 btrfs_free_path(path);
616 kfree(swarn.scratch_buf);
617 kfree(swarn.msg_buf);
618} 603}
619 604
620static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) 605static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
@@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
681 ret = -EIO; 666 ret = -EIO;
682 goto out; 667 goto out;
683 } 668 }
684 fs_info = BTRFS_I(inode)->root->fs_info; 669 ret = repair_io_failure(inode, offset, PAGE_SIZE,
685 ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
686 fixup->logical, page, 670 fixup->logical, page,
671 offset - page_offset(page),
687 fixup->mirror_num); 672 fixup->mirror_num);
688 unlock_page(page); 673 unlock_page(page);
689 corrected = !ret; 674 corrected = !ret;
@@ -999,8 +984,8 @@ nodatasum_case:
999 fixup_nodatasum->root = fs_info->extent_root; 984 fixup_nodatasum->root = fs_info->extent_root;
1000 fixup_nodatasum->mirror_num = failed_mirror_index + 1; 985 fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1001 scrub_pending_trans_workers_inc(sctx); 986 scrub_pending_trans_workers_inc(sctx);
1002 btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum, 987 btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1003 NULL, NULL); 988 scrub_fixup_nodatasum, NULL, NULL);
1004 btrfs_queue_work(fs_info->scrub_workers, 989 btrfs_queue_work(fs_info->scrub_workers,
1005 &fixup_nodatasum->work); 990 &fixup_nodatasum->work);
1006 goto out; 991 goto out;
@@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1361 return; 1346 return;
1362} 1347}
1363 1348
1349static inline int scrub_check_fsid(u8 fsid[],
1350 struct scrub_page *spage)
1351{
1352 struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1353 int ret;
1354
1355 ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1356 return !ret;
1357}
1358
1364static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, 1359static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1365 struct scrub_block *sblock, 1360 struct scrub_block *sblock,
1366 int is_metadata, int have_csum, 1361 int is_metadata, int have_csum,
@@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1380 h = (struct btrfs_header *)mapped_buffer; 1375 h = (struct btrfs_header *)mapped_buffer;
1381 1376
1382 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || 1377 if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1383 memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || 1378 !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1384 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1379 memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1385 BTRFS_UUID_SIZE)) { 1380 BTRFS_UUID_SIZE)) {
1386 sblock->header_error = 1; 1381 sblock->header_error = 1;
@@ -1616,7 +1611,8 @@ static void scrub_wr_bio_end_io(struct bio *bio, int err)
1616 sbio->err = err; 1611 sbio->err = err;
1617 sbio->bio = bio; 1612 sbio->bio = bio;
1618 1613
1619 btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); 1614 btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1615 scrub_wr_bio_end_io_worker, NULL, NULL);
1620 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); 1616 btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1621} 1617}
1622 1618
@@ -1750,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
1750 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) 1746 if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1751 ++fail; 1747 ++fail;
1752 1748
1753 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1749 if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1754 ++fail; 1750 ++fail;
1755 1751
1756 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, 1752 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1757 BTRFS_UUID_SIZE)) 1753 BTRFS_UUID_SIZE))
1758 ++fail; 1754 ++fail;
1759 1755
1760 WARN_ON(sctx->nodesize != sctx->leafsize);
1761 len = sctx->nodesize - BTRFS_CSUM_SIZE; 1756 len = sctx->nodesize - BTRFS_CSUM_SIZE;
1762 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; 1757 mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1763 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; 1758 p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
@@ -1790,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1790{ 1785{
1791 struct btrfs_super_block *s; 1786 struct btrfs_super_block *s;
1792 struct scrub_ctx *sctx = sblock->sctx; 1787 struct scrub_ctx *sctx = sblock->sctx;
1793 struct btrfs_root *root = sctx->dev_root;
1794 struct btrfs_fs_info *fs_info = root->fs_info;
1795 u8 calculated_csum[BTRFS_CSUM_SIZE]; 1788 u8 calculated_csum[BTRFS_CSUM_SIZE];
1796 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 1789 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1797 struct page *page; 1790 struct page *page;
@@ -1816,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock)
1816 if (sblock->pagev[0]->generation != btrfs_super_generation(s)) 1809 if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1817 ++fail_gen; 1810 ++fail_gen;
1818 1811
1819 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) 1812 if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
1820 ++fail_cor; 1813 ++fail_cor;
1821 1814
1822 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; 1815 len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
@@ -2195,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2195 sctx->stat.data_bytes_scrubbed += len; 2188 sctx->stat.data_bytes_scrubbed += len;
2196 spin_unlock(&sctx->stat_lock); 2189 spin_unlock(&sctx->stat_lock);
2197 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 2190 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2198 WARN_ON(sctx->nodesize != sctx->leafsize);
2199 blocksize = sctx->nodesize; 2191 blocksize = sctx->nodesize;
2200 spin_lock(&sctx->stat_lock); 2192 spin_lock(&sctx->stat_lock);
2201 sctx->stat.tree_extents_scrubbed++; 2193 sctx->stat.tree_extents_scrubbed++;
@@ -2486,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2486 btrfs_item_key_to_cpu(l, &key, slot); 2478 btrfs_item_key_to_cpu(l, &key, slot);
2487 2479
2488 if (key.type == BTRFS_METADATA_ITEM_KEY) 2480 if (key.type == BTRFS_METADATA_ITEM_KEY)
2489 bytes = root->leafsize; 2481 bytes = root->nodesize;
2490 else 2482 else
2491 bytes = key.offset; 2483 bytes = key.offset;
2492 2484
@@ -2713,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2713 if (found_key.objectid != scrub_dev->devid) 2705 if (found_key.objectid != scrub_dev->devid)
2714 break; 2706 break;
2715 2707
2716 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) 2708 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
2717 break; 2709 break;
2718 2710
2719 if (found_key.offset >= end) 2711 if (found_key.offset >= end)
@@ -2827,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2827 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 2819 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
2828 return -EIO; 2820 return -EIO;
2829 2821
2830 gen = root->fs_info->last_trans_committed; 2822 /* Seed devices of a new filesystem have their own generation. */
2823 if (scrub_dev->fs_devices != root->fs_info->fs_devices)
2824 gen = scrub_dev->generation;
2825 else
2826 gen = root->fs_info->last_trans_committed;
2831 2827
2832 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 2828 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2833 bytenr = btrfs_sb_offset(i); 2829 bytenr = btrfs_sb_offset(i);
2834 if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) 2830 if (bytenr + BTRFS_SUPER_INFO_SIZE >
2831 scrub_dev->commit_total_bytes)
2835 break; 2832 break;
2836 2833
2837 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, 2834 ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
@@ -2904,21 +2901,11 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2904 struct scrub_ctx *sctx; 2901 struct scrub_ctx *sctx;
2905 int ret; 2902 int ret;
2906 struct btrfs_device *dev; 2903 struct btrfs_device *dev;
2904 struct rcu_string *name;
2907 2905
2908 if (btrfs_fs_closing(fs_info)) 2906 if (btrfs_fs_closing(fs_info))
2909 return -EINVAL; 2907 return -EINVAL;
2910 2908
2911 /*
2912 * check some assumptions
2913 */
2914 if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2915 btrfs_err(fs_info,
2916 "scrub: size assumption nodesize == leafsize (%d == %d) fails",
2917 fs_info->chunk_root->nodesize,
2918 fs_info->chunk_root->leafsize);
2919 return -EINVAL;
2920 }
2921
2922 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { 2909 if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2923 /* 2910 /*
2924 * in this case scrub is unable to calculate the checksum 2911 * in this case scrub is unable to calculate the checksum
@@ -2965,6 +2952,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2965 return -ENODEV; 2952 return -ENODEV;
2966 } 2953 }
2967 2954
2955 if (!is_dev_replace && !readonly && !dev->writeable) {
2956 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2957 rcu_read_lock();
2958 name = rcu_dereference(dev->name);
2959 btrfs_err(fs_info, "scrub: device %s is not writable",
2960 name->str);
2961 rcu_read_unlock();
2962 return -EROFS;
2963 }
2964
2968 mutex_lock(&fs_info->scrub_lock); 2965 mutex_lock(&fs_info->scrub_lock);
2969 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) { 2966 if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2970 mutex_unlock(&fs_info->scrub_lock); 2967 mutex_unlock(&fs_info->scrub_lock);
@@ -3203,7 +3200,8 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3203 nocow_ctx->len = len; 3200 nocow_ctx->len = len;
3204 nocow_ctx->mirror_num = mirror_num; 3201 nocow_ctx->mirror_num = mirror_num;
3205 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace; 3202 nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3206 btrfs_init_work(&nocow_ctx->work, copy_nocow_pages_worker, NULL, NULL); 3203 btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3204 copy_nocow_pages_worker, NULL, NULL);
3207 INIT_LIST_HEAD(&nocow_ctx->inodes); 3205 INIT_LIST_HEAD(&nocow_ctx->inodes);
3208 btrfs_queue_work(fs_info->scrub_nocow_workers, 3206 btrfs_queue_work(fs_info->scrub_nocow_workers,
3209 &nocow_ctx->work); 3207 &nocow_ctx->work);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6528aa662181..874828dd0a86 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
515 set_fs(KERNEL_DS); 515 set_fs(KERNEL_DS);
516 516
517 while (pos < len) { 517 while (pos < len) {
518 ret = vfs_write(filp, (char *)buf + pos, len - pos, off); 518 ret = vfs_write(filp, (__force const char __user *)buf + pos,
519 len - pos, off);
519 /* TODO handle that correctly */ 520 /* TODO handle that correctly */
520 /*if (ret == -ERESTARTSYS) { 521 /*if (ret == -ERESTARTSYS) {
521 continue; 522 continue;
@@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
985 int num; 986 int num;
986 u8 type; 987 u8 type;
987 988
988 if (found_key->type == BTRFS_XATTR_ITEM_KEY) 989 /*
989 buf_len = BTRFS_MAX_XATTR_SIZE(root); 990 * Start with a small buffer (1 page). If later we end up needing more
990 else 991 * space, which can happen for xattrs on a fs with a leaf size greater
991 buf_len = PATH_MAX; 992 * than the page size, attempt to increase the buffer. Typically xattr
992 993 * values are small.
994 */
995 buf_len = PATH_MAX;
993 buf = kmalloc(buf_len, GFP_NOFS); 996 buf = kmalloc(buf_len, GFP_NOFS);
994 if (!buf) { 997 if (!buf) {
995 ret = -ENOMEM; 998 ret = -ENOMEM;
@@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1016 ret = -ENAMETOOLONG; 1019 ret = -ENAMETOOLONG;
1017 goto out; 1020 goto out;
1018 } 1021 }
1019 if (name_len + data_len > buf_len) { 1022 if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) {
1020 ret = -E2BIG; 1023 ret = -E2BIG;
1021 goto out; 1024 goto out;
1022 } 1025 }
@@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1024 /* 1027 /*
1025 * Path too long 1028 * Path too long
1026 */ 1029 */
1027 if (name_len + data_len > buf_len) { 1030 if (name_len + data_len > PATH_MAX) {
1028 ret = -ENAMETOOLONG; 1031 ret = -ENAMETOOLONG;
1029 goto out; 1032 goto out;
1030 } 1033 }
1031 } 1034 }
1032 1035
1036 if (name_len + data_len > buf_len) {
1037 buf_len = name_len + data_len;
1038 if (is_vmalloc_addr(buf)) {
1039 vfree(buf);
1040 buf = NULL;
1041 } else {
1042 char *tmp = krealloc(buf, buf_len,
1043 GFP_NOFS | __GFP_NOWARN);
1044
1045 if (!tmp)
1046 kfree(buf);
1047 buf = tmp;
1048 }
1049 if (!buf) {
1050 buf = vmalloc(buf_len);
1051 if (!buf) {
1052 ret = -ENOMEM;
1053 goto out;
1054 }
1055 }
1056 }
1057
1033 read_extent_buffer(eb, buf, (unsigned long)(di + 1), 1058 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1034 name_len + data_len); 1059 name_len + data_len);
1035 1060
@@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
1050 } 1075 }
1051 1076
1052out: 1077out:
1053 kfree(buf); 1078 kvfree(buf);
1054 return ret; 1079 return ret;
1055} 1080}
1056 1081
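The buffer handling above grows lazily: the initial PATH_MAX allocation is enlarged only when a single xattr item fits within the hard limits but not the buffer, trying krealloc() first and falling back to vmalloc(), with kvfree() covering either allocator at the end. A userspace analog of that control flow, with malloc/realloc standing in for both kernel allocators (so the vmalloc fallback is collapsed into one step):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *grow_buf(char *buf, size_t *buf_len, size_t need)
{
	char *tmp;

	if (need <= *buf_len)
		return buf;

	/* kernel: krealloc(GFP_NOFS | __GFP_NOWARN), then vmalloc fallback */
	tmp = realloc(buf, need);
	if (!tmp) {
		free(buf);
		return NULL;
	}
	*buf_len = need;
	return tmp;
}

int main(void)
{
	size_t buf_len = 4096;	/* PATH_MAX-sized starting buffer */
	char *buf = malloc(buf_len);

	if (!buf)
		return 1;

	/* a large xattr (name_len + data_len) forces one growth step */
	buf = grow_buf(buf, &buf_len, 16384);
	if (!buf)
		return 1;
	memset(buf, 0, buf_len);
	printf("buffer is now %zu bytes\n", buf_len);
	free(buf);	/* the kernel side uses kvfree() for either allocator */
	return 0;
}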
@@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx,
3302 if (ret < 0 && ret != -ENOENT) { 3327 if (ret < 0 && ret != -ENOENT) {
3303 goto out; 3328 goto out;
3304 } else if (ret == -ENOENT) { 3329 } else if (ret == -ENOENT) {
3305 ret = 1; 3330 ret = 0;
3306 break; 3331 break;
3307 } 3332 }
3308 3333
@@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5703 NULL); 5728 NULL);
5704 sort_clone_roots = 1; 5729 sort_clone_roots = 1;
5705 5730
5706 current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; 5731 current->journal_info = BTRFS_SEND_TRANS_STUB;
5707 ret = send_subvol(sctx); 5732 ret = send_subvol(sctx);
5708 current->journal_info = NULL; 5733 current->journal_info = NULL;
5709 if (ret < 0) 5734 if (ret < 0)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8e16bca69c56..a2b97ef10317 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -60,6 +60,7 @@
60#include "backref.h" 60#include "backref.h"
61#include "tests/btrfs-tests.h" 61#include "tests/btrfs-tests.h"
62 62
63#include "qgroup.h"
63#define CREATE_TRACE_POINTS 64#define CREATE_TRACE_POINTS
64#include <trace/events/btrfs.h> 65#include <trace/events/btrfs.h>
65 66
@@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
307 308
308static void btrfs_put_super(struct super_block *sb) 309static void btrfs_put_super(struct super_block *sb)
309{ 310{
310 (void)close_ctree(btrfs_sb(sb)->tree_root); 311 close_ctree(btrfs_sb(sb)->tree_root);
311 /* FIXME: need to fix VFS to return error? */
312 /* AV: return it _where_? ->put_super() can be triggered by any number
313 * of async events, up to and including delivery of SIGKILL to the
314 * last process that kept it busy. Or segfault in the aforementioned
315 * process... Whom would you report that to?
316 */
317} 312}
318 313
319enum { 314enum {
@@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
400 int ret = 0; 395 int ret = 0;
401 char *compress_type; 396 char *compress_type;
402 bool compress_force = false; 397 bool compress_force = false;
403 bool compress = false;
404 398
405 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); 399 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
406 if (cache_gen) 400 if (cache_gen)
@@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
478 /* Fallthrough */ 472 /* Fallthrough */
479 case Opt_compress: 473 case Opt_compress:
480 case Opt_compress_type: 474 case Opt_compress_type:
481 compress = true;
482 if (token == Opt_compress || 475 if (token == Opt_compress ||
483 token == Opt_compress_force || 476 token == Opt_compress_force ||
484 strcmp(args[0].from, "zlib") == 0) { 477 strcmp(args[0].from, "zlib") == 0) {
@@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
508 btrfs_set_and_info(root, FORCE_COMPRESS, 501 btrfs_set_and_info(root, FORCE_COMPRESS,
509 "force %s compression", 502 "force %s compression",
510 compress_type); 503 compress_type);
511 } else if (compress) { 504 } else {
512 if (!btrfs_test_opt(root, COMPRESS)) 505 if (!btrfs_test_opt(root, COMPRESS))
513 btrfs_info(root->fs_info, 506 btrfs_info(root->fs_info,
514 "btrfs: use %s compression", 507 "btrfs: use %s compression",
515 compress_type); 508 compress_type);
509 /*
510 * If we remount from compress-force=xxx to
511 * compress=xxx, we need clear FORCE_COMPRESS
512 * flag, otherwise, there is no way for users
513 * to disable forcible compression separately.
514 */
515 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
516 } 516 }
517 break; 517 break;
518 case Opt_ssd: 518 case Opt_ssd:
@@ -851,7 +851,6 @@ static struct dentry *get_default_root(struct super_block *sb,
851 struct btrfs_path *path; 851 struct btrfs_path *path;
852 struct btrfs_key location; 852 struct btrfs_key location;
853 struct inode *inode; 853 struct inode *inode;
854 struct dentry *dentry;
855 u64 dir_id; 854 u64 dir_id;
856 int new = 0; 855 int new = 0;
857 856
@@ -922,13 +921,7 @@ setup_root:
922 return dget(sb->s_root); 921 return dget(sb->s_root);
923 } 922 }
924 923
925 dentry = d_obtain_alias(inode); 924 return d_obtain_root(inode);
926 if (!IS_ERR(dentry)) {
927 spin_lock(&dentry->d_lock);
928 dentry->d_flags &= ~DCACHE_DISCONNECTED;
929 spin_unlock(&dentry->d_lock);
930 }
931 return dentry;
932} 925}
933 926
934static int btrfs_fill_super(struct super_block *sb, 927static int btrfs_fill_super(struct super_block *sb,
@@ -1021,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1021 seq_puts(seq, ",nodatacow"); 1014 seq_puts(seq, ",nodatacow");
1022 if (btrfs_test_opt(root, NOBARRIER)) 1015 if (btrfs_test_opt(root, NOBARRIER))
1023 seq_puts(seq, ",nobarrier"); 1016 seq_puts(seq, ",nobarrier");
1024 if (info->max_inline != 8192 * 1024) 1017 if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
1025 seq_printf(seq, ",max_inline=%llu", info->max_inline); 1018 seq_printf(seq, ",max_inline=%llu", info->max_inline);
1026 if (info->alloc_start != 0) 1019 if (info->alloc_start != 0)
1027 seq_printf(seq, ",alloc_start=%llu", info->alloc_start); 1020 seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
@@ -1222,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
1222 return root; 1215 return root;
1223} 1216}
1224 1217
1218static int parse_security_options(char *orig_opts,
1219 struct security_mnt_opts *sec_opts)
1220{
1221 char *secdata = NULL;
1222 int ret = 0;
1223
1224 secdata = alloc_secdata();
1225 if (!secdata)
1226 return -ENOMEM;
1227 ret = security_sb_copy_data(orig_opts, secdata);
1228 if (ret) {
1229 free_secdata(secdata);
1230 return ret;
1231 }
1232 ret = security_sb_parse_opts_str(secdata, sec_opts);
1233 free_secdata(secdata);
1234 return ret;
1235}
1236
1237static int setup_security_options(struct btrfs_fs_info *fs_info,
1238 struct super_block *sb,
1239 struct security_mnt_opts *sec_opts)
1240{
1241 int ret = 0;
1242
1243 /*
1244 * Call security_sb_set_mnt_opts() to check whether the new sec_opts
1245 * is valid.
1246 */
1247 ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL);
1248 if (ret)
1249 return ret;
1250
1251#ifdef CONFIG_SECURITY
1252 if (!fs_info->security_opts.num_mnt_opts) {
1253 /* first time security setup, copy sec_opts to fs_info */
1254 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
1255 } else {
1256 /*
1257 * Since SELinux (the only LSM that supports security_mnt_opts)
1258 * does not support changing context during remount/mount of the
1259 * same sb, this must be the same or part of the same security
1260 * options; just free it.
1261 */
1262 security_free_mnt_opts(sec_opts);
1263 }
1264#endif
1265 return ret;
1266}
1267
1225/* 1268/*
1226 * Find a superblock for the given device / mount point. 1269 * Find a superblock for the given device / mount point.
1227 * 1270 *
@@ -1236,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1236 struct dentry *root; 1279 struct dentry *root;
1237 struct btrfs_fs_devices *fs_devices = NULL; 1280 struct btrfs_fs_devices *fs_devices = NULL;
1238 struct btrfs_fs_info *fs_info = NULL; 1281 struct btrfs_fs_info *fs_info = NULL;
1282 struct security_mnt_opts new_sec_opts;
1239 fmode_t mode = FMODE_READ; 1283 fmode_t mode = FMODE_READ;
1240 char *subvol_name = NULL; 1284 char *subvol_name = NULL;
1241 u64 subvol_objectid = 0; 1285 u64 subvol_objectid = 0;
@@ -1258,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1258 return root; 1302 return root;
1259 } 1303 }
1260 1304
1305 security_init_mnt_opts(&new_sec_opts);
1306 if (data) {
1307 error = parse_security_options(data, &new_sec_opts);
1308 if (error)
1309 return ERR_PTR(error);
1310 }
1311
1261 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 1312 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
1262 if (error) 1313 if (error)
1263 return ERR_PTR(error); 1314 goto error_sec_opts;
1264 1315
1265 /* 1316 /*
1266 * Setup a dummy root and fs_info for test/set super. This is because 1317 * Setup a dummy root and fs_info for test/set super. This is because
@@ -1269,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1269 * then open_ctree will properly initialize everything later. 1320 * then open_ctree will properly initialize everything later.
1270 */ 1321 */
1271 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 1322 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
1272 if (!fs_info) 1323 if (!fs_info) {
1273 return ERR_PTR(-ENOMEM); 1324 error = -ENOMEM;
1325 goto error_sec_opts;
1326 }
1274 1327
1275 fs_info->fs_devices = fs_devices; 1328 fs_info->fs_devices = fs_devices;
1276 1329
1277 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1330 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1278 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); 1331 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
1332 security_init_mnt_opts(&fs_info->security_opts);
1279 if (!fs_info->super_copy || !fs_info->super_for_commit) { 1333 if (!fs_info->super_copy || !fs_info->super_for_commit) {
1280 error = -ENOMEM; 1334 error = -ENOMEM;
1281 goto error_fs_info; 1335 goto error_fs_info;
@@ -1313,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1313 } 1367 }
1314 1368
1315 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); 1369 root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
1316 if (IS_ERR(root)) 1370 if (IS_ERR(root)) {
1317 deactivate_locked_super(s); 1371 deactivate_locked_super(s);
1372 error = PTR_ERR(root);
1373 goto error_sec_opts;
1374 }
1375
1376 fs_info = btrfs_sb(s);
1377 error = setup_security_options(fs_info, s, &new_sec_opts);
1378 if (error) {
1379 dput(root);
1380 deactivate_locked_super(s);
1381 goto error_sec_opts;
1382 }
1318 1383
1319 return root; 1384 return root;
1320 1385
@@ -1322,6 +1387,8 @@ error_close_devices:
1322 btrfs_close_devices(fs_devices); 1387 btrfs_close_devices(fs_devices);
1323error_fs_info: 1388error_fs_info:
1324 free_fs_info(fs_info); 1389 free_fs_info(fs_info);
1390error_sec_opts:
1391 security_free_mnt_opts(&new_sec_opts);
1325 return ERR_PTR(error); 1392 return ERR_PTR(error);
1326} 1393}
1327 1394
@@ -1403,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
1403 sync_filesystem(sb); 1470 sync_filesystem(sb);
1404 btrfs_remount_prepare(fs_info); 1471 btrfs_remount_prepare(fs_info);
1405 1472
1473 if (data) {
1474 struct security_mnt_opts new_sec_opts;
1475
1476 security_init_mnt_opts(&new_sec_opts);
1477 ret = parse_security_options(data, &new_sec_opts);
1478 if (ret)
1479 goto restore;
1480 ret = setup_security_options(fs_info, sb,
1481 &new_sec_opts);
1482 if (ret) {
1483 security_free_mnt_opts(&new_sec_opts);
1484 goto restore;
1485 }
1486 }
1487
1406 ret = btrfs_parse_options(root, data); 1488 ret = btrfs_parse_options(root, data);
1407 if (ret) { 1489 if (ret) {
1408 ret = -EINVAL; 1490 ret = -EINVAL;
@@ -1672,6 +1754,21 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1672 return 0; 1754 return 0;
1673} 1755}
1674 1756
1757/*
1758 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
1759 *
1760 * If there's a redundant raid level at DATA block groups, use the respective
1761 * multiplier to scale the sizes.
1762 *
1763 * Unused device space usage is based on simulating the chunk allocator
1764 * algorithm that respects the device sizes, order of allocations and the
1765 * 'alloc_start' value; this is a close approximation of the actual use, but
1766 * there are other factors that may change the result (like a new metadata
1767 * chunk).
1768 *
1769 * FIXME: not accurate for mixed block groups, total and free/used are ok,
1770 * available appears slightly larger.
1771 */
1675static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1772static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1676{ 1773{
1677 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); 1774 struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
@@ -1682,36 +1779,66 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1682 u64 total_free_data = 0; 1779 u64 total_free_data = 0;
1683 int bits = dentry->d_sb->s_blocksize_bits; 1780 int bits = dentry->d_sb->s_blocksize_bits;
1684 __be32 *fsid = (__be32 *)fs_info->fsid; 1781 __be32 *fsid = (__be32 *)fs_info->fsid;
1782 unsigned factor = 1;
1783 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
1685 int ret; 1784 int ret;
1686 1785
1687 /* holding chunk_muext to avoid allocating new chunks */ 1786 /*
 1787 * holding chunk_mutex to avoid allocating new chunks, holding
1788 * device_list_mutex to avoid the device being removed
1789 */
1790 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1688 mutex_lock(&fs_info->chunk_mutex); 1791 mutex_lock(&fs_info->chunk_mutex);
1689 rcu_read_lock(); 1792 rcu_read_lock();
1690 list_for_each_entry_rcu(found, head, list) { 1793 list_for_each_entry_rcu(found, head, list) {
1691 if (found->flags & BTRFS_BLOCK_GROUP_DATA) { 1794 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1795 int i;
1796
1692 total_free_data += found->disk_total - found->disk_used; 1797 total_free_data += found->disk_total - found->disk_used;
1693 total_free_data -= 1798 total_free_data -=
1694 btrfs_account_ro_block_groups_free_space(found); 1799 btrfs_account_ro_block_groups_free_space(found);
1800
1801 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1802 if (!list_empty(&found->block_groups[i])) {
1803 switch (i) {
1804 case BTRFS_RAID_DUP:
1805 case BTRFS_RAID_RAID1:
1806 case BTRFS_RAID_RAID10:
1807 factor = 2;
1808 }
1809 }
1810 }
1695 } 1811 }
1696 1812
1697 total_used += found->disk_used; 1813 total_used += found->disk_used;
1698 } 1814 }
1815
1699 rcu_read_unlock(); 1816 rcu_read_unlock();
1700 1817
1701 buf->f_namelen = BTRFS_NAME_LEN; 1818 buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
1702 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1819 buf->f_blocks >>= bits;
1703 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1820 buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);
1704 buf->f_bsize = dentry->d_sb->s_blocksize; 1821
1705 buf->f_type = BTRFS_SUPER_MAGIC; 1822 /* Account global block reserve as used, it's in logical size already */
1823 spin_lock(&block_rsv->lock);
1824 buf->f_bfree -= block_rsv->size >> bits;
1825 spin_unlock(&block_rsv->lock);
1826
1706 buf->f_bavail = total_free_data; 1827 buf->f_bavail = total_free_data;
1707 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); 1828 ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
1708 if (ret) { 1829 if (ret) {
1709 mutex_unlock(&fs_info->chunk_mutex); 1830 mutex_unlock(&fs_info->chunk_mutex);
1831 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1710 return ret; 1832 return ret;
1711 } 1833 }
1712 buf->f_bavail += total_free_data; 1834 buf->f_bavail += div_u64(total_free_data, factor);
1713 buf->f_bavail = buf->f_bavail >> bits; 1835 buf->f_bavail = buf->f_bavail >> bits;
1714 mutex_unlock(&fs_info->chunk_mutex); 1836 mutex_unlock(&fs_info->chunk_mutex);
1837 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1838
1839 buf->f_type = BTRFS_SUPER_MAGIC;
1840 buf->f_bsize = dentry->d_sb->s_blocksize;
1841 buf->f_namelen = BTRFS_NAME_LEN;
1715 1842
1716 /* We treat it as constant endianness (it doesn't matter _which_) 1843 /* We treat it as constant endianness (it doesn't matter _which_)
1717 because we want the fsid to come out the same whether mounted 1844 because we want the fsid to come out the same whether mounted
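The statfs() changes above scale raw on-disk byte counts by a redundancy factor before converting them to block counts, and subtract the global block reserve (which is already a logical size). A self-contained sketch of that arithmetic, with factor 2 standing in for DUP/RAID1/RAID10 and all values invented:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned factor = 2;			/* DUP/RAID1/RAID10 store two copies */
	int bits = 12;				/* 4 KiB blocks */
	uint64_t total_bytes = 100ULL << 30;	/* raw device bytes */
	uint64_t total_used  = 30ULL << 30;	/* raw bytes used on disk */
	uint64_t rsv_size    = 512ULL << 20;	/* global block reserve, logical */

	uint64_t f_blocks = (total_bytes / factor) >> bits;
	uint64_t f_bfree  = f_blocks - ((total_used / factor) >> bits);

	f_bfree -= rsv_size >> bits;		/* reserve counted as used */

	printf("blocks=%llu bfree=%llu\n",
	       (unsigned long long)f_blocks,
	       (unsigned long long)f_bfree);
	return 0;
}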
@@ -1737,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = {
1737 .name = "btrfs", 1864 .name = "btrfs",
1738 .mount = btrfs_mount, 1865 .mount = btrfs_mount,
1739 .kill_sb = btrfs_kill_super, 1866 .kill_sb = btrfs_kill_super,
1740 .fs_flags = FS_REQUIRES_DEV, 1867 .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
1741}; 1868};
1742MODULE_ALIAS_FS("btrfs"); 1869MODULE_ALIAS_FS("btrfs");
1743 1870
@@ -1961,11 +2088,15 @@ static int __init init_btrfs_fs(void)
1961 2088
1962 err = btrfs_prelim_ref_init(); 2089 err = btrfs_prelim_ref_init();
1963 if (err) 2090 if (err)
2091 goto free_delayed_ref;
2092
2093 err = btrfs_end_io_wq_init();
2094 if (err)
1964 goto free_prelim_ref; 2095 goto free_prelim_ref;
1965 2096
1966 err = btrfs_interface_init(); 2097 err = btrfs_interface_init();
1967 if (err) 2098 if (err)
1968 goto free_delayed_ref; 2099 goto free_end_io_wq;
1969 2100
1970 btrfs_init_lockdep(); 2101 btrfs_init_lockdep();
1971 2102
@@ -1983,6 +2114,8 @@ static int __init init_btrfs_fs(void)
1983 2114
1984unregister_ioctl: 2115unregister_ioctl:
1985 btrfs_interface_exit(); 2116 btrfs_interface_exit();
2117free_end_io_wq:
2118 btrfs_end_io_wq_exit();
1986free_prelim_ref: 2119free_prelim_ref:
1987 btrfs_prelim_ref_exit(); 2120 btrfs_prelim_ref_exit();
1988free_delayed_ref: 2121free_delayed_ref:
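The init_btrfs_fs() hunk inserts btrfs_end_io_wq_init() into the middle of an init sequence, which means both a new success-path step and a new unwind label that tears it down. The pattern in miniature, with hypothetical init/exit pairs rather than the btrfs ones:

#include <stdio.h>

static int init_a(void) { return 0; }	/* stand-ins for the real init calls */
static int init_b(void) { return 0; }
static int init_c(void) { return -1; }	/* pretend the newest step fails */
static void exit_a(void) { puts("undo a"); }
static void exit_b(void) { puts("undo b"); }

static int demo_init(void)
{
	int err;

	err = init_a();
	if (err)
		return err;
	err = init_b();
	if (err)
		goto free_a;
	err = init_c();
	if (err)
		goto free_b;		/* unwind in reverse order of setup */
	return 0;

free_b:
	exit_b();
free_a:
	exit_a();
	return err;
}

int main(void) { return demo_init() ? 1 : 0; }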
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 78699364f537..b2e7bb4393f6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj,
242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 242 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); 243 return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf);
244} 244}
245BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); 245BTRFS_ATTR(global_rsv_size, global_rsv_size_show);
246 246
247static ssize_t global_rsv_reserved_show(struct kobject *kobj, 247static ssize_t global_rsv_reserved_show(struct kobject *kobj,
248 struct kobj_attribute *a, char *buf) 248 struct kobj_attribute *a, char *buf)
@@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj,
251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 251 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); 252 return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf);
253} 253}
254BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); 254BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show);
255 255
256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) 256#define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj)
257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) 257#define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj)
@@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \
306 struct btrfs_space_info *sinfo = to_space_info(kobj); \ 306 struct btrfs_space_info *sinfo = to_space_info(kobj); \
307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ 307 return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \
308} \ 308} \
309BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) 309BTRFS_ATTR(field, btrfs_space_info_show_##field)
310 310
311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, 311static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj,
312 struct kobj_attribute *a, 312 struct kobj_attribute *a,
@@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved);
325SPACE_INFO_ATTR(bytes_may_use); 325SPACE_INFO_ATTR(bytes_may_use);
326SPACE_INFO_ATTR(disk_used); 326SPACE_INFO_ATTR(disk_used);
327SPACE_INFO_ATTR(disk_total); 327SPACE_INFO_ATTR(disk_total);
328BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); 328BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
329 329
330static struct attribute *space_info_attrs[] = { 330static struct attribute *space_info_attrs[] = {
331 BTRFS_ATTR_PTR(flags), 331 BTRFS_ATTR_PTR(flags),
@@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
363 struct kobj_attribute *a, char *buf) 363 struct kobj_attribute *a, char *buf)
364{ 364{
365 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 365 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
366 return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); 366 char *label = fs_info->super_copy->label;
367 return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
367} 368}
368 369
369static ssize_t btrfs_label_store(struct kobject *kobj, 370static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_trans_handle *trans; 375 struct btrfs_trans_handle *trans;
375 struct btrfs_root *root = fs_info->fs_root; 376 struct btrfs_root *root = fs_info->fs_root;
376 int ret; 377 int ret;
378 size_t p_len;
377 379
378 if (len >= BTRFS_LABEL_SIZE) 380 if (fs_info->sb->s_flags & MS_RDONLY)
381 return -EROFS;
382
383 /*
 384 * p_len is the length up to the first occurrence of either
385 * '\n' or '\0'
386 */
387 p_len = strcspn(buf, "\n");
388
389 if (p_len >= BTRFS_LABEL_SIZE)
379 return -EINVAL; 390 return -EINVAL;
380 391
381 trans = btrfs_start_transaction(root, 0); 392 trans = btrfs_start_transaction(root, 0);
@@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
383 return PTR_ERR(trans); 394 return PTR_ERR(trans);
384 395
385 spin_lock(&root->fs_info->super_lock); 396 spin_lock(&root->fs_info->super_lock);
386 strcpy(fs_info->super_copy->label, buf); 397 memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE);
398 memcpy(fs_info->super_copy->label, buf, p_len);
387 spin_unlock(&root->fs_info->super_lock); 399 spin_unlock(&root->fs_info->super_lock);
388 ret = btrfs_commit_transaction(trans, root); 400 ret = btrfs_commit_transaction(trans, root);
389 401
@@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
392 404
393 return ret; 405 return ret;
394} 406}
395BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); 407BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store);
396
397static ssize_t btrfs_no_store(struct kobject *kobj,
398 struct kobj_attribute *a,
399 const char *buf, size_t len)
400{
401 return -EPERM;
402}
403 408
404static ssize_t btrfs_nodesize_show(struct kobject *kobj, 409static ssize_t btrfs_nodesize_show(struct kobject *kobj,
405 struct kobj_attribute *a, char *buf) 410 struct kobj_attribute *a, char *buf)
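The label store path above now measures the label with strcspn() so a trailing newline from `echo` is never written into the superblock, and clears the field before copying so a shorter label cannot leave stale bytes behind. A user-space sketch of the same parsing, assuming a 256-byte label field like BTRFS_LABEL_SIZE:

#include <stdio.h>
#include <string.h>

#define LABEL_SIZE 256	/* assumption: mirrors BTRFS_LABEL_SIZE */

static int set_label(char label[LABEL_SIZE], const char *buf)
{
	/* length up to the first '\n' or the terminating '\0' */
	size_t p_len = strcspn(buf, "\n");

	if (p_len >= LABEL_SIZE)
		return -1;		/* would leave no room for '\0' */

	memset(label, 0, LABEL_SIZE);	/* clear any previous, longer label */
	memcpy(label, buf, p_len);
	return 0;
}

int main(void)
{
	char label[LABEL_SIZE];

	if (set_label(label, "mydisk\n") == 0)
		printf("label: '%s'\n", label);	/* stored without the newline */
	return 0;
}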
@@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
409 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); 414 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
410} 415}
411 416
412BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); 417BTRFS_ATTR(nodesize, btrfs_nodesize_show);
413 418
414static ssize_t btrfs_sectorsize_show(struct kobject *kobj, 419static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
415 struct kobj_attribute *a, char *buf) 420 struct kobj_attribute *a, char *buf)
@@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
419 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 424 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
420} 425}
421 426
422BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); 427BTRFS_ATTR(sectorsize, btrfs_sectorsize_show);
423 428
424static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, 429static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
425 struct kobj_attribute *a, char *buf) 430 struct kobj_attribute *a, char *buf)
@@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
429 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); 434 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize);
430} 435}
431 436
432BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); 437BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show);
433 438
434static struct attribute *btrfs_attrs[] = { 439static struct attribute *btrfs_attrs[] = {
435 BTRFS_ATTR_PTR(label), 440 BTRFS_ATTR_PTR(label),
@@ -614,7 +619,7 @@ int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info,
614 if (!fs_info->device_dir_kobj) 619 if (!fs_info->device_dir_kobj)
615 return -EINVAL; 620 return -EINVAL;
616 621
617 if (one_device) { 622 if (one_device && one_device->bdev) {
618 disk = one_device->bdev->bd_part; 623 disk = one_device->bdev->bd_part;
619 disk_kobj = &part_to_dev(disk)->kobj; 624 disk_kobj = &part_to_dev(disk)->kobj;
620 625
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index ac46df37504c..f7dd298b3cf6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -20,16 +20,20 @@ enum btrfs_feature_set {
20 .store = _store, \ 20 .store = _store, \
21} 21}
22 22
23#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ 23#define BTRFS_ATTR_RW(_name, _show, _store) \
24static struct kobj_attribute btrfs_attr_##_name = \ 24 static struct kobj_attribute btrfs_attr_##_name = \
25 __INIT_KOBJ_ATTR(_name, _mode, _show, _store) 25 __INIT_KOBJ_ATTR(_name, 0644, _show, _store)
26#define BTRFS_ATTR(_name, _mode, _show) \ 26
27 BTRFS_ATTR_RW(_name, _mode, _show, NULL) 27#define BTRFS_ATTR(_name, _show) \
28 static struct kobj_attribute btrfs_attr_##_name = \
29 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
30
28#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) 31#define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr)
29 32
30#define BTRFS_RAID_ATTR(_name, _show) \ 33#define BTRFS_RAID_ATTR(_name, _show) \
31static struct kobj_attribute btrfs_raid_attr_##_name = \ 34 static struct kobj_attribute btrfs_raid_attr_##_name = \
32 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) 35 __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)
36
33#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) 37#define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr)
34 38
35 39
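After this header change the permission mode is implied by the macro name (BTRFS_ATTR is always 0444, BTRFS_ATTR_RW always 0644) instead of being passed by every caller, which also removes the need for the btrfs_no_store() stub. A reduced model of the two macros and what one use expands to (struct fields simplified; this is not the kernel's kobj_attribute):

#include <stdio.h>

struct demo_attr {
	const char *name;
	unsigned int mode;
	const char *(*show)(void);
	int (*store)(const char *);
};

#define DEMO_ATTR(_name, _show) \
	static struct demo_attr demo_attr_##_name = \
		{ #_name, 0444, _show, NULL }	/* read-only by construction */

#define DEMO_ATTR_RW(_name, _show, _store) \
	static struct demo_attr demo_attr_##_name = \
		{ #_name, 0644, _show, _store }	/* writable attributes */

static const char *nodesize_show(void) { return "16384\n"; }
DEMO_ATTR(nodesize, nodesize_show);	/* no store callback needed */

int main(void)
{
	printf("%s mode=%o -> %s", demo_attr_nodesize.name,
	       demo_attr_nodesize.mode, demo_attr_nodesize.show());
	return 0;
}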
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index c8d9ddf84c69..2299bfde39ee 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
40 cache->key.offset = 1024 * 1024 * 1024; 40 cache->key.offset = 1024 * 1024 * 1024;
41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 41 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
42 cache->sectorsize = 4096; 42 cache->sectorsize = 4096;
43 cache->full_stripe_len = 4096;
43 44
44 spin_lock_init(&cache->lock); 45 spin_lock_init(&cache->lock);
45 INIT_LIST_HEAD(&cache->list); 46 INIT_LIST_HEAD(&cache->list);
46 INIT_LIST_HEAD(&cache->cluster_list); 47 INIT_LIST_HEAD(&cache->cluster_list);
47 INIT_LIST_HEAD(&cache->new_bg_list); 48 INIT_LIST_HEAD(&cache->bg_list);
48 49
49 btrfs_init_free_space_ctl(cache); 50 btrfs_init_free_space_ctl(cache);
50 51
@@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
364 return 0; 365 return 0;
365} 366}
366 367
368/* Used by test_steal_space_from_bitmap_to_extent(). */
369static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl,
370 struct btrfs_free_space *info)
371{
372 return ctl->free_extents > 0;
373}
374
375/* Used by test_steal_space_from_bitmap_to_extent(). */
376static int
377check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache,
378 const int num_extents,
379 const int num_bitmaps)
380{
381 if (cache->free_space_ctl->free_extents != num_extents) {
382 test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n",
383 cache->free_space_ctl->free_extents, num_extents);
384 return -EINVAL;
385 }
386 if (cache->free_space_ctl->total_bitmaps != num_bitmaps) {
 387 test_msg("Incorrect # of bitmap entries in the cache: %d, expected %d\n",
388 cache->free_space_ctl->total_bitmaps, num_bitmaps);
389 return -EINVAL;
390 }
391 return 0;
392}
393
394/* Used by test_steal_space_from_bitmap_to_extent(). */
395static int check_cache_empty(struct btrfs_block_group_cache *cache)
396{
397 u64 offset;
398 u64 max_extent_size;
399
400 /*
 401 * Now let's confirm that there's absolutely no free space left to
402 * allocate.
403 */
404 if (cache->free_space_ctl->free_space != 0) {
405 test_msg("Cache free space is not 0\n");
406 return -EINVAL;
407 }
408
409 /* And any allocation request, no matter how small, should fail now. */
410 offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0,
411 &max_extent_size);
412 if (offset != 0) {
 413 test_msg("Space allocation did not fail, returned offset: %llu\n",
414 offset);
415 return -EINVAL;
416 }
417
418 /* And no extent nor bitmap entries in the cache anymore. */
419 return check_num_extents_and_bitmaps(cache, 0, 0);
420}
421
422/*
423 * Before we were able to steal free space from a bitmap entry to an extent
424 * entry, we could end up with 2 entries representing a contiguous free space.
425 * One would be an extent entry and the other a bitmap entry. Since in order
426 * to allocate space to a caller we use only 1 entry, we couldn't return that
427 * whole range to the caller if it was requested. This forced the caller to
428 * either assume ENOSPC or perform several smaller space allocations, which
429 * wasn't optimal as they could be spread all over the block group while under
430 * concurrency (extra overhead and fragmentation).
431 *
 432 * This stealing approach is beneficial, since we always prefer to allocate from
433 * extent entries, both for clustered and non-clustered allocation requests.
434 */
435static int
436test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
437{
438 int ret;
439 u64 offset;
440 u64 max_extent_size;
441
442 bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
443 struct btrfs_free_space *);
444
445 test_msg("Running space stealing from bitmap to extent\n");
446
447 /*
448 * For this test, we want to ensure we end up with an extent entry
449 * immediately adjacent to a bitmap entry, where the bitmap starts
450 * at an offset where the extent entry ends. We keep adding and
451 * removing free space to reach into this state, but to get there
452 * we need to reach a point where marking new free space doesn't
453 * result in adding new extent entries or merging the new space
454 * with existing extent entries - the space ends up being marked
455 * in an existing bitmap that covers the new free space range.
456 *
 457 * To get there, we need to reach the threshold defined at
 458 * cache->free_space_ctl->extents_thresh, which currently is
 459 * 256 extents on an x86_64 system at least, and a few other
460 * conditions (check free_space_cache.c). Instead of making the
461 * test much longer and complicated, use a "use_bitmap" operation
462 * that forces use of bitmaps as soon as we have at least 1
463 * extent entry.
464 */
465 use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
466 cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
467
468 /*
469 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
470 */
471 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
472 128 * 1024, 0);
473 if (ret) {
474 test_msg("Couldn't add extent entry %d\n", ret);
475 return ret;
476 }
477
478 /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
479 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
480 128 * 1024 * 1024 - 512 * 1024, 1);
481 if (ret) {
482 test_msg("Couldn't add bitmap entry %d\n", ret);
483 return ret;
484 }
485
486 ret = check_num_extents_and_bitmaps(cache, 2, 1);
487 if (ret)
488 return ret;
489
490 /*
491 * Now make only the first 256Kb of the bitmap marked as free, so that
492 * we end up with only the following ranges marked as free space:
493 *
494 * [128Mb - 256Kb, 128Mb - 128Kb[
495 * [128Mb + 512Kb, 128Mb + 768Kb[
496 */
497 ret = btrfs_remove_free_space(cache,
498 128 * 1024 * 1024 + 768 * 1024,
499 128 * 1024 * 1024 - 768 * 1024);
500 if (ret) {
501 test_msg("Failed to free part of bitmap space %d\n", ret);
502 return ret;
503 }
504
505 /* Confirm that only those 2 ranges are marked as free. */
506 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
507 128 * 1024)) {
508 test_msg("Free space range missing\n");
509 return -ENOENT;
510 }
511 if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
512 256 * 1024)) {
513 test_msg("Free space range missing\n");
514 return -ENOENT;
515 }
516
517 /*
518 * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
519 * as free anymore.
520 */
521 if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
522 128 * 1024 * 1024 - 768 * 1024)) {
523 test_msg("Bitmap region not removed from space cache\n");
524 return -EINVAL;
525 }
526
527 /*
528 * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
529 * covered by the bitmap, isn't marked as free.
530 */
531 if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
532 256 * 1024)) {
533 test_msg("Invalid bitmap region marked as free\n");
534 return -EINVAL;
535 }
536
537 /*
538 * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
539 * by the bitmap too, isn't marked as free either.
540 */
541 if (test_check_exists(cache, 128 * 1024 * 1024,
542 256 * 1024)) {
543 test_msg("Invalid bitmap region marked as free\n");
544 return -EINVAL;
545 }
546
547 /*
 548 * Now let's mark the region [128Mb, 128Mb + 512Kb[ as free too. But,
 549 * let's make sure the free space cache marks it as free in the bitmap,
550 * and doesn't insert a new extent entry to represent this region.
551 */
552 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
553 if (ret) {
554 test_msg("Error adding free space: %d\n", ret);
555 return ret;
556 }
557 /* Confirm the region is marked as free. */
558 if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
559 test_msg("Bitmap region not marked as free\n");
560 return -ENOENT;
561 }
562
563 /*
564 * Confirm that no new extent entries or bitmap entries were added to
565 * the cache after adding that free space region.
566 */
567 ret = check_num_extents_and_bitmaps(cache, 2, 1);
568 if (ret)
569 return ret;
570
571 /*
 572 * Now let's add a small free space region to the right of the previous
573 * one, which is not contiguous with it and is part of the bitmap too.
574 * The goal is to test that the bitmap entry space stealing doesn't
575 * steal this space region.
576 */
577 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
578 4096);
579 if (ret) {
580 test_msg("Error adding free space: %d\n", ret);
581 return ret;
582 }
583
584 /*
585 * Confirm that no new extent entries or bitmap entries were added to
586 * the cache after adding that free space region.
587 */
588 ret = check_num_extents_and_bitmaps(cache, 2, 1);
589 if (ret)
590 return ret;
591
592 /*
593 * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will
594 * expand the range covered by the existing extent entry that represents
595 * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
596 */
597 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
598 128 * 1024);
599 if (ret) {
600 test_msg("Error adding free space: %d\n", ret);
601 return ret;
602 }
603 /* Confirm the region is marked as free. */
604 if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
605 128 * 1024)) {
606 test_msg("Extent region not marked as free\n");
607 return -ENOENT;
608 }
609
610 /*
 611 * Confirm that our extent entry didn't steal all free space from the
612 * bitmap, because of the small 4Kb free space region.
613 */
614 ret = check_num_extents_and_bitmaps(cache, 2, 1);
615 if (ret)
616 return ret;
617
618 /*
619 * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free
620 * space. Without stealing bitmap free space into extent entry space,
621 * we would have all this free space represented by 2 entries in the
622 * cache:
623 *
624 * extent entry covering range: [128Mb - 256Kb, 128Mb[
625 * bitmap entry covering range: [128Mb, 128Mb + 768Kb[
626 *
627 * Attempting to allocate the whole free space (1Mb) would fail, because
628 * we can't allocate from multiple entries.
629 * With the bitmap free space stealing, we get a single extent entry
630 * that represents the 1Mb free space, and therefore we're able to
631 * allocate the whole free space at once.
632 */
633 if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
634 1 * 1024 * 1024)) {
635 test_msg("Expected region not marked as free\n");
636 return -ENOENT;
637 }
638
639 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
640 test_msg("Cache free space is not 1Mb + 4Kb\n");
641 return -EINVAL;
642 }
643
644 offset = btrfs_find_space_for_alloc(cache,
645 0, 1 * 1024 * 1024, 0,
646 &max_extent_size);
647 if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
648 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
649 offset);
650 return -EINVAL;
651 }
652
653 /* All that remains is a 4Kb free space region in a bitmap. Confirm. */
654 ret = check_num_extents_and_bitmaps(cache, 1, 1);
655 if (ret)
656 return ret;
657
658 if (cache->free_space_ctl->free_space != 4096) {
659 test_msg("Cache free space is not 4Kb\n");
660 return -EINVAL;
661 }
662
663 offset = btrfs_find_space_for_alloc(cache,
664 0, 4096, 0,
665 &max_extent_size);
666 if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
667 test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
668 offset);
669 return -EINVAL;
670 }
671
672 ret = check_cache_empty(cache);
673 if (ret)
674 return ret;
675
676 __btrfs_remove_free_space_cache(cache->free_space_ctl);
677
678 /*
679 * Now test a similar scenario, but where our extent entry is located
680 * to the right of the bitmap entry, so that we can check that stealing
681 * space from a bitmap to the front of an extent entry works.
682 */
683
684 /*
685 * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
686 */
687 ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
688 128 * 1024, 0);
689 if (ret) {
690 test_msg("Couldn't add extent entry %d\n", ret);
691 return ret;
692 }
693
694 /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
695 ret = test_add_free_space_entry(cache, 0,
696 128 * 1024 * 1024 - 512 * 1024, 1);
697 if (ret) {
698 test_msg("Couldn't add bitmap entry %d\n", ret);
699 return ret;
700 }
701
702 ret = check_num_extents_and_bitmaps(cache, 2, 1);
703 if (ret)
704 return ret;
705
706 /*
707 * Now make only the last 256Kb of the bitmap marked as free, so that
708 * we end up with only the following ranges marked as free space:
709 *
 710 * [128Mb + 128Kb, 128Mb + 256Kb[
711 * [128Mb - 768Kb, 128Mb - 512Kb[
712 */
713 ret = btrfs_remove_free_space(cache,
714 0,
715 128 * 1024 * 1024 - 768 * 1024);
716 if (ret) {
717 test_msg("Failed to free part of bitmap space %d\n", ret);
718 return ret;
719 }
720
721 /* Confirm that only those 2 ranges are marked as free. */
722 if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
723 128 * 1024)) {
724 test_msg("Free space range missing\n");
725 return -ENOENT;
726 }
727 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
728 256 * 1024)) {
729 test_msg("Free space range missing\n");
730 return -ENOENT;
731 }
732
733 /*
734 * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
735 * as free anymore.
736 */
737 if (test_check_exists(cache, 0,
738 128 * 1024 * 1024 - 768 * 1024)) {
739 test_msg("Bitmap region not removed from space cache\n");
740 return -EINVAL;
741 }
742
743 /*
744 * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
745 * covered by the bitmap, isn't marked as free.
746 */
747 if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
748 512 * 1024)) {
749 test_msg("Invalid bitmap region marked as free\n");
750 return -EINVAL;
751 }
752
753 /*
 754 * Now let's mark the region [128Mb - 512Kb, 128Mb[ as free too. But,
 755 * let's make sure the free space cache marks it as free in the bitmap,
756 * and doesn't insert a new extent entry to represent this region.
757 */
758 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
759 512 * 1024);
760 if (ret) {
761 test_msg("Error adding free space: %d\n", ret);
762 return ret;
763 }
764 /* Confirm the region is marked as free. */
765 if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
766 512 * 1024)) {
767 test_msg("Bitmap region not marked as free\n");
768 return -ENOENT;
769 }
770
771 /*
772 * Confirm that no new extent entries or bitmap entries were added to
773 * the cache after adding that free space region.
774 */
775 ret = check_num_extents_and_bitmaps(cache, 2, 1);
776 if (ret)
777 return ret;
778
779 /*
 780 * Now let's add a small free space region to the left of the previous
781 * one, which is not contiguous with it and is part of the bitmap too.
782 * The goal is to test that the bitmap entry space stealing doesn't
783 * steal this space region.
784 */
785 ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
786 if (ret) {
787 test_msg("Error adding free space: %d\n", ret);
788 return ret;
789 }
790
791 /*
792 * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will
793 * expand the range covered by the existing extent entry that represents
794 * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
795 */
796 ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
797 if (ret) {
798 test_msg("Error adding free space: %d\n", ret);
799 return ret;
800 }
801 /* Confirm the region is marked as free. */
802 if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
803 test_msg("Extent region not marked as free\n");
804 return -ENOENT;
805 }
806
807 /*
 808 * Confirm that our extent entry didn't steal all free space from the
809 * bitmap, because of the small 8Kb free space region.
810 */
811 ret = check_num_extents_and_bitmaps(cache, 2, 1);
812 if (ret)
813 return ret;
814
815 /*
816 * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free
817 * space. Without stealing bitmap free space into extent entry space,
818 * we would have all this free space represented by 2 entries in the
819 * cache:
820 *
821 * extent entry covering range: [128Mb, 128Mb + 256Kb[
822 * bitmap entry covering range: [128Mb - 768Kb, 128Mb[
823 *
824 * Attempting to allocate the whole free space (1Mb) would fail, because
825 * we can't allocate from multiple entries.
826 * With the bitmap free space stealing, we get a single extent entry
827 * that represents the 1Mb free space, and therefore we're able to
828 * allocate the whole free space at once.
829 */
830 if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
831 1 * 1024 * 1024)) {
832 test_msg("Expected region not marked as free\n");
833 return -ENOENT;
834 }
835
836 if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
837 test_msg("Cache free space is not 1Mb + 8Kb\n");
838 return -EINVAL;
839 }
840
841 offset = btrfs_find_space_for_alloc(cache,
842 0, 1 * 1024 * 1024, 0,
843 &max_extent_size);
844 if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
845 test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
846 offset);
847 return -EINVAL;
848 }
849
 850 /* All that remains is an 8Kb free space region in a bitmap. Confirm. */
851 ret = check_num_extents_and_bitmaps(cache, 1, 1);
852 if (ret)
853 return ret;
854
855 if (cache->free_space_ctl->free_space != 8192) {
856 test_msg("Cache free space is not 8Kb\n");
857 return -EINVAL;
858 }
859
860 offset = btrfs_find_space_for_alloc(cache,
861 0, 8192, 0,
862 &max_extent_size);
863 if (offset != (32 * 1024 * 1024)) {
864 test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
865 offset);
866 return -EINVAL;
867 }
868
869 ret = check_cache_empty(cache);
870 if (ret)
871 return ret;
872
873 cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
874 __btrfs_remove_free_space_cache(cache->free_space_ctl);
875
876 return 0;
877}
878
367int btrfs_test_free_space_cache(void) 879int btrfs_test_free_space_cache(void)
368{ 880{
369 struct btrfs_block_group_cache *cache; 881 struct btrfs_block_group_cache *cache;
@@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void)
386 ret = test_bitmaps_and_extents(cache); 898 ret = test_bitmaps_and_extents(cache);
387 if (ret) 899 if (ret)
388 goto out; 900 goto out;
901
902 ret = test_steal_space_from_bitmap_to_extent(cache);
389out: 903out:
390 __btrfs_remove_free_space_cache(cache->free_space_ctl); 904 __btrfs_remove_free_space_cache(cache->free_space_ctl);
391 kfree(cache->free_space_ctl); 905 kfree(cache->free_space_ctl);
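The new test above temporarily swaps the free-space ctl's ->use_bitmap op for a stub that forces bitmap use as soon as one extent entry exists, runs its scenario, then restores the saved pointer. The same seam-injection pattern in miniature, with made-up types rather than the btrfs ones:

#include <stdbool.h>
#include <stdio.h>

struct ctl_ops { bool (*use_bitmap)(int free_extents); };
struct ctl { struct ctl_ops *op; int free_extents; };

static bool real_use_bitmap(int n) { return n > 256; }	/* threshold-based */
static bool test_use_bitmap(int n) { return n > 0; }	/* force bitmaps early */

static struct ctl_ops ops = { real_use_bitmap };

int main(void)
{
	struct ctl c = { &ops, 1 };
	bool (*saved)(int) = c.op->use_bitmap;

	c.op->use_bitmap = test_use_bitmap;	/* inject the test behaviour */
	printf("during test: %d\n", c.op->use_bitmap(c.free_extents));

	c.op->use_bitmap = saved;		/* restore before returning */
	printf("after test:  %d\n", c.op->use_bitmap(c.free_extents));
	return 0;
}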
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5f379affdf23..dcaae3616728 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -218,7 +218,6 @@ loop:
218 spin_lock_init(&cur_trans->delayed_refs.lock); 218 spin_lock_init(&cur_trans->delayed_refs.lock);
219 219
220 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 220 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
221 INIT_LIST_HEAD(&cur_trans->ordered_operations);
222 INIT_LIST_HEAD(&cur_trans->pending_chunks); 221 INIT_LIST_HEAD(&cur_trans->pending_chunks);
223 INIT_LIST_HEAD(&cur_trans->switch_commits); 222 INIT_LIST_HEAD(&cur_trans->switch_commits);
224 list_add_tail(&cur_trans->list, &fs_info->trans_list); 223 list_add_tail(&cur_trans->list, &fs_info->trans_list);
@@ -387,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
387 int ret; 386 int ret;
388 387
389 /* Send isn't supposed to start transactions. */ 388 /* Send isn't supposed to start transactions. */
390 ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); 389 ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
391 390
392 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) 391 if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
393 return ERR_PTR(-EROFS); 392 return ERR_PTR(-EROFS);
@@ -409,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
409 if (num_items > 0 && root != root->fs_info->chunk_root) { 408 if (num_items > 0 && root != root->fs_info->chunk_root) {
410 if (root->fs_info->quota_enabled && 409 if (root->fs_info->quota_enabled &&
411 is_fstree(root->root_key.objectid)) { 410 is_fstree(root->root_key.objectid)) {
412 qgroup_reserved = num_items * root->leafsize; 411 qgroup_reserved = num_items * root->nodesize;
413 ret = btrfs_qgroup_reserve(root, qgroup_reserved); 412 ret = btrfs_qgroup_reserve(root, qgroup_reserved);
414 if (ret) 413 if (ret)
415 return ERR_PTR(ret); 414 return ERR_PTR(ret);
@@ -419,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
419 /* 418 /*
420 * Do the reservation for the relocation root creation 419 * Do the reservation for the relocation root creation
421 */ 420 */
422 if (unlikely(need_reserve_reloc_root(root))) { 421 if (need_reserve_reloc_root(root)) {
423 num_bytes += root->nodesize; 422 num_bytes += root->nodesize;
424 reloc_reserved = true; 423 reloc_reserved = true;
425 } 424 }
@@ -610,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
610 if (transid <= root->fs_info->last_trans_committed) 609 if (transid <= root->fs_info->last_trans_committed)
611 goto out; 610 goto out;
612 611
613 ret = -EINVAL;
614 /* find specified transaction */ 612 /* find specified transaction */
615 spin_lock(&root->fs_info->trans_lock); 613 spin_lock(&root->fs_info->trans_lock);
616 list_for_each_entry(t, &root->fs_info->trans_list, list) { 614 list_for_each_entry(t, &root->fs_info->trans_list, list) {
@@ -626,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
626 } 624 }
627 } 625 }
628 spin_unlock(&root->fs_info->trans_lock); 626 spin_unlock(&root->fs_info->trans_lock);
629 /* The specified transaction doesn't exist */ 627
630 if (!cur_trans) 628 /*
629 * The specified transaction doesn't exist, or we
630 * raced with btrfs_commit_transaction
631 */
632 if (!cur_trans) {
633 if (transid > root->fs_info->last_trans_committed)
634 ret = -EINVAL;
631 goto out; 635 goto out;
636 }
632 } else { 637 } else {
633 /* find newest transaction that is committing | committed */ 638 /* find newest transaction that is committing | committed */
634 spin_lock(&root->fs_info->trans_lock); 639 spin_lock(&root->fs_info->trans_lock);
@@ -852,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
852 struct extent_state *cached_state = NULL; 857 struct extent_state *cached_state = NULL;
853 u64 start = 0; 858 u64 start = 0;
854 u64 end; 859 u64 end;
860 struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
861 bool errors = false;
855 862
856 while (!find_first_extent_bit(dirty_pages, start, &start, &end, 863 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
857 EXTENT_NEED_WAIT, &cached_state)) { 864 EXTENT_NEED_WAIT, &cached_state)) {
@@ -865,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
865 } 872 }
866 if (err) 873 if (err)
867 werr = err; 874 werr = err;
875
876 if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
877 if ((mark & EXTENT_DIRTY) &&
878 test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR,
879 &btree_ino->runtime_flags))
880 errors = true;
881
882 if ((mark & EXTENT_NEW) &&
883 test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR,
884 &btree_ino->runtime_flags))
885 errors = true;
886 } else {
887 if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR,
888 &btree_ino->runtime_flags))
889 errors = true;
890 }
891
892 if (errors && !werr)
893 werr = -EIO;
894
868 return werr; 895 return werr;
869} 896}
870 897
@@ -1612,27 +1639,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
1612 kmem_cache_free(btrfs_trans_handle_cachep, trans); 1639 kmem_cache_free(btrfs_trans_handle_cachep, trans);
1613} 1640}
1614 1641
1615static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
1616 struct btrfs_root *root)
1617{
1618 int ret;
1619
1620 ret = btrfs_run_delayed_items(trans, root);
1621 if (ret)
1622 return ret;
1623
1624 /*
1625 * rename don't use btrfs_join_transaction, so, once we
1626 * set the transaction to blocked above, we aren't going
1627 * to get any new ordered operations. We can safely run
1628 * it here and no for sure that nothing new will be added
1629 * to the list
1630 */
1631 ret = btrfs_run_ordered_operations(trans, root, 1);
1632
1633 return ret;
1634}
1635
1636static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) 1642static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1637{ 1643{
1638 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1644 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
@@ -1651,15 +1657,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1651{ 1657{
1652 struct btrfs_transaction *cur_trans = trans->transaction; 1658 struct btrfs_transaction *cur_trans = trans->transaction;
1653 struct btrfs_transaction *prev_trans = NULL; 1659 struct btrfs_transaction *prev_trans = NULL;
1660 struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode);
1654 int ret; 1661 int ret;
1655 1662
1656 ret = btrfs_run_ordered_operations(trans, root, 0);
1657 if (ret) {
1658 btrfs_abort_transaction(trans, root, ret);
1659 btrfs_end_transaction(trans, root);
1660 return ret;
1661 }
1662
1663 /* Stop the commit early if ->aborted is set */ 1663 /* Stop the commit early if ->aborted is set */
1664 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { 1664 if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
1665 ret = cur_trans->aborted; 1665 ret = cur_trans->aborted;
@@ -1740,7 +1740,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1740 if (ret) 1740 if (ret)
1741 goto cleanup_transaction; 1741 goto cleanup_transaction;
1742 1742
1743 ret = btrfs_flush_all_pending_stuffs(trans, root); 1743 ret = btrfs_run_delayed_items(trans, root);
1744 if (ret) 1744 if (ret)
1745 goto cleanup_transaction; 1745 goto cleanup_transaction;
1746 1746
@@ -1748,7 +1748,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1748 extwriter_counter_read(cur_trans) == 0); 1748 extwriter_counter_read(cur_trans) == 0);
1749 1749
1750 /* some pending stuff might be added after the previous flush. */ 1750 /* some pending stuff might be added after the previous flush. */
1751 ret = btrfs_flush_all_pending_stuffs(trans, root); 1751 ret = btrfs_run_delayed_items(trans, root);
1752 if (ret) 1752 if (ret)
1753 goto cleanup_transaction; 1753 goto cleanup_transaction;
1754 1754
@@ -1897,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1897 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, 1897 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1898 sizeof(*root->fs_info->super_copy)); 1898 sizeof(*root->fs_info->super_copy));
1899 1899
1900 btrfs_update_commit_device_size(root->fs_info);
1901 btrfs_update_commit_device_bytes_used(root, cur_trans);
1902
1903 clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
1904 clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
1905
1900 spin_lock(&root->fs_info->trans_lock); 1906 spin_lock(&root->fs_info->trans_lock);
1901 cur_trans->state = TRANS_STATE_UNBLOCKED; 1907 cur_trans->state = TRANS_STATE_UNBLOCKED;
1902 root->fs_info->running_transaction = NULL; 1908 root->fs_info->running_transaction = NULL;
@@ -2010,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
2010 ret = btrfs_drop_snapshot(root, NULL, 0, 0); 2016 ret = btrfs_drop_snapshot(root, NULL, 0, 0);
2011 else 2017 else
2012 ret = btrfs_drop_snapshot(root, NULL, 1, 0); 2018 ret = btrfs_drop_snapshot(root, NULL, 1, 0);
2013 /* 2019
2014 * If we encounter a transaction abort during snapshot cleaning, we
2015 * don't want to crash here
2016 */
2017 return (ret < 0) ? 0 : 1; 2020 return (ret < 0) ? 0 : 1;
2018} 2021}
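The btrfs_wait_for_commit() hunk above stops treating a missing transaction as an error when it simply committed between the first check and the list walk: -EINVAL is now returned only if the requested transid is still beyond last_trans_committed. A sketch of just that decision (plain integers standing in for the locked fs_info fields; -22 is EINVAL):

#include <stdio.h>

static int missing_trans_status(unsigned long long transid,
				unsigned long long last_committed_now,
				int found_on_list)
{
	if (found_on_list)
		return 0;	/* caller would wait on it here */
	/*
	 * Not on the list: either it never existed, or it finished
	 * committing while we searched. Only the former is an error.
	 */
	return (transid > last_committed_now) ? -22 : 0;
}

int main(void)
{
	printf("%d\n", missing_trans_status(5, 7, 0));	/* raced: 0 */
	printf("%d\n", missing_trans_status(9, 7, 0));	/* future id: -22 */
	return 0;
}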
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7dd558ed0716..d8f40e1a5d2d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -55,7 +55,6 @@ struct btrfs_transaction {
55 wait_queue_head_t writer_wait; 55 wait_queue_head_t writer_wait;
56 wait_queue_head_t commit_wait; 56 wait_queue_head_t commit_wait;
57 struct list_head pending_snapshots; 57 struct list_head pending_snapshots;
58 struct list_head ordered_operations;
59 struct list_head pending_chunks; 58 struct list_head pending_chunks;
60 struct list_head switch_commits; 59 struct list_head switch_commits;
61 struct btrfs_delayed_ref_root delayed_refs; 60 struct btrfs_delayed_ref_root delayed_refs;
@@ -80,7 +79,7 @@ struct btrfs_transaction {
80#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ 79#define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \
81 __TRANS_ATTACH) 80 __TRANS_ATTACH)
82 81
83#define BTRFS_SEND_TRANS_STUB 1 82#define BTRFS_SEND_TRANS_STUB ((void *)1)
84 83
85struct btrfs_trans_handle { 84struct btrfs_trans_handle {
86 u64 transid; 85 u64 transid;
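Redefining the stub as ((void *)1) in the header moves the cast into the definition, so every comparison site, like the ASSERT in start_transaction() above, can compare pointers directly. A tiny illustration of the same sentinel idiom:

#include <assert.h>
#include <stdio.h>

#define SEND_TRANS_STUB ((void *)1)	/* sentinel value, never dereferenced */

int main(void)
{
	void *journal_info = SEND_TRANS_STUB;

	/* no per-site (void *) cast needed at the comparison */
	assert(journal_info == SEND_TRANS_STUB);
	printf("stub matched\n");
	return 0;
}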
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9e1f2cd5e67a..1475979e5718 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,11 @@
94#define LOG_WALK_REPLAY_ALL 3 94#define LOG_WALK_REPLAY_ALL 3
95 95
96static int btrfs_log_inode(struct btrfs_trans_handle *trans, 96static int btrfs_log_inode(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode, 97 struct btrfs_root *root, struct inode *inode,
98 int inode_only); 98 int inode_only,
99 const loff_t start,
100 const loff_t end,
101 struct btrfs_log_ctx *ctx);
99static int link_to_fixup_dir(struct btrfs_trans_handle *trans, 102static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
100 struct btrfs_root *root, 103 struct btrfs_root *root,
101 struct btrfs_path *path, u64 objectid); 104 struct btrfs_path *path, u64 objectid);
@@ -1496,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1496 return -EIO; 1499 return -EIO;
1497 1500
1498 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1501 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1499 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1502 key.type = BTRFS_ORPHAN_ITEM_KEY;
1500 key.offset = objectid; 1503 key.offset = objectid;
1501 1504
1502 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1505 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
@@ -1635,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1635 found_key.type == log_key.type && 1638 found_key.type == log_key.type &&
1636 found_key.offset == log_key.offset && 1639 found_key.offset == log_key.offset &&
1637 btrfs_dir_type(path->nodes[0], dst_di) == log_type) { 1640 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1641 update_size = false;
1638 goto out; 1642 goto out;
1639 } 1643 }
1640 1644
@@ -2155,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2155 2159
2156 bytenr = btrfs_node_blockptr(cur, path->slots[*level]); 2160 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2157 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); 2161 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2158 blocksize = btrfs_level_size(root, *level - 1); 2162 blocksize = root->nodesize;
2159 2163
2160 parent = path->nodes[*level]; 2164 parent = path->nodes[*level];
2161 root_owner = btrfs_header_owner(parent); 2165 root_owner = btrfs_header_owner(parent);
@@ -2981,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2981 min_key.type = key_type; 2985 min_key.type = key_type;
2982 min_key.offset = min_offset; 2986 min_key.offset = min_offset;
2983 2987
2984 path->keep_locks = 1;
2985
2986 ret = btrfs_search_forward(root, &min_key, path, trans->transid); 2988 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
2987 2989
2988 /* 2990 /*
@@ -3298,7 +3300,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3298 struct list_head ordered_sums; 3300 struct list_head ordered_sums;
3299 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 3301 int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3300 bool has_extents = false; 3302 bool has_extents = false;
3301 bool need_find_last_extent = (*last_extent == 0); 3303 bool need_find_last_extent = true;
3302 bool done = false; 3304 bool done = false;
3303 3305
3304 INIT_LIST_HEAD(&ordered_sums); 3306 INIT_LIST_HEAD(&ordered_sums);
@@ -3352,8 +3354,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3352 */ 3354 */
3353 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { 3355 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3354 has_extents = true; 3356 has_extents = true;
3355 if (need_find_last_extent && 3357 if (first_key.objectid == (u64)-1)
3356 first_key.objectid == (u64)-1)
3357 first_key = ins_keys[i]; 3358 first_key = ins_keys[i];
3358 } else { 3359 } else {
3359 need_find_last_extent = false; 3360 need_find_last_extent = false;
@@ -3363,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3363 * or deletes of this inode don't have to relog the inode 3364 * or deletes of this inode don't have to relog the inode
3364 * again 3365 * again
3365 */ 3366 */
3366 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && 3367 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
3367 !skip_csum) { 3368 !skip_csum) {
3368 int found_type; 3369 int found_type;
3369 extent = btrfs_item_ptr(src, start_slot + i, 3370 extent = btrfs_item_ptr(src, start_slot + i,
@@ -3427,6 +3428,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3427 if (!has_extents) 3428 if (!has_extents)
3428 return ret; 3429 return ret;
3429 3430
3431 if (need_find_last_extent && *last_extent == first_key.offset) {
3432 /*
3433 * We don't have any leafs between our current one and the one
3434 * we processed before that can have file extent items for our
3435 * inode (and have a generation number smaller than our current
3436 * transaction id).
3437 */
3438 need_find_last_extent = false;
3439 }
3440
3430 /* 3441 /*
3431 * Because we use btrfs_search_forward we could skip leaves that were 3442 * Because we use btrfs_search_forward we could skip leaves that were
3432 * not modified and then assume *last_extent is valid when it really 3443 * not modified and then assume *last_extent is valid when it really
@@ -3537,7 +3548,7 @@ fill_holes:
3537 0, 0); 3548 0, 0);
3538 if (ret) 3549 if (ret)
3539 break; 3550 break;
3540 *last_extent = offset + len; 3551 *last_extent = extent_end;
3541 } 3552 }
3542 /* 3553 /*
3543 * Need to let the callers know we dropped the path so they should 3554 * Need to let the callers know we dropped the path so they should
@@ -3562,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
3562 return 0; 3573 return 0;
3563} 3574}
3564 3575
3565static int log_one_extent(struct btrfs_trans_handle *trans, 3576static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3566 struct inode *inode, struct btrfs_root *root, 3577 struct inode *inode,
3567 struct extent_map *em, struct btrfs_path *path, 3578 struct btrfs_root *root,
3568 struct list_head *logged_list) 3579 const struct extent_map *em,
3580 const struct list_head *logged_list,
3581 bool *ordered_io_error)
3569{ 3582{
3570 struct btrfs_root *log = root->log_root;
3571 struct btrfs_file_extent_item *fi;
3572 struct extent_buffer *leaf;
3573 struct btrfs_ordered_extent *ordered; 3583 struct btrfs_ordered_extent *ordered;
3574 struct list_head ordered_sums; 3584 struct btrfs_root *log = root->log_root;
3575 struct btrfs_map_token token;
3576 struct btrfs_key key;
3577 u64 mod_start = em->mod_start; 3585 u64 mod_start = em->mod_start;
3578 u64 mod_len = em->mod_len; 3586 u64 mod_len = em->mod_len;
3587 const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3579 u64 csum_offset; 3588 u64 csum_offset;
3580 u64 csum_len; 3589 u64 csum_len;
3581 u64 extent_offset = em->start - em->orig_start; 3590 LIST_HEAD(ordered_sums);
3582 u64 block_len; 3591 int ret = 0;
3583 int ret;
3584 bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
3585 int extent_inserted = 0;
3586
3587 INIT_LIST_HEAD(&ordered_sums);
3588 btrfs_init_map_token(&token);
3589
3590 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3591 em->start + em->len, NULL, 0, 1,
3592 sizeof(*fi), &extent_inserted);
3593 if (ret)
3594 return ret;
3595
3596 if (!extent_inserted) {
3597 key.objectid = btrfs_ino(inode);
3598 key.type = BTRFS_EXTENT_DATA_KEY;
3599 key.offset = em->start;
3600
3601 ret = btrfs_insert_empty_item(trans, log, path, &key,
3602 sizeof(*fi));
3603 if (ret)
3604 return ret;
3605 }
3606 leaf = path->nodes[0];
3607 fi = btrfs_item_ptr(leaf, path->slots[0],
3608 struct btrfs_file_extent_item);
3609
3610 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3611 &token);
3612 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3613 skip_csum = true;
3614 btrfs_set_token_file_extent_type(leaf, fi,
3615 BTRFS_FILE_EXTENT_PREALLOC,
3616 &token);
3617 } else {
3618 btrfs_set_token_file_extent_type(leaf, fi,
3619 BTRFS_FILE_EXTENT_REG,
3620 &token);
3621 if (em->block_start == EXTENT_MAP_HOLE)
3622 skip_csum = true;
3623 }
3624
3625 block_len = max(em->block_len, em->orig_block_len);
3626 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3627 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3628 em->block_start,
3629 &token);
3630 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3631 &token);
3632 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3633 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3634 em->block_start -
3635 extent_offset, &token);
3636 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3637 &token);
3638 } else {
3639 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3640 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3641 &token);
3642 }
3643 3592
3644 btrfs_set_token_file_extent_offset(leaf, fi, 3593 *ordered_io_error = false;
3645 em->start - em->orig_start,
3646 &token);
3647 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3648 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3649 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3650 &token);
3651 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3652 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3653 btrfs_mark_buffer_dirty(leaf);
3654 3594
3655 btrfs_release_path(path); 3595 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
3656 if (ret) { 3596 em->block_start == EXTENT_MAP_HOLE)
3657 return ret;
3658 }
3659
3660 if (skip_csum)
3661 return 0; 3597 return 0;
3662 3598
3663 /* 3599 /*
3664 * First check and see if our csums are on our outstanding ordered 3600 * Wait for any ordered extent that covers our extent map. If it
3665 * extents. 3601 * finishes without an error, first check and see if our csums are on
3602 * our outstanding ordered extents.
3666 */ 3603 */
3667 list_for_each_entry(ordered, logged_list, log_list) { 3604 list_for_each_entry(ordered, logged_list, log_list) {
3668 struct btrfs_ordered_sum *sum; 3605 struct btrfs_ordered_sum *sum;
@@ -3674,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3674 mod_start + mod_len <= ordered->file_offset) 3611 mod_start + mod_len <= ordered->file_offset)
3675 continue; 3612 continue;
3676 3613
3614 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
3615 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3616 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
3617 const u64 start = ordered->file_offset;
3618 const u64 end = ordered->file_offset + ordered->len - 1;
3619
3620 WARN_ON(ordered->inode != inode);
3621 filemap_fdatawrite_range(inode->i_mapping, start, end);
3622 }
3623
3624 wait_event(ordered->wait,
3625 (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) ||
3626 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3627
3628 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3629 *ordered_io_error = true;
3630 break;
3631 }
3677 /* 3632 /*
3678 * We are going to copy all the csums on this ordered extent, so 3633 * We are going to copy all the csums on this ordered extent, so
3679 * go ahead and adjust mod_start and mod_len in case this 3634 * go ahead and adjust mod_start and mod_len in case this
@@ -3705,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3705 } 3660 }
3706 } 3661 }
3707 3662
3663 if (skip_csum)
3664 continue;
3665
3708 /* 3666 /*
3709 * To keep us from looping for the above case of an ordered 3667 * To keep us from looping for the above case of an ordered
3710 * extent that falls inside of the logged extent. 3668 * extent that falls inside of the logged extent.
@@ -3722,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
3722 list_for_each_entry(sum, &ordered->list, list) { 3680 list_for_each_entry(sum, &ordered->list, list) {
3723 ret = btrfs_csum_file_blocks(trans, log, sum); 3681 ret = btrfs_csum_file_blocks(trans, log, sum);
3724 if (ret) 3682 if (ret)
3725 goto unlocked; 3683 break;
3726 } 3684 }
3727
3728 } 3685 }
3729unlocked:
3730 3686
3731 if (!mod_len || ret) 3687 if (*ordered_io_error || !mod_len || ret || skip_csum)
3732 return ret; 3688 return ret;
3733 3689
3734 if (em->compress_type) { 3690 if (em->compress_type) {
3735 csum_offset = 0; 3691 csum_offset = 0;
3736 csum_len = block_len; 3692 csum_len = max(em->block_len, em->orig_block_len);
3737 } else { 3693 } else {
3738 csum_offset = mod_start - em->start; 3694 csum_offset = mod_start - em->start;
3739 csum_len = mod_len; 3695 csum_len = mod_len;
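For checksum lookup, the hunk above distinguishes compressed extents, which are checksummed over the whole on-disk extent, from plain ones, where only the modified range matters. The offset/length selection in isolation (field names mirror the extent_map; values are invented):

#include <stdio.h>
#include <stdint.h>

struct em { uint64_t start, block_len, orig_block_len; int compressed; };

static void csum_range(const struct em *em, uint64_t mod_start,
		       uint64_t mod_len, uint64_t *off, uint64_t *len)
{
	if (em->compressed) {
		*off = 0;	/* whole compressed extent is checksummed */
		*len = em->block_len > em->orig_block_len ?
		       em->block_len : em->orig_block_len;
	} else {
		*off = mod_start - em->start;	/* only the dirtied range */
		*len = mod_len;
	}
}

int main(void)
{
	struct em e = { 4096, 8192, 8192, 0 };	/* uncompressed extent */
	uint64_t off, len;

	csum_range(&e, 4096, 4096, &off, &len);
	printf("offset=%llu len=%llu\n",
	       (unsigned long long)off, (unsigned long long)len);
	return 0;
}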
@@ -3760,11 +3716,106 @@ unlocked:
3760 return ret; 3716 return ret;
3761} 3717}
3762 3718
3719static int log_one_extent(struct btrfs_trans_handle *trans,
3720 struct inode *inode, struct btrfs_root *root,
3721 const struct extent_map *em,
3722 struct btrfs_path *path,
3723 const struct list_head *logged_list,
3724 struct btrfs_log_ctx *ctx)
3725{
3726 struct btrfs_root *log = root->log_root;
3727 struct btrfs_file_extent_item *fi;
3728 struct extent_buffer *leaf;
3729 struct btrfs_map_token token;
3730 struct btrfs_key key;
3731 u64 extent_offset = em->start - em->orig_start;
3732 u64 block_len;
3733 int ret;
3734 int extent_inserted = 0;
3735 bool ordered_io_err = false;
3736
3737 ret = wait_ordered_extents(trans, inode, root, em, logged_list,
3738 &ordered_io_err);
3739 if (ret)
3740 return ret;
3741
3742 if (ordered_io_err) {
3743 ctx->io_err = -EIO;
3744 return 0;
3745 }
3746
3747 btrfs_init_map_token(&token);
3748
3749 ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
3750 em->start + em->len, NULL, 0, 1,
3751 sizeof(*fi), &extent_inserted);
3752 if (ret)
3753 return ret;
3754
3755 if (!extent_inserted) {
3756 key.objectid = btrfs_ino(inode);
3757 key.type = BTRFS_EXTENT_DATA_KEY;
3758 key.offset = em->start;
3759
3760 ret = btrfs_insert_empty_item(trans, log, path, &key,
3761 sizeof(*fi));
3762 if (ret)
3763 return ret;
3764 }
3765 leaf = path->nodes[0];
3766 fi = btrfs_item_ptr(leaf, path->slots[0],
3767 struct btrfs_file_extent_item);
3768
3769 btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
3770 &token);
3771 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3772 btrfs_set_token_file_extent_type(leaf, fi,
3773 BTRFS_FILE_EXTENT_PREALLOC,
3774 &token);
3775 else
3776 btrfs_set_token_file_extent_type(leaf, fi,
3777 BTRFS_FILE_EXTENT_REG,
3778 &token);
3779
3780 block_len = max(em->block_len, em->orig_block_len);
3781 if (em->compress_type != BTRFS_COMPRESS_NONE) {
3782 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3783 em->block_start,
3784 &token);
3785 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3786 &token);
3787 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
3788 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
3789 em->block_start -
3790 extent_offset, &token);
3791 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
3792 &token);
3793 } else {
3794 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
3795 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
3796 &token);
3797 }
3798
3799 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
3800 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
3801 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
3802 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
3803 &token);
3804 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
3805 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
3806 btrfs_mark_buffer_dirty(leaf);
3807
3808 btrfs_release_path(path);
3809
3810 return ret;
3811}
3812
3763static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, 3813static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
3764 struct btrfs_root *root, 3814 struct btrfs_root *root,
3765 struct inode *inode, 3815 struct inode *inode,
3766 struct btrfs_path *path, 3816 struct btrfs_path *path,
3767 struct list_head *logged_list) 3817 struct list_head *logged_list,
3818 struct btrfs_log_ctx *ctx)
3768{ 3819{
3769 struct extent_map *em, *n; 3820 struct extent_map *em, *n;
3770 struct list_head extents; 3821 struct list_head extents;
@@ -3822,7 +3873,8 @@ process:
3822 3873
3823 write_unlock(&tree->lock); 3874 write_unlock(&tree->lock);
3824 3875
3825 ret = log_one_extent(trans, inode, root, em, path, logged_list); 3876 ret = log_one_extent(trans, inode, root, em, path, logged_list,
3877 ctx);
3826 write_lock(&tree->lock); 3878 write_lock(&tree->lock);
3827 clear_em_logging(tree, em); 3879 clear_em_logging(tree, em);
3828 free_extent_map(em); 3880 free_extent_map(em);
@@ -3849,8 +3901,11 @@ process:
3849 * This handles both files and directories. 3901 * This handles both files and directories.
3850 */ 3902 */
3851static int btrfs_log_inode(struct btrfs_trans_handle *trans, 3903static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3852 struct btrfs_root *root, struct inode *inode, 3904 struct btrfs_root *root, struct inode *inode,
3853 int inode_only) 3905 int inode_only,
3906 const loff_t start,
3907 const loff_t end,
3908 struct btrfs_log_ctx *ctx)
3854{ 3909{
3855 struct btrfs_path *path; 3910 struct btrfs_path *path;
3856 struct btrfs_path *dst_path; 3911 struct btrfs_path *dst_path;
@@ -3867,6 +3922,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3867 int ins_nr; 3922 int ins_nr;
3868 bool fast_search = false; 3923 bool fast_search = false;
3869 u64 ino = btrfs_ino(inode); 3924 u64 ino = btrfs_ino(inode);
3925 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
3870 3926
3871 path = btrfs_alloc_path(); 3927 path = btrfs_alloc_path();
3872 if (!path) 3928 if (!path)
@@ -3950,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3950 err = ret; 4006 err = ret;
3951 goto out_unlock; 4007 goto out_unlock;
3952 } 4008 }
3953 path->keep_locks = 1;
3954 4009
3955 while (1) { 4010 while (1) {
3956 ins_nr = 0; 4011 ins_nr = 0;
@@ -3980,7 +4035,8 @@ again:
3980 if (ret < 0) { 4035 if (ret < 0) {
3981 err = ret; 4036 err = ret;
3982 goto out_unlock; 4037 goto out_unlock;
3983 } if (ret) { 4038 }
4039 if (ret) {
3984 ins_nr = 0; 4040 ins_nr = 0;
3985 btrfs_release_path(path); 4041 btrfs_release_path(path);
3986 continue; 4042 continue;
@@ -4034,19 +4090,41 @@ log_extents:
4034 btrfs_release_path(dst_path); 4090 btrfs_release_path(dst_path);
4035 if (fast_search) { 4091 if (fast_search) {
4036 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4092 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4037 &logged_list); 4093 &logged_list, ctx);
4038 if (ret) { 4094 if (ret) {
4039 err = ret; 4095 err = ret;
4040 goto out_unlock; 4096 goto out_unlock;
4041 } 4097 }
4042 } else if (inode_only == LOG_INODE_ALL) { 4098 } else if (inode_only == LOG_INODE_ALL) {
4043 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
4044 struct extent_map *em, *n; 4099 struct extent_map *em, *n;
4045 4100
4046 write_lock(&tree->lock); 4101 write_lock(&em_tree->lock);
4047 list_for_each_entry_safe(em, n, &tree->modified_extents, list) 4102 /*
4048 list_del_init(&em->list); 4103 * We can't just remove every em if we're called for a ranged
4049 write_unlock(&tree->lock); 4104 * fsync - that is, one that doesn't cover the whole possible
4105 * file range (0 to LLONG_MAX). This is because we can have
4106 * em's that fall outside the range we're logging and therefore
4107 * their ordered operations haven't completed yet
4108 * (btrfs_finish_ordered_io() not invoked yet). This means we
4109 * didn't get their respective file extent item in the fs/subvol
4110 * tree yet, and need to let the next fast fsync (one which
4111 * consults the list of modified extent maps) find the em so
4112 * that it logs a matching file extent item and waits for the
4113 * respective ordered operation to complete (if it's still
4114 * running).
4115 *
4116 * Removing every em outside the range we're logging would make
4117 * the next fast fsync not log their matching file extent items,
4118 * therefore making us lose data after a log replay.
4119 */
4120 list_for_each_entry_safe(em, n, &em_tree->modified_extents,
4121 list) {
4122 const u64 mod_end = em->mod_start + em->mod_len - 1;
4123
4124 if (em->mod_start >= start && mod_end <= end)
4125 list_del_init(&em->list);
4126 }
4127 write_unlock(&em_tree->lock);
4050 } 4128 }
4051 4129
4052 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 4130 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4056,6 +4134,7 @@ log_extents:
4056 goto out_unlock; 4134 goto out_unlock;
4057 } 4135 }
4058 } 4136 }
4137
4059 BTRFS_I(inode)->logged_trans = trans->transid; 4138 BTRFS_I(inode)->logged_trans = trans->transid;
4060 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; 4139 BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
4061out_unlock: 4140out_unlock:
@@ -4152,7 +4231,10 @@ out:
4152 */ 4231 */
4153static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 4232static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4154 struct btrfs_root *root, struct inode *inode, 4233 struct btrfs_root *root, struct inode *inode,
4155 struct dentry *parent, int exists_only, 4234 struct dentry *parent,
4235 const loff_t start,
4236 const loff_t end,
4237 int exists_only,
4156 struct btrfs_log_ctx *ctx) 4238 struct btrfs_log_ctx *ctx)
4157{ 4239{
4158 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 4240 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4198,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4198 if (ret) 4280 if (ret)
4199 goto end_no_trans; 4281 goto end_no_trans;
4200 4282
4201 ret = btrfs_log_inode(trans, root, inode, inode_only); 4283 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
4202 if (ret) 4284 if (ret)
4203 goto end_trans; 4285 goto end_trans;
4204 4286
@@ -4226,7 +4308,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4226 4308
4227 if (BTRFS_I(inode)->generation > 4309 if (BTRFS_I(inode)->generation >
4228 root->fs_info->last_trans_committed) { 4310 root->fs_info->last_trans_committed) {
4229 ret = btrfs_log_inode(trans, root, inode, inode_only); 4311 ret = btrfs_log_inode(trans, root, inode, inode_only,
4312 0, LLONG_MAX, ctx);
4230 if (ret) 4313 if (ret)
4231 goto end_trans; 4314 goto end_trans;
4232 } 4315 }
@@ -4260,13 +4343,15 @@ end_no_trans:
4260 */ 4343 */
4261int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 4344int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
4262 struct btrfs_root *root, struct dentry *dentry, 4345 struct btrfs_root *root, struct dentry *dentry,
4346 const loff_t start,
4347 const loff_t end,
4263 struct btrfs_log_ctx *ctx) 4348 struct btrfs_log_ctx *ctx)
4264{ 4349{
4265 struct dentry *parent = dget_parent(dentry); 4350 struct dentry *parent = dget_parent(dentry);
4266 int ret; 4351 int ret;
4267 4352
4268 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 4353 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
4269 0, ctx); 4354 start, end, 0, ctx);
4270 dput(parent); 4355 dput(parent);
4271 4356
4272 return ret; 4357 return ret;
@@ -4316,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
4316again: 4401again:
4317 key.objectid = BTRFS_TREE_LOG_OBJECTID; 4402 key.objectid = BTRFS_TREE_LOG_OBJECTID;
4318 key.offset = (u64)-1; 4403 key.offset = (u64)-1;
4319 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 4404 key.type = BTRFS_ROOT_ITEM_KEY;
4320 4405
4321 while (1) { 4406 while (1) {
4322 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 4407 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
@@ -4503,6 +4588,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
4503 root->fs_info->last_trans_committed)) 4588 root->fs_info->last_trans_committed))
4504 return 0; 4589 return 0;
4505 4590
4506 return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); 4591 return btrfs_log_inode_parent(trans, root, inode, parent, 0,
4592 LLONG_MAX, 1, NULL);
4507} 4593}
4508 4594
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..154990c26dcb 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -28,6 +28,7 @@
28struct btrfs_log_ctx { 28struct btrfs_log_ctx {
29 int log_ret; 29 int log_ret;
30 int log_transid; 30 int log_transid;
31 int io_err;
31 struct list_head list; 32 struct list_head list;
32}; 33};
33 34
@@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx)
35{ 36{
36 ctx->log_ret = 0; 37 ctx->log_ret = 0;
37 ctx->log_transid = 0; 38 ctx->log_transid = 0;
39 ctx->io_err = 0;
38 INIT_LIST_HEAD(&ctx->list); 40 INIT_LIST_HEAD(&ctx->list);
39} 41}
40 42
@@ -59,6 +61,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
59int btrfs_recover_log_trees(struct btrfs_root *tree_root); 61int btrfs_recover_log_trees(struct btrfs_root *tree_root);
60int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 62int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, struct dentry *dentry, 63 struct btrfs_root *root, struct dentry *dentry,
64 const loff_t start,
65 const loff_t end,
62 struct btrfs_log_ctx *ctx); 66 struct btrfs_log_ctx *ctx);
63int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, 67int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root, 68 struct btrfs_root *root,
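
The tree-log.h hunks above change the logging API in tandem: btrfs_log_ctx grows an io_err field, and btrfs_log_dentry_safe() now takes the byte range being synced. A minimal caller sketch, assuming a simplified fsync path (none of this caller code is part of the diff itself):

	struct btrfs_log_ctx ctx;
	int ret;

	btrfs_init_log_ctx(&ctx);	/* zeroes log_ret, log_transid, io_err */

	/* Log only [start, end]; a whole-file fsync passes 0, LLONG_MAX. */
	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
	if (ret)
		goto full_commit;	/* logging failed, fall back to a full commit */

	/*
	 * An ordered-extent write error does not fail the log; it is
	 * reported out-of-band so it can be returned to user space.
	 */
	if (ctx.io_err)
		ret = ctx.io_err;

This matches how the new log_one_extent() in the tree-log.c hunk records -EIO in ctx->io_err and returns 0, keeping the log transaction alive.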
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h
index 7f78cbf5cf41..4c29db604bbe 100644
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -57,6 +57,21 @@ void ulist_free(struct ulist *ulist);
57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); 57int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask);
58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, 58int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux,
59 u64 *old_aux, gfp_t gfp_mask); 59 u64 *old_aux, gfp_t gfp_mask);
60
61/* just like ulist_add_merge() but takes a pointer for the aux data */ 
62static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux,
63 void **old_aux, gfp_t gfp_mask)
64{
65#if BITS_PER_LONG == 32
66 u64 old64 = (uintptr_t)*old_aux;
67 int ret = ulist_add_merge(ulist, val, (uintptr_t)aux, &old64, gfp_mask);
68 *old_aux = (void *)((uintptr_t)old64);
69 return ret;
70#else
71 return ulist_add_merge(ulist, val, (u64)aux, (u64 *)old_aux, gfp_mask);
72#endif
73}
74
60struct ulist_node *ulist_next(struct ulist *ulist, 75struct ulist_node *ulist_next(struct ulist *ulist,
61 struct ulist_iterator *uiter); 76 struct ulist_iterator *uiter);
62 77
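
A hedged usage sketch for the new ulist_add_merge_ptr() helper; its callers are not shown in this hunk, so the names refs, bytenr and new_elem are illustrative:

	void *old = NULL;	/* must be initialized: the 32-bit path reads it */
	int ret;

	ret = ulist_add_merge_ptr(refs, bytenr, new_elem, &old, GFP_NOFS);
	if (ret < 0)
		return ret;	/* allocation failure */
	if (ret == 0) {
		/* bytenr was already present; 'old' now holds the aux
		 * pointer stored by the earlier add */
	}
	/* ret == 1: bytenr was newly inserted with new_elem as its aux */

The point of the helper is letting callers store pointers in the u64 aux slot portably: on 32-bit it widens through uintptr_t instead of aliasing a void * as a u64 *.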
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index f6a4c03ee7d8..778282944530 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
279 key.offset = 0; 279 key.offset = 0;
280 280
281again_search_slot: 281again_search_slot:
282 path->keep_locks = 1;
283 ret = btrfs_search_forward(root, &key, path, 0); 282 ret = btrfs_search_forward(root, &key, path, 0);
284 if (ret) { 283 if (ret) {
285 if (ret > 0) 284 if (ret > 0)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6cb82f62cb7c..d47289c715c8 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
50static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); 50static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
51static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); 51static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
52 52
53static DEFINE_MUTEX(uuid_mutex); 53DEFINE_MUTEX(uuid_mutex);
54static LIST_HEAD(fs_uuids); 54static LIST_HEAD(fs_uuids);
55 55
56static void lock_chunks(struct btrfs_root *root) 56static void lock_chunks(struct btrfs_root *root)
@@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void)
74 mutex_init(&fs_devs->device_list_mutex); 74 mutex_init(&fs_devs->device_list_mutex);
75 75
76 INIT_LIST_HEAD(&fs_devs->devices); 76 INIT_LIST_HEAD(&fs_devs->devices);
77 INIT_LIST_HEAD(&fs_devs->resized_devices);
77 INIT_LIST_HEAD(&fs_devs->alloc_list); 78 INIT_LIST_HEAD(&fs_devs->alloc_list);
78 INIT_LIST_HEAD(&fs_devs->list); 79 INIT_LIST_HEAD(&fs_devs->list);
79 80
@@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void)
154 155
155 INIT_LIST_HEAD(&dev->dev_list); 156 INIT_LIST_HEAD(&dev->dev_list);
156 INIT_LIST_HEAD(&dev->dev_alloc_list); 157 INIT_LIST_HEAD(&dev->dev_alloc_list);
158 INIT_LIST_HEAD(&dev->resized_list);
157 159
158 spin_lock_init(&dev->io_lock); 160 spin_lock_init(&dev->io_lock);
159 161
160 spin_lock_init(&dev->reada_lock); 162 spin_lock_init(&dev->reada_lock);
161 atomic_set(&dev->reada_in_flight, 0); 163 atomic_set(&dev->reada_in_flight, 0);
164 atomic_set(&dev->dev_stats_ccnt, 0);
162 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); 165 INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
163 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); 166 INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
164 167
@@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path,
474 return PTR_ERR(fs_devices); 477 return PTR_ERR(fs_devices);
475 478
476 list_add(&fs_devices->list, &fs_uuids); 479 list_add(&fs_devices->list, &fs_uuids);
477 fs_devices->latest_devid = devid;
478 fs_devices->latest_trans = found_transid;
479 480
480 device = NULL; 481 device = NULL;
481 } else { 482 } else {
482 device = __find_device(&fs_devices->devices, devid, 483 device = __find_device(&fs_devices->devices, devid,
483 disk_super->dev_item.uuid); 484 disk_super->dev_item.uuid);
484 } 485 }
486
485 if (!device) { 487 if (!device) {
486 if (fs_devices->opened) 488 if (fs_devices->opened)
487 return -EBUSY; 489 return -EBUSY;
@@ -508,6 +510,43 @@ static noinline int device_list_add(const char *path,
508 ret = 1; 510 ret = 1;
509 device->fs_devices = fs_devices; 511 device->fs_devices = fs_devices;
510 } else if (!device->name || strcmp(device->name->str, path)) { 512 } else if (!device->name || strcmp(device->name->str, path)) {
 513		/*
 514		 * When the FS is already mounted:
 515		 * 1. If you are here and device->name is NULL, this
 516		 *    device was missing at the time of the FS mount.
 517		 * 2. If you are here and device->name differs from
 518		 *    'path', then either
 519		 *    a. the same device disappeared and reappeared
 520		 *       under a different name, or
 521		 *    b. the missing disk which was replaced has
 522		 *       reappeared now.
 523		 *
 524		 * We must allow 1 and 2a above, but 2b would be
 525		 * spurious and unintentional.
 526		 *
 527		 * Further, in cases 1 and 2a above, the disk at 'path'
 528		 * will have missed some transactions while it was away,
 529		 * and in case 2a the stale bdev has to be updated as well.
 530		 * 2b must never be allowed.
 531		 */
532
533 /*
 534		 * For now, we do allow updates to btrfs_fs_device through the
 535		 * btrfs dev scan cli after the FS has been mounted. We're still
536 * tracking a problem where systems fail mount by subvolume id
537 * when we reject replacement on a mounted FS.
538 */
539 if (!fs_devices->opened && found_transid < device->generation) {
540 /*
 541			 * That is, if the FS is _not_ mounted and you are
 542			 * here, there is more than one disk with the same
 543			 * uuid and devid. We keep the one with the larger
 544			 * generation number, or the last-in if the
 545			 * generations are equal.
546 */
547 return -EEXIST;
548 }
549
511 name = rcu_string_strdup(path, GFP_NOFS); 550 name = rcu_string_strdup(path, GFP_NOFS);
512 if (!name) 551 if (!name)
513 return -ENOMEM; 552 return -ENOMEM;
@@ -519,10 +558,15 @@ static noinline int device_list_add(const char *path,
519 } 558 }
520 } 559 }
521 560
522 if (found_transid > fs_devices->latest_trans) { 561 /*
 523 fs_devices->latest_devid = devid; 562	 * Unmount does not free the btrfs_device struct but zeroes the
 524 fs_devices->latest_trans = found_transid; 563	 * generation along with most of the other members, so just set
 525 } 564	 * it again here. We need it to pick the disk with the largest
 565	 * generation (as above).
566 */
567 if (!fs_devices->opened)
568 device->generation = found_transid;
569
526 *fs_devices_ret = fs_devices; 570 *fs_devices_ret = fs_devices;
527 571
528 return ret; 572 return ret;
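
The device_list_add() comments above boil down to a small policy; a distilled sketch, illustration only, restating the diff's logic in one place:

	/* Unmounted rescan of an already-known (uuid, devid): keep the
	 * copy with the newer superblock generation, reject the stale one. */
	if (!fs_devices->opened && found_transid < device->generation)
		return -EEXIST;

	/* Otherwise refresh device->name to 'path'; and while still
	 * unmounted, remember the newer generation for the next scan. */
	if (!fs_devices->opened)
		device->generation = found_transid;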
@@ -538,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
538 if (IS_ERR(fs_devices)) 582 if (IS_ERR(fs_devices))
539 return fs_devices; 583 return fs_devices;
540 584
541 fs_devices->latest_devid = orig->latest_devid; 585 mutex_lock(&orig->device_list_mutex);
542 fs_devices->latest_trans = orig->latest_trans;
543 fs_devices->total_devices = orig->total_devices; 586 fs_devices->total_devices = orig->total_devices;
544 587
545 /* We have held the volume lock, it is safe to get the devices. */ 588 /* We have held the volume lock, it is safe to get the devices. */
@@ -568,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
568 device->fs_devices = fs_devices; 611 device->fs_devices = fs_devices;
569 fs_devices->num_devices++; 612 fs_devices->num_devices++;
570 } 613 }
614 mutex_unlock(&orig->device_list_mutex);
571 return fs_devices; 615 return fs_devices;
572error: 616error:
617 mutex_unlock(&orig->device_list_mutex);
573 free_fs_devices(fs_devices); 618 free_fs_devices(fs_devices);
574 return ERR_PTR(-ENOMEM); 619 return ERR_PTR(-ENOMEM);
575} 620}
@@ -578,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
578 struct btrfs_fs_devices *fs_devices, int step) 623 struct btrfs_fs_devices *fs_devices, int step)
579{ 624{
580 struct btrfs_device *device, *next; 625 struct btrfs_device *device, *next;
581 626 struct btrfs_device *latest_dev = NULL;
582 struct block_device *latest_bdev = NULL;
583 u64 latest_devid = 0;
584 u64 latest_transid = 0;
585 627
586 mutex_lock(&uuid_mutex); 628 mutex_lock(&uuid_mutex);
587again: 629again:
@@ -589,11 +631,9 @@ again:
589 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 631 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
590 if (device->in_fs_metadata) { 632 if (device->in_fs_metadata) {
591 if (!device->is_tgtdev_for_dev_replace && 633 if (!device->is_tgtdev_for_dev_replace &&
592 (!latest_transid || 634 (!latest_dev ||
593 device->generation > latest_transid)) { 635 device->generation > latest_dev->generation)) {
594 latest_devid = device->devid; 636 latest_dev = device;
595 latest_transid = device->generation;
596 latest_bdev = device->bdev;
597 } 637 }
598 continue; 638 continue;
599 } 639 }
@@ -635,9 +675,7 @@ again:
635 goto again; 675 goto again;
636 } 676 }
637 677
638 fs_devices->latest_bdev = latest_bdev; 678 fs_devices->latest_bdev = latest_dev->bdev;
639 fs_devices->latest_devid = latest_devid;
640 fs_devices->latest_trans = latest_transid;
641 679
642 mutex_unlock(&uuid_mutex); 680 mutex_unlock(&uuid_mutex);
643} 681}
@@ -686,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
686 fs_devices->rw_devices--; 724 fs_devices->rw_devices--;
687 } 725 }
688 726
689 if (device->can_discard)
690 fs_devices->num_can_discard--;
691 if (device->missing) 727 if (device->missing)
692 fs_devices->missing_devices--; 728 fs_devices->missing_devices--;
693 729
@@ -752,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
752 struct block_device *bdev; 788 struct block_device *bdev;
753 struct list_head *head = &fs_devices->devices; 789 struct list_head *head = &fs_devices->devices;
754 struct btrfs_device *device; 790 struct btrfs_device *device;
755 struct block_device *latest_bdev = NULL; 791 struct btrfs_device *latest_dev = NULL;
756 struct buffer_head *bh; 792 struct buffer_head *bh;
757 struct btrfs_super_block *disk_super; 793 struct btrfs_super_block *disk_super;
758 u64 latest_devid = 0;
759 u64 latest_transid = 0;
760 u64 devid; 794 u64 devid;
761 int seeding = 1; 795 int seeding = 1;
762 int ret = 0; 796 int ret = 0;
@@ -784,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
784 goto error_brelse; 818 goto error_brelse;
785 819
786 device->generation = btrfs_super_generation(disk_super); 820 device->generation = btrfs_super_generation(disk_super);
787 if (!latest_transid || device->generation > latest_transid) { 821 if (!latest_dev ||
788 latest_devid = devid; 822 device->generation > latest_dev->generation)
789 latest_transid = device->generation; 823 latest_dev = device;
790 latest_bdev = bdev;
791 }
792 824
793 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { 825 if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
794 device->writeable = 0; 826 device->writeable = 0;
@@ -798,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
798 } 830 }
799 831
800 q = bdev_get_queue(bdev); 832 q = bdev_get_queue(bdev);
801 if (blk_queue_discard(q)) { 833 if (blk_queue_discard(q))
802 device->can_discard = 1; 834 device->can_discard = 1;
803 fs_devices->num_can_discard++;
804 }
805 835
806 device->bdev = bdev; 836 device->bdev = bdev;
807 device->in_fs_metadata = 0; 837 device->in_fs_metadata = 0;
@@ -831,9 +861,7 @@ error_brelse:
831 } 861 }
832 fs_devices->seeding = seeding; 862 fs_devices->seeding = seeding;
833 fs_devices->opened = 1; 863 fs_devices->opened = 1;
834 fs_devices->latest_bdev = latest_bdev; 864 fs_devices->latest_bdev = latest_dev->bdev;
835 fs_devices->latest_devid = latest_devid;
836 fs_devices->latest_trans = latest_transid;
837 fs_devices->total_rw_bytes = 0; 865 fs_devices->total_rw_bytes = 0;
838out: 866out:
839 return ret; 867 return ret;
@@ -1007,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
1007 if (key.objectid > device->devid) 1035 if (key.objectid > device->devid)
1008 break; 1036 break;
1009 1037
1010 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1038 if (key.type != BTRFS_DEV_EXTENT_KEY)
1011 goto next; 1039 goto next;
1012 1040
1013 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 1041 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
@@ -1159,7 +1187,7 @@ again:
1159 if (key.objectid > device->devid) 1187 if (key.objectid > device->devid)
1160 break; 1188 break;
1161 1189
1162 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) 1190 if (key.type != BTRFS_DEV_EXTENT_KEY)
1163 goto next; 1191 goto next;
1164 1192
1165 if (key.offset > search_start) { 1193 if (key.offset > search_start) {
@@ -1238,7 +1266,7 @@ out:
1238 1266
1239static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, 1267static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1240 struct btrfs_device *device, 1268 struct btrfs_device *device,
1241 u64 start) 1269 u64 start, u64 *dev_extent_len)
1242{ 1270{
1243 int ret; 1271 int ret;
1244 struct btrfs_path *path; 1272 struct btrfs_path *path;
@@ -1280,13 +1308,8 @@ again:
1280 goto out; 1308 goto out;
1281 } 1309 }
1282 1310
1283 if (device->bytes_used > 0) { 1311 *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1284 u64 len = btrfs_dev_extent_length(leaf, extent); 1312
1285 device->bytes_used -= len;
1286 spin_lock(&root->fs_info->free_chunk_lock);
1287 root->fs_info->free_chunk_space += len;
1288 spin_unlock(&root->fs_info->free_chunk_lock);
1289 }
1290 ret = btrfs_del_item(trans, root, path); 1313 ret = btrfs_del_item(trans, root, path);
1291 if (ret) { 1314 if (ret) {
1292 btrfs_error(root->fs_info, ret, 1315 btrfs_error(root->fs_info, ret,
@@ -1436,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
1436 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 1459 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1437 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 1460 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1438 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 1461 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1439 btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); 1462 btrfs_set_device_total_bytes(leaf, dev_item,
1440 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 1463 btrfs_device_get_disk_total_bytes(device));
1464 btrfs_set_device_bytes_used(leaf, dev_item,
1465 btrfs_device_get_bytes_used(device));
1441 btrfs_set_device_group(leaf, dev_item, 0); 1466 btrfs_set_device_group(leaf, dev_item, 0);
1442 btrfs_set_device_seek_speed(leaf, dev_item, 0); 1467 btrfs_set_device_seek_speed(leaf, dev_item, 0);
1443 btrfs_set_device_bandwidth(leaf, dev_item, 0); 1468 btrfs_set_device_bandwidth(leaf, dev_item, 0);
@@ -1493,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1493 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1518 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1494 key.type = BTRFS_DEV_ITEM_KEY; 1519 key.type = BTRFS_DEV_ITEM_KEY;
1495 key.offset = device->devid; 1520 key.offset = device->devid;
1496 lock_chunks(root);
1497 1521
1498 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1522 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1499 if (ret < 0) 1523 if (ret < 0)
@@ -1509,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1509 goto out; 1533 goto out;
1510out: 1534out:
1511 btrfs_free_path(path); 1535 btrfs_free_path(path);
1512 unlock_chunks(root);
1513 btrfs_commit_transaction(trans, root); 1536 btrfs_commit_transaction(trans, root);
1514 return ret; 1537 return ret;
1515} 1538}
@@ -1625,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1625 if (device->writeable) { 1648 if (device->writeable) {
1626 lock_chunks(root); 1649 lock_chunks(root);
1627 list_del_init(&device->dev_alloc_list); 1650 list_del_init(&device->dev_alloc_list);
1651 device->fs_devices->rw_devices--;
1628 unlock_chunks(root); 1652 unlock_chunks(root);
1629 root->fs_info->fs_devices->rw_devices--;
1630 clear_super = true; 1653 clear_super = true;
1631 } 1654 }
1632 1655
@@ -1645,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1645 if (ret) 1668 if (ret)
1646 goto error_undo; 1669 goto error_undo;
1647 1670
1648 spin_lock(&root->fs_info->free_chunk_lock);
1649 root->fs_info->free_chunk_space = device->total_bytes -
1650 device->bytes_used;
1651 spin_unlock(&root->fs_info->free_chunk_lock);
1652
1653 device->in_fs_metadata = 0; 1671 device->in_fs_metadata = 0;
1654 btrfs_scrub_cancel_dev(root->fs_info, device); 1672 btrfs_scrub_cancel_dev(root->fs_info, device);
1655 1673
@@ -1671,7 +1689,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1671 device->fs_devices->total_devices--; 1689 device->fs_devices->total_devices--;
1672 1690
1673 if (device->missing) 1691 if (device->missing)
1674 root->fs_info->fs_devices->missing_devices--; 1692 device->fs_devices->missing_devices--;
1675 1693
1676 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1694 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1677 struct btrfs_device, dev_list); 1695 struct btrfs_device, dev_list);
@@ -1703,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1703 fs_devices = fs_devices->seed; 1721 fs_devices = fs_devices->seed;
1704 } 1722 }
1705 cur_devices->seed = NULL; 1723 cur_devices->seed = NULL;
1706 lock_chunks(root);
1707 __btrfs_close_devices(cur_devices); 1724 __btrfs_close_devices(cur_devices);
1708 unlock_chunks(root);
1709 free_fs_devices(cur_devices); 1725 free_fs_devices(cur_devices);
1710 } 1726 }
1711 1727
@@ -1778,8 +1794,8 @@ error_undo:
1778 lock_chunks(root); 1794 lock_chunks(root);
1779 list_add(&device->dev_alloc_list, 1795 list_add(&device->dev_alloc_list,
1780 &root->fs_info->fs_devices->alloc_list); 1796 &root->fs_info->fs_devices->alloc_list);
1797 device->fs_devices->rw_devices++;
1781 unlock_chunks(root); 1798 unlock_chunks(root);
1782 root->fs_info->fs_devices->rw_devices++;
1783 } 1799 }
1784 goto error_brelse; 1800 goto error_brelse;
1785} 1801}
@@ -1787,25 +1803,57 @@ error_undo:
1787void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, 1803void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
1788 struct btrfs_device *srcdev) 1804 struct btrfs_device *srcdev)
1789{ 1805{
1806 struct btrfs_fs_devices *fs_devices;
1807
1790 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); 1808 WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1791 1809
1810 /*
 1811	 * In case of an fs with no seed, srcdev->fs_devices will point to
 1812	 * the fs_devices of fs_info. However, when the dev being replaced
 1813	 * is a seed dev, it will point to the seed's local fs_devices. In
 1814	 * short, srcdev will have its correct fs_devices in both cases.
1815 */
1816 fs_devices = srcdev->fs_devices;
1817
1792 list_del_rcu(&srcdev->dev_list); 1818 list_del_rcu(&srcdev->dev_list);
1793 list_del_rcu(&srcdev->dev_alloc_list); 1819 list_del_rcu(&srcdev->dev_alloc_list);
1794 fs_info->fs_devices->num_devices--; 1820 fs_devices->num_devices--;
1795 if (srcdev->missing) { 1821 if (srcdev->missing)
1796 fs_info->fs_devices->missing_devices--; 1822 fs_devices->missing_devices--;
1797 fs_info->fs_devices->rw_devices++;
1798 }
1799 if (srcdev->can_discard)
1800 fs_info->fs_devices->num_can_discard--;
1801 if (srcdev->bdev) {
1802 fs_info->fs_devices->open_devices--;
1803 1823
1804 /* zero out the old super */ 1824 if (srcdev->writeable) {
1825 fs_devices->rw_devices--;
1826 /* zero out the old super if it is writable */
1805 btrfs_scratch_superblock(srcdev); 1827 btrfs_scratch_superblock(srcdev);
1806 } 1828 }
1807 1829
1830 if (srcdev->bdev)
1831 fs_devices->open_devices--;
1832
1808 call_rcu(&srcdev->rcu, free_device); 1833 call_rcu(&srcdev->rcu, free_device);
1834
1835 /*
 1836	 * unless fs_devices is a seed fs, num_devices shouldn't go
 1837	 * to zero
1838 */
1839 BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
1840
 1841	/* if there are no devs left, we'd rather delete the fs_devices */
1842 if (!fs_devices->num_devices) {
1843 struct btrfs_fs_devices *tmp_fs_devices;
1844
1845 tmp_fs_devices = fs_info->fs_devices;
1846 while (tmp_fs_devices) {
1847 if (tmp_fs_devices->seed == fs_devices) {
1848 tmp_fs_devices->seed = fs_devices->seed;
1849 break;
1850 }
1851 tmp_fs_devices = tmp_fs_devices->seed;
1852 }
1853 fs_devices->seed = NULL;
1854 __btrfs_close_devices(fs_devices);
1855 free_fs_devices(fs_devices);
1856 }
1809} 1857}
1810 1858
1811void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 1859void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
@@ -1813,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1813{ 1861{
1814 struct btrfs_device *next_device; 1862 struct btrfs_device *next_device;
1815 1863
1864 mutex_lock(&uuid_mutex);
1816 WARN_ON(!tgtdev); 1865 WARN_ON(!tgtdev);
1817 mutex_lock(&fs_info->fs_devices->device_list_mutex); 1866 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1818 if (tgtdev->bdev) { 1867 if (tgtdev->bdev) {
@@ -1820,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1820 fs_info->fs_devices->open_devices--; 1869 fs_info->fs_devices->open_devices--;
1821 } 1870 }
1822 fs_info->fs_devices->num_devices--; 1871 fs_info->fs_devices->num_devices--;
1823 if (tgtdev->can_discard)
1824 fs_info->fs_devices->num_can_discard++;
1825 1872
1826 next_device = list_entry(fs_info->fs_devices->devices.next, 1873 next_device = list_entry(fs_info->fs_devices->devices.next,
1827 struct btrfs_device, dev_list); 1874 struct btrfs_device, dev_list);
@@ -1834,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1834 call_rcu(&tgtdev->rcu, free_device); 1881 call_rcu(&tgtdev->rcu, free_device);
1835 1882
1836 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 1883 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1884 mutex_unlock(&uuid_mutex);
1837} 1885}
1838 1886
1839static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 1887static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -1932,15 +1980,18 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
1932 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1980 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1933 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, 1981 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1934 synchronize_rcu); 1982 synchronize_rcu);
1983 list_for_each_entry(device, &seed_devices->devices, dev_list)
1984 device->fs_devices = seed_devices;
1935 1985
1986 lock_chunks(root);
1936 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1987 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1937 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1988 unlock_chunks(root);
1938 device->fs_devices = seed_devices;
1939 }
1940 1989
1941 fs_devices->seeding = 0; 1990 fs_devices->seeding = 0;
1942 fs_devices->num_devices = 0; 1991 fs_devices->num_devices = 0;
1943 fs_devices->open_devices = 0; 1992 fs_devices->open_devices = 0;
1993 fs_devices->missing_devices = 0;
1994 fs_devices->rotating = 0;
1944 fs_devices->seed = seed_devices; 1995 fs_devices->seed = seed_devices;
1945 1996
1946 generate_random_uuid(fs_devices->fsid); 1997 generate_random_uuid(fs_devices->fsid);
@@ -2039,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2039 struct list_head *devices; 2090 struct list_head *devices;
2040 struct super_block *sb = root->fs_info->sb; 2091 struct super_block *sb = root->fs_info->sb;
2041 struct rcu_string *name; 2092 struct rcu_string *name;
2042 u64 total_bytes; 2093 u64 tmp;
2043 int seeding_dev = 0; 2094 int seeding_dev = 0;
2044 int ret = 0; 2095 int ret = 0;
2045 2096
@@ -2095,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2095 goto error; 2146 goto error;
2096 } 2147 }
2097 2148
2098 lock_chunks(root);
2099
2100 q = bdev_get_queue(bdev); 2149 q = bdev_get_queue(bdev);
2101 if (blk_queue_discard(q)) 2150 if (blk_queue_discard(q))
2102 device->can_discard = 1; 2151 device->can_discard = 1;
@@ -2107,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2107 device->sector_size = root->sectorsize; 2156 device->sector_size = root->sectorsize;
2108 device->total_bytes = i_size_read(bdev->bd_inode); 2157 device->total_bytes = i_size_read(bdev->bd_inode);
2109 device->disk_total_bytes = device->total_bytes; 2158 device->disk_total_bytes = device->total_bytes;
2159 device->commit_total_bytes = device->total_bytes;
2110 device->dev_root = root->fs_info->dev_root; 2160 device->dev_root = root->fs_info->dev_root;
2111 device->bdev = bdev; 2161 device->bdev = bdev;
2112 device->in_fs_metadata = 1; 2162 device->in_fs_metadata = 1;
@@ -2124,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2124 device->fs_devices = root->fs_info->fs_devices; 2174 device->fs_devices = root->fs_info->fs_devices;
2125 2175
2126 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2176 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2177 lock_chunks(root);
2127 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); 2178 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2128 list_add(&device->dev_alloc_list, 2179 list_add(&device->dev_alloc_list,
2129 &root->fs_info->fs_devices->alloc_list); 2180 &root->fs_info->fs_devices->alloc_list);
@@ -2131,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2131 root->fs_info->fs_devices->open_devices++; 2182 root->fs_info->fs_devices->open_devices++;
2132 root->fs_info->fs_devices->rw_devices++; 2183 root->fs_info->fs_devices->rw_devices++;
2133 root->fs_info->fs_devices->total_devices++; 2184 root->fs_info->fs_devices->total_devices++;
2134 if (device->can_discard)
2135 root->fs_info->fs_devices->num_can_discard++;
2136 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 2185 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2137 2186
2138 spin_lock(&root->fs_info->free_chunk_lock); 2187 spin_lock(&root->fs_info->free_chunk_lock);
@@ -2142,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2142 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 2191 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2143 root->fs_info->fs_devices->rotating = 1; 2192 root->fs_info->fs_devices->rotating = 1;
2144 2193
2145 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); 2194 tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2146 btrfs_set_super_total_bytes(root->fs_info->super_copy, 2195 btrfs_set_super_total_bytes(root->fs_info->super_copy,
2147 total_bytes + device->total_bytes); 2196 tmp + device->total_bytes);
2148 2197
2149 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); 2198 tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2150 btrfs_set_super_num_devices(root->fs_info->super_copy, 2199 btrfs_set_super_num_devices(root->fs_info->super_copy,
2151 total_bytes + 1); 2200 tmp + 1);
2152 2201
2153 /* add sysfs device entry */ 2202 /* add sysfs device entry */
2154 btrfs_kobj_add_device(root->fs_info, device); 2203 btrfs_kobj_add_device(root->fs_info, device);
2155 2204
2205 /*
2206 * we've got more storage, clear any full flags on the space
2207 * infos
2208 */
2209 btrfs_clear_space_info_full(root->fs_info);
2210
2211 unlock_chunks(root);
2156 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2212 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2157 2213
2158 if (seeding_dev) { 2214 if (seeding_dev) {
2159 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; 2215 lock_chunks(root);
2160 ret = init_first_rw_device(trans, root, device); 2216 ret = init_first_rw_device(trans, root, device);
2217 unlock_chunks(root);
2161 if (ret) { 2218 if (ret) {
2162 btrfs_abort_transaction(trans, root, ret); 2219 btrfs_abort_transaction(trans, root, ret);
2163 goto error_trans; 2220 goto error_trans;
2164 } 2221 }
2222 }
2223
2224 ret = btrfs_add_device(trans, root, device);
2225 if (ret) {
2226 btrfs_abort_transaction(trans, root, ret);
2227 goto error_trans;
2228 }
2229
2230 if (seeding_dev) {
2231 char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2232
2165 ret = btrfs_finish_sprout(trans, root); 2233 ret = btrfs_finish_sprout(trans, root);
2166 if (ret) { 2234 if (ret) {
2167 btrfs_abort_transaction(trans, root, ret); 2235 btrfs_abort_transaction(trans, root, ret);
@@ -2175,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2175 root->fs_info->fsid); 2243 root->fs_info->fsid);
2176 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) 2244 if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2177 goto error_trans; 2245 goto error_trans;
2178 } else {
2179 ret = btrfs_add_device(trans, root, device);
2180 if (ret) {
2181 btrfs_abort_transaction(trans, root, ret);
2182 goto error_trans;
2183 }
2184 } 2246 }
2185 2247
2186 /*
2187 * we've got more storage, clear any full flags on the space
2188 * infos
2189 */
2190 btrfs_clear_space_info_full(root->fs_info);
2191
2192 unlock_chunks(root);
2193 root->fs_info->num_tolerated_disk_barrier_failures = 2248 root->fs_info->num_tolerated_disk_barrier_failures =
2194 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); 2249 btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2195 ret = btrfs_commit_transaction(trans, root); 2250 ret = btrfs_commit_transaction(trans, root);
@@ -2221,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2221 return ret; 2276 return ret;
2222 2277
2223error_trans: 2278error_trans:
2224 unlock_chunks(root);
2225 btrfs_end_transaction(trans, root); 2279 btrfs_end_transaction(trans, root);
2226 rcu_string_free(device->name); 2280 rcu_string_free(device->name);
2227 btrfs_kobj_rm_device(root->fs_info, device); 2281 btrfs_kobj_rm_device(root->fs_info, device);
@@ -2236,6 +2290,7 @@ error:
2236} 2290}
2237 2291
2238int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 2292int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2293 struct btrfs_device *srcdev,
2239 struct btrfs_device **device_out) 2294 struct btrfs_device **device_out)
2240{ 2295{
2241 struct request_queue *q; 2296 struct request_queue *q;
@@ -2248,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2248 int ret = 0; 2303 int ret = 0;
2249 2304
2250 *device_out = NULL; 2305 *device_out = NULL;
2251 if (fs_info->fs_devices->seeding) 2306 if (fs_info->fs_devices->seeding) {
2307 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2252 return -EINVAL; 2308 return -EINVAL;
2309 }
2253 2310
2254 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, 2311 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2255 fs_info->bdev_holder); 2312 fs_info->bdev_holder);
2256 if (IS_ERR(bdev)) 2313 if (IS_ERR(bdev)) {
2314 btrfs_err(fs_info, "target device %s is invalid!", device_path);
2257 return PTR_ERR(bdev); 2315 return PTR_ERR(bdev);
2316 }
2258 2317
2259 filemap_write_and_wait(bdev->bd_inode->i_mapping); 2318 filemap_write_and_wait(bdev->bd_inode->i_mapping);
2260 2319
2261 devices = &fs_info->fs_devices->devices; 2320 devices = &fs_info->fs_devices->devices;
2262 list_for_each_entry(device, devices, dev_list) { 2321 list_for_each_entry(device, devices, dev_list) {
2263 if (device->bdev == bdev) { 2322 if (device->bdev == bdev) {
2323 btrfs_err(fs_info, "target device is in the filesystem!");
2264 ret = -EEXIST; 2324 ret = -EEXIST;
2265 goto error; 2325 goto error;
2266 } 2326 }
2267 } 2327 }
2268 2328
2329
2330 if (i_size_read(bdev->bd_inode) <
2331 btrfs_device_get_total_bytes(srcdev)) {
2332 btrfs_err(fs_info, "target device is smaller than source device!");
2333 ret = -EINVAL;
2334 goto error;
2335 }
2336
2337
2269 device = btrfs_alloc_device(NULL, &devid, NULL); 2338 device = btrfs_alloc_device(NULL, &devid, NULL);
2270 if (IS_ERR(device)) { 2339 if (IS_ERR(device)) {
2271 ret = PTR_ERR(device); 2340 ret = PTR_ERR(device);
@@ -2289,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2289 device->io_width = root->sectorsize; 2358 device->io_width = root->sectorsize;
2290 device->io_align = root->sectorsize; 2359 device->io_align = root->sectorsize;
2291 device->sector_size = root->sectorsize; 2360 device->sector_size = root->sectorsize;
2292 device->total_bytes = i_size_read(bdev->bd_inode); 2361 device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2293 device->disk_total_bytes = device->total_bytes; 2362 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2363 device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2364 ASSERT(list_empty(&srcdev->resized_list));
2365 device->commit_total_bytes = srcdev->commit_total_bytes;
2366 device->commit_bytes_used = device->bytes_used;
2294 device->dev_root = fs_info->dev_root; 2367 device->dev_root = fs_info->dev_root;
2295 device->bdev = bdev; 2368 device->bdev = bdev;
2296 device->in_fs_metadata = 1; 2369 device->in_fs_metadata = 1;
@@ -2302,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2302 list_add(&device->dev_list, &fs_info->fs_devices->devices); 2375 list_add(&device->dev_list, &fs_info->fs_devices->devices);
2303 fs_info->fs_devices->num_devices++; 2376 fs_info->fs_devices->num_devices++;
2304 fs_info->fs_devices->open_devices++; 2377 fs_info->fs_devices->open_devices++;
2305 if (device->can_discard)
2306 fs_info->fs_devices->num_can_discard++;
2307 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 2378 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2308 2379
2309 *device_out = device; 2380 *device_out = device;
@@ -2362,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2362 btrfs_set_device_io_align(leaf, dev_item, device->io_align); 2433 btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2363 btrfs_set_device_io_width(leaf, dev_item, device->io_width); 2434 btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2364 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); 2435 btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2365 btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); 2436 btrfs_set_device_total_bytes(leaf, dev_item,
2366 btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); 2437 btrfs_device_get_disk_total_bytes(device));
2438 btrfs_set_device_bytes_used(leaf, dev_item,
2439 btrfs_device_get_bytes_used(device));
2367 btrfs_mark_buffer_dirty(leaf); 2440 btrfs_mark_buffer_dirty(leaf);
2368 2441
2369out: 2442out:
@@ -2371,40 +2444,44 @@ out:
2371 return ret; 2444 return ret;
2372} 2445}
2373 2446
2374static int __btrfs_grow_device(struct btrfs_trans_handle *trans, 2447int btrfs_grow_device(struct btrfs_trans_handle *trans,
2375 struct btrfs_device *device, u64 new_size) 2448 struct btrfs_device *device, u64 new_size)
2376{ 2449{
2377 struct btrfs_super_block *super_copy = 2450 struct btrfs_super_block *super_copy =
2378 device->dev_root->fs_info->super_copy; 2451 device->dev_root->fs_info->super_copy;
2379 u64 old_total = btrfs_super_total_bytes(super_copy); 2452 struct btrfs_fs_devices *fs_devices;
2380 u64 diff = new_size - device->total_bytes; 2453 u64 old_total;
2454 u64 diff;
2381 2455
2382 if (!device->writeable) 2456 if (!device->writeable)
2383 return -EACCES; 2457 return -EACCES;
2458
2459 lock_chunks(device->dev_root);
2460 old_total = btrfs_super_total_bytes(super_copy);
2461 diff = new_size - device->total_bytes;
2462
2384 if (new_size <= device->total_bytes || 2463 if (new_size <= device->total_bytes ||
2385 device->is_tgtdev_for_dev_replace) 2464 device->is_tgtdev_for_dev_replace) {
2465 unlock_chunks(device->dev_root);
2386 return -EINVAL; 2466 return -EINVAL;
2467 }
2468
2469 fs_devices = device->dev_root->fs_info->fs_devices;
2387 2470
2388 btrfs_set_super_total_bytes(super_copy, old_total + diff); 2471 btrfs_set_super_total_bytes(super_copy, old_total + diff);
2389 device->fs_devices->total_rw_bytes += diff; 2472 device->fs_devices->total_rw_bytes += diff;
2390 2473
2391 device->total_bytes = new_size; 2474 btrfs_device_set_total_bytes(device, new_size);
2392 device->disk_total_bytes = new_size; 2475 btrfs_device_set_disk_total_bytes(device, new_size);
2393 btrfs_clear_space_info_full(device->dev_root->fs_info); 2476 btrfs_clear_space_info_full(device->dev_root->fs_info);
2477 if (list_empty(&device->resized_list))
2478 list_add_tail(&device->resized_list,
2479 &fs_devices->resized_devices);
2480 unlock_chunks(device->dev_root);
2394 2481
2395 return btrfs_update_device(trans, device); 2482 return btrfs_update_device(trans, device);
2396} 2483}
2397 2484
2398int btrfs_grow_device(struct btrfs_trans_handle *trans,
2399 struct btrfs_device *device, u64 new_size)
2400{
2401 int ret;
2402 lock_chunks(device->dev_root);
2403 ret = __btrfs_grow_device(trans, device, new_size);
2404 unlock_chunks(device->dev_root);
2405 return ret;
2406}
2407
2408static int btrfs_free_chunk(struct btrfs_trans_handle *trans, 2485static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root, 2486 struct btrfs_root *root,
2410 u64 chunk_tree, u64 chunk_objectid, 2487 u64 chunk_tree, u64 chunk_objectid,
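
The resized_list and commit_total_bytes plumbing introduced above decouples the in-memory device sizes from the last committed ones. A sketch of the assumed commit-side counterpart, which is not part of this hunk; presumably the list is drained when the transaction commits, roughly along these lines:

	struct btrfs_device *dev, *tmp;

	/* assumed: runs at transaction commit, under the chunk mutex */
	list_for_each_entry_safe(dev, tmp, &fs_devices->resized_devices,
				 resized_list) {
		list_del_init(&dev->resized_list);
		dev->commit_total_bytes = dev->disk_total_bytes;
	}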
@@ -2456,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2456 u32 cur; 2533 u32 cur;
2457 struct btrfs_key key; 2534 struct btrfs_key key;
2458 2535
2536 lock_chunks(root);
2459 array_size = btrfs_super_sys_array_size(super_copy); 2537 array_size = btrfs_super_sys_array_size(super_copy);
2460 2538
2461 ptr = super_copy->sys_chunk_array; 2539 ptr = super_copy->sys_chunk_array;
@@ -2485,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2485 cur += len; 2563 cur += len;
2486 } 2564 }
2487 } 2565 }
2566 unlock_chunks(root);
2488 return ret; 2567 return ret;
2489} 2568}
2490 2569
2491static int btrfs_relocate_chunk(struct btrfs_root *root, 2570int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2492 u64 chunk_tree, u64 chunk_objectid, 2571 struct btrfs_root *root, u64 chunk_offset)
2493 u64 chunk_offset)
2494{ 2572{
2495 struct extent_map_tree *em_tree; 2573 struct extent_map_tree *em_tree;
2496 struct btrfs_root *extent_root;
2497 struct btrfs_trans_handle *trans;
2498 struct extent_map *em; 2574 struct extent_map *em;
2575 struct btrfs_root *extent_root = root->fs_info->extent_root;
2499 struct map_lookup *map; 2576 struct map_lookup *map;
2500 int ret; 2577 u64 dev_extent_len = 0;
2501 int i; 2578 u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2579 u64 chunk_tree = root->fs_info->chunk_root->objectid;
2580 int i, ret = 0;
2502 2581
2582 /* Just in case */
2503 root = root->fs_info->chunk_root; 2583 root = root->fs_info->chunk_root;
2504 extent_root = root->fs_info->extent_root;
2505 em_tree = &root->fs_info->mapping_tree.map_tree; 2584 em_tree = &root->fs_info->mapping_tree.map_tree;
2506 2585
2507 ret = btrfs_can_relocate(extent_root, chunk_offset);
2508 if (ret)
2509 return -ENOSPC;
2510
2511 /* step one, relocate all the extents inside this chunk */
2512 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2513 if (ret)
2514 return ret;
2515
2516 trans = btrfs_start_transaction(root, 0);
2517 if (IS_ERR(trans)) {
2518 ret = PTR_ERR(trans);
2519 btrfs_std_error(root->fs_info, ret);
2520 return ret;
2521 }
2522
2523 lock_chunks(root);
2524
2525 /*
2526 * step two, delete the device extents and the
2527 * chunk tree entries
2528 */
2529 read_lock(&em_tree->lock); 2586 read_lock(&em_tree->lock);
2530 em = lookup_extent_mapping(em_tree, chunk_offset, 1); 2587 em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2531 read_unlock(&em_tree->lock); 2588 read_unlock(&em_tree->lock);
2532 2589
2533 BUG_ON(!em || em->start > chunk_offset || 2590 if (!em || em->start > chunk_offset ||
2534 em->start + em->len < chunk_offset); 2591 em->start + em->len < chunk_offset) {
2592 /*
2593 * This is a logic error, but we don't want to just rely on the
2594 * user having built with ASSERT enabled, so if ASSERT doens't
2595 * do anything we still error out.
2596 */
2597 ASSERT(0);
2598 if (em)
2599 free_extent_map(em);
2600 return -EINVAL;
2601 }
2535 map = (struct map_lookup *)em->bdev; 2602 map = (struct map_lookup *)em->bdev;
2536 2603
2537 for (i = 0; i < map->num_stripes; i++) { 2604 for (i = 0; i < map->num_stripes; i++) {
2538 ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, 2605 struct btrfs_device *device = map->stripes[i].dev;
2539 map->stripes[i].physical); 2606 ret = btrfs_free_dev_extent(trans, device,
2540 BUG_ON(ret); 2607 map->stripes[i].physical,
2608 &dev_extent_len);
2609 if (ret) {
2610 btrfs_abort_transaction(trans, root, ret);
2611 goto out;
2612 }
2613
2614 if (device->bytes_used > 0) {
2615 lock_chunks(root);
2616 btrfs_device_set_bytes_used(device,
2617 device->bytes_used - dev_extent_len);
2618 spin_lock(&root->fs_info->free_chunk_lock);
2619 root->fs_info->free_chunk_space += dev_extent_len;
2620 spin_unlock(&root->fs_info->free_chunk_lock);
2621 btrfs_clear_space_info_full(root->fs_info);
2622 unlock_chunks(root);
2623 }
2541 2624
2542 if (map->stripes[i].dev) { 2625 if (map->stripes[i].dev) {
2543 ret = btrfs_update_device(trans, map->stripes[i].dev); 2626 ret = btrfs_update_device(trans, map->stripes[i].dev);
2544 BUG_ON(ret); 2627 if (ret) {
2628 btrfs_abort_transaction(trans, root, ret);
2629 goto out;
2630 }
2545 } 2631 }
2546 } 2632 }
2547 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, 2633 ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2548 chunk_offset); 2634 chunk_offset);
2549 2635 if (ret) {
2550 BUG_ON(ret); 2636 btrfs_abort_transaction(trans, root, ret);
2637 goto out;
2638 }
2551 2639
2552 trace_btrfs_chunk_free(root, map, chunk_offset, em->len); 2640 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2553 2641
2554 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 2642 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2555 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 2643 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2556 BUG_ON(ret); 2644 if (ret) {
2645 btrfs_abort_transaction(trans, root, ret);
2646 goto out;
2647 }
2557 } 2648 }
2558 2649
2559 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); 2650 ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
2560 BUG_ON(ret); 2651 if (ret) {
2652 btrfs_abort_transaction(trans, extent_root, ret);
2653 goto out;
2654 }
2561 2655
2562 write_lock(&em_tree->lock); 2656 write_lock(&em_tree->lock);
2563 remove_extent_mapping(em_tree, em); 2657 remove_extent_mapping(em_tree, em);
@@ -2565,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
2565 2659
2566 /* once for the tree */ 2660 /* once for the tree */
2567 free_extent_map(em); 2661 free_extent_map(em);
2662out:
2568 /* once for us */ 2663 /* once for us */
2569 free_extent_map(em); 2664 free_extent_map(em);
2665 return ret;
2666}
2570 2667
2571 unlock_chunks(root); 2668static int btrfs_relocate_chunk(struct btrfs_root *root,
2669 u64 chunk_tree, u64 chunk_objectid,
2670 u64 chunk_offset)
2671{
2672 struct btrfs_root *extent_root;
2673 struct btrfs_trans_handle *trans;
2674 int ret;
2675
2676 root = root->fs_info->chunk_root;
2677 extent_root = root->fs_info->extent_root;
2678
2679 ret = btrfs_can_relocate(extent_root, chunk_offset);
2680 if (ret)
2681 return -ENOSPC;
2682
2683 /* step one, relocate all the extents inside this chunk */
2684 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2685 if (ret)
2686 return ret;
2687
2688 trans = btrfs_start_transaction(root, 0);
2689 if (IS_ERR(trans)) {
2690 ret = PTR_ERR(trans);
2691 btrfs_std_error(root->fs_info, ret);
2692 return ret;
2693 }
2694
2695 /*
2696 * step two, delete the device extents and the
2697 * chunk tree entries
2698 */
2699 ret = btrfs_remove_chunk(trans, root, chunk_offset);
2572 btrfs_end_transaction(trans, root); 2700 btrfs_end_transaction(trans, root);
2573 return 0; 2701 return ret;
2574} 2702}
2575 2703
2576static int btrfs_relocate_sys_chunks(struct btrfs_root *root) 2704static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
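
Note the error-handling convention the extracted btrfs_remove_chunk() adopts throughout: the BUG_ON(ret) calls of the old code become transaction aborts, so a failure flips the filesystem read-only instead of crashing the kernel. The recurring shape, where some_chunk_op stands in for the diff's helpers:

	ret = some_chunk_op(trans, root, ...);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}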
@@ -2623,8 +2751,8 @@ again:
2623 found_key.offset); 2751 found_key.offset);
2624 if (ret == -ENOSPC) 2752 if (ret == -ENOSPC)
2625 failed++; 2753 failed++;
2626 else if (ret) 2754 else
2627 BUG(); 2755 BUG_ON(ret);
2628 } 2756 }
2629 2757
2630 if (found_key.offset == 0) 2758 if (found_key.offset == 0)
@@ -3031,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3031 /* step one make some room on all the devices */ 3159 /* step one make some room on all the devices */
3032 devices = &fs_info->fs_devices->devices; 3160 devices = &fs_info->fs_devices->devices;
3033 list_for_each_entry(device, devices, dev_list) { 3161 list_for_each_entry(device, devices, dev_list) {
3034 old_size = device->total_bytes; 3162 old_size = btrfs_device_get_total_bytes(device);
3035 size_to_free = div_factor(old_size, 1); 3163 size_to_free = div_factor(old_size, 1);
3036 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); 3164 size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3037 if (!device->writeable || 3165 if (!device->writeable ||
3038 device->total_bytes - device->bytes_used > size_to_free || 3166 btrfs_device_get_total_bytes(device) -
3167 btrfs_device_get_bytes_used(device) > size_to_free ||
3039 device->is_tgtdev_for_dev_replace) 3168 device->is_tgtdev_for_dev_replace)
3040 continue; 3169 continue;
3041 3170
@@ -3590,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data)
3590 max_key.type = BTRFS_ROOT_ITEM_KEY; 3719 max_key.type = BTRFS_ROOT_ITEM_KEY;
3591 max_key.offset = (u64)-1; 3720 max_key.offset = (u64)-1;
3592 3721
3593 path->keep_locks = 1;
3594
3595 while (1) { 3722 while (1) {
3596 ret = btrfs_search_forward(root, &key, path, 0); 3723 ret = btrfs_search_forward(root, &key, path, 0);
3597 if (ret) { 3724 if (ret) {
@@ -3843,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3843 struct btrfs_key key; 3970 struct btrfs_key key;
3844 struct btrfs_super_block *super_copy = root->fs_info->super_copy; 3971 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3845 u64 old_total = btrfs_super_total_bytes(super_copy); 3972 u64 old_total = btrfs_super_total_bytes(super_copy);
3846 u64 old_size = device->total_bytes; 3973 u64 old_size = btrfs_device_get_total_bytes(device);
3847 u64 diff = device->total_bytes - new_size; 3974 u64 diff = old_size - new_size;
3848 3975
3849 if (device->is_tgtdev_for_dev_replace) 3976 if (device->is_tgtdev_for_dev_replace)
3850 return -EINVAL; 3977 return -EINVAL;
@@ -3857,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3857 3984
3858 lock_chunks(root); 3985 lock_chunks(root);
3859 3986
3860 device->total_bytes = new_size; 3987 btrfs_device_set_total_bytes(device, new_size);
3861 if (device->writeable) { 3988 if (device->writeable) {
3862 device->fs_devices->total_rw_bytes -= diff; 3989 device->fs_devices->total_rw_bytes -= diff;
3863 spin_lock(&root->fs_info->free_chunk_lock); 3990 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3923,7 +4050,7 @@ again:
3923 ret = -ENOSPC; 4050 ret = -ENOSPC;
3924 lock_chunks(root); 4051 lock_chunks(root);
3925 4052
3926 device->total_bytes = old_size; 4053 btrfs_device_set_total_bytes(device, old_size);
3927 if (device->writeable) 4054 if (device->writeable)
3928 device->fs_devices->total_rw_bytes += diff; 4055 device->fs_devices->total_rw_bytes += diff;
3929 spin_lock(&root->fs_info->free_chunk_lock); 4056 spin_lock(&root->fs_info->free_chunk_lock);
@@ -3941,18 +4068,17 @@ again:
3941 } 4068 }
3942 4069
3943 lock_chunks(root); 4070 lock_chunks(root);
4071 btrfs_device_set_disk_total_bytes(device, new_size);
4072 if (list_empty(&device->resized_list))
4073 list_add_tail(&device->resized_list,
4074 &root->fs_info->fs_devices->resized_devices);
3944 4075
3945 device->disk_total_bytes = new_size;
3946 /* Now btrfs_update_device() will change the on-disk size. */
3947 ret = btrfs_update_device(trans, device);
3948 if (ret) {
3949 unlock_chunks(root);
3950 btrfs_end_transaction(trans, root);
3951 goto done;
3952 }
3953 WARN_ON(diff > old_total); 4076 WARN_ON(diff > old_total);
3954 btrfs_set_super_total_bytes(super_copy, old_total - diff); 4077 btrfs_set_super_total_bytes(super_copy, old_total - diff);
3955 unlock_chunks(root); 4078 unlock_chunks(root);
4079
4080 /* Now btrfs_update_device() will change the on-disk size. */
4081 ret = btrfs_update_device(trans, device);
3956 btrfs_end_transaction(trans, root); 4082 btrfs_end_transaction(trans, root);
3957done: 4083done:
3958 btrfs_free_path(path); 4084 btrfs_free_path(path);
@@ -3968,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3968 u32 array_size; 4094 u32 array_size;
3969 u8 *ptr; 4095 u8 *ptr;
3970 4096
4097 lock_chunks(root);
3971 array_size = btrfs_super_sys_array_size(super_copy); 4098 array_size = btrfs_super_sys_array_size(super_copy);
3972 if (array_size + item_size + sizeof(disk_key) 4099 if (array_size + item_size + sizeof(disk_key)
3973 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) 4100 > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4101 unlock_chunks(root);
3974 return -EFBIG; 4102 return -EFBIG;
4103 }
3975 4104
3976 ptr = super_copy->sys_chunk_array + array_size; 4105 ptr = super_copy->sys_chunk_array + array_size;
3977 btrfs_cpu_key_to_disk(&disk_key, key); 4106 btrfs_cpu_key_to_disk(&disk_key, key);
@@ -3980,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
3980 memcpy(ptr, chunk, item_size); 4109 memcpy(ptr, chunk, item_size);
3981 item_size += sizeof(disk_key); 4110 item_size += sizeof(disk_key);
3982 btrfs_set_super_sys_array_size(super_copy, array_size + item_size); 4111 btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4112 unlock_chunks(root);
4113
3983 return 0; 4114 return 0;
3984} 4115}
3985 4116
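btrfs_add_system_chunk() now takes the chunk lock around the whole array update, so the new early return for -EFBIG must drop the lock too. A small pthread sketch of that unlock-on-every-exit shape (names and sizes are invented, not the btrfs system-chunk-array layout):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define ARRAY_CAP 64

static pthread_mutex_t chunk_lock = PTHREAD_MUTEX_INITIALIZER;
static char sys_array[ARRAY_CAP];
static size_t array_size;

static int add_system_chunk(const char *item, size_t item_size)
{
	pthread_mutex_lock(&chunk_lock);
	if (array_size + item_size > ARRAY_CAP) {
		/* the error path must unlock as well */
		pthread_mutex_unlock(&chunk_lock);
		return -27;		/* -EFBIG */
	}
	memcpy(sys_array + array_size, item, item_size);
	array_size += item_size;
	pthread_mutex_unlock(&chunk_lock);
	return 0;
}

int main(void)
{
	printf("%d\n", add_system_chunk("chunk-item", 10));
	return 0;
}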
@@ -4349,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4349 if (ret) 4480 if (ret)
4350 goto error_del_extent; 4481 goto error_del_extent;
4351 4482
4483 for (i = 0; i < map->num_stripes; i++) {
4484 num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4485 btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4486 }
4487
4488 spin_lock(&extent_root->fs_info->free_chunk_lock);
4489 extent_root->fs_info->free_chunk_space -= (stripe_size *
4490 map->num_stripes);
4491 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4492
4352 free_extent_map(em); 4493 free_extent_map(em);
4353 check_raid56_incompat_flag(extent_root->fs_info, type); 4494 check_raid56_incompat_flag(extent_root->fs_info, type);
4354 4495
@@ -4420,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4420 device = map->stripes[i].dev; 4561 device = map->stripes[i].dev;
4421 dev_offset = map->stripes[i].physical; 4562 dev_offset = map->stripes[i].physical;
4422 4563
4423 device->bytes_used += stripe_size;
4424 ret = btrfs_update_device(trans, device); 4564 ret = btrfs_update_device(trans, device);
4425 if (ret) 4565 if (ret)
4426 goto out; 4566 goto out;
@@ -4433,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4433 goto out; 4573 goto out;
4434 } 4574 }
4435 4575
4436 spin_lock(&extent_root->fs_info->free_chunk_lock);
4437 extent_root->fs_info->free_chunk_space -= (stripe_size *
4438 map->num_stripes);
4439 spin_unlock(&extent_root->fs_info->free_chunk_lock);
4440
4441 stripe = &chunk->stripe; 4576 stripe = &chunk->stripe;
4442 for (i = 0; i < map->num_stripes; i++) { 4577 for (i = 0; i < map->num_stripes; i++) {
4443 device = map->stripes[i].dev; 4578 device = map->stripes[i].dev;
@@ -4517,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4517 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); 4652 alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4518 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, 4653 ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4519 alloc_profile); 4654 alloc_profile);
4520 if (ret) { 4655 return ret;
4521 btrfs_abort_transaction(trans, root, ret); 4656}
4522 goto out; 4657
4658static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4659{
4660 int max_errors;
4661
4662 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4663 BTRFS_BLOCK_GROUP_RAID10 |
4664 BTRFS_BLOCK_GROUP_RAID5 |
4665 BTRFS_BLOCK_GROUP_DUP)) {
4666 max_errors = 1;
4667 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4668 max_errors = 2;
4669 } else {
4670 max_errors = 0;
4523 } 4671 }
4524 4672
4525 ret = btrfs_add_device(trans, fs_info->chunk_root, device); 4673 return max_errors;
4526 if (ret)
4527 btrfs_abort_transaction(trans, root, ret);
4528out:
4529 return ret;
4530} 4674}
4531 4675
4532int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) 4676int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
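btrfs_chunk_max_errors() above centralizes how many device failures each block-group profile tolerates, and __btrfs_map_block() reuses it later in this patch. A standalone mirror of that computation (the flag bits here are illustrative, not the real BTRFS_BLOCK_GROUP_* values):

#include <stdio.h>

#define BG_RAID1	(1 << 0)
#define BG_DUP		(1 << 1)
#define BG_RAID10	(1 << 2)
#define BG_RAID5	(1 << 3)
#define BG_RAID6	(1 << 4)

static int chunk_max_errors(unsigned long type)
{
	/* one device may fail for mirrored and single-parity profiles */
	if (type & (BG_RAID1 | BG_RAID10 | BG_RAID5 | BG_DUP))
		return 1;
	/* two devices may fail for double parity */
	if (type & BG_RAID6)
		return 2;
	/* single/raid0: no redundancy */
	return 0;
}

int main(void)
{
	printf("raid6 tolerates %d failures\n", chunk_max_errors(BG_RAID6));
	printf("raid0 tolerates %d failures\n", chunk_max_errors(0));
	return 0;
}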
@@ -4535,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4535 struct map_lookup *map; 4679 struct map_lookup *map;
4536 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; 4680 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4537 int readonly = 0; 4681 int readonly = 0;
4682 int miss_ndevs = 0;
4538 int i; 4683 int i;
4539 4684
4540 read_lock(&map_tree->map_tree.lock); 4685 read_lock(&map_tree->map_tree.lock);
@@ -4543,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4543 if (!em) 4688 if (!em)
4544 return 1; 4689 return 1;
4545 4690
4546 if (btrfs_test_opt(root, DEGRADED)) {
4547 free_extent_map(em);
4548 return 0;
4549 }
4550
4551 map = (struct map_lookup *)em->bdev; 4691 map = (struct map_lookup *)em->bdev;
4552 for (i = 0; i < map->num_stripes; i++) { 4692 for (i = 0; i < map->num_stripes; i++) {
4693 if (map->stripes[i].dev->missing) {
4694 miss_ndevs++;
4695 continue;
4696 }
4697
4553 if (!map->stripes[i].dev->writeable) { 4698 if (!map->stripes[i].dev->writeable) {
4554 readonly = 1; 4699 readonly = 1;
4555 break; 4700 goto end;
4556 } 4701 }
4557 } 4702 }
4703
4704 /*
4705 * If the number of missing devices is larger than max errors,
 4706	 * we cannot write the data into that chunk successfully, so
4707 * set it readonly.
4708 */
4709 if (miss_ndevs > btrfs_chunk_max_errors(map))
4710 readonly = 1;
4711end:
4558 free_extent_map(em); 4712 free_extent_map(em);
4559 return readonly; 4713 return readonly;
4560} 4714}
@@ -4955,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4955 num_stripes = min_t(u64, map->num_stripes, 5109 num_stripes = min_t(u64, map->num_stripes,
4956 stripe_nr_end - stripe_nr_orig); 5110 stripe_nr_end - stripe_nr_orig);
4957 stripe_index = do_div(stripe_nr, map->num_stripes); 5111 stripe_index = do_div(stripe_nr, map->num_stripes);
5112 if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5113 mirror_num = 1;
4958 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 5114 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
4959 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) 5115 if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
4960 num_stripes = map->num_stripes; 5116 num_stripes = map->num_stripes;
@@ -5058,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5058 /* We distribute the parity blocks across stripes */ 5214 /* We distribute the parity blocks across stripes */
5059 tmp = stripe_nr + stripe_index; 5215 tmp = stripe_nr + stripe_index;
5060 stripe_index = do_div(tmp, map->num_stripes); 5216 stripe_index = do_div(tmp, map->num_stripes);
5217 if (!(rw & (REQ_WRITE | REQ_DISCARD |
5218 REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5219 mirror_num = 1;
5061 } 5220 }
5062 } else { 5221 } else {
5063 /* 5222 /*
@@ -5165,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5165 } 5324 }
5166 } 5325 }
5167 5326
5168 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { 5327 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5169 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 5328 max_errors = btrfs_chunk_max_errors(map);
5170 BTRFS_BLOCK_GROUP_RAID10 |
5171 BTRFS_BLOCK_GROUP_RAID5 |
5172 BTRFS_BLOCK_GROUP_DUP)) {
5173 max_errors = 1;
5174 } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5175 max_errors = 2;
5176 }
5177 }
5178 5329
5179 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5330 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5180 dev_replace->tgtdev != NULL) { 5331 dev_replace->tgtdev != NULL) {
@@ -5557,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5557 name = rcu_dereference(dev->name); 5708 name = rcu_dereference(dev->name);
5558 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " 5709 pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5559 "(%s id %llu), size=%u\n", rw, 5710 "(%s id %llu), size=%u\n", rw,
5560 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, 5711 (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5561 name->str, dev->devid, bio->bi_size); 5712 name->str, dev->devid, bio->bi_iter.bi_size);
5562 rcu_read_unlock(); 5713 rcu_read_unlock();
5563 } 5714 }
5564#endif 5715#endif
@@ -5736,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5736} 5887}
5737 5888
5738static struct btrfs_device *add_missing_dev(struct btrfs_root *root, 5889static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5890 struct btrfs_fs_devices *fs_devices,
5739 u64 devid, u8 *dev_uuid) 5891 u64 devid, u8 *dev_uuid)
5740{ 5892{
5741 struct btrfs_device *device; 5893 struct btrfs_device *device;
5742 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5743 5894
5744 device = btrfs_alloc_device(NULL, &devid, dev_uuid); 5895 device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5745 if (IS_ERR(device)) 5896 if (IS_ERR(device))
@@ -5800,7 +5951,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5800 else 5951 else
5801 generate_random_uuid(dev->uuid); 5952 generate_random_uuid(dev->uuid);
5802 5953
5803 btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL); 5954 btrfs_init_work(&dev->work, btrfs_submit_helper,
5955 pending_bios_fn, NULL, NULL);
5804 5956
5805 return dev; 5957 return dev;
5806} 5958}
@@ -5875,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
5875 } 6027 }
5876 if (!map->stripes[i].dev) { 6028 if (!map->stripes[i].dev) {
5877 map->stripes[i].dev = 6029 map->stripes[i].dev =
5878 add_missing_dev(root, devid, uuid); 6030 add_missing_dev(root, root->fs_info->fs_devices,
6031 devid, uuid);
5879 if (!map->stripes[i].dev) { 6032 if (!map->stripes[i].dev) {
5880 free_extent_map(em); 6033 free_extent_map(em);
5881 return -EIO; 6034 return -EIO;
@@ -5902,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5902 device->devid = btrfs_device_id(leaf, dev_item); 6055 device->devid = btrfs_device_id(leaf, dev_item);
5903 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); 6056 device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
5904 device->total_bytes = device->disk_total_bytes; 6057 device->total_bytes = device->disk_total_bytes;
6058 device->commit_total_bytes = device->disk_total_bytes;
5905 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); 6059 device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6060 device->commit_bytes_used = device->bytes_used;
5906 device->type = btrfs_device_type(leaf, dev_item); 6061 device->type = btrfs_device_type(leaf, dev_item);
5907 device->io_align = btrfs_device_io_align(leaf, dev_item); 6062 device->io_align = btrfs_device_io_align(leaf, dev_item);
5908 device->io_width = btrfs_device_io_width(leaf, dev_item); 6063 device->io_width = btrfs_device_io_width(leaf, dev_item);
@@ -5914,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf,
5914 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); 6069 read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
5915} 6070}
5916 6071
5917static int open_seed_devices(struct btrfs_root *root, u8 *fsid) 6072static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6073 u8 *fsid)
5918{ 6074{
5919 struct btrfs_fs_devices *fs_devices; 6075 struct btrfs_fs_devices *fs_devices;
5920 int ret; 6076 int ret;
@@ -5923,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
5923 6079
5924 fs_devices = root->fs_info->fs_devices->seed; 6080 fs_devices = root->fs_info->fs_devices->seed;
5925 while (fs_devices) { 6081 while (fs_devices) {
5926 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { 6082 if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
5927 ret = 0; 6083 return fs_devices;
5928 goto out; 6084
5929 }
5930 fs_devices = fs_devices->seed; 6085 fs_devices = fs_devices->seed;
5931 } 6086 }
5932 6087
5933 fs_devices = find_fsid(fsid); 6088 fs_devices = find_fsid(fsid);
5934 if (!fs_devices) { 6089 if (!fs_devices) {
5935 ret = -ENOENT; 6090 if (!btrfs_test_opt(root, DEGRADED))
5936 goto out; 6091 return ERR_PTR(-ENOENT);
6092
6093 fs_devices = alloc_fs_devices(fsid);
6094 if (IS_ERR(fs_devices))
6095 return fs_devices;
6096
6097 fs_devices->seeding = 1;
6098 fs_devices->opened = 1;
6099 return fs_devices;
5937 } 6100 }
5938 6101
5939 fs_devices = clone_fs_devices(fs_devices); 6102 fs_devices = clone_fs_devices(fs_devices);
5940 if (IS_ERR(fs_devices)) { 6103 if (IS_ERR(fs_devices))
5941 ret = PTR_ERR(fs_devices); 6104 return fs_devices;
5942 goto out;
5943 }
5944 6105
5945 ret = __btrfs_open_devices(fs_devices, FMODE_READ, 6106 ret = __btrfs_open_devices(fs_devices, FMODE_READ,
5946 root->fs_info->bdev_holder); 6107 root->fs_info->bdev_holder);
5947 if (ret) { 6108 if (ret) {
5948 free_fs_devices(fs_devices); 6109 free_fs_devices(fs_devices);
6110 fs_devices = ERR_PTR(ret);
5949 goto out; 6111 goto out;
5950 } 6112 }
5951 6113
5952 if (!fs_devices->seeding) { 6114 if (!fs_devices->seeding) {
5953 __btrfs_close_devices(fs_devices); 6115 __btrfs_close_devices(fs_devices);
5954 free_fs_devices(fs_devices); 6116 free_fs_devices(fs_devices);
5955 ret = -EINVAL; 6117 fs_devices = ERR_PTR(-EINVAL);
5956 goto out; 6118 goto out;
5957 } 6119 }
5958 6120
5959 fs_devices->seed = root->fs_info->fs_devices->seed; 6121 fs_devices->seed = root->fs_info->fs_devices->seed;
5960 root->fs_info->fs_devices->seed = fs_devices; 6122 root->fs_info->fs_devices->seed = fs_devices;
5961out: 6123out:
5962 return ret; 6124 return fs_devices;
5963} 6125}
5964 6126
5965static int read_one_dev(struct btrfs_root *root, 6127static int read_one_dev(struct btrfs_root *root,
5966 struct extent_buffer *leaf, 6128 struct extent_buffer *leaf,
5967 struct btrfs_dev_item *dev_item) 6129 struct btrfs_dev_item *dev_item)
5968{ 6130{
6131 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
5969 struct btrfs_device *device; 6132 struct btrfs_device *device;
5970 u64 devid; 6133 u64 devid;
5971 int ret; 6134 int ret;
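open_seed_devices() now returns the fs_devices pointer directly, encoding failures with ERR_PTR() instead of an int return. A userspace re-creation of that kernel convention, enough to show the IS_ERR()/PTR_ERR() round trip:

#include <stdio.h>
#include <errno.h>

/* Errors ride in the pointer itself: the top 4095 addresses are
 * reserved for negative errno values, as in the kernel's err.h. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int the_devices = 42;

static void *open_seed_devices(int want_missing)
{
	if (want_missing)
		return ERR_PTR(-ENOENT);	/* no fs_devices found */
	return &the_devices;
}

int main(void)
{
	void *fs_devices = open_seed_devices(1);

	if (IS_ERR(fs_devices))
		printf("failed: %ld\n", PTR_ERR(fs_devices));
	else
		printf("got devices: %d\n", *(int *)fs_devices);
	return 0;
}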
@@ -5979,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root,
5979 BTRFS_UUID_SIZE); 6142 BTRFS_UUID_SIZE);
5980 6143
5981 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { 6144 if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
5982 ret = open_seed_devices(root, fs_uuid); 6145 fs_devices = open_seed_devices(root, fs_uuid);
5983 if (ret && !btrfs_test_opt(root, DEGRADED)) 6146 if (IS_ERR(fs_devices))
5984 return ret; 6147 return PTR_ERR(fs_devices);
5985 } 6148 }
5986 6149
5987 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); 6150 device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
5988 if (!device || !device->bdev) { 6151 if (!device) {
5989 if (!btrfs_test_opt(root, DEGRADED)) 6152 if (!btrfs_test_opt(root, DEGRADED))
5990 return -EIO; 6153 return -EIO;
5991 6154
5992 if (!device) { 6155 btrfs_warn(root->fs_info, "devid %llu missing", devid);
5993 btrfs_warn(root->fs_info, "devid %llu missing", devid); 6156 device = add_missing_dev(root, fs_devices, devid, dev_uuid);
5994 device = add_missing_dev(root, devid, dev_uuid); 6157 if (!device)
5995 if (!device) 6158 return -ENOMEM;
5996 return -ENOMEM; 6159 } else {
5997 } else if (!device->missing) { 6160 if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6161 return -EIO;
6162
 6163		if (!device->bdev && !device->missing) {
5998 /* 6164 /*
5999 * this happens when a device that was properly setup 6165 * this happens when a device that was properly setup
6000 * in the device info lists suddenly goes bad. 6166 * in the device info lists suddenly goes bad.
6001 * device->bdev is NULL, and so we have to set 6167 * device->bdev is NULL, and so we have to set
6002 * device->missing to one here 6168 * device->missing to one here
6003 */ 6169 */
6004 root->fs_info->fs_devices->missing_devices++; 6170 device->fs_devices->missing_devices++;
6005 device->missing = 1; 6171 device->missing = 1;
6006 } 6172 }
6173
6174 /* Move the device to its own fs_devices */
6175 if (device->fs_devices != fs_devices) {
6176 ASSERT(device->missing);
6177
6178 list_move(&device->dev_list, &fs_devices->devices);
6179 device->fs_devices->num_devices--;
6180 fs_devices->num_devices++;
6181
6182 device->fs_devices->missing_devices--;
6183 fs_devices->missing_devices++;
6184
6185 device->fs_devices = fs_devices;
6186 }
6007 } 6187 }
6008 6188
6009 if (device->fs_devices != root->fs_info->fs_devices) { 6189 if (device->fs_devices != root->fs_info->fs_devices) {
@@ -6319,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6319 struct btrfs_root *dev_root = fs_info->dev_root; 6499 struct btrfs_root *dev_root = fs_info->dev_root;
6320 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6500 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6321 struct btrfs_device *device; 6501 struct btrfs_device *device;
6502 int stats_cnt;
6322 int ret = 0; 6503 int ret = 0;
6323 6504
6324 mutex_lock(&fs_devices->device_list_mutex); 6505 mutex_lock(&fs_devices->device_list_mutex);
6325 list_for_each_entry(device, &fs_devices->devices, dev_list) { 6506 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6326 if (!device->dev_stats_valid || !device->dev_stats_dirty) 6507 if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6327 continue; 6508 continue;
6328 6509
6510 stats_cnt = atomic_read(&device->dev_stats_ccnt);
6329 ret = update_dev_stat_item(trans, dev_root, device); 6511 ret = update_dev_stat_item(trans, dev_root, device);
6330 if (!ret) 6512 if (!ret)
6331 device->dev_stats_dirty = 0; 6513 atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6332 } 6514 }
6333 mutex_unlock(&fs_devices->device_list_mutex); 6515 mutex_unlock(&fs_devices->device_list_mutex);
6334 6516
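The btrfs_run_dev_stats() hunk above swaps the dev_stats_dirty flag for a change counter: the flusher snapshots dev_stats_ccnt, writes the item, and subtracts only the snapshot, so updates that race with the flush leave the device dirty. A userspace sketch of the same scheme using C11 atomics (flush_to_disk() and the other names are stand-ins):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int stats_ccnt;	/* bumped on every stat update */

static void record_io_error(void)
{
	/* ... bump the actual stat value here ... */
	atomic_fetch_add(&stats_ccnt, 1);
}

static int flush_to_disk(void)
{
	return 0;	/* pretend the update item succeeded */
}

static void run_dev_stats(void)
{
	int snapshot = atomic_load(&stats_ccnt);

	if (!snapshot)
		return;			/* nothing dirty */
	if (!flush_to_disk())
		/* subtract only what we flushed; updates made after the
		 * snapshot keep the counter non-zero */
		atomic_fetch_sub(&stats_ccnt, snapshot);
}

int main(void)
{
	record_io_error();
	run_dev_stats();
	printf("remaining dirty count: %d\n", atomic_load(&stats_ccnt));
	return 0;
}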
@@ -6427,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device)
6427 6609
6428 return 0; 6610 return 0;
6429} 6611}
6612
6613/*
 6614 * Update the committed size of all resized devices; the result is
 6615 * used when writing out the super blocks.
6616 */
6617void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6618{
6619 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6620 struct btrfs_device *curr, *next;
6621
6622 if (list_empty(&fs_devices->resized_devices))
6623 return;
6624
6625 mutex_lock(&fs_devices->device_list_mutex);
6626 lock_chunks(fs_info->dev_root);
6627 list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6628 resized_list) {
6629 list_del_init(&curr->resized_list);
6630 curr->commit_total_bytes = curr->disk_total_bytes;
6631 }
6632 unlock_chunks(fs_info->dev_root);
6633 mutex_unlock(&fs_devices->device_list_mutex);
6634}
6635
6636/* Must be invoked during the transaction commit */
6637void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6638 struct btrfs_transaction *transaction)
6639{
6640 struct extent_map *em;
6641 struct map_lookup *map;
6642 struct btrfs_device *dev;
6643 int i;
6644
6645 if (list_empty(&transaction->pending_chunks))
6646 return;
6647
6648 /* In order to kick the device replace finish process */
6649 lock_chunks(root);
6650 list_for_each_entry(em, &transaction->pending_chunks, list) {
6651 map = (struct map_lookup *)em->bdev;
6652
6653 for (i = 0; i < map->num_stripes; i++) {
6654 dev = map->stripes[i].dev;
6655 dev->commit_bytes_used = dev->bytes_used;
6656 }
6657 }
6658 unlock_chunks(root);
6659}
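btrfs_update_commit_device_size() above copies each resized device's in-memory disk_total_bytes into commit_total_bytes at commit time; shrink and grow queue the device on resized_devices beforehand. A toy single-threaded model of that staging list (the struct layout here is invented, not the kernel's):

#include <stdio.h>

struct device {
	unsigned long long disk_total_bytes;	/* staged size */
	unsigned long long commit_total_bytes;	/* what supers see */
	struct device *next_resized;		/* NULL when not queued */
};

static struct device *resized_head;

static void shrink_device(struct device *dev, unsigned long long new_size)
{
	dev->disk_total_bytes = new_size;
	/* queue once; skip if already on the list */
	if (!dev->next_resized && resized_head != dev) {
		dev->next_resized = resized_head;
		resized_head = dev;
	}
}

static void update_commit_device_size(void)
{
	struct device *dev = resized_head;

	while (dev) {
		struct device *next = dev->next_resized;

		dev->commit_total_bytes = dev->disk_total_bytes;
		dev->next_resized = NULL;
		dev = next;
	}
	resized_head = NULL;
}

int main(void)
{
	struct device d = { 1000, 1000, NULL };

	shrink_device(&d, 800);
	printf("before commit: %llu\n", d.commit_total_bytes);	/* 1000 */
	update_commit_device_size();
	printf("after commit:  %llu\n", d.commit_total_bytes);	/* 800 */
	return 0;
}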
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2aaa00c47816..08980fa23039 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -24,6 +24,8 @@
24#include <linux/btrfs.h> 24#include <linux/btrfs.h>
25#include "async-thread.h" 25#include "async-thread.h"
26 26
27extern struct mutex uuid_mutex;
28
27#define BTRFS_STRIPE_LEN (64 * 1024) 29#define BTRFS_STRIPE_LEN (64 * 1024)
28 30
29struct buffer_head; 31struct buffer_head;
@@ -32,41 +34,59 @@ struct btrfs_pending_bios {
32 struct bio *tail; 34 struct bio *tail;
33}; 35};
34 36
37/*
38 * Use sequence counter to get consistent device stat data on
39 * 32-bit processors.
40 */
41#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
42#include <linux/seqlock.h>
43#define __BTRFS_NEED_DEVICE_DATA_ORDERED
44#define btrfs_device_data_ordered_init(device) \
45 seqcount_init(&device->data_seqcount)
46#else
47#define btrfs_device_data_ordered_init(device) do { } while (0)
48#endif
49
35struct btrfs_device { 50struct btrfs_device {
36 struct list_head dev_list; 51 struct list_head dev_list;
37 struct list_head dev_alloc_list; 52 struct list_head dev_alloc_list;
38 struct btrfs_fs_devices *fs_devices; 53 struct btrfs_fs_devices *fs_devices;
54
39 struct btrfs_root *dev_root; 55 struct btrfs_root *dev_root;
40 56
57 struct rcu_string *name;
58
59 u64 generation;
60
61 spinlock_t io_lock ____cacheline_aligned;
62 int running_pending;
41 /* regular prio bios */ 63 /* regular prio bios */
42 struct btrfs_pending_bios pending_bios; 64 struct btrfs_pending_bios pending_bios;
43 /* WRITE_SYNC bios */ 65 /* WRITE_SYNC bios */
44 struct btrfs_pending_bios pending_sync_bios; 66 struct btrfs_pending_bios pending_sync_bios;
45 67
46 u64 generation; 68 struct block_device *bdev;
47 int running_pending; 69
70 /* the mode sent to blkdev_get */
71 fmode_t mode;
72
48 int writeable; 73 int writeable;
49 int in_fs_metadata; 74 int in_fs_metadata;
50 int missing; 75 int missing;
51 int can_discard; 76 int can_discard;
52 int is_tgtdev_for_dev_replace; 77 int is_tgtdev_for_dev_replace;
53 78
54 spinlock_t io_lock; 79#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED
55 /* the mode sent to blkdev_get */ 80 seqcount_t data_seqcount;
56 fmode_t mode; 81#endif
57
58 struct block_device *bdev;
59
60
61 struct rcu_string *name;
62 82
63 /* the internal btrfs device id */ 83 /* the internal btrfs device id */
64 u64 devid; 84 u64 devid;
65 85
66 /* size of the device */ 86 /* size of the device in memory */
67 u64 total_bytes; 87 u64 total_bytes;
68 88
69 /* size of the disk */ 89 /* size of the device on disk */
70 u64 disk_total_bytes; 90 u64 disk_total_bytes;
71 91
72 /* bytes used */ 92 /* bytes used */
@@ -83,10 +103,26 @@ struct btrfs_device {
83 /* minimal io size for this device */ 103 /* minimal io size for this device */
84 u32 sector_size; 104 u32 sector_size;
85 105
86
87 /* physical drive uuid (or lvm uuid) */ 106 /* physical drive uuid (or lvm uuid) */
88 u8 uuid[BTRFS_UUID_SIZE]; 107 u8 uuid[BTRFS_UUID_SIZE];
89 108
109 /*
 110 * size of the device as of the current transaction
 111 *
 112 * This variant is updated when committing the transaction,
 113 * and is protected by device_list_mutex
114 */
115 u64 commit_total_bytes;
116
117 /* bytes used on the current transaction */
118 u64 commit_bytes_used;
119 /*
 120 * used to track a device that is being resized
121 *
122 * It is protected by chunk_lock.
123 */
124 struct list_head resized_list;
125
90 /* for sending down flush barriers */ 126 /* for sending down flush barriers */
91 int nobarriers; 127 int nobarriers;
92 struct bio *flush_bio; 128 struct bio *flush_bio;
@@ -107,26 +143,90 @@ struct btrfs_device {
107 struct radix_tree_root reada_zones; 143 struct radix_tree_root reada_zones;
108 struct radix_tree_root reada_extents; 144 struct radix_tree_root reada_extents;
109 145
110
111 /* disk I/O failure stats. For detailed description refer to 146 /* disk I/O failure stats. For detailed description refer to
112 * enum btrfs_dev_stat_values in ioctl.h */ 147 * enum btrfs_dev_stat_values in ioctl.h */
113 int dev_stats_valid; 148 int dev_stats_valid;
114 int dev_stats_dirty; /* counters need to be written to disk */ 149
 150 /* counter recording changes to the device stats */
151 atomic_t dev_stats_ccnt;
115 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 152 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
116}; 153};
117 154
155/*
 156 * If we read these variants while holding their own lock, we needn't
 157 * use the following helpers; reading them directly is safe.
158 */
159#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
160#define BTRFS_DEVICE_GETSET_FUNCS(name) \
161static inline u64 \
162btrfs_device_get_##name(const struct btrfs_device *dev) \
163{ \
164 u64 size; \
165 unsigned int seq; \
166 \
167 do { \
168 seq = read_seqcount_begin(&dev->data_seqcount); \
169 size = dev->name; \
170 } while (read_seqcount_retry(&dev->data_seqcount, seq)); \
171 return size; \
172} \
173 \
174static inline void \
175btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
176{ \
177 preempt_disable(); \
178 write_seqcount_begin(&dev->data_seqcount); \
179 dev->name = size; \
180 write_seqcount_end(&dev->data_seqcount); \
181 preempt_enable(); \
182}
183#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT)
184#define BTRFS_DEVICE_GETSET_FUNCS(name) \
185static inline u64 \
186btrfs_device_get_##name(const struct btrfs_device *dev) \
187{ \
188 u64 size; \
189 \
190 preempt_disable(); \
191 size = dev->name; \
192 preempt_enable(); \
193 return size; \
194} \
195 \
196static inline void \
197btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
198{ \
199 preempt_disable(); \
200 dev->name = size; \
201 preempt_enable(); \
202}
203#else
204#define BTRFS_DEVICE_GETSET_FUNCS(name) \
205static inline u64 \
206btrfs_device_get_##name(const struct btrfs_device *dev) \
207{ \
208 return dev->name; \
209} \
210 \
211static inline void \
212btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \
213{ \
214 dev->name = size; \
215}
216#endif
217
218BTRFS_DEVICE_GETSET_FUNCS(total_bytes);
219BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes);
220BTRFS_DEVICE_GETSET_FUNCS(bytes_used);
221
118struct btrfs_fs_devices { 222struct btrfs_fs_devices {
119 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 223 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */
120 224
121 /* the device with this id has the most recent copy of the super */
122 u64 latest_devid;
123 u64 latest_trans;
124 u64 num_devices; 225 u64 num_devices;
125 u64 open_devices; 226 u64 open_devices;
126 u64 rw_devices; 227 u64 rw_devices;
127 u64 missing_devices; 228 u64 missing_devices;
128 u64 total_rw_bytes; 229 u64 total_rw_bytes;
129 u64 num_can_discard;
130 u64 total_devices; 230 u64 total_devices;
131 struct block_device *latest_bdev; 231 struct block_device *latest_bdev;
132 232
@@ -139,6 +239,7 @@ struct btrfs_fs_devices {
139 struct mutex device_list_mutex; 239 struct mutex device_list_mutex;
140 struct list_head devices; 240 struct list_head devices;
141 241
242 struct list_head resized_devices;
142 /* devices not currently being allocated */ 243 /* devices not currently being allocated */
143 struct list_head alloc_list; 244 struct list_head alloc_list;
144 struct list_head list; 245 struct list_head list;
@@ -167,8 +268,9 @@ struct btrfs_fs_devices {
167 */ 268 */
168typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); 269typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
169struct btrfs_io_bio { 270struct btrfs_io_bio {
170 unsigned long mirror_num; 271 unsigned int mirror_num;
171 unsigned long stripe_index; 272 unsigned int stripe_index;
273 u64 logical;
172 u8 *csum; 274 u8 *csum;
173 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; 275 u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
174 u8 *csum_allocated; 276 u8 *csum_allocated;
@@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
325int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 427int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
326int btrfs_init_new_device(struct btrfs_root *root, char *path); 428int btrfs_init_new_device(struct btrfs_root *root, char *path);
327int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, 429int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
430 struct btrfs_device *srcdev,
328 struct btrfs_device **device_out); 431 struct btrfs_device **device_out);
329int btrfs_balance(struct btrfs_balance_control *bctl, 432int btrfs_balance(struct btrfs_balance_control *bctl,
330 struct btrfs_ioctl_balance_args *bargs); 433 struct btrfs_ioctl_balance_args *bargs);
@@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
360int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, 463int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
361 struct btrfs_root *extent_root, 464 struct btrfs_root *extent_root,
362 u64 chunk_offset, u64 chunk_size); 465 u64 chunk_offset, u64 chunk_size);
466int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
467 struct btrfs_root *root, u64 chunk_offset);
468
469static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev)
470{
471 return atomic_read(&dev->dev_stats_ccnt);
472}
473
363static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, 474static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
364 int index) 475 int index)
365{ 476{
366 atomic_inc(dev->dev_stat_values + index); 477 atomic_inc(dev->dev_stat_values + index);
367 dev->dev_stats_dirty = 1; 478 smp_mb__before_atomic();
479 atomic_inc(&dev->dev_stats_ccnt);
368} 480}
369 481
370static inline int btrfs_dev_stat_read(struct btrfs_device *dev, 482static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
@@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
379 int ret; 491 int ret;
380 492
381 ret = atomic_xchg(dev->dev_stat_values + index, 0); 493 ret = atomic_xchg(dev->dev_stat_values + index, 0);
382 dev->dev_stats_dirty = 1; 494 smp_mb__before_atomic();
495 atomic_inc(&dev->dev_stats_ccnt);
383 return ret; 496 return ret;
384} 497}
385 498
@@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
387 int index, unsigned long val) 500 int index, unsigned long val)
388{ 501{
389 atomic_set(dev->dev_stat_values + index, val); 502 atomic_set(dev->dev_stat_values + index, val);
390 dev->dev_stats_dirty = 1; 503 smp_mb__before_atomic();
504 atomic_inc(&dev->dev_stats_ccnt);
391} 505}
392 506
393static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, 507static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
@@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
395{ 509{
396 btrfs_dev_stat_set(dev, index, 0); 510 btrfs_dev_stat_set(dev, index, 0);
397} 511}
512
513void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info);
514void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
515 struct btrfs_transaction *transaction);
398#endif 516#endif
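The BTRFS_DEVICE_GETSET_FUNCS macros above exist because a 64-bit load can tear on 32-bit SMP: readers retry under a sequence counter until they see a stable, even count. A simplified userspace version of that retry loop using C11 atomics (single writer assumed; the real seqcount adds stricter memory barriers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint seq;		/* odd while a writer is active */
static uint64_t total_bytes;	/* the value readers must see whole */

static void set_total_bytes(uint64_t v)
{
	atomic_fetch_add(&seq, 1);	/* count goes odd: write in progress */
	total_bytes = v;
	atomic_fetch_add(&seq, 1);	/* count goes even: write complete */
}

static uint64_t get_total_bytes(void)
{
	unsigned int s;
	uint64_t v;

	do {
		s = atomic_load(&seq);
		v = total_bytes;
		/* retry if a write was in flight or started meanwhile */
	} while ((s & 1) || s != atomic_load(&seq));
	return v;
}

int main(void)
{
	set_total_bytes(1ULL << 40);
	printf("%llu\n", (unsigned long long)get_total_bytes());
	return 0;
}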
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ad8328d797ea..dcf20131fbe4 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
237 * first xattr that we find and walk forward 237 * first xattr that we find and walk forward
238 */ 238 */
239 key.objectid = btrfs_ino(inode); 239 key.objectid = btrfs_ino(inode);
240 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 240 key.type = BTRFS_XATTR_ITEM_KEY;
241 key.offset = 0; 241 key.offset = 0;
242 242
243 path = btrfs_alloc_path(); 243 path = btrfs_alloc_path();
@@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
273 /* check to make sure this item is what we want */ 273 /* check to make sure this item is what we want */
274 if (found_key.objectid != key.objectid) 274 if (found_key.objectid != key.objectid)
275 break; 275 break;
276 if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) 276 if (found_key.type != BTRFS_XATTR_ITEM_KEY)
277 break; 277 break;
278 278
279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 279 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b67d8fc81277..759fa4e2de8f 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -33,8 +33,7 @@
33#include "compression.h" 33#include "compression.h"
34 34
35struct workspace { 35struct workspace {
36 z_stream inf_strm; 36 z_stream strm;
37 z_stream def_strm;
38 char *buf; 37 char *buf;
39 struct list_head list; 38 struct list_head list;
40}; 39};
@@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws)
43{ 42{
44 struct workspace *workspace = list_entry(ws, struct workspace, list); 43 struct workspace *workspace = list_entry(ws, struct workspace, list);
45 44
46 vfree(workspace->def_strm.workspace); 45 vfree(workspace->strm.workspace);
47 vfree(workspace->inf_strm.workspace);
48 kfree(workspace->buf); 46 kfree(workspace->buf);
49 kfree(workspace); 47 kfree(workspace);
50} 48}
@@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws)
52static struct list_head *zlib_alloc_workspace(void) 50static struct list_head *zlib_alloc_workspace(void)
53{ 51{
54 struct workspace *workspace; 52 struct workspace *workspace;
53 int workspacesize;
55 54
56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 55 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
57 if (!workspace) 56 if (!workspace)
58 return ERR_PTR(-ENOMEM); 57 return ERR_PTR(-ENOMEM);
59 58
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( 59 workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
61 MAX_WBITS, MAX_MEM_LEVEL)); 60 zlib_inflate_workspacesize());
62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->strm.workspace = vmalloc(workspacesize);
63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
64 if (!workspace->def_strm.workspace || 63 if (!workspace->strm.workspace || !workspace->buf)
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail; 64 goto fail;
67 65
68 INIT_LIST_HEAD(&workspace->list); 66 INIT_LIST_HEAD(&workspace->list);
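zlib_alloc_workspace() now allocates one buffer sized for whichever of deflate or inflate needs more, since the two never run concurrently on the same workspace. A sketch of the sizing idea; the two *_workspacesize() functions below return made-up numbers standing in for the kernel's zlib_deflate_workspacesize()/zlib_inflate_workspacesize():

#include <stdio.h>
#include <stdlib.h>

static size_t deflate_workspacesize(void) { return 268 * 1024; }
static size_t inflate_workspacesize(void) { return  64 * 1024; }

#define max(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	size_t size = max(deflate_workspacesize(), inflate_workspacesize());
	void *workspace = malloc(size);

	if (!workspace)
		return 1;
	printf("shared workspace: %zu bytes\n", size);
	/* compression and decompression now reuse this one buffer,
	 * serialized by the workspace list that hands it out */
	free(workspace);
	return 0;
}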
@@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws,
96 *total_out = 0; 94 *total_out = 0;
97 *total_in = 0; 95 *total_in = 0;
98 96
99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 97 if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) {
100 printk(KERN_WARNING "BTRFS: deflateInit failed\n"); 98 printk(KERN_WARNING "BTRFS: deflateInit failed\n");
101 ret = -EIO; 99 ret = -EIO;
102 goto out; 100 goto out;
103 } 101 }
104 102
105 workspace->def_strm.total_in = 0; 103 workspace->strm.total_in = 0;
106 workspace->def_strm.total_out = 0; 104 workspace->strm.total_out = 0;
107 105
108 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); 106 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
109 data_in = kmap(in_page); 107 data_in = kmap(in_page);
@@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws,
117 pages[0] = out_page; 115 pages[0] = out_page;
118 nr_pages = 1; 116 nr_pages = 1;
119 117
120 workspace->def_strm.next_in = data_in; 118 workspace->strm.next_in = data_in;
121 workspace->def_strm.next_out = cpage_out; 119 workspace->strm.next_out = cpage_out;
122 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 120 workspace->strm.avail_out = PAGE_CACHE_SIZE;
123 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 121 workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE);
124 122
125 while (workspace->def_strm.total_in < len) { 123 while (workspace->strm.total_in < len) {
126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 124 ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH);
127 if (ret != Z_OK) { 125 if (ret != Z_OK) {
128 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", 126 printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n",
129 ret); 127 ret);
130 zlib_deflateEnd(&workspace->def_strm); 128 zlib_deflateEnd(&workspace->strm);
131 ret = -EIO; 129 ret = -EIO;
132 goto out; 130 goto out;
133 } 131 }
134 132
135 /* we're making it bigger, give up */ 133 /* we're making it bigger, give up */
136 if (workspace->def_strm.total_in > 8192 && 134 if (workspace->strm.total_in > 8192 &&
137 workspace->def_strm.total_in < 135 workspace->strm.total_in <
138 workspace->def_strm.total_out) { 136 workspace->strm.total_out) {
139 ret = -E2BIG; 137 ret = -E2BIG;
140 goto out; 138 goto out;
141 } 139 }
@@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws,
143 * before the total_in so we will pull in a new page for 141 * before the total_in so we will pull in a new page for
144 * the stream end if required 142 * the stream end if required
145 */ 143 */
146 if (workspace->def_strm.avail_out == 0) { 144 if (workspace->strm.avail_out == 0) {
147 kunmap(out_page); 145 kunmap(out_page);
148 if (nr_pages == nr_dest_pages) { 146 if (nr_pages == nr_dest_pages) {
149 out_page = NULL; 147 out_page = NULL;
@@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws,
158 cpage_out = kmap(out_page); 156 cpage_out = kmap(out_page);
159 pages[nr_pages] = out_page; 157 pages[nr_pages] = out_page;
160 nr_pages++; 158 nr_pages++;
161 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 159 workspace->strm.avail_out = PAGE_CACHE_SIZE;
162 workspace->def_strm.next_out = cpage_out; 160 workspace->strm.next_out = cpage_out;
163 } 161 }
164 /* we're all done */ 162 /* we're all done */
165 if (workspace->def_strm.total_in >= len) 163 if (workspace->strm.total_in >= len)
166 break; 164 break;
167 165
168 /* we've read in a full page, get a new one */ 166 /* we've read in a full page, get a new one */
169 if (workspace->def_strm.avail_in == 0) { 167 if (workspace->strm.avail_in == 0) {
170 if (workspace->def_strm.total_out > max_out) 168 if (workspace->strm.total_out > max_out)
171 break; 169 break;
172 170
173 bytes_left = len - workspace->def_strm.total_in; 171 bytes_left = len - workspace->strm.total_in;
174 kunmap(in_page); 172 kunmap(in_page);
175 page_cache_release(in_page); 173 page_cache_release(in_page);
176 174
@@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws,
178 in_page = find_get_page(mapping, 176 in_page = find_get_page(mapping,
179 start >> PAGE_CACHE_SHIFT); 177 start >> PAGE_CACHE_SHIFT);
180 data_in = kmap(in_page); 178 data_in = kmap(in_page);
181 workspace->def_strm.avail_in = min(bytes_left, 179 workspace->strm.avail_in = min(bytes_left,
182 PAGE_CACHE_SIZE); 180 PAGE_CACHE_SIZE);
183 workspace->def_strm.next_in = data_in; 181 workspace->strm.next_in = data_in;
184 } 182 }
185 } 183 }
186 workspace->def_strm.avail_in = 0; 184 workspace->strm.avail_in = 0;
187 ret = zlib_deflate(&workspace->def_strm, Z_FINISH); 185 ret = zlib_deflate(&workspace->strm, Z_FINISH);
188 zlib_deflateEnd(&workspace->def_strm); 186 zlib_deflateEnd(&workspace->strm);
189 187
190 if (ret != Z_STREAM_END) { 188 if (ret != Z_STREAM_END) {
191 ret = -EIO; 189 ret = -EIO;
192 goto out; 190 goto out;
193 } 191 }
194 192
195 if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { 193 if (workspace->strm.total_out >= workspace->strm.total_in) {
196 ret = -E2BIG; 194 ret = -E2BIG;
197 goto out; 195 goto out;
198 } 196 }
199 197
200 ret = 0; 198 ret = 0;
201 *total_out = workspace->def_strm.total_out; 199 *total_out = workspace->strm.total_out;
202 *total_in = workspace->def_strm.total_in; 200 *total_in = workspace->strm.total_in;
203out: 201out:
204 *out_pages = nr_pages; 202 *out_pages = nr_pages;
205 if (out_page) 203 if (out_page)
@@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
225 size_t total_out = 0; 223 size_t total_out = 0;
226 unsigned long page_in_index = 0; 224 unsigned long page_in_index = 0;
227 unsigned long page_out_index = 0; 225 unsigned long page_out_index = 0;
228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 226 unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE);
229 PAGE_CACHE_SIZE;
230 unsigned long buf_start; 227 unsigned long buf_start;
231 unsigned long pg_offset; 228 unsigned long pg_offset;
232 229
233 data_in = kmap(pages_in[page_in_index]); 230 data_in = kmap(pages_in[page_in_index]);
234 workspace->inf_strm.next_in = data_in; 231 workspace->strm.next_in = data_in;
235 workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); 232 workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE);
236 workspace->inf_strm.total_in = 0; 233 workspace->strm.total_in = 0;
237 234
238 workspace->inf_strm.total_out = 0; 235 workspace->strm.total_out = 0;
239 workspace->inf_strm.next_out = workspace->buf; 236 workspace->strm.next_out = workspace->buf;
240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 237 workspace->strm.avail_out = PAGE_CACHE_SIZE;
241 pg_offset = 0; 238 pg_offset = 0;
242 239
243 /* If it's deflate, and it's got no preset dictionary, then 240 /* If it's deflate, and it's got no preset dictionary, then
@@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
247 !(((data_in[0]<<8) + data_in[1]) % 31)) { 244 !(((data_in[0]<<8) + data_in[1]) % 31)) {
248 245
249 wbits = -((data_in[0] >> 4) + 8); 246 wbits = -((data_in[0] >> 4) + 8);
250 workspace->inf_strm.next_in += 2; 247 workspace->strm.next_in += 2;
251 workspace->inf_strm.avail_in -= 2; 248 workspace->strm.avail_in -= 2;
252 } 249 }
253 250
254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 251 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
255 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 252 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
256 return -EIO; 253 return -EIO;
257 } 254 }
258 while (workspace->inf_strm.total_in < srclen) { 255 while (workspace->strm.total_in < srclen) {
259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 256 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
260 if (ret != Z_OK && ret != Z_STREAM_END) 257 if (ret != Z_OK && ret != Z_STREAM_END)
261 break; 258 break;
262 259
263 buf_start = total_out; 260 buf_start = total_out;
264 total_out = workspace->inf_strm.total_out; 261 total_out = workspace->strm.total_out;
265 262
266 /* we didn't make progress in this inflate call, we're done */ 263 /* we didn't make progress in this inflate call, we're done */
267 if (buf_start == total_out) 264 if (buf_start == total_out)
@@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
276 goto done; 273 goto done;
277 } 274 }
278 275
279 workspace->inf_strm.next_out = workspace->buf; 276 workspace->strm.next_out = workspace->buf;
280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 277 workspace->strm.avail_out = PAGE_CACHE_SIZE;
281 278
282 if (workspace->inf_strm.avail_in == 0) { 279 if (workspace->strm.avail_in == 0) {
283 unsigned long tmp; 280 unsigned long tmp;
284 kunmap(pages_in[page_in_index]); 281 kunmap(pages_in[page_in_index]);
285 page_in_index++; 282 page_in_index++;
@@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
288 break; 285 break;
289 } 286 }
290 data_in = kmap(pages_in[page_in_index]); 287 data_in = kmap(pages_in[page_in_index]);
291 workspace->inf_strm.next_in = data_in; 288 workspace->strm.next_in = data_in;
292 tmp = srclen - workspace->inf_strm.total_in; 289 tmp = srclen - workspace->strm.total_in;
293 workspace->inf_strm.avail_in = min(tmp, 290 workspace->strm.avail_in = min(tmp,
294 PAGE_CACHE_SIZE); 291 PAGE_CACHE_SIZE);
295 } 292 }
296 } 293 }
@@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
299 else 296 else
300 ret = 0; 297 ret = 0;
301done: 298done:
302 zlib_inflateEnd(&workspace->inf_strm); 299 zlib_inflateEnd(&workspace->strm);
303 if (data_in) 300 if (data_in)
304 kunmap(pages_in[page_in_index]); 301 kunmap(pages_in[page_in_index]);
305 return ret; 302 return ret;
@@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
317 unsigned long total_out = 0; 314 unsigned long total_out = 0;
318 char *kaddr; 315 char *kaddr;
319 316
320 workspace->inf_strm.next_in = data_in; 317 workspace->strm.next_in = data_in;
321 workspace->inf_strm.avail_in = srclen; 318 workspace->strm.avail_in = srclen;
322 workspace->inf_strm.total_in = 0; 319 workspace->strm.total_in = 0;
323 320
324 workspace->inf_strm.next_out = workspace->buf; 321 workspace->strm.next_out = workspace->buf;
325 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 322 workspace->strm.avail_out = PAGE_CACHE_SIZE;
326 workspace->inf_strm.total_out = 0; 323 workspace->strm.total_out = 0;
327 /* If it's deflate, and it's got no preset dictionary, then 324 /* If it's deflate, and it's got no preset dictionary, then
328 we can tell zlib to skip the adler32 check. */ 325 we can tell zlib to skip the adler32 check. */
329 if (srclen > 2 && !(data_in[1] & PRESET_DICT) && 326 if (srclen > 2 && !(data_in[1] & PRESET_DICT) &&
@@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
331 !(((data_in[0]<<8) + data_in[1]) % 31)) { 328 !(((data_in[0]<<8) + data_in[1]) % 31)) {
332 329
333 wbits = -((data_in[0] >> 4) + 8); 330 wbits = -((data_in[0] >> 4) + 8);
334 workspace->inf_strm.next_in += 2; 331 workspace->strm.next_in += 2;
335 workspace->inf_strm.avail_in -= 2; 332 workspace->strm.avail_in -= 2;
336 } 333 }
337 334
338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 335 if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) {
339 printk(KERN_WARNING "BTRFS: inflateInit failed\n"); 336 printk(KERN_WARNING "BTRFS: inflateInit failed\n");
340 return -EIO; 337 return -EIO;
341 } 338 }
@@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
346 unsigned long bytes; 343 unsigned long bytes;
347 unsigned long pg_offset = 0; 344 unsigned long pg_offset = 0;
348 345
349 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 346 ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
350 if (ret != Z_OK && ret != Z_STREAM_END) 347 if (ret != Z_OK && ret != Z_STREAM_END)
351 break; 348 break;
352 349
353 buf_start = total_out; 350 buf_start = total_out;
354 total_out = workspace->inf_strm.total_out; 351 total_out = workspace->strm.total_out;
355 352
356 if (total_out == buf_start) { 353 if (total_out == buf_start) {
357 ret = -EIO; 354 ret = -EIO;
@@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
377 pg_offset += bytes; 374 pg_offset += bytes;
378 bytes_left -= bytes; 375 bytes_left -= bytes;
379next: 376next:
380 workspace->inf_strm.next_out = workspace->buf; 377 workspace->strm.next_out = workspace->buf;
381 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 378 workspace->strm.avail_out = PAGE_CACHE_SIZE;
382 } 379 }
383 380
384 if (ret != Z_STREAM_END && bytes_left != 0) 381 if (ret != Z_STREAM_END && bytes_left != 0)
@@ -386,7 +383,7 @@ next:
386 else 383 else
387 ret = 0; 384 ret = 0;
388 385
389 zlib_inflateEnd(&workspace->inf_strm); 386 zlib_inflateEnd(&workspace->strm);
390 return ret; 387 return ret;
391} 388}
392 389
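The decompress paths above peek at the two header bytes and, when the stream is plain zlib-wrapped deflate with no preset dictionary, re-initialize with negative window bits so inflate skips the wrapper and its adler32 check. Userspace zlib honors the same convention; a self-contained round trip (assumes zlib is installed, link with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	const char msg[] = "hello raw deflate";
	unsigned char comp[256], out[256];
	z_stream d = {0}, i = {0};
	uLong clen;

	/* produce a raw deflate stream (no zlib header/trailer) */
	if (deflateInit2(&d, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
			 -MAX_WBITS, 8, Z_DEFAULT_STRATEGY) != Z_OK)
		return 1;
	d.next_in = (unsigned char *)msg;
	d.avail_in = sizeof(msg);
	d.next_out = comp;
	d.avail_out = sizeof(comp);
	if (deflate(&d, Z_FINISH) != Z_STREAM_END)
		return 1;
	clen = d.total_out;
	deflateEnd(&d);

	/* inflate it back; negative windowBits again means raw deflate */
	if (inflateInit2(&i, -MAX_WBITS) != Z_OK)
		return 1;
	i.next_in = comp;
	i.avail_in = clen;
	i.next_out = out;
	i.avail_out = sizeof(out);
	if (inflate(&i, Z_FINISH) != Z_STREAM_END)
		return 1;
	inflateEnd(&i);

	printf("round trip: %s\n", out);
	return 0;
}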
diff --git a/fs/buffer.c b/fs/buffer.c
index eba6e4f621ce..9614adc7e754 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -61,16 +61,9 @@ inline void touch_buffer(struct buffer_head *bh)
61} 61}
62EXPORT_SYMBOL(touch_buffer); 62EXPORT_SYMBOL(touch_buffer);
63 63
64static int sleep_on_buffer(void *word)
65{
66 io_schedule();
67 return 0;
68}
69
70void __lock_buffer(struct buffer_head *bh) 64void __lock_buffer(struct buffer_head *bh)
71{ 65{
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer, 66 wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
73 TASK_UNINTERRUPTIBLE);
74} 67}
75EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
76 69
@@ -123,7 +116,7 @@ EXPORT_SYMBOL(buffer_check_dirty_writeback);
123 */ 116 */
124void __wait_on_buffer(struct buffer_head * bh) 117void __wait_on_buffer(struct buffer_head * bh)
125{ 118{
126 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE); 119 wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
127} 120}
128EXPORT_SYMBOL(__wait_on_buffer); 121EXPORT_SYMBOL(__wait_on_buffer);
129 122
@@ -1029,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1029 bh = page_buffers(page); 1022 bh = page_buffers(page);
1030 if (bh->b_size == size) { 1023 if (bh->b_size == size) {
1031 end_block = init_page_buffers(page, bdev, 1024 end_block = init_page_buffers(page, bdev,
1032 index << sizebits, size); 1025 (sector_t)index << sizebits,
1026 size);
1033 goto done; 1027 goto done;
1034 } 1028 }
1035 if (!try_to_free_buffers(page)) 1029 if (!try_to_free_buffers(page))
@@ -1050,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
1050 */ 1044 */
1051 spin_lock(&inode->i_mapping->private_lock); 1045 spin_lock(&inode->i_mapping->private_lock);
1052 link_dev_buffers(page, bh); 1046 link_dev_buffers(page, bh);
1053 end_block = init_page_buffers(page, bdev, index << sizebits, size); 1047 end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1048 size);
1054 spin_unlock(&inode->i_mapping->private_lock); 1049 spin_unlock(&inode->i_mapping->private_lock);
1055done: 1050done:
1056 ret = (block < end_block) ? 1 : -ENXIO; 1051 ret = (block < end_block) ? 1 : -ENXIO;
@@ -1258,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh)
1258 * a local interrupt disable for that. 1253 * a local interrupt disable for that.
1259 */ 1254 */
1260 1255
1261#define BH_LRU_SIZE 8 1256#define BH_LRU_SIZE 16
1262 1257
1263struct bh_lru { 1258struct bh_lru {
1264 struct buffer_head *bhs[BH_LRU_SIZE]; 1259 struct buffer_head *bhs[BH_LRU_SIZE];
@@ -1336,8 +1331,8 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1336 for (i = 0; i < BH_LRU_SIZE; i++) { 1331 for (i = 0; i < BH_LRU_SIZE; i++) {
1337 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); 1332 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1338 1333
1339 if (bh && bh->b_bdev == bdev && 1334 if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1340 bh->b_blocknr == block && bh->b_size == size) { 1335 bh->b_size == size) {
1341 if (i) { 1336 if (i) {
1342 while (i) { 1337 while (i) {
1343 __this_cpu_write(bh_lrus.bhs[i], 1338 __this_cpu_write(bh_lrus.bhs[i],
@@ -2323,6 +2318,11 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
2323 err = 0; 2318 err = 0;
2324 2319
2325 balance_dirty_pages_ratelimited(mapping); 2320 balance_dirty_pages_ratelimited(mapping);
2321
2322 if (unlikely(fatal_signal_pending(current))) {
2323 err = -EINTR;
2324 goto out;
2325 }
2326 } 2326 }
2327 2327
2328 /* page covers the boundary, find the boundary offset */ 2328 /* page covers the boundary, find the boundary offset */
@@ -2961,7 +2961,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2961 2961
2962/* 2962/*
2963 * This allows us to do IO even on the odd last sectors 2963 * This allows us to do IO even on the odd last sectors
2964 * of a device, even if the bh block size is some multiple 2964 * of a device, even if the block size is some multiple
2965 * of the physical sector size. 2965 * of the physical sector size.
2966 * 2966 *
2967 * We'll just truncate the bio to the size of the device, 2967 * We'll just truncate the bio to the size of the device,
@@ -2971,10 +2971,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
2971 * errors, this only handles the "we need to be able to 2971 * errors, this only handles the "we need to be able to
2972 * do IO at the final sector" case. 2972 * do IO at the final sector" case.
2973 */ 2973 */
2974static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) 2974void guard_bio_eod(int rw, struct bio *bio)
2975{ 2975{
2976 sector_t maxsector; 2976 sector_t maxsector;
2977 unsigned bytes; 2977 struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2978 unsigned truncated_bytes;
2978 2979
2979 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; 2980 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2980 if (!maxsector) 2981 if (!maxsector)
@@ -2989,23 +2990,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
2989 return; 2990 return;
2990 2991
2991 maxsector -= bio->bi_iter.bi_sector; 2992 maxsector -= bio->bi_iter.bi_sector;
2992 bytes = bio->bi_iter.bi_size; 2993 if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2993 if (likely((bytes >> 9) <= maxsector))
2994 return; 2994 return;
2995 2995
2996 /* Uhhuh. We've got a bh that straddles the device size! */ 2996 /* Uhhuh. We've got a bio that straddles the device size! */
2997 bytes = maxsector << 9; 2997 truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2998 2998
2999 /* Truncate the bio.. */ 2999 /* Truncate the bio.. */
3000 bio->bi_iter.bi_size = bytes; 3000 bio->bi_iter.bi_size -= truncated_bytes;
3001 bio->bi_io_vec[0].bv_len = bytes; 3001 bvec->bv_len -= truncated_bytes;
3002 3002
3003 /* ..and clear the end of the buffer for reads */ 3003 /* ..and clear the end of the buffer for reads */
3004 if ((rw & RW_MASK) == READ) { 3004 if ((rw & RW_MASK) == READ) {
3005 void *kaddr = kmap_atomic(bh->b_page); 3005 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
3006 memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); 3006 truncated_bytes);
3007 kunmap_atomic(kaddr);
3008 flush_dcache_page(bh->b_page);
3009 } 3007 }
3010} 3008}
3011 3009
@@ -3046,7 +3044,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
3046 bio->bi_flags |= bio_flags; 3044 bio->bi_flags |= bio_flags;
3047 3045
3048 /* Take care of bh's that straddle the end of the device */ 3046 /* Take care of bh's that straddle the end of the device */
3049 guard_bh_eod(rw, bio, bh); 3047 guard_bio_eod(rw, bio);
3050 3048
3051 if (buffer_meta(bh)) 3049 if (buffer_meta(bh))
3052 rw |= REQ_META; 3050 rw |= REQ_META;
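
The buffer.c hunks above rewrite guard_bh_eod() into guard_bio_eod(): the clamp now works on the bio's final bvec rather than a buffer_head, and computes how many bytes of the request hang past the end of the device (zero-filling that tail on reads). A minimal standalone sketch of the clamp arithmetic — the geometry in the example is illustrative, not from the patch:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the end-of-device clamp in guard_bio_eod(): sizes in
     * bytes, sectors of 512 bytes. Returns how many bytes must be cut
     * from the request (and zero-filled on reads). */
    static uint32_t eod_truncated_bytes(uint64_t dev_bytes,  /* bdev i_size */
                                        uint64_t req_sector, /* bi_sector */
                                        uint32_t req_bytes)  /* bi_size */
    {
        uint64_t maxsector = dev_bytes >> 9;   /* device size in sectors */

        if (!maxsector || req_sector >= maxsector)
            return 0;                          /* nothing sane to clamp */

        maxsector -= req_sector;               /* sectors left on device */
        if ((req_bytes >> 9) <= maxsector)
            return 0;                          /* request fits entirely */

        return req_bytes - (uint32_t)(maxsector << 9);
    }

    int main(void)
    {
        /* A 4 KiB I/O starting one sector before the end of a 1 MiB
         * device: 7 of its 8 sectors must be truncated -> 3584 bytes. */
        printf("%u\n", eod_truncated_bytes(1 << 20, 2047, 4096));
        return 0;
    }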
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index d749731dc0ee..fbb08e97438d 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -50,18 +50,18 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args)
50 cache->brun_percent < 100); 50 cache->brun_percent < 100);
51 51
52 if (*args) { 52 if (*args) {
53 pr_err("'bind' command doesn't take an argument"); 53 pr_err("'bind' command doesn't take an argument\n");
54 return -EINVAL; 54 return -EINVAL;
55 } 55 }
56 56
57 if (!cache->rootdirname) { 57 if (!cache->rootdirname) {
58 pr_err("No cache directory specified"); 58 pr_err("No cache directory specified\n");
59 return -EINVAL; 59 return -EINVAL;
60 } 60 }
61 61
62 /* don't permit already bound caches to be re-bound */ 62 /* don't permit already bound caches to be re-bound */
63 if (test_bit(CACHEFILES_READY, &cache->flags)) { 63 if (test_bit(CACHEFILES_READY, &cache->flags)) {
64 pr_err("Cache already bound"); 64 pr_err("Cache already bound\n");
65 return -EBUSY; 65 return -EBUSY;
66 } 66 }
67 67
@@ -248,7 +248,7 @@ error_open_root:
248 kmem_cache_free(cachefiles_object_jar, fsdef); 248 kmem_cache_free(cachefiles_object_jar, fsdef);
249error_root_object: 249error_root_object:
250 cachefiles_end_secure(cache, saved_cred); 250 cachefiles_end_secure(cache, saved_cred);
251 pr_err("Failed to register: %d", ret); 251 pr_err("Failed to register: %d\n", ret);
252 return ret; 252 return ret;
253} 253}
254 254
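
These hunks (and the matching ones in daemon.c, main.c, namei.c and xattr.c below) only append the "\n" terminator that the cachefiles pr_err() messages were missing; without it, the next printk can be glued onto the same console line. A userspace analogue of the convention — the "CacheFiles: " pr_fmt prefix shown is an assumption about the file-local definition:

    #include <stdio.h>

    /* Kernel style: every pr_err() format string ends in '\n', and a
     * file-local pr_fmt prefix is prepended automatically; faked here. */
    #define pr_fmt(fmt) "CacheFiles: " fmt        /* assumed local prefix */
    #define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

    int main(void)
    {
        pr_err("Cache already bound\n");          /* one complete line */
        pr_err("Failed to register: %d\n", -16);  /* -EBUSY */
        return 0;
    }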
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index b078d3081d6c..ce1b115dcc28 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -315,7 +315,7 @@ static unsigned int cachefiles_daemon_poll(struct file *file,
315static int cachefiles_daemon_range_error(struct cachefiles_cache *cache, 315static int cachefiles_daemon_range_error(struct cachefiles_cache *cache,
316 char *args) 316 char *args)
317{ 317{
318 pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%"); 318 pr_err("Free space limits must be in range 0%%<=stop<cull<run<100%%\n");
319 319
320 return -EINVAL; 320 return -EINVAL;
321} 321}
@@ -475,12 +475,12 @@ static int cachefiles_daemon_dir(struct cachefiles_cache *cache, char *args)
475 _enter(",%s", args); 475 _enter(",%s", args);
476 476
477 if (!*args) { 477 if (!*args) {
478 pr_err("Empty directory specified"); 478 pr_err("Empty directory specified\n");
479 return -EINVAL; 479 return -EINVAL;
480 } 480 }
481 481
482 if (cache->rootdirname) { 482 if (cache->rootdirname) {
483 pr_err("Second cache directory specified"); 483 pr_err("Second cache directory specified\n");
484 return -EEXIST; 484 return -EEXIST;
485 } 485 }
486 486
@@ -503,12 +503,12 @@ static int cachefiles_daemon_secctx(struct cachefiles_cache *cache, char *args)
503 _enter(",%s", args); 503 _enter(",%s", args);
504 504
505 if (!*args) { 505 if (!*args) {
506 pr_err("Empty security context specified"); 506 pr_err("Empty security context specified\n");
507 return -EINVAL; 507 return -EINVAL;
508 } 508 }
509 509
510 if (cache->secctx) { 510 if (cache->secctx) {
511 pr_err("Second security context specified"); 511 pr_err("Second security context specified\n");
512 return -EINVAL; 512 return -EINVAL;
513 } 513 }
514 514
@@ -531,7 +531,7 @@ static int cachefiles_daemon_tag(struct cachefiles_cache *cache, char *args)
531 _enter(",%s", args); 531 _enter(",%s", args);
532 532
533 if (!*args) { 533 if (!*args) {
534 pr_err("Empty tag specified"); 534 pr_err("Empty tag specified\n");
535 return -EINVAL; 535 return -EINVAL;
536 } 536 }
537 537
@@ -562,12 +562,12 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
562 goto inval; 562 goto inval;
563 563
564 if (!test_bit(CACHEFILES_READY, &cache->flags)) { 564 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
565 pr_err("cull applied to unready cache"); 565 pr_err("cull applied to unready cache\n");
566 return -EIO; 566 return -EIO;
567 } 567 }
568 568
569 if (test_bit(CACHEFILES_DEAD, &cache->flags)) { 569 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
570 pr_err("cull applied to dead cache"); 570 pr_err("cull applied to dead cache\n");
571 return -EIO; 571 return -EIO;
572 } 572 }
573 573
@@ -587,11 +587,11 @@ static int cachefiles_daemon_cull(struct cachefiles_cache *cache, char *args)
587 587
588notdir: 588notdir:
589 path_put(&path); 589 path_put(&path);
590 pr_err("cull command requires dirfd to be a directory"); 590 pr_err("cull command requires dirfd to be a directory\n");
591 return -ENOTDIR; 591 return -ENOTDIR;
592 592
593inval: 593inval:
594 pr_err("cull command requires dirfd and filename"); 594 pr_err("cull command requires dirfd and filename\n");
595 return -EINVAL; 595 return -EINVAL;
596} 596}
597 597
@@ -614,7 +614,7 @@ static int cachefiles_daemon_debug(struct cachefiles_cache *cache, char *args)
614 return 0; 614 return 0;
615 615
616inval: 616inval:
617 pr_err("debug command requires mask"); 617 pr_err("debug command requires mask\n");
618 return -EINVAL; 618 return -EINVAL;
619} 619}
620 620
@@ -634,12 +634,12 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
634 goto inval; 634 goto inval;
635 635
636 if (!test_bit(CACHEFILES_READY, &cache->flags)) { 636 if (!test_bit(CACHEFILES_READY, &cache->flags)) {
637 pr_err("inuse applied to unready cache"); 637 pr_err("inuse applied to unready cache\n");
638 return -EIO; 638 return -EIO;
639 } 639 }
640 640
641 if (test_bit(CACHEFILES_DEAD, &cache->flags)) { 641 if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
642 pr_err("inuse applied to dead cache"); 642 pr_err("inuse applied to dead cache\n");
643 return -EIO; 643 return -EIO;
644 } 644 }
645 645
@@ -659,11 +659,11 @@ static int cachefiles_daemon_inuse(struct cachefiles_cache *cache, char *args)
659 659
660notdir: 660notdir:
661 path_put(&path); 661 path_put(&path);
662 pr_err("inuse command requires dirfd to be a directory"); 662 pr_err("inuse command requires dirfd to be a directory\n");
663 return -ENOTDIR; 663 return -ENOTDIR;
664 664
665inval: 665inval:
666 pr_err("inuse command requires dirfd and filename"); 666 pr_err("inuse command requires dirfd and filename\n");
667 return -EINVAL; 667 return -EINVAL;
668} 668}
669 669
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 584743d456c3..1c7293c3a93a 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -268,20 +268,27 @@ static void cachefiles_drop_object(struct fscache_object *_object)
268 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); 268 ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000);
269#endif 269#endif
270 270
271 /* delete retired objects */ 271 /* We need to tidy the object up if we did in fact manage to open it.
272 if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && 272 * It's possible for us to get here before the object is fully
273 _object != cache->cache.fsdef 273 * initialised if the parent goes away or the object gets retired
274 ) { 274 * before we set it up.
275 _debug("- retire object OBJ%x", object->fscache.debug_id); 275 */
276 cachefiles_begin_secure(cache, &saved_cred); 276 if (object->dentry) {
277 cachefiles_delete_object(cache, object); 277 /* delete retired objects */
278 cachefiles_end_secure(cache, saved_cred); 278 if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) &&
279 } 279 _object != cache->cache.fsdef
280 ) {
281 _debug("- retire object OBJ%x", object->fscache.debug_id);
282 cachefiles_begin_secure(cache, &saved_cred);
283 cachefiles_delete_object(cache, object);
284 cachefiles_end_secure(cache, saved_cred);
285 }
280 286
281 /* close the filesystem stuff attached to the object */ 287 /* close the filesystem stuff attached to the object */
282 if (object->backer != object->dentry) 288 if (object->backer != object->dentry)
283 dput(object->backer); 289 dput(object->backer);
284 object->backer = NULL; 290 object->backer = NULL;
291 }
285 292
286 /* note that the object is now inactive */ 293 /* note that the object is now inactive */
287 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { 294 if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 3d50998abf57..8c52472d2efa 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -255,7 +255,7 @@ extern int cachefiles_remove_object_xattr(struct cachefiles_cache *cache,
255 255
256#define cachefiles_io_error(___cache, FMT, ...) \ 256#define cachefiles_io_error(___cache, FMT, ...) \
257do { \ 257do { \
258 pr_err("I/O Error: " FMT, ##__VA_ARGS__); \ 258 pr_err("I/O Error: " FMT"\n", ##__VA_ARGS__); \
259 fscache_io_error(&(___cache)->cache); \ 259 fscache_io_error(&(___cache)->cache); \
260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \ 260 set_bit(CACHEFILES_DEAD, &(___cache)->flags); \
261} while (0) 261} while (0)
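
Beyond gaining its "\n", cachefiles_io_error() is the classic multi-statement macro wrapped in do { } while (0) so that it behaves as a single statement after if/else. A userspace analogue — io_error() and cache_dead are illustrative stand-ins for the macro and the CACHEFILES_DEAD bit:

    #include <stdio.h>

    static int cache_dead;

    #define io_error(fmt, ...)                                          \
        do {                                                            \
            fprintf(stderr, "I/O Error: " fmt "\n", ##__VA_ARGS__);     \
            cache_dead = 1;             /* stands in for set_bit() */   \
        } while (0)

    int main(void)
    {
        if (cache_dead == 0)
            io_error("backing store went away (%d)", -5);   /* -EIO */
        else
            puts("already dead");  /* do/while(0) keeps if/else intact */
        return 0;
    }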
diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c
index 180edfb45f66..711f13d8c2de 100644
--- a/fs/cachefiles/main.c
+++ b/fs/cachefiles/main.c
@@ -84,7 +84,7 @@ error_proc:
84error_object_jar: 84error_object_jar:
85 misc_deregister(&cachefiles_dev); 85 misc_deregister(&cachefiles_dev);
86error_dev: 86error_dev:
87 pr_err("failed to register: %d", ret); 87 pr_err("failed to register: %d\n", ret);
88 return ret; 88 return ret;
89} 89}
90 90
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..e12f189d539b 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -189,7 +189,7 @@ try_again:
189 /* an old object from a previous incarnation is hogging the slot - we 189 /* an old object from a previous incarnation is hogging the slot - we
190 * need to wait for it to be destroyed */ 190 * need to wait for it to be destroyed */
191wait_for_old_object: 191wait_for_old_object:
192 if (fscache_object_is_live(&object->fscache)) { 192 if (fscache_object_is_live(&xobject->fscache)) {
193 pr_err("\n"); 193 pr_err("\n");
194 pr_err("Error: Unexpected object collision\n"); 194 pr_err("Error: Unexpected object collision\n");
195 cachefiles_printk_object(object, xobject); 195 cachefiles_printk_object(object, xobject);
@@ -543,7 +543,7 @@ lookup_again:
543 next, next->d_inode, next->d_inode->i_ino); 543 next, next->d_inode, next->d_inode->i_ino);
544 544
545 } else if (!S_ISDIR(next->d_inode->i_mode)) { 545 } else if (!S_ISDIR(next->d_inode->i_mode)) {
546 pr_err("inode %lu is not a directory", 546 pr_err("inode %lu is not a directory\n",
547 next->d_inode->i_ino); 547 next->d_inode->i_ino);
548 ret = -ENOBUFS; 548 ret = -ENOBUFS;
549 goto error; 549 goto error;
@@ -574,7 +574,7 @@ lookup_again:
574 } else if (!S_ISDIR(next->d_inode->i_mode) && 574 } else if (!S_ISDIR(next->d_inode->i_mode) &&
575 !S_ISREG(next->d_inode->i_mode) 575 !S_ISREG(next->d_inode->i_mode)
576 ) { 576 ) {
577 pr_err("inode %lu is not a file or directory", 577 pr_err("inode %lu is not a file or directory\n",
578 next->d_inode->i_ino); 578 next->d_inode->i_ino);
579 ret = -ENOBUFS; 579 ret = -ENOBUFS;
580 goto error; 580 goto error;
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
768 ASSERT(subdir->d_inode); 768 ASSERT(subdir->d_inode);
769 769
770 if (!S_ISDIR(subdir->d_inode->i_mode)) { 770 if (!S_ISDIR(subdir->d_inode->i_mode)) {
771 pr_err("%s is not a directory", dirname); 771 pr_err("%s is not a directory\n", dirname);
772 ret = -EIO; 772 ret = -EIO;
773 goto check_error; 773 goto check_error;
774 } 774 }
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
779 !subdir->d_inode->i_op->lookup || 779 !subdir->d_inode->i_op->lookup ||
780 !subdir->d_inode->i_op->mkdir || 780 !subdir->d_inode->i_op->mkdir ||
781 !subdir->d_inode->i_op->create || 781 !subdir->d_inode->i_op->create ||
782 !subdir->d_inode->i_op->rename || 782 (!subdir->d_inode->i_op->rename &&
783 !subdir->d_inode->i_op->rename2) ||
783 !subdir->d_inode->i_op->rmdir || 784 !subdir->d_inode->i_op->rmdir ||
784 !subdir->d_inode->i_op->unlink) 785 !subdir->d_inode->i_op->unlink)
785 goto check_error; 786 goto check_error;
@@ -795,13 +796,13 @@ check_error:
795mkdir_error: 796mkdir_error:
796 mutex_unlock(&dir->d_inode->i_mutex); 797 mutex_unlock(&dir->d_inode->i_mutex);
797 dput(subdir); 798 dput(subdir);
798 pr_err("mkdir %s failed with error %d", dirname, ret); 799 pr_err("mkdir %s failed with error %d\n", dirname, ret);
799 return ERR_PTR(ret); 800 return ERR_PTR(ret);
800 801
801lookup_error: 802lookup_error:
802 mutex_unlock(&dir->d_inode->i_mutex); 803 mutex_unlock(&dir->d_inode->i_mutex);
803 ret = PTR_ERR(subdir); 804 ret = PTR_ERR(subdir);
804 pr_err("Lookup %s failed with error %d", dirname, ret); 805 pr_err("Lookup %s failed with error %d\n", dirname, ret);
805 return ERR_PTR(ret); 806 return ERR_PTR(ret);
806 807
807nomem_d_alloc: 808nomem_d_alloc:
@@ -891,7 +892,7 @@ lookup_error:
891 if (ret == -EIO) { 892 if (ret == -EIO) {
892 cachefiles_io_error(cache, "Lookup failed"); 893 cachefiles_io_error(cache, "Lookup failed");
893 } else if (ret != -ENOMEM) { 894 } else if (ret != -ENOMEM) {
894 pr_err("Internal error: %d", ret); 895 pr_err("Internal error: %d\n", ret);
895 ret = -EIO; 896 ret = -EIO;
896 } 897 }
897 898
@@ -950,7 +951,7 @@ error:
950 } 951 }
951 952
952 if (ret != -ENOMEM) { 953 if (ret != -ENOMEM) {
953 pr_err("Internal error: %d", ret); 954 pr_err("Internal error: %d\n", ret);
954 ret = -EIO; 955 ret = -EIO;
955 } 956 }
956 957
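
Two fixes in namei.c: the collision wait now tests the liveness of the *old* occupant (xobject) instead of the object being installed, and cachefiles_get_directory() accepts a backing filesystem that implements either ->rename or the newer ->rename2. The second predicate in isolation, with stand-in types and names:

    #include <stdio.h>

    /* The directory-capability check now requires at least one of the
     * two rename operations rather than ->rename specifically. */
    struct iops { int (*rename)(void); int (*rename2)(void); };

    static int usable(const struct iops *op)
    {
        return op->rename || op->rename2;
    }

    static int do_rename(void) { return 0; }

    int main(void)
    {
        struct iops old_fs = { .rename  = do_rename }; /* classic fs  */
        struct iops new_fs = { .rename2 = do_rename }; /* rename2-only */
        struct iops bad_fs = { 0 };                    /* rejected     */

        printf("%d %d %d\n", usable(&old_fs), usable(&new_fs),
               usable(&bad_fs));
        return 0;
    }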
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..616db0e77b44 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
151 struct cachefiles_one_read *monitor; 151 struct cachefiles_one_read *monitor;
152 struct cachefiles_object *object; 152 struct cachefiles_object *object;
153 struct fscache_retrieval *op; 153 struct fscache_retrieval *op;
154 struct pagevec pagevec;
155 int error, max; 154 int error, max;
156 155
157 op = container_of(_op, struct fscache_retrieval, op); 156 op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
160 159
161 _enter("{ino=%lu}", object->backer->d_inode->i_ino); 160 _enter("{ino=%lu}", object->backer->d_inode->i_ino);
162 161
163 pagevec_init(&pagevec, 0);
164
165 max = 8; 162 max = 8;
166 spin_lock_irq(&object->work_lock); 163 spin_lock_irq(&object->work_lock);
167 164
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
396{ 393{
397 struct cachefiles_object *object; 394 struct cachefiles_object *object;
398 struct cachefiles_cache *cache; 395 struct cachefiles_cache *cache;
399 struct pagevec pagevec;
400 struct inode *inode; 396 struct inode *inode;
401 sector_t block0, block; 397 sector_t block0, block;
402 unsigned shift; 398 unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
427 op->op.flags |= FSCACHE_OP_ASYNC; 423 op->op.flags |= FSCACHE_OP_ASYNC;
428 op->op.processor = cachefiles_read_copier; 424 op->op.processor = cachefiles_read_copier;
429 425
430 pagevec_init(&pagevec, 0);
431
432 /* we assume the absence or presence of the first block is a good 426 /* we assume the absence or presence of the first block is a good
433 * enough indication for the page as a whole 427 * enough indication for the page as a whole
434 * - TODO: don't use bmap() for this as it is _not_ actually good 428 * - TODO: don't use bmap() for this as it is _not_ actually good
@@ -886,7 +880,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
886{ 880{
887 struct cachefiles_object *object; 881 struct cachefiles_object *object;
888 struct cachefiles_cache *cache; 882 struct cachefiles_cache *cache;
889 mm_segment_t old_fs;
890 struct file *file; 883 struct file *file;
891 struct path path; 884 struct path path;
892 loff_t pos, eof; 885 loff_t pos, eof;
@@ -920,36 +913,27 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
920 if (IS_ERR(file)) { 913 if (IS_ERR(file)) {
921 ret = PTR_ERR(file); 914 ret = PTR_ERR(file);
922 } else { 915 } else {
923 ret = -EIO; 916 pos = (loff_t) page->index << PAGE_SHIFT;
924 if (file->f_op->write) { 917
925 pos = (loff_t) page->index << PAGE_SHIFT; 918 /* we mustn't write more data than we have, so we have
926 919 * to beware of a partial page at EOF */
927 /* we mustn't write more data than we have, so we have 920 eof = object->fscache.store_limit_l;
928 * to beware of a partial page at EOF */ 921 len = PAGE_SIZE;
929 eof = object->fscache.store_limit_l; 922 if (eof & ~PAGE_MASK) {
930 len = PAGE_SIZE; 923 ASSERTCMP(pos, <, eof);
931 if (eof & ~PAGE_MASK) { 924 if (eof - pos < PAGE_SIZE) {
932 ASSERTCMP(pos, <, eof); 925 _debug("cut short %llx to %llx",
933 if (eof - pos < PAGE_SIZE) { 926 pos, eof);
934 _debug("cut short %llx to %llx", 927 len = eof - pos;
935 pos, eof); 928 ASSERTCMP(pos + len, ==, eof);
936 len = eof - pos;
937 ASSERTCMP(pos + len, ==, eof);
938 }
939 } 929 }
940
941 data = kmap(page);
942 file_start_write(file);
943 old_fs = get_fs();
944 set_fs(KERNEL_DS);
945 ret = file->f_op->write(
946 file, (const void __user *) data, len, &pos);
947 set_fs(old_fs);
948 kunmap(page);
949 file_end_write(file);
950 if (ret != len)
951 ret = -EIO;
952 } 930 }
931
932 data = kmap(page);
933 ret = __kernel_write(file, data, len, &pos);
934 kunmap(page);
935 if (ret != len)
936 ret = -EIO;
953 fput(file); 937 fput(file);
954 } 938 }
955 939
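
cachefiles_write_page() drops the set_fs(KERNEL_DS)/f_op->write() dance in favour of __kernel_write(), which takes a kernel buffer directly (and does the file_start_write bracketing internally). The EOF clamp that survives the rewrite is shown standalone below; page size and store limit are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of the EOF clamp kept by cachefiles_write_page(): never
     * write past the store limit, so the final page may be cut short. */
    static uint32_t bytes_to_write(uint64_t pos, uint64_t eof,
                                   uint32_t page_size)
    {
        uint32_t len = page_size;

        if (eof & (page_size - 1)) {       /* limit not page aligned */
            if (eof - pos < page_size)
                len = (uint32_t)(eof - pos);   /* partial page at EOF */
        }
        return len;
    }

    int main(void)
    {
        /* Store limit of 10000 bytes, writing the page at offset 8192:
         * only 1808 bytes of it are valid. */
        printf("%u\n", bytes_to_write(8192, 10000, 4096));
        return 0;
    }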
diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c
index 1ad51ffbb275..acbc1f094fb1 100644
--- a/fs/cachefiles/xattr.c
+++ b/fs/cachefiles/xattr.c
@@ -51,7 +51,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
51 } 51 }
52 52
53 if (ret != -EEXIST) { 53 if (ret != -EEXIST) {
54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)", 54 pr_err("Can't set xattr on %*.*s [%lu] (err %d)\n",
55 dentry->d_name.len, dentry->d_name.len, 55 dentry->d_name.len, dentry->d_name.len,
56 dentry->d_name.name, dentry->d_inode->i_ino, 56 dentry->d_name.name, dentry->d_inode->i_ino,
57 -ret); 57 -ret);
@@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object)
64 if (ret == -ERANGE) 64 if (ret == -ERANGE)
65 goto bad_type_length; 65 goto bad_type_length;
66 66
67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)", 67 pr_err("Can't read xattr on %*.*s [%lu] (err %d)\n",
68 dentry->d_name.len, dentry->d_name.len, 68 dentry->d_name.len, dentry->d_name.len,
69 dentry->d_name.name, dentry->d_inode->i_ino, 69 dentry->d_name.name, dentry->d_inode->i_ino,
70 -ret); 70 -ret);
@@ -85,14 +85,14 @@ error:
85 return ret; 85 return ret;
86 86
87bad_type_length: 87bad_type_length:
88 pr_err("Cache object %lu type xattr length incorrect", 88 pr_err("Cache object %lu type xattr length incorrect\n",
89 dentry->d_inode->i_ino); 89 dentry->d_inode->i_ino);
90 ret = -EIO; 90 ret = -EIO;
91 goto error; 91 goto error;
92 92
93bad_type: 93bad_type:
94 xtype[2] = 0; 94 xtype[2] = 0;
95 pr_err("Cache object %*.*s [%lu] type %s not %s", 95 pr_err("Cache object %*.*s [%lu] type %s not %s\n",
96 dentry->d_name.len, dentry->d_name.len, 96 dentry->d_name.len, dentry->d_name.len,
97 dentry->d_name.name, dentry->d_inode->i_ino, 97 dentry->d_name.name, dentry->d_inode->i_ino,
98 xtype, type); 98 xtype, type);
@@ -293,7 +293,7 @@ error:
293 return ret; 293 return ret;
294 294
295bad_type_length: 295bad_type_length:
296 pr_err("Cache object %lu xattr length incorrect", 296 pr_err("Cache object %lu xattr length incorrect\n",
297 dentry->d_inode->i_ino); 297 dentry->d_inode->i_ino);
298 ret = -EIO; 298 ret = -EIO;
299 goto error; 299 goto error;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 469f2e8657e8..5bd853ba44ff 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -169,26 +169,109 @@ out:
169 return ret; 169 return ret;
170} 170}
171 171
172int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) 172int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
173 struct ceph_acls_info *info)
173{ 174{
174 struct posix_acl *default_acl, *acl; 175 struct posix_acl *acl, *default_acl;
175 int error; 176 size_t val_size1 = 0, val_size2 = 0;
177 struct ceph_pagelist *pagelist = NULL;
178 void *tmp_buf = NULL;
179 int err;
176 180
177 error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); 181 err = posix_acl_create(dir, mode, &default_acl, &acl);
178 if (error) 182 if (err)
179 return error; 183 return err;
184
185 if (acl) {
186 int ret = posix_acl_equiv_mode(acl, mode);
187 if (ret < 0)
188 goto out_err;
189 if (ret == 0) {
190 posix_acl_release(acl);
191 acl = NULL;
192 }
193 }
180 194
181 if (!default_acl && !acl) 195 if (!default_acl && !acl)
182 cache_no_acl(inode); 196 return 0;
197
198 if (acl)
199 val_size1 = posix_acl_xattr_size(acl->a_count);
200 if (default_acl)
201 val_size2 = posix_acl_xattr_size(default_acl->a_count);
202
203 err = -ENOMEM;
204 tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS);
205 if (!tmp_buf)
206 goto out_err;
207 pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS);
208 if (!pagelist)
209 goto out_err;
210 ceph_pagelist_init(pagelist);
211
212 err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
213 if (err)
214 goto out_err;
215
216 ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
183 217
184 if (default_acl) {
185 error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
186 posix_acl_release(default_acl);
187 }
188 if (acl) { 218 if (acl) {
189 if (!error) 219 size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
190 error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); 220 err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
191 posix_acl_release(acl); 221 if (err)
222 goto out_err;
223 ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
224 len);
225 err = posix_acl_to_xattr(&init_user_ns, acl,
226 tmp_buf, val_size1);
227 if (err < 0)
228 goto out_err;
229 ceph_pagelist_encode_32(pagelist, val_size1);
230 ceph_pagelist_append(pagelist, tmp_buf, val_size1);
192 } 231 }
193 return error; 232 if (default_acl) {
233 size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
234 err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
235 if (err)
236 goto out_err;
237 err = ceph_pagelist_encode_string(pagelist,
238 POSIX_ACL_XATTR_DEFAULT, len);
239 err = posix_acl_to_xattr(&init_user_ns, default_acl,
240 tmp_buf, val_size2);
241 if (err < 0)
242 goto out_err;
243 ceph_pagelist_encode_32(pagelist, val_size2);
244 ceph_pagelist_append(pagelist, tmp_buf, val_size2);
245 }
246
247 kfree(tmp_buf);
248
249 info->acl = acl;
250 info->default_acl = default_acl;
251 info->pagelist = pagelist;
252 return 0;
253
254out_err:
255 posix_acl_release(acl);
256 posix_acl_release(default_acl);
257 kfree(tmp_buf);
258 if (pagelist)
259 ceph_pagelist_release(pagelist);
260 return err;
261}
262
263void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info)
264{
265 if (!inode)
266 return;
267 ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl);
268 ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl);
269}
270
271void ceph_release_acls_info(struct ceph_acls_info *info)
272{
273 posix_acl_release(info->acl);
274 posix_acl_release(info->default_acl);
275 if (info->pagelist)
276 ceph_pagelist_release(info->pagelist);
194} 277}
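
ceph_pre_init_acls() packs the access and default ACLs into a pagelist as a u32 xattr count followed by name/value pairs, so the MDS can apply them atomically with the create instead of via follow-up setxattr calls. A standalone sketch of that layout — native byte order for brevity (the real pagelist encodes little-endian), and the blob contents are dummies:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Layout built in the pagelist:
     *   u32 xattr_count
     *   per xattr: u32 name_len, name bytes, u32 value_len, value bytes */
    static size_t put_u32(uint8_t *p, uint32_t v)
    {
        memcpy(p, &v, 4);
        return 4;
    }

    static size_t put_xattr(uint8_t *p, const char *name,
                            const void *val, uint32_t val_len)
    {
        size_t n = 0;
        uint32_t name_len = (uint32_t)strlen(name);

        n += put_u32(p + n, name_len);
        memcpy(p + n, name, name_len);       n += name_len;
        n += put_u32(p + n, val_len);
        memcpy(p + n, val, val_len);         n += val_len;
        return n;
    }

    int main(void)
    {
        uint8_t buf[256];
        uint8_t fake_acl[12] = {0};          /* stand-in xattr blob */
        size_t n = 0;

        n += put_u32(buf + n, 2);            /* access + default ACL */
        n += put_xattr(buf + n, "system.posix_acl_access", fake_acl, 12);
        n += put_xattr(buf + n, "system.posix_acl_default", fake_acl, 12);
        printf("encoded %zu bytes\n", n);
        return 0;
    }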
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 90b3954d48ed..18c06bbaf136 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1076,12 +1076,6 @@ retry_locked:
1076 /* past end of file? */ 1076 /* past end of file? */
1077 i_size = inode->i_size; /* caller holds i_mutex */ 1077 i_size = inode->i_size; /* caller holds i_mutex */
1078 1078
1079 if (i_size + len > inode->i_sb->s_maxbytes) {
1080 /* file is too big */
1081 r = -EINVAL;
1082 goto fail;
1083 }
1084
1085 if (page_off >= i_size || 1079 if (page_off >= i_size ||
1086 (pos_in_page == 0 && (pos+len) >= i_size && 1080 (pos_in_page == 0 && (pos+len) >= i_size &&
1087 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { 1081 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
@@ -1099,9 +1093,6 @@ retry_locked:
1099 if (r < 0) 1093 if (r < 0)
1100 goto fail_nosnap; 1094 goto fail_nosnap;
1101 goto retry_locked; 1095 goto retry_locked;
1102
1103fail:
1104 up_read(&mdsc->snap_rwsem);
1105fail_nosnap: 1096fail_nosnap:
1106 unlock_page(page); 1097 unlock_page(page);
1107 return r; 1098 return r;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 1fde164b74b5..659f2ea9e6f7 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2397,12 +2397,12 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2397 u64 max_size = le64_to_cpu(grant->max_size); 2397 u64 max_size = le64_to_cpu(grant->max_size);
2398 struct timespec mtime, atime, ctime; 2398 struct timespec mtime, atime, ctime;
2399 int check_caps = 0; 2399 int check_caps = 0;
2400 bool wake = 0; 2400 bool wake = false;
2401 bool writeback = 0; 2401 bool writeback = false;
2402 bool queue_trunc = 0; 2402 bool queue_trunc = false;
2403 bool queue_invalidate = 0; 2403 bool queue_invalidate = false;
2404 bool queue_revalidate = 0; 2404 bool queue_revalidate = false;
2405 bool deleted_inode = 0; 2405 bool deleted_inode = false;
2406 2406
2407 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2407 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2408 inode, cap, mds, seq, ceph_cap_string(newcaps)); 2408 inode, cap, mds, seq, ceph_cap_string(newcaps));
@@ -2437,7 +2437,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2437 /* there were locked pages.. invalidate later 2437 /* there were locked pages.. invalidate later
2438 in a separate thread. */ 2438 in a separate thread. */
2439 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { 2439 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2440 queue_invalidate = 1; 2440 queue_invalidate = true;
2441 ci->i_rdcache_revoking = ci->i_rdcache_gen; 2441 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2442 } 2442 }
2443 } 2443 }
@@ -2466,7 +2466,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2466 set_nlink(inode, le32_to_cpu(grant->nlink)); 2466 set_nlink(inode, le32_to_cpu(grant->nlink));
2467 if (inode->i_nlink == 0 && 2467 if (inode->i_nlink == 0 &&
2468 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) 2468 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
2469 deleted_inode = 1; 2469 deleted_inode = true;
2470 } 2470 }
2471 2471
2472 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { 2472 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
@@ -2487,7 +2487,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2487 /* Do we need to revalidate our fscache cookie. Don't bother on the 2487 /* Do we need to revalidate our fscache cookie. Don't bother on the
2488 * first cache cap as we already validate at cookie creation time. */ 2488 * first cache cap as we already validate at cookie creation time. */
2489 if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) 2489 if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1)
2490 queue_revalidate = 1; 2490 queue_revalidate = true;
2491 2491
2492 if (newcaps & CEPH_CAP_ANY_RD) { 2492 if (newcaps & CEPH_CAP_ANY_RD) {
2493 /* ctime/mtime/atime? */ 2493 /* ctime/mtime/atime? */
@@ -2516,7 +2516,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2516 ci->i_wanted_max_size = 0; /* reset */ 2516 ci->i_wanted_max_size = 0; /* reset */
2517 ci->i_requested_max_size = 0; 2517 ci->i_requested_max_size = 0;
2518 } 2518 }
2519 wake = 1; 2519 wake = true;
2520 } 2520 }
2521 } 2521 }
2522 2522
@@ -2546,7 +2546,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2546 ceph_cap_string(newcaps), 2546 ceph_cap_string(newcaps),
2547 ceph_cap_string(revoking)); 2547 ceph_cap_string(revoking));
2548 if (revoking & used & CEPH_CAP_FILE_BUFFER) 2548 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2549 writeback = 1; /* initiate writeback; will delay ack */ 2549 writeback = true; /* initiate writeback; will delay ack */
2550 else if (revoking == CEPH_CAP_FILE_CACHE && 2550 else if (revoking == CEPH_CAP_FILE_CACHE &&
2551 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && 2551 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2552 queue_invalidate) 2552 queue_invalidate)
@@ -2572,7 +2572,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2572 cap->implemented |= newcaps; /* add bits only, to 2572 cap->implemented |= newcaps; /* add bits only, to
2573 * avoid stepping on a 2573 * avoid stepping on a
2574 * pending revocation */ 2574 * pending revocation */
2575 wake = 1; 2575 wake = true;
2576 } 2576 }
2577 BUG_ON(cap->issued & ~cap->implemented); 2577 BUG_ON(cap->issued & ~cap->implemented);
2578 2578
@@ -2586,7 +2586,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2586 kick_flushing_inode_caps(mdsc, session, inode); 2586 kick_flushing_inode_caps(mdsc, session, inode);
2587 up_read(&mdsc->snap_rwsem); 2587 up_read(&mdsc->snap_rwsem);
2588 if (newcaps & ~issued) 2588 if (newcaps & ~issued)
2589 wake = 1; 2589 wake = true;
2590 } 2590 }
2591 2591
2592 if (queue_trunc) { 2592 if (queue_trunc) {
@@ -3045,6 +3045,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3045 } 3045 }
3046 } 3046 }
3047 3047
3048 /* lookup ino */
3049 inode = ceph_find_inode(sb, vino);
3050 ci = ceph_inode(inode);
3051 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
3052 vino.snap, inode);
3053
3048 mutex_lock(&session->s_mutex); 3054 mutex_lock(&session->s_mutex);
3049 session->s_seq++; 3055 session->s_seq++;
3050 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 3056 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -3053,11 +3059,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3053 if (op == CEPH_CAP_OP_IMPORT) 3059 if (op == CEPH_CAP_OP_IMPORT)
3054 ceph_add_cap_releases(mdsc, session); 3060 ceph_add_cap_releases(mdsc, session);
3055 3061
3056 /* lookup ino */
3057 inode = ceph_find_inode(sb, vino);
3058 ci = ceph_inode(inode);
3059 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
3060 vino.snap, inode);
3061 if (!inode) { 3062 if (!inode) {
3062 dout(" i don't have ino %llx\n", vino.ino); 3063 dout(" i don't have ino %llx\n", vino.ino);
3063 3064
@@ -3277,7 +3278,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
3277 rel->ino = cpu_to_le64(ceph_ino(inode)); 3278 rel->ino = cpu_to_le64(ceph_ino(inode));
3278 rel->cap_id = cpu_to_le64(cap->cap_id); 3279 rel->cap_id = cpu_to_le64(cap->cap_id);
3279 rel->seq = cpu_to_le32(cap->seq); 3280 rel->seq = cpu_to_le32(cap->seq);
3280 rel->issue_seq = cpu_to_le32(cap->issue_seq), 3281 rel->issue_seq = cpu_to_le32(cap->issue_seq);
3281 rel->mseq = cpu_to_le32(cap->mseq); 3282 rel->mseq = cpu_to_le32(cap->mseq);
3282 rel->caps = cpu_to_le32(cap->implemented); 3283 rel->caps = cpu_to_le32(cap->implemented);
3283 rel->wanted = cpu_to_le32(cap->mds_wanted); 3284 rel->wanted = cpu_to_le32(cap->mds_wanted);
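
The issue_seq hunk replaces a stray trailing comma with a semicolon. Because of the comma operator the old code still assigned both fields in order, so the breakage was latent rather than live — but inserting a statement between the two lines later could silently change meaning. A small demonstration of the operator involved:

    #include <stdio.h>

    int main(void)
    {
        int a, b;

        a = 1,        /* comma operator: this and the next line form  */
        b = 2;        /* ONE expression, evaluated left to right      */
        printf("%d %d\n", a, b);   /* "1 2" -- same as with ';' here  */
        return 0;
    }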
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 5a743ac141ab..5d5a4c8c8496 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -158,10 +158,47 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
158 return 0; 158 return 0;
159} 159}
160 160
161static int mds_sessions_show(struct seq_file *s, void *ptr)
162{
163 struct ceph_fs_client *fsc = s->private;
164 struct ceph_mds_client *mdsc = fsc->mdsc;
165 struct ceph_auth_client *ac = fsc->client->monc.auth;
166 struct ceph_options *opt = fsc->client->options;
167 int mds = -1;
168
169 mutex_lock(&mdsc->mutex);
170
171 /* The 'num' portion of an 'entity name' */
172 seq_printf(s, "global_id %llu\n", ac->global_id);
173
174 /* The -o name mount argument */
175 seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : "");
176
177 /* The list of MDS session rank+state */
178 for (mds = 0; mds < mdsc->max_sessions; mds++) {
179 struct ceph_mds_session *session =
180 __ceph_lookup_mds_session(mdsc, mds);
181 if (!session) {
182 continue;
183 }
184 mutex_unlock(&mdsc->mutex);
185 seq_printf(s, "mds.%d %s\n",
186 session->s_mds,
187 ceph_session_state_name(session->s_state));
188
189 ceph_put_mds_session(session);
190 mutex_lock(&mdsc->mutex);
191 }
192 mutex_unlock(&mdsc->mutex);
193
194 return 0;
195}
196
161CEPH_DEFINE_SHOW_FUNC(mdsmap_show) 197CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
162CEPH_DEFINE_SHOW_FUNC(mdsc_show) 198CEPH_DEFINE_SHOW_FUNC(mdsc_show)
163CEPH_DEFINE_SHOW_FUNC(caps_show) 199CEPH_DEFINE_SHOW_FUNC(caps_show)
164CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) 200CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
201CEPH_DEFINE_SHOW_FUNC(mds_sessions_show)
165 202
166 203
167/* 204/*
@@ -193,6 +230,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
193 debugfs_remove(fsc->debugfs_bdi); 230 debugfs_remove(fsc->debugfs_bdi);
194 debugfs_remove(fsc->debugfs_congestion_kb); 231 debugfs_remove(fsc->debugfs_congestion_kb);
195 debugfs_remove(fsc->debugfs_mdsmap); 232 debugfs_remove(fsc->debugfs_mdsmap);
233 debugfs_remove(fsc->debugfs_mds_sessions);
196 debugfs_remove(fsc->debugfs_caps); 234 debugfs_remove(fsc->debugfs_caps);
197 debugfs_remove(fsc->debugfs_mdsc); 235 debugfs_remove(fsc->debugfs_mdsc);
198 debugfs_remove(fsc->debugfs_dentry_lru); 236 debugfs_remove(fsc->debugfs_dentry_lru);
@@ -231,6 +269,14 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
231 if (!fsc->debugfs_mdsmap) 269 if (!fsc->debugfs_mdsmap)
232 goto out; 270 goto out;
233 271
272 fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions",
273 0600,
274 fsc->client->debugfs_dir,
275 fsc,
276 &mds_sessions_show_fops);
277 if (!fsc->debugfs_mds_sessions)
278 goto out;
279
234 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 280 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
235 0600, 281 0600,
236 fsc->client->debugfs_dir, 282 fsc->client->debugfs_dir,
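
The new mds_sessions debugfs file prints the client's global_id, the -o name mount argument, and one line per live MDS session with its state. A rough rendering of the output it produces (all values below are hypothetical), generated the same way:

    #include <stdio.h>

    int main(void)
    {
        const char *state[] = { "open", "opening" };  /* made-up states */

        printf("global_id %llu\n", 4242ULL);
        printf("name \"%s\"\n", "admin");
        for (int mds = 0; mds < 2; mds++)
            printf("mds.%d %s\n", mds, state[mds]);
        return 0;
    }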
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c29d6ae68874..e6d63f8f98c0 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -682,17 +682,22 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
682 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 682 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
683 struct ceph_mds_client *mdsc = fsc->mdsc; 683 struct ceph_mds_client *mdsc = fsc->mdsc;
684 struct ceph_mds_request *req; 684 struct ceph_mds_request *req;
685 struct ceph_acls_info acls = {};
685 int err; 686 int err;
686 687
687 if (ceph_snap(dir) != CEPH_NOSNAP) 688 if (ceph_snap(dir) != CEPH_NOSNAP)
688 return -EROFS; 689 return -EROFS;
689 690
691 err = ceph_pre_init_acls(dir, &mode, &acls);
692 if (err < 0)
693 return err;
694
690 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", 695 dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n",
691 dir, dentry, mode, rdev); 696 dir, dentry, mode, rdev);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); 697 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
693 if (IS_ERR(req)) { 698 if (IS_ERR(req)) {
694 d_drop(dentry); 699 err = PTR_ERR(req);
695 return PTR_ERR(req); 700 goto out;
696 } 701 }
697 req->r_dentry = dget(dentry); 702 req->r_dentry = dget(dentry);
698 req->r_num_caps = 2; 703 req->r_num_caps = 2;
@@ -701,15 +706,20 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry,
701 req->r_args.mknod.rdev = cpu_to_le32(rdev); 706 req->r_args.mknod.rdev = cpu_to_le32(rdev);
702 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 707 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
703 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 708 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
709 if (acls.pagelist) {
710 req->r_pagelist = acls.pagelist;
711 acls.pagelist = NULL;
712 }
704 err = ceph_mdsc_do_request(mdsc, dir, req); 713 err = ceph_mdsc_do_request(mdsc, dir, req);
705 if (!err && !req->r_reply_info.head->is_dentry) 714 if (!err && !req->r_reply_info.head->is_dentry)
706 err = ceph_handle_notrace_create(dir, dentry); 715 err = ceph_handle_notrace_create(dir, dentry);
707 ceph_mdsc_put_request(req); 716 ceph_mdsc_put_request(req);
708 717out:
709 if (!err) 718 if (!err)
710 ceph_init_acl(dentry, dentry->d_inode, dir); 719 ceph_init_inode_acls(dentry->d_inode, &acls);
711 else 720 else
712 d_drop(dentry); 721 d_drop(dentry);
722 ceph_release_acls_info(&acls);
713 return err; 723 return err;
714} 724}
715 725
@@ -733,8 +743,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
733 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); 743 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
734 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); 744 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
735 if (IS_ERR(req)) { 745 if (IS_ERR(req)) {
736 d_drop(dentry); 746 err = PTR_ERR(req);
737 return PTR_ERR(req); 747 goto out;
738 } 748 }
739 req->r_dentry = dget(dentry); 749 req->r_dentry = dget(dentry);
740 req->r_num_caps = 2; 750 req->r_num_caps = 2;
@@ -746,9 +756,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
746 if (!err && !req->r_reply_info.head->is_dentry) 756 if (!err && !req->r_reply_info.head->is_dentry)
747 err = ceph_handle_notrace_create(dir, dentry); 757 err = ceph_handle_notrace_create(dir, dentry);
748 ceph_mdsc_put_request(req); 758 ceph_mdsc_put_request(req);
749 if (!err) 759out:
750 ceph_init_acl(dentry, dentry->d_inode, dir); 760 if (err)
751 else
752 d_drop(dentry); 761 d_drop(dentry);
753 return err; 762 return err;
754} 763}
@@ -758,6 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
758 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); 767 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
759 struct ceph_mds_client *mdsc = fsc->mdsc; 768 struct ceph_mds_client *mdsc = fsc->mdsc;
760 struct ceph_mds_request *req; 769 struct ceph_mds_request *req;
770 struct ceph_acls_info acls = {};
761 int err = -EROFS; 771 int err = -EROFS;
762 int op; 772 int op;
763 773
@@ -772,6 +782,12 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
772 } else { 782 } else {
773 goto out; 783 goto out;
774 } 784 }
785
786 mode |= S_IFDIR;
787 err = ceph_pre_init_acls(dir, &mode, &acls);
788 if (err < 0)
789 goto out;
790
775 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); 791 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
776 if (IS_ERR(req)) { 792 if (IS_ERR(req)) {
777 err = PTR_ERR(req); 793 err = PTR_ERR(req);
@@ -784,15 +800,20 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
784 req->r_args.mkdir.mode = cpu_to_le32(mode); 800 req->r_args.mkdir.mode = cpu_to_le32(mode);
785 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 801 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
786 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 802 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
803 if (acls.pagelist) {
804 req->r_pagelist = acls.pagelist;
805 acls.pagelist = NULL;
806 }
787 err = ceph_mdsc_do_request(mdsc, dir, req); 807 err = ceph_mdsc_do_request(mdsc, dir, req);
788 if (!err && !req->r_reply_info.head->is_dentry) 808 if (!err && !req->r_reply_info.head->is_dentry)
789 err = ceph_handle_notrace_create(dir, dentry); 809 err = ceph_handle_notrace_create(dir, dentry);
790 ceph_mdsc_put_request(req); 810 ceph_mdsc_put_request(req);
791out: 811out:
792 if (!err) 812 if (!err)
793 ceph_init_acl(dentry, dentry->d_inode, dir); 813 ceph_init_inode_acls(dentry->d_inode, &acls);
794 else 814 else
795 d_drop(dentry); 815 d_drop(dentry);
816 ceph_release_acls_info(&acls);
796 return err; 817 return err;
797} 818}
798 819
@@ -1069,7 +1090,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
1069 ceph_dentry_lru_touch(dentry); 1090 ceph_dentry_lru_touch(dentry);
1070 } else { 1091 } else {
1071 ceph_dir_clear_complete(dir); 1092 ceph_dir_clear_complete(dir);
1072 d_drop(dentry);
1073 } 1093 }
1074 iput(dir); 1094 iput(dir);
1075 return valid; 1095 return valid;
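
ceph_mknod() and ceph_mkdir() above (and ceph_atomic_open() below) all follow the same ownership discipline: the pagelist built by ceph_pre_init_acls() is either transferred to the request — with acls.pagelist cleared so ceph_release_acls_info() will not free it a second time — or released locally on the error paths. A toy model of the hand-off, with illustrative stand-in types:

    #include <stdlib.h>
    #include <stdio.h>

    /* The pagelist is freed exactly once: by put_request() if it was
     * transferred, otherwise by release_acls_info(). */
    struct acls_info { void *pagelist; };
    struct request   { void *pagelist; };

    static void release_acls_info(struct acls_info *info)
    {
        free(info->pagelist);        /* free(NULL) is a no-op */
    }

    static void put_request(struct request *req)
    {
        free(req->pagelist);
    }

    int main(void)
    {
        struct acls_info acls = { malloc(64) };
        struct request req = { NULL };

        if (acls.pagelist) {         /* transfer, then forget our copy */
            req.pagelist = acls.pagelist;
            acls.pagelist = NULL;
        }

        put_request(&req);           /* frees the transferred pagelist */
        release_acls_info(&acls);    /* sees NULL, frees nothing */
        puts("no double free, no leak");
        return 0;
    }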
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 302085100c28..d7e0da8366e6 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -235,6 +235,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
235 struct ceph_mds_client *mdsc = fsc->mdsc; 235 struct ceph_mds_client *mdsc = fsc->mdsc;
236 struct ceph_mds_request *req; 236 struct ceph_mds_request *req;
237 struct dentry *dn; 237 struct dentry *dn;
238 struct ceph_acls_info acls = {};
238 int err; 239 int err;
239 240
240 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", 241 dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n",
@@ -248,22 +249,34 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
248 if (err < 0) 249 if (err < 0)
249 return err; 250 return err;
250 251
252 if (flags & O_CREAT) {
253 err = ceph_pre_init_acls(dir, &mode, &acls);
254 if (err < 0)
255 return err;
256 }
257
251 /* do the open */ 258 /* do the open */
252 req = prepare_open_request(dir->i_sb, flags, mode); 259 req = prepare_open_request(dir->i_sb, flags, mode);
253 if (IS_ERR(req)) 260 if (IS_ERR(req)) {
254 return PTR_ERR(req); 261 err = PTR_ERR(req);
262 goto out_acl;
263 }
255 req->r_dentry = dget(dentry); 264 req->r_dentry = dget(dentry);
256 req->r_num_caps = 2; 265 req->r_num_caps = 2;
257 if (flags & O_CREAT) { 266 if (flags & O_CREAT) {
258 req->r_dentry_drop = CEPH_CAP_FILE_SHARED; 267 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
259 req->r_dentry_unless = CEPH_CAP_FILE_EXCL; 268 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
269 if (acls.pagelist) {
270 req->r_pagelist = acls.pagelist;
271 acls.pagelist = NULL;
272 }
260 } 273 }
261 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 274 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
262 err = ceph_mdsc_do_request(mdsc, 275 err = ceph_mdsc_do_request(mdsc,
263 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
264 req); 277 req);
265 if (err) 278 if (err)
266 goto out_err; 279 goto out_req;
267 280
268 err = ceph_handle_snapdir(req, dentry, err); 281 err = ceph_handle_snapdir(req, dentry, err);
269 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
@@ -278,7 +291,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
278 dn = NULL; 291 dn = NULL;
279 } 292 }
280 if (err) 293 if (err)
281 goto out_err; 294 goto out_req;
282 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { 295 if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) {
283 /* make vfs retry on splice, ENOENT, or symlink */ 296 /* make vfs retry on splice, ENOENT, or symlink */
284 dout("atomic_open finish_no_open on dn %p\n", dn); 297 dout("atomic_open finish_no_open on dn %p\n", dn);
@@ -286,15 +299,17 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
286 } else { 299 } else {
287 dout("atomic_open finish_open on dn %p\n", dn); 300 dout("atomic_open finish_open on dn %p\n", dn);
288 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { 301 if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
289 ceph_init_acl(dentry, dentry->d_inode, dir); 302 ceph_init_inode_acls(dentry->d_inode, &acls);
290 *opened |= FILE_CREATED; 303 *opened |= FILE_CREATED;
291 } 304 }
292 err = finish_open(file, dentry, ceph_open, opened); 305 err = finish_open(file, dentry, ceph_open, opened);
293 } 306 }
294out_err: 307out_req:
295 if (!req->r_err && req->r_target_inode) 308 if (!req->r_err && req->r_target_inode)
296 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); 309 ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
297 ceph_mdsc_put_request(req); 310 ceph_mdsc_put_request(req);
311out_acl:
312 ceph_release_acls_info(&acls);
298 dout("atomic_open result=%d\n", err); 313 dout("atomic_open result=%d\n", err);
299 return err; 314 return err;
300} 315}
@@ -423,6 +438,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
423 dout("sync_read on file %p %llu~%u %s\n", file, off, 438 dout("sync_read on file %p %llu~%u %s\n", file, off,
424 (unsigned)len, 439 (unsigned)len,
425 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 440 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
441
442 if (!len)
443 return 0;
426 /* 444 /*
427 * flush any page cache pages in this range. this 445 * flush any page cache pages in this range. this
428 * will make concurrent normal and sync io slow, 446 * will make concurrent normal and sync io slow,
@@ -470,8 +488,11 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
470 size_t left = ret; 488 size_t left = ret;
471 489
472 while (left) { 490 while (left) {
473 int copy = min_t(size_t, PAGE_SIZE, left); 491 size_t page_off = off & ~PAGE_MASK;
474 l = copy_page_to_iter(pages[k++], 0, copy, i); 492 size_t copy = min_t(size_t,
493 PAGE_SIZE - page_off, left);
494 l = copy_page_to_iter(pages[k++], page_off,
495 copy, i);
475 off += l; 496 off += l;
476 left -= l; 497 left -= l;
477 if (l < copy) 498 if (l < copy)
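
This sync-read fix threads the in-page offset of the read position into copy_page_to_iter(); the old loop always copied each fetched page from offset 0, misplacing data whenever the read was not page aligned. The corrected arithmetic, runnable standalone with an illustrative unaligned read:

    #include <stdio.h>
    #include <stddef.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    int main(void)
    {
        unsigned long off = 5000;               /* read position */
        size_t left = 6000;                     /* bytes fetched */
        int page = 0;

        while (left) {
            size_t page_off = off & ~PAGE_MASK; /* offset inside page */
            size_t copy = PAGE_SIZE - page_off < left ?
                          PAGE_SIZE - page_off : left;

            printf("page %d: copy %zu bytes from offset %zu\n",
                   page++, copy, page_off);
            off += copy;
            left -= copy;
        }
        return 0;
    }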
@@ -531,7 +552,7 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
531 * objects, rollback on failure, etc.) 552 * objects, rollback on failure, etc.)
532 */ 553 */
533static ssize_t 554static ssize_t
534ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from) 555ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
535{ 556{
536 struct file *file = iocb->ki_filp; 557 struct file *file = iocb->ki_filp;
537 struct inode *inode = file_inode(file); 558 struct inode *inode = file_inode(file);
@@ -547,7 +568,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
547 int check_caps = 0; 568 int check_caps = 0;
548 int ret; 569 int ret;
549 struct timespec mtime = CURRENT_TIME; 570 struct timespec mtime = CURRENT_TIME;
550 loff_t pos = iocb->ki_pos;
551 size_t count = iov_iter_count(from); 571 size_t count = iov_iter_count(from);
552 572
553 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 573 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -646,7 +666,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from)
646 * correct atomic write, we should e.g. take write locks on all 666 * correct atomic write, we should e.g. take write locks on all
647 * objects, rollback on failure, etc.) 667 * objects, rollback on failure, etc.)
648 */ 668 */
649static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from) 669static ssize_t
670ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
650{ 671{
651 struct file *file = iocb->ki_filp; 672 struct file *file = iocb->ki_filp;
652 struct inode *inode = file_inode(file); 673 struct inode *inode = file_inode(file);
@@ -663,7 +684,6 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, struct iov_iter *from)
663 int check_caps = 0; 684 int check_caps = 0;
664 int ret; 685 int ret;
665 struct timespec mtime = CURRENT_TIME; 686 struct timespec mtime = CURRENT_TIME;
666 loff_t pos = iocb->ki_pos;
667 size_t count = iov_iter_count(from); 687 size_t count = iov_iter_count(from);
668 688
669 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) 689 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
@@ -821,8 +841,7 @@ again:
821 ceph_put_cap_refs(ci, got); 841 ceph_put_cap_refs(ci, got);
822 842
823 if (checkeof && ret >= 0) { 843 if (checkeof && ret >= 0) {
824 int statret = ceph_do_getattr(inode, 844 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
825 CEPH_STAT_CAP_SIZE);
826 845
827 /* hit EOF or hole? */ 846 /* hit EOF or hole? */
828 if (statret == 0 && iocb->ki_pos < inode->i_size && 847 if (statret == 0 && iocb->ki_pos < inode->i_size &&
@@ -831,7 +850,6 @@ again:
831 ", reading more\n", iocb->ki_pos, 850 ", reading more\n", iocb->ki_pos,
832 inode->i_size); 851 inode->i_size);
833 852
834 iov_iter_advance(to, ret);
835 read += ret; 853 read += ret;
836 len -= ret; 854 len -= ret;
837 checkeof = 0; 855 checkeof = 0;
@@ -918,9 +936,9 @@ retry_snap:
918 /* we might need to revert back to that point */ 936 /* we might need to revert back to that point */
919 data = *from; 937 data = *from;
920 if (file->f_flags & O_DIRECT) 938 if (file->f_flags & O_DIRECT)
921 written = ceph_sync_direct_write(iocb, &data); 939 written = ceph_sync_direct_write(iocb, &data, pos);
922 else 940 else
923 written = ceph_sync_write(iocb, &data); 941 written = ceph_sync_write(iocb, &data, pos);
924 if (written == -EOLDSNAPC) { 942 if (written == -EOLDSNAPC) {
925 dout("aio_write %p %llx.%llx %llu~%u" 943 dout("aio_write %p %llx.%llx %llu~%u"
926 "got EOLDSNAPC, retrying\n", 944 "got EOLDSNAPC, retrying\n",
@@ -990,7 +1008,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
990 mutex_lock(&inode->i_mutex); 1008 mutex_lock(&inode->i_mutex);
991 1009
992 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 1010 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
993 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); 1011 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
994 if (ret < 0) { 1012 if (ret < 0) {
995 offset = ret; 1013 offset = ret;
996 goto out; 1014 goto out;
@@ -1177,6 +1195,9 @@ static long ceph_fallocate(struct file *file, int mode,
1177 loff_t endoff = 0; 1195 loff_t endoff = 0;
1178 loff_t size; 1196 loff_t size;
1179 1197
1198 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
1199 return -EOPNOTSUPP;
1200
1180 if (!S_ISREG(inode->i_mode)) 1201 if (!S_ISREG(inode->i_mode))
1181 return -EOPNOTSUPP; 1202 return -EOPNOTSUPP;
1182 1203
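
ceph_fallocate() now rejects mode bits it does not implement up front, instead of silently misinterpreting them. A minimal model of the new first check, using the flag values from linux/falloc.h:

    #include <stdio.h>

    #define FALLOC_FL_KEEP_SIZE  0x01
    #define FALLOC_FL_PUNCH_HOLE 0x02

    /* Reject any mode bits beyond the two ceph implements. */
    static int check_mode(int mode)
    {
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
            return -95;                  /* -EOPNOTSUPP */
        return 0;
    }

    int main(void)
    {
        printf("%d %d\n",
               check_mode(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE),
               check_mode(0x08 /* e.g. FALLOC_FL_COLLAPSE_RANGE */));
        return 0;
    }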
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 04c89c266cec..7b6139004401 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -766,7 +766,7 @@ static int fill_inode(struct inode *inode,
766 766
767 /* xattrs */ 767 /* xattrs */
768 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ 768 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
769 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && 769 if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
770 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { 770 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
771 if (ci->i_xattrs.blob) 771 if (ci->i_xattrs.blob)
772 ceph_buffer_put(ci->i_xattrs.blob); 772 ceph_buffer_put(ci->i_xattrs.blob);
@@ -1813,10 +1813,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1813 if (ia_valid & ATTR_SIZE) { 1813 if (ia_valid & ATTR_SIZE) {
1814 dout("setattr %p size %lld -> %lld\n", inode, 1814 dout("setattr %p size %lld -> %lld\n", inode,
1815 inode->i_size, attr->ia_size); 1815 inode->i_size, attr->ia_size);
1816 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1817 err = -EINVAL;
1818 goto out;
1819 }
1820 if ((issued & CEPH_CAP_FILE_EXCL) && 1816 if ((issued & CEPH_CAP_FILE_EXCL) &&
1821 attr->ia_size > inode->i_size) { 1817 attr->ia_size > inode->i_size) {
1822 inode->i_size = attr->ia_size; 1818 inode->i_size = attr->ia_size;
@@ -1896,8 +1892,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1896 if (mask & CEPH_SETATTR_SIZE) 1892 if (mask & CEPH_SETATTR_SIZE)
1897 __ceph_do_pending_vmtruncate(inode); 1893 __ceph_do_pending_vmtruncate(inode);
1898 return err; 1894 return err;
1899out:
1900 spin_unlock(&ci->i_ceph_lock);
1901out_put: 1895out_put:
1902 ceph_mdsc_put_request(req); 1896 ceph_mdsc_put_request(req);
1903 return err; 1897 return err;
@@ -1907,7 +1901,7 @@ out_put:
1907 * Verify that we have a lease on the given mask. If not, 1901 * Verify that we have a lease on the given mask. If not,
1908 * do a getattr against an mds. 1902 * do a getattr against an mds.
1909 */ 1903 */
1910int ceph_do_getattr(struct inode *inode, int mask) 1904int ceph_do_getattr(struct inode *inode, int mask, bool force)
1911{ 1905{
1912 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 1906 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1913 struct ceph_mds_client *mdsc = fsc->mdsc; 1907 struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1920,7 +1914,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1920 } 1914 }
1921 1915
1922 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); 1916 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1923 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1917 if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1924 return 0; 1918 return 0;
1925 1919
1926 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 1920 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
@@ -1948,7 +1942,7 @@ int ceph_permission(struct inode *inode, int mask)
1948 if (mask & MAY_NOT_BLOCK) 1942 if (mask & MAY_NOT_BLOCK)
1949 return -ECHILD; 1943 return -ECHILD;
1950 1944
1951 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); 1945 err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false);
1952 1946
1953 if (!err) 1947 if (!err)
1954 err = generic_permission(inode, mask); 1948 err = generic_permission(inode, mask);
@@ -1966,7 +1960,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1966 struct ceph_inode_info *ci = ceph_inode(inode); 1960 struct ceph_inode_info *ci = ceph_inode(inode);
1967 int err; 1961 int err;
1968 1962
1969 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); 1963 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false);
1970 if (!err) { 1964 if (!err) {
1971 generic_fillattr(inode, stat); 1965 generic_fillattr(inode, stat);
1972 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); 1966 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
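
ceph_do_getattr() grows a force flag; every caller updated in this series passes false, so their behaviour is unchanged, and force presumably exists so a future caller can bypass the cap-coverage fast path and always consult the MDS. A toy model of the dispatch:

    #include <stdio.h>

    /* 0 = satisfied from locally held caps, 1 = issue a getattr. */
    static int do_getattr(int caps_cover_mask, int force)
    {
        if (!force && caps_cover_mask)
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d %d %d\n",
               do_getattr(1, 0),   /* caps held, no force -> skip  */
               do_getattr(1, 1),   /* force -> ask the MDS anyway  */
               do_getattr(0, 0));  /* no caps -> ask the MDS       */
        return 0;
    }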
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index a822a6e58290..f851d8d70158 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -19,7 +19,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
19 struct ceph_ioctl_layout l; 19 struct ceph_ioctl_layout l;
20 int err; 20 int err;
21 21
22 err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); 22 err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
23 if (!err) { 23 if (!err) {
24 l.stripe_unit = ceph_file_layout_su(ci->i_layout); 24 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
25 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); 25 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
@@ -41,7 +41,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc,
41 /* validate striping parameters */ 41 /* validate striping parameters */
42 if ((l->object_size & ~PAGE_MASK) || 42 if ((l->object_size & ~PAGE_MASK) ||
43 (l->stripe_unit & ~PAGE_MASK) || 43 (l->stripe_unit & ~PAGE_MASK) ||
44 (l->stripe_unit != 0 && 44 ((unsigned)l->stripe_unit != 0 &&
45 ((unsigned)l->object_size % (unsigned)l->stripe_unit))) 45 ((unsigned)l->object_size % (unsigned)l->stripe_unit)))
46 return -EINVAL; 46 return -EINVAL;
47 47
@@ -74,7 +74,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
74 return -EFAULT; 74 return -EFAULT;
75 75
76 /* validate changed params against current layout */ 76 /* validate changed params against current layout */
77 err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); 77 err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
78 if (err) 78 if (err)
79 return err; 79 return err;
80 80
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 92a2548278fc..a92d3f5c6c12 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/debugfs.h> 8#include <linux/debugfs.h>
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/utsname.h>
10 11
11#include "super.h" 12#include "super.h"
12#include "mds_client.h" 13#include "mds_client.h"
@@ -334,7 +335,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
334/* 335/*
335 * sessions 336 * sessions
336 */ 337 */
337static const char *session_state_name(int s) 338const char *ceph_session_state_name(int s)
338{ 339{
339 switch (s) { 340 switch (s) {
340 case CEPH_MDS_SESSION_NEW: return "new"; 341 case CEPH_MDS_SESSION_NEW: return "new";
@@ -542,6 +543,8 @@ void ceph_mdsc_release_request(struct kref *kref)
542 } 543 }
543 kfree(req->r_path1); 544 kfree(req->r_path1);
544 kfree(req->r_path2); 545 kfree(req->r_path2);
546 if (req->r_pagelist)
547 ceph_pagelist_release(req->r_pagelist);
545 put_request_session(req); 548 put_request_session(req);
546 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); 549 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
547 kfree(req); 550 kfree(req);
@@ -812,6 +815,74 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
812 h = msg->front.iov_base; 815 h = msg->front.iov_base;
813 h->op = cpu_to_le32(op); 816 h->op = cpu_to_le32(op);
814 h->seq = cpu_to_le64(seq); 817 h->seq = cpu_to_le64(seq);
818
819 return msg;
820}
821
822/*
 823 * Session message, a specialization for CEPH_SESSION_REQUEST_OPEN
 824 * that includes additional client metadata fields.
825 */
826static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
827{
828 struct ceph_msg *msg;
829 struct ceph_mds_session_head *h;
830 int i = -1;
831 int metadata_bytes = 0;
832 int metadata_key_count = 0;
833 struct ceph_options *opt = mdsc->fsc->client->options;
834 void *p;
835
836 const char* metadata[3][2] = {
837 {"hostname", utsname()->nodename},
838 {"entity_id", opt->name ? opt->name : ""},
839 {NULL, NULL}
840 };
841
842 /* Calculate serialized length of metadata */
843 metadata_bytes = 4; /* map length */
844 for (i = 0; metadata[i][0] != NULL; ++i) {
845 metadata_bytes += 8 + strlen(metadata[i][0]) +
846 strlen(metadata[i][1]);
847 metadata_key_count++;
848 }
849
850 /* Allocate the message */
851 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes,
852 GFP_NOFS, false);
853 if (!msg) {
854 pr_err("create_session_msg ENOMEM creating msg\n");
855 return NULL;
856 }
857 h = msg->front.iov_base;
858 h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
859 h->seq = cpu_to_le64(seq);
860
861 /*
862 * Serialize client metadata into waiting buffer space, using
863 * the format that userspace expects for map<string, string>
864 */
865 msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */
866
867 /* The write pointer, following the session_head structure */
868 p = msg->front.iov_base + sizeof(*h);
869
870 /* Number of entries in the map */
871 ceph_encode_32(&p, metadata_key_count);
872
873 /* Two length-prefixed strings for each entry in the map */
874 for (i = 0; metadata[i][0] != NULL; ++i) {
875 size_t const key_len = strlen(metadata[i][0]);
876 size_t const val_len = strlen(metadata[i][1]);
877
878 ceph_encode_32(&p, key_len);
879 memcpy(p, metadata[i][0], key_len);
880 p += key_len;
881 ceph_encode_32(&p, val_len);
882 memcpy(p, metadata[i][1], val_len);
883 p += val_len;
884 }
885
815 return msg; 886 return msg;
816} 887}
817 888
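
The metadata block built above is the userspace-compatible encoding of
map<string,string>: a little-endian u32 entry count, then for each
entry a u32 length-prefixed key followed by a u32 length-prefixed
value, which is exactly why the size estimate charges 8 bytes of
overhead per pair. A standalone sketch of that serialization (buffer
size and sample values are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Little-endian u32, matching ceph_encode_32(). */
static void put_le32(uint8_t **p, uint32_t v)
{
        (*p)[0] = v; (*p)[1] = v >> 8; (*p)[2] = v >> 16; (*p)[3] = v >> 24;
        *p += 4;
}

static void put_str(uint8_t **p, const char *s)
{
        uint32_t len = strlen(s);

        put_le32(p, len);
        memcpy(*p, s, len);
        *p += len;
}

int main(void)
{
        const char *metadata[][2] = {
                { "hostname",  "client01" },
                { "entity_id", "admin"    },
                { NULL, NULL }
        };
        uint8_t buf[256], *p = buf;
        uint32_t i, n = 0;

        for (i = 0; metadata[i][0]; i++)
                n++;
        put_le32(&p, n);                        /* map length */
        for (i = 0; metadata[i][0]; i++) {      /* len-prefixed key/value */
                put_str(&p, metadata[i][0]);
                put_str(&p, metadata[i][1]);
        }
        printf("encoded %zu bytes for %u entries\n",
               (size_t)(p - buf), n);
        return 0;
}
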
@@ -835,7 +906,7 @@ static int __open_session(struct ceph_mds_client *mdsc,
835 session->s_renew_requested = jiffies; 906 session->s_renew_requested = jiffies;
836 907
837 /* send connect message */ 908 /* send connect message */
838 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 909 msg = create_session_open_msg(mdsc, session->s_seq);
839 if (!msg) 910 if (!msg)
840 return -ENOMEM; 911 return -ENOMEM;
841 ceph_con_send(&session->s_con, msg); 912 ceph_con_send(&session->s_con, msg);
@@ -1164,7 +1235,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc,
1164 struct ceph_msg *msg; 1235 struct ceph_msg *msg;
1165 1236
1166 dout("send_flushmsg_ack to mds%d (%s) seq %lld\n", 1237 dout("send_flushmsg_ack to mds%d (%s) seq %lld\n",
1167 session->s_mds, session_state_name(session->s_state), seq); 1238 session->s_mds, ceph_session_state_name(session->s_state), seq);
1168 msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); 1239 msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
1169 if (!msg) 1240 if (!msg)
1170 return -ENOMEM; 1241 return -ENOMEM;
@@ -1216,7 +1287,7 @@ static int request_close_session(struct ceph_mds_client *mdsc,
1216 struct ceph_msg *msg; 1287 struct ceph_msg *msg;
1217 1288
1218 dout("request_close_session mds%d state %s seq %lld\n", 1289 dout("request_close_session mds%d state %s seq %lld\n",
1219 session->s_mds, session_state_name(session->s_state), 1290 session->s_mds, ceph_session_state_name(session->s_state),
1220 session->s_seq); 1291 session->s_seq);
1221 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 1292 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
1222 if (!msg) 1293 if (!msg)
@@ -1847,13 +1918,15 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1847 msg->front.iov_len = p - msg->front.iov_base; 1918 msg->front.iov_len = p - msg->front.iov_base;
1848 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1919 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1849 1920
1850 if (req->r_data_len) { 1921 if (req->r_pagelist) {
1851 /* outbound data set only by ceph_sync_setxattr() */ 1922 struct ceph_pagelist *pagelist = req->r_pagelist;
1852 BUG_ON(!req->r_pages); 1923 atomic_inc(&pagelist->refcnt);
1853 ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); 1924 ceph_msg_data_add_pagelist(msg, pagelist);
1925 msg->hdr.data_len = cpu_to_le32(pagelist->length);
1926 } else {
1927 msg->hdr.data_len = 0;
1854 } 1928 }
1855 1929
1856 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1857 msg->hdr.data_off = cpu_to_le16(0); 1930 msg->hdr.data_off = cpu_to_le16(0);
1858 1931
1859out_free2: 1932out_free2:
@@ -1904,6 +1977,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1904 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1977 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1905 1978
1906 if (req->r_got_unsafe) { 1979 if (req->r_got_unsafe) {
1980 void *p;
1907 /* 1981 /*
1908 * Replay. Do not regenerate message (and rebuild 1982 * Replay. Do not regenerate message (and rebuild
1909 * paths, etc.); just use the original message. 1983 * paths, etc.); just use the original message.
@@ -1924,8 +1998,13 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1924 1998
1925 /* remove cap/dentry releases from message */ 1999 /* remove cap/dentry releases from message */
1926 rhead->num_releases = 0; 2000 rhead->num_releases = 0;
1927 msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); 2001
1928 msg->front.iov_len = req->r_request_release_offset; 2002 /* time stamp */
2003 p = msg->front.iov_base + req->r_request_release_offset;
2004 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
2005
2006 msg->front.iov_len = p - msg->front.iov_base;
2007 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1929 return 0; 2008 return 0;
1930 } 2009 }
1931 2010
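
On replay the original message is reused rather than regenerated: the
cap/dentry releases are dropped, the request time stamp is re-encoded
at the saved release offset, and both front.iov_len and hdr.front_len
are recomputed from the final write pointer. A sketch of that
truncate-and-append fixup on a plain buffer (struct layout and field
names are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct msg {
        uint8_t front[128];     /* serialized request */
        size_t  iov_len;        /* bytes currently valid */
};

/* Truncate the message at release_offset, re-append the timestamp
 * (the ceph_encode_copy() step), and recompute the length from the
 * write pointer. */
static void replay_fixup(struct msg *m, size_t release_offset,
                         uint64_t stamp)
{
        uint8_t *p = m->front + release_offset;

        memcpy(p, &stamp, sizeof(stamp));
        p += sizeof(stamp);
        m->iov_len = p - m->front;      /* hdr.front_len follows suit */
}

int main(void)
{
        struct msg m = { .iov_len = 100 };

        replay_fixup(&m, 64, 1234567890ull);
        printf("new length: %zu\n", m.iov_len); /* 72 */
        return 0;
}
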
@@ -2001,7 +2080,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
2001 req->r_session = get_session(session); 2080 req->r_session = get_session(session);
2002 2081
2003 dout("do_request mds%d session %p state %s\n", mds, session, 2082 dout("do_request mds%d session %p state %s\n", mds, session,
2004 session_state_name(session->s_state)); 2083 ceph_session_state_name(session->s_state));
2005 if (session->s_state != CEPH_MDS_SESSION_OPEN && 2084 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
2006 session->s_state != CEPH_MDS_SESSION_HUNG) { 2085 session->s_state != CEPH_MDS_SESSION_HUNG) {
2007 if (session->s_state == CEPH_MDS_SESSION_NEW || 2086 if (session->s_state == CEPH_MDS_SESSION_NEW ||
@@ -2061,16 +2140,18 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
2061static void kick_requests(struct ceph_mds_client *mdsc, int mds) 2140static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2062{ 2141{
2063 struct ceph_mds_request *req; 2142 struct ceph_mds_request *req;
2064 struct rb_node *p; 2143 struct rb_node *p = rb_first(&mdsc->request_tree);
2065 2144
2066 dout("kick_requests mds%d\n", mds); 2145 dout("kick_requests mds%d\n", mds);
2067 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) { 2146 while (p) {
2068 req = rb_entry(p, struct ceph_mds_request, r_node); 2147 req = rb_entry(p, struct ceph_mds_request, r_node);
2148 p = rb_next(p);
2069 if (req->r_got_unsafe) 2149 if (req->r_got_unsafe)
2070 continue; 2150 continue;
2071 if (req->r_session && 2151 if (req->r_session &&
2072 req->r_session->s_mds == mds) { 2152 req->r_session->s_mds == mds) {
2073 dout(" kicking tid %llu\n", req->r_tid); 2153 dout(" kicking tid %llu\n", req->r_tid);
2154 list_del_init(&req->r_wait);
2074 __do_request(mdsc, req); 2155 __do_request(mdsc, req);
2075 } 2156 }
2076 } 2157 }
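
The loop now fetches the successor before handling the current node,
because __do_request() (and the new list_del_init()) can unlink the
request and invalidate the iterator. The same advance-then-act idiom
sketched on a singly linked list (the kernel walks an rbtree; this
only illustrates the iteration pattern):

#include <stdio.h>
#include <stdlib.h>

struct req {
        int tid;
        struct req *next;
};

/* May free (unlink) the node, so callers must not touch it after. */
static void do_request(struct req *r)
{
        printf("kicking tid %d\n", r->tid);
        free(r);
}

int main(void)
{
        struct req *head = NULL, *p, *next;
        int i;

        for (i = 3; i >= 1; i--) {
                p = malloc(sizeof(*p));
                p->tid = i;
                p->next = head;
                head = p;
        }
        /* Save the successor first; do_request() may invalidate p. */
        for (p = head; p; p = next) {
                next = p->next;
                do_request(p);
        }
        return 0;
}
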
@@ -2248,6 +2329,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2248 */ 2329 */
2249 if (result == -ESTALE) { 2330 if (result == -ESTALE) {
2250 dout("got ESTALE on request %llu", req->r_tid); 2331 dout("got ESTALE on request %llu", req->r_tid);
2332 req->r_resend_mds = -1;
2251 if (req->r_direct_mode != USE_AUTH_MDS) { 2333 if (req->r_direct_mode != USE_AUTH_MDS) {
2252 dout("not using auth, setting for that now"); 2334 dout("not using auth, setting for that now");
2253 req->r_direct_mode = USE_AUTH_MDS; 2335 req->r_direct_mode = USE_AUTH_MDS;
@@ -2436,7 +2518,7 @@ static void handle_session(struct ceph_mds_session *session,
2436 2518
2437 dout("handle_session mds%d %s %p state %s seq %llu\n", 2519 dout("handle_session mds%d %s %p state %s seq %llu\n",
2438 mds, ceph_session_op_name(op), session, 2520 mds, ceph_session_op_name(op), session,
2439 session_state_name(session->s_state), seq); 2521 ceph_session_state_name(session->s_state), seq);
2440 2522
2441 if (session->s_state == CEPH_MDS_SESSION_HUNG) { 2523 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
2442 session->s_state = CEPH_MDS_SESSION_OPEN; 2524 session->s_state = CEPH_MDS_SESSION_OPEN;
@@ -2463,9 +2545,8 @@ static void handle_session(struct ceph_mds_session *session,
2463 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 2545 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2464 pr_info("mds%d reconnect denied\n", session->s_mds); 2546 pr_info("mds%d reconnect denied\n", session->s_mds);
2465 remove_session_caps(session); 2547 remove_session_caps(session);
2466 wake = 1; /* for good measure */ 2548 wake = 2; /* for good measure */
2467 wake_up_all(&mdsc->session_close_wq); 2549 wake_up_all(&mdsc->session_close_wq);
2468 kick_requests(mdsc, mds);
2469 break; 2550 break;
2470 2551
2471 case CEPH_SESSION_STALE: 2552 case CEPH_SESSION_STALE:
@@ -2495,6 +2576,8 @@ static void handle_session(struct ceph_mds_session *session,
2495 if (wake) { 2576 if (wake) {
2496 mutex_lock(&mdsc->mutex); 2577 mutex_lock(&mdsc->mutex);
2497 __wake_requests(mdsc, &session->s_waiting); 2578 __wake_requests(mdsc, &session->s_waiting);
2579 if (wake == 2)
2580 kick_requests(mdsc, mds);
2498 mutex_unlock(&mdsc->mutex); 2581 mutex_unlock(&mdsc->mutex);
2499 } 2582 }
2500 return; 2583 return;
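
Note the CLOSE branch no longer calls kick_requests() directly; it
records wake = 2 so the kick happens later, in the common tail, while
mdsc->mutex is already held for __wake_requests(). A hedged pthreads
sketch of that defer-under-lock shape (all names illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mdsc_mutex = PTHREAD_MUTEX_INITIALIZER;

static void wake_requests(void) { printf("waking session waiters\n"); }
static void kick_requests(void) { printf("kicking mds requests\n"); }

static void handle_session(int session_closed)
{
        int wake = session_closed ? 2 : 1;

        /* ...per-op handling runs without mdsc_mutex held... */

        if (wake) {
                pthread_mutex_lock(&mdsc_mutex);
                wake_requests();
                if (wake == 2)          /* deferred from the CLOSE branch */
                        kick_requests();
                pthread_mutex_unlock(&mdsc_mutex);
        }
}

int main(void)
{
        handle_session(1);
        return 0;
}
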
@@ -2687,18 +2770,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2687 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2770 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2688 session->s_seq = 0; 2771 session->s_seq = 0;
2689 2772
2690 ceph_con_close(&session->s_con);
2691 ceph_con_open(&session->s_con,
2692 CEPH_ENTITY_TYPE_MDS, mds,
2693 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2694
2695 /* replay unsafe requests */
2696 replay_unsafe_requests(mdsc, session);
2697
2698 down_read(&mdsc->snap_rwsem);
2699
2700 dout("session %p state %s\n", session, 2773 dout("session %p state %s\n", session,
2701 session_state_name(session->s_state)); 2774 ceph_session_state_name(session->s_state));
2702 2775
2703 spin_lock(&session->s_gen_ttl_lock); 2776 spin_lock(&session->s_gen_ttl_lock);
2704 session->s_cap_gen++; 2777 session->s_cap_gen++;
@@ -2715,6 +2788,19 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2715 discard_cap_releases(mdsc, session); 2788 discard_cap_releases(mdsc, session);
2716 spin_unlock(&session->s_cap_lock); 2789 spin_unlock(&session->s_cap_lock);
2717 2790
2791 /* trim unused caps to reduce MDS's cache rejoin time */
2792 shrink_dcache_parent(mdsc->fsc->sb->s_root);
2793
2794 ceph_con_close(&session->s_con);
2795 ceph_con_open(&session->s_con,
2796 CEPH_ENTITY_TYPE_MDS, mds,
2797 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2798
2799 /* replay unsafe requests */
2800 replay_unsafe_requests(mdsc, session);
2801
2802 down_read(&mdsc->snap_rwsem);
2803
2718 /* traverse this session's caps */ 2804 /* traverse this session's caps */
2719 s_nr_caps = session->s_nr_caps; 2805 s_nr_caps = session->s_nr_caps;
2720 err = ceph_pagelist_encode_32(pagelist, s_nr_caps); 2806 err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
@@ -2783,7 +2869,6 @@ fail:
2783 mutex_unlock(&session->s_mutex); 2869 mutex_unlock(&session->s_mutex);
2784fail_nomsg: 2870fail_nomsg:
2785 ceph_pagelist_release(pagelist); 2871 ceph_pagelist_release(pagelist);
2786 kfree(pagelist);
2787fail_nopagelist: 2872fail_nopagelist:
2788 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2873 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2789 return; 2874 return;
@@ -2819,7 +2904,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2819 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", 2904 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2820 ceph_mds_state_name(newstate), 2905 ceph_mds_state_name(newstate),
2821 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", 2906 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2822 session_state_name(s->s_state)); 2907 ceph_session_state_name(s->s_state));
2823 2908
2824 if (i >= newmap->m_max_mds || 2909 if (i >= newmap->m_max_mds ||
2825 memcmp(ceph_mdsmap_get_addr(oldmap, i), 2910 memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2931,14 +3016,15 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2931 if (dname.len != get_unaligned_le32(h+1)) 3016 if (dname.len != get_unaligned_le32(h+1))
2932 goto bad; 3017 goto bad;
2933 3018
2934 mutex_lock(&session->s_mutex);
2935 session->s_seq++;
2936
2937 /* lookup inode */ 3019 /* lookup inode */
2938 inode = ceph_find_inode(sb, vino); 3020 inode = ceph_find_inode(sb, vino);
2939 dout("handle_lease %s, ino %llx %p %.*s\n", 3021 dout("handle_lease %s, ino %llx %p %.*s\n",
2940 ceph_lease_op_name(h->action), vino.ino, inode, 3022 ceph_lease_op_name(h->action), vino.ino, inode,
2941 dname.len, dname.name); 3023 dname.len, dname.name);
3024
3025 mutex_lock(&session->s_mutex);
3026 session->s_seq++;
3027
2942 if (inode == NULL) { 3028 if (inode == NULL) {
2943 dout("handle_lease no inode %llx\n", vino.ino); 3029 dout("handle_lease no inode %llx\n", vino.ino);
2944 goto release; 3030 goto release;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e00737cf523c..3288359353e9 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -202,9 +202,7 @@ struct ceph_mds_request {
202 bool r_direct_is_hash; /* true if r_direct_hash is valid */ 202 bool r_direct_is_hash; /* true if r_direct_hash is valid */
203 203
204 /* data payload is used for xattr ops */ 204 /* data payload is used for xattr ops */
205 struct page **r_pages; 205 struct ceph_pagelist *r_pagelist;
206 int r_num_pages;
207 int r_data_len;
208 206
209 /* what caps shall we drop? */ 207 /* what caps shall we drop? */
210 int r_inode_drop, r_inode_unless; 208 int r_inode_drop, r_inode_unless;
@@ -332,6 +330,8 @@ ceph_get_mds_session(struct ceph_mds_session *s)
332 return s; 330 return s;
333} 331}
334 332
333extern const char *ceph_session_state_name(int s);
334
335extern void ceph_put_mds_session(struct ceph_mds_session *s); 335extern void ceph_put_mds_session(struct ceph_mds_session *s);
336 336
337extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 337extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 06150fd745ac..f6e12377335c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -755,7 +755,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
755 goto out; 755 goto out;
756 } 756 }
757 } else { 757 } else {
758 root = d_obtain_alias(inode); 758 root = d_obtain_root(inode);
759 } 759 }
760 ceph_init_dentry(root); 760 ceph_init_dentry(root);
761 dout("open_root_inode success, root dentry is %p\n", root); 761 dout("open_root_inode success, root dentry is %p\n", root);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 12b20744e386..b82f507979b8 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -95,6 +95,7 @@ struct ceph_fs_client {
95 struct dentry *debugfs_congestion_kb; 95 struct dentry *debugfs_congestion_kb;
96 struct dentry *debugfs_bdi; 96 struct dentry *debugfs_bdi;
97 struct dentry *debugfs_mdsc, *debugfs_mdsmap; 97 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
98 struct dentry *debugfs_mds_sessions;
98#endif 99#endif
99 100
100#ifdef CONFIG_CEPH_FSCACHE 101#ifdef CONFIG_CEPH_FSCACHE
@@ -714,7 +715,7 @@ extern void ceph_queue_vmtruncate(struct inode *inode);
714extern void ceph_queue_invalidate(struct inode *inode); 715extern void ceph_queue_invalidate(struct inode *inode);
715extern void ceph_queue_writeback(struct inode *inode); 716extern void ceph_queue_writeback(struct inode *inode);
716 717
717extern int ceph_do_getattr(struct inode *inode, int mask); 718extern int ceph_do_getattr(struct inode *inode, int mask, bool force);
718extern int ceph_permission(struct inode *inode, int mask); 719extern int ceph_permission(struct inode *inode, int mask);
719extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 720extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
720extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, 721extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
@@ -733,15 +734,23 @@ extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
733extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); 734extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
734extern void __init ceph_xattr_init(void); 735extern void __init ceph_xattr_init(void);
735extern void ceph_xattr_exit(void); 736extern void ceph_xattr_exit(void);
737extern const struct xattr_handler *ceph_xattr_handlers[];
736 738
737/* acl.c */ 739/* acl.c */
738extern const struct xattr_handler *ceph_xattr_handlers[]; 740struct ceph_acls_info {
741 void *default_acl;
742 void *acl;
743 struct ceph_pagelist *pagelist;
744};
739 745
740#ifdef CONFIG_CEPH_FS_POSIX_ACL 746#ifdef CONFIG_CEPH_FS_POSIX_ACL
741 747
742struct posix_acl *ceph_get_acl(struct inode *, int); 748struct posix_acl *ceph_get_acl(struct inode *, int);
743int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); 749int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type);
744int ceph_init_acl(struct dentry *, struct inode *, struct inode *); 750int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
751 struct ceph_acls_info *info);
752void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info);
753void ceph_release_acls_info(struct ceph_acls_info *info);
745 754
746static inline void ceph_forget_all_cached_acls(struct inode *inode) 755static inline void ceph_forget_all_cached_acls(struct inode *inode)
747{ 756{
@@ -753,12 +762,18 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode)
753#define ceph_get_acl NULL 762#define ceph_get_acl NULL
754#define ceph_set_acl NULL 763#define ceph_set_acl NULL
755 764
756static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, 765static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
757 struct inode *dir) 766 struct ceph_acls_info *info)
758{ 767{
759 return 0; 768 return 0;
760} 769}
761 770static inline void ceph_init_inode_acls(struct inode *inode,
771 struct ceph_acls_info *info)
772{
773}
774static inline void ceph_release_acls_info(struct ceph_acls_info *info)
775{
776}
762static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) 777static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode)
763{ 778{
764 return 0; 779 return 0;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index c9c2b887381e..678b0d2bbbc4 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,4 +1,5 @@
1#include <linux/ceph/ceph_debug.h> 1#include <linux/ceph/ceph_debug.h>
2#include <linux/ceph/pagelist.h>
2 3
3#include "super.h" 4#include "super.h"
4#include "mds_client.h" 5#include "mds_client.h"
@@ -284,8 +285,7 @@ static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs)
284 return ceph_dir_vxattrs_name_size; 285 return ceph_dir_vxattrs_name_size;
285 if (vxattrs == ceph_file_vxattrs) 286 if (vxattrs == ceph_file_vxattrs)
286 return ceph_file_vxattrs_name_size; 287 return ceph_file_vxattrs_name_size;
287 BUG(); 288 BUG_ON(vxattrs);
288
289 return 0; 289 return 0;
290} 290}
291 291
@@ -592,12 +592,12 @@ start:
592 xattr_version = ci->i_xattrs.version; 592 xattr_version = ci->i_xattrs.version;
593 spin_unlock(&ci->i_ceph_lock); 593 spin_unlock(&ci->i_ceph_lock);
594 594
595 xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *), 595 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
596 GFP_NOFS); 596 GFP_NOFS);
597 err = -ENOMEM; 597 err = -ENOMEM;
598 if (!xattrs) 598 if (!xattrs)
599 goto bad_lock; 599 goto bad_lock;
600 memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *)); 600
601 for (i = 0; i < numattr; i++) { 601 for (i = 0; i < numattr; i++) {
602 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr), 602 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
603 GFP_NOFS); 603 GFP_NOFS);
@@ -736,24 +736,20 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
736 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 736 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
737 ci->i_xattrs.version, ci->i_xattrs.index_version); 737 ci->i_xattrs.version, ci->i_xattrs.index_version);
738 738
739 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 739 if (ci->i_xattrs.version == 0 ||
740 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 740 !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
741 goto get_xattr;
742 } else {
743 spin_unlock(&ci->i_ceph_lock); 741 spin_unlock(&ci->i_ceph_lock);
744 /* get xattrs from mds (if we don't already have them) */ 742 /* get xattrs from mds (if we don't already have them) */
745 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 743 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
746 if (err) 744 if (err)
747 return err; 745 return err;
746 spin_lock(&ci->i_ceph_lock);
748 } 747 }
749 748
750 spin_lock(&ci->i_ceph_lock);
751
752 err = __build_xattrs(inode); 749 err = __build_xattrs(inode);
753 if (err < 0) 750 if (err < 0)
754 goto out; 751 goto out;
755 752
756get_xattr:
757 err = -ENODATA; /* == ENOATTR */ 753 err = -ENODATA; /* == ENOATTR */
758 xattr = __get_xattr(ci, name); 754 xattr = __get_xattr(ci, name);
759 if (!xattr) 755 if (!xattr)
@@ -798,23 +794,18 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
798 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, 794 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
799 ci->i_xattrs.version, ci->i_xattrs.index_version); 795 ci->i_xattrs.version, ci->i_xattrs.index_version);
800 796
801 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 797 if (ci->i_xattrs.version == 0 ||
802 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { 798 !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) {
803 goto list_xattr;
804 } else {
805 spin_unlock(&ci->i_ceph_lock); 799 spin_unlock(&ci->i_ceph_lock);
806 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); 800 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
807 if (err) 801 if (err)
808 return err; 802 return err;
803 spin_lock(&ci->i_ceph_lock);
809 } 804 }
810 805
811 spin_lock(&ci->i_ceph_lock);
812
813 err = __build_xattrs(inode); 806 err = __build_xattrs(inode);
814 if (err < 0) 807 if (err < 0)
815 goto out; 808 goto out;
816
817list_xattr:
818 /* 809 /*
819 * Start with virtual dir xattr names (if any) (including 810 * Start with virtual dir xattr names (if any) (including
820 * terminating '\0' characters for each). 811 * terminating '\0' characters for each).
@@ -860,35 +851,25 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
860 struct ceph_inode_info *ci = ceph_inode(inode); 851 struct ceph_inode_info *ci = ceph_inode(inode);
861 struct ceph_mds_request *req; 852 struct ceph_mds_request *req;
862 struct ceph_mds_client *mdsc = fsc->mdsc; 853 struct ceph_mds_client *mdsc = fsc->mdsc;
854 struct ceph_pagelist *pagelist = NULL;
863 int err; 855 int err;
864 int i, nr_pages; 856
865 struct page **pages = NULL; 857 if (value) {
866 void *kaddr; 858 /* copy value into pagelist */
867 859 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
868 /* copy value into some pages */ 860 if (!pagelist)
869 nr_pages = calc_pages_for(0, size);
870 if (nr_pages) {
871 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
872 if (!pages)
873 return -ENOMEM; 861 return -ENOMEM;
874 err = -ENOMEM; 862
875 for (i = 0; i < nr_pages; i++) { 863 ceph_pagelist_init(pagelist);
876 pages[i] = __page_cache_alloc(GFP_NOFS); 864 err = ceph_pagelist_append(pagelist, value, size);
877 if (!pages[i]) { 865 if (err)
878 nr_pages = i; 866 goto out;
879 goto out; 867 } else {
880 } 868 flags |= CEPH_XATTR_REMOVE;
881 kaddr = kmap(pages[i]);
882 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
883 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
884 }
885 } 869 }
886 870
887 dout("setxattr value=%.*s\n", (int)size, value); 871 dout("setxattr value=%.*s\n", (int)size, value);
888 872
889 if (!value)
890 flags |= CEPH_XATTR_REMOVE;
891
892 /* do request */ 873 /* do request */
893 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, 874 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
894 USE_AUTH_MDS); 875 USE_AUTH_MDS);
@@ -903,9 +884,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
903 req->r_args.setxattr.flags = cpu_to_le32(flags); 884 req->r_args.setxattr.flags = cpu_to_le32(flags);
904 req->r_path2 = kstrdup(name, GFP_NOFS); 885 req->r_path2 = kstrdup(name, GFP_NOFS);
905 886
906 req->r_pages = pages; 887 req->r_pagelist = pagelist;
907 req->r_num_pages = nr_pages; 888 pagelist = NULL;
908 req->r_data_len = size;
909 889
910 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); 890 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
911 err = ceph_mdsc_do_request(mdsc, NULL, req); 891 err = ceph_mdsc_do_request(mdsc, NULL, req);
@@ -913,11 +893,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
913 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); 893 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
914 894
915out: 895out:
916 if (pages) { 896 if (pagelist)
917 for (i = 0; i < nr_pages; i++) 897 ceph_pagelist_release(pagelist);
918 __free_page(pages[i]);
919 kfree(pages);
920 }
921 return err; 898 return err;
922} 899}
923 900
@@ -968,7 +945,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
968retry: 945retry:
969 issued = __ceph_caps_issued(ci, NULL); 946 issued = __ceph_caps_issued(ci, NULL);
970 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); 947 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
971 if (!(issued & CEPH_CAP_XATTR_EXCL)) 948 if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
972 goto do_sync; 949 goto do_sync;
973 __build_xattrs(inode); 950 __build_xattrs(inode);
974 951
@@ -1077,7 +1054,7 @@ retry:
1077 issued = __ceph_caps_issued(ci, NULL); 1054 issued = __ceph_caps_issued(ci, NULL);
1078 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); 1055 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
1079 1056
1080 if (!(issued & CEPH_CAP_XATTR_EXCL)) 1057 if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL))
1081 goto do_sync; 1058 goto do_sync;
1082 __build_xattrs(inode); 1059 __build_xattrs(inode);
1083 1060
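
The xattr value now travels in one refcounted pagelist:
ceph_sync_setxattr() appends the value and hands ownership to the
request via r_pagelist, create_request_message() bumps the refcount
before attaching it to the message, and ceph_mdsc_release_request()
drops the request's reference. A minimal refcounted-buffer sketch of
that ownership flow (single-threaded, so a plain int stands in for the
kernel's atomic refcount; all names illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct pagelist {
        int    refcnt;
        size_t length;
        char   data[64];
};

static struct pagelist *pagelist_new(const void *val, size_t len)
{
        struct pagelist *pl = calloc(1, sizeof(*pl));

        if (!pl)
                return NULL;
        pl->refcnt = 1;
        pl->length = len;
        memcpy(pl->data, val, len);
        return pl;
}

static void pagelist_release(struct pagelist *pl)
{
        if (pl && --pl->refcnt == 0)
                free(pl);
}

int main(void)
{
        struct pagelist *pl = pagelist_new("value", 5);

        pl->refcnt++;           /* the message takes its own reference */
        printf("payload %zu bytes, refcnt %d\n", pl->length, pl->refcnt);
        pagelist_release(pl);   /* message done sending */
        pagelist_release(pl);   /* request teardown */
        return 0;
}
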
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 603f18a65c12..a2172f3f69e3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -22,6 +22,11 @@ config CIFS
22 support for OS/2 and Windows ME and similar servers is provided as 22 support for OS/2 and Windows ME and similar servers is provided as
23 well. 23 well.
24 24
 25 The module also provides optional support for the follow-on
 26 protocols to CIFS, including SMB3, which enable
 27 useful performance and security features (see the description
 28 of CONFIG_CIFS_SMB2).
29
25 The cifs module provides an advanced network file system 30 The cifs module provides an advanced network file system
26 client for mounting to CIFS compliant servers. It includes 31 client for mounting to CIFS compliant servers. It includes
27 support for DFS (hierarchical name space), secure per-user 32 support for DFS (hierarchical name space), secure per-user
@@ -121,7 +126,8 @@ config CIFS_ACL
121 depends on CIFS_XATTR && KEYS 126 depends on CIFS_XATTR && KEYS
122 help 127 help
123 Allows fetching CIFS/NTFS ACL from the server. The DACL blob 128 Allows fetching CIFS/NTFS ACL from the server. The DACL blob
124 is handed over to the application/caller. 129 is handed over to the application/caller. See the man
130 page for getcifsacl for more information.
125 131
126config CIFS_DEBUG 132config CIFS_DEBUG
127 bool "Enable CIFS debugging routines" 133 bool "Enable CIFS debugging routines"
@@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT
162 Allows NFS server to export a CIFS mounted share (nfsd over cifs) 168 Allows NFS server to export a CIFS mounted share (nfsd over cifs)
163 169
164config CIFS_SMB2 170config CIFS_SMB2
165 bool "SMB2 network file system support" 171 bool "SMB2 and SMB3 network file system support"
166 depends on CIFS && INET 172 depends on CIFS && INET
167 select NLS 173 select NLS
168 select KEYS 174 select KEYS
@@ -170,16 +176,21 @@ config CIFS_SMB2
170 select DNS_RESOLVER 176 select DNS_RESOLVER
171 177
172 help 178 help
173 This enables experimental support for the SMB2 (Server Message Block 179 This enables support for the Server Message Block version 2
174 version 2) protocol. The SMB2 protocol is the successor to the 180 family of protocols, including SMB3. SMB3 support is
175 popular CIFS and SMB network file sharing protocols. SMB2 is the 181 enabled on mount by specifying "vers=3.0" in the mount
176 native file sharing mechanism for recent versions of Windows 182 options. These protocols are the successors to the popular
177 operating systems (since Vista). SMB2 enablement will eventually 183 CIFS and SMB network file sharing protocols. SMB3 is the
178 allow users better performance, security and features, than would be 184 native file sharing mechanism for the more recent
179 possible with cifs. Note that smb2 mount options also are simpler 185 versions of Windows (Windows 8 and Windows 2012 and
180 (compared to cifs) due to protocol improvements. 186 later); Samba and many other servers support SMB3 well.
181 187 In general SMB3 enables better performance, security
182 Unless you are a developer or tester, say N. 188 and features than would be possible with CIFS (note that
189 when mounting to Samba, due to the CIFS POSIX extensions,
190 CIFS mounts can provide slightly better POSIX compatibility
191 than SMB3 mounts do). Note that SMB2/SMB3 mount
192 options are also slightly simpler (compared to CIFS) due
193 to protocol improvements.
183 194
184config CIFS_FSCACHE 195config CIFS_FSCACHE
185 bool "Provide CIFS client caching support" 196 bool "Provide CIFS client caching support"
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index f3ac4154cbb6..44ec72684df5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -213,7 +213,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
213 tcon->nativeFileSystem); 213 tcon->nativeFileSystem);
214 } 214 }
215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x" 215 seq_printf(m, "DevInfo: 0x%x Attributes: 0x%x"
216 "\n\tPathComponentMax: %d Status: 0x%d", 216 "\n\tPathComponentMax: %d Status: %d",
217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics), 217 le32_to_cpu(tcon->fsDevInfo.DeviceCharacteristics),
218 le32_to_cpu(tcon->fsAttrInfo.Attributes), 218 le32_to_cpu(tcon->fsAttrInfo.Attributes),
219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength), 219 le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength),
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 58df174deb10..b8602f199815 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -195,15 +195,15 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
195 else 195 else
196 noff = tkn_e - (sb_mountdata + off) + 1; 196 noff = tkn_e - (sb_mountdata + off) + 1;
197 197
198 if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) { 198 if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) {
199 off += noff; 199 off += noff;
200 continue; 200 continue;
201 } 201 }
202 if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) { 202 if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) {
203 off += noff; 203 off += noff;
204 continue; 204 continue;
205 } 205 }
206 if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) { 206 if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) {
207 off += noff; 207 off += noff;
208 continue; 208 continue;
209 } 209 }
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 9409fa10bd5c..3182273a3407 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -45,6 +45,7 @@
45#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ 45#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */
46#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ 46#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
47#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ 47#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
48#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */
48 49
49struct cifs_sb_info { 50struct cifs_sb_info {
50 struct rb_root tlink_tree; 51 struct rb_root tlink_tree;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index a3e932547617..f4cf200b3c76 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -62,7 +62,6 @@ cifs_spnego_key_destroy(struct key *key)
62struct key_type cifs_spnego_key_type = { 62struct key_type cifs_spnego_key_type = {
63 .name = "cifs.spnego", 63 .name = "cifs.spnego",
64 .instantiate = cifs_spnego_key_instantiate, 64 .instantiate = cifs_spnego_key_instantiate,
65 .match = user_match,
66 .destroy = cifs_spnego_key_destroy, 65 .destroy = cifs_spnego_key_destroy,
67 .describe = user_describe, 66 .describe = user_describe,
68}; 67};
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index 15e9505aa35f..0303c6793d90 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -20,6 +20,7 @@
20 */ 20 */
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include "cifs_fs_sb.h"
23#include "cifs_unicode.h" 24#include "cifs_unicode.h"
24#include "cifs_uniupr.h" 25#include "cifs_uniupr.h"
25#include "cifspdu.h" 26#include "cifspdu.h"
@@ -61,26 +62,24 @@ cifs_utf16_bytes(const __le16 *from, int maxbytes,
61 return outlen; 62 return outlen;
62} 63}
63 64
64/* 65int cifs_remap(struct cifs_sb_info *cifs_sb)
65 * cifs_mapchar - convert a host-endian char to proper char in codepage
66 * @target - where converted character should be copied
67 * @src_char - 2 byte host-endian source character
68 * @cp - codepage to which character should be converted
69 * @mapchar - should character be mapped according to mapchars mount option?
70 *
71 * This function handles the conversion of a single character. It is the
72 * responsibility of the caller to ensure that the target buffer is large
73 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
74 */
75static int
76cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
77 bool mapchar)
78{ 66{
79 int len = 1; 67 int map_type;
68
69 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
70 map_type = SFM_MAP_UNI_RSVD;
71 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
72 map_type = SFU_MAP_UNI_RSVD;
73 else
74 map_type = NO_MAP_UNI_RSVD;
80 75
81 if (!mapchar) 76 return map_type;
82 goto cp_convert; 77}
83 78
79/* Convert character using the SFU - "Services for Unix" remapping range */
80static bool
81convert_sfu_char(const __u16 src_char, char *target)
82{
84 /* 83 /*
85 * BB: Cannot handle remapping UNI_SLASH until all the calls to 84 * BB: Cannot handle remapping UNI_SLASH until all the calls to
86 * build_path_from_dentry are modified, as they use slash as 85 * build_path_from_dentry are modified, as they use slash as
@@ -106,19 +105,74 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
106 *target = '<'; 105 *target = '<';
107 break; 106 break;
108 default: 107 default:
109 goto cp_convert; 108 return false;
110 } 109 }
110 return true;
111}
112
113/* Convert character using the SFM - "Services for Mac" remapping range */
114static bool
115convert_sfm_char(const __u16 src_char, char *target)
116{
117 switch (src_char) {
118 case SFM_COLON:
119 *target = ':';
120 break;
121 case SFM_ASTERISK:
122 *target = '*';
123 break;
124 case SFM_QUESTION:
125 *target = '?';
126 break;
127 case SFM_PIPE:
128 *target = '|';
129 break;
130 case SFM_GRTRTHAN:
131 *target = '>';
132 break;
133 case SFM_LESSTHAN:
134 *target = '<';
135 break;
136 case SFM_SLASH:
137 *target = '\\';
138 break;
139 default:
140 return false;
141 }
142 return true;
143}
111 144
112out:
113 return len;
114 145
115cp_convert: 146/*
147 * cifs_mapchar - convert a host-endian char to proper char in codepage
148 * @target - where converted character should be copied
149 * @src_char - 2 byte host-endian source character
150 * @cp - codepage to which character should be converted
151 * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2?
152 *
153 * This function handles the conversion of a single character. It is the
154 * responsibility of the caller to ensure that the target buffer is large
155 * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
156 */
157static int
158cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
159 int maptype)
160{
161 int len = 1;
162
163 if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target))
164 return len;
165 else if ((maptype == SFU_MAP_UNI_RSVD) &&
166 convert_sfu_char(src_char, target))
167 return len;
168
169 /* if character not one of seven in special remap set */
116 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); 170 len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
117 if (len <= 0) { 171 if (len <= 0) {
118 *target = '?'; 172 *target = '?';
119 len = 1; 173 len = 1;
120 } 174 }
121 goto out; 175 return len;
122} 176}
123 177
124/* 178/*
@@ -145,7 +199,7 @@ cp_convert:
145 */ 199 */
146int 200int
147cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, 201cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
148 const struct nls_table *codepage, bool mapchar) 202 const struct nls_table *codepage, int map_type)
149{ 203{
150 int i, charlen, safelen; 204 int i, charlen, safelen;
151 int outlen = 0; 205 int outlen = 0;
@@ -172,13 +226,13 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
172 * conversion bleed into the null terminator 226 * conversion bleed into the null terminator
173 */ 227 */
174 if (outlen >= safelen) { 228 if (outlen >= safelen) {
175 charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); 229 charlen = cifs_mapchar(tmp, ftmp, codepage, map_type);
176 if ((outlen + charlen) > (tolen - nullsize)) 230 if ((outlen + charlen) > (tolen - nullsize))
177 break; 231 break;
178 } 232 }
179 233
180 /* put converted char into 'to' buffer */ 234 /* put converted char into 'to' buffer */
181 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); 235 charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type);
182 outlen += charlen; 236 outlen += charlen;
183 } 237 }
184 238
@@ -267,7 +321,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
267 if (!dst) 321 if (!dst)
268 return NULL; 322 return NULL;
269 cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage, 323 cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage,
270 false); 324 NO_MAP_UNI_RSVD);
271 } else { 325 } else {
272 len = strnlen(src, maxlen); 326 len = strnlen(src, maxlen);
273 len++; 327 len++;
@@ -280,6 +334,66 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
280 return dst; 334 return dst;
281} 335}
282 336
337static __le16 convert_to_sfu_char(char src_char)
338{
339 __le16 dest_char;
340
341 switch (src_char) {
342 case ':':
343 dest_char = cpu_to_le16(UNI_COLON);
344 break;
345 case '*':
346 dest_char = cpu_to_le16(UNI_ASTERISK);
347 break;
348 case '?':
349 dest_char = cpu_to_le16(UNI_QUESTION);
350 break;
351 case '<':
352 dest_char = cpu_to_le16(UNI_LESSTHAN);
353 break;
354 case '>':
355 dest_char = cpu_to_le16(UNI_GRTRTHAN);
356 break;
357 case '|':
358 dest_char = cpu_to_le16(UNI_PIPE);
359 break;
360 default:
361 dest_char = 0;
362 }
363
364 return dest_char;
365}
366
367static __le16 convert_to_sfm_char(char src_char)
368{
369 __le16 dest_char;
370
371 switch (src_char) {
372 case ':':
373 dest_char = cpu_to_le16(SFM_COLON);
374 break;
375 case '*':
376 dest_char = cpu_to_le16(SFM_ASTERISK);
377 break;
378 case '?':
379 dest_char = cpu_to_le16(SFM_QUESTION);
380 break;
381 case '<':
382 dest_char = cpu_to_le16(SFM_LESSTHAN);
383 break;
384 case '>':
385 dest_char = cpu_to_le16(SFM_GRTRTHAN);
386 break;
387 case '|':
388 dest_char = cpu_to_le16(SFM_PIPE);
389 break;
390 default:
391 dest_char = 0;
392 }
393
394 return dest_char;
395}
396
283/* 397/*
284 * Convert 16 bit Unicode pathname to wire format from string in current code 398 * Convert 16 bit Unicode pathname to wire format from string in current code
285 * page. Conversion may involve remapping up the six characters that are 399 * page. Conversion may involve remapping up the six characters that are
@@ -288,7 +402,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen,
288 */ 402 */
289int 403int
290cifsConvertToUTF16(__le16 *target, const char *source, int srclen, 404cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
291 const struct nls_table *cp, int mapChars) 405 const struct nls_table *cp, int map_chars)
292{ 406{
293 int i, charlen; 407 int i, charlen;
294 int j = 0; 408 int j = 0;
@@ -296,39 +410,30 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen,
296 __le16 dst_char; 410 __le16 dst_char;
297 wchar_t tmp; 411 wchar_t tmp;
298 412
299 if (!mapChars) 413 if (map_chars == NO_MAP_UNI_RSVD)
300 return cifs_strtoUTF16(target, source, PATH_MAX, cp); 414 return cifs_strtoUTF16(target, source, PATH_MAX, cp);
301 415
302 for (i = 0; i < srclen; j++) { 416 for (i = 0; i < srclen; j++) {
303 src_char = source[i]; 417 src_char = source[i];
304 charlen = 1; 418 charlen = 1;
305 switch (src_char) { 419
306 case 0: 420 /* check if end of string */
421 if (src_char == 0)
307 goto ctoUTF16_out; 422 goto ctoUTF16_out;
308 case ':': 423
309 dst_char = cpu_to_le16(UNI_COLON); 424 /* see if we must remap this char */
310 break; 425 if (map_chars == SFU_MAP_UNI_RSVD)
311 case '*': 426 dst_char = convert_to_sfu_char(src_char);
312 dst_char = cpu_to_le16(UNI_ASTERISK); 427 else if (map_chars == SFM_MAP_UNI_RSVD)
313 break; 428 dst_char = convert_to_sfm_char(src_char);
314 case '?': 429 else
315 dst_char = cpu_to_le16(UNI_QUESTION); 430 dst_char = 0;
316 break;
317 case '<':
318 dst_char = cpu_to_le16(UNI_LESSTHAN);
319 break;
320 case '>':
321 dst_char = cpu_to_le16(UNI_GRTRTHAN);
322 break;
323 case '|':
324 dst_char = cpu_to_le16(UNI_PIPE);
325 break;
326 /* 431 /*
327 * FIXME: We can not handle remapping backslash (UNI_SLASH) 432 * FIXME: We can not handle remapping backslash (UNI_SLASH)
328 * until all the calls to build_path_from_dentry are modified, 433 * until all the calls to build_path_from_dentry are modified,
329 * as they use backslash as separator. 434 * as they use backslash as separator.
330 */ 435 */
331 default: 436 if (dst_char == 0) {
332 charlen = cp->char2uni(source + i, srclen - i, &tmp); 437 charlen = cp->char2uni(source + i, srclen - i, &tmp);
333 dst_char = cpu_to_le16(tmp); 438 dst_char = cpu_to_le16(tmp);
334 439
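
Both remap schemes park the seven reserved NTFS/SMB characters in a
Unicode private-use range so they survive the wire and can be mapped
back to ASCII on the way in. A round-trip sketch using three of the
SFM code points (values copied from the cifs_unicode.h hunk below):

#include <stdint.h>
#include <stdio.h>

/* SFM code points, as defined in cifs_unicode.h. */
#define SFM_ASTERISK 0xF021
#define SFM_COLON    0xF022
#define SFM_QUESTION 0xF025

/* Outbound: reserved ASCII char -> private-use code point (else 0). */
static uint16_t to_sfm(char c)
{
        switch (c) {
        case ':': return SFM_COLON;
        case '*': return SFM_ASTERISK;
        case '?': return SFM_QUESTION;
        default:  return 0;
        }
}

/* Inbound: private-use code point -> reserved ASCII char (else 0). */
static char from_sfm(uint16_t u)
{
        switch (u) {
        case SFM_COLON:    return ':';
        case SFM_ASTERISK: return '*';
        case SFM_QUESTION: return '?';
        default:           return 0;
        }
}

int main(void)
{
        char c = ':';

        /* ':' -> 0xF022 -> ':' */
        printf("'%c' -> 0x%04X -> '%c'\n",
               c, to_sfm(c), from_sfm(to_sfm(c)));
        return 0;
}
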
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index d8eac3b6cefb..bdc52cb9a676 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -52,6 +52,34 @@
52#define UNI_PIPE (__u16) ('|' + 0xF000) 52#define UNI_PIPE (__u16) ('|' + 0xF000)
53#define UNI_SLASH (__u16) ('\\' + 0xF000) 53#define UNI_SLASH (__u16) ('\\' + 0xF000)
54 54
55/*
56 * Macs use an older "SFM" mapping of the symbols above. Fortunately it does
 57 * not conflict (although it almost does) with the mapping above.
58 */
59
60#define SFM_ASTERISK ((__u16) 0xF021)
61#define SFM_QUESTION ((__u16) 0xF025)
62#define SFM_COLON ((__u16) 0xF022)
63#define SFM_GRTRTHAN ((__u16) 0xF024)
64#define SFM_LESSTHAN ((__u16) 0xF023)
65#define SFM_PIPE ((__u16) 0xF027)
66#define SFM_SLASH ((__u16) 0xF026)
67
68/*
69 * Mapping mechanism to use when one of the seven reserved characters is
70 * encountered. We can only map using one of the mechanisms at a time
71 * since otherwise readdir could return directory entries which we would
 72 * not be able to open.
73 *
74 * NO_MAP_UNI_RSVD = do not perform any remapping of the character
75 * SFM_MAP_UNI_RSVD = map reserved characters using SFM scheme (MAC compatible)
76 * SFU_MAP_UNI_RSVD = map reserved characters ala SFU ("mapchars" option)
77 *
78 */
79#define NO_MAP_UNI_RSVD 0
80#define SFM_MAP_UNI_RSVD 1
81#define SFU_MAP_UNI_RSVD 2
82
55/* Just define what we want from uniupr.h. We don't want to define the tables 83/* Just define what we want from uniupr.h. We don't want to define the tables
56 * in each source file. 84 * in each source file.
57 */ 85 */
@@ -75,7 +103,7 @@ extern const struct UniCaseRange CifsUniLowerRange[];
75 103
76#ifdef __KERNEL__ 104#ifdef __KERNEL__
77int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, 105int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen,
78 const struct nls_table *codepage, bool mapchar); 106 const struct nls_table *cp, int map_type);
79int cifs_utf16_bytes(const __le16 *from, int maxbytes, 107int cifs_utf16_bytes(const __le16 *from, int maxbytes,
80 const struct nls_table *codepage); 108 const struct nls_table *codepage);
81int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *); 109int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *);
@@ -84,6 +112,7 @@ char *cifs_strndup_from_utf16(const char *src, const int maxlen,
84 const struct nls_table *codepage); 112 const struct nls_table *codepage);
85extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen, 113extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen,
86 const struct nls_table *cp, int mapChars); 114 const struct nls_table *cp, int mapChars);
115extern int cifs_remap(struct cifs_sb_info *cifs_sb);
87#ifdef CONFIG_CIFS_SMB2 116#ifdef CONFIG_CIFS_SMB2
88extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, 117extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
89 int *utf16_len, const struct nls_table *cp, 118 int *utf16_len, const struct nls_table *cp,
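
cifs_remap() (added in the cifs_unicode.c hunk above) collapses the
two mount flags into one of these three map types, with SFM taking
precedence over SFU. The selection restated standalone (the SFU flag's
value is illustrative; only CIFS_MOUNT_MAP_SFM_CHR appears in this
diff):

#include <stdio.h>

#define CIFS_MOUNT_MAP_SFM_CHR     0x800000 /* from cifs_fs_sb.h */
#define CIFS_MOUNT_MAP_SPECIAL_CHR 0x000800 /* illustrative value */

#define NO_MAP_UNI_RSVD  0
#define SFM_MAP_UNI_RSVD 1
#define SFU_MAP_UNI_RSVD 2

static int cifs_remap(unsigned flags)
{
        if (flags & CIFS_MOUNT_MAP_SFM_CHR)
                return SFM_MAP_UNI_RSVD;        /* MAC-compatible scheme */
        if (flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
                return SFU_MAP_UNI_RSVD;        /* "mapchars" option */
        return NO_MAP_UNI_RSVD;
}

int main(void)
{
        printf("%d\n", cifs_remap(CIFS_MOUNT_MAP_SFM_CHR));     /* 1 */
        printf("%d\n", cifs_remap(0));                          /* 0 */
        return 0;
}
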
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7ff866dbb89e..6d00c419cbae 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -84,7 +84,6 @@ static struct key_type cifs_idmap_key_type = {
84 .instantiate = cifs_idmap_key_instantiate, 84 .instantiate = cifs_idmap_key_instantiate,
85 .destroy = cifs_idmap_key_destroy, 85 .destroy = cifs_idmap_key_destroy,
86 .describe = user_describe, 86 .describe = user_describe,
87 .match = user_match,
88}; 87};
89 88
90static char * 89static char *
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 4934347321d3..4ac7445e6ec7 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -431,7 +431,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
431 return -ENOMEM; 431 return -ENOMEM;
432 cifs_from_utf16(ses->domainName, 432 cifs_from_utf16(ses->domainName,
433 (__le16 *)blobptr, attrsize, attrsize, 433 (__le16 *)blobptr, attrsize, attrsize,
434 nls_cp, false); 434 nls_cp, NO_MAP_UNI_RSVD);
435 break; 435 break;
436 } 436 }
437 } 437 }
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 888398067420..9d7996e8e793 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -207,6 +207,19 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
207 return 0; 207 return 0;
208} 208}
209 209
210static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len)
211{
212 struct super_block *sb = file->f_path.dentry->d_sb;
213 struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
214 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
215 struct TCP_Server_Info *server = tcon->ses->server;
216
217 if (server->ops->fallocate)
218 return server->ops->fallocate(file, tcon, mode, off, len);
219
220 return -EOPNOTSUPP;
221}
222
210static int cifs_permission(struct inode *inode, int mask) 223static int cifs_permission(struct inode *inode, int mask)
211{ 224{
212 struct cifs_sb_info *cifs_sb; 225 struct cifs_sb_info *cifs_sb;
@@ -800,7 +813,8 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
800 return generic_file_llseek(file, offset, whence); 813 return generic_file_llseek(file, offset, whence);
801} 814}
802 815
803static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) 816static int
817cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv)
804{ 818{
805 /* 819 /*
806 * Note that this is called by vfs setlease with i_lock held to 820 * Note that this is called by vfs setlease with i_lock held to
@@ -812,10 +826,11 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
812 if (!(S_ISREG(inode->i_mode))) 826 if (!(S_ISREG(inode->i_mode)))
813 return -EINVAL; 827 return -EINVAL;
814 828
815 /* check if file is oplocked */ 829 /* Check if file is oplocked if this is request for new lease */
816 if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || 830 if (arg == F_UNLCK ||
831 ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
817 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) 832 ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
818 return generic_setlease(file, arg, lease); 833 return generic_setlease(file, arg, lease, priv);
819 else if (tlink_tcon(cfile->tlink)->local_lease && 834 else if (tlink_tcon(cfile->tlink)->local_lease &&
820 !CIFS_CACHE_READ(CIFS_I(inode))) 835 !CIFS_CACHE_READ(CIFS_I(inode)))
821 /* 836 /*
@@ -826,7 +841,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
826 * knows that the file won't be changed on the server by anyone 841 * knows that the file won't be changed on the server by anyone
827 * else. 842 * else.
828 */ 843 */
829 return generic_setlease(file, arg, lease); 844 return generic_setlease(file, arg, lease, priv);
830 else 845 else
831 return -EAGAIN; 846 return -EAGAIN;
832} 847}
@@ -848,7 +863,7 @@ const struct inode_operations cifs_dir_inode_ops = {
848 .link = cifs_hardlink, 863 .link = cifs_hardlink,
849 .mkdir = cifs_mkdir, 864 .mkdir = cifs_mkdir,
850 .rmdir = cifs_rmdir, 865 .rmdir = cifs_rmdir,
851 .rename = cifs_rename, 866 .rename2 = cifs_rename2,
852 .permission = cifs_permission, 867 .permission = cifs_permission,
853/* revalidate:cifs_revalidate, */ 868/* revalidate:cifs_revalidate, */
854 .setattr = cifs_setattr, 869 .setattr = cifs_setattr,
@@ -908,6 +923,7 @@ const struct file_operations cifs_file_ops = {
908 .unlocked_ioctl = cifs_ioctl, 923 .unlocked_ioctl = cifs_ioctl,
909#endif /* CONFIG_CIFS_POSIX */ 924#endif /* CONFIG_CIFS_POSIX */
910 .setlease = cifs_setlease, 925 .setlease = cifs_setlease,
926 .fallocate = cifs_fallocate,
911}; 927};
912 928
913const struct file_operations cifs_file_strict_ops = { 929const struct file_operations cifs_file_strict_ops = {
@@ -927,6 +943,7 @@ const struct file_operations cifs_file_strict_ops = {
927 .unlocked_ioctl = cifs_ioctl, 943 .unlocked_ioctl = cifs_ioctl,
928#endif /* CONFIG_CIFS_POSIX */ 944#endif /* CONFIG_CIFS_POSIX */
929 .setlease = cifs_setlease, 945 .setlease = cifs_setlease,
946 .fallocate = cifs_fallocate,
930}; 947};
931 948
932const struct file_operations cifs_file_direct_ops = { 949const struct file_operations cifs_file_direct_ops = {
@@ -947,6 +964,7 @@ const struct file_operations cifs_file_direct_ops = {
947#endif /* CONFIG_CIFS_POSIX */ 964#endif /* CONFIG_CIFS_POSIX */
948 .llseek = cifs_llseek, 965 .llseek = cifs_llseek,
949 .setlease = cifs_setlease, 966 .setlease = cifs_setlease,
967 .fallocate = cifs_fallocate,
950}; 968};
951 969
952const struct file_operations cifs_file_nobrl_ops = { 970const struct file_operations cifs_file_nobrl_ops = {
@@ -965,6 +983,7 @@ const struct file_operations cifs_file_nobrl_ops = {
965 .unlocked_ioctl = cifs_ioctl, 983 .unlocked_ioctl = cifs_ioctl,
966#endif /* CONFIG_CIFS_POSIX */ 984#endif /* CONFIG_CIFS_POSIX */
967 .setlease = cifs_setlease, 985 .setlease = cifs_setlease,
986 .fallocate = cifs_fallocate,
968}; 987};
969 988
970const struct file_operations cifs_file_strict_nobrl_ops = { 989const struct file_operations cifs_file_strict_nobrl_ops = {
@@ -983,6 +1002,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
983 .unlocked_ioctl = cifs_ioctl, 1002 .unlocked_ioctl = cifs_ioctl,
984#endif /* CONFIG_CIFS_POSIX */ 1003#endif /* CONFIG_CIFS_POSIX */
985 .setlease = cifs_setlease, 1004 .setlease = cifs_setlease,
1005 .fallocate = cifs_fallocate,
986}; 1006};
987 1007
988const struct file_operations cifs_file_direct_nobrl_ops = { 1008const struct file_operations cifs_file_direct_nobrl_ops = {
@@ -1002,6 +1022,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
1002#endif /* CONFIG_CIFS_POSIX */ 1022#endif /* CONFIG_CIFS_POSIX */
1003 .llseek = cifs_llseek, 1023 .llseek = cifs_llseek,
1004 .setlease = cifs_setlease, 1024 .setlease = cifs_setlease,
1025 .fallocate = cifs_fallocate,
1005}; 1026};
1006 1027
1007const struct file_operations cifs_dir_ops = { 1028const struct file_operations cifs_dir_ops = {
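
cifs_fallocate() is a thin dispatcher: it resolves the per-dialect
operations table and falls back to -EOPNOTSUPP when the negotiated
protocol supplies no fallocate hook, which is why the same .fallocate
entry can be wired into every file_operations table above. The
dispatch shape sketched standalone (struct and function names are
illustrative):

#include <errno.h>
#include <stdio.h>

struct server_ops {
        long (*fallocate)(int mode, long off, long len);
};

static long smb3_fallocate(int mode, long off, long len)
{
        (void)mode;
        printf("fallocate off=%ld len=%ld\n", off, len);
        return 0;
}

/* Only dialects that implement the call populate the hook. */
static long do_fallocate(const struct server_ops *ops,
                         int mode, long off, long len)
{
        if (ops->fallocate)
                return ops->fallocate(mode, off, len);
        return -EOPNOTSUPP;
}

int main(void)
{
        struct server_ops smb3 = { .fallocate = smb3_fallocate };
        struct server_ops legacy = { 0 };

        do_fallocate(&smb3, 0, 0, 4096);
        printf("legacy: %ld\n", do_fallocate(&legacy, 0, 0, 4096));
        return 0;
}
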
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 70f178a7c759..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -68,8 +68,8 @@ extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *);
68extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); 68extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
69extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); 69extern int cifs_mkdir(struct inode *, struct dentry *, umode_t);
70extern int cifs_rmdir(struct inode *, struct dentry *); 70extern int cifs_rmdir(struct inode *, struct dentry *);
71extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 71extern int cifs_rename2(struct inode *, struct dentry *, struct inode *,
72 struct dentry *); 72 struct dentry *, unsigned int);
73extern int cifs_revalidate_file_attr(struct file *filp); 73extern int cifs_revalidate_file_attr(struct file *filp);
74extern int cifs_revalidate_dentry_attr(struct dentry *); 74extern int cifs_revalidate_dentry_attr(struct dentry *);
75extern int cifs_revalidate_file(struct file *filp); 75extern int cifs_revalidate_file(struct file *filp);
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
136extern const struct export_operations cifs_export_ops; 136extern const struct export_operations cifs_export_ops;
137#endif /* CONFIG_CIFS_NFSD_EXPORT */ 137#endif /* CONFIG_CIFS_NFSD_EXPORT */
138 138
139#define CIFS_VERSION "2.03" 139#define CIFS_VERSION "2.05"
140#endif /* _CIFSFS_H */ 140#endif /* _CIFSFS_H */
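The prototype change above tracks the VFS switch from ->rename to ->rename2, which carries a flags word. A hedged sketch of how the new entry point would typically gate those flags (the RENAME_NOREPLACE handling is an assumption; this excerpt does not show the function body):

int
cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
	     struct inode *target_dir, struct dentry *target_dentry,
	     unsigned int flags)
{
	/* refuse flags the protocol cannot implement atomically */
	if (flags & ~RENAME_NOREPLACE)
		return -EINVAL;

	/* ... the pre-existing rename path would follow here ... */
	return -ENOSYS;	/* placeholder, not the real implementation */
}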
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index de6aed8c78e5..02a33e529904 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,11 +70,6 @@
70#define SERVER_NAME_LENGTH 40 70#define SERVER_NAME_LENGTH 40
71#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) 71#define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1)
72 72
73/* used to define string lengths for reversing unicode strings */
74/* (256+1)*2 = 514 */
75/* (max path length + 1 for null) * 2 for unicode */
76#define MAX_NAME 514
77
78/* SMB echo "timeout" -- FIXME: tunable? */ 73/* SMB echo "timeout" -- FIXME: tunable? */
79#define SMB_ECHO_INTERVAL (60 * HZ) 74#define SMB_ECHO_INTERVAL (60 * HZ)
80 75
@@ -328,11 +323,11 @@ struct smb_version_operations {
328 int (*async_writev)(struct cifs_writedata *, 323 int (*async_writev)(struct cifs_writedata *,
329 void (*release)(struct kref *)); 324 void (*release)(struct kref *));
330 /* sync read from the server */ 325 /* sync read from the server */
331 int (*sync_read)(const unsigned int, struct cifsFileInfo *, 326 int (*sync_read)(const unsigned int, struct cifs_fid *,
332 struct cifs_io_parms *, unsigned int *, char **, 327 struct cifs_io_parms *, unsigned int *, char **,
333 int *); 328 int *);
334 /* sync write to the server */ 329 /* sync write to the server */
335 int (*sync_write)(const unsigned int, struct cifsFileInfo *, 330 int (*sync_write)(const unsigned int, struct cifs_fid *,
336 struct cifs_io_parms *, unsigned int *, struct kvec *, 331 struct cifs_io_parms *, unsigned int *, struct kvec *,
337 unsigned long); 332 unsigned long);
338 /* open dir, start readdir */ 333 /* open dir, start readdir */
@@ -404,6 +399,15 @@ struct smb_version_operations {
404 const struct cifs_fid *, u32 *); 399 const struct cifs_fid *, u32 *);
405 int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *, 400 int (*set_acl)(struct cifs_ntsd *, __u32, struct inode *, const char *,
406 int); 401 int);
402 /* writepages retry size */
403 unsigned int (*wp_retry_size)(struct inode *);
404 /* get mtu credits */
405 int (*wait_mtu_credits)(struct TCP_Server_Info *, unsigned int,
406 unsigned int *, unsigned int *);
407 /* check if we need to issue closedir */
408 bool (*dir_needs_close)(struct cifsFileInfo *);
409 long (*fallocate)(struct file *, struct cifs_tcon *, int, loff_t,
410 loff_t);
407}; 411};
408 412
409struct smb_version_values { 413struct smb_version_values {
@@ -462,6 +466,7 @@ struct smb_vol {
462 bool direct_io:1; 466 bool direct_io:1;
463 bool strict_io:1; /* strict cache behavior */ 467 bool strict_io:1; /* strict cache behavior */
464 bool remap:1; /* set to remap seven reserved chars in filenames */ 468 bool remap:1; /* set to remap seven reserved chars in filenames */
 469 bool sfu_remap:1; /* remap seven reserved chars a la SFU */
465 bool posix_paths:1; /* unset to not ask for posix pathnames. */ 470 bool posix_paths:1; /* unset to not ask for posix pathnames. */
466 bool no_linux_ext:1; 471 bool no_linux_ext:1;
467 bool sfu_emul:1; 472 bool sfu_emul:1;
@@ -495,6 +500,7 @@ struct smb_vol {
495#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ 500#define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
496 CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \ 501 CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \
497 CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \ 502 CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \
503 CIFS_MOUNT_MAP_SFM_CHR | \
498 CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \ 504 CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \
499 CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \ 505 CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \
500 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ 506 CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \
@@ -640,6 +646,16 @@ add_credits(struct TCP_Server_Info *server, const unsigned int add,
640} 646}
641 647
642static inline void 648static inline void
649add_credits_and_wake_if(struct TCP_Server_Info *server, const unsigned int add,
650 const int optype)
651{
652 if (add) {
653 server->ops->add_credits(server, add, optype);
654 wake_up(&server->request_q);
655 }
656}
657
658static inline void
643set_credits(struct TCP_Server_Info *server, const int val) 659set_credits(struct TCP_Server_Info *server, const int val)
644{ 660{
645 server->ops->set_credits(server, val); 661 server->ops->set_credits(server, val);
@@ -868,6 +884,7 @@ struct cifs_tcon {
868 for this mount even if server would support */ 884 for this mount even if server would support */
869 bool local_lease:1; /* check leases (only) on local system not remote */ 885 bool local_lease:1; /* check leases (only) on local system not remote */
870 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ 886 bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
887 bool broken_sparse_sup; /* if server or share does not support sparse */
871 bool need_reconnect:1; /* connection reset, tid now invalid */ 888 bool need_reconnect:1; /* connection reset, tid now invalid */
872#ifdef CONFIG_CIFS_SMB2 889#ifdef CONFIG_CIFS_SMB2
873 bool print:1; /* set if connection to printer share */ 890 bool print:1; /* set if connection to printer share */
@@ -1044,6 +1061,7 @@ struct cifs_readdata {
1044 struct address_space *mapping; 1061 struct address_space *mapping;
1045 __u64 offset; 1062 __u64 offset;
1046 unsigned int bytes; 1063 unsigned int bytes;
1064 unsigned int got_bytes;
1047 pid_t pid; 1065 pid_t pid;
1048 int result; 1066 int result;
1049 struct work_struct work; 1067 struct work_struct work;
@@ -1053,6 +1071,7 @@ struct cifs_readdata {
1053 struct kvec iov; 1071 struct kvec iov;
1054 unsigned int pagesz; 1072 unsigned int pagesz;
1055 unsigned int tailsz; 1073 unsigned int tailsz;
1074 unsigned int credits;
1056 unsigned int nr_pages; 1075 unsigned int nr_pages;
1057 struct page *pages[]; 1076 struct page *pages[];
1058}; 1077};
@@ -1073,6 +1092,7 @@ struct cifs_writedata {
1073 int result; 1092 int result;
1074 unsigned int pagesz; 1093 unsigned int pagesz;
1075 unsigned int tailsz; 1094 unsigned int tailsz;
1095 unsigned int credits;
1076 unsigned int nr_pages; 1096 unsigned int nr_pages;
1077 struct page *pages[]; 1097 struct page *pages[];
1078}; 1098};
@@ -1398,6 +1418,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
1398#define CIFS_OBREAK_OP 0x0100 /* oplock break request */ 1418#define CIFS_OBREAK_OP 0x0100 /* oplock break request */
1399#define CIFS_NEG_OP 0x0200 /* negotiate request */ 1419#define CIFS_NEG_OP 0x0200 /* negotiate request */
1400#define CIFS_OP_MASK 0x0380 /* mask request type */ 1420#define CIFS_OP_MASK 0x0380 /* mask request type */
1421#define CIFS_HAS_CREDITS 0x0400 /* already has credits */
1401 1422
1402/* Security Flags: indicate type of session setup needed */ 1423/* Security Flags: indicate type of session setup needed */
1403#define CIFSSEC_MAY_SIGN 0x00001 1424#define CIFSSEC_MAY_SIGN 0x00001
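add_credits_and_wake_if(), together with the new ->wait_mtu_credits hook and the CIFS_HAS_CREDITS flag above, establishes the credit discipline the later hunks rely on: acquire credits sized to the request before building it, and return them (waking any waiters) on every failure path. A condensed usage sketch, where issue_request() is a hypothetical stand-in for the real async send:

static int send_with_credits(struct TCP_Server_Info *server,
			     struct cifs_sb_info *cifs_sb)
{
	unsigned int wsize, credits;
	int rc;

	rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
					   &wsize, &credits);
	if (rc)
		return rc;

	rc = issue_request(server, wsize);	/* hypothetical */
	if (rc) {
		/* give the credits back and wake blocked requesters */
		add_credits_and_wake_if(server, credits, 0);
		return rc;
	}

	/* on success the credits travel with the in-flight request,
	 * which is what the CIFS_HAS_CREDITS flag marks */
	return 0;
}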
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 33df36ef9d52..5f9822ac0245 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -2253,6 +2253,29 @@ typedef struct {
2253/* minimum includes first three fields, and empty FS Name */ 2253/* minimum includes first three fields, and empty FS Name */
2254#define MIN_FS_ATTR_INFO_SIZE 12 2254#define MIN_FS_ATTR_INFO_SIZE 12
2255 2255
2256
2257/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */
2258#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000
2259#define FILE_SUPPORTS_USN_JOURNAL 0x02000000
2260#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000
2261#define FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000
2262#define FILE_SUPPORTS_HARD_LINKS 0x00400000
2263#define FILE_SUPPORTS_TRANSACTIONS 0x00200000
2264#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000
2265#define FILE_READ_ONLY_VOLUME 0x00080000
2266#define FILE_NAMED_STREAMS 0x00040000
2267#define FILE_SUPPORTS_ENCRYPTION 0x00020000
2268#define FILE_SUPPORTS_OBJECT_IDS 0x00010000
2269#define FILE_VOLUME_IS_COMPRESSED 0x00008000
2270#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100
2271#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080
2272#define FILE_SUPPORTS_SPARSE_FILES 0x00000040
2273#define FILE_VOLUME_QUOTAS 0x00000020
2274#define FILE_FILE_COMPRESSION 0x00000010
2275#define FILE_PERSISTENT_ACLS 0x00000008
2276#define FILE_UNICODE_ON_DISK 0x00000004
2277#define FILE_CASE_PRESERVED_NAMES 0x00000002
2278#define FILE_CASE_SENSITIVE_SEARCH 0x00000001
2256typedef struct { 2279typedef struct {
2257 __le32 Attributes; 2280 __le32 Attributes;
2258 __le32 MaxPathNameComponentLength; 2281 __le32 MaxPathNameComponentLength;
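The FILE_* constants above are the FileSystemAttributes bits from MS-FSCC section 2.5.1; they arrive little-endian in the Attributes field of the structure that follows. A small worked example, consistent with the broken_sparse_sup flag added in cifsglob.h (the exact call site is not part of this excerpt):

/* illustrative: test one capability bit straight off the wire format */
static bool share_supports_sparse(__le32 attributes)
{
	return (le32_to_cpu(attributes) & FILE_SUPPORTS_SPARSE_FILES) != 0;
}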
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index ca7980a1e303..c31ce98c1704 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -36,6 +36,7 @@ extern struct smb_hdr *cifs_buf_get(void);
36extern void cifs_buf_release(void *); 36extern void cifs_buf_release(void *);
37extern struct smb_hdr *cifs_small_buf_get(void); 37extern struct smb_hdr *cifs_small_buf_get(void);
38extern void cifs_small_buf_release(void *); 38extern void cifs_small_buf_release(void *);
39extern void free_rsp_buf(int, void *);
39extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx, 40extern void cifs_rqst_page_to_kvec(struct smb_rqst *rqst, unsigned int idx,
40 struct kvec *iov); 41 struct kvec *iov);
41extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, 42extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *,
@@ -89,6 +90,9 @@ extern struct mid_q_entry *cifs_setup_async_request(struct TCP_Server_Info *,
89 struct smb_rqst *); 90 struct smb_rqst *);
90extern int cifs_check_receive(struct mid_q_entry *mid, 91extern int cifs_check_receive(struct mid_q_entry *mid,
91 struct TCP_Server_Info *server, bool log_error); 92 struct TCP_Server_Info *server, bool log_error);
93extern int cifs_wait_mtu_credits(struct TCP_Server_Info *server,
94 unsigned int size, unsigned int *num,
95 unsigned int *credits);
92extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *, 96extern int SendReceive2(const unsigned int /* xid */ , struct cifs_ses *,
93 struct kvec *, int /* nvec to send */, 97 struct kvec *, int /* nvec to send */,
94 int * /* type of buf returned */ , const int flags); 98 int * /* type of buf returned */ , const int flags);
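free_rsp_buf(), declared above, consolidates a buffer-release pattern that was open-coded at several call sites; the cifssmb.c hunks below show the open-coded form being deleted, so the helper's body can be inferred with reasonable confidence (a sketch reconstructed from that pattern, not quoted from the patch):

void
free_rsp_buf(int resp_buftype, void *rsp)
{
	if (resp_buftype == CIFS_SMALL_BUFFER)
		cifs_small_buf_release(rsp);
	else if (resp_buftype == CIFS_LARGE_BUFFER)
		cifs_buf_release(rsp);
	/* CIFS_NO_BUFFER: the caller owns nothing to free */
}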
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 6ce4e0954b98..61d00a6e398f 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -196,10 +196,6 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command)
196 if (rc) 196 if (rc)
197 goto out; 197 goto out;
198 198
199 /*
200 * FIXME: check if wsize needs updated due to negotiated smb buffer
201 * size shrinking
202 */
203 atomic_inc(&tconInfoReconnectCount); 199 atomic_inc(&tconInfoReconnectCount);
204 200
205 /* tell server Unix caps we support */ 201 /* tell server Unix caps we support */
@@ -871,7 +867,7 @@ CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
871 int rc = 0; 867 int rc = 0;
872 int bytes_returned; 868 int bytes_returned;
873 int name_len; 869 int name_len;
874 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 870 int remap = cifs_remap(cifs_sb);
875 871
876DelFileRetry: 872DelFileRetry:
877 rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, 873 rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB,
@@ -917,7 +913,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
917 int rc = 0; 913 int rc = 0;
918 int bytes_returned; 914 int bytes_returned;
919 int name_len; 915 int name_len;
920 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 916 int remap = cifs_remap(cifs_sb);
921 917
922 cifs_dbg(FYI, "In CIFSSMBRmDir\n"); 918 cifs_dbg(FYI, "In CIFSSMBRmDir\n");
923RmDirRetry: 919RmDirRetry:
@@ -962,7 +958,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
962 CREATE_DIRECTORY_RSP *pSMBr = NULL; 958 CREATE_DIRECTORY_RSP *pSMBr = NULL;
963 int bytes_returned; 959 int bytes_returned;
964 int name_len; 960 int name_len;
965 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 961 int remap = cifs_remap(cifs_sb);
966 962
967 cifs_dbg(FYI, "In CIFSSMBMkDir\n"); 963 cifs_dbg(FYI, "In CIFSSMBMkDir\n");
968MkDirRetry: 964MkDirRetry:
@@ -1284,7 +1280,7 @@ CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock,
1284 __u16 count; 1280 __u16 count;
1285 struct cifs_sb_info *cifs_sb = oparms->cifs_sb; 1281 struct cifs_sb_info *cifs_sb = oparms->cifs_sb;
1286 struct cifs_tcon *tcon = oparms->tcon; 1282 struct cifs_tcon *tcon = oparms->tcon;
1287 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 1283 int remap = cifs_remap(cifs_sb);
1288 const struct nls_table *nls = cifs_sb->local_nls; 1284 const struct nls_table *nls = cifs_sb->local_nls;
1289 int create_options = oparms->create_options; 1285 int create_options = oparms->create_options;
1290 int desired_access = oparms->desired_access; 1286 int desired_access = oparms->desired_access;
@@ -1517,7 +1513,6 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid)
1517 return length; 1513 return length;
1518 1514
1519 server->total_read += length; 1515 server->total_read += length;
1520 rdata->bytes = length;
1521 1516
1522 cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n", 1517 cifs_dbg(FYI, "total_read=%u buflen=%u remaining=%u\n",
1523 server->total_read, buflen, data_len); 1518 server->total_read, buflen, data_len);
@@ -1560,12 +1555,18 @@ cifs_readv_callback(struct mid_q_entry *mid)
1560 rc); 1555 rc);
1561 } 1556 }
1562 /* FIXME: should this be counted toward the initiating task? */ 1557 /* FIXME: should this be counted toward the initiating task? */
1563 task_io_account_read(rdata->bytes); 1558 task_io_account_read(rdata->got_bytes);
1564 cifs_stats_bytes_read(tcon, rdata->bytes); 1559 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1565 break; 1560 break;
1566 case MID_REQUEST_SUBMITTED: 1561 case MID_REQUEST_SUBMITTED:
1567 case MID_RETRY_NEEDED: 1562 case MID_RETRY_NEEDED:
1568 rdata->result = -EAGAIN; 1563 rdata->result = -EAGAIN;
1564 if (server->sign && rdata->got_bytes)
 1565 /* reset the byte count since we cannot verify the signature */
1566 rdata->got_bytes = 0;
1567 /* FIXME: should this be counted toward the initiating task? */
1568 task_io_account_read(rdata->got_bytes);
1569 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1569 break; 1570 break;
1570 default: 1571 default:
1571 rdata->result = -EIO; 1572 rdata->result = -EIO;
@@ -1734,10 +1735,7 @@ CIFSSMBRead(const unsigned int xid, struct cifs_io_parms *io_parms,
1734 1735
1735/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 1736/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
1736 if (*buf) { 1737 if (*buf) {
1737 if (resp_buf_type == CIFS_SMALL_BUFFER) 1738 free_rsp_buf(resp_buf_type, iov[0].iov_base);
1738 cifs_small_buf_release(iov[0].iov_base);
1739 else if (resp_buf_type == CIFS_LARGE_BUFFER)
1740 cifs_buf_release(iov[0].iov_base);
1741 } else if (resp_buf_type != CIFS_NO_BUFFER) { 1739 } else if (resp_buf_type != CIFS_NO_BUFFER) {
1742 /* return buffer to caller to free */ 1740 /* return buffer to caller to free */
1743 *buf = iov[0].iov_base; 1741 *buf = iov[0].iov_base;
@@ -1899,28 +1897,80 @@ cifs_writedata_release(struct kref *refcount)
1899static void 1897static void
1900cifs_writev_requeue(struct cifs_writedata *wdata) 1898cifs_writev_requeue(struct cifs_writedata *wdata)
1901{ 1899{
1902 int i, rc; 1900 int i, rc = 0;
1903 struct inode *inode = wdata->cfile->dentry->d_inode; 1901 struct inode *inode = wdata->cfile->dentry->d_inode;
1904 struct TCP_Server_Info *server; 1902 struct TCP_Server_Info *server;
1903 unsigned int rest_len;
1905 1904
1906 for (i = 0; i < wdata->nr_pages; i++) { 1905 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
1907 lock_page(wdata->pages[i]); 1906 i = 0;
1908 clear_page_dirty_for_io(wdata->pages[i]); 1907 rest_len = wdata->bytes;
1909 }
1910
1911 do { 1908 do {
1912 server = tlink_tcon(wdata->cfile->tlink)->ses->server; 1909 struct cifs_writedata *wdata2;
1913 rc = server->ops->async_writev(wdata, cifs_writedata_release); 1910 unsigned int j, nr_pages, wsize, tailsz, cur_len;
1914 } while (rc == -EAGAIN); 1911
1912 wsize = server->ops->wp_retry_size(inode);
1913 if (wsize < rest_len) {
1914 nr_pages = wsize / PAGE_CACHE_SIZE;
1915 if (!nr_pages) {
1916 rc = -ENOTSUPP;
1917 break;
1918 }
1919 cur_len = nr_pages * PAGE_CACHE_SIZE;
1920 tailsz = PAGE_CACHE_SIZE;
1921 } else {
1922 nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
1923 cur_len = rest_len;
1924 tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
1925 }
1915 1926
1916 for (i = 0; i < wdata->nr_pages; i++) { 1927 wdata2 = cifs_writedata_alloc(nr_pages, cifs_writev_complete);
1917 unlock_page(wdata->pages[i]); 1928 if (!wdata2) {
1918 if (rc != 0) { 1929 rc = -ENOMEM;
1919 SetPageError(wdata->pages[i]); 1930 break;
1920 end_page_writeback(wdata->pages[i]);
1921 page_cache_release(wdata->pages[i]);
1922 } 1931 }
1923 } 1932
1933 for (j = 0; j < nr_pages; j++) {
1934 wdata2->pages[j] = wdata->pages[i + j];
1935 lock_page(wdata2->pages[j]);
1936 clear_page_dirty_for_io(wdata2->pages[j]);
1937 }
1938
1939 wdata2->sync_mode = wdata->sync_mode;
1940 wdata2->nr_pages = nr_pages;
1941 wdata2->offset = page_offset(wdata2->pages[0]);
1942 wdata2->pagesz = PAGE_CACHE_SIZE;
1943 wdata2->tailsz = tailsz;
1944 wdata2->bytes = cur_len;
1945
1946 wdata2->cfile = find_writable_file(CIFS_I(inode), false);
1947 if (!wdata2->cfile) {
1948 cifs_dbg(VFS, "No writable handles for inode\n");
1949 rc = -EBADF;
1950 break;
1951 }
1952 wdata2->pid = wdata2->cfile->pid;
1953 rc = server->ops->async_writev(wdata2, cifs_writedata_release);
1954
1955 for (j = 0; j < nr_pages; j++) {
1956 unlock_page(wdata2->pages[j]);
1957 if (rc != 0 && rc != -EAGAIN) {
1958 SetPageError(wdata2->pages[j]);
1959 end_page_writeback(wdata2->pages[j]);
1960 page_cache_release(wdata2->pages[j]);
1961 }
1962 }
1963
1964 if (rc) {
1965 kref_put(&wdata2->refcount, cifs_writedata_release);
1966 if (rc == -EAGAIN)
1967 continue;
1968 break;
1969 }
1970
1971 rest_len -= cur_len;
1972 i += nr_pages;
1973 } while (i < wdata->nr_pages);
1924 1974
1925 mapping_set_error(inode->i_mapping, rc); 1975 mapping_set_error(inode->i_mapping, rc);
1926 kref_put(&wdata->refcount, cifs_writedata_release); 1976 kref_put(&wdata->refcount, cifs_writedata_release);
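The rewritten cifs_writev_requeue() above resends a failed write in chunks no larger than ->wp_retry_size(inode), instead of re-issuing the whole wdata and spinning on -EAGAIN. The chunking rule, pulled out as a standalone sketch for readability (it mirrors the hunk; no such helper exists in the patch):

/* nr_pages/cur_len/tailsz computation performed per resend pass */
static unsigned int split_chunk(unsigned int wsize, unsigned int rest_len,
				unsigned int *cur_len, unsigned int *tailsz)
{
	unsigned int nr_pages;

	if (wsize < rest_len) {
		/* partial resend: whole pages only, tail page is full */
		nr_pages = wsize / PAGE_CACHE_SIZE;
		*cur_len = nr_pages * PAGE_CACHE_SIZE;
		*tailsz = PAGE_CACHE_SIZE;
	} else {
		/* final resend: everything left, tail may be short */
		nr_pages = DIV_ROUND_UP(rest_len, PAGE_CACHE_SIZE);
		*cur_len = rest_len;
		*tailsz = rest_len - (nr_pages - 1) * PAGE_CACHE_SIZE;
	}
	return nr_pages;
}

For example, with a 4-page wp_retry_size and 9 pages left to resend, the loop issues chunks of 4, 4 and 1 pages, and only the last pass can carry a short tailsz.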
@@ -2203,10 +2253,7 @@ CIFSSMBWrite2(const unsigned int xid, struct cifs_io_parms *io_parms,
2203 } 2253 }
2204 2254
2205/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 2255/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
2206 if (resp_buf_type == CIFS_SMALL_BUFFER) 2256 free_rsp_buf(resp_buf_type, iov[0].iov_base);
2207 cifs_small_buf_release(iov[0].iov_base);
2208 else if (resp_buf_type == CIFS_LARGE_BUFFER)
2209 cifs_buf_release(iov[0].iov_base);
2210 2257
2211 /* Note: On -EAGAIN error only caller can retry on handle based calls 2258 /* Note: On -EAGAIN error only caller can retry on handle based calls
2212 since file handle passed in no longer valid */ 2259 since file handle passed in no longer valid */
@@ -2451,10 +2498,7 @@ plk_err_exit:
2451 if (pSMB) 2498 if (pSMB)
2452 cifs_small_buf_release(pSMB); 2499 cifs_small_buf_release(pSMB);
2453 2500
2454 if (resp_buf_type == CIFS_SMALL_BUFFER) 2501 free_rsp_buf(resp_buf_type, iov[0].iov_base);
2455 cifs_small_buf_release(iov[0].iov_base);
2456 else if (resp_buf_type == CIFS_LARGE_BUFFER)
2457 cifs_buf_release(iov[0].iov_base);
2458 2502
2459 /* Note: On -EAGAIN error only caller can retry on handle based calls 2503 /* Note: On -EAGAIN error only caller can retry on handle based calls
2460 since file handle passed in no longer valid */ 2504 since file handle passed in no longer valid */
@@ -2528,7 +2572,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
2528 int bytes_returned; 2572 int bytes_returned;
2529 int name_len, name_len2; 2573 int name_len, name_len2;
2530 __u16 count; 2574 __u16 count;
2531 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 2575 int remap = cifs_remap(cifs_sb);
2532 2576
2533 cifs_dbg(FYI, "In CIFSSMBRename\n"); 2577 cifs_dbg(FYI, "In CIFSSMBRename\n");
2534renameRetry: 2578renameRetry:
@@ -2924,7 +2968,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
2924 int bytes_returned; 2968 int bytes_returned;
2925 int name_len, name_len2; 2969 int name_len, name_len2;
2926 __u16 count; 2970 __u16 count;
2927 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 2971 int remap = cifs_remap(cifs_sb);
2928 2972
2929 cifs_dbg(FYI, "In CIFSCreateHardLink\n"); 2973 cifs_dbg(FYI, "In CIFSCreateHardLink\n");
2930winCreateHardLinkRetry: 2974winCreateHardLinkRetry:
@@ -3838,10 +3882,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid,
3838 } 3882 }
3839 } 3883 }
3840qsec_out: 3884qsec_out:
3841 if (buf_type == CIFS_SMALL_BUFFER) 3885 free_rsp_buf(buf_type, iov[0].iov_base);
3842 cifs_small_buf_release(iov[0].iov_base);
3843 else if (buf_type == CIFS_LARGE_BUFFER)
3844 cifs_buf_release(iov[0].iov_base);
3845/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ 3886/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */
3846 return rc; 3887 return rc;
3847} 3888}
@@ -4326,7 +4367,7 @@ findFirstRetry:
4326 return rc; 4367 return rc;
4327 4368
4328 nls_codepage = cifs_sb->local_nls; 4369 nls_codepage = cifs_sb->local_nls;
4329 remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 4370 remap = cifs_remap(cifs_sb);
4330 4371
4331 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { 4372 if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) {
4332 name_len = 4373 name_len =
@@ -5486,7 +5527,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
5486 int name_len; 5527 int name_len;
5487 int rc = 0; 5528 int rc = 0;
5488 int bytes_returned = 0; 5529 int bytes_returned = 0;
5489 int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; 5530 int remap = cifs_remap(cifs_sb);
5490 5531
5491 __u16 params, byte_count, data_count, param_offset, offset; 5532 __u16 params, byte_count, data_count, param_offset, offset;
5492 5533
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 20d75b8ddb26..24fa08d261fb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -70,6 +70,7 @@ enum {
70 Opt_forcegid, Opt_noforcegid, 70 Opt_forcegid, Opt_noforcegid,
71 Opt_noblocksend, Opt_noautotune, 71 Opt_noblocksend, Opt_noautotune,
72 Opt_hard, Opt_soft, Opt_perm, Opt_noperm, 72 Opt_hard, Opt_soft, Opt_perm, Opt_noperm,
73 Opt_mapposix, Opt_nomapposix,
73 Opt_mapchars, Opt_nomapchars, Opt_sfu, 74 Opt_mapchars, Opt_nomapchars, Opt_sfu,
74 Opt_nosfu, Opt_nodfs, Opt_posixpaths, 75 Opt_nosfu, Opt_nodfs, Opt_posixpaths,
75 Opt_noposixpaths, Opt_nounix, 76 Opt_noposixpaths, Opt_nounix,
@@ -124,8 +125,10 @@ static const match_table_t cifs_mount_option_tokens = {
124 { Opt_soft, "soft" }, 125 { Opt_soft, "soft" },
125 { Opt_perm, "perm" }, 126 { Opt_perm, "perm" },
126 { Opt_noperm, "noperm" }, 127 { Opt_noperm, "noperm" },
127 { Opt_mapchars, "mapchars" }, 128 { Opt_mapchars, "mapchars" }, /* SFU style */
128 { Opt_nomapchars, "nomapchars" }, 129 { Opt_nomapchars, "nomapchars" },
130 { Opt_mapposix, "mapposix" }, /* SFM style */
131 { Opt_nomapposix, "nomapposix" },
129 { Opt_sfu, "sfu" }, 132 { Opt_sfu, "sfu" },
130 { Opt_nosfu, "nosfu" }, 133 { Opt_nosfu, "nosfu" },
131 { Opt_nodfs, "nodfs" }, 134 { Opt_nodfs, "nodfs" },
@@ -557,7 +560,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
557 try_to_freeze(); 560 try_to_freeze();
558 561
559 if (server_unresponsive(server)) { 562 if (server_unresponsive(server)) {
560 total_read = -EAGAIN; 563 total_read = -ECONNABORTED;
561 break; 564 break;
562 } 565 }
563 566
@@ -571,7 +574,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
571 break; 574 break;
572 } else if (server->tcpStatus == CifsNeedReconnect) { 575 } else if (server->tcpStatus == CifsNeedReconnect) {
573 cifs_reconnect(server); 576 cifs_reconnect(server);
574 total_read = -EAGAIN; 577 total_read = -ECONNABORTED;
575 break; 578 break;
576 } else if (length == -ERESTARTSYS || 579 } else if (length == -ERESTARTSYS ||
577 length == -EAGAIN || 580 length == -EAGAIN ||
@@ -588,7 +591,7 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct kvec *iov_orig,
588 cifs_dbg(FYI, "Received no data or error: expecting %d\n" 591 cifs_dbg(FYI, "Received no data or error: expecting %d\n"
589 "got %d", to_read, length); 592 "got %d", to_read, length);
590 cifs_reconnect(server); 593 cifs_reconnect(server);
591 total_read = -EAGAIN; 594 total_read = -ECONNABORTED;
592 break; 595 break;
593 } 596 }
594 } 597 }
@@ -786,7 +789,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
786 cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length); 789 cifs_dbg(VFS, "SMB response too long (%u bytes)\n", pdu_length);
787 cifs_reconnect(server); 790 cifs_reconnect(server);
788 wake_up(&server->response_q); 791 wake_up(&server->response_q);
789 return -EAGAIN; 792 return -ECONNABORTED;
790 } 793 }
791 794
792 /* switch to large buffer if too big for a small one */ 795 /* switch to large buffer if too big for a small one */
@@ -1231,6 +1234,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1231 vol->linux_uid = current_uid(); 1234 vol->linux_uid = current_uid();
1232 vol->linux_gid = current_gid(); 1235 vol->linux_gid = current_gid();
1233 1236
1237 /*
 1238 * default to SFM style remapping of the seven reserved characters
 1239 * unless the user overrides it or we negotiate CIFS POSIX, where
 1240 * it is unnecessary. We cannot use more than one mapping at a time,
 1241 * since readdir could then list files that open could not open.
1242 */
1243 vol->remap = true;
1244
1234 /* default to only allowing write access to owner of the mount */ 1245 /* default to only allowing write access to owner of the mount */
1235 vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; 1246 vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
1236 1247
@@ -1338,10 +1349,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1338 vol->noperm = 1; 1349 vol->noperm = 1;
1339 break; 1350 break;
1340 case Opt_mapchars: 1351 case Opt_mapchars:
1341 vol->remap = 1; 1352 vol->sfu_remap = true;
1353 vol->remap = false; /* disable SFM mapping */
1342 break; 1354 break;
1343 case Opt_nomapchars: 1355 case Opt_nomapchars:
1344 vol->remap = 0; 1356 vol->sfu_remap = false;
1357 break;
1358 case Opt_mapposix:
1359 vol->remap = true;
1360 vol->sfu_remap = false; /* disable SFU mapping */
1361 break;
1362 case Opt_nomapposix:
1363 vol->remap = false;
1345 break; 1364 break;
1346 case Opt_sfu: 1365 case Opt_sfu:
1347 vol->sfu_emul = 1; 1366 vol->sfu_emul = 1;
@@ -1600,6 +1619,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1600 tmp_end++; 1619 tmp_end++;
1601 if (!(tmp_end < end && tmp_end[1] == delim)) { 1620 if (!(tmp_end < end && tmp_end[1] == delim)) {
1602 /* No it is not. Set the password to NULL */ 1621 /* No it is not. Set the password to NULL */
1622 kfree(vol->password);
1603 vol->password = NULL; 1623 vol->password = NULL;
1604 break; 1624 break;
1605 } 1625 }
@@ -1637,6 +1657,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1637 options = end; 1657 options = end;
1638 } 1658 }
1639 1659
1660 kfree(vol->password);
1640 /* Now build new password string */ 1661 /* Now build new password string */
1641 temp_len = strlen(value); 1662 temp_len = strlen(value);
1642 vol->password = kzalloc(temp_len+1, GFP_KERNEL); 1663 vol->password = kzalloc(temp_len+1, GFP_KERNEL);
@@ -1716,7 +1737,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1716 goto cifs_parse_mount_err; 1737 goto cifs_parse_mount_err;
1717 } 1738 }
1718 1739
1719 if (strnicmp(string, "default", 7) != 0) { 1740 if (strncasecmp(string, "default", 7) != 0) {
1720 vol->iocharset = kstrdup(string, 1741 vol->iocharset = kstrdup(string,
1721 GFP_KERNEL); 1742 GFP_KERNEL);
1722 if (!vol->iocharset) { 1743 if (!vol->iocharset) {
@@ -1788,7 +1809,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1788 if (string == NULL) 1809 if (string == NULL)
1789 goto out_nomem; 1810 goto out_nomem;
1790 1811
1791 if (strnicmp(string, "1", 1) == 0) { 1812 if (strncasecmp(string, "1", 1) == 0) {
1792 /* This is the default */ 1813 /* This is the default */
1793 break; 1814 break;
1794 } 1815 }
@@ -3195,6 +3216,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
3195 if (pvolume_info->server_ino) 3216 if (pvolume_info->server_ino)
3196 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; 3217 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM;
3197 if (pvolume_info->remap) 3218 if (pvolume_info->remap)
3219 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR;
3220 if (pvolume_info->sfu_remap)
3198 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; 3221 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR;
3199 if (pvolume_info->no_xattr) 3222 if (pvolume_info->no_xattr)
3200 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; 3223 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR;
@@ -3237,10 +3260,20 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
3237 } 3260 }
3238 if (pvolume_info->mfsymlinks) { 3261 if (pvolume_info->mfsymlinks) {
3239 if (pvolume_info->sfu_emul) { 3262 if (pvolume_info->sfu_emul) {
3240 cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n"); 3263 /*
3241 } else { 3264 * Our SFU ("Services for Unix") emulation does not allow
3242 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; 3265 * creating symlinks but does allow reading existing SFU
3266 * symlinks (it does allow both creating and reading SFU
3267 * style mknod and FIFOs though). When "mfsymlinks" and
3268 * "sfu" are both enabled at the same time, it allows
3269 * reading both types of symlinks, but will only create
3270 * them with mfsymlinks format. This allows better
3271 * Apple compatibility (probably better for Samba too)
3272 * while still recognizing old Windows style symlinks.
3273 */
3274 cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n");
3243 } 3275 }
3276 cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS;
3244 } 3277 }
3245 3278
3246 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) 3279 if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm))
@@ -3328,8 +3361,7 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
3328 ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; 3361 ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1;
3329 3362
3330 rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls, 3363 rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls,
3331 &num_referrals, &referrals, 3364 &num_referrals, &referrals, cifs_remap(cifs_sb));
3332 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
3333 3365
3334 if (!rc && num_referrals > 0) { 3366 if (!rc && num_referrals > 0) {
3335 char *fake_devname = NULL; 3367 char *fake_devname = NULL;
@@ -3934,13 +3966,6 @@ cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
3934 return tlink_tcon(cifs_sb_master_tlink(cifs_sb)); 3966 return tlink_tcon(cifs_sb_master_tlink(cifs_sb));
3935} 3967}
3936 3968
3937static int
3938cifs_sb_tcon_pending_wait(void *unused)
3939{
3940 schedule();
3941 return signal_pending(current) ? -ERESTARTSYS : 0;
3942}
3943
3944/* find and return a tlink with given uid */ 3969/* find and return a tlink with given uid */
3945static struct tcon_link * 3970static struct tcon_link *
3946tlink_rb_search(struct rb_root *root, kuid_t uid) 3971tlink_rb_search(struct rb_root *root, kuid_t uid)
@@ -4039,11 +4064,10 @@ cifs_sb_tlink(struct cifs_sb_info *cifs_sb)
4039 } else { 4064 } else {
4040wait_for_construction: 4065wait_for_construction:
4041 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING, 4066 ret = wait_on_bit(&tlink->tl_flags, TCON_LINK_PENDING,
4042 cifs_sb_tcon_pending_wait,
4043 TASK_INTERRUPTIBLE); 4067 TASK_INTERRUPTIBLE);
4044 if (ret) { 4068 if (ret) {
4045 cifs_put_tlink(tlink); 4069 cifs_put_tlink(tlink);
4046 return ERR_PTR(ret); 4070 return ERR_PTR(-ERESTARTSYS);
4047 } 4071 }
4048 4072
4049 /* if it's good, return it */ 4073 /* if it's good, return it */
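Two of the connect.c hunks add kfree(vol->password) before the pointer is reassigned, closing a leak when a pass= option appears more than once in the mount string. The idiom generalizes; a minimal sketch (set_option_string is illustrative, not a function in this patch):

/* free any previous value before installing a new one; kfree(NULL)
 * is a no-op, so the first assignment needs no special case */
static int set_option_string(char **dst, const char *value)
{
	kfree(*dst);
	*dst = kstrdup(value, GFP_KERNEL);
	return *dst ? 0 : -ENOMEM;
}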
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3db0c5fd9a11..b72bc29cba23 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -461,8 +461,8 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
461 461
462 xid = get_xid(); 462 xid = get_xid();
463 463
464 cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", 464 cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
465 inode, direntry->d_name.name, direntry); 465 inode, direntry, direntry);
466 466
467 tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); 467 tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
468 if (IS_ERR(tlink)) { 468 if (IS_ERR(tlink)) {
@@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
497 goto out; 497 goto out;
498 } 498 }
499 499
500 if (file->f_flags & O_DIRECT &&
501 CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
502 if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
503 file->f_op = &cifs_file_direct_nobrl_ops;
504 else
505 file->f_op = &cifs_file_direct_ops;
506 }
507
500 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); 508 file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
501 if (file_info == NULL) { 509 if (file_info == NULL) {
502 if (server->ops->close) 510 if (server->ops->close)
@@ -532,8 +540,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
532 struct cifs_fid fid; 540 struct cifs_fid fid;
533 __u32 oplock; 541 __u32 oplock;
534 542
535 cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", 543 cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
536 inode, direntry->d_name.name, direntry); 544 inode, direntry, direntry);
537 545
538 tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); 546 tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb));
539 rc = PTR_ERR(tlink); 547 rc = PTR_ERR(tlink);
@@ -569,12 +577,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
569 struct cifs_io_parms io_parms; 577 struct cifs_io_parms io_parms;
570 char *full_path = NULL; 578 char *full_path = NULL;
571 struct inode *newinode = NULL; 579 struct inode *newinode = NULL;
572 int oplock = 0; 580 __u32 oplock = 0;
573 struct cifs_fid fid; 581 struct cifs_fid fid;
574 struct cifs_open_parms oparms; 582 struct cifs_open_parms oparms;
575 FILE_ALL_INFO *buf = NULL; 583 FILE_ALL_INFO *buf = NULL;
576 unsigned int bytes_written; 584 unsigned int bytes_written;
577 struct win_dev *pdev; 585 struct win_dev *pdev;
586 struct kvec iov[2];
578 587
579 if (!old_valid_dev(device_number)) 588 if (!old_valid_dev(device_number))
580 return -EINVAL; 589 return -EINVAL;
@@ -650,7 +659,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
650 oparms.fid = &fid; 659 oparms.fid = &fid;
651 oparms.reconnect = false; 660 oparms.reconnect = false;
652 661
653 rc = CIFS_open(xid, &oparms, &oplock, buf); 662 if (tcon->ses->server->oplocks)
663 oplock = REQ_OPLOCK;
664 else
665 oplock = 0;
666 rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf);
654 if (rc) 667 if (rc)
655 goto mknod_out; 668 goto mknod_out;
656 669
@@ -660,25 +673,26 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode,
660 */ 673 */
661 674
662 pdev = (struct win_dev *)buf; 675 pdev = (struct win_dev *)buf;
663 io_parms.netfid = fid.netfid;
664 io_parms.pid = current->tgid; 676 io_parms.pid = current->tgid;
665 io_parms.tcon = tcon; 677 io_parms.tcon = tcon;
666 io_parms.offset = 0; 678 io_parms.offset = 0;
667 io_parms.length = sizeof(struct win_dev); 679 io_parms.length = sizeof(struct win_dev);
680 iov[1].iov_base = buf;
681 iov[1].iov_len = sizeof(struct win_dev);
668 if (S_ISCHR(mode)) { 682 if (S_ISCHR(mode)) {
669 memcpy(pdev->type, "IntxCHR", 8); 683 memcpy(pdev->type, "IntxCHR", 8);
670 pdev->major = cpu_to_le64(MAJOR(device_number)); 684 pdev->major = cpu_to_le64(MAJOR(device_number));
671 pdev->minor = cpu_to_le64(MINOR(device_number)); 685 pdev->minor = cpu_to_le64(MINOR(device_number));
672 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, 686 rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
673 NULL, 0); 687 &bytes_written, iov, 1);
674 } else if (S_ISBLK(mode)) { 688 } else if (S_ISBLK(mode)) {
675 memcpy(pdev->type, "IntxBLK", 8); 689 memcpy(pdev->type, "IntxBLK", 8);
676 pdev->major = cpu_to_le64(MAJOR(device_number)); 690 pdev->major = cpu_to_le64(MAJOR(device_number));
677 pdev->minor = cpu_to_le64(MINOR(device_number)); 691 pdev->minor = cpu_to_le64(MINOR(device_number));
678 rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, 692 rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms,
679 NULL, 0); 693 &bytes_written, iov, 1);
680 } /* else if (S_ISFIFO) */ 694 } /* else if (S_ISFIFO) */
681 CIFSSMBClose(xid, tcon, fid.netfid); 695 tcon->ses->server->ops->close(xid, tcon, &fid);
682 d_drop(direntry); 696 d_drop(direntry);
683 697
684 /* FIXME: add code here to set EAs */ 698 /* FIXME: add code here to set EAs */
@@ -705,8 +719,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
705 719
706 xid = get_xid(); 720 xid = get_xid();
707 721
708 cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", 722 cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n",
709 parent_dir_inode, direntry->d_name.name, direntry); 723 parent_dir_inode, direntry, direntry);
710 724
711 /* check whether path exists */ 725 /* check whether path exists */
712 726
@@ -825,7 +839,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags)
825{ 839{
826 int rc = 0; 840 int rc = 0;
827 841
828 cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name); 842 cifs_dbg(FYI, "In cifs d_delete, name = %pd\n", direntry);
829 843
830 return rc; 844 return rc;
831} */ 845} */
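In cifs_mknod() above, the device node is now written through the dialect-neutral ->sync_write hook, with the payload carried in iov[1] (iov[0] stays reserved for the SMB header, per the comment in the file.c write path). The on-the-wire blob, with the field layout inferred from the memcpy/cpu_to_le64 accessors in the hunk rather than quoted from the patch:

/* sketch of the SFU-style special-file payload */
struct win_dev {
	unsigned char type[8];	/* "IntxCHR" or "IntxBLK", NUL-padded */
	__le64 major;
	__le64 minor;
} __packed;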
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e90a1e9aa627..3e4d00a06c44 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file)
467 cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", 467 cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
468 inode, file->f_flags, full_path); 468 inode, file->f_flags, full_path);
469 469
470 if (file->f_flags & O_DIRECT &&
471 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
472 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
473 file->f_op = &cifs_file_direct_nobrl_ops;
474 else
475 file->f_op = &cifs_file_direct_ops;
476 }
477
470 if (server->oplocks) 478 if (server->oplocks)
471 oplock = REQ_OPLOCK; 479 oplock = REQ_OPLOCK;
472 else 480 else
@@ -762,7 +770,7 @@ int cifs_closedir(struct inode *inode, struct file *file)
762 770
763 cifs_dbg(FYI, "Freeing private data in close dir\n"); 771 cifs_dbg(FYI, "Freeing private data in close dir\n");
764 spin_lock(&cifs_file_list_lock); 772 spin_lock(&cifs_file_list_lock);
765 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { 773 if (server->ops->dir_needs_close(cfile)) {
766 cfile->invalidHandle = true; 774 cfile->invalidHandle = true;
767 spin_unlock(&cifs_file_list_lock); 775 spin_unlock(&cifs_file_list_lock);
768 if (server->ops->close_dir) 776 if (server->ops->close_dir)
@@ -1642,8 +1650,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
1642 1650
1643 cifs_sb = CIFS_SB(dentry->d_sb); 1651 cifs_sb = CIFS_SB(dentry->d_sb);
1644 1652
1645 cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n", 1653 cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n",
1646 write_size, *offset, dentry->d_name.name); 1654 write_size, *offset, dentry);
1647 1655
1648 tcon = tlink_tcon(open_file->tlink); 1656 tcon = tlink_tcon(open_file->tlink);
1649 server = tcon->ses->server; 1657 server = tcon->ses->server;
@@ -1670,8 +1678,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
1670 break; 1678 break;
1671 } 1679 }
1672 1680
1673 len = min((size_t)cifs_sb->wsize, 1681 len = min(server->ops->wp_retry_size(dentry->d_inode),
1674 write_size - total_written); 1682 (unsigned int)write_size - total_written);
1675 /* iov[0] is reserved for smb header */ 1683 /* iov[0] is reserved for smb header */
1676 iov[1].iov_base = (char *)write_data + total_written; 1684 iov[1].iov_base = (char *)write_data + total_written;
1677 iov[1].iov_len = len; 1685 iov[1].iov_len = len;
@@ -1679,8 +1687,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data,
1679 io_parms.tcon = tcon; 1687 io_parms.tcon = tcon;
1680 io_parms.offset = *offset; 1688 io_parms.offset = *offset;
1681 io_parms.length = len; 1689 io_parms.length = len;
1682 rc = server->ops->sync_write(xid, open_file, &io_parms, 1690 rc = server->ops->sync_write(xid, &open_file->fid,
1683 &bytes_written, iov, 1); 1691 &io_parms, &bytes_written, iov, 1);
1684 } 1692 }
1685 if (rc || (bytes_written == 0)) { 1693 if (rc || (bytes_written == 0)) {
1686 if (total_written) 1694 if (total_written)
@@ -1878,15 +1886,163 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1878 return rc; 1886 return rc;
1879} 1887}
1880 1888
1889static struct cifs_writedata *
1890wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping,
1891 pgoff_t end, pgoff_t *index,
1892 unsigned int *found_pages)
1893{
1894 unsigned int nr_pages;
1895 struct page **pages;
1896 struct cifs_writedata *wdata;
1897
1898 wdata = cifs_writedata_alloc((unsigned int)tofind,
1899 cifs_writev_complete);
1900 if (!wdata)
1901 return NULL;
1902
1903 /*
1904 * find_get_pages_tag seems to return a max of 256 on each
1905 * iteration, so we must call it several times in order to
1906 * fill the array or the wsize is effectively limited to
1907 * 256 * PAGE_CACHE_SIZE.
1908 */
1909 *found_pages = 0;
1910 pages = wdata->pages;
1911 do {
1912 nr_pages = find_get_pages_tag(mapping, index,
1913 PAGECACHE_TAG_DIRTY, tofind,
1914 pages);
1915 *found_pages += nr_pages;
1916 tofind -= nr_pages;
1917 pages += nr_pages;
1918 } while (nr_pages && tofind && *index <= end);
1919
1920 return wdata;
1921}
1922
1923static unsigned int
1924wdata_prepare_pages(struct cifs_writedata *wdata, unsigned int found_pages,
1925 struct address_space *mapping,
1926 struct writeback_control *wbc,
1927 pgoff_t end, pgoff_t *index, pgoff_t *next, bool *done)
1928{
1929 unsigned int nr_pages = 0, i;
1930 struct page *page;
1931
1932 for (i = 0; i < found_pages; i++) {
1933 page = wdata->pages[i];
1934 /*
1935 * At this point we hold neither mapping->tree_lock nor
1936 * lock on the page itself: the page may be truncated or
1937 * invalidated (changing page->mapping to NULL), or even
1938 * swizzled back from swapper_space to tmpfs file
1939 * mapping
1940 */
1941
1942 if (nr_pages == 0)
1943 lock_page(page);
1944 else if (!trylock_page(page))
1945 break;
1946
1947 if (unlikely(page->mapping != mapping)) {
1948 unlock_page(page);
1949 break;
1950 }
1951
1952 if (!wbc->range_cyclic && page->index > end) {
1953 *done = true;
1954 unlock_page(page);
1955 break;
1956 }
1957
1958 if (*next && (page->index != *next)) {
1959 /* Not next consecutive page */
1960 unlock_page(page);
1961 break;
1962 }
1963
1964 if (wbc->sync_mode != WB_SYNC_NONE)
1965 wait_on_page_writeback(page);
1966
1967 if (PageWriteback(page) ||
1968 !clear_page_dirty_for_io(page)) {
1969 unlock_page(page);
1970 break;
1971 }
1972
1973 /*
1974 * This actually clears the dirty bit in the radix tree.
1975 * See cifs_writepage() for more commentary.
1976 */
1977 set_page_writeback(page);
1978 if (page_offset(page) >= i_size_read(mapping->host)) {
1979 *done = true;
1980 unlock_page(page);
1981 end_page_writeback(page);
1982 break;
1983 }
1984
1985 wdata->pages[i] = page;
1986 *next = page->index + 1;
1987 ++nr_pages;
1988 }
1989
1990 /* reset index to refind any pages skipped */
1991 if (nr_pages == 0)
1992 *index = wdata->pages[0]->index + 1;
1993
1994 /* put any pages we aren't going to use */
1995 for (i = nr_pages; i < found_pages; i++) {
1996 page_cache_release(wdata->pages[i]);
1997 wdata->pages[i] = NULL;
1998 }
1999
2000 return nr_pages;
2001}
2002
2003static int
2004wdata_send_pages(struct cifs_writedata *wdata, unsigned int nr_pages,
2005 struct address_space *mapping, struct writeback_control *wbc)
2006{
2007 int rc = 0;
2008 struct TCP_Server_Info *server;
2009 unsigned int i;
2010
2011 wdata->sync_mode = wbc->sync_mode;
2012 wdata->nr_pages = nr_pages;
2013 wdata->offset = page_offset(wdata->pages[0]);
2014 wdata->pagesz = PAGE_CACHE_SIZE;
2015 wdata->tailsz = min(i_size_read(mapping->host) -
2016 page_offset(wdata->pages[nr_pages - 1]),
2017 (loff_t)PAGE_CACHE_SIZE);
2018 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + wdata->tailsz;
2019
2020 if (wdata->cfile != NULL)
2021 cifsFileInfo_put(wdata->cfile);
2022 wdata->cfile = find_writable_file(CIFS_I(mapping->host), false);
2023 if (!wdata->cfile) {
2024 cifs_dbg(VFS, "No writable handles for inode\n");
2025 rc = -EBADF;
2026 } else {
2027 wdata->pid = wdata->cfile->pid;
2028 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
2029 rc = server->ops->async_writev(wdata, cifs_writedata_release);
2030 }
2031
2032 for (i = 0; i < nr_pages; ++i)
2033 unlock_page(wdata->pages[i]);
2034
2035 return rc;
2036}
2037
1881static int cifs_writepages(struct address_space *mapping, 2038static int cifs_writepages(struct address_space *mapping,
1882 struct writeback_control *wbc) 2039 struct writeback_control *wbc)
1883{ 2040{
1884 struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb); 2041 struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
2042 struct TCP_Server_Info *server;
1885 bool done = false, scanned = false, range_whole = false; 2043 bool done = false, scanned = false, range_whole = false;
1886 pgoff_t end, index; 2044 pgoff_t end, index;
1887 struct cifs_writedata *wdata; 2045 struct cifs_writedata *wdata;
1888 struct TCP_Server_Info *server;
1889 struct page *page;
1890 int rc = 0; 2046 int rc = 0;
1891 2047
1892 /* 2048 /*
@@ -1906,152 +2062,50 @@ static int cifs_writepages(struct address_space *mapping,
1906 range_whole = true; 2062 range_whole = true;
1907 scanned = true; 2063 scanned = true;
1908 } 2064 }
2065 server = cifs_sb_master_tcon(cifs_sb)->ses->server;
1909retry: 2066retry:
1910 while (!done && index <= end) { 2067 while (!done && index <= end) {
1911 unsigned int i, nr_pages, found_pages; 2068 unsigned int i, nr_pages, found_pages, wsize, credits;
1912 pgoff_t next = 0, tofind; 2069 pgoff_t next = 0, tofind, saved_index = index;
1913 struct page **pages;
1914 2070
1915 tofind = min((cifs_sb->wsize / PAGE_CACHE_SIZE) - 1, 2071 rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
1916 end - index) + 1; 2072 &wsize, &credits);
2073 if (rc)
2074 break;
1917 2075
1918 wdata = cifs_writedata_alloc((unsigned int)tofind, 2076 tofind = min((wsize / PAGE_CACHE_SIZE) - 1, end - index) + 1;
1919 cifs_writev_complete); 2077
2078 wdata = wdata_alloc_and_fillpages(tofind, mapping, end, &index,
2079 &found_pages);
1920 if (!wdata) { 2080 if (!wdata) {
1921 rc = -ENOMEM; 2081 rc = -ENOMEM;
2082 add_credits_and_wake_if(server, credits, 0);
1922 break; 2083 break;
1923 } 2084 }
1924 2085
1925 /*
1926 * find_get_pages_tag seems to return a max of 256 on each
1927 * iteration, so we must call it several times in order to
1928 * fill the array or the wsize is effectively limited to
1929 * 256 * PAGE_CACHE_SIZE.
1930 */
1931 found_pages = 0;
1932 pages = wdata->pages;
1933 do {
1934 nr_pages = find_get_pages_tag(mapping, &index,
1935 PAGECACHE_TAG_DIRTY,
1936 tofind, pages);
1937 found_pages += nr_pages;
1938 tofind -= nr_pages;
1939 pages += nr_pages;
1940 } while (nr_pages && tofind && index <= end);
1941
1942 if (found_pages == 0) { 2086 if (found_pages == 0) {
1943 kref_put(&wdata->refcount, cifs_writedata_release); 2087 kref_put(&wdata->refcount, cifs_writedata_release);
2088 add_credits_and_wake_if(server, credits, 0);
1944 break; 2089 break;
1945 } 2090 }
1946 2091
1947 nr_pages = 0; 2092 nr_pages = wdata_prepare_pages(wdata, found_pages, mapping, wbc,
1948 for (i = 0; i < found_pages; i++) { 2093 end, &index, &next, &done);
1949 page = wdata->pages[i];
1950 /*
1951 * At this point we hold neither mapping->tree_lock nor
1952 * lock on the page itself: the page may be truncated or
1953 * invalidated (changing page->mapping to NULL), or even
1954 * swizzled back from swapper_space to tmpfs file
1955 * mapping
1956 */
1957
1958 if (nr_pages == 0)
1959 lock_page(page);
1960 else if (!trylock_page(page))
1961 break;
1962
1963 if (unlikely(page->mapping != mapping)) {
1964 unlock_page(page);
1965 break;
1966 }
1967
1968 if (!wbc->range_cyclic && page->index > end) {
1969 done = true;
1970 unlock_page(page);
1971 break;
1972 }
1973
1974 if (next && (page->index != next)) {
1975 /* Not next consecutive page */
1976 unlock_page(page);
1977 break;
1978 }
1979
1980 if (wbc->sync_mode != WB_SYNC_NONE)
1981 wait_on_page_writeback(page);
1982
1983 if (PageWriteback(page) ||
1984 !clear_page_dirty_for_io(page)) {
1985 unlock_page(page);
1986 break;
1987 }
1988
1989 /*
1990 * This actually clears the dirty bit in the radix tree.
1991 * See cifs_writepage() for more commentary.
1992 */
1993 set_page_writeback(page);
1994
1995 if (page_offset(page) >= i_size_read(mapping->host)) {
1996 done = true;
1997 unlock_page(page);
1998 end_page_writeback(page);
1999 break;
2000 }
2001
2002 wdata->pages[i] = page;
2003 next = page->index + 1;
2004 ++nr_pages;
2005 }
2006
2007 /* reset index to refind any pages skipped */
2008 if (nr_pages == 0)
2009 index = wdata->pages[0]->index + 1;
2010
2011 /* put any pages we aren't going to use */
2012 for (i = nr_pages; i < found_pages; i++) {
2013 page_cache_release(wdata->pages[i]);
2014 wdata->pages[i] = NULL;
2015 }
2016 2094
2017 /* nothing to write? */ 2095 /* nothing to write? */
2018 if (nr_pages == 0) { 2096 if (nr_pages == 0) {
2019 kref_put(&wdata->refcount, cifs_writedata_release); 2097 kref_put(&wdata->refcount, cifs_writedata_release);
2098 add_credits_and_wake_if(server, credits, 0);
2020 continue; 2099 continue;
2021 } 2100 }
2022 2101
2023 wdata->sync_mode = wbc->sync_mode; 2102 wdata->credits = credits;
2024 wdata->nr_pages = nr_pages;
2025 wdata->offset = page_offset(wdata->pages[0]);
2026 wdata->pagesz = PAGE_CACHE_SIZE;
2027 wdata->tailsz =
2028 min(i_size_read(mapping->host) -
2029 page_offset(wdata->pages[nr_pages - 1]),
2030 (loff_t)PAGE_CACHE_SIZE);
2031 wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
2032 wdata->tailsz;
2033
2034 do {
2035 if (wdata->cfile != NULL)
2036 cifsFileInfo_put(wdata->cfile);
2037 wdata->cfile = find_writable_file(CIFS_I(mapping->host),
2038 false);
2039 if (!wdata->cfile) {
2040 cifs_dbg(VFS, "No writable handles for inode\n");
2041 rc = -EBADF;
2042 break;
2043 }
2044 wdata->pid = wdata->cfile->pid;
2045 server = tlink_tcon(wdata->cfile->tlink)->ses->server;
2046 rc = server->ops->async_writev(wdata,
2047 cifs_writedata_release);
2048 } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
2049 2103
2050 for (i = 0; i < nr_pages; ++i) 2104 rc = wdata_send_pages(wdata, nr_pages, mapping, wbc);
2051 unlock_page(wdata->pages[i]);
2052 2105
2053 /* send failure -- clean up the mess */ 2106 /* send failure -- clean up the mess */
2054 if (rc != 0) { 2107 if (rc != 0) {
2108 add_credits_and_wake_if(server, wdata->credits, 0);
2055 for (i = 0; i < nr_pages; ++i) { 2109 for (i = 0; i < nr_pages; ++i) {
2056 if (rc == -EAGAIN) 2110 if (rc == -EAGAIN)
2057 redirty_page_for_writepage(wbc, 2111 redirty_page_for_writepage(wbc,
@@ -2066,6 +2120,11 @@ retry:
2066 } 2120 }
2067 kref_put(&wdata->refcount, cifs_writedata_release); 2121 kref_put(&wdata->refcount, cifs_writedata_release);
2068 2122
2123 if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN) {
2124 index = saved_index;
2125 continue;
2126 }
2127
2069 wbc->nr_to_write -= nr_pages; 2128 wbc->nr_to_write -= nr_pages;
2070 if (wbc->nr_to_write <= 0) 2129 if (wbc->nr_to_write <= 0)
2071 done = true; 2130 done = true;
@@ -2214,8 +2273,8 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
2214 2273
2215 xid = get_xid(); 2274 xid = get_xid();
2216 2275
2217 cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", 2276 cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
2218 file->f_path.dentry->d_name.name, datasync); 2277 file, datasync);
2219 2278
2220 if (!CIFS_CACHE_READ(CIFS_I(inode))) { 2279 if (!CIFS_CACHE_READ(CIFS_I(inode))) {
2221 rc = cifs_zap_mapping(inode); 2280 rc = cifs_zap_mapping(inode);
@@ -2256,8 +2315,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2256 2315
2257 xid = get_xid(); 2316 xid = get_xid();
2258 2317
2259 cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", 2318 cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n",
2260 file->f_path.dentry->d_name.name, datasync); 2319 file, datasync);
2261 2320
2262 tcon = tlink_tcon(smbfile->tlink); 2321 tcon = tlink_tcon(smbfile->tlink);
2263 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { 2322 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
@@ -2362,123 +2421,109 @@ cifs_uncached_writev_complete(struct work_struct *work)
2362 kref_put(&wdata->refcount, cifs_uncached_writedata_release); 2421 kref_put(&wdata->refcount, cifs_uncached_writedata_release);
2363} 2422}
2364 2423
2365/* attempt to send write to server, retry on any -EAGAIN errors */
2366static int 2424static int
2367cifs_uncached_retry_writev(struct cifs_writedata *wdata) 2425wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
2426 size_t *len, unsigned long *num_pages)
2368{ 2427{
2369 int rc; 2428 size_t save_len, copied, bytes, cur_len = *len;
2370 struct TCP_Server_Info *server; 2429 unsigned long i, nr_pages = *num_pages;
2371 2430
2372 server = tlink_tcon(wdata->cfile->tlink)->ses->server; 2431 save_len = cur_len;
2432 for (i = 0; i < nr_pages; i++) {
2433 bytes = min_t(const size_t, cur_len, PAGE_SIZE);
2434 copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
2435 cur_len -= copied;
2436 /*
2437 * If we didn't copy as much as we expected, then that
2438 * may mean we trod into an unmapped area. Stop copying
2439 * at that point. On the next pass through the big
2440 * loop, we'll likely end up getting a zero-length
2441 * write and bailing out of it.
2442 */
2443 if (copied < bytes)
2444 break;
2445 }
2446 cur_len = save_len - cur_len;
2447 *len = cur_len;
2373 2448
2374 do { 2449 /*
2375 if (wdata->cfile->invalidHandle) { 2450 * If we have no data to send, then that probably means that
2376 rc = cifs_reopen_file(wdata->cfile, false); 2451 * the copy above failed altogether. That's most likely because
2377 if (rc != 0) 2452 * the address in the iovec was bogus. Return -EFAULT and let
2378 continue; 2453 * the caller free anything we allocated and bail out.
2379 } 2454 */
2380 rc = server->ops->async_writev(wdata, 2455 if (!cur_len)
2381 cifs_uncached_writedata_release); 2456 return -EFAULT;
2382 } while (rc == -EAGAIN);
2383 2457
2384 return rc; 2458 /*
2459 * i + 1 now represents the number of pages we actually used in
2460 * the copy phase above.
2461 */
2462 *num_pages = i + 1;
2463 return 0;
2385} 2464}
2386 2465
2387static ssize_t 2466static int
2388cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) 2467cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from,
2468 struct cifsFileInfo *open_file,
2469 struct cifs_sb_info *cifs_sb, struct list_head *wdata_list)
2389{ 2470{
2390 unsigned long nr_pages, i; 2471 int rc = 0;
2391 size_t bytes, copied, len, cur_len; 2472 size_t cur_len;
2392 ssize_t total_written = 0; 2473 unsigned long nr_pages, num_pages, i;
2393 loff_t offset; 2474 struct cifs_writedata *wdata;
2394 struct cifsFileInfo *open_file; 2475 struct iov_iter saved_from;
2395 struct cifs_tcon *tcon; 2476 loff_t saved_offset = offset;
2396 struct cifs_sb_info *cifs_sb;
2397 struct cifs_writedata *wdata, *tmp;
2398 struct list_head wdata_list;
2399 int rc;
2400 pid_t pid; 2477 pid_t pid;
2401 2478 struct TCP_Server_Info *server;
2402 len = iov_iter_count(from);
2403 rc = generic_write_checks(file, poffset, &len, 0);
2404 if (rc)
2405 return rc;
2406
2407 if (!len)
2408 return 0;
2409
2410 iov_iter_truncate(from, len);
2411
2412 INIT_LIST_HEAD(&wdata_list);
2413 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2414 open_file = file->private_data;
2415 tcon = tlink_tcon(open_file->tlink);
2416
2417 if (!tcon->ses->server->ops->async_writev)
2418 return -ENOSYS;
2419
2420 offset = *poffset;
2421 2479
2422 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2480 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2423 pid = open_file->pid; 2481 pid = open_file->pid;
2424 else 2482 else
2425 pid = current->tgid; 2483 pid = current->tgid;
2426 2484
2485 server = tlink_tcon(open_file->tlink)->ses->server;
2486 memcpy(&saved_from, from, sizeof(struct iov_iter));
2487
2427 do { 2488 do {
2428 size_t save_len; 2489 unsigned int wsize, credits;
2490
2491 rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
2492 &wsize, &credits);
2493 if (rc)
2494 break;
2429 2495
2430 nr_pages = get_numpages(cifs_sb->wsize, len, &cur_len); 2496 nr_pages = get_numpages(wsize, len, &cur_len);
2431 wdata = cifs_writedata_alloc(nr_pages, 2497 wdata = cifs_writedata_alloc(nr_pages,
2432 cifs_uncached_writev_complete); 2498 cifs_uncached_writev_complete);
2433 if (!wdata) { 2499 if (!wdata) {
2434 rc = -ENOMEM; 2500 rc = -ENOMEM;
2501 add_credits_and_wake_if(server, credits, 0);
2435 break; 2502 break;
2436 } 2503 }
2437 2504
2438 rc = cifs_write_allocate_pages(wdata->pages, nr_pages); 2505 rc = cifs_write_allocate_pages(wdata->pages, nr_pages);
2439 if (rc) { 2506 if (rc) {
2440 kfree(wdata); 2507 kfree(wdata);
2508 add_credits_and_wake_if(server, credits, 0);
2441 break; 2509 break;
2442 } 2510 }
2443 2511
2444 save_len = cur_len; 2512 num_pages = nr_pages;
2445 for (i = 0; i < nr_pages; i++) { 2513 rc = wdata_fill_from_iovec(wdata, from, &cur_len, &num_pages);
2446 bytes = min_t(size_t, cur_len, PAGE_SIZE); 2514 if (rc) {
2447 copied = copy_page_from_iter(wdata->pages[i], 0, bytes,
2448 from);
2449 cur_len -= copied;
2450 /*
2451 * If we didn't copy as much as we expected, then that
2452 * may mean we trod into an unmapped area. Stop copying
2453 * at that point. On the next pass through the big
2454 * loop, we'll likely end up getting a zero-length
2455 * write and bailing out of it.
2456 */
2457 if (copied < bytes)
2458 break;
2459 }
2460 cur_len = save_len - cur_len;
2461
2462 /*
2463 * If we have no data to send, then that probably means that
2464 * the copy above failed altogether. That's most likely because
2465 * the address in the iovec was bogus. Set the rc to -EFAULT,
2466 * free anything we allocated and bail out.
2467 */
2468 if (!cur_len) {
2469 for (i = 0; i < nr_pages; i++) 2515 for (i = 0; i < nr_pages; i++)
2470 put_page(wdata->pages[i]); 2516 put_page(wdata->pages[i]);
2471 kfree(wdata); 2517 kfree(wdata);
2472 rc = -EFAULT; 2518 add_credits_and_wake_if(server, credits, 0);
2473 break; 2519 break;
2474 } 2520 }
2475 2521
2476 /* 2522 /*
2477 * i + 1 now represents the number of pages we actually used in 2523 * Bring nr_pages down to the number of pages we actually used,
2478 * the copy phase above. Bring nr_pages down to that, and free 2524 * and free any pages that we didn't use.
2479 * any pages that we didn't use.
2480 */ 2525 */
2481 for ( ; nr_pages > i + 1; nr_pages--) 2526 for ( ; nr_pages > num_pages; nr_pages--)
2482 put_page(wdata->pages[nr_pages - 1]); 2527 put_page(wdata->pages[nr_pages - 1]);
2483 2528
2484 wdata->sync_mode = WB_SYNC_ALL; 2529 wdata->sync_mode = WB_SYNC_ALL;
@@ -2489,18 +2534,69 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
2489 wdata->bytes = cur_len; 2534 wdata->bytes = cur_len;
2490 wdata->pagesz = PAGE_SIZE; 2535 wdata->pagesz = PAGE_SIZE;
2491 wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); 2536 wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE);
2492 rc = cifs_uncached_retry_writev(wdata); 2537 wdata->credits = credits;
2538
2539 if (!wdata->cfile->invalidHandle ||
2540 !cifs_reopen_file(wdata->cfile, false))
2541 rc = server->ops->async_writev(wdata,
2542 cifs_uncached_writedata_release);
2493 if (rc) { 2543 if (rc) {
2544 add_credits_and_wake_if(server, wdata->credits, 0);
2494 kref_put(&wdata->refcount, 2545 kref_put(&wdata->refcount,
2495 cifs_uncached_writedata_release); 2546 cifs_uncached_writedata_release);
2547 if (rc == -EAGAIN) {
2548 memcpy(from, &saved_from,
2549 sizeof(struct iov_iter));
2550 iov_iter_advance(from, offset - saved_offset);
2551 continue;
2552 }
2496 break; 2553 break;
2497 } 2554 }
2498 2555
2499 list_add_tail(&wdata->list, &wdata_list); 2556 list_add_tail(&wdata->list, wdata_list);
2500 offset += cur_len; 2557 offset += cur_len;
2501 len -= cur_len; 2558 len -= cur_len;
2502 } while (len > 0); 2559 } while (len > 0);
2503 2560
2561 return rc;
2562}
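
The -EAGAIN branch in cifs_write_from_iter() leans on one invariant: a byte-for-byte snapshot of the iterator is taken before the loop (the memcpy of the iov_iter), and a retry rebuilds the live iterator by restoring that snapshot and advancing it by exactly the bytes already queued, offset - saved_offset. A toy cursor demonstrates the rewind in isolation (illustrative types, not the kernel's iov_iter):

#include <stdio.h>
#include <stddef.h>

struct cursor {
        const char *base;
        size_t pos, len;
};

static void cursor_advance(struct cursor *c, size_t n)
{
        c->pos += n;
}

int main(void)
{
        const char data[] = "0123456789abcdef";
        struct cursor cur = { data, 0, sizeof(data) - 1 };
        struct cursor saved = cur;        /* snapshot before any sends */
        size_t committed = 0;

        cursor_advance(&cur, 6);          /* 6 bytes handed to send #1 */
        committed += 6;

        /*
         * Send #2 fails with a retryable error after consuming some of
         * the cursor: throw the dirty state away and rebuild it from
         * the snapshot plus the bytes already committed.
         */
        cur = saved;
        cursor_advance(&cur, committed);

        printf("retrying from byte %zu ('%c')\n", cur.pos, cur.base[cur.pos]);
        return 0;
}
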
2563
2564static ssize_t
2565cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset)
2566{
2567 size_t len;
2568 ssize_t total_written = 0;
2569 struct cifsFileInfo *open_file;
2570 struct cifs_tcon *tcon;
2571 struct cifs_sb_info *cifs_sb;
2572 struct cifs_writedata *wdata, *tmp;
2573 struct list_head wdata_list;
2574 struct iov_iter saved_from;
2575 int rc;
2576
2577 len = iov_iter_count(from);
2578 rc = generic_write_checks(file, poffset, &len, 0);
2579 if (rc)
2580 return rc;
2581
2582 if (!len)
2583 return 0;
2584
2585 iov_iter_truncate(from, len);
2586
2587 INIT_LIST_HEAD(&wdata_list);
2588 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2589 open_file = file->private_data;
2590 tcon = tlink_tcon(open_file->tlink);
2591
2592 if (!tcon->ses->server->ops->async_writev)
2593 return -ENOSYS;
2594
2595 memcpy(&saved_from, from, sizeof(struct iov_iter));
2596
2597 rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb,
2598 &wdata_list);
2599
2504 /* 2600 /*
2505 * If at least one write was successfully sent, then discard any rc 2601 * If at least one write was successfully sent, then discard any rc
2506 * value from the later writes. If the other write succeeds, then 2602 * value from the later writes. If the other write succeeds, then
@@ -2529,7 +2625,25 @@ restart_loop:
2529 2625
2530 /* resend call if it's a retryable error */ 2626 /* resend call if it's a retryable error */
2531 if (rc == -EAGAIN) { 2627 if (rc == -EAGAIN) {
2532 rc = cifs_uncached_retry_writev(wdata); 2628 struct list_head tmp_list;
2629 struct iov_iter tmp_from;
2630
2631 INIT_LIST_HEAD(&tmp_list);
2632 list_del_init(&wdata->list);
2633
2634 memcpy(&tmp_from, &saved_from,
2635 sizeof(struct iov_iter));
2636 iov_iter_advance(&tmp_from,
2637 wdata->offset - *poffset);
2638
2639 rc = cifs_write_from_iter(wdata->offset,
2640 wdata->bytes, &tmp_from,
2641 open_file, cifs_sb, &tmp_list);
2642
2643 list_splice(&tmp_list, &wdata_list);
2644
2645 kref_put(&wdata->refcount,
2646 cifs_uncached_writedata_release);
2533 goto restart_loop; 2647 goto restart_loop;
2534 } 2648 }
2535 } 2649 }
@@ -2722,26 +2836,6 @@ cifs_uncached_readdata_release(struct kref *refcount)
2722 cifs_readdata_release(refcount); 2836 cifs_readdata_release(refcount);
2723} 2837}
2724 2838
2725static int
2726cifs_retry_async_readv(struct cifs_readdata *rdata)
2727{
2728 int rc;
2729 struct TCP_Server_Info *server;
2730
2731 server = tlink_tcon(rdata->cfile->tlink)->ses->server;
2732
2733 do {
2734 if (rdata->cfile->invalidHandle) {
2735 rc = cifs_reopen_file(rdata->cfile, true);
2736 if (rc != 0)
2737 continue;
2738 }
2739 rc = server->ops->async_readv(rdata);
2740 } while (rc == -EAGAIN);
2741
2742 return rc;
2743}
2744
2745/** 2839/**
2746 * cifs_readdata_to_iov - copy data from pages in response to an iovec 2840 * cifs_readdata_to_iov - copy data from pages in response to an iovec
2747 * @rdata: the readdata response with list of pages holding data 2841 * @rdata: the readdata response with list of pages holding data
@@ -2754,7 +2848,7 @@ cifs_retry_async_readv(struct cifs_readdata *rdata)
2754static int 2848static int
2755cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) 2849cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
2756{ 2850{
2757 size_t remaining = rdata->bytes; 2851 size_t remaining = rdata->got_bytes;
2758 unsigned int i; 2852 unsigned int i;
2759 2853
2760 for (i = 0; i < rdata->nr_pages; i++) { 2854 for (i = 0; i < rdata->nr_pages; i++) {
@@ -2782,11 +2876,12 @@ static int
2782cifs_uncached_read_into_pages(struct TCP_Server_Info *server, 2876cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
2783 struct cifs_readdata *rdata, unsigned int len) 2877 struct cifs_readdata *rdata, unsigned int len)
2784{ 2878{
2785 int total_read = 0, result = 0; 2879 int result = 0;
2786 unsigned int i; 2880 unsigned int i;
2787 unsigned int nr_pages = rdata->nr_pages; 2881 unsigned int nr_pages = rdata->nr_pages;
2788 struct kvec iov; 2882 struct kvec iov;
2789 2883
2884 rdata->got_bytes = 0;
2790 rdata->tailsz = PAGE_SIZE; 2885 rdata->tailsz = PAGE_SIZE;
2791 for (i = 0; i < nr_pages; i++) { 2886 for (i = 0; i < nr_pages; i++) {
2792 struct page *page = rdata->pages[i]; 2887 struct page *page = rdata->pages[i];
@@ -2820,55 +2915,45 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server,
2820 if (result < 0) 2915 if (result < 0)
2821 break; 2916 break;
2822 2917
2823 total_read += result; 2918 rdata->got_bytes += result;
2824 } 2919 }
2825 2920
2826 return total_read > 0 ? total_read : result; 2921 return rdata->got_bytes > 0 && result != -ECONNABORTED ?
2922 rdata->got_bytes : result;
2827} 2923}
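
Both read_into_pages() implementations now finish with the same convention: report the bytes that actually arrived (got_bytes) unless the transport died outright, in which case -ECONNABORTED wins even over partial progress, since nothing more will ever arrive on that socket. Reduced to a helper (standalone sketch; errno values are the usual Linux ones):

#include <stdio.h>
#include <errno.h>

/* Prefer partial progress over an error, except when the transport died. */
static int read_result(int got_bytes, int result)
{
        return got_bytes > 0 && result != -ECONNABORTED ? got_bytes : result;
}

int main(void)
{
        printf("%d\n", read_result(4096, -EAGAIN));       /* 4096: keep data */
        printf("%d\n", read_result(4096, -ECONNABORTED)); /* error: give up  */
        return 0;
}
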
2828 2924
2829ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) 2925static int
2926cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file,
2927 struct cifs_sb_info *cifs_sb, struct list_head *rdata_list)
2830{ 2928{
2831 struct file *file = iocb->ki_filp; 2929 struct cifs_readdata *rdata;
2832 ssize_t rc; 2930 unsigned int npages, rsize, credits;
2833 size_t len, cur_len; 2931 size_t cur_len;
2834 ssize_t total_read = 0; 2932 int rc;
2835 loff_t offset = iocb->ki_pos;
2836 unsigned int npages;
2837 struct cifs_sb_info *cifs_sb;
2838 struct cifs_tcon *tcon;
2839 struct cifsFileInfo *open_file;
2840 struct cifs_readdata *rdata, *tmp;
2841 struct list_head rdata_list;
2842 pid_t pid; 2933 pid_t pid;
2934 struct TCP_Server_Info *server;
2843 2935
2844 len = iov_iter_count(to); 2936 server = tlink_tcon(open_file->tlink)->ses->server;
2845 if (!len)
2846 return 0;
2847
2848 INIT_LIST_HEAD(&rdata_list);
2849 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
2850 open_file = file->private_data;
2851 tcon = tlink_tcon(open_file->tlink);
2852
2853 if (!tcon->ses->server->ops->async_readv)
2854 return -ENOSYS;
2855 2937
2856 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) 2938 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
2857 pid = open_file->pid; 2939 pid = open_file->pid;
2858 else 2940 else
2859 pid = current->tgid; 2941 pid = current->tgid;
2860 2942
2861 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
2862 cifs_dbg(FYI, "attempting read on write only file instance\n");
2863
2864 do { 2943 do {
2865 cur_len = min_t(const size_t, len - total_read, cifs_sb->rsize); 2944 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
2945 &rsize, &credits);
2946 if (rc)
2947 break;
2948
2949 cur_len = min_t(const size_t, len, rsize);
2866 npages = DIV_ROUND_UP(cur_len, PAGE_SIZE); 2950 npages = DIV_ROUND_UP(cur_len, PAGE_SIZE);
2867 2951
2868 /* allocate a readdata struct */ 2952 /* allocate a readdata struct */
2869 rdata = cifs_readdata_alloc(npages, 2953 rdata = cifs_readdata_alloc(npages,
2870 cifs_uncached_readv_complete); 2954 cifs_uncached_readv_complete);
2871 if (!rdata) { 2955 if (!rdata) {
2956 add_credits_and_wake_if(server, credits, 0);
2872 rc = -ENOMEM; 2957 rc = -ENOMEM;
2873 break; 2958 break;
2874 } 2959 }
@@ -2884,44 +2969,113 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
2884 rdata->pid = pid; 2969 rdata->pid = pid;
2885 rdata->pagesz = PAGE_SIZE; 2970 rdata->pagesz = PAGE_SIZE;
2886 rdata->read_into_pages = cifs_uncached_read_into_pages; 2971 rdata->read_into_pages = cifs_uncached_read_into_pages;
2972 rdata->credits = credits;
2887 2973
2888 rc = cifs_retry_async_readv(rdata); 2974 if (!rdata->cfile->invalidHandle ||
2975 !cifs_reopen_file(rdata->cfile, true))
2976 rc = server->ops->async_readv(rdata);
2889error: 2977error:
2890 if (rc) { 2978 if (rc) {
2979 add_credits_and_wake_if(server, rdata->credits, 0);
2891 kref_put(&rdata->refcount, 2980 kref_put(&rdata->refcount,
2892 cifs_uncached_readdata_release); 2981 cifs_uncached_readdata_release);
2982 if (rc == -EAGAIN)
2983 continue;
2893 break; 2984 break;
2894 } 2985 }
2895 2986
2896 list_add_tail(&rdata->list, &rdata_list); 2987 list_add_tail(&rdata->list, rdata_list);
2897 offset += cur_len; 2988 offset += cur_len;
2898 len -= cur_len; 2989 len -= cur_len;
2899 } while (len > 0); 2990 } while (len > 0);
2900 2991
2992 return rc;
2993}
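
A shape repeated throughout these hunks is the credit handshake: wait_mtu_credits() is taken before anything is allocated, and every early-exit path hands its share back through add_credits_and_wake_if(), so a request that never reaches the wire cannot leak flow-control credits. A pthread sketch of that contract, simplified to one credit per request (hypothetical names; the real helpers also scale the granted I/O size to the credits):

#include <pthread.h>

struct credit_pool {
        pthread_mutex_t lock;
        pthread_cond_t avail;
        unsigned int credits;
};

/* Block until at least one credit is free, then take it. */
static unsigned int credits_get(struct credit_pool *p)
{
        pthread_mutex_lock(&p->lock);
        while (p->credits == 0)
                pthread_cond_wait(&p->avail, &p->lock);
        p->credits--;
        pthread_mutex_unlock(&p->lock);
        return 1;
}

/* Return credits on any path where the request dies before being sent. */
static void credits_put(struct credit_pool *p, unsigned int n)
{
        pthread_mutex_lock(&p->lock);
        p->credits += n;
        pthread_cond_broadcast(&p->avail);
        pthread_mutex_unlock(&p->lock);
}

int main(void)
{
        struct credit_pool pool = {
                PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 8
        };
        unsigned int n = credits_get(&pool);    /* would block at zero */

        /* request construction failed: give the credit straight back */
        credits_put(&pool, n);
        return 0;
}
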
2994
2995ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
2996{
2997 struct file *file = iocb->ki_filp;
2998 ssize_t rc;
2999 size_t len;
3000 ssize_t total_read = 0;
3001 loff_t offset = iocb->ki_pos;
3002 struct cifs_sb_info *cifs_sb;
3003 struct cifs_tcon *tcon;
3004 struct cifsFileInfo *open_file;
3005 struct cifs_readdata *rdata, *tmp;
3006 struct list_head rdata_list;
3007
3008 len = iov_iter_count(to);
3009 if (!len)
3010 return 0;
3011
3012 INIT_LIST_HEAD(&rdata_list);
3013 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
3014 open_file = file->private_data;
3015 tcon = tlink_tcon(open_file->tlink);
3016
3017 if (!tcon->ses->server->ops->async_readv)
3018 return -ENOSYS;
3019
3020 if ((file->f_flags & O_ACCMODE) == O_WRONLY)
3021 cifs_dbg(FYI, "attempting read on write only file instance\n");
3022
3023 rc = cifs_send_async_read(offset, len, open_file, cifs_sb, &rdata_list);
3024
2901 /* if at least one read request send succeeded, then reset rc */ 3025 /* if at least one read request send succeeded, then reset rc */
2902 if (!list_empty(&rdata_list)) 3026 if (!list_empty(&rdata_list))
2903 rc = 0; 3027 rc = 0;
2904 3028
2905 len = iov_iter_count(to); 3029 len = iov_iter_count(to);
2906 /* the loop below should proceed in the order of increasing offsets */ 3030 /* the loop below should proceed in the order of increasing offsets */
3031again:
2907 list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { 3032 list_for_each_entry_safe(rdata, tmp, &rdata_list, list) {
2908 again:
2909 if (!rc) { 3033 if (!rc) {
2910 /* FIXME: freezable sleep too? */ 3034 /* FIXME: freezable sleep too? */
2911 rc = wait_for_completion_killable(&rdata->done); 3035 rc = wait_for_completion_killable(&rdata->done);
2912 if (rc) 3036 if (rc)
2913 rc = -EINTR; 3037 rc = -EINTR;
2914 else if (rdata->result) { 3038 else if (rdata->result == -EAGAIN) {
2915 rc = rdata->result;
2916 /* resend call if it's a retryable error */ 3039 /* resend call if it's a retryable error */
2917 if (rc == -EAGAIN) { 3040 struct list_head tmp_list;
2918 rc = cifs_retry_async_readv(rdata); 3041 unsigned int got_bytes = rdata->got_bytes;
2919 goto again; 3042
3043 list_del_init(&rdata->list);
3044 INIT_LIST_HEAD(&tmp_list);
3045
3046 /*
3047 * Got a part of data and then reconnect has
3048 * happened -- fill the buffer and continue
3049 * reading.
3050 */
3051 if (got_bytes && got_bytes < rdata->bytes) {
3052 rc = cifs_readdata_to_iov(rdata, to);
3053 if (rc) {
3054 kref_put(&rdata->refcount,
3055 cifs_uncached_readdata_release);
3056 continue;
3057 }
2920 } 3058 }
2921 } else { 3059
3060 rc = cifs_send_async_read(
3061 rdata->offset + got_bytes,
3062 rdata->bytes - got_bytes,
3063 rdata->cfile, cifs_sb,
3064 &tmp_list);
3065
3066 list_splice(&tmp_list, &rdata_list);
3067
3068 kref_put(&rdata->refcount,
3069 cifs_uncached_readdata_release);
3070 goto again;
3071 } else if (rdata->result)
3072 rc = rdata->result;
3073 else
2922 rc = cifs_readdata_to_iov(rdata, to); 3074 rc = cifs_readdata_to_iov(rdata, to);
2923 }
2924 3075
3076 /* if there was a short read -- discard anything left */
3077 if (rdata->got_bytes && rdata->got_bytes < rdata->bytes)
3078 rc = -ENODATA;
2925 } 3079 }
2926 list_del_init(&rdata->list); 3080 list_del_init(&rdata->list);
2927 kref_put(&rdata->refcount, cifs_uncached_readdata_release); 3081 kref_put(&rdata->refcount, cifs_uncached_readdata_release);
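
The resend logic above is what makes partial completions safe: whatever prefix arrived before the reconnect (rdata->got_bytes) is drained into the user buffer, and only the tail of the original range is re-queued. The offset arithmetic is worth isolating (standalone sketch with a simplified request descriptor):

#include <stdio.h>
#include <stddef.h>

/* One outstanding read that may complete only partially. */
struct read_req {
        size_t offset;   /* file offset the request started at */
        size_t bytes;    /* bytes originally asked for */
        size_t got;      /* bytes that actually arrived */
};

/* Build the follow-up request after a partial completion. */
static int resume_after_partial(const struct read_req *done,
                                struct read_req *retry)
{
        if (done->got >= done->bytes)
                return 0;                    /* nothing left to re-issue */
        retry->offset = done->offset + done->got;
        retry->bytes  = done->bytes - done->got;
        retry->got    = 0;
        return 1;
}

int main(void)
{
        struct read_req done = { .offset = 4096, .bytes = 16384, .got = 9000 };
        struct read_req retry;

        if (resume_after_partial(&done, &retry))
                printf("re-reading %zu bytes at offset %zu\n",
                       retry.bytes, retry.offset);
        return 0;
}
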
@@ -3030,18 +3184,19 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
3030 3184
3031 for (total_read = 0, cur_offset = read_data; read_size > total_read; 3185 for (total_read = 0, cur_offset = read_data; read_size > total_read;
3032 total_read += bytes_read, cur_offset += bytes_read) { 3186 total_read += bytes_read, cur_offset += bytes_read) {
3033 current_read_size = min_t(uint, read_size - total_read, rsize); 3187 do {
3034 /* 3188 current_read_size = min_t(uint, read_size - total_read,
3035 * For windows me and 9x we do not want to request more than it 3189 rsize);
3036 * negotiated since it will refuse the read then. 3190 /*
3037 */ 3191 * For windows me and 9x we do not want to request more
3038 if ((tcon->ses) && !(tcon->ses->capabilities & 3192 * than it negotiated since it will refuse the read
3193 * then.
3194 */
3195 if ((tcon->ses) && !(tcon->ses->capabilities &
3039 tcon->ses->server->vals->cap_large_files)) { 3196 tcon->ses->server->vals->cap_large_files)) {
3040 current_read_size = min_t(uint, current_read_size, 3197 current_read_size = min_t(uint,
3041 CIFSMaxBufSize); 3198 current_read_size, CIFSMaxBufSize);
3042 } 3199 }
3043 rc = -EAGAIN;
3044 while (rc == -EAGAIN) {
3045 if (open_file->invalidHandle) { 3200 if (open_file->invalidHandle) {
3046 rc = cifs_reopen_file(open_file, true); 3201 rc = cifs_reopen_file(open_file, true);
3047 if (rc != 0) 3202 if (rc != 0)
@@ -3051,10 +3206,11 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
3051 io_parms.tcon = tcon; 3206 io_parms.tcon = tcon;
3052 io_parms.offset = *offset; 3207 io_parms.offset = *offset;
3053 io_parms.length = current_read_size; 3208 io_parms.length = current_read_size;
3054 rc = server->ops->sync_read(xid, open_file, &io_parms, 3209 rc = server->ops->sync_read(xid, &open_file->fid, &io_parms,
3055 &bytes_read, &cur_offset, 3210 &bytes_read, &cur_offset,
3056 &buf_type); 3211 &buf_type);
3057 } 3212 } while (rc == -EAGAIN);
3213
3058 if (rc || (bytes_read == 0)) { 3214 if (rc || (bytes_read == 0)) {
3059 if (total_read) { 3215 if (total_read) {
3060 break; 3216 break;
@@ -3133,25 +3289,30 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
3133static void 3289static void
3134cifs_readv_complete(struct work_struct *work) 3290cifs_readv_complete(struct work_struct *work)
3135{ 3291{
3136 unsigned int i; 3292 unsigned int i, got_bytes;
3137 struct cifs_readdata *rdata = container_of(work, 3293 struct cifs_readdata *rdata = container_of(work,
3138 struct cifs_readdata, work); 3294 struct cifs_readdata, work);
3139 3295
3296 got_bytes = rdata->got_bytes;
3140 for (i = 0; i < rdata->nr_pages; i++) { 3297 for (i = 0; i < rdata->nr_pages; i++) {
3141 struct page *page = rdata->pages[i]; 3298 struct page *page = rdata->pages[i];
3142 3299
3143 lru_cache_add_file(page); 3300 lru_cache_add_file(page);
3144 3301
3145 if (rdata->result == 0) { 3302 if (rdata->result == 0 ||
3303 (rdata->result == -EAGAIN && got_bytes)) {
3146 flush_dcache_page(page); 3304 flush_dcache_page(page);
3147 SetPageUptodate(page); 3305 SetPageUptodate(page);
3148 } 3306 }
3149 3307
3150 unlock_page(page); 3308 unlock_page(page);
3151 3309
3152 if (rdata->result == 0) 3310 if (rdata->result == 0 ||
3311 (rdata->result == -EAGAIN && got_bytes))
3153 cifs_readpage_to_fscache(rdata->mapping->host, page); 3312 cifs_readpage_to_fscache(rdata->mapping->host, page);
3154 3313
3314 got_bytes -= min_t(unsigned int, PAGE_CACHE_SIZE, got_bytes);
3315
3155 page_cache_release(page); 3316 page_cache_release(page);
3156 rdata->pages[i] = NULL; 3317 rdata->pages[i] = NULL;
3157 } 3318 }
@@ -3162,7 +3323,7 @@ static int
3162cifs_readpages_read_into_pages(struct TCP_Server_Info *server, 3323cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3163 struct cifs_readdata *rdata, unsigned int len) 3324 struct cifs_readdata *rdata, unsigned int len)
3164{ 3325{
3165 int total_read = 0, result = 0; 3326 int result = 0;
3166 unsigned int i; 3327 unsigned int i;
3167 u64 eof; 3328 u64 eof;
3168 pgoff_t eof_index; 3329 pgoff_t eof_index;
@@ -3174,6 +3335,7 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3174 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0; 3335 eof_index = eof ? (eof - 1) >> PAGE_CACHE_SHIFT : 0;
3175 cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index); 3336 cifs_dbg(FYI, "eof=%llu eof_index=%lu\n", eof, eof_index);
3176 3337
3338 rdata->got_bytes = 0;
3177 rdata->tailsz = PAGE_CACHE_SIZE; 3339 rdata->tailsz = PAGE_CACHE_SIZE;
3178 for (i = 0; i < nr_pages; i++) { 3340 for (i = 0; i < nr_pages; i++) {
3179 struct page *page = rdata->pages[i]; 3341 struct page *page = rdata->pages[i];
@@ -3228,10 +3390,70 @@ cifs_readpages_read_into_pages(struct TCP_Server_Info *server,
3228 if (result < 0) 3390 if (result < 0)
3229 break; 3391 break;
3230 3392
3231 total_read += result; 3393 rdata->got_bytes += result;
3394 }
3395
3396 return rdata->got_bytes > 0 && result != -ECONNABORTED ?
3397 rdata->got_bytes : result;
3398}
3399
3400static int
3401readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3402 unsigned int rsize, struct list_head *tmplist,
3403 unsigned int *nr_pages, loff_t *offset, unsigned int *bytes)
3404{
3405 struct page *page, *tpage;
3406 unsigned int expected_index;
3407 int rc;
3408
3409 INIT_LIST_HEAD(tmplist);
3410
3411 page = list_entry(page_list->prev, struct page, lru);
3412
3413 /*
3414 * Lock the page and put it in the cache. Since no one else
3415 * should have access to this page, we're safe to simply set
3416 * PG_locked without checking it first.
3417 */
3418 __set_page_locked(page);
3419 rc = add_to_page_cache_locked(page, mapping,
3420 page->index, GFP_KERNEL);
3421
3422 /* give up if we can't stick it in the cache */
3423 if (rc) {
3424 __clear_page_locked(page);
3425 return rc;
3232 } 3426 }
3233 3427
3234 return total_read > 0 ? total_read : result; 3428 /* move first page to the tmplist */
3429 *offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
3430 *bytes = PAGE_CACHE_SIZE;
3431 *nr_pages = 1;
3432 list_move_tail(&page->lru, tmplist);
3433
3434 /* now try and add more pages onto the request */
3435 expected_index = page->index + 1;
3436 list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
3437 /* discontinuity ? */
3438 if (page->index != expected_index)
3439 break;
3440
3441 /* would this page push the read over the rsize? */
3442 if (*bytes + PAGE_CACHE_SIZE > rsize)
3443 break;
3444
3445 __set_page_locked(page);
3446 if (add_to_page_cache_locked(page, mapping, page->index,
3447 GFP_KERNEL)) {
3448 __clear_page_locked(page);
3449 break;
3450 }
3451 list_move_tail(&page->lru, tmplist);
3452 (*bytes) += PAGE_CACHE_SIZE;
3453 expected_index++;
3454 (*nr_pages)++;
3455 }
3456 return rc;
3235} 3457}
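
The extracted readpages_get_pages() builds one contiguous, rsize-bounded run from the VFS page list: the first page is taken unconditionally, then neighbours are appended while the index stays sequential and the byte total stays within rsize. The same windowing over a plain array of page indices (standalone sketch, not kernel API):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SZ 4096u

/*
 * Given page indices sorted ascending, decide how many of them starting
 * at *start form one contiguous run whose size stays within max_bytes.
 * Returns the run length and reports its byte size via *bytes.
 */
static size_t pick_run(const unsigned long *idx, size_t n, size_t start,
                       unsigned int max_bytes, unsigned int *bytes)
{
        size_t count = 1;

        *bytes = PAGE_SZ;
        while (start + count < n &&
               idx[start + count] == idx[start] + count &&  /* contiguous? */
               *bytes + PAGE_SZ <= max_bytes) {             /* within rsize? */
                *bytes += PAGE_SZ;
                count++;
        }
        return count;
}

int main(void)
{
        unsigned long pages[] = { 10, 11, 12, 14, 15 };
        unsigned int bytes;
        size_t run = pick_run(pages, 5, 0, 3 * PAGE_SZ, &bytes);

        /* prints "run of 3 pages, 12288 bytes": stops at the gap/cap */
        printf("run of %zu pages, %u bytes\n", run, bytes);
        return 0;
}
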
3236 3458
3237static int cifs_readpages(struct file *file, struct address_space *mapping, 3459static int cifs_readpages(struct file *file, struct address_space *mapping,
@@ -3241,19 +3463,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3241 struct list_head tmplist; 3463 struct list_head tmplist;
3242 struct cifsFileInfo *open_file = file->private_data; 3464 struct cifsFileInfo *open_file = file->private_data;
3243 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); 3465 struct cifs_sb_info *cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
3244 unsigned int rsize = cifs_sb->rsize; 3466 struct TCP_Server_Info *server;
3245 pid_t pid; 3467 pid_t pid;
3246 3468
3247 /* 3469 /*
3248 * Give up immediately if rsize is too small to read an entire page.
3249 * The VFS will fall back to readpage. We should never reach this
3250 * point however since we set ra_pages to 0 when the rsize is smaller
3251 * than a cache page.
3252 */
3253 if (unlikely(rsize < PAGE_CACHE_SIZE))
3254 return 0;
3255
3256 /*
3257 * Reads as many pages as possible from fscache. Returns -ENOBUFS 3470 * Reads as many pages as possible from fscache. Returns -ENOBUFS
3258 * immediately if the cookie is negative 3471 * immediately if the cookie is negative
3259 * 3472 *
@@ -3271,7 +3484,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3271 pid = current->tgid; 3484 pid = current->tgid;
3272 3485
3273 rc = 0; 3486 rc = 0;
3274 INIT_LIST_HEAD(&tmplist); 3487 server = tlink_tcon(open_file->tlink)->ses->server;
3275 3488
3276 cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n", 3489 cifs_dbg(FYI, "%s: file=%p mapping=%p num_pages=%u\n",
3277 __func__, file, mapping, num_pages); 3490 __func__, file, mapping, num_pages);
@@ -3288,58 +3501,35 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3288 * the rdata->pages, then we want them in increasing order. 3501 * the rdata->pages, then we want them in increasing order.
3289 */ 3502 */
3290 while (!list_empty(page_list)) { 3503 while (!list_empty(page_list)) {
3291 unsigned int i; 3504 unsigned int i, nr_pages, bytes, rsize;
3292 unsigned int bytes = PAGE_CACHE_SIZE;
3293 unsigned int expected_index;
3294 unsigned int nr_pages = 1;
3295 loff_t offset; 3505 loff_t offset;
3296 struct page *page, *tpage; 3506 struct page *page, *tpage;
3297 struct cifs_readdata *rdata; 3507 struct cifs_readdata *rdata;
3508 unsigned credits;
3298 3509
3299 page = list_entry(page_list->prev, struct page, lru); 3510 rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
3511 &rsize, &credits);
3512 if (rc)
3513 break;
3300 3514
3301 /* 3515 /*
3302 * Lock the page and put it in the cache. Since no one else 3516 * Give up immediately if rsize is too small to read an entire
3303 * should have access to this page, we're safe to simply set 3517 * page. The VFS will fall back to readpage. We should never
3304 * PG_locked without checking it first. 3518 * reach this point however since we set ra_pages to 0 when the
3519 * rsize is smaller than a cache page.
3305 */ 3520 */
3306 __set_page_locked(page); 3521 if (unlikely(rsize < PAGE_CACHE_SIZE)) {
3307 rc = add_to_page_cache_locked(page, mapping, 3522 add_credits_and_wake_if(server, credits, 0);
3308 page->index, GFP_KERNEL); 3523 return 0;
3524 }
3309 3525
3310 /* give up if we can't stick it in the cache */ 3526 rc = readpages_get_pages(mapping, page_list, rsize, &tmplist,
3527 &nr_pages, &offset, &bytes);
3311 if (rc) { 3528 if (rc) {
3312 __clear_page_locked(page); 3529 add_credits_and_wake_if(server, credits, 0);
3313 break; 3530 break;
3314 } 3531 }
3315 3532
3316 /* move first page to the tmplist */
3317 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
3318 list_move_tail(&page->lru, &tmplist);
3319
3320 /* now try and add more pages onto the request */
3321 expected_index = page->index + 1;
3322 list_for_each_entry_safe_reverse(page, tpage, page_list, lru) {
3323 /* discontinuity ? */
3324 if (page->index != expected_index)
3325 break;
3326
3327 /* would this page push the read over the rsize? */
3328 if (bytes + PAGE_CACHE_SIZE > rsize)
3329 break;
3330
3331 __set_page_locked(page);
3332 if (add_to_page_cache_locked(page, mapping,
3333 page->index, GFP_KERNEL)) {
3334 __clear_page_locked(page);
3335 break;
3336 }
3337 list_move_tail(&page->lru, &tmplist);
3338 bytes += PAGE_CACHE_SIZE;
3339 expected_index++;
3340 nr_pages++;
3341 }
3342
3343 rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete); 3533 rdata = cifs_readdata_alloc(nr_pages, cifs_readv_complete);
3344 if (!rdata) { 3534 if (!rdata) {
3345 /* best to give up if we're out of mem */ 3535 /* best to give up if we're out of mem */
@@ -3350,6 +3540,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3350 page_cache_release(page); 3540 page_cache_release(page);
3351 } 3541 }
3352 rc = -ENOMEM; 3542 rc = -ENOMEM;
3543 add_credits_and_wake_if(server, credits, 0);
3353 break; 3544 break;
3354 } 3545 }
3355 3546
@@ -3360,20 +3551,25 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
3360 rdata->pid = pid; 3551 rdata->pid = pid;
3361 rdata->pagesz = PAGE_CACHE_SIZE; 3552 rdata->pagesz = PAGE_CACHE_SIZE;
3362 rdata->read_into_pages = cifs_readpages_read_into_pages; 3553 rdata->read_into_pages = cifs_readpages_read_into_pages;
3554 rdata->credits = credits;
3363 3555
3364 list_for_each_entry_safe(page, tpage, &tmplist, lru) { 3556 list_for_each_entry_safe(page, tpage, &tmplist, lru) {
3365 list_del(&page->lru); 3557 list_del(&page->lru);
3366 rdata->pages[rdata->nr_pages++] = page; 3558 rdata->pages[rdata->nr_pages++] = page;
3367 } 3559 }
3368 3560
3369 rc = cifs_retry_async_readv(rdata); 3561 if (!rdata->cfile->invalidHandle ||
3370 if (rc != 0) { 3562 !cifs_reopen_file(rdata->cfile, true))
3563 rc = server->ops->async_readv(rdata);
3564 if (rc) {
3565 add_credits_and_wake_if(server, rdata->credits, 0);
3371 for (i = 0; i < rdata->nr_pages; i++) { 3566 for (i = 0; i < rdata->nr_pages; i++) {
3372 page = rdata->pages[i]; 3567 page = rdata->pages[i];
3373 lru_cache_add_file(page); 3568 lru_cache_add_file(page);
3374 unlock_page(page); 3569 unlock_page(page);
3375 page_cache_release(page); 3570 page_cache_release(page);
3376 } 3571 }
3572 /* Fallback to the readpage in error/reconnect cases */
3377 kref_put(&rdata->refcount, cifs_readdata_release); 3573 kref_put(&rdata->refcount, cifs_readdata_release);
3378 break; 3574 break;
3379 } 3575 }
@@ -3618,13 +3814,6 @@ static int cifs_launder_page(struct page *page)
3618 return rc; 3814 return rc;
3619} 3815}
3620 3816
3621static int
3622cifs_pending_writers_wait(void *unused)
3623{
3624 schedule();
3625 return 0;
3626}
3627
3628void cifs_oplock_break(struct work_struct *work) 3817void cifs_oplock_break(struct work_struct *work)
3629{ 3818{
3630 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, 3819 struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo,
@@ -3636,7 +3825,7 @@ void cifs_oplock_break(struct work_struct *work)
3636 int rc = 0; 3825 int rc = 0;
3637 3826
3638 wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, 3827 wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS,
3639 cifs_pending_writers_wait, TASK_UNINTERRUPTIBLE); 3828 TASK_UNINTERRUPTIBLE);
3640 3829
3641 server->ops->downgrade_oplock(server, cinode, 3830 server->ops->downgrade_oplock(server, cinode,
3642 test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags)); 3831 test_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags));
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a174605f6afa..197cb503d528 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -30,6 +30,7 @@
30#include "cifsproto.h" 30#include "cifsproto.h"
31#include "cifs_debug.h" 31#include "cifs_debug.h"
32#include "cifs_fs_sb.h" 32#include "cifs_fs_sb.h"
33#include "cifs_unicode.h"
33#include "fscache.h" 34#include "fscache.h"
34 35
35 36
@@ -412,7 +413,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
412 struct cifs_sb_info *cifs_sb, unsigned int xid) 413 struct cifs_sb_info *cifs_sb, unsigned int xid)
413{ 414{
414 int rc; 415 int rc;
415 int oplock = 0; 416 __u32 oplock;
416 struct tcon_link *tlink; 417 struct tcon_link *tlink;
417 struct cifs_tcon *tcon; 418 struct cifs_tcon *tcon;
418 struct cifs_fid fid; 419 struct cifs_fid fid;
@@ -451,8 +452,13 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
451 oparms.fid = &fid; 452 oparms.fid = &fid;
452 oparms.reconnect = false; 453 oparms.reconnect = false;
453 454
454 rc = CIFS_open(xid, &oparms, &oplock, NULL); 455 if (tcon->ses->server->oplocks)
456 oplock = REQ_OPLOCK;
457 else
458 oplock = 0;
459 rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL);
455 if (rc) { 460 if (rc) {
461 cifs_dbg(FYI, "check sfu type of %s, open rc = %d\n", path, rc);
456 cifs_put_tlink(tlink); 462 cifs_put_tlink(tlink);
457 return rc; 463 return rc;
458 } 464 }
@@ -464,7 +470,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
464 io_parms.offset = 0; 470 io_parms.offset = 0;
465 io_parms.length = 24; 471 io_parms.length = 24;
466 472
467 rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); 473 rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms,
474 &bytes_read, &pbuf, &buf_type);
468 if ((rc == 0) && (bytes_read >= 8)) { 475 if ((rc == 0) && (bytes_read >= 8)) {
469 if (memcmp("IntxBLK", pbuf, 8) == 0) { 476 if (memcmp("IntxBLK", pbuf, 8) == 0) {
470 cifs_dbg(FYI, "Block device\n"); 477 cifs_dbg(FYI, "Block device\n");
@@ -504,7 +511,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
504 fattr->cf_dtype = DT_REG; 511 fattr->cf_dtype = DT_REG;
505 rc = -EOPNOTSUPP; /* or some unknown SFU type */ 512 rc = -EOPNOTSUPP; /* or some unknown SFU type */
506 } 513 }
507 CIFSSMBClose(xid, tcon, fid.netfid); 514
515 tcon->ses->server->ops->close(xid, tcon, &fid);
508 cifs_put_tlink(tlink); 516 cifs_put_tlink(tlink);
509 return rc; 517 return rc;
510} 518}
@@ -539,7 +547,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
539 rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, 547 rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path,
540 "SETFILEBITS", ea_value, 4 /* size of buf */, 548 "SETFILEBITS", ea_value, 4 /* size of buf */,
541 cifs_sb->local_nls, 549 cifs_sb->local_nls,
542 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 550 cifs_remap(cifs_sb));
543 cifs_put_tlink(tlink); 551 cifs_put_tlink(tlink);
544 if (rc < 0) 552 if (rc < 0)
545 return (int)rc; 553 return (int)rc;
@@ -952,11 +960,18 @@ struct inode *cifs_root_iget(struct super_block *sb)
952 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); 960 struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
953 961
954 xid = get_xid(); 962 xid = get_xid();
955 if (tcon->unix_ext) 963 if (tcon->unix_ext) {
956 rc = cifs_get_inode_info_unix(&inode, "", sb, xid); 964 rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
957 else 965 /* some servers mistakenly claim POSIX support */
958 rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); 966 if (rc != -EOPNOTSUPP)
967 goto iget_no_retry;
 968 cifs_dbg(VFS, "server does not support POSIX extensions\n");
969 tcon->unix_ext = false;
970 }
971
972 rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
959 973
974iget_no_retry:
960 if (!inode) { 975 if (!inode) {
961 inode = ERR_PTR(rc); 976 inode = ERR_PTR(rc);
962 goto out; 977 goto out;
@@ -1117,8 +1132,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
1117 /* rename the file */ 1132 /* rename the file */
1118 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL, 1133 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL,
1119 cifs_sb->local_nls, 1134 cifs_sb->local_nls,
1120 cifs_sb->mnt_cifs_flags & 1135 cifs_remap(cifs_sb));
1121 CIFS_MOUNT_MAP_SPECIAL_CHR);
1122 if (rc != 0) { 1136 if (rc != 0) {
1123 rc = -EBUSY; 1137 rc = -EBUSY;
1124 goto undo_setattr; 1138 goto undo_setattr;
@@ -1159,8 +1173,7 @@ out:
1159 */ 1173 */
1160undo_rename: 1174undo_rename:
1161 CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name, 1175 CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name,
1162 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1176 cifs_sb->local_nls, cifs_remap(cifs_sb));
1163 CIFS_MOUNT_MAP_SPECIAL_CHR);
1164undo_setattr: 1177undo_setattr:
1165 if (dosattr != origattr) { 1178 if (dosattr != origattr) {
1166 info_buf->Attributes = cpu_to_le32(origattr); 1179 info_buf->Attributes = cpu_to_le32(origattr);
@@ -1226,7 +1239,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry)
1226 le64_to_cpu(tcon->fsUnixInfo.Capability))) { 1239 le64_to_cpu(tcon->fsUnixInfo.Capability))) {
1227 rc = CIFSPOSIXDelFile(xid, tcon, full_path, 1240 rc = CIFSPOSIXDelFile(xid, tcon, full_path,
1228 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, 1241 SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls,
1229 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 1242 cifs_remap(cifs_sb));
1230 cifs_dbg(FYI, "posix del rc %d\n", rc); 1243 cifs_dbg(FYI, "posix del rc %d\n", rc);
1231 if ((rc == 0) || (rc == -ENOENT)) 1244 if ((rc == 0) || (rc == -ENOENT))
1232 goto psx_del_no_retry; 1245 goto psx_del_no_retry;
@@ -1349,8 +1362,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
1349 } 1362 }
1350 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, 1363 CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args,
1351 cifs_sb->local_nls, 1364 cifs_sb->local_nls,
1352 cifs_sb->mnt_cifs_flags & 1365 cifs_remap(cifs_sb));
1353 CIFS_MOUNT_MAP_SPECIAL_CHR);
1354 } else { 1366 } else {
1355 struct TCP_Server_Info *server = tcon->ses->server; 1367 struct TCP_Server_Info *server = tcon->ses->server;
1356 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && 1368 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
@@ -1392,8 +1404,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
1392 mode &= ~current_umask(); 1404 mode &= ~current_umask();
1393 rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, 1405 rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode,
1394 NULL /* netfid */, info, &oplock, full_path, 1406 NULL /* netfid */, info, &oplock, full_path,
1395 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1407 cifs_sb->local_nls, cifs_remap(cifs_sb));
1396 CIFS_MOUNT_MAP_SPECIAL_CHR);
1397 if (rc == -EOPNOTSUPP) 1408 if (rc == -EOPNOTSUPP)
1398 goto posix_mkdir_out; 1409 goto posix_mkdir_out;
1399 else if (rc) { 1410 else if (rc) {
@@ -1419,8 +1430,8 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode,
1419 d_instantiate(dentry, newinode); 1430 d_instantiate(dentry, newinode);
1420 1431
1421#ifdef CONFIG_CIFS_DEBUG2 1432#ifdef CONFIG_CIFS_DEBUG2
1422 cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n", 1433 cifs_dbg(FYI, "instantiated dentry %p %pd to inode %p\n",
1423 dentry, dentry->d_name.name, newinode); 1434 dentry, dentry, newinode);
1424 1435
1425 if (newinode->i_nlink != 2) 1436 if (newinode->i_nlink != 2)
1426 cifs_dbg(FYI, "unexpected number of links %d\n", 1437 cifs_dbg(FYI, "unexpected number of links %d\n",
@@ -1617,8 +1628,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
1617 if (rc == 0) { 1628 if (rc == 0) {
1618 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, 1629 rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid,
1619 (const char *) to_dentry->d_name.name, 1630 (const char *) to_dentry->d_name.name,
1620 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 1631 cifs_sb->local_nls, cifs_remap(cifs_sb));
1621 CIFS_MOUNT_MAP_SPECIAL_CHR);
1622 CIFSSMBClose(xid, tcon, fid.netfid); 1632 CIFSSMBClose(xid, tcon, fid.netfid);
1623 } 1633 }
1624do_rename_exit: 1634do_rename_exit:
@@ -1627,8 +1637,9 @@ do_rename_exit:
1627} 1637}
1628 1638
1629int 1639int
1630cifs_rename(struct inode *source_dir, struct dentry *source_dentry, 1640cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
1631 struct inode *target_dir, struct dentry *target_dentry) 1641 struct inode *target_dir, struct dentry *target_dentry,
1642 unsigned int flags)
1632{ 1643{
1633 char *from_name = NULL; 1644 char *from_name = NULL;
1634 char *to_name = NULL; 1645 char *to_name = NULL;
@@ -1640,6 +1651,9 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1640 unsigned int xid; 1651 unsigned int xid;
1641 int rc, tmprc; 1652 int rc, tmprc;
1642 1653
1654 if (flags & ~RENAME_NOREPLACE)
1655 return -EINVAL;
1656
1643 cifs_sb = CIFS_SB(source_dir->i_sb); 1657 cifs_sb = CIFS_SB(source_dir->i_sb);
1644 tlink = cifs_sb_tlink(cifs_sb); 1658 tlink = cifs_sb_tlink(cifs_sb);
1645 if (IS_ERR(tlink)) 1659 if (IS_ERR(tlink))
@@ -1667,6 +1681,12 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1667 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, 1681 rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry,
1668 to_name); 1682 to_name);
1669 1683
1684 /*
1685 * No-replace is the natural behavior for CIFS, so skip unlink hacks.
1686 */
1687 if (flags & RENAME_NOREPLACE)
1688 goto cifs_rename_exit;
1689
1670 if (rc == -EEXIST && tcon->unix_ext) { 1690 if (rc == -EEXIST && tcon->unix_ext) {
1671 /* 1691 /*
1672 * Are src and dst hardlinks of same inode? We can only tell 1692 * Are src and dst hardlinks of same inode? We can only tell
@@ -1684,16 +1704,14 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1684 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name, 1704 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name,
1685 info_buf_source, 1705 info_buf_source,
1686 cifs_sb->local_nls, 1706 cifs_sb->local_nls,
1687 cifs_sb->mnt_cifs_flags & 1707 cifs_remap(cifs_sb));
1688 CIFS_MOUNT_MAP_SPECIAL_CHR);
1689 if (tmprc != 0) 1708 if (tmprc != 0)
1690 goto unlink_target; 1709 goto unlink_target;
1691 1710
1692 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name, 1711 tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name,
1693 info_buf_target, 1712 info_buf_target,
1694 cifs_sb->local_nls, 1713 cifs_sb->local_nls,
1695 cifs_sb->mnt_cifs_flags & 1714 cifs_remap(cifs_sb));
1696 CIFS_MOUNT_MAP_SPECIAL_CHR);
1697 1715
1698 if (tmprc == 0 && (info_buf_source->UniqueId == 1716 if (tmprc == 0 && (info_buf_source->UniqueId ==
1699 info_buf_target->UniqueId)) { 1717 info_buf_target->UniqueId)) {
@@ -1710,13 +1728,22 @@ cifs_rename(struct inode *source_dir, struct dentry *source_dentry,
1710unlink_target: 1728unlink_target:
1711 /* Try unlinking the target dentry if it's not negative */ 1729 /* Try unlinking the target dentry if it's not negative */
1712 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { 1730 if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
1713 tmprc = cifs_unlink(target_dir, target_dentry); 1731 if (d_is_dir(target_dentry))
1732 tmprc = cifs_rmdir(target_dir, target_dentry);
1733 else
1734 tmprc = cifs_unlink(target_dir, target_dentry);
1714 if (tmprc) 1735 if (tmprc)
1715 goto cifs_rename_exit; 1736 goto cifs_rename_exit;
1716 rc = cifs_do_rename(xid, source_dentry, from_name, 1737 rc = cifs_do_rename(xid, source_dentry, from_name,
1717 target_dentry, to_name); 1738 target_dentry, to_name);
1718 } 1739 }
1719 1740
1741 /* force revalidate to go get info when needed */
1742 CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0;
1743
1744 source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime =
1745 target_dir->i_mtime = current_fs_time(source_dir->i_sb);
1746
1720cifs_rename_exit: 1747cifs_rename_exit:
1721 kfree(info_buf_source); 1748 kfree(info_buf_source);
1722 kfree(from_name); 1749 kfree(from_name);
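
cifs_rename2() is the filesystem's entry point for the renameat2() flavor of rename: any flag other than RENAME_NOREPLACE is rejected with -EINVAL, and when RENAME_NOREPLACE is set the unlink-the-target fallback is skipped, because refusing to clobber is already CIFS's native semantic. From userspace the flag is exercised as below (assumes glibc 2.28 or later for the renameat2() wrapper; the fallback define matches the kernel's flag value):

#define _GNU_SOURCE
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <fcntl.h>    /* AT_FDCWD */

#ifndef RENAME_NOREPLACE
#define RENAME_NOREPLACE (1 << 0)
#endif

int main(void)
{
        /* fails with EEXIST instead of silently replacing "dst.txt" */
        if (renameat2(AT_FDCWD, "src.txt", AT_FDCWD, "dst.txt",
                      RENAME_NOREPLACE) == -1)
                fprintf(stderr, "renameat2: %s\n", strerror(errno));
        return 0;
}
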
@@ -1780,7 +1807,7 @@ cifs_invalidate_mapping(struct inode *inode)
1780 * @word: long word containing the bit lock 1807 * @word: long word containing the bit lock
1781 */ 1808 */
1782static int 1809static int
1783cifs_wait_bit_killable(void *word) 1810cifs_wait_bit_killable(struct wait_bit_key *key)
1784{ 1811{
1785 if (fatal_signal_pending(current)) 1812 if (fatal_signal_pending(current))
1786 return -ERESTARTSYS; 1813 return -ERESTARTSYS;
@@ -1794,8 +1821,8 @@ cifs_revalidate_mapping(struct inode *inode)
1794 int rc; 1821 int rc;
1795 unsigned long *flags = &CIFS_I(inode)->flags; 1822 unsigned long *flags = &CIFS_I(inode)->flags;
1796 1823
1797 rc = wait_on_bit_lock(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, 1824 rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable,
1798 TASK_KILLABLE); 1825 TASK_KILLABLE);
1799 if (rc) 1826 if (rc)
1800 return rc; 1827 return rc;
1801 1828
@@ -2049,8 +2076,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
2049 rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN, 2076 rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
2050 GENERIC_WRITE, CREATE_NOT_DIR, &netfid, 2077 GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
2051 &oplock, NULL, cifs_sb->local_nls, 2078 &oplock, NULL, cifs_sb->local_nls,
2052 cifs_sb->mnt_cifs_flags & 2079 cifs_remap(cifs_sb));
2053 CIFS_MOUNT_MAP_SPECIAL_CHR);
2054 if (rc == 0) { 2080 if (rc == 0) {
2055 unsigned int bytes_written; 2081 unsigned int bytes_written;
2056 2082
@@ -2092,8 +2118,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
2092 struct cifs_unix_set_info_args *args = NULL; 2118 struct cifs_unix_set_info_args *args = NULL;
2093 struct cifsFileInfo *open_file; 2119 struct cifsFileInfo *open_file;
2094 2120
2095 cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n", 2121 cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n",
2096 direntry->d_name.name, attrs->ia_valid); 2122 direntry, attrs->ia_valid);
2097 2123
2098 xid = get_xid(); 2124 xid = get_xid();
2099 2125
@@ -2235,8 +2261,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
2235 2261
2236 xid = get_xid(); 2262 xid = get_xid();
2237 2263
2238 cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n", 2264 cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n",
2239 direntry->d_name.name, attrs->ia_valid); 2265 direntry, attrs->ia_valid);
2240 2266
2241 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) 2267 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM)
2242 attrs->ia_valid |= ATTR_FORCE; 2268 attrs->ia_valid |= ATTR_FORCE;
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..2ec6037f61c7 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -28,6 +28,10 @@
28#include "cifsproto.h" 28#include "cifsproto.h"
29#include "cifs_debug.h" 29#include "cifs_debug.h"
30#include "cifs_fs_sb.h" 30#include "cifs_fs_sb.h"
31#include "cifs_unicode.h"
32#ifdef CONFIG_CIFS_SMB2
33#include "smb2proto.h"
34#endif
31 35
32/* 36/*
33 * M-F Symlink Functions - Begin 37 * M-F Symlink Functions - Begin
@@ -213,8 +217,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
213 if (rc) 217 if (rc)
214 goto out; 218 goto out;
215 219
216 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, 220 if (tcon->ses->server->ops->create_mf_symlink)
217 fromName, buf, &bytes_written); 221 rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
222 cifs_sb, fromName, buf, &bytes_written);
223 else
224 rc = -EOPNOTSUPP;
225
218 if (rc) 226 if (rc)
219 goto out; 227 goto out;
220 228
@@ -339,9 +347,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
339 if (rc) 347 if (rc)
340 return rc; 348 return rc;
341 349
342 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) 350 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
351 rc = -ENOENT;
343 /* it's not a symlink */ 352 /* it's not a symlink */
344 goto out; 353 goto out;
354 }
345 355
346 io_parms.netfid = fid.netfid; 356 io_parms.netfid = fid.netfid;
347 io_parms.pid = current->tgid; 357 io_parms.pid = current->tgid;
@@ -395,6 +405,134 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
395} 405}
396 406
397/* 407/*
408 * SMB 2.1/SMB3 Protocol specific functions
409 */
410#ifdef CONFIG_CIFS_SMB2
411int
412smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
413 struct cifs_sb_info *cifs_sb, const unsigned char *path,
414 char *pbuf, unsigned int *pbytes_read)
415{
416 int rc;
417 struct cifs_fid fid;
418 struct cifs_open_parms oparms;
419 struct cifs_io_parms io_parms;
420 int buf_type = CIFS_NO_BUFFER;
421 __le16 *utf16_path;
422 __u8 oplock = SMB2_OPLOCK_LEVEL_II;
423 struct smb2_file_all_info *pfile_info = NULL;
424
425 oparms.tcon = tcon;
426 oparms.cifs_sb = cifs_sb;
427 oparms.desired_access = GENERIC_READ;
428 oparms.create_options = CREATE_NOT_DIR;
429 if (backup_cred(cifs_sb))
430 oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
431 oparms.disposition = FILE_OPEN;
432 oparms.fid = &fid;
433 oparms.reconnect = false;
434
435 utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
436 if (utf16_path == NULL)
437 return -ENOMEM;
438
439 pfile_info = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
440 GFP_KERNEL);
441
442 if (pfile_info == NULL) {
443 kfree(utf16_path);
444 return -ENOMEM;
445 }
446
447 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL);
448 if (rc)
449 goto qmf_out_open_fail;
450
451 if (pfile_info->EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
452 /* it's not a symlink */
453 rc = -ENOENT; /* Is there a better rc to return? */
454 goto qmf_out;
455 }
456
457 io_parms.netfid = fid.netfid;
458 io_parms.pid = current->tgid;
459 io_parms.tcon = tcon;
460 io_parms.offset = 0;
461 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
462 io_parms.persistent_fid = fid.persistent_fid;
463 io_parms.volatile_fid = fid.volatile_fid;
464 rc = SMB2_read(xid, &io_parms, pbytes_read, &pbuf, &buf_type);
465qmf_out:
466 SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
467qmf_out_open_fail:
468 kfree(utf16_path);
469 kfree(pfile_info);
470 return rc;
471}
472
473int
474smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
475 struct cifs_sb_info *cifs_sb, const unsigned char *path,
476 char *pbuf, unsigned int *pbytes_written)
477{
478 int rc;
479 struct cifs_fid fid;
480 struct cifs_open_parms oparms;
481 struct cifs_io_parms io_parms;
482 int create_options = CREATE_NOT_DIR;
483 __le16 *utf16_path;
484 __u8 oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE;
485 struct kvec iov[2];
486
487 if (backup_cred(cifs_sb))
488 create_options |= CREATE_OPEN_BACKUP_INTENT;
489
490 cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
491
492 utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
493 if (!utf16_path)
494 return -ENOMEM;
495
496 oparms.tcon = tcon;
497 oparms.cifs_sb = cifs_sb;
498 oparms.desired_access = GENERIC_WRITE;
499 oparms.create_options = create_options;
500 oparms.disposition = FILE_CREATE;
501 oparms.fid = &fid;
502 oparms.reconnect = false;
503
504 rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
505 if (rc) {
506 kfree(utf16_path);
507 return rc;
508 }
509
510 io_parms.netfid = fid.netfid;
511 io_parms.pid = current->tgid;
512 io_parms.tcon = tcon;
513 io_parms.offset = 0;
514 io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE;
515 io_parms.persistent_fid = fid.persistent_fid;
516 io_parms.volatile_fid = fid.volatile_fid;
517
518 /* iov[0] is reserved for smb header */
519 iov[1].iov_base = pbuf;
520 iov[1].iov_len = CIFS_MF_SYMLINK_FILE_SIZE;
521
522 rc = SMB2_write(xid, &io_parms, pbytes_written, iov, 1);
523
524 /* Make sure we wrote all of the symlink data */
525 if ((rc == 0) && (*pbytes_written != CIFS_MF_SYMLINK_FILE_SIZE))
526 rc = -EIO;
527
528 SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
529
530 kfree(utf16_path);
531 return rc;
532}
533#endif /* CONFIG_CIFS_SMB2 */
534
535/*
398 * M-F Symlink Functions - End 536 * M-F Symlink Functions - End
399 */ 537 */
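
Every Minshall-French symlink variant above shares the same cheap pre-filter: an M-F symlink is an ordinary file of exactly CIFS_MF_SYMLINK_FILE_SIZE bytes (1067 per the CIFS headers: a 5-byte "XSym\n" magic, a 5-byte length line, a 33-byte md5 line and a 1024-byte path area), so any other size means -ENOENT before the file body is ever read. As a standalone sketch (the constant breakdown is taken from the CIFS sources; treat the rest as illustrative):

#include <stdio.h>
#include <stdint.h>

/* size of an M-F symlink file body; anything else cannot be one */
#define MF_SYMLINK_FILE_SIZE 1067u

static int might_be_mf_symlink(uint64_t end_of_file)
{
        return end_of_file == MF_SYMLINK_FILE_SIZE;
}

int main(void)
{
        /* prints "1 0": only the exactly-sized file is worth opening */
        printf("%d %d\n", might_be_mf_symlink(1067), might_be_mf_symlink(20));
        return 0;
}
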
400 538
@@ -429,8 +567,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
429 if (tcon->unix_ext) 567 if (tcon->unix_ext)
430 rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name, 568 rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name,
431 cifs_sb->local_nls, 569 cifs_sb->local_nls,
432 cifs_sb->mnt_cifs_flags & 570 cifs_remap(cifs_sb));
433 CIFS_MOUNT_MAP_SPECIAL_CHR);
434 else { 571 else {
435 server = tcon->ses->server; 572 server = tcon->ses->server;
436 if (!server->ops->create_hardlink) { 573 if (!server->ops->create_hardlink) {
@@ -455,11 +592,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
455 spin_lock(&old_file->d_inode->i_lock); 592 spin_lock(&old_file->d_inode->i_lock);
456 inc_nlink(old_file->d_inode); 593 inc_nlink(old_file->d_inode);
457 spin_unlock(&old_file->d_inode->i_lock); 594 spin_unlock(&old_file->d_inode->i_lock);
458 /* 595
459 * BB should we make this contingent on superblock flag
460 * NOATIME?
461 */
462 /* old_file->d_inode->i_ctime = CURRENT_TIME; */
463 /* 596 /*
464 * parent dir timestamps will update from srv within a 597 * parent dir timestamps will update from srv within a
465 * second, would it really be worth it to set the parent 598 * second, would it really be worth it to set the parent
@@ -469,7 +602,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
469 } 602 }
470 /* 603 /*
471 * if not oplocked will force revalidate to get info on source 604 * if not oplocked will force revalidate to get info on source
 472 * file from srv 605 * file from srv. Note: Samba servers prior to 4.2 have a bug and
 606 * do not update the src file ctime on hardlinks, but Windows
 607 * servers handle it properly
473 */ 608 */
474 cifsInode->time = 0; 609 cifsInode->time = 0;
475 610
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 3b0c62e622da..b7415d596dbd 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -226,6 +226,15 @@ cifs_small_buf_release(void *buf_to_free)
226 return; 226 return;
227} 227}
228 228
229void
230free_rsp_buf(int resp_buftype, void *rsp)
231{
232 if (resp_buftype == CIFS_SMALL_BUFFER)
233 cifs_small_buf_release(rsp);
234 else if (resp_buftype == CIFS_LARGE_BUFFER)
235 cifs_buf_release(rsp);
236}
237
229/* NB: MID can not be set if treeCon not passed in, in that 238/* NB: MID can not be set if treeCon not passed in, in that
 230 case it is responsibility of caller to set the mid */ 239 case it is responsibility of caller to set the mid */
231void 240void
@@ -414,7 +423,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
414 return true; 423 return true;
415 } 424 }
416 if (pSMBr->hdr.Status.CifsError) { 425 if (pSMBr->hdr.Status.CifsError) {
417 cifs_dbg(FYI, "notify err 0x%d\n", 426 cifs_dbg(FYI, "notify err 0x%x\n",
418 pSMBr->hdr.Status.CifsError); 427 pSMBr->hdr.Status.CifsError);
419 return true; 428 return true;
420 } 429 }
@@ -441,7 +450,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
441 if (pSMB->hdr.WordCount != 8) 450 if (pSMB->hdr.WordCount != 8)
442 return false; 451 return false;
443 452
444 cifs_dbg(FYI, "oplock type 0x%d level 0x%d\n", 453 cifs_dbg(FYI, "oplock type 0x%x level 0x%x\n",
445 pSMB->LockType, pSMB->OplockLevel); 454 pSMB->LockType, pSMB->OplockLevel);
446 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE)) 455 if (!(pSMB->LockType & LOCKING_ANDX_OPLOCK_RELEASE))
447 return false; 456 return false;
@@ -565,13 +574,6 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
565 cinode->oplock = 0; 574 cinode->oplock = 0;
566} 575}
567 576
568static int
569cifs_oplock_break_wait(void *unused)
570{
571 schedule();
572 return signal_pending(current) ? -ERESTARTSYS : 0;
573}
574
575/* 577/*
576 * We wait for oplock breaks to be processed before we attempt to perform 578 * We wait for oplock breaks to be processed before we attempt to perform
577 * writes. 579 * writes.
@@ -582,7 +584,7 @@ int cifs_get_writer(struct cifsInodeInfo *cinode)
582 584
583start: 585start:
584 rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK, 586 rc = wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_OPLOCK_BREAK,
585 cifs_oplock_break_wait, TASK_KILLABLE); 587 TASK_KILLABLE);
586 if (rc) 588 if (rc)
587 return rc; 589 return rc;
588 590
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
925 /* BB what about the timezone? BB */ 925 /* BB what about the timezone? BB */
926 926
927 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 927 /* Subtract the NTFS time offset, then convert to 1s intervals. */
928 u64 t; 928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
929
930 /*
 931 * Unfortunately we cannot use normal 64-bit division on a 32-bit arch,
 932 * and the alternative, do_div, does not work with negative numbers, so
 933 * we have to special-case them
934 */
935 if (t < 0) {
936 t = -t;
937 ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
938 ts.tv_nsec = -ts.tv_nsec;
939 ts.tv_sec = -t;
940 } else {
941 ts.tv_nsec = (long)do_div(t, 10000000) * 100;
942 ts.tv_sec = t;
943 }
929 944
930 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
931 ts.tv_nsec = do_div(t, 10000000) * 100;
932 ts.tv_sec = t;
933 return ts; 945 return ts;
934} 946}
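
The rewritten cifs_NTtimeToUnix() exists because NT timestamps (100 ns ticks since 1601) that predate the Unix epoch go negative after subtracting NTFS_TIME_OFFSET, and the kernel's 32-bit do_div() only accepts unsigned dividends, hence the negate-divide-negate dance. In ordinary userspace C the same split can be checked with native 64-bit division (the offset constant is the standard 1601-to-1970 conversion, assumed here rather than taken from the headers):

#include <stdio.h>
#include <stdint.h>

#define TICKS_PER_SEC 10000000LL                          /* 100 ns ticks */
#define NTFS_TIME_OFFSET (11644473600LL * TICKS_PER_SEC)  /* 1601 -> 1970 */

struct ts { int64_t tv_sec; long tv_nsec; };

/*
 * Convert an NT timestamp to seconds + nanoseconds, keeping both fields
 * on the same side of zero, just as the patched kernel routine does.
 */
static struct ts nt_to_unix(int64_t ntutc)
{
        int64_t t = ntutc - NTFS_TIME_OFFSET;
        struct ts r;

        if (t < 0) {
                t = -t;
                r.tv_nsec = -(long)((t % TICKS_PER_SEC) * 100);
                r.tv_sec = -(t / TICKS_PER_SEC);
        } else {
                r.tv_nsec = (long)((t % TICKS_PER_SEC) * 100);
                r.tv_sec = t / TICKS_PER_SEC;
        }
        return r;
}

int main(void)
{
        /* one tick before the Unix epoch: prints "0 s, -100 ns" */
        struct ts r = nt_to_unix(NTFS_TIME_OFFSET - 1);

        printf("%lld s, %ld ns\n", (long long)r.tv_sec, r.tv_nsec);
        return 0;
}
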
935 947
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b15862e0f68c..8fd2a95860ba 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -87,8 +87,6 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
87 return; 87 return;
88 88
89 if (dentry) { 89 if (dentry) {
90 int err;
91
92 inode = dentry->d_inode; 90 inode = dentry->d_inode;
93 if (inode) { 91 if (inode) {
94 /* 92 /*
@@ -105,10 +103,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
105 goto out; 103 goto out;
106 } 104 }
107 } 105 }
108 err = d_invalidate(dentry); 106 d_invalidate(dentry);
109 dput(dentry); 107 dput(dentry);
110 if (err)
111 return;
112 } 108 }
113 109
114 /* 110 /*
@@ -243,7 +239,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
243 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, 239 rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
244 OPEN_REPARSE_POINT, &fid, &oplock, NULL, 240 OPEN_REPARSE_POINT, &fid, &oplock, NULL,
245 cifs_sb->local_nls, 241 cifs_sb->local_nls,
246 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 242 cifs_remap(cifs_sb));
247 if (!rc) { 243 if (!rc) {
248 tmpbuffer = kmalloc(maxpath); 244 tmpbuffer = kmalloc(maxpath);
249 rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, 245 rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
@@ -593,11 +589,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
593 /* close and restart search */ 589 /* close and restart search */
594 cifs_dbg(FYI, "search backing up - close and restart search\n"); 590 cifs_dbg(FYI, "search backing up - close and restart search\n");
595 spin_lock(&cifs_file_list_lock); 591 spin_lock(&cifs_file_list_lock);
596 if (!cfile->srch_inf.endOfSearch && !cfile->invalidHandle) { 592 if (server->ops->dir_needs_close(cfile)) {
597 cfile->invalidHandle = true; 593 cfile->invalidHandle = true;
598 spin_unlock(&cifs_file_list_lock); 594 spin_unlock(&cifs_file_list_lock);
599 if (server->ops->close) 595 if (server->ops->close_dir)
600 server->ops->close(xid, tcon, &cfile->fid); 596 server->ops->close_dir(xid, tcon, &cfile->fid);
601 } else 597 } else
602 spin_unlock(&cifs_file_list_lock); 598 spin_unlock(&cifs_file_list_lock);
603 if (cfile->srch_inf.ntwrk_buf_start) { 599 if (cfile->srch_inf.ntwrk_buf_start) {
@@ -708,15 +704,15 @@ static int cifs_filldir(char *find_entry, struct file *file,
708 704
709 if (file_info->srch_inf.unicode) { 705 if (file_info->srch_inf.unicode) {
710 struct nls_table *nlt = cifs_sb->local_nls; 706 struct nls_table *nlt = cifs_sb->local_nls;
707 int map_type;
711 708
709 map_type = cifs_remap(cifs_sb);
712 name.name = scratch_buf; 710 name.name = scratch_buf;
713 name.len = 711 name.len =
714 cifs_from_utf16((char *)name.name, (__le16 *)de.name, 712 cifs_from_utf16((char *)name.name, (__le16 *)de.name,
715 UNICODE_NAME_MAX, 713 UNICODE_NAME_MAX,
716 min_t(size_t, de.namelen, 714 min_t(size_t, de.namelen,
717 (size_t)max_len), nlt, 715 (size_t)max_len), nlt, map_type);
718 cifs_sb->mnt_cifs_flags &
719 CIFS_MOUNT_MAP_SPECIAL_CHR);
720 name.len -= nls_nullsize(nlt); 716 name.len -= nls_nullsize(nlt);
721 } else { 717 } else {
722 name.name = de.name; 718 name.name = de.name;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index e87387dbf39f..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
243 kfree(ses->serverOS); 243 kfree(ses->serverOS);
244 244
245 ses->serverOS = kzalloc(len + 1, GFP_KERNEL); 245 ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
246 if (ses->serverOS) 246 if (ses->serverOS) {
247 strncpy(ses->serverOS, bcc_ptr, len); 247 strncpy(ses->serverOS, bcc_ptr, len);
248 if (strncmp(ses->serverOS, "OS/2", 4) == 0) 248 if (strncmp(ses->serverOS, "OS/2", 4) == 0)
249 cifs_dbg(FYI, "OS/2 server\n"); 249 cifs_dbg(FYI, "OS/2 server\n");
250 }
250 251
251 bcc_ptr += len + 1; 252 bcc_ptr += len + 1;
252 bleft -= len + 1; 253 bleft -= len + 1;
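The added braces matter for correctness, not just style: in the unbraced original, only the strncpy() was guarded by the allocation check, so the strncmp() dereferenced ses->serverOS even when kzalloc() had failed. A minimal illustration (hypothetical values):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            char *os = calloc(1, 16);       /* stands in for ses->serverOS */

            if (os) {
                    strncpy(os, "OS/2", 5);
                    /* Unbraced, this check ran even when os == NULL. */
                    if (strncmp(os, "OS/2", 4) == 0)
                            puts("OS/2 server");
            }
            free(os);
            return 0;
    }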
@@ -520,382 +521,551 @@ select_sectype(struct TCP_Server_Info *server, enum securityEnum requested)
520 } 521 }
521} 522}
522 523
523int 524struct sess_data {
524CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses, 525 unsigned int xid;
525 const struct nls_table *nls_cp) 526 struct cifs_ses *ses;
527 struct nls_table *nls_cp;
528 void (*func)(struct sess_data *);
529 int result;
530
531 /* we will send the SMB in three pieces:
532 * a fixed length beginning part, an optional
533 * SPNEGO blob (which can be zero length), and a
534 * last part which will include the strings
535 * and rest of bcc area. This allows us to avoid
536 * a large buffer 17K allocation
537 */
538 int buf0_type;
539 struct kvec iov[3];
540};
541
542static int
543sess_alloc_buffer(struct sess_data *sess_data, int wct)
526{ 544{
527 int rc = 0; 545 int rc;
528 int wct; 546 struct cifs_ses *ses = sess_data->ses;
529 struct smb_hdr *smb_buf; 547 struct smb_hdr *smb_buf;
530 char *bcc_ptr;
531 char *str_area;
532 SESSION_SETUP_ANDX *pSMB;
533 __u32 capabilities;
534 __u16 count;
535 int resp_buf_type;
536 struct kvec iov[3];
537 enum securityEnum type;
538 __u16 action, bytes_remaining;
539 struct key *spnego_key = NULL;
540 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
541 u16 blob_len;
542 char *ntlmsspblob = NULL;
543 548
544 if (ses == NULL) { 549 rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses,
545 WARN(1, "%s: ses == NULL!", __func__); 550 (void **)&smb_buf);
546 return -EINVAL;
547 }
548 551
549 type = select_sectype(ses->server, ses->sectype); 552 if (rc)
550 cifs_dbg(FYI, "sess setup type %d\n", type); 553 return rc;
551 if (type == Unspecified) { 554
552 cifs_dbg(VFS, 555 sess_data->iov[0].iov_base = (char *)smb_buf;
553 "Unable to select appropriate authentication method!"); 556 sess_data->iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
554 return -EINVAL; 557 /*
558 * This variable will be used to clear the buffer
559 * allocated above in case of any error in the calling function.
560 */
561 sess_data->buf0_type = CIFS_SMALL_BUFFER;
562
563 /* 2000 big enough to fit max user, domain, NOS name etc. */
564 sess_data->iov[2].iov_base = kmalloc(2000, GFP_KERNEL);
565 if (!sess_data->iov[2].iov_base) {
566 rc = -ENOMEM;
567 goto out_free_smb_buf;
555 } 568 }
556 569
557 if (type == RawNTLMSSP) { 570 return 0;
558 /* if memory allocation is successful, caller of this function
559 * frees it.
560 */
561 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
562 if (!ses->ntlmssp)
563 return -ENOMEM;
564 ses->ntlmssp->sesskey_per_smbsess = false;
565 571
572out_free_smb_buf:
573 kfree(smb_buf);
574 sess_data->iov[0].iov_base = NULL;
575 sess_data->iov[0].iov_len = 0;
576 sess_data->buf0_type = CIFS_NO_BUFFER;
577 return rc;
578}
579
580static void
581sess_free_buffer(struct sess_data *sess_data)
582{
583
584 free_rsp_buf(sess_data->buf0_type, sess_data->iov[0].iov_base);
585 sess_data->buf0_type = CIFS_NO_BUFFER;
586 kfree(sess_data->iov[2].iov_base);
587}
588
589static int
590sess_establish_session(struct sess_data *sess_data)
591{
592 struct cifs_ses *ses = sess_data->ses;
593
594 mutex_lock(&ses->server->srv_mutex);
595 if (!ses->server->session_estab) {
596 if (ses->server->sign) {
597 ses->server->session_key.response =
598 kmemdup(ses->auth_key.response,
599 ses->auth_key.len, GFP_KERNEL);
600 if (!ses->server->session_key.response) {
601 mutex_unlock(&ses->server->srv_mutex);
602 return -ENOMEM;
603 }
604 ses->server->session_key.len =
605 ses->auth_key.len;
606 }
607 ses->server->sequence_number = 0x2;
608 ses->server->session_estab = true;
566 } 609 }
610 mutex_unlock(&ses->server->srv_mutex);
567 611
568ssetup_ntlmssp_authenticate: 612 cifs_dbg(FYI, "CIFS session established successfully\n");
569 if (phase == NtLmChallenge) 613 spin_lock(&GlobalMid_Lock);
570 phase = NtLmAuthenticate; /* if ntlmssp, now final phase */ 614 ses->status = CifsGood;
615 ses->need_reconnect = false;
616 spin_unlock(&GlobalMid_Lock);
571 617
572 if (type == LANMAN) { 618 return 0;
573#ifndef CONFIG_CIFS_WEAK_PW_HASH 619}
574 /* LANMAN and plaintext are less secure and off by default.
575 So we make this explicitly be turned on in kconfig (in the
576 build) and turned on at runtime (changed from the default)
577 in proc/fs/cifs or via mount parm. Unfortunately this is
578 needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
579 return -EOPNOTSUPP;
580#endif
581 wct = 10; /* lanman 2 style sessionsetup */
582 } else if ((type == NTLM) || (type == NTLMv2)) {
583 /* For NTLMv2 failures eventually may need to retry NTLM */
584 wct = 13; /* old style NTLM sessionsetup */
585 } else /* same size: negotiate or auth, NTLMSSP or extended security */
586 wct = 12;
587 620
588 rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, 621static int
589 (void **)&smb_buf); 622sess_sendreceive(struct sess_data *sess_data)
590 if (rc) 623{
591 return rc; 624 int rc;
625 struct smb_hdr *smb_buf = (struct smb_hdr *) sess_data->iov[0].iov_base;
626 __u16 count;
627
628 count = sess_data->iov[1].iov_len + sess_data->iov[2].iov_len;
629 smb_buf->smb_buf_length =
630 cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
631 put_bcc(count, smb_buf);
632
633 rc = SendReceive2(sess_data->xid, sess_data->ses,
634 sess_data->iov, 3 /* num_iovecs */,
635 &sess_data->buf0_type,
636 CIFS_LOG_ERROR);
592 637
593 pSMB = (SESSION_SETUP_ANDX *)smb_buf; 638 return rc;
639}
594 640
641/*
642 * LANMAN and plaintext are less secure and off by default.
643 * So we make this explicitly be turned on in kconfig (in the
644 * build) and turned on at runtime (changed from the default)
645 * in proc/fs/cifs or via mount parm. Unfortunately this is
646 * needed for old Win (e.g. Win95), some obscure NAS and OS/2
647 */
648#ifdef CONFIG_CIFS_WEAK_PW_HASH
649static void
650sess_auth_lanman(struct sess_data *sess_data)
651{
652 int rc = 0;
653 struct smb_hdr *smb_buf;
654 SESSION_SETUP_ANDX *pSMB;
655 char *bcc_ptr;
656 struct cifs_ses *ses = sess_data->ses;
657 char lnm_session_key[CIFS_AUTH_RESP_SIZE];
658 __u32 capabilities;
659 __u16 bytes_remaining;
660
661 /* lanman 2 style sessionsetup */
662 /* wct = 10 */
663 rc = sess_alloc_buffer(sess_data, 10);
664 if (rc)
665 goto out;
666
667 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
668 bcc_ptr = sess_data->iov[2].iov_base;
595 capabilities = cifs_ssetup_hdr(ses, pSMB); 669 capabilities = cifs_ssetup_hdr(ses, pSMB);
596 670
597 /* we will send the SMB in three pieces: 671 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
598 a fixed length beginning part, an optional
599 SPNEGO blob (which can be zero length), and a
600 last part which will include the strings
601 and rest of bcc area. This allows us to avoid
602 a large 17K buffer allocation */
603 iov[0].iov_base = (char *)pSMB;
604 iov[0].iov_len = be32_to_cpu(smb_buf->smb_buf_length) + 4;
605
606 /* setting this here allows the code at the end of the function
607 to free the request buffer if there's an error */
608 resp_buf_type = CIFS_SMALL_BUFFER;
609 672
610 /* 2000 big enough to fit max user, domain, NOS name etc. */ 673 /* no capabilities flags in old lanman negotiation */
611 str_area = kmalloc(2000, GFP_KERNEL); 674 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
612 if (str_area == NULL) {
613 rc = -ENOMEM;
614 goto ssetup_exit;
615 }
616 bcc_ptr = str_area;
617 675
618 iov[1].iov_base = NULL; 676 /* Calculate hash with password and copy into bcc_ptr.
619 iov[1].iov_len = 0; 677 * Encryption Key (stored in cryptkey) gets used if the
678 * security mode bit in Negotiate Protocol response states
679 * to use challenge/response method (i.e. Password bit is 1).
680 */
681 rc = calc_lanman_hash(ses->password, ses->server->cryptkey,
682 ses->server->sec_mode & SECMODE_PW_ENCRYPT ?
683 true : false, lnm_session_key);
620 684
621 if (type == LANMAN) { 685 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
622#ifdef CONFIG_CIFS_WEAK_PW_HASH 686 bcc_ptr += CIFS_AUTH_RESP_SIZE;
623 char lnm_session_key[CIFS_AUTH_RESP_SIZE]; 687
688 /*
689 * can not sign if LANMAN negotiated so no need
690 * to calculate signing key? but what if server
691 * changed to do higher than lanman dialect and
692 * we reconnected would we ever calc signing_key?
693 */
624 694
625 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; 695 cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
696 /* Unicode not allowed for LANMAN dialects */
697 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
626 698
627 /* no capabilities flags in old lanman negotiation */ 699 sess_data->iov[2].iov_len = (long) bcc_ptr -
700 (long) sess_data->iov[2].iov_base;
701
702 rc = sess_sendreceive(sess_data);
703 if (rc)
704 goto out;
628 705
629 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); 706 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
707 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
630 708
631 /* Calculate hash with password and copy into bcc_ptr. 709 /* lanman response has a word count of 3 */
632 * Encryption Key (stored in cryptkey) gets used if the 710 if (smb_buf->WordCount != 3) {
633 * security mode bit in Negotiate Protocol response states 711 rc = -EIO;
634 * to use challenge/response method (i.e. Password bit is 1). 712 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
635 */ 713 goto out;
714 }
636 715
637 rc = calc_lanman_hash(ses->password, ses->server->cryptkey, 716 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
638 ses->server->sec_mode & SECMODE_PW_ENCRYPT ? 717 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
639 true : false, lnm_session_key);
640 718
641 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); 719 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
642 bcc_ptr += CIFS_AUTH_RESP_SIZE; 720 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
643 721
644 /* can not sign if LANMAN negotiated so no need 722 bytes_remaining = get_bcc(smb_buf);
645 to calculate signing key? but what if server 723 bcc_ptr = pByteArea(smb_buf);
646 changed to do higher than lanman dialect and 724
647 we reconnected would we ever calc signing_key? */ 725 /* BB check if Unicode and decode strings */
726 if (bytes_remaining == 0) {
727 /* no string area to decode, do nothing */
728 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
729 /* unicode string area must be word-aligned */
730 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
731 ++bcc_ptr;
732 --bytes_remaining;
733 }
734 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
735 sess_data->nls_cp);
736 } else {
737 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
738 sess_data->nls_cp);
739 }
740
741 rc = sess_establish_session(sess_data);
742out:
743 sess_data->result = rc;
744 sess_data->func = NULL;
745 sess_free_buffer(sess_data);
746}
648 747
649 cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n");
650 /* Unicode not allowed for LANMAN dialects */
651 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
652#endif 748#endif
653 } else if (type == NTLM) { 749
654 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); 750static void
655 pSMB->req_no_secext.CaseInsensitivePasswordLength = 751sess_auth_ntlm(struct sess_data *sess_data)
752{
753 int rc = 0;
754 struct smb_hdr *smb_buf;
755 SESSION_SETUP_ANDX *pSMB;
756 char *bcc_ptr;
757 struct cifs_ses *ses = sess_data->ses;
758 __u32 capabilities;
759 __u16 bytes_remaining;
760
761 /* old style NTLM sessionsetup */
762 /* wct = 13 */
763 rc = sess_alloc_buffer(sess_data, 13);
764 if (rc)
765 goto out;
766
767 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
768 bcc_ptr = sess_data->iov[2].iov_base;
769 capabilities = cifs_ssetup_hdr(ses, pSMB);
770
771 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
772 pSMB->req_no_secext.CaseInsensitivePasswordLength =
656 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 773 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
657 pSMB->req_no_secext.CaseSensitivePasswordLength = 774 pSMB->req_no_secext.CaseSensitivePasswordLength =
658 cpu_to_le16(CIFS_AUTH_RESP_SIZE); 775 cpu_to_le16(CIFS_AUTH_RESP_SIZE);
659 776
660 /* calculate ntlm response and session key */ 777 /* calculate ntlm response and session key */
661 rc = setup_ntlm_response(ses, nls_cp); 778 rc = setup_ntlm_response(ses, sess_data->nls_cp);
662 if (rc) { 779 if (rc) {
663 cifs_dbg(VFS, "Error %d during NTLM authentication\n", 780 cifs_dbg(VFS, "Error %d during NTLM authentication\n",
664 rc); 781 rc);
665 goto ssetup_exit; 782 goto out;
666 } 783 }
667 784
668 /* copy ntlm response */ 785 /* copy ntlm response */
669 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, 786 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
670 CIFS_AUTH_RESP_SIZE); 787 CIFS_AUTH_RESP_SIZE);
671 bcc_ptr += CIFS_AUTH_RESP_SIZE; 788 bcc_ptr += CIFS_AUTH_RESP_SIZE;
672 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, 789 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
673 CIFS_AUTH_RESP_SIZE); 790 CIFS_AUTH_RESP_SIZE);
674 bcc_ptr += CIFS_AUTH_RESP_SIZE; 791 bcc_ptr += CIFS_AUTH_RESP_SIZE;
675 792
676 if (ses->capabilities & CAP_UNICODE) { 793 if (ses->capabilities & CAP_UNICODE) {
677 /* unicode strings must be word aligned */ 794 /* unicode strings must be word aligned */
678 if (iov[0].iov_len % 2) { 795 if (sess_data->iov[0].iov_len % 2) {
679 *bcc_ptr = 0; 796 *bcc_ptr = 0;
680 bcc_ptr++; 797 bcc_ptr++;
681 }
682 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
683 } else
684 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
685 } else if (type == NTLMv2) {
686 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
687
688 /* LM2 password would be here if we supported it */
689 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
690
691 /* calculate ntlmv2 response and session key */
692 rc = setup_ntlmv2_rsp(ses, nls_cp);
693 if (rc) {
694 cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n",
695 rc);
696 goto ssetup_exit;
697 } 798 }
698 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, 799 unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
699 ses->auth_key.len - CIFS_SESS_KEY_SIZE); 800 } else {
700 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE; 801 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
701 802 }
702 /* set case sensitive password length after tilen may get
703 * assigned, tilen is 0 otherwise.
704 */
705 pSMB->req_no_secext.CaseSensitivePasswordLength =
706 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
707 803
708 if (ses->capabilities & CAP_UNICODE) {
709 if (iov[0].iov_len % 2) {
710 *bcc_ptr = 0;
711 bcc_ptr++;
712 }
713 unicode_ssetup_strings(&bcc_ptr, ses, nls_cp);
714 } else
715 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
716 } else if (type == Kerberos) {
717#ifdef CONFIG_CIFS_UPCALL
718 struct cifs_spnego_msg *msg;
719 804
720 spnego_key = cifs_get_spnego_key(ses); 805 sess_data->iov[2].iov_len = (long) bcc_ptr -
721 if (IS_ERR(spnego_key)) { 806 (long) sess_data->iov[2].iov_base;
722 rc = PTR_ERR(spnego_key);
723 spnego_key = NULL;
724 goto ssetup_exit;
725 }
726 807
727 msg = spnego_key->payload.data; 808 rc = sess_sendreceive(sess_data);
728 /* check version field to make sure that cifs.upcall is 809 if (rc)
729 sending us a response in an expected form */ 810 goto out;
730 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
731 cifs_dbg(VFS, "incorrect version of cifs.upcall "
732 "expected %d but got %d)",
733 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
734 rc = -EKEYREJECTED;
735 goto ssetup_exit;
736 }
737 811
738 ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len, 812 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
739 GFP_KERNEL); 813 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
740 if (!ses->auth_key.response) {
741 cifs_dbg(VFS,
742 "Kerberos can't allocate (%u bytes) memory",
743 msg->sesskey_len);
744 rc = -ENOMEM;
745 goto ssetup_exit;
746 }
747 ses->auth_key.len = msg->sesskey_len;
748
749 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
750 capabilities |= CAP_EXTENDED_SECURITY;
751 pSMB->req.Capabilities = cpu_to_le32(capabilities);
752 iov[1].iov_base = msg->data + msg->sesskey_len;
753 iov[1].iov_len = msg->secblob_len;
754 pSMB->req.SecurityBlobLength = cpu_to_le16(iov[1].iov_len);
755
756 if (ses->capabilities & CAP_UNICODE) {
757 /* unicode strings must be word aligned */
758 if ((iov[0].iov_len + iov[1].iov_len) % 2) {
759 *bcc_ptr = 0;
760 bcc_ptr++;
761 }
762 unicode_oslm_strings(&bcc_ptr, nls_cp);
763 unicode_domain_string(&bcc_ptr, ses, nls_cp);
764 } else
765 /* BB: is this right? */
766 ascii_ssetup_strings(&bcc_ptr, ses, nls_cp);
767#else /* ! CONFIG_CIFS_UPCALL */
768 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
769 rc = -ENOSYS;
770 goto ssetup_exit;
771#endif /* CONFIG_CIFS_UPCALL */
772 } else if (type == RawNTLMSSP) {
773 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
774 cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
775 rc = -ENOSYS;
776 goto ssetup_exit;
777 }
778 814
779 cifs_dbg(FYI, "ntlmssp session setup phase %d\n", phase); 815 if (smb_buf->WordCount != 3) {
780 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; 816 rc = -EIO;
781 capabilities |= CAP_EXTENDED_SECURITY; 817 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
782 pSMB->req.Capabilities |= cpu_to_le32(capabilities); 818 goto out;
783 switch(phase) { 819 }
784 case NtLmNegotiate:
785 build_ntlmssp_negotiate_blob(
786 pSMB->req.SecurityBlob, ses);
787 iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
788 iov[1].iov_base = pSMB->req.SecurityBlob;
789 pSMB->req.SecurityBlobLength =
790 cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
791 break;
792 case NtLmAuthenticate:
793 /*
794 * 5 is an empirical value, large enough to hold
795 * authenticate message plus max 10 of av pairs, 822 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
796 * domain, user, workstation names, flags, etc.
797 */
798 ntlmsspblob = kzalloc(
799 5*sizeof(struct _AUTHENTICATE_MESSAGE),
800 GFP_KERNEL);
801 if (!ntlmsspblob) {
802 rc = -ENOMEM;
803 goto ssetup_exit;
804 }
805 820
806 rc = build_ntlmssp_auth_blob(ntlmsspblob, 821 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
807 &blob_len, ses, nls_cp); 822 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
808 if (rc) 823
809 goto ssetup_exit; 824 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
810 iov[1].iov_len = blob_len; 825 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
811 iov[1].iov_base = ntlmsspblob; 826
812 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len); 827 bytes_remaining = get_bcc(smb_buf);
813 /* 828 bcc_ptr = pByteArea(smb_buf);
814 * Make sure that we tell the server that we are using 829
815 * the uid that it just gave us back on the response 830 /* BB check if Unicode and decode strings */
816 * (challenge) 831 if (bytes_remaining == 0) {
817 */ 832 /* no string area to decode, do nothing */
818 smb_buf->Uid = ses->Suid; 833 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
819 break; 834 /* unicode string area must be word-aligned */
820 default: 835 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
821 cifs_dbg(VFS, "invalid phase %d\n", phase); 836 ++bcc_ptr;
822 rc = -ENOSYS; 837 --bytes_remaining;
823 goto ssetup_exit;
824 } 838 }
825 /* unicode strings must be word aligned */ 839 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
826 if ((iov[0].iov_len + iov[1].iov_len) % 2) { 840 sess_data->nls_cp);
841 } else {
842 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
843 sess_data->nls_cp);
844 }
845
846 rc = sess_establish_session(sess_data);
847out:
848 sess_data->result = rc;
849 sess_data->func = NULL;
850 sess_free_buffer(sess_data);
851 kfree(ses->auth_key.response);
852 ses->auth_key.response = NULL;
853}
854
855static void
856sess_auth_ntlmv2(struct sess_data *sess_data)
857{
858 int rc = 0;
859 struct smb_hdr *smb_buf;
860 SESSION_SETUP_ANDX *pSMB;
861 char *bcc_ptr;
862 struct cifs_ses *ses = sess_data->ses;
863 __u32 capabilities;
864 __u16 bytes_remaining;
865
866 /* old style NTLMv2 sessionsetup */
867 /* wct = 13 */
868 rc = sess_alloc_buffer(sess_data, 13);
869 if (rc)
870 goto out;
871
872 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
873 bcc_ptr = sess_data->iov[2].iov_base;
874 capabilities = cifs_ssetup_hdr(ses, pSMB);
875
876 pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities);
877
878 /* LM2 password would be here if we supported it */
879 pSMB->req_no_secext.CaseInsensitivePasswordLength = 0;
880
881 /* calculate ntlmv2 response and session key */
882 rc = setup_ntlmv2_rsp(ses, sess_data->nls_cp);
883 if (rc) {
884 cifs_dbg(VFS, "Error %d during NTLMv2 authentication\n", rc);
885 goto out;
886 }
887
888 memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE,
889 ses->auth_key.len - CIFS_SESS_KEY_SIZE);
890 bcc_ptr += ses->auth_key.len - CIFS_SESS_KEY_SIZE;
891
892 /* set case sensitive password length after tilen may get
893 * assigned, tilen is 0 otherwise.
894 */
895 pSMB->req_no_secext.CaseSensitivePasswordLength =
896 cpu_to_le16(ses->auth_key.len - CIFS_SESS_KEY_SIZE);
897
898 if (ses->capabilities & CAP_UNICODE) {
899 if (sess_data->iov[0].iov_len % 2) {
827 *bcc_ptr = 0; 900 *bcc_ptr = 0;
828 bcc_ptr++; 901 bcc_ptr++;
829 } 902 }
830 unicode_oslm_strings(&bcc_ptr, nls_cp); 903 unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
831 } else { 904 } else {
832 cifs_dbg(VFS, "secType %d not supported!\n", type); 905 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
833 rc = -ENOSYS;
834 goto ssetup_exit;
835 } 906 }
836 907
837 iov[2].iov_base = str_area;
838 iov[2].iov_len = (long) bcc_ptr - (long) str_area;
839 908
840 count = iov[1].iov_len + iov[2].iov_len; 909 sess_data->iov[2].iov_len = (long) bcc_ptr -
841 smb_buf->smb_buf_length = 910 (long) sess_data->iov[2].iov_base;
842 cpu_to_be32(be32_to_cpu(smb_buf->smb_buf_length) + count);
843 911
844 put_bcc(count, smb_buf); 912 rc = sess_sendreceive(sess_data);
913 if (rc)
914 goto out;
845 915
846 rc = SendReceive2(xid, ses, iov, 3 /* num_iovecs */, &resp_buf_type, 916 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
847 CIFS_LOG_ERROR); 917 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
848 /* SMB request buf freed in SendReceive2 */
849 918
850 pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; 919 if (smb_buf->WordCount != 3) {
851 smb_buf = (struct smb_hdr *)iov[0].iov_base; 920 rc = -EIO;
921 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
922 goto out;
923 }
852 924
853 if ((type == RawNTLMSSP) && (resp_buf_type != CIFS_NO_BUFFER) && 925 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
854 (smb_buf->Status.CifsError == 926 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
855 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))) { 927
856 if (phase != NtLmNegotiate) { 928 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
857 cifs_dbg(VFS, "Unexpected more processing error\n"); 929 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
858 goto ssetup_exit; 930
931 bytes_remaining = get_bcc(smb_buf);
932 bcc_ptr = pByteArea(smb_buf);
933
934 /* BB check if Unicode and decode strings */
935 if (bytes_remaining == 0) {
936 /* no string area to decode, do nothing */
937 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
938 /* unicode string area must be word-aligned */
939 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
940 ++bcc_ptr;
941 --bytes_remaining;
942 }
943 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
944 sess_data->nls_cp);
945 } else {
946 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
947 sess_data->nls_cp);
948 }
949
950 rc = sess_establish_session(sess_data);
951out:
952 sess_data->result = rc;
953 sess_data->func = NULL;
954 sess_free_buffer(sess_data);
955 kfree(ses->auth_key.response);
956 ses->auth_key.response = NULL;
957}
958
959#ifdef CONFIG_CIFS_UPCALL
960static void
961sess_auth_kerberos(struct sess_data *sess_data)
962{
963 int rc = 0;
964 struct smb_hdr *smb_buf;
965 SESSION_SETUP_ANDX *pSMB;
966 char *bcc_ptr;
967 struct cifs_ses *ses = sess_data->ses;
968 __u32 capabilities;
969 __u16 bytes_remaining;
970 struct key *spnego_key = NULL;
971 struct cifs_spnego_msg *msg;
972 u16 blob_len;
973
974 /* extended security */
975 /* wct = 12 */
976 rc = sess_alloc_buffer(sess_data, 12);
977 if (rc)
978 goto out;
979
980 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
981 bcc_ptr = sess_data->iov[2].iov_base;
982 capabilities = cifs_ssetup_hdr(ses, pSMB);
983
984 spnego_key = cifs_get_spnego_key(ses);
985 if (IS_ERR(spnego_key)) {
986 rc = PTR_ERR(spnego_key);
987 spnego_key = NULL;
988 goto out;
989 }
990
991 msg = spnego_key->payload.data;
992 /*
993 * check version field to make sure that cifs.upcall is
994 * sending us a response in an expected form
995 */
996 if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
997 cifs_dbg(VFS,
998 "incorrect version of cifs.upcall (expected %d but got %d)",
999 CIFS_SPNEGO_UPCALL_VERSION, msg->version);
1000 rc = -EKEYREJECTED;
1001 goto out_put_spnego_key;
1002 }
1003
1004 ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
1005 GFP_KERNEL);
1006 if (!ses->auth_key.response) {
1007 cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
1008 msg->sesskey_len);
1009 rc = -ENOMEM;
1010 goto out_put_spnego_key;
1011 }
1012 ses->auth_key.len = msg->sesskey_len;
1013
1014 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
1015 capabilities |= CAP_EXTENDED_SECURITY;
1016 pSMB->req.Capabilities = cpu_to_le32(capabilities);
1017 sess_data->iov[1].iov_base = msg->data + msg->sesskey_len;
1018 sess_data->iov[1].iov_len = msg->secblob_len;
1019 pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len);
1020
1021 if (ses->capabilities & CAP_UNICODE) {
1022 /* unicode strings must be word aligned */
1023 if ((sess_data->iov[0].iov_len
1024 + sess_data->iov[1].iov_len) % 2) {
1025 *bcc_ptr = 0;
1026 bcc_ptr++;
859 } 1027 }
860 /* NTLMSSP Negotiate sent now processing challenge (response) */ 1028 unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
861 phase = NtLmChallenge; /* process ntlmssp challenge */ 1029 unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp);
862 rc = 0; /* MORE_PROC rc is not an error here, but expected */ 1030 } else {
1031 /* BB: is this right? */
1032 ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp);
863 } 1033 }
1034
1035 sess_data->iov[2].iov_len = (long) bcc_ptr -
1036 (long) sess_data->iov[2].iov_base;
1037
1038 rc = sess_sendreceive(sess_data);
864 if (rc) 1039 if (rc)
865 goto ssetup_exit; 1040 goto out_put_spnego_key;
866 1041
867 if ((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { 1042 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1043 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
1044
1045 if (smb_buf->WordCount != 4) {
868 rc = -EIO; 1046 rc = -EIO;
869 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); 1047 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
870 goto ssetup_exit; 1048 goto out_put_spnego_key;
871 } 1049 }
872 action = le16_to_cpu(pSMB->resp.Action); 1050
873 if (action & GUEST_LOGIN) 1051 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
874 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */ 1052 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
1053
875 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ 1054 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
876 cifs_dbg(FYI, "UID = %llu\n", ses->Suid); 1055 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
877 /* response can have either 3 or 4 word count - Samba sends 3 */ 1056
878 /* and lanman response is 3 */
879 bytes_remaining = get_bcc(smb_buf); 1057 bytes_remaining = get_bcc(smb_buf);
880 bcc_ptr = pByteArea(smb_buf); 1058 bcc_ptr = pByteArea(smb_buf);
881 1059
882 if (smb_buf->WordCount == 4) { 1060 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
883 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); 1061 if (blob_len > bytes_remaining) {
884 if (blob_len > bytes_remaining) { 1062 cifs_dbg(VFS, "bad security blob length %d\n",
885 cifs_dbg(VFS, "bad security blob length %d\n", 1063 blob_len);
886 blob_len); 1064 rc = -EINVAL;
887 rc = -EINVAL; 1065 goto out_put_spnego_key;
888 goto ssetup_exit;
889 }
890 if (phase == NtLmChallenge) {
891 rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
892 /* now goto beginning for ntlmssp authenticate phase */
893 if (rc)
894 goto ssetup_exit;
895 }
896 bcc_ptr += blob_len;
897 bytes_remaining -= blob_len;
898 } 1066 }
1067 bcc_ptr += blob_len;
1068 bytes_remaining -= blob_len;
899 1069
900 /* BB check if Unicode and decode strings */ 1070 /* BB check if Unicode and decode strings */
901 if (bytes_remaining == 0) { 1071 if (bytes_remaining == 0) {
@@ -906,60 +1076,362 @@ ssetup_ntlmssp_authenticate:
906 ++bcc_ptr; 1076 ++bcc_ptr;
907 --bytes_remaining; 1077 --bytes_remaining;
908 } 1078 }
909 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); 1079 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
1080 sess_data->nls_cp);
910 } else { 1081 } else {
911 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, nls_cp); 1082 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
1083 sess_data->nls_cp);
912 } 1084 }
913 1085
914ssetup_exit: 1086 rc = sess_establish_session(sess_data);
915 if (spnego_key) { 1087out_put_spnego_key:
916 key_invalidate(spnego_key); 1088 key_invalidate(spnego_key);
917 key_put(spnego_key); 1089 key_put(spnego_key);
1090out:
1091 sess_data->result = rc;
1092 sess_data->func = NULL;
1093 sess_free_buffer(sess_data);
1094 kfree(ses->auth_key.response);
1095 ses->auth_key.response = NULL;
1096}
1097
1098#endif /* CONFIG_CIFS_UPCALL */
1099
1100/*
1101 * The required kvec buffers have to be allocated before calling this
1102 * function.
1103 */
1104static int
1105_sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
1106{
1107 struct smb_hdr *smb_buf;
1108 SESSION_SETUP_ANDX *pSMB;
1109 struct cifs_ses *ses = sess_data->ses;
1110 __u32 capabilities;
1111 char *bcc_ptr;
1112
1113 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1114 smb_buf = (struct smb_hdr *)pSMB;
1115
1116 capabilities = cifs_ssetup_hdr(ses, pSMB);
1117 if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) {
1118 cifs_dbg(VFS, "NTLMSSP requires Unicode support\n");
1119 return -ENOSYS;
918 } 1120 }
919 kfree(str_area);
920 kfree(ntlmsspblob);
921 ntlmsspblob = NULL;
922 if (resp_buf_type == CIFS_SMALL_BUFFER) {
923 cifs_dbg(FYI, "ssetup freeing small buf %p\n", iov[0].iov_base);
924 cifs_small_buf_release(iov[0].iov_base);
925 } else if (resp_buf_type == CIFS_LARGE_BUFFER)
926 cifs_buf_release(iov[0].iov_base);
927 1121
928 /* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */ 1122 pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
929 if ((phase == NtLmChallenge) && (rc == 0)) 1123 capabilities |= CAP_EXTENDED_SECURITY;
930 goto ssetup_ntlmssp_authenticate; 1124 pSMB->req.Capabilities |= cpu_to_le32(capabilities);
1125
1126 bcc_ptr = sess_data->iov[2].iov_base;
1127 /* unicode strings must be word aligned */
1128 if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) {
1129 *bcc_ptr = 0;
1130 bcc_ptr++;
1131 }
1132 unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp);
1133
1134 sess_data->iov[2].iov_len = (long) bcc_ptr -
1135 (long) sess_data->iov[2].iov_base;
1136
1137 return 0;
1138}
1139
1140static void
1141sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data);
1142
1143static void
1144sess_auth_rawntlmssp_negotiate(struct sess_data *sess_data)
1145{
1146 int rc;
1147 struct smb_hdr *smb_buf;
1148 SESSION_SETUP_ANDX *pSMB;
1149 struct cifs_ses *ses = sess_data->ses;
1150 __u16 bytes_remaining;
1151 char *bcc_ptr;
1152 u16 blob_len;
1153
1154 cifs_dbg(FYI, "rawntlmssp session setup negotiate phase\n");
1155
1156 /*
1157 * if the memory allocation is successful, the caller of this function
1158 * frees it.
1159 */
1160 ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
1161 if (!ses->ntlmssp) {
1162 rc = -ENOMEM;
1163 goto out;
1164 }
1165 ses->ntlmssp->sesskey_per_smbsess = false;
1166
1167 /* wct = 12 */
1168 rc = sess_alloc_buffer(sess_data, 12);
1169 if (rc)
1170 goto out;
1171
1172 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1173
1174 /* Build security blob before we assemble the request */
1175 build_ntlmssp_negotiate_blob(pSMB->req.SecurityBlob, ses);
1176 sess_data->iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
1177 sess_data->iov[1].iov_base = pSMB->req.SecurityBlob;
1178 pSMB->req.SecurityBlobLength = cpu_to_le16(sizeof(NEGOTIATE_MESSAGE));
1179
1180 rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
1181 if (rc)
1182 goto out;
1183
1184 rc = sess_sendreceive(sess_data);
1185
1186 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1187 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
1188
1189 /* If true, rc here is expected and not an error */
1190 if (sess_data->buf0_type != CIFS_NO_BUFFER &&
1191 smb_buf->Status.CifsError ==
1192 cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED))
1193 rc = 0;
1194
1195 if (rc)
1196 goto out;
1197
1198 cifs_dbg(FYI, "rawntlmssp session setup challenge phase\n");
1199
1200 if (smb_buf->WordCount != 4) {
1201 rc = -EIO;
1202 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
1203 goto out;
1204 }
1205
1206 ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */
1207 cifs_dbg(FYI, "UID = %llu\n", ses->Suid);
1208
1209 bytes_remaining = get_bcc(smb_buf);
1210 bcc_ptr = pByteArea(smb_buf);
1211
1212 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
1213 if (blob_len > bytes_remaining) {
1214 cifs_dbg(VFS, "bad security blob length %d\n",
1215 blob_len);
1216 rc = -EINVAL;
1217 goto out;
1218 }
1219
1220 rc = decode_ntlmssp_challenge(bcc_ptr, blob_len, ses);
1221out:
1222 sess_free_buffer(sess_data);
931 1223
932 if (!rc) { 1224 if (!rc) {
933 mutex_lock(&ses->server->srv_mutex); 1225 sess_data->func = sess_auth_rawntlmssp_authenticate;
934 if (!ses->server->session_estab) { 1226 return;
935 if (ses->server->sign) { 1227 }
936 ses->server->session_key.response = 1228
937 kmemdup(ses->auth_key.response, 1229 /* Else error. Cleanup */
938 ses->auth_key.len, GFP_KERNEL); 1230 kfree(ses->auth_key.response);
939 if (!ses->server->session_key.response) { 1231 ses->auth_key.response = NULL;
940 rc = -ENOMEM; 1232 kfree(ses->ntlmssp);
941 mutex_unlock(&ses->server->srv_mutex); 1233 ses->ntlmssp = NULL;
942 goto keycp_exit; 1234
943 } 1235 sess_data->func = NULL;
944 ses->server->session_key.len = 1236 sess_data->result = rc;
945 ses->auth_key.len; 1237}
946 } 1238
947 ses->server->sequence_number = 0x2; 1239static void
948 ses->server->session_estab = true; 1240sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
949 } 1241{
950 mutex_unlock(&ses->server->srv_mutex); 1242 int rc;
1243 struct smb_hdr *smb_buf;
1244 SESSION_SETUP_ANDX *pSMB;
1245 struct cifs_ses *ses = sess_data->ses;
1246 __u16 bytes_remaining;
1247 char *bcc_ptr;
1248 char *ntlmsspblob = NULL;
1249 u16 blob_len;
1250
1251 cifs_dbg(FYI, "rawntlmssp session setup authenticate phase\n");
951 1252
952 cifs_dbg(FYI, "CIFS session established successfully\n"); 1253 /* wct = 12 */
953 spin_lock(&GlobalMid_Lock); 1254 rc = sess_alloc_buffer(sess_data, 12);
954 ses->status = CifsGood; 1255 if (rc)
955 ses->need_reconnect = false; 1256 goto out;
956 spin_unlock(&GlobalMid_Lock); 1257
1258 /* Build security blob before we assemble the request */
1259 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1260 smb_buf = (struct smb_hdr *)pSMB;
1261 /*
1262 * 5 is an empirical value, large enough to hold
1263 * authenticate message plus max 10 of av pairs,
1264 * domain, user, workstation names, flags, etc.
1265 */
1266 ntlmsspblob = kzalloc(5*sizeof(struct _AUTHENTICATE_MESSAGE),
1267 GFP_KERNEL);
1268 if (!ntlmsspblob) {
1269 rc = -ENOMEM;
1270 goto out;
957 } 1271 }
958 1272
959keycp_exit: 1273 rc = build_ntlmssp_auth_blob(ntlmsspblob,
1274 &blob_len, ses, sess_data->nls_cp);
1275 if (rc)
1276 goto out_free_ntlmsspblob;
1277 sess_data->iov[1].iov_len = blob_len;
1278 sess_data->iov[1].iov_base = ntlmsspblob;
1279 pSMB->req.SecurityBlobLength = cpu_to_le16(blob_len);
1280 /*
1281 * Make sure that we tell the server that we are using
1282 * the uid that it just gave us back on the response
1283 * (challenge)
1284 */
1285 smb_buf->Uid = ses->Suid;
1286
1287 rc = _sess_auth_rawntlmssp_assemble_req(sess_data);
1288 if (rc)
1289 goto out_free_ntlmsspblob;
1290
1291 rc = sess_sendreceive(sess_data);
1292 if (rc)
1293 goto out_free_ntlmsspblob;
1294
1295 pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base;
1296 smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base;
1297 if (smb_buf->WordCount != 4) {
1298 rc = -EIO;
1299 cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount);
1300 goto out_free_ntlmsspblob;
1301 }
1302
1303 if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN)
1304 cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? */
1305
1306 bytes_remaining = get_bcc(smb_buf);
1307 bcc_ptr = pByteArea(smb_buf);
1308 blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength);
1309 if (blob_len > bytes_remaining) {
1310 cifs_dbg(VFS, "bad security blob length %d\n",
1311 blob_len);
1312 rc = -EINVAL;
1313 goto out_free_ntlmsspblob;
1314 }
1315 bcc_ptr += blob_len;
1316 bytes_remaining -= blob_len;
1317
1318
1319 /* BB check if Unicode and decode strings */
1320 if (bytes_remaining == 0) {
1321 /* no string area to decode, do nothing */
1322 } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) {
1323 /* unicode string area must be word-aligned */
1324 if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) {
1325 ++bcc_ptr;
1326 --bytes_remaining;
1327 }
1328 decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses,
1329 sess_data->nls_cp);
1330 } else {
1331 decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,
1332 sess_data->nls_cp);
1333 }
1334
1335out_free_ntlmsspblob:
1336 kfree(ntlmsspblob);
1337out:
1338 sess_free_buffer(sess_data);
1339
1340 if (!rc)
1341 rc = sess_establish_session(sess_data);
1342
1343 /* Cleanup */
960 kfree(ses->auth_key.response); 1344 kfree(ses->auth_key.response);
961 ses->auth_key.response = NULL; 1345 ses->auth_key.response = NULL;
962 kfree(ses->ntlmssp); 1346 kfree(ses->ntlmssp);
1347 ses->ntlmssp = NULL;
1348
1349 sess_data->func = NULL;
1350 sess_data->result = rc;
1351}
1352
1353static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data)
1354{
1355 int type;
1356
1357 type = select_sectype(ses->server, ses->sectype);
1358 cifs_dbg(FYI, "sess setup type %d\n", type);
1359 if (type == Unspecified) {
1360 cifs_dbg(VFS,
1361 "Unable to select appropriate authentication method!");
1362 return -EINVAL;
1363 }
1364
1365 switch (type) {
1366 case LANMAN:
1367 /* LANMAN and plaintext are less secure and off by default.
1368 * So we make this explicitly be turned on in kconfig (in the
1369 * build) and turned on at runtime (changed from the default)
1370 * in proc/fs/cifs or via mount parm. Unfortunately this is
1371 * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */
1372#ifdef CONFIG_CIFS_WEAK_PW_HASH
1373 sess_data->func = sess_auth_lanman;
1374 break;
1375#else
1376 return -EOPNOTSUPP;
1377#endif
1378 case NTLM:
1379 sess_data->func = sess_auth_ntlm;
1380 break;
1381 case NTLMv2:
1382 sess_data->func = sess_auth_ntlmv2;
1383 break;
1384 case Kerberos:
1385#ifdef CONFIG_CIFS_UPCALL
1386 sess_data->func = sess_auth_kerberos;
1387 break;
1388#else
1389 cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
1390 return -ENOSYS;
1391 break;
1392#endif /* CONFIG_CIFS_UPCALL */
1393 case RawNTLMSSP:
1394 sess_data->func = sess_auth_rawntlmssp_negotiate;
1395 break;
1396 default:
1397 cifs_dbg(VFS, "secType %d not supported!\n", type);
1398 return -ENOSYS;
1399 }
1400
1401 return 0;
1402}
1403
1404int CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
1405 const struct nls_table *nls_cp)
1406{
1407 int rc = 0;
1408 struct sess_data *sess_data;
1409
1410 if (ses == NULL) {
1411 WARN(1, "%s: ses == NULL!", __func__);
1412 return -EINVAL;
1413 }
1414
1415 sess_data = kzalloc(sizeof(struct sess_data), GFP_KERNEL);
1416 if (!sess_data)
1417 return -ENOMEM;
1418
1419 rc = select_sec(ses, sess_data);
1420 if (rc)
1421 goto out;
1422
1423 sess_data->xid = xid;
1424 sess_data->ses = ses;
1425 sess_data->buf0_type = CIFS_NO_BUFFER;
1426 sess_data->nls_cp = (struct nls_table *) nls_cp;
1427
1428 while (sess_data->func)
1429 sess_data->func(sess_data);
1430
1431 /* Store result before we free sess_data */
1432 rc = sess_data->result;
963 1433
1434out:
1435 kfree(sess_data);
964 return rc; 1436 return rc;
965} 1437}
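The rewrite above replaces one ~400-line function full of sectype branches and a goto-based NTLMSSP retry with a small state machine: each sess_auth_* handler performs one round trip and either clears sess_data->func (done) or points it at the next phase, and CIFS_SessSetup() just drives the loop. A minimal sketch of the pattern (assumption: illustrative states, not the real handlers):

    #include <stdio.h>

    struct sess_state {
            void (*func)(struct sess_state *);
            int result;
    };

    static void state_authenticate(struct sess_state *s)
    {
            puts("authenticate phase");
            s->result = 0;
            s->func = NULL;                 /* terminal state */
    }

    static void state_negotiate(struct sess_state *s)
    {
            puts("negotiate phase");
            s->func = state_authenticate;   /* chain to the next round trip */
    }

    int main(void)
    {
            struct sess_state s = { .func = state_negotiate, .result = -1 };

            while (s.func)
                    s.func(&s);
            return s.result;
    }

This mirrors how sess_auth_rawntlmssp_negotiate hands off to sess_auth_rawntlmssp_authenticate, replacing the old goto ssetup_ntlmssp_authenticate loop.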
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index d1fdfa848703..d2979036a4c7 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -23,6 +23,7 @@
23#include "cifsproto.h" 23#include "cifsproto.h"
24#include "cifs_debug.h" 24#include "cifs_debug.h"
25#include "cifspdu.h" 25#include "cifspdu.h"
26#include "cifs_unicode.h"
26 27
27/* 28/*
28 * An NT cancel request header looks just like the original request except: 29 * An NT cancel request header looks just like the original request except:
@@ -530,13 +531,11 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
530 531
531 rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info, 532 rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info,
532 0 /* not legacy */, cifs_sb->local_nls, 533 0 /* not legacy */, cifs_sb->local_nls,
533 cifs_sb->mnt_cifs_flags & 534 cifs_remap(cifs_sb));
534 CIFS_MOUNT_MAP_SPECIAL_CHR);
535 535
536 if (rc == -EOPNOTSUPP || rc == -EINVAL) 536 if (rc == -EOPNOTSUPP || rc == -EINVAL)
537 rc = SMBQueryInformation(xid, tcon, full_path, file_info, 537 rc = SMBQueryInformation(xid, tcon, full_path, file_info,
538 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 538 cifs_sb->local_nls, cifs_remap(cifs_sb));
539 CIFS_MOUNT_MAP_SPECIAL_CHR);
540 kfree(file_info); 539 kfree(file_info);
541 return rc; 540 return rc;
542} 541}
@@ -552,8 +551,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
552 551
553 /* could do find first instead but this returns more info */ 552 /* could do find first instead but this returns more info */
554 rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, 553 rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */,
555 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 554 cifs_sb->local_nls, cifs_remap(cifs_sb));
556 CIFS_MOUNT_MAP_SPECIAL_CHR);
557 /* 555 /*
558 * BB optimize code so we do not make the above call when server claims 556 * BB optimize code so we do not make the above call when server claims
559 * no NT SMB support and the above call failed at least once - set flag 557 * no NT SMB support and the above call failed at least once - set flag
@@ -562,8 +560,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
562 if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { 560 if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) {
563 rc = SMBQueryInformation(xid, tcon, full_path, data, 561 rc = SMBQueryInformation(xid, tcon, full_path, data,
564 cifs_sb->local_nls, 562 cifs_sb->local_nls,
565 cifs_sb->mnt_cifs_flags & 563 cifs_remap(cifs_sb));
566 CIFS_MOUNT_MAP_SPECIAL_CHR);
567 *adjustTZ = true; 564 *adjustTZ = true;
568 } 565 }
569 566
@@ -586,7 +583,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
586 tmprc = CIFS_open(xid, &oparms, &oplock, NULL); 583 tmprc = CIFS_open(xid, &oparms, &oplock, NULL);
587 if (tmprc == -EOPNOTSUPP) 584 if (tmprc == -EOPNOTSUPP)
588 *symlink = true; 585 *symlink = true;
589 else 586 else if (tmprc == 0)
590 CIFSSMBClose(xid, tcon, fid.netfid); 587 CIFSSMBClose(xid, tcon, fid.netfid);
591 } 588 }
592 589
@@ -611,8 +608,7 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon,
611 */ 608 */
612 return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid, 609 return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid,
613 cifs_sb->local_nls, 610 cifs_sb->local_nls,
614 cifs_sb->mnt_cifs_flags & 611 cifs_remap(cifs_sb));
615 CIFS_MOUNT_MAP_SPECIAL_CHR);
616} 612}
617 613
618static int 614static int
@@ -703,8 +699,7 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path,
703 dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; 699 dosattrs = cifsInode->cifsAttrs|ATTR_READONLY;
704 info.Attributes = cpu_to_le32(dosattrs); 700 info.Attributes = cpu_to_le32(dosattrs);
705 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls, 701 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls,
706 cifs_sb->mnt_cifs_flags & 702 cifs_remap(cifs_sb));
707 CIFS_MOUNT_MAP_SPECIAL_CHR);
708 if (rc == 0) 703 if (rc == 0)
709 cifsInode->cifsAttrs = dosattrs; 704 cifsInode->cifsAttrs = dosattrs;
710} 705}
@@ -720,8 +715,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
720 oparms->create_options, 715 oparms->create_options,
721 &oparms->fid->netfid, oplock, buf, 716 &oparms->fid->netfid, oplock, buf,
722 oparms->cifs_sb->local_nls, 717 oparms->cifs_sb->local_nls,
723 oparms->cifs_sb->mnt_cifs_flags 718 cifs_remap(oparms->cifs_sb));
724 & CIFS_MOUNT_MAP_SPECIAL_CHR);
725 return CIFS_open(xid, oparms, oplock, buf); 719 return CIFS_open(xid, oparms, oplock, buf);
726} 720}
727 721
@@ -749,21 +743,21 @@ cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon,
749} 743}
750 744
751static int 745static int
752cifs_sync_read(const unsigned int xid, struct cifsFileInfo *cfile, 746cifs_sync_read(const unsigned int xid, struct cifs_fid *pfid,
753 struct cifs_io_parms *parms, unsigned int *bytes_read, 747 struct cifs_io_parms *parms, unsigned int *bytes_read,
754 char **buf, int *buf_type) 748 char **buf, int *buf_type)
755{ 749{
756 parms->netfid = cfile->fid.netfid; 750 parms->netfid = pfid->netfid;
757 return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type); 751 return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type);
758} 752}
759 753
760static int 754static int
761cifs_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, 755cifs_sync_write(const unsigned int xid, struct cifs_fid *pfid,
762 struct cifs_io_parms *parms, unsigned int *written, 756 struct cifs_io_parms *parms, unsigned int *written,
763 struct kvec *iov, unsigned long nr_segs) 757 struct kvec *iov, unsigned long nr_segs)
764{ 758{
765 759
766 parms->netfid = cfile->fid.netfid; 760 parms->netfid = pfid->netfid;
767 return CIFSSMBWrite2(xid, parms, written, iov, nr_segs); 761 return CIFSSMBWrite2(xid, parms, written, iov, nr_segs);
768} 762}
769 763
@@ -800,8 +794,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
800 tcon = tlink_tcon(tlink); 794 tcon = tlink_tcon(tlink);
801 795
802 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, 796 rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls,
803 cifs_sb->mnt_cifs_flags & 797 cifs_remap(cifs_sb));
804 CIFS_MOUNT_MAP_SPECIAL_CHR);
805 if (rc == 0) { 798 if (rc == 0) {
806 cinode->cifsAttrs = le32_to_cpu(buf->Attributes); 799 cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
807 goto out; 800 goto out;
@@ -1009,6 +1002,18 @@ cifs_is_read_op(__u32 oplock)
1009 return oplock == OPLOCK_READ; 1002 return oplock == OPLOCK_READ;
1010} 1003}
1011 1004
1005static unsigned int
1006cifs_wp_retry_size(struct inode *inode)
1007{
1008 return CIFS_SB(inode->i_sb)->wsize;
1009}
1010
1011static bool
1012cifs_dir_needs_close(struct cifsFileInfo *cfile)
1013{
1014 return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
1015}
1016
1012struct smb_version_operations smb1_operations = { 1017struct smb_version_operations smb1_operations = {
1013 .send_cancel = send_nt_cancel, 1018 .send_cancel = send_nt_cancel,
1014 .compare_fids = cifs_compare_fids, 1019 .compare_fids = cifs_compare_fids,
@@ -1019,6 +1024,7 @@ struct smb_version_operations smb1_operations = {
1019 .set_credits = cifs_set_credits, 1024 .set_credits = cifs_set_credits,
1020 .get_credits_field = cifs_get_credits_field, 1025 .get_credits_field = cifs_get_credits_field,
1021 .get_credits = cifs_get_credits, 1026 .get_credits = cifs_get_credits,
1027 .wait_mtu_credits = cifs_wait_mtu_credits,
1022 .get_next_mid = cifs_get_next_mid, 1028 .get_next_mid = cifs_get_next_mid,
1023 .read_data_offset = cifs_read_data_offset, 1029 .read_data_offset = cifs_read_data_offset,
1024 .read_data_length = cifs_read_data_length, 1030 .read_data_length = cifs_read_data_length,
@@ -1078,6 +1084,8 @@ struct smb_version_operations smb1_operations = {
1078 .query_mf_symlink = cifs_query_mf_symlink, 1084 .query_mf_symlink = cifs_query_mf_symlink,
1079 .create_mf_symlink = cifs_create_mf_symlink, 1085 .create_mf_symlink = cifs_create_mf_symlink,
1080 .is_read_op = cifs_is_read_op, 1086 .is_read_op = cifs_is_read_op,
1087 .wp_retry_size = cifs_wp_retry_size,
1088 .dir_needs_close = cifs_dir_needs_close,
1081#ifdef CONFIG_CIFS_XATTR 1089#ifdef CONFIG_CIFS_XATTR
1082 .query_all_EAs = CIFSSMBQAllEAs, 1090 .query_all_EAs = CIFSSMBQAllEAs,
1083 .set_EA = CIFSSMBSetEA, 1091 .set_EA = CIFSSMBSetEA,
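The two new callbacks slot into the smb_version_operations vtable, which is how dialect-neutral code such as find_cifs_entry() in readdir.c (see the hunk above) can ask "does this search handle need closing?" without testing SMB1-specific state. A reduced sketch of the dispatch pattern (assumption: illustrative types, not the real cifs structures):

    #include <stdbool.h>
    #include <stdio.h>

    struct search_handle { bool end_of_search; bool invalid; };

    struct proto_ops {
            bool (*dir_needs_close)(struct search_handle *sh);
            void (*close_dir)(struct search_handle *sh);
    };

    static bool smb1_dir_needs_close(struct search_handle *sh)
    {
            return !sh->end_of_search && !sh->invalid;
    }

    static void smb1_close_dir(struct search_handle *sh)
    {
            sh->invalid = true;
            puts("FindClose sent");
    }

    static const struct proto_ops smb1_ops = {
            .dir_needs_close = smb1_dir_needs_close,
            .close_dir       = smb1_close_dir,
    };

    /* Dialect-neutral caller, as in find_cifs_entry(): */
    static void restart_search(const struct proto_ops *ops, struct search_handle *sh)
    {
            if (ops->dir_needs_close(sh) && ops->close_dir)
                    ops->close_dir(sh);
    }

    int main(void)
    {
            struct search_handle sh = { false, false };

            restart_search(&smb1_ops, &sh);
            return 0;
    }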
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 3f17b4550831..45992944e238 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
50 goto out; 50 goto out;
51 } 51 }
52 52
53 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 53 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
54 GFP_KERNEL); 54 GFP_KERNEL);
55 if (smb2_data == NULL) { 55 if (smb2_data == NULL) {
56 rc = -ENOMEM; 56 rc = -ENOMEM;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 84c012a6aba0..899bbc86f73e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -91,7 +91,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
91 case SMB2_OP_SET_EOF: 91 case SMB2_OP_SET_EOF:
92 tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid, 92 tmprc = SMB2_set_eof(xid, tcon, fid.persistent_fid,
93 fid.volatile_fid, current->tgid, 93 fid.volatile_fid, current->tgid,
94 (__le64 *)data); 94 (__le64 *)data, false);
95 break; 95 break;
96 case SMB2_OP_SET_INFO: 96 case SMB2_OP_SET_INFO:
97 tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid, 97 tmprc = SMB2_set_info(xid, tcon, fid.persistent_fid,
@@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
131 *adjust_tz = false; 131 *adjust_tz = false;
132 *symlink = false; 132 *symlink = false;
133 133
134 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 134 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
135 GFP_KERNEL); 135 GFP_KERNEL);
136 if (smb2_data == NULL) 136 if (smb2_data == NULL)
137 return -ENOMEM; 137 return -ENOMEM;
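Both call sites size the query buffer as sizeof(struct) + PATH_MAX * 2 because the variable-length file name at the end of the info block comes back as UTF-16, i.e. up to two bytes per character; switching from MAX_NAME presumably allows for full paths rather than a single name component. A sketch of the sizing (assumption: illustrative struct and constant, not the on-the-wire layout):

    #include <stdlib.h>

    #define PATH_MAX_SKETCH 4096

    struct file_all_info_sketch {
            unsigned long long end_of_file;
            unsigned int file_name_length;  /* bytes of UTF-16 name that follow */
            /* variable-length UTF-16 name follows the fixed part */
    };

    static void *alloc_all_info(void)
    {
            /* worst case: PATH_MAX characters, two bytes each */
            return calloc(1, sizeof(struct file_all_info_sketch)
                             + PATH_MAX_SKETCH * 2);
    }

    int main(void)
    {
            void *buf = alloc_all_info();

            free(buf);
            return 0;
    }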
diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c
index 94bd4fbb13d3..8257a5a97cc0 100644
--- a/fs/cifs/smb2maperror.c
+++ b/fs/cifs/smb2maperror.c
@@ -214,7 +214,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
214 {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"}, 214 {STATUS_BREAKPOINT, -EIO, "STATUS_BREAKPOINT"},
215 {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"}, 215 {STATUS_SINGLE_STEP, -EIO, "STATUS_SINGLE_STEP"},
216 {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"}, 216 {STATUS_BUFFER_OVERFLOW, -EIO, "STATUS_BUFFER_OVERFLOW"},
217 {STATUS_NO_MORE_FILES, -EIO, "STATUS_NO_MORE_FILES"}, 217 {STATUS_NO_MORE_FILES, -ENODATA, "STATUS_NO_MORE_FILES"},
218 {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"}, 218 {STATUS_WAKE_SYSTEM_DEBUGGER, -EIO, "STATUS_WAKE_SYSTEM_DEBUGGER"},
219 {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"}, 219 {STATUS_HANDLES_CLOSED, -EIO, "STATUS_HANDLES_CLOSED"},
220 {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"}, 220 {STATUS_NO_INHERITANCE, -EIO, "STATUS_NO_INHERITANCE"},
@@ -256,6 +256,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
256 {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO, 256 {STATUS_DLL_MIGHT_BE_INCOMPATIBLE, -EIO,
257 "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"}, 257 "STATUS_DLL_MIGHT_BE_INCOMPATIBLE"},
258 {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"}, 258 {STATUS_STOPPED_ON_SYMLINK, -EOPNOTSUPP, "STATUS_STOPPED_ON_SYMLINK"},
259 {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EOPNOTSUPP,
260 "STATUS_REPARSE_NOT_HANDLED"},
259 {STATUS_DEVICE_REQUIRES_CLEANING, -EIO, 261 {STATUS_DEVICE_REQUIRES_CLEANING, -EIO,
260 "STATUS_DEVICE_REQUIRES_CLEANING"}, 262 "STATUS_DEVICE_REQUIRES_CLEANING"},
261 {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"}, 263 {STATUS_DEVICE_DOOR_OPEN, -EIO, "STATUS_DEVICE_DOOR_OPEN"},
@@ -298,7 +300,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
298 {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"}, 300 {STATUS_INVALID_PARAMETER, -EINVAL, "STATUS_INVALID_PARAMETER"},
299 {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"}, 301 {STATUS_NO_SUCH_DEVICE, -ENODEV, "STATUS_NO_SUCH_DEVICE"},
300 {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"}, 302 {STATUS_NO_SUCH_FILE, -ENOENT, "STATUS_NO_SUCH_FILE"},
301 {STATUS_INVALID_DEVICE_REQUEST, -EIO, "STATUS_INVALID_DEVICE_REQUEST"}, 303 {STATUS_INVALID_DEVICE_REQUEST, -EOPNOTSUPP, "STATUS_INVALID_DEVICE_REQUEST"},
302 {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"}, 304 {STATUS_END_OF_FILE, -ENODATA, "STATUS_END_OF_FILE"},
303 {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"}, 305 {STATUS_WRONG_VOLUME, -EIO, "STATUS_WRONG_VOLUME"},
304 {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"}, 306 {STATUS_NO_MEDIA_IN_DEVICE, -EIO, "STATUS_NO_MEDIA_IN_DEVICE"},
@@ -605,7 +607,7 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
605 {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"}, 607 {STATUS_MAPPED_FILE_SIZE_ZERO, -EIO, "STATUS_MAPPED_FILE_SIZE_ZERO"},
606 {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"}, 608 {STATUS_TOO_MANY_OPENED_FILES, -EMFILE, "STATUS_TOO_MANY_OPENED_FILES"},
607 {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"}, 609 {STATUS_CANCELLED, -EIO, "STATUS_CANCELLED"},
608 {STATUS_CANNOT_DELETE, -EIO, "STATUS_CANNOT_DELETE"}, 610 {STATUS_CANNOT_DELETE, -EACCES, "STATUS_CANNOT_DELETE"},
609 {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"}, 611 {STATUS_INVALID_COMPUTER_NAME, -EIO, "STATUS_INVALID_COMPUTER_NAME"},
610 {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"}, 612 {STATUS_FILE_DELETED, -EIO, "STATUS_FILE_DELETED"},
611 {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"}, 613 {STATUS_SPECIAL_ACCOUNT, -EIO, "STATUS_SPECIAL_ACCOUNT"},
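The hunks above retune individual entries in the NT-status-to-errno table (STATUS_NO_MORE_FILES now maps to -ENODATA so directory enumeration can detect end-of-search, STATUS_CANNOT_DELETE to -EACCES, STATUS_INVALID_DEVICE_REQUEST to -EOPNOTSUPP). As a hedged sketch of how such a table is consumed, with illustrative struct and helper names rather than the kernel's, the lookup is a linear scan with -EIO as the fallback:

#include <stddef.h>

struct status_errno_map {
	unsigned int nt_status;		/* NT status code from the server */
	int posix_error;		/* negative errno handed to the VFS */
	const char *name;		/* for logging */
};

/* linear scan; unmapped statuses degrade to the generic -EIO */
static int nt_status_to_errno(const struct status_errno_map *map,
			      size_t nr, unsigned int status)
{
	size_t i;

	for (i = 0; i < nr; i++)
		if (map[i].nt_status == status)
			return map[i].posix_error;
	return -EIO;
}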
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b8021fde987d..1a08a34838fc 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -178,9 +178,24 @@ smb2_check_message(char *buf, unsigned int length)
178 /* Windows 7 server returns 24 bytes more */ 178 /* Windows 7 server returns 24 bytes more */
179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE) 179 if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
180 return 0; 180 return 0;
181 /* server can return one byte more */ 181 /* server can return one byte more due to implied bcc[0] */
182 if (clc_len == 4 + len + 1) 182 if (clc_len == 4 + len + 1)
183 return 0; 183 return 0;
184
185 /*
186 * MacOS server pads after SMB2.1 write response with 3 bytes
 187 * of junk. Other servers match the RFC1001 len to the actual
 188 * SMB2/SMB3 frame length (header + SMB2 response-specific data).
189 * Log the server error (once), but allow it and continue
190 * since the frame is parseable.
191 */
192 if (clc_len < 4 /* RFC1001 header size */ + len) {
193 printk_once(KERN_WARNING
194 "SMB2 server sent bad RFC1001 len %d not %d\n",
195 len, clc_len - 4);
196 return 0;
197 }
198
184 return 1; 199 return 1;
185 } 200 }
186 return 0; 201 return 0;
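Condensing the length checks above into one place (a sketch that omits the Windows 7 oplock-break quirk; clc_len is the length computed from the SMB2 structure fields, len the length the RFC1001 header claimed):

/* returns 1 when the frame should be accepted despite a length mismatch */
static int smb2_frame_len_tolerable(unsigned int clc_len, unsigned int len)
{
	if (clc_len == 4 + len + 1)
		return 1;	/* one byte more: implied bcc[0] */
	if (clc_len < 4 /* RFC1001 header */ + len)
		return 1;	/* overstated RFC1001 len; frame still parseable */
	return 0;		/* genuinely malformed */
}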
@@ -364,6 +379,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
364 int len; 379 int len;
365 const char *start_of_path; 380 const char *start_of_path;
366 __le16 *to; 381 __le16 *to;
382 int map_type;
383
384 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
385 map_type = SFM_MAP_UNI_RSVD;
386 else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
387 map_type = SFU_MAP_UNI_RSVD;
388 else
389 map_type = NO_MAP_UNI_RSVD;
367 390
368 /* Windows doesn't allow paths beginning with \ */ 391 /* Windows doesn't allow paths beginning with \ */
369 if (from[0] == '\\') 392 if (from[0] == '\\')
@@ -371,9 +394,7 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
371 else 394 else
372 start_of_path = from; 395 start_of_path = from;
373 to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, 396 to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len,
374 cifs_sb->local_nls, 397 cifs_sb->local_nls, map_type);
375 cifs_sb->mnt_cifs_flags &
376 CIFS_MOUNT_MAP_SPECIAL_CHR);
377 return to; 398 return to;
378} 399}
379 400
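The mapping choice above distinguishes the newer SFM ("mapposix") remapping of reserved characters from the legacy SFU ("mapchars") scheme. A standalone sketch of the same three-way decision; the flag and map-type constants are the real CIFS names, the helper itself is illustrative:

static int pick_map_type(unsigned int mnt_cifs_flags)
{
	if (mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR)
		return SFM_MAP_UNI_RSVD;	/* SFM-style remap of reserved chars */
	if (mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR)
		return SFU_MAP_UNI_RSVD;	/* legacy SFU-style remap */
	return NO_MAP_UNI_RSVD;			/* pass reserved characters through */
}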
@@ -437,7 +458,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
437 continue; 458 continue;
438 459
439 cifs_dbg(FYI, "found in the open list\n"); 460 cifs_dbg(FYI, "found in the open list\n");
440 cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 461 cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
441 le32_to_cpu(rsp->NewLeaseState)); 462 le32_to_cpu(rsp->NewLeaseState));
442 463
443 server->ops->set_oplock_level(cinode, lease_state, 0, NULL); 464 server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
@@ -467,7 +488,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
467 } 488 }
468 489
469 cifs_dbg(FYI, "found in the pending open list\n"); 490 cifs_dbg(FYI, "found in the pending open list\n");
470 cifs_dbg(FYI, "lease key match, lease break 0x%d\n", 491 cifs_dbg(FYI, "lease key match, lease break 0x%x\n",
471 le32_to_cpu(rsp->NewLeaseState)); 492 le32_to_cpu(rsp->NewLeaseState));
472 493
473 open->oplock = lease_state; 494 open->oplock = lease_state;
@@ -546,7 +567,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
546 return false; 567 return false;
547 } 568 }
548 569
549 cifs_dbg(FYI, "oplock level 0x%d\n", rsp->OplockLevel); 570 cifs_dbg(FYI, "oplock level 0x%x\n", rsp->OplockLevel);
550 571
551 /* look up tcon based on tid & uid */ 572 /* look up tcon based on tid & uid */
552 spin_lock(&cifs_tcp_ses_lock); 573 spin_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 787844bde384..c5f521bcdee2 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/pagemap.h> 20#include <linux/pagemap.h>
21#include <linux/vfs.h> 21#include <linux/vfs.h>
22#include <linux/falloc.h>
22#include "cifsglob.h" 23#include "cifsglob.h"
23#include "smb2pdu.h" 24#include "smb2pdu.h"
24#include "smb2proto.h" 25#include "smb2proto.h"
@@ -112,6 +113,53 @@ smb2_get_credits(struct mid_q_entry *mid)
112 return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest); 113 return le16_to_cpu(((struct smb2_hdr *)mid->resp_buf)->CreditRequest);
113} 114}
114 115
116static int
117smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
118 unsigned int *num, unsigned int *credits)
119{
120 int rc = 0;
121 unsigned int scredits;
122
123 spin_lock(&server->req_lock);
124 while (1) {
125 if (server->credits <= 0) {
126 spin_unlock(&server->req_lock);
127 cifs_num_waiters_inc(server);
128 rc = wait_event_killable(server->request_q,
129 has_credits(server, &server->credits));
130 cifs_num_waiters_dec(server);
131 if (rc)
132 return rc;
133 spin_lock(&server->req_lock);
134 } else {
135 if (server->tcpStatus == CifsExiting) {
136 spin_unlock(&server->req_lock);
137 return -ENOENT;
138 }
139
140 scredits = server->credits;
 141 /* using the last credit could deadlock with a handle reopen */
142 if (scredits == 1) {
143 *num = SMB2_MAX_BUFFER_SIZE;
144 *credits = 0;
145 break;
146 }
147
148 /* leave one credit for a possible reopen */
149 scredits--;
150 *num = min_t(unsigned int, size,
151 scredits * SMB2_MAX_BUFFER_SIZE);
152
153 *credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
154 server->credits -= *credits;
155 server->in_flight++;
156 break;
157 }
158 }
159 spin_unlock(&server->req_lock);
160 return rc;
161}
162
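The credit split in smb2_wait_mtu_credits can be restated in isolation. Assuming the kernel's min_t and DIV_ROUND_UP and a credit unit of SMB2_MAX_BUFFER_SIZE bytes, a sketch of the arithmetic once credits are available:

static void mtu_credit_split(unsigned int free_credits, unsigned int size,
			     unsigned int *num, unsigned int *credits)
{
	if (free_credits == 1) {
		/* taking the last credit could deadlock a handle reopen */
		*num = SMB2_MAX_BUFFER_SIZE;
		*credits = 0;
		return;
	}
	free_credits--;		/* always leave one credit for a possible reopen */
	*num = min_t(unsigned int, size, free_credits * SMB2_MAX_BUFFER_SIZE);
	*credits = DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE);
}

So a 1 MiB request against 20 free credits gets num = 1 MiB and credits = 16, leaving four credits (one of them the reopen reserve) for other callers.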
115static __u64 163static __u64
116smb2_get_next_mid(struct TCP_Server_Info *server) 164smb2_get_next_mid(struct TCP_Server_Info *server)
117{ 165{
@@ -182,8 +230,9 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
182 /* start with specified wsize, or default */ 230 /* start with specified wsize, or default */
183 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE; 231 wsize = volume_info->wsize ? volume_info->wsize : CIFS_DEFAULT_IOSIZE;
184 wsize = min_t(unsigned int, wsize, server->max_write); 232 wsize = min_t(unsigned int, wsize, server->max_write);
185 /* set it to the maximum buffer size value we can send with 1 credit */ 233
186 wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); 234 if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
235 wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE);
187 236
188 return wsize; 237 return wsize;
189} 238}
@@ -197,8 +246,9 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
197 /* start with specified rsize, or default */ 246 /* start with specified rsize, or default */
198 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE; 247 rsize = volume_info->rsize ? volume_info->rsize : CIFS_DEFAULT_IOSIZE;
199 rsize = min_t(unsigned int, rsize, server->max_read); 248 rsize = min_t(unsigned int, rsize, server->max_read);
200 /* set it to the maximum buffer size value we can send with 1 credit */ 249
201 rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); 250 if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
251 rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE);
202 252
203 return rsize; 253 return rsize;
204} 254}
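Both negotiate helpers now apply the one-credit cap only when the server did not advertise large MTU support; with SMB2_GLOBAL_CAP_LARGE_MTU the multi-credit path above supplies the extra credits instead. The shared shape, as a sketch:

static unsigned int negotiate_io_size(unsigned int requested,
				      unsigned int server_max,
				      unsigned int capabilities)
{
	unsigned int size = min_t(unsigned int, requested, server_max);

	/* without large MTU, one request carries at most one credit's worth */
	if (!(capabilities & SMB2_GLOBAL_CAP_LARGE_MTU))
		size = min_t(unsigned int, size, SMB2_MAX_BUFFER_SIZE);
	return size;
}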
@@ -215,15 +265,18 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
215 FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, 265 FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
216 NULL /* no data input */, 0 /* no data input */, 266 NULL /* no data input */, 0 /* no data input */,
217 (char **)&out_buf, &ret_data_len); 267 (char **)&out_buf, &ret_data_len);
218 268 if (rc != 0)
219 if ((rc == 0) && (ret_data_len > 0)) { 269 cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
270 else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) {
271 cifs_dbg(VFS, "server returned bad net interface info buf\n");
272 rc = -EINVAL;
273 } else {
220 /* Dump info on first interface */ 274 /* Dump info on first interface */
221 cifs_dbg(FYI, "Adapter Capability 0x%x\t", 275 cifs_dbg(FYI, "Adapter Capability 0x%x\t",
222 le32_to_cpu(out_buf->Capability)); 276 le32_to_cpu(out_buf->Capability));
223 cifs_dbg(FYI, "Link Speed %lld\n", 277 cifs_dbg(FYI, "Link Speed %lld\n",
224 le64_to_cpu(out_buf->LinkSpeed)); 278 le64_to_cpu(out_buf->LinkSpeed));
225 } else 279 }
226 cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
227 280
228 return rc; 281 return rc;
229} 282}
@@ -339,7 +392,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
339 int rc; 392 int rc;
340 struct smb2_file_all_info *smb2_data; 393 struct smb2_file_all_info *smb2_data;
341 394
342 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 395 smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
343 GFP_KERNEL); 396 GFP_KERNEL);
344 if (smb2_data == NULL) 397 if (smb2_data == NULL)
345 return -ENOMEM; 398 return -ENOMEM;
@@ -661,33 +714,94 @@ smb2_read_data_length(char *buf)
661 714
662 715
663static int 716static int
664smb2_sync_read(const unsigned int xid, struct cifsFileInfo *cfile, 717smb2_sync_read(const unsigned int xid, struct cifs_fid *pfid,
665 struct cifs_io_parms *parms, unsigned int *bytes_read, 718 struct cifs_io_parms *parms, unsigned int *bytes_read,
666 char **buf, int *buf_type) 719 char **buf, int *buf_type)
667{ 720{
668 parms->persistent_fid = cfile->fid.persistent_fid; 721 parms->persistent_fid = pfid->persistent_fid;
669 parms->volatile_fid = cfile->fid.volatile_fid; 722 parms->volatile_fid = pfid->volatile_fid;
670 return SMB2_read(xid, parms, bytes_read, buf, buf_type); 723 return SMB2_read(xid, parms, bytes_read, buf, buf_type);
671} 724}
672 725
673static int 726static int
674smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, 727smb2_sync_write(const unsigned int xid, struct cifs_fid *pfid,
675 struct cifs_io_parms *parms, unsigned int *written, 728 struct cifs_io_parms *parms, unsigned int *written,
676 struct kvec *iov, unsigned long nr_segs) 729 struct kvec *iov, unsigned long nr_segs)
677{ 730{
678 731
679 parms->persistent_fid = cfile->fid.persistent_fid; 732 parms->persistent_fid = pfid->persistent_fid;
680 parms->volatile_fid = cfile->fid.volatile_fid; 733 parms->volatile_fid = pfid->volatile_fid;
681 return SMB2_write(xid, parms, written, iov, nr_segs); 734 return SMB2_write(xid, parms, written, iov, nr_segs);
682} 735}
683 736
737/* Set or clear the SPARSE_FILE attribute based on value passed in setsparse */
738static bool smb2_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
739 struct cifsFileInfo *cfile, struct inode *inode, __u8 setsparse)
740{
741 struct cifsInodeInfo *cifsi;
742 int rc;
743
744 cifsi = CIFS_I(inode);
745
746 /* if file already sparse don't bother setting sparse again */
747 if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && setsparse)
748 return true; /* already sparse */
749
750 if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) && !setsparse)
751 return true; /* already not sparse */
752
753 /*
 754 * Can't check for sparse support on the share the usual way, via the
 755 * FS attribute info (FILE_SUPPORTS_SPARSE_FILES), since the Samba
 756 * server doesn't set that flag on the share yet still supports the
 757 * set-sparse FSCTL and reports sparseness correctly in the file
 758 * attributes. If setting sparse fails, though, we mark the server
 759 * as not supporting sparse files for this share, to avoid repeatedly
 760 * sending the unsupported fsctl to the server if the file is
 761 * repeatedly extended.
762 */
763 if (tcon->broken_sparse_sup)
764 return false;
765
766 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
767 cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
 768 true /* is_fsctl */, &setsparse, 1, NULL, NULL);
769 if (rc) {
770 tcon->broken_sparse_sup = true;
771 cifs_dbg(FYI, "set sparse rc = %d\n", rc);
772 return false;
773 }
774
775 if (setsparse)
776 cifsi->cifsAttrs |= FILE_ATTRIBUTE_SPARSE_FILE;
777 else
 778 cifsi->cifsAttrs &= ~FILE_ATTRIBUTE_SPARSE_FILE;
779
780 return true;
781}
782
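On the wire FSCTL_SET_SPARSE carries a single optional boolean byte, which is why smb2_set_sparse passes &setsparse with length 1. A minimal sketch of that call, assuming the SMB2_ioctl signature used elsewhere in this patch (helper name illustrative):

static int send_set_sparse(const unsigned int xid, struct cifs_tcon *tcon,
			   struct cifs_fid *fid, bool sparse)
{
	__u8 byte = sparse ? 1 : 0;	/* 1 = mark sparse, 0 = clear */

	return SMB2_ioctl(xid, tcon, fid->persistent_fid, fid->volatile_fid,
			  FSCTL_SET_SPARSE, true /* is_fsctl */,
			  (char *)&byte, 1, NULL, NULL);
}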
684static int 783static int
685smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, 784smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
686 struct cifsFileInfo *cfile, __u64 size, bool set_alloc) 785 struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
687{ 786{
688 __le64 eof = cpu_to_le64(size); 787 __le64 eof = cpu_to_le64(size);
788 struct inode *inode;
789
790 /*
 791 * If extending the file by more than one page, make it sparse; many
 792 * Linux filesystems make files sparse by default when extending via ftruncate.
793 */
794 inode = cfile->dentry->d_inode;
795
796 if (!set_alloc && (size > inode->i_size + 8192)) {
797 __u8 set_sparse = 1;
798
799 /* whether set sparse succeeds or not, extend the file */
800 smb2_set_sparse(xid, tcon, cfile, inode, set_sparse);
801 }
802
689 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, 803 return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
690 cfile->fid.volatile_fid, cfile->pid, &eof); 804 cfile->fid.volatile_fid, cfile->pid, &eof, false);
691} 805}
692 806
693static int 807static int
@@ -904,6 +1018,105 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
904 return rc; 1018 return rc;
905} 1019}
906 1020
1021static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
1022 loff_t offset, loff_t len, bool keep_size)
1023{
1024 struct inode *inode;
1025 struct cifsInodeInfo *cifsi;
1026 struct cifsFileInfo *cfile = file->private_data;
1027 struct file_zero_data_information fsctl_buf;
1028 long rc;
1029 unsigned int xid;
1030
1031 xid = get_xid();
1032
1033 inode = cfile->dentry->d_inode;
1034 cifsi = CIFS_I(inode);
1035
 1036 /* if the file is not oplocked we can't be sure whether this extends the size */
1037 if (!CIFS_CACHE_READ(cifsi))
 1038 if (!keep_size)
1039 return -EOPNOTSUPP;
1040
1041 /*
 1042 * Must check whether the file is sparse, since fallocate -z (zero
 1043 * range) assumes non-sparse allocation
1044 */
1045 if (!(cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE))
1046 return -EOPNOTSUPP;
1047
1048 /*
 1049 * Need to make sure we are not asked to extend the file, since the SMB3
 1050 * fsctl does not change the file size. In the future we could change
 1051 * this to zero the first part of the range and then set the file size,
 1052 * which for a non-sparse file would zero the newly extended range.
1053 */
 1054 if (!keep_size)
1055 if (i_size_read(inode) < offset + len)
1056 return -EOPNOTSUPP;
1057
 1058 cifs_dbg(FYI, "offset %lld len %lld\n", offset, len);
1059
1060 fsctl_buf.FileOffset = cpu_to_le64(offset);
1061 fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
1062
1063 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
1064 cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
 1065 true /* is_fsctl */, (char *)&fsctl_buf,
1066 sizeof(struct file_zero_data_information), NULL, NULL);
1067 free_xid(xid);
1068 return rc;
1069}
1070
1071static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
1072 loff_t offset, loff_t len)
1073{
1074 struct inode *inode;
1075 struct cifsInodeInfo *cifsi;
1076 struct cifsFileInfo *cfile = file->private_data;
1077 struct file_zero_data_information fsctl_buf;
1078 long rc;
1079 unsigned int xid;
1080 __u8 set_sparse = 1;
1081
1082 xid = get_xid();
1083
1084 inode = cfile->dentry->d_inode;
1085 cifsi = CIFS_I(inode);
1086
1087 /* Need to make file sparse, if not already, before freeing range. */
1088 /* Consider adding equivalent for compressed since it could also work */
1089 if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse))
1090 return -EOPNOTSUPP;
1091
 1092 cifs_dbg(FYI, "offset %lld len %lld\n", offset, len);
1093
1094 fsctl_buf.FileOffset = cpu_to_le64(offset);
1095 fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
1096
1097 rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
1098 cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
 1099 true /* is_fsctl */, (char *)&fsctl_buf,
1100 sizeof(struct file_zero_data_information), NULL, NULL);
1101 free_xid(xid);
1102 return rc;
1103}
1104
1105static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode,
1106 loff_t off, loff_t len)
1107{
1108 /* KEEP_SIZE already checked for by do_fallocate */
1109 if (mode & FALLOC_FL_PUNCH_HOLE)
1110 return smb3_punch_hole(file, tcon, off, len);
1111 else if (mode & FALLOC_FL_ZERO_RANGE) {
1112 if (mode & FALLOC_FL_KEEP_SIZE)
1113 return smb3_zero_range(file, tcon, off, len, true);
1114 return smb3_zero_range(file, tcon, off, len, false);
1115 }
1116
1117 return -EOPNOTSUPP;
1118}
1119
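From userspace the new dispatch is reached through fallocate(2); punch-hole requires KEEP_SIZE (enforced by the VFS), while zero-range is attempted with or without it. A hedged usage example (Linux-specific, needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

/* free the byte range [off, off+len) without changing the file size */
int punch_hole(int fd, off_t off, off_t len)
{
	return fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			 off, len);
}

/* zero the range in place; per the code above, non-sparse SMB3 files
 * get -EOPNOTSUPP */
int zero_range(int fd, off_t off, off_t len)
{
	return fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
			 off, len);
}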
907static void 1120static void
908smb2_downgrade_oplock(struct TCP_Server_Info *server, 1121smb2_downgrade_oplock(struct TCP_Server_Info *server,
909 struct cifsInodeInfo *cinode, bool set_level2) 1122 struct cifsInodeInfo *cinode, bool set_level2)
@@ -1104,6 +1317,19 @@ smb3_parse_lease_buf(void *buf, unsigned int *epoch)
1104 return le32_to_cpu(lc->lcontext.LeaseState); 1317 return le32_to_cpu(lc->lcontext.LeaseState);
1105} 1318}
1106 1319
1320static unsigned int
1321smb2_wp_retry_size(struct inode *inode)
1322{
1323 return min_t(unsigned int, CIFS_SB(inode->i_sb)->wsize,
1324 SMB2_MAX_BUFFER_SIZE);
1325}
1326
1327static bool
1328smb2_dir_needs_close(struct cifsFileInfo *cfile)
1329{
1330 return !cfile->invalidHandle;
1331}
1332
1107struct smb_version_operations smb20_operations = { 1333struct smb_version_operations smb20_operations = {
1108 .compare_fids = smb2_compare_fids, 1334 .compare_fids = smb2_compare_fids,
1109 .setup_request = smb2_setup_request, 1335 .setup_request = smb2_setup_request,
@@ -1113,6 +1339,7 @@ struct smb_version_operations smb20_operations = {
1113 .set_credits = smb2_set_credits, 1339 .set_credits = smb2_set_credits,
1114 .get_credits_field = smb2_get_credits_field, 1340 .get_credits_field = smb2_get_credits_field,
1115 .get_credits = smb2_get_credits, 1341 .get_credits = smb2_get_credits,
1342 .wait_mtu_credits = cifs_wait_mtu_credits,
1116 .get_next_mid = smb2_get_next_mid, 1343 .get_next_mid = smb2_get_next_mid,
1117 .read_data_offset = smb2_read_data_offset, 1344 .read_data_offset = smb2_read_data_offset,
1118 .read_data_length = smb2_read_data_length, 1345 .read_data_length = smb2_read_data_length,
@@ -1177,6 +1404,8 @@ struct smb_version_operations smb20_operations = {
1177 .create_lease_buf = smb2_create_lease_buf, 1404 .create_lease_buf = smb2_create_lease_buf,
1178 .parse_lease_buf = smb2_parse_lease_buf, 1405 .parse_lease_buf = smb2_parse_lease_buf,
1179 .clone_range = smb2_clone_range, 1406 .clone_range = smb2_clone_range,
1407 .wp_retry_size = smb2_wp_retry_size,
1408 .dir_needs_close = smb2_dir_needs_close,
1180}; 1409};
1181 1410
1182struct smb_version_operations smb21_operations = { 1411struct smb_version_operations smb21_operations = {
@@ -1188,6 +1417,7 @@ struct smb_version_operations smb21_operations = {
1188 .set_credits = smb2_set_credits, 1417 .set_credits = smb2_set_credits,
1189 .get_credits_field = smb2_get_credits_field, 1418 .get_credits_field = smb2_get_credits_field,
1190 .get_credits = smb2_get_credits, 1419 .get_credits = smb2_get_credits,
1420 .wait_mtu_credits = smb2_wait_mtu_credits,
1191 .get_next_mid = smb2_get_next_mid, 1421 .get_next_mid = smb2_get_next_mid,
1192 .read_data_offset = smb2_read_data_offset, 1422 .read_data_offset = smb2_read_data_offset,
1193 .read_data_length = smb2_read_data_length, 1423 .read_data_length = smb2_read_data_length,
@@ -1225,6 +1455,8 @@ struct smb_version_operations smb21_operations = {
1225 .rename = smb2_rename_path, 1455 .rename = smb2_rename_path,
1226 .create_hardlink = smb2_create_hardlink, 1456 .create_hardlink = smb2_create_hardlink,
1227 .query_symlink = smb2_query_symlink, 1457 .query_symlink = smb2_query_symlink,
1458 .query_mf_symlink = smb3_query_mf_symlink,
1459 .create_mf_symlink = smb3_create_mf_symlink,
1228 .open = smb2_open_file, 1460 .open = smb2_open_file,
1229 .set_fid = smb2_set_fid, 1461 .set_fid = smb2_set_fid,
1230 .close = smb2_close_file, 1462 .close = smb2_close_file,
@@ -1252,6 +1484,8 @@ struct smb_version_operations smb21_operations = {
1252 .create_lease_buf = smb2_create_lease_buf, 1484 .create_lease_buf = smb2_create_lease_buf,
1253 .parse_lease_buf = smb2_parse_lease_buf, 1485 .parse_lease_buf = smb2_parse_lease_buf,
1254 .clone_range = smb2_clone_range, 1486 .clone_range = smb2_clone_range,
1487 .wp_retry_size = smb2_wp_retry_size,
1488 .dir_needs_close = smb2_dir_needs_close,
1255}; 1489};
1256 1490
1257struct smb_version_operations smb30_operations = { 1491struct smb_version_operations smb30_operations = {
@@ -1263,6 +1497,7 @@ struct smb_version_operations smb30_operations = {
1263 .set_credits = smb2_set_credits, 1497 .set_credits = smb2_set_credits,
1264 .get_credits_field = smb2_get_credits_field, 1498 .get_credits_field = smb2_get_credits_field,
1265 .get_credits = smb2_get_credits, 1499 .get_credits = smb2_get_credits,
1500 .wait_mtu_credits = smb2_wait_mtu_credits,
1266 .get_next_mid = smb2_get_next_mid, 1501 .get_next_mid = smb2_get_next_mid,
1267 .read_data_offset = smb2_read_data_offset, 1502 .read_data_offset = smb2_read_data_offset,
1268 .read_data_length = smb2_read_data_length, 1503 .read_data_length = smb2_read_data_length,
@@ -1301,6 +1536,8 @@ struct smb_version_operations smb30_operations = {
1301 .rename = smb2_rename_path, 1536 .rename = smb2_rename_path,
1302 .create_hardlink = smb2_create_hardlink, 1537 .create_hardlink = smb2_create_hardlink,
1303 .query_symlink = smb2_query_symlink, 1538 .query_symlink = smb2_query_symlink,
1539 .query_mf_symlink = smb3_query_mf_symlink,
1540 .create_mf_symlink = smb3_create_mf_symlink,
1304 .open = smb2_open_file, 1541 .open = smb2_open_file,
1305 .set_fid = smb2_set_fid, 1542 .set_fid = smb2_set_fid,
1306 .close = smb2_close_file, 1543 .close = smb2_close_file,
@@ -1330,6 +1567,9 @@ struct smb_version_operations smb30_operations = {
1330 .parse_lease_buf = smb3_parse_lease_buf, 1567 .parse_lease_buf = smb3_parse_lease_buf,
1331 .clone_range = smb2_clone_range, 1568 .clone_range = smb2_clone_range,
1332 .validate_negotiate = smb3_validate_negotiate, 1569 .validate_negotiate = smb3_validate_negotiate,
1570 .wp_retry_size = smb2_wp_retry_size,
1571 .dir_needs_close = smb2_dir_needs_close,
1572 .fallocate = smb3_fallocate,
1333}; 1573};
1334 1574
1335struct smb_version_values smb20_values = { 1575struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index b0b260dbb19d..8f1672bb82d5 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -108,7 +108,6 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
108 if (!tcon) 108 if (!tcon)
109 goto out; 109 goto out;
110 110
111 /* BB FIXME when we do write > 64K add +1 for every 64K in req or rsp */
112 /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ 111 /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */
113 /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ 112 /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */
114 if ((tcon->ses) && 113 if ((tcon->ses) &&
@@ -245,10 +244,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
245 if (rc) 244 if (rc)
246 goto out; 245 goto out;
247 atomic_inc(&tconInfoReconnectCount); 246 atomic_inc(&tconInfoReconnectCount);
248 /*
249 * BB FIXME add code to check if wsize needs update due to negotiated
250 * smb buffer size shrinking.
251 */
252out: 247out:
253 /* 248 /*
254 * Check if handle based operation so we know whether we can continue 249 * Check if handle based operation so we know whether we can continue
@@ -309,16 +304,6 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon,
309 return rc; 304 return rc;
310} 305}
311 306
312static void
313free_rsp_buf(int resp_buftype, void *rsp)
314{
315 if (resp_buftype == CIFS_SMALL_BUFFER)
316 cifs_small_buf_release(rsp);
317 else if (resp_buftype == CIFS_LARGE_BUFFER)
318 cifs_buf_release(rsp);
319}
320
321
322/* 307/*
323 * 308 *
324 * SMB2 Worker functions follow: 309 * SMB2 Worker functions follow:
@@ -545,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
545 struct smb2_sess_setup_rsp *rsp = NULL; 530 struct smb2_sess_setup_rsp *rsp = NULL;
546 struct kvec iov[2]; 531 struct kvec iov[2];
547 int rc = 0; 532 int rc = 0;
548 int resp_buftype; 533 int resp_buftype = CIFS_NO_BUFFER;
549 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ 534 __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
550 struct TCP_Server_Info *server = ses->server; 535 struct TCP_Server_Info *server = ses->server;
551 u16 blob_length = 0; 536 u16 blob_length = 0;
@@ -922,7 +907,8 @@ tcon_exit:
922tcon_error_exit: 907tcon_error_exit:
923 if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) { 908 if (rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) {
924 cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); 909 cifs_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree);
925 tcon->bad_network_name = true; 910 if (tcon)
911 tcon->bad_network_name = true;
926 } 912 }
927 goto tcon_exit; 913 goto tcon_exit;
928} 914}
@@ -1112,6 +1098,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
1112 1098
1113 if (oparms->create_options & CREATE_OPTION_READONLY) 1099 if (oparms->create_options & CREATE_OPTION_READONLY)
1114 file_attributes |= ATTR_READONLY; 1100 file_attributes |= ATTR_READONLY;
1101 if (oparms->create_options & CREATE_OPTION_SPECIAL)
1102 file_attributes |= ATTR_SYSTEM;
1115 1103
1116 req->ImpersonationLevel = IL_IMPERSONATION; 1104 req->ImpersonationLevel = IL_IMPERSONATION;
1117 req->DesiredAccess = cpu_to_le32(oparms->desired_access); 1105 req->DesiredAccess = cpu_to_le32(oparms->desired_access);
@@ -1239,7 +1227,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1239 1227
1240 cifs_dbg(FYI, "SMB2 IOCTL\n"); 1228 cifs_dbg(FYI, "SMB2 IOCTL\n");
1241 1229
1242 *out_data = NULL; 1230 if (out_data != NULL)
1231 *out_data = NULL;
1232
1243 /* zero out returned data len, in case of error */ 1233 /* zero out returned data len, in case of error */
1244 if (plen) 1234 if (plen)
1245 *plen = 0; 1235 *plen = 0;
@@ -1415,8 +1405,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
1415 rsp = (struct smb2_close_rsp *)iov[0].iov_base; 1405 rsp = (struct smb2_close_rsp *)iov[0].iov_base;
1416 1406
1417 if (rc != 0) { 1407 if (rc != 0) {
1418 if (tcon) 1408 cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
1419 cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
1420 goto close_exit; 1409 goto close_exit;
1421 } 1410 }
1422 1411
@@ -1545,7 +1534,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
1545{ 1534{
1546 return query_info(xid, tcon, persistent_fid, volatile_fid, 1535 return query_info(xid, tcon, persistent_fid, volatile_fid,
1547 FILE_ALL_INFORMATION, 1536 FILE_ALL_INFORMATION,
1548 sizeof(struct smb2_file_all_info) + MAX_NAME * 2, 1537 sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
1549 sizeof(struct smb2_file_all_info), data); 1538 sizeof(struct smb2_file_all_info), data);
1550} 1539}
1551 1540
@@ -1738,12 +1727,18 @@ smb2_readv_callback(struct mid_q_entry *mid)
1738 rc); 1727 rc);
1739 } 1728 }
1740 /* FIXME: should this be counted toward the initiating task? */ 1729 /* FIXME: should this be counted toward the initiating task? */
1741 task_io_account_read(rdata->bytes); 1730 task_io_account_read(rdata->got_bytes);
1742 cifs_stats_bytes_read(tcon, rdata->bytes); 1731 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1743 break; 1732 break;
1744 case MID_REQUEST_SUBMITTED: 1733 case MID_REQUEST_SUBMITTED:
1745 case MID_RETRY_NEEDED: 1734 case MID_RETRY_NEEDED:
1746 rdata->result = -EAGAIN; 1735 rdata->result = -EAGAIN;
1736 if (server->sign && rdata->got_bytes)
 1737 /* reset the byte count since we cannot verify the signature */
1738 rdata->got_bytes = 0;
1739 /* FIXME: should this be counted toward the initiating task? */
1740 task_io_account_read(rdata->got_bytes);
1741 cifs_stats_bytes_read(tcon, rdata->got_bytes);
1747 break; 1742 break;
1748 default: 1743 default:
1749 if (rdata->result != -ENODATA) 1744 if (rdata->result != -ENODATA)
@@ -1762,11 +1757,12 @@ smb2_readv_callback(struct mid_q_entry *mid)
1762int 1757int
1763smb2_async_readv(struct cifs_readdata *rdata) 1758smb2_async_readv(struct cifs_readdata *rdata)
1764{ 1759{
1765 int rc; 1760 int rc, flags = 0;
1766 struct smb2_hdr *buf; 1761 struct smb2_hdr *buf;
1767 struct cifs_io_parms io_parms; 1762 struct cifs_io_parms io_parms;
1768 struct smb_rqst rqst = { .rq_iov = &rdata->iov, 1763 struct smb_rqst rqst = { .rq_iov = &rdata->iov,
1769 .rq_nvec = 1 }; 1764 .rq_nvec = 1 };
1765 struct TCP_Server_Info *server;
1770 1766
1771 cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n", 1767 cifs_dbg(FYI, "%s: offset=%llu bytes=%u\n",
1772 __func__, rdata->offset, rdata->bytes); 1768 __func__, rdata->offset, rdata->bytes);
@@ -1777,18 +1773,41 @@ smb2_async_readv(struct cifs_readdata *rdata)
1777 io_parms.persistent_fid = rdata->cfile->fid.persistent_fid; 1773 io_parms.persistent_fid = rdata->cfile->fid.persistent_fid;
1778 io_parms.volatile_fid = rdata->cfile->fid.volatile_fid; 1774 io_parms.volatile_fid = rdata->cfile->fid.volatile_fid;
1779 io_parms.pid = rdata->pid; 1775 io_parms.pid = rdata->pid;
1776
1777 server = io_parms.tcon->ses->server;
1778
1780 rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0); 1779 rc = smb2_new_read_req(&rdata->iov, &io_parms, 0, 0);
1781 if (rc) 1780 if (rc) {
1781 if (rc == -EAGAIN && rdata->credits) {
 1782 /* credits were reset by reconnect */
1783 rdata->credits = 0;
1784 /* reduce in_flight value since we won't send the req */
1785 spin_lock(&server->req_lock);
1786 server->in_flight--;
1787 spin_unlock(&server->req_lock);
1788 }
1782 return rc; 1789 return rc;
1790 }
1783 1791
1784 buf = (struct smb2_hdr *)rdata->iov.iov_base; 1792 buf = (struct smb2_hdr *)rdata->iov.iov_base;
1785 /* 4 for rfc1002 length field */ 1793 /* 4 for rfc1002 length field */
1786 rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4; 1794 rdata->iov.iov_len = get_rfc1002_length(rdata->iov.iov_base) + 4;
1787 1795
1796 if (rdata->credits) {
1797 buf->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes,
1798 SMB2_MAX_BUFFER_SIZE));
1799 spin_lock(&server->req_lock);
1800 server->credits += rdata->credits -
1801 le16_to_cpu(buf->CreditCharge);
1802 spin_unlock(&server->req_lock);
1803 wake_up(&server->request_q);
1804 flags = CIFS_HAS_CREDITS;
1805 }
1806
1788 kref_get(&rdata->refcount); 1807 kref_get(&rdata->refcount);
1789 rc = cifs_call_async(io_parms.tcon->ses->server, &rqst, 1808 rc = cifs_call_async(io_parms.tcon->ses->server, &rqst,
1790 cifs_readv_receive, smb2_readv_callback, 1809 cifs_readv_receive, smb2_readv_callback,
1791 rdata, 0); 1810 rdata, flags);
1792 if (rc) { 1811 if (rc) {
1793 kref_put(&rdata->refcount, cifs_readdata_release); 1812 kref_put(&rdata->refcount, cifs_readdata_release);
1794 cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE); 1813 cifs_stats_fail_inc(io_parms.tcon, SMB2_READ_HE);
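When the read was granted more credits up front than its final CreditCharge needs, the surplus goes straight back to the pool and waiters are woken; the send is then flagged CIFS_HAS_CREDITS so the transport layer does not charge a second time. The give-back step, as an isolated sketch:

static void return_surplus_credits(struct TCP_Server_Info *server,
				   unsigned int granted, unsigned int charge)
{
	spin_lock(&server->req_lock);
	server->credits += granted - charge;	/* hand back the over-reservation */
	spin_unlock(&server->req_lock);
	wake_up(&server->request_q);		/* let blocked senders re-check */
}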
@@ -1906,15 +1925,25 @@ int
1906smb2_async_writev(struct cifs_writedata *wdata, 1925smb2_async_writev(struct cifs_writedata *wdata,
1907 void (*release)(struct kref *kref)) 1926 void (*release)(struct kref *kref))
1908{ 1927{
1909 int rc = -EACCES; 1928 int rc = -EACCES, flags = 0;
1910 struct smb2_write_req *req = NULL; 1929 struct smb2_write_req *req = NULL;
1911 struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); 1930 struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink);
1931 struct TCP_Server_Info *server = tcon->ses->server;
1912 struct kvec iov; 1932 struct kvec iov;
1913 struct smb_rqst rqst; 1933 struct smb_rqst rqst;
1914 1934
1915 rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req); 1935 rc = small_smb2_init(SMB2_WRITE, tcon, (void **) &req);
1916 if (rc) 1936 if (rc) {
1937 if (rc == -EAGAIN && wdata->credits) {
 1938 /* credits were reset by reconnect */
1939 wdata->credits = 0;
1940 /* reduce in_flight value since we won't send the req */
1941 spin_lock(&server->req_lock);
1942 server->in_flight--;
1943 spin_unlock(&server->req_lock);
1944 }
1917 goto async_writev_out; 1945 goto async_writev_out;
1946 }
1918 1947
1919 req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid); 1948 req->hdr.ProcessId = cpu_to_le32(wdata->cfile->pid);
1920 1949
@@ -1947,9 +1976,20 @@ smb2_async_writev(struct cifs_writedata *wdata,
1947 1976
1948 inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */); 1977 inc_rfc1001_len(&req->hdr, wdata->bytes - 1 /* Buffer */);
1949 1978
1979 if (wdata->credits) {
1980 req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(wdata->bytes,
1981 SMB2_MAX_BUFFER_SIZE));
1982 spin_lock(&server->req_lock);
1983 server->credits += wdata->credits -
1984 le16_to_cpu(req->hdr.CreditCharge);
1985 spin_unlock(&server->req_lock);
1986 wake_up(&server->request_q);
1987 flags = CIFS_HAS_CREDITS;
1988 }
1989
1950 kref_get(&wdata->refcount); 1990 kref_get(&wdata->refcount);
1951 rc = cifs_call_async(tcon->ses->server, &rqst, NULL, 1991 rc = cifs_call_async(server, &rqst, NULL, smb2_writev_callback, wdata,
1952 smb2_writev_callback, wdata, 0); 1992 flags);
1953 1993
1954 if (rc) { 1994 if (rc) {
1955 kref_put(&wdata->refcount, release); 1995 kref_put(&wdata->refcount, release);
@@ -2141,6 +2181,10 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2141 rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base; 2181 rsp = (struct smb2_query_directory_rsp *)iov[0].iov_base;
2142 2182
2143 if (rc) { 2183 if (rc) {
2184 if (rc == -ENODATA && rsp->hdr.Status == STATUS_NO_MORE_FILES) {
2185 srch_inf->endOfSearch = true;
2186 rc = 0;
2187 }
2144 cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); 2188 cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE);
2145 goto qdir_exit; 2189 goto qdir_exit;
2146 } 2190 }
@@ -2178,11 +2222,6 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
2178 else 2222 else
2179 cifs_dbg(VFS, "illegal search buffer type\n"); 2223 cifs_dbg(VFS, "illegal search buffer type\n");
2180 2224
2181 if (rsp->hdr.Status == STATUS_NO_MORE_FILES)
2182 srch_inf->endOfSearch = 1;
2183 else
2184 srch_inf->endOfSearch = 0;
2185
2186 return rc; 2225 return rc;
2187 2226
2188qdir_exit: 2227qdir_exit:
@@ -2325,7 +2364,7 @@ SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
2325 2364
2326int 2365int
2327SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, 2366SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
2328 u64 volatile_fid, u32 pid, __le64 *eof) 2367 u64 volatile_fid, u32 pid, __le64 *eof, bool is_falloc)
2329{ 2368{
2330 struct smb2_file_eof_info info; 2369 struct smb2_file_eof_info info;
2331 void *data; 2370 void *data;
@@ -2336,8 +2375,12 @@ SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
2336 data = &info; 2375 data = &info;
2337 size = sizeof(struct smb2_file_eof_info); 2376 size = sizeof(struct smb2_file_eof_info);
2338 2377
2339 return send_set_info(xid, tcon, persistent_fid, volatile_fid, pid, 2378 if (is_falloc)
2340 FILE_END_OF_FILE_INFORMATION, 1, &data, &size); 2379 return send_set_info(xid, tcon, persistent_fid, volatile_fid,
2380 pid, FILE_ALLOCATION_INFORMATION, 1, &data, &size);
2381 else
2382 return send_set_info(xid, tcon, persistent_fid, volatile_fid,
2383 pid, FILE_END_OF_FILE_INFORMATION, 1, &data, &size);
2341} 2384}
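With the new is_falloc flag the same entry point serves two info classes: FILE_END_OF_FILE_INFORMATION moves EOF, FILE_ALLOCATION_INFORMATION preallocates without moving it. A caller-side sketch of the preallocation case (helper name illustrative):

static int smb2_preallocate(const unsigned int xid, struct cifs_tcon *tcon,
			    struct cifs_fid *fid, u32 pid, u64 bytes)
{
	__le64 alloc = cpu_to_le64(bytes);

	/* is_falloc = true selects FILE_ALLOCATION_INFORMATION */
	return SMB2_set_eof(xid, tcon, fid->persistent_fid,
			    fid->volatile_fid, pid, &alloc, true);
}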
2342 2385
2343int 2386int
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 69f3595d3952..e3188abdafd0 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -352,6 +352,8 @@ struct smb2_tree_disconnect_rsp {
352#define FILE_ATTRIBUTE_OFFLINE 0x00001000 352#define FILE_ATTRIBUTE_OFFLINE 0x00001000
353#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 353#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000
354#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 354#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000
355#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000
356#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000
355 357
356/* Oplock levels */ 358/* Oplock levels */
357#define SMB2_OPLOCK_LEVEL_NONE 0x00 359#define SMB2_OPLOCK_LEVEL_NONE 0x00
@@ -573,6 +575,12 @@ struct copychunk_ioctl {
573 __u32 Reserved2; 575 __u32 Reserved2;
574} __packed; 576} __packed;
575 577
578/* this goes in the ioctl buffer when doing FSCTL_SET_ZERO_DATA */
579struct file_zero_data_information {
580 __le64 FileOffset;
581 __le64 BeyondFinalZero;
582} __packed;
583
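Note the FSCTL_SET_ZERO_DATA range is half-open: BeyondFinalZero is the first byte not zeroed, which is why smb3_zero_range and smb3_punch_hole above fill it with offset + len. A sketch of the fill:

static void fill_zero_data_info(struct file_zero_data_information *info,
				__u64 offset, __u64 len)
{
	info->FileOffset = cpu_to_le64(offset);
	info->BeyondFinalZero = cpu_to_le64(offset + len);	/* exclusive end */
}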
576struct copychunk_ioctl_rsp { 584struct copychunk_ioctl_rsp {
577 __le32 ChunksWritten; 585 __le32 ChunksWritten;
578 __le32 ChunkBytesWritten; 586 __le32 ChunkBytesWritten;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 0ce48db20a65..79dc650c18b2 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -82,7 +82,13 @@ extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
82extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, 82extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
83 const char *from_name, const char *to_name, 83 const char *from_name, const char *to_name,
84 struct cifs_sb_info *cifs_sb); 84 struct cifs_sb_info *cifs_sb);
85 85extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
86 struct cifs_sb_info *cifs_sb, const unsigned char *path,
87 char *pbuf, unsigned int *pbytes_written);
88extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
89 struct cifs_sb_info *cifs_sb,
90 const unsigned char *path, char *pbuf,
91 unsigned int *pbytes_read);
86extern int smb2_open_file(const unsigned int xid, 92extern int smb2_open_file(const unsigned int xid,
87 struct cifs_open_parms *oparms, 93 struct cifs_open_parms *oparms,
88 __u32 *oplock, FILE_ALL_INFO *buf); 94 __u32 *oplock, FILE_ALL_INFO *buf);
@@ -139,7 +145,7 @@ extern int SMB2_set_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
139 __le16 *target_file); 145 __le16 *target_file);
140extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, 146extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
141 u64 persistent_fid, u64 volatile_fid, u32 pid, 147 u64 persistent_fid, u64 volatile_fid, u32 pid,
142 __le64 *eof); 148 __le64 *eof, bool is_fallocate);
143extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, 149extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon,
144 u64 persistent_fid, u64 volatile_fid, 150 u64 persistent_fid, u64 volatile_fid,
145 FILE_BASIC_INFO *buf); 151 FILE_BASIC_INFO *buf);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 59c748ce872f..5111e7272db6 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -466,7 +466,12 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
466static inline void 466static inline void
467smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr) 467smb2_seq_num_into_buf(struct TCP_Server_Info *server, struct smb2_hdr *hdr)
468{ 468{
469 unsigned int i, num = le16_to_cpu(hdr->CreditCharge);
470
469 hdr->MessageId = get_next_mid64(server); 471 hdr->MessageId = get_next_mid64(server);
472 /* skip message numbers according to CreditCharge field */
473 for (i = 1; i < num; i++)
474 get_next_mid(server);
470} 475}
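Each credit charged consumes one message id, so a multi-credit request must burn CreditCharge MIDs even though only one header is sent. A worked sketch of the charge itself (DIV_ROUND_UP as in the kernel):

/* e.g. a 1 MiB write with a 64 KiB SMB2_MAX_BUFFER_SIZE charges 16 credits,
 * so the header takes MessageId N and MIDs N+1 .. N+15 are skipped */
static unsigned int smb2_credit_charge(unsigned int bytes)
{
	return DIV_ROUND_UP(bytes, SMB2_MAX_BUFFER_SIZE);
}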
471 476
472static struct mid_q_entry * 477static struct mid_q_entry *
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 43eb1367b103..6c1566366a66 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -29,6 +29,7 @@
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/kernel.h> 30#include <linux/kernel.h>
31#include <linux/random.h> 31#include <linux/random.h>
32#include "cifs_fs_sb.h"
32#include "cifs_unicode.h" 33#include "cifs_unicode.h"
33#include "cifspdu.h" 34#include "cifspdu.h"
34#include "cifsglob.h" 35#include "cifsglob.h"
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index 0e538b5c9622..83efa59535be 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -63,7 +63,7 @@
63#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ 63#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */
64#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ 64#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */
65#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ 65#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */
66#define FSCTL_SET_ZERO_DATA 0x000900C8 /* BB add struct */ 66#define FSCTL_SET_ZERO_DATA 0x000980C8
67#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ 67#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */
68#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ 68#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */
69#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ 69#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 18cd5650a5fc..9d087f4e7d4e 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -448,6 +448,15 @@ wait_for_free_request(struct TCP_Server_Info *server, const int timeout,
448 return wait_for_free_credits(server, timeout, val); 448 return wait_for_free_credits(server, timeout, val);
449} 449}
450 450
451int
452cifs_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size,
453 unsigned int *num, unsigned int *credits)
454{
455 *num = size;
456 *credits = 0;
457 return 0;
458}
459
451static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf, 460static int allocate_mid(struct cifs_ses *ses, struct smb_hdr *in_buf,
452 struct mid_q_entry **ppmidQ) 461 struct mid_q_entry **ppmidQ)
453{ 462{
@@ -531,20 +540,23 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
531{ 540{
532 int rc, timeout, optype; 541 int rc, timeout, optype;
533 struct mid_q_entry *mid; 542 struct mid_q_entry *mid;
543 unsigned int credits = 0;
534 544
535 timeout = flags & CIFS_TIMEOUT_MASK; 545 timeout = flags & CIFS_TIMEOUT_MASK;
536 optype = flags & CIFS_OP_MASK; 546 optype = flags & CIFS_OP_MASK;
537 547
538 rc = wait_for_free_request(server, timeout, optype); 548 if ((flags & CIFS_HAS_CREDITS) == 0) {
539 if (rc) 549 rc = wait_for_free_request(server, timeout, optype);
540 return rc; 550 if (rc)
551 return rc;
552 credits = 1;
553 }
541 554
542 mutex_lock(&server->srv_mutex); 555 mutex_lock(&server->srv_mutex);
543 mid = server->ops->setup_async_request(server, rqst); 556 mid = server->ops->setup_async_request(server, rqst);
544 if (IS_ERR(mid)) { 557 if (IS_ERR(mid)) {
545 mutex_unlock(&server->srv_mutex); 558 mutex_unlock(&server->srv_mutex);
546 add_credits(server, 1, optype); 559 add_credits_and_wake_if(server, credits, optype);
547 wake_up(&server->request_q);
548 return PTR_ERR(mid); 560 return PTR_ERR(mid);
549 } 561 }
550 562
@@ -572,8 +584,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
572 return 0; 584 return 0;
573 585
574 cifs_delete_mid(mid); 586 cifs_delete_mid(mid);
575 add_credits(server, 1, optype); 587 add_credits_and_wake_if(server, credits, optype);
576 wake_up(&server->request_q);
577 return rc; 588 return rc;
578} 589}
579 590
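cifs_call_async now tracks how many credits it took itself: one when it waited, zero when the caller arrived with CIFS_HAS_CREDITS already holding them. The error paths return exactly that count through add_credits_and_wake_if; a sketch of what that helper plausibly does (the real one is a macro in the CIFS headers):

static void add_credits_and_wake(struct TCP_Server_Info *server,
				 unsigned int credits, int optype)
{
	if (!credits)
		return;			/* caller still owns its credits */
	add_credits(server, credits, optype);
	wake_up(&server->request_q);
}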
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index 5ac836a86b18..72a4d10653d6 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -28,6 +28,8 @@
28#include "cifsglob.h" 28#include "cifsglob.h"
29#include "cifsproto.h" 29#include "cifsproto.h"
30#include "cifs_debug.h" 30#include "cifs_debug.h"
31#include "cifs_fs_sb.h"
32#include "cifs_unicode.h"
31 33
32#define MAX_EA_VALUE_SIZE 65535 34#define MAX_EA_VALUE_SIZE 65535
33#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" 35#define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib"
@@ -85,8 +87,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name)
85 if (pTcon->ses->server->ops->set_EA) 87 if (pTcon->ses->server->ops->set_EA)
86 rc = pTcon->ses->server->ops->set_EA(xid, pTcon, 88 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
87 full_path, ea_name, NULL, (__u16)0, 89 full_path, ea_name, NULL, (__u16)0,
88 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 90 cifs_sb->local_nls, cifs_remap(cifs_sb));
89 CIFS_MOUNT_MAP_SPECIAL_CHR);
90 } 91 }
91remove_ea_exit: 92remove_ea_exit:
92 kfree(full_path); 93 kfree(full_path);
@@ -154,8 +155,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
154 if (pTcon->ses->server->ops->set_EA) 155 if (pTcon->ses->server->ops->set_EA)
155 rc = pTcon->ses->server->ops->set_EA(xid, pTcon, 156 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
156 full_path, ea_name, ea_value, (__u16)value_size, 157 full_path, ea_name, ea_value, (__u16)value_size,
157 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 158 cifs_sb->local_nls, cifs_remap(cifs_sb));
158 CIFS_MOUNT_MAP_SPECIAL_CHR);
159 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) 159 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN)
160 == 0) { 160 == 0) {
161 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 161 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
@@ -165,8 +165,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
165 if (pTcon->ses->server->ops->set_EA) 165 if (pTcon->ses->server->ops->set_EA)
166 rc = pTcon->ses->server->ops->set_EA(xid, pTcon, 166 rc = pTcon->ses->server->ops->set_EA(xid, pTcon,
167 full_path, ea_name, ea_value, (__u16)value_size, 167 full_path, ea_name, ea_value, (__u16)value_size,
168 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 168 cifs_sb->local_nls, cifs_remap(cifs_sb));
169 CIFS_MOUNT_MAP_SPECIAL_CHR);
170 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, 169 } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL,
171 strlen(CIFS_XATTR_CIFS_ACL)) == 0) { 170 strlen(CIFS_XATTR_CIFS_ACL)) == 0) {
172#ifdef CONFIG_CIFS_ACL 171#ifdef CONFIG_CIFS_ACL
@@ -199,8 +198,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
199 rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, 198 rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
200 ea_value, (const int)value_size, 199 ea_value, (const int)value_size,
201 ACL_TYPE_ACCESS, cifs_sb->local_nls, 200 ACL_TYPE_ACCESS, cifs_sb->local_nls,
202 cifs_sb->mnt_cifs_flags & 201 cifs_remap(cifs_sb));
203 CIFS_MOUNT_MAP_SPECIAL_CHR);
204 cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc); 202 cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc);
205#else 203#else
206 cifs_dbg(FYI, "set POSIX ACL not supported\n"); 204 cifs_dbg(FYI, "set POSIX ACL not supported\n");
@@ -212,8 +210,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
212 rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, 210 rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
213 ea_value, (const int)value_size, 211 ea_value, (const int)value_size,
214 ACL_TYPE_DEFAULT, cifs_sb->local_nls, 212 ACL_TYPE_DEFAULT, cifs_sb->local_nls,
215 cifs_sb->mnt_cifs_flags & 213 cifs_remap(cifs_sb));
216 CIFS_MOUNT_MAP_SPECIAL_CHR);
217 cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc); 214 cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc);
218#else 215#else
219 cifs_dbg(FYI, "set default POSIX ACL not supported\n"); 216 cifs_dbg(FYI, "set default POSIX ACL not supported\n");
@@ -285,8 +282,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
285 if (pTcon->ses->server->ops->query_all_EAs) 282 if (pTcon->ses->server->ops->query_all_EAs)
286 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, 283 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
287 full_path, ea_name, ea_value, buf_size, 284 full_path, ea_name, ea_value, buf_size,
288 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 285 cifs_sb->local_nls, cifs_remap(cifs_sb));
289 CIFS_MOUNT_MAP_SPECIAL_CHR);
290 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { 286 } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) {
291 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) 287 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR)
292 goto get_ea_exit; 288 goto get_ea_exit;
@@ -295,8 +291,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
295 if (pTcon->ses->server->ops->query_all_EAs) 291 if (pTcon->ses->server->ops->query_all_EAs)
296 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, 292 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
297 full_path, ea_name, ea_value, buf_size, 293 full_path, ea_name, ea_value, buf_size,
298 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 294 cifs_sb->local_nls, cifs_remap(cifs_sb));
299 CIFS_MOUNT_MAP_SPECIAL_CHR);
300 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, 295 } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
301 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { 296 strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
302#ifdef CONFIG_CIFS_POSIX 297#ifdef CONFIG_CIFS_POSIX
@@ -304,8 +299,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
304 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, 299 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
305 ea_value, buf_size, ACL_TYPE_ACCESS, 300 ea_value, buf_size, ACL_TYPE_ACCESS,
306 cifs_sb->local_nls, 301 cifs_sb->local_nls,
307 cifs_sb->mnt_cifs_flags & 302 cifs_remap(cifs_sb));
308 CIFS_MOUNT_MAP_SPECIAL_CHR);
309#else 303#else
310 cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); 304 cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
311#endif /* CONFIG_CIFS_POSIX */ 305#endif /* CONFIG_CIFS_POSIX */
@@ -316,8 +310,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
316 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, 310 rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
317 ea_value, buf_size, ACL_TYPE_DEFAULT, 311 ea_value, buf_size, ACL_TYPE_DEFAULT,
318 cifs_sb->local_nls, 312 cifs_sb->local_nls,
319 cifs_sb->mnt_cifs_flags & 313 cifs_remap(cifs_sb));
320 CIFS_MOUNT_MAP_SPECIAL_CHR);
321#else 314#else
322 cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n"); 315 cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n");
323#endif /* CONFIG_CIFS_POSIX */ 316#endif /* CONFIG_CIFS_POSIX */
@@ -421,8 +414,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size)
421 if (pTcon->ses->server->ops->query_all_EAs) 414 if (pTcon->ses->server->ops->query_all_EAs)
422 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, 415 rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
423 full_path, NULL, data, buf_size, 416 full_path, NULL, data, buf_size,
424 cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & 417 cifs_sb->local_nls, cifs_remap(cifs_sb));
425 CIFS_MOUNT_MAP_SPECIAL_CHR);
426list_ea_exit: 418list_ea_exit:
427 kfree(full_path); 419 kfree(full_path);
428 free_xid(xid); 420 free_xid(xid);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 1da168c61d35..278f8fdeb9ef 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -13,7 +13,7 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/string.h> 17#include <linux/string.h>
18#include <linux/list.h> 18#include <linux/list.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c
index 2849f41e72a2..1326d38960db 100644
--- a/fs/coda/coda_linux.c
+++ b/fs/coda/coda_linux.c
@@ -13,7 +13,7 @@
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/stat.h> 14#include <linux/stat.h>
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/string.h> 17#include <linux/string.h>
18 18
19#include <linux/coda.h> 19#include <linux/coda.h>
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index cd8a63238b11..9c3dedc000d1 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -19,8 +19,7 @@
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
21#include <linux/namei.h> 21#include <linux/namei.h>
22 22#include <linux/uaccess.h>
23#include <asm/uaccess.h>
24 23
25#include <linux/coda.h> 24#include <linux/coda.h>
26#include <linux/coda_psdev.h> 25#include <linux/coda_psdev.h>
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 9e83b7790212..d244d743a232 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -18,7 +18,7 @@
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <asm/uaccess.h> 21#include <linux/uaccess.h>
22 22
23#include <linux/coda.h> 23#include <linux/coda.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index fe3afb2de880..b945410bfcd5 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -21,9 +21,7 @@
21#include <linux/vfs.h> 21#include <linux/vfs.h>
22#include <linux/slab.h> 22#include <linux/slab.h>
23#include <linux/pid_namespace.h> 23#include <linux/pid_namespace.h>
24 24#include <linux/uaccess.h>
25#include <asm/uaccess.h>
26
27#include <linux/fs.h> 25#include <linux/fs.h>
28#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
29 27
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 3f5de96bbb58..4326d172fc27 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -16,7 +16,7 @@
16#include <linux/string.h> 16#include <linux/string.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <asm/uaccess.h> 19#include <linux/uaccess.h>
20 20
21#include <linux/coda.h> 21#include <linux/coda.h>
22#include <linux/coda_psdev.h> 22#include <linux/coda_psdev.h>
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 5c1e4242368b..822629126e89 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -40,7 +40,7 @@
40#include <linux/pid_namespace.h> 40#include <linux/pid_namespace.h>
41#include <asm/io.h> 41#include <asm/io.h>
42#include <asm/poll.h> 42#include <asm/poll.h>
43#include <asm/uaccess.h> 43#include <linux/uaccess.h>
44 44
45#include <linux/coda.h> 45#include <linux/coda.h>
46#include <linux/coda_psdev.h> 46#include <linux/coda_psdev.h>
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index 21fcf8dcb9cd..5bb6e27298a4 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -27,7 +27,7 @@
27#include <linux/string.h> 27#include <linux/string.h>
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/mutex.h> 29#include <linux/mutex.h>
30#include <asm/uaccess.h> 30#include <linux/uaccess.h>
31#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
32#include <linux/vfs.h> 32#include <linux/vfs.h>
33 33
diff --git a/fs/compat.c b/fs/compat.c
index 66d3d3c6b4b2..b13df99f3534 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -794,25 +794,21 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
794 char *kernel_type; 794 char *kernel_type;
795 unsigned long data_page; 795 unsigned long data_page;
796 char *kernel_dev; 796 char *kernel_dev;
797 struct filename *dir;
798 int retval; 797 int retval;
799 798
800 retval = copy_mount_string(type, &kernel_type); 799 kernel_type = copy_mount_string(type);
801 if (retval < 0) 800 retval = PTR_ERR(kernel_type);
801 if (IS_ERR(kernel_type))
802 goto out; 802 goto out;
803 803
804 dir = getname(dir_name); 804 kernel_dev = copy_mount_string(dev_name);
805 retval = PTR_ERR(dir); 805 retval = PTR_ERR(kernel_dev);
806 if (IS_ERR(dir)) 806 if (IS_ERR(kernel_dev))
807 goto out1; 807 goto out1;
808 808
809 retval = copy_mount_string(dev_name, &kernel_dev);
810 if (retval < 0)
811 goto out2;
812
813 retval = copy_mount_options(data, &data_page); 809 retval = copy_mount_options(data, &data_page);
814 if (retval < 0) 810 if (retval < 0)
815 goto out3; 811 goto out2;
816 812
817 retval = -EINVAL; 813 retval = -EINVAL;
818 814
@@ -821,19 +817,17 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
821 do_ncp_super_data_conv((void *)data_page); 817 do_ncp_super_data_conv((void *)data_page);
822 } else if (!strcmp(kernel_type, NFS4_NAME)) { 818 } else if (!strcmp(kernel_type, NFS4_NAME)) {
823 if (do_nfs4_super_data_conv((void *) data_page)) 819 if (do_nfs4_super_data_conv((void *) data_page))
824 goto out4; 820 goto out3;
825 } 821 }
826 } 822 }
827 823
828 retval = do_mount(kernel_dev, dir->name, kernel_type, 824 retval = do_mount(kernel_dev, dir_name, kernel_type,
829 flags, (void*)data_page); 825 flags, (void*)data_page);
830 826
831 out4:
832 free_page(data_page);
833 out3: 827 out3:
834 kfree(kernel_dev); 828 free_page(data_page);
835 out2: 829 out2:
836 putname(dir); 830 kfree(kernel_dev);
837 out1: 831 out1:
838 kfree(kernel_type); 832 kfree(kernel_type);
839 out: 833 out:
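The fs/compat.c hunk tracks a copy_mount_string() change made elsewhere in this series: instead of returning an int and filling an out-parameter, it now returns the copied string directly and encodes failure in the pointer itself; the compat path also stops running getname() on dir_name, since do_mount() now accepts the user pointer. A self-contained user-space sketch of the ERR_PTR encoding (the helper body is illustrative, not the kernel's, which copies from user space):

    #include <errno.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            /* errno values occupy the top 4095 addresses, never valid pointers */
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    /* stand-in for copy_mount_string(): NULL input is a valid "no string" */
    static char *copy_mount_string(const char *user_str)
    {
            char *s;

            if (!user_str)
                    return NULL;
            s = strdup(user_str);
            return s ? s : ERR_PTR(-ENOMEM);
    }

One subtlety in the rewritten error path: PTR_ERR() is computed unconditionally before IS_ERR() is tested, which is harmless because the value is only consulted when the test succeeds.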
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index e82289047272..afec6450450f 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -59,7 +59,7 @@
59#include <linux/gfp.h> 59#include <linux/gfp.h>
60 60
61#include <net/bluetooth/bluetooth.h> 61#include <net/bluetooth/bluetooth.h>
62#include <net/bluetooth/hci.h> 62#include <net/bluetooth/hci_sock.h>
63#include <net/bluetooth/rfcomm.h> 63#include <net/bluetooth/rfcomm.h>
64 64
65#include <linux/capi.h> 65#include <linux/capi.h>
diff --git a/fs/coredump.c b/fs/coredump.c
index a93f7e6ea4cf..b5c86ffd5033 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -199,6 +199,14 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
199 err = cn_printf(cn, "%d", 199 err = cn_printf(cn, "%d",
200 task_tgid_nr(current)); 200 task_tgid_nr(current));
201 break; 201 break;
202 case 'i':
203 err = cn_printf(cn, "%d",
204 task_pid_vnr(current));
205 break;
206 case 'I':
207 err = cn_printf(cn, "%d",
208 task_pid_nr(current));
209 break;
202 /* uid */ 210 /* uid */
203 case 'u': 211 case 'u':
204 err = cn_printf(cn, "%d", cred->uid); 212 err = cn_printf(cn, "%d", cred->uid);
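The two new core_pattern specifiers differ only in which PID namespace they report: %i expands via task_pid_vnr(), the crashing task's PID as seen inside its own namespace, while %I expands via task_pid_nr(), the PID in the initial namespace. For example, writing core.%e.%i to /proc/sys/kernel/core_pattern names dumps by the in-container PID, which is what a dump handler running inside the same container wants; a handler running on the host would use %I instead.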
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index ddcfe590b8a8..355c522f3585 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -11,6 +11,8 @@
11 * The actual compression is based on zlib, see the other files. 11 * The actual compression is based on zlib, see the other files.
12 */ 12 */
13 13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
14#include <linux/module.h> 16#include <linux/module.h>
15#include <linux/fs.h> 17#include <linux/fs.h>
16#include <linux/pagemap.h> 18#include <linux/pagemap.h>
@@ -21,7 +23,7 @@
21#include <linux/vfs.h> 23#include <linux/vfs.h>
22#include <linux/mutex.h> 24#include <linux/mutex.h>
23#include <uapi/linux/cramfs_fs.h> 25#include <uapi/linux/cramfs_fs.h>
24#include <asm/uaccess.h> 26#include <linux/uaccess.h>
25 27
26#include "internal.h" 28#include "internal.h"
27 29
@@ -153,7 +155,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
153 155
154static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; 156static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE];
155static unsigned buffer_blocknr[READ_BUFFERS]; 157static unsigned buffer_blocknr[READ_BUFFERS];
156static struct super_block * buffer_dev[READ_BUFFERS]; 158static struct super_block *buffer_dev[READ_BUFFERS];
157static int next_buffer; 159static int next_buffer;
158 160
159/* 161/*
@@ -205,6 +207,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
205 207
206 for (i = 0; i < BLKS_PER_BUF; i++) { 208 for (i = 0; i < BLKS_PER_BUF; i++) {
207 struct page *page = pages[i]; 209 struct page *page = pages[i];
210
208 if (page) { 211 if (page) {
209 wait_on_page_locked(page); 212 wait_on_page_locked(page);
210 if (!PageUptodate(page)) { 213 if (!PageUptodate(page)) {
@@ -223,6 +226,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
223 data = read_buffers[buffer]; 226 data = read_buffers[buffer];
224 for (i = 0; i < BLKS_PER_BUF; i++) { 227 for (i = 0; i < BLKS_PER_BUF; i++) {
225 struct page *page = pages[i]; 228 struct page *page = pages[i];
229
226 if (page) { 230 if (page) {
227 memcpy(data, kmap(page), PAGE_CACHE_SIZE); 231 memcpy(data, kmap(page), PAGE_CACHE_SIZE);
228 kunmap(page); 232 kunmap(page);
@@ -237,6 +241,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i
237static void cramfs_kill_sb(struct super_block *sb) 241static void cramfs_kill_sb(struct super_block *sb)
238{ 242{
239 struct cramfs_sb_info *sbi = CRAMFS_SB(sb); 243 struct cramfs_sb_info *sbi = CRAMFS_SB(sb);
244
240 kill_block_super(sb); 245 kill_block_super(sb);
241 kfree(sbi); 246 kfree(sbi);
242} 247}
@@ -277,7 +282,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
277 /* check for wrong endianness */ 282 /* check for wrong endianness */
278 if (super.magic == CRAMFS_MAGIC_WEND) { 283 if (super.magic == CRAMFS_MAGIC_WEND) {
279 if (!silent) 284 if (!silent)
280 printk(KERN_ERR "cramfs: wrong endianness\n"); 285 pr_err("wrong endianness\n");
281 return -EINVAL; 286 return -EINVAL;
282 } 287 }
283 288
@@ -287,22 +292,22 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
287 mutex_unlock(&read_mutex); 292 mutex_unlock(&read_mutex);
288 if (super.magic != CRAMFS_MAGIC) { 293 if (super.magic != CRAMFS_MAGIC) {
289 if (super.magic == CRAMFS_MAGIC_WEND && !silent) 294 if (super.magic == CRAMFS_MAGIC_WEND && !silent)
290 printk(KERN_ERR "cramfs: wrong endianness\n"); 295 pr_err("wrong endianness\n");
291 else if (!silent) 296 else if (!silent)
292 printk(KERN_ERR "cramfs: wrong magic\n"); 297 pr_err("wrong magic\n");
293 return -EINVAL; 298 return -EINVAL;
294 } 299 }
295 } 300 }
296 301
297 /* get feature flags first */ 302 /* get feature flags first */
298 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { 303 if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) {
299 printk(KERN_ERR "cramfs: unsupported filesystem features\n"); 304 pr_err("unsupported filesystem features\n");
300 return -EINVAL; 305 return -EINVAL;
301 } 306 }
302 307
303 /* Check that the root inode is in a sane state */ 308 /* Check that the root inode is in a sane state */
304 if (!S_ISDIR(super.root.mode)) { 309 if (!S_ISDIR(super.root.mode)) {
305 printk(KERN_ERR "cramfs: root is not a directory\n"); 310 pr_err("root is not a directory\n");
306 return -EINVAL; 311 return -EINVAL;
307 } 312 }
308 /* correct strange, hard-coded permissions of mkcramfs */ 313 /* correct strange, hard-coded permissions of mkcramfs */
@@ -310,23 +315,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent)
310 315
311 root_offset = super.root.offset << 2; 316 root_offset = super.root.offset << 2;
312 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { 317 if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) {
313 sbi->size=super.size; 318 sbi->size = super.size;
314 sbi->blocks=super.fsid.blocks; 319 sbi->blocks = super.fsid.blocks;
315 sbi->files=super.fsid.files; 320 sbi->files = super.fsid.files;
316 } else { 321 } else {
317 sbi->size=1<<28; 322 sbi->size = 1<<28;
318 sbi->blocks=0; 323 sbi->blocks = 0;
319 sbi->files=0; 324 sbi->files = 0;
320 } 325 }
321 sbi->magic=super.magic; 326 sbi->magic = super.magic;
322 sbi->flags=super.flags; 327 sbi->flags = super.flags;
323 if (root_offset == 0) 328 if (root_offset == 0)
324 printk(KERN_INFO "cramfs: empty filesystem"); 329 pr_info("empty filesystem");
325 else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && 330 else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) &&
326 ((root_offset != sizeof(struct cramfs_super)) && 331 ((root_offset != sizeof(struct cramfs_super)) &&
327 (root_offset != 512 + sizeof(struct cramfs_super)))) 332 (root_offset != 512 + sizeof(struct cramfs_super))))
328 { 333 {
329 printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); 334 pr_err("bad root offset %lu\n", root_offset);
330 return -EINVAL; 335 return -EINVAL;
331 } 336 }
332 337
@@ -425,7 +430,7 @@ static int cramfs_readdir(struct file *file, struct dir_context *ctx)
425/* 430/*
426 * Lookup and fill in the inode data.. 431 * Lookup and fill in the inode data..
427 */ 432 */
428static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 433static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
429{ 434{
430 unsigned int offset = 0; 435 unsigned int offset = 0;
431 struct inode *inode = NULL; 436 struct inode *inode = NULL;
@@ -483,7 +488,7 @@ out:
483 return NULL; 488 return NULL;
484} 489}
485 490
486static int cramfs_readpage(struct file *file, struct page * page) 491static int cramfs_readpage(struct file *file, struct page *page)
487{ 492{
488 struct inode *inode = page->mapping->host; 493 struct inode *inode = page->mapping->host;
489 u32 maxblock; 494 u32 maxblock;
@@ -511,7 +516,7 @@ static int cramfs_readpage(struct file *file, struct page * page)
511 if (compr_len == 0) 516 if (compr_len == 0)
512 ; /* hole */ 517 ; /* hole */
513 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { 518 else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) {
514 pr_err("cramfs: bad compressed blocksize %u\n", 519 pr_err("bad compressed blocksize %u\n",
515 compr_len); 520 compr_len);
516 goto err; 521 goto err;
517 } else { 522 } else {
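The cramfs conversion above trades open-coded printk(KERN_ERR "cramfs: ...") calls for pr_err() and friends, with a single pr_fmt definition supplying the prefix. The shape of the idiom, as a sketch:

    /* must be defined before the first include that pulls in <linux/printk.h> */
    #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

    #include <linux/printk.h>

    /* expands to printk(KERN_ERR "cramfs: wrong magic\n") when built as cramfs */
    pr_err("wrong magic\n");

Centralising the prefix is also why the readpage hunk can drop the duplicated "cramfs: " from its message text.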
diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c
index 1760c1b84d97..ec4f1d4fdad0 100644
--- a/fs/cramfs/uncompress.c
+++ b/fs/cramfs/uncompress.c
@@ -15,6 +15,8 @@
15 * then is used by multiple filesystems. 15 * then is used by multiple filesystems.
16 */ 16 */
17 17
18#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
19
18#include <linux/kernel.h> 20#include <linux/kernel.h>
19#include <linux/errno.h> 21#include <linux/errno.h>
20#include <linux/vmalloc.h> 22#include <linux/vmalloc.h>
@@ -37,7 +39,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
37 39
38 err = zlib_inflateReset(&stream); 40 err = zlib_inflateReset(&stream);
39 if (err != Z_OK) { 41 if (err != Z_OK) {
40 printk("zlib_inflateReset error %d\n", err); 42 pr_err("zlib_inflateReset error %d\n", err);
41 zlib_inflateEnd(&stream); 43 zlib_inflateEnd(&stream);
42 zlib_inflateInit(&stream); 44 zlib_inflateInit(&stream);
43 } 45 }
@@ -48,8 +50,8 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen)
48 return stream.total_out; 50 return stream.total_out;
49 51
50err: 52err:
51 printk("Error %d while decompressing!\n", err); 53 pr_err("Error %d while decompressing!\n", err);
52 printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); 54 pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen);
53 return -EIO; 55 return -EIO;
54} 56}
55 57
@@ -57,7 +59,7 @@ int cramfs_uncompress_init(void)
57{ 59{
58 if (!initialized++) { 60 if (!initialized++) {
59 stream.workspace = vmalloc(zlib_inflate_workspacesize()); 61 stream.workspace = vmalloc(zlib_inflate_workspacesize());
60 if ( !stream.workspace ) { 62 if (!stream.workspace) {
61 initialized = 0; 63 initialized = 0;
62 return -ENOMEM; 64 return -ENOMEM;
63 } 65 }
diff --git a/fs/dcache.c b/fs/dcache.c
index 06f65857a855..d5a23fd0da90 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
106 unsigned int hash) 106 unsigned int hash)
107{ 107{
108 hash += (unsigned long) parent / L1_CACHE_BYTES; 108 hash += (unsigned long) parent / L1_CACHE_BYTES;
109 hash = hash + (hash >> d_hash_shift); 109 return dentry_hashtable + hash_32(hash, d_hash_shift);
110 return dentry_hashtable + (hash & d_hash_mask);
111} 110}
112 111
113/* Statistics gathering. */ 112/* Statistics gathering. */
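The d_hash() change replaces a hand-rolled fold-and-mask with hash_32(), a multiplicative hash that keeps the top bits of the product, so the separate d_hash_mask goes away. A runnable user-space sketch of the technique (the constant matches the kernel's 32-bit golden-ratio prime of this era):

    #include <stdint.h>
    #include <stdio.h>

    /* multiply by a prime near 2^32/phi, then keep the top 'bits' bits */
    static inline uint32_t hash_32(uint32_t val, unsigned int bits)
    {
            return (val * 0x9e370001u) >> (32 - bits);
    }

    int main(void)
    {
            unsigned int d_hash_shift = 12;         /* say, a 4096-bucket table */
            uint32_t h = 0xdeadbeefu;               /* name hash mixed with parent */

            printf("bucket %u of %u\n", hash_32(h, d_hash_shift),
                   1u << d_hash_shift);
            return 0;
    }

Taking the high bits lets every input bit influence the bucket through the multiply, which the old shift-and-add fold only approximated.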
@@ -236,18 +235,49 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
236 return dentry_string_cmp(cs, ct, tcount); 235 return dentry_string_cmp(cs, ct, tcount);
237} 236}
238 237
238struct external_name {
239 union {
240 atomic_t count;
241 struct rcu_head head;
242 } u;
243 unsigned char name[];
244};
245
246static inline struct external_name *external_name(struct dentry *dentry)
247{
248 return container_of(dentry->d_name.name, struct external_name, name[0]);
249}
250
239static void __d_free(struct rcu_head *head) 251static void __d_free(struct rcu_head *head)
240{ 252{
241 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); 253 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
242 254
243 WARN_ON(!hlist_unhashed(&dentry->d_alias)); 255 WARN_ON(!hlist_unhashed(&dentry->d_alias));
244 if (dname_external(dentry))
245 kfree(dentry->d_name.name);
246 kmem_cache_free(dentry_cache, dentry); 256 kmem_cache_free(dentry_cache, dentry);
247} 257}
248 258
259static void __d_free_external(struct rcu_head *head)
260{
261 struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu);
262 WARN_ON(!hlist_unhashed(&dentry->d_alias));
263 kfree(external_name(dentry));
264 kmem_cache_free(dentry_cache, dentry);
265}
266
267static inline int dname_external(const struct dentry *dentry)
268{
269 return dentry->d_name.name != dentry->d_iname;
270}
271
249static void dentry_free(struct dentry *dentry) 272static void dentry_free(struct dentry *dentry)
250{ 273{
274 if (unlikely(dname_external(dentry))) {
275 struct external_name *p = external_name(dentry);
276 if (likely(atomic_dec_and_test(&p->u.count))) {
277 call_rcu(&dentry->d_u.d_rcu, __d_free_external);
278 return;
279 }
280 }
251 /* if dentry was never visible to RCU, immediate free is OK */ 281 /* if dentry was never visible to RCU, immediate free is OK */
252 if (!(dentry->d_flags & DCACHE_RCUACCESS)) 282 if (!(dentry->d_flags & DCACHE_RCUACCESS))
253 __d_free(&dentry->d_u.d_rcu); 283 __d_free(&dentry->d_u.d_rcu);
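The new struct external_name makes out-of-line dentry names reference-counted. The count and the rcu_head share a union because they are never live at the same time: the count matters only while the name is reachable, the rcu_head only after the last reference is gone. A compilable user-space sketch of the layout and lifetime rules, with free() standing in for the kernel's call_rcu() deferral:

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    struct external_name {
            union {
                    atomic_int count;       /* stands in for atomic_t */
                    /* struct rcu_head head;  used only once count hits zero */
            } u;
            char name[];                    /* flexible array holding the string */
    };

    static struct external_name *name_alloc(const char *s)
    {
            size_t len = strlen(s);
            struct external_name *p = malloc(sizeof(*p) + len + 1);

            if (!p)
                    return NULL;
            atomic_init(&p->u.count, 1);
            memcpy(p->name, s, len + 1);
            return p;
    }

    static void name_put(struct external_name *p)
    {
            if (atomic_fetch_sub(&p->u.count, 1) == 1)
                    free(p);                /* kernel: call_rcu(&p->u.head, ...) */
    }

The payoff appears further down in this diff: copy_name() can share one external name between two dentries during rename by bumping the count instead of copying the string.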
@@ -457,7 +487,7 @@ static void __dentry_kill(struct dentry *dentry)
457 * inform the fs via d_prune that this dentry is about to be 487 * inform the fs via d_prune that this dentry is about to be
458 * unhashed and destroyed. 488 * unhashed and destroyed.
459 */ 489 */
460 if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) 490 if (dentry->d_flags & DCACHE_OP_PRUNE)
461 dentry->d_op->d_prune(dentry); 491 dentry->d_op->d_prune(dentry);
462 492
463 if (dentry->d_flags & DCACHE_LRU_LIST) { 493 if (dentry->d_flags & DCACHE_LRU_LIST) {
@@ -620,62 +650,6 @@ kill_it:
620} 650}
621EXPORT_SYMBOL(dput); 651EXPORT_SYMBOL(dput);
622 652
623/**
624 * d_invalidate - invalidate a dentry
625 * @dentry: dentry to invalidate
626 *
627 * Try to invalidate the dentry if it turns out to be
628 * possible. If there are other dentries that can be
629 * reached through this one we can't delete it and we
630 * return -EBUSY. On success we return 0.
631 *
632 * no dcache lock.
633 */
634
635int d_invalidate(struct dentry * dentry)
636{
637 /*
638 * If it's already been dropped, return OK.
639 */
640 spin_lock(&dentry->d_lock);
641 if (d_unhashed(dentry)) {
642 spin_unlock(&dentry->d_lock);
643 return 0;
644 }
645 /*
646 * Check whether to do a partial shrink_dcache
647 * to get rid of unused child entries.
648 */
649 if (!list_empty(&dentry->d_subdirs)) {
650 spin_unlock(&dentry->d_lock);
651 shrink_dcache_parent(dentry);
652 spin_lock(&dentry->d_lock);
653 }
654
655 /*
656 * Somebody else still using it?
657 *
658 * If it's a directory, we can't drop it
659 * for fear of somebody re-populating it
660 * with children (even though dropping it
661 * would make it unreachable from the root,
662 * we might still populate it if it was a
663 * working directory or similar).
664 * We also need to leave mountpoints alone,
665 * directory or not.
666 */
667 if (dentry->d_lockref.count > 1 && dentry->d_inode) {
668 if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
669 spin_unlock(&dentry->d_lock);
670 return -EBUSY;
671 }
672 }
673
674 __d_drop(dentry);
675 spin_unlock(&dentry->d_lock);
676 return 0;
677}
678EXPORT_SYMBOL(d_invalidate);
679 653
680/* This must be called with d_lock held */ 654/* This must be called with d_lock held */
681static inline void __dget_dlock(struct dentry *dentry) 655static inline void __dget_dlock(struct dentry *dentry)
@@ -731,20 +705,18 @@ EXPORT_SYMBOL(dget_parent);
731/** 705/**
732 * d_find_alias - grab a hashed alias of inode 706 * d_find_alias - grab a hashed alias of inode
733 * @inode: inode in question 707 * @inode: inode in question
734 * @want_discon: flag, used by d_splice_alias, to request
735 * that only a DISCONNECTED alias be returned.
736 * 708 *
737 * If inode has a hashed alias, or is a directory and has any alias, 709 * If inode has a hashed alias, or is a directory and has any alias,
738 * acquire the reference to alias and return it. Otherwise return NULL. 710 * acquire the reference to alias and return it. Otherwise return NULL.
739 * Notice that if inode is a directory there can be only one alias and 711 * Notice that if inode is a directory there can be only one alias and
740 * it can be unhashed only if it has no children, or if it is the root 712 * it can be unhashed only if it has no children, or if it is the root
741 * of a filesystem. 713 * of a filesystem, or if the directory was renamed and d_revalidate
714 * was the first vfs operation to notice.
742 * 715 *
743 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer 716 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
744 * any other hashed alias over that one unless @want_discon is set, 717 * any other hashed alias over that one.
745 * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
746 */ 718 */
747static struct dentry *__d_find_alias(struct inode *inode, int want_discon) 719static struct dentry *__d_find_alias(struct inode *inode)
748{ 720{
749 struct dentry *alias, *discon_alias; 721 struct dentry *alias, *discon_alias;
750 722
@@ -756,7 +728,7 @@ again:
756 if (IS_ROOT(alias) && 728 if (IS_ROOT(alias) &&
757 (alias->d_flags & DCACHE_DISCONNECTED)) { 729 (alias->d_flags & DCACHE_DISCONNECTED)) {
758 discon_alias = alias; 730 discon_alias = alias;
759 } else if (!want_discon) { 731 } else {
760 __dget_dlock(alias); 732 __dget_dlock(alias);
761 spin_unlock(&alias->d_lock); 733 spin_unlock(&alias->d_lock);
762 return alias; 734 return alias;
@@ -768,12 +740,9 @@ again:
768 alias = discon_alias; 740 alias = discon_alias;
769 spin_lock(&alias->d_lock); 741 spin_lock(&alias->d_lock);
770 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) { 742 if (S_ISDIR(inode->i_mode) || !d_unhashed(alias)) {
771 if (IS_ROOT(alias) && 743 __dget_dlock(alias);
772 (alias->d_flags & DCACHE_DISCONNECTED)) { 744 spin_unlock(&alias->d_lock);
773 __dget_dlock(alias); 745 return alias;
774 spin_unlock(&alias->d_lock);
775 return alias;
776 }
777 } 746 }
778 spin_unlock(&alias->d_lock); 747 spin_unlock(&alias->d_lock);
779 goto again; 748 goto again;
@@ -787,7 +756,7 @@ struct dentry *d_find_alias(struct inode *inode)
787 756
788 if (!hlist_empty(&inode->i_dentry)) { 757 if (!hlist_empty(&inode->i_dentry)) {
789 spin_lock(&inode->i_lock); 758 spin_lock(&inode->i_lock);
790 de = __d_find_alias(inode, 0); 759 de = __d_find_alias(inode);
791 spin_unlock(&inode->i_lock); 760 spin_unlock(&inode->i_lock);
792 } 761 }
793 return de; 762 return de;
@@ -806,20 +775,13 @@ restart:
806 hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { 775 hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
807 spin_lock(&dentry->d_lock); 776 spin_lock(&dentry->d_lock);
808 if (!dentry->d_lockref.count) { 777 if (!dentry->d_lockref.count) {
809 /* 778 struct dentry *parent = lock_parent(dentry);
810 * inform the fs via d_prune that this dentry 779 if (likely(!dentry->d_lockref.count)) {
811 * is about to be unhashed and destroyed. 780 __dentry_kill(dentry);
812 */ 781 goto restart;
813 if ((dentry->d_flags & DCACHE_OP_PRUNE) && 782 }
814 !d_unhashed(dentry)) 783 if (parent)
815 dentry->d_op->d_prune(dentry); 784 spin_unlock(&parent->d_lock);
816
817 __dget_dlock(dentry);
818 __d_drop(dentry);
819 spin_unlock(&dentry->d_lock);
820 spin_unlock(&inode->i_lock);
821 dput(dentry);
822 goto restart;
823 } 785 }
824 spin_unlock(&dentry->d_lock); 786 spin_unlock(&dentry->d_lock);
825 } 787 }
@@ -1200,7 +1162,7 @@ EXPORT_SYMBOL(have_submounts);
1200 * reachable (e.g. NFS can unhash a directory dentry and then the complete 1162 * reachable (e.g. NFS can unhash a directory dentry and then the complete
1201 * subtree can become unreachable). 1163 * subtree can become unreachable).
1202 * 1164 *
1203 * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For 1165 * Only one of d_invalidate() and d_set_mounted() must succeed. For
1204 * this reason take rename_lock and d_lock on dentry and ancestors. 1166 * this reason take rename_lock and d_lock on dentry and ancestors.
1205 */ 1167 */
1206int d_set_mounted(struct dentry *dentry) 1168int d_set_mounted(struct dentry *dentry)
@@ -1209,7 +1171,7 @@ int d_set_mounted(struct dentry *dentry)
1209 int ret = -ENOENT; 1171 int ret = -ENOENT;
1210 write_seqlock(&rename_lock); 1172 write_seqlock(&rename_lock);
1211 for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { 1173 for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
1212 /* Need exclusion wrt. check_submounts_and_drop() */ 1174 /* Need exclusion wrt. d_invalidate() */
1213 spin_lock(&p->d_lock); 1175 spin_lock(&p->d_lock);
1214 if (unlikely(d_unhashed(p))) { 1176 if (unlikely(d_unhashed(p))) {
1215 spin_unlock(&p->d_lock); 1177 spin_unlock(&p->d_lock);
@@ -1353,70 +1315,84 @@ void shrink_dcache_for_umount(struct super_block *sb)
1353 } 1315 }
1354} 1316}
1355 1317
1356static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry) 1318struct detach_data {
1319 struct select_data select;
1320 struct dentry *mountpoint;
1321};
1322static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry)
1357{ 1323{
1358 struct select_data *data = _data; 1324 struct detach_data *data = _data;
1359 1325
1360 if (d_mountpoint(dentry)) { 1326 if (d_mountpoint(dentry)) {
1361 data->found = -EBUSY; 1327 __dget_dlock(dentry);
1328 data->mountpoint = dentry;
1362 return D_WALK_QUIT; 1329 return D_WALK_QUIT;
1363 } 1330 }
1364 1331
1365 return select_collect(_data, dentry); 1332 return select_collect(&data->select, dentry);
1366} 1333}
1367 1334
1368static void check_and_drop(void *_data) 1335static void check_and_drop(void *_data)
1369{ 1336{
1370 struct select_data *data = _data; 1337 struct detach_data *data = _data;
1371 1338
1372 if (d_mountpoint(data->start)) 1339 if (!data->mountpoint && !data->select.found)
1373 data->found = -EBUSY; 1340 __d_drop(data->select.start);
1374 if (!data->found)
1375 __d_drop(data->start);
1376} 1341}
1377 1342
1378/** 1343/**
1379 * check_submounts_and_drop - prune dcache, check for submounts and drop 1344 * d_invalidate - detach submounts, prune dcache, and drop
1345 * @dentry: dentry to invalidate (aka detach, prune and drop)
1380 * 1346 *
1381 * All done as a single atomic operation relative to has_unlinked_ancestor(). 1347 * no dcache lock.
1382 * Returns 0 if successfully unhashed @parent. If there were submounts then
1383 * return -EBUSY.
1384 * 1348 *
1385 * @dentry: dentry to prune and drop 1349 * The final d_drop is done as an atomic operation relative to
1350 * rename_lock ensuring there are no races with d_set_mounted. This
1351 * ensures there are no unhashed dentries on the path to a mountpoint.
1386 */ 1352 */
1387int check_submounts_and_drop(struct dentry *dentry) 1353void d_invalidate(struct dentry *dentry)
1388{ 1354{
1389 int ret = 0; 1355 /*
1356 * If it's already been dropped, return OK.
1357 */
1358 spin_lock(&dentry->d_lock);
1359 if (d_unhashed(dentry)) {
1360 spin_unlock(&dentry->d_lock);
1361 return;
1362 }
1363 spin_unlock(&dentry->d_lock);
1390 1364
1391 /* Negative dentries can be dropped without further checks */ 1365 /* Negative dentries can be dropped without further checks */
1392 if (!dentry->d_inode) { 1366 if (!dentry->d_inode) {
1393 d_drop(dentry); 1367 d_drop(dentry);
1394 goto out; 1368 return;
1395 } 1369 }
1396 1370
1397 for (;;) { 1371 for (;;) {
1398 struct select_data data; 1372 struct detach_data data;
1399 1373
1400 INIT_LIST_HEAD(&data.dispose); 1374 data.mountpoint = NULL;
1401 data.start = dentry; 1375 INIT_LIST_HEAD(&data.select.dispose);
1402 data.found = 0; 1376 data.select.start = dentry;
1377 data.select.found = 0;
1378
1379 d_walk(dentry, &data, detach_and_collect, check_and_drop);
1403 1380
1404 d_walk(dentry, &data, check_and_collect, check_and_drop); 1381 if (data.select.found)
1405 ret = data.found; 1382 shrink_dentry_list(&data.select.dispose);
1406 1383
1407 if (!list_empty(&data.dispose)) 1384 if (data.mountpoint) {
1408 shrink_dentry_list(&data.dispose); 1385 detach_mounts(data.mountpoint);
1386 dput(data.mountpoint);
1387 }
1409 1388
1410 if (ret <= 0) 1389 if (!data.mountpoint && !data.select.found)
1411 break; 1390 break;
1412 1391
1413 cond_resched(); 1392 cond_resched();
1414 } 1393 }
1415
1416out:
1417 return ret;
1418} 1394}
1419EXPORT_SYMBOL(check_submounts_and_drop); 1395EXPORT_SYMBOL(d_invalidate);
1420 1396
1421/** 1397/**
1422 * __d_alloc - allocate a dcache entry 1398 * __d_alloc - allocate a dcache entry
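The rewritten d_invalidate() absorbs check_submounts_and_drop() and inverts the old contract: rather than failing with -EBUSY when mounts or children are in the way, it removes the obstacles, detaching covering mounts via detach_mounts() and shrinking collected children, and retries until a full pass finds neither. The control shape, reduced to a sketch (walk_subtree() is a stand-in for d_walk() with detach_and_collect()):

    for (;;) {
            data.mountpoint = NULL;
            data.select.found = 0;

            walk_subtree(dentry, &data);

            if (data.select.found)
                    shrink_dentry_list(&data.select.dispose);
            if (data.mountpoint) {
                    detach_mounts(data.mountpoint);
                    dput(data.mountpoint);
            }
            if (!data.mountpoint && !data.select.found)
                    break;          /* a clean pass: nothing left to do */
            cond_resched();
    }

The retry loop is necessary because each pass drops the locks to do its work, so new children or mounts can appear in between; only a pass that observes nothing pending may stop.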
@@ -1445,11 +1421,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
1445 */ 1421 */
1446 dentry->d_iname[DNAME_INLINE_LEN-1] = 0; 1422 dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
1447 if (name->len > DNAME_INLINE_LEN-1) { 1423 if (name->len > DNAME_INLINE_LEN-1) {
1448 dname = kmalloc(name->len + 1, GFP_KERNEL); 1424 size_t size = offsetof(struct external_name, name[1]);
1449 if (!dname) { 1425 struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
1426 if (!p) {
1450 kmem_cache_free(dentry_cache, dentry); 1427 kmem_cache_free(dentry_cache, dentry);
1451 return NULL; 1428 return NULL;
1452 } 1429 }
1430 atomic_set(&p->u.count, 1);
1431 dname = p->name;
1453 } else { 1432 } else {
1454 dname = dentry->d_iname; 1433 dname = dentry->d_iname;
1455 } 1434 }
@@ -1781,25 +1760,7 @@ struct dentry *d_find_any_alias(struct inode *inode)
1781} 1760}
1782EXPORT_SYMBOL(d_find_any_alias); 1761EXPORT_SYMBOL(d_find_any_alias);
1783 1762
1784/** 1763static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected)
1785 * d_obtain_alias - find or allocate a dentry for a given inode
1786 * @inode: inode to allocate the dentry for
1787 *
1788 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
1789 * similar open by handle operations. The returned dentry may be anonymous,
1790 * or may have a full name (if the inode was already in the cache).
1791 *
1792 * When called on a directory inode, we must ensure that the inode only ever
1793 * has one dentry. If a dentry is found, that is returned instead of
1794 * allocating a new one.
1795 *
1796 * On successful return, the reference to the inode has been transferred
1797 * to the dentry. In case of an error the reference on the inode is released.
1798 * To make it easier to use in export operations a %NULL or IS_ERR inode may
1799 * be passed in and will be the error will be propagate to the return value,
1800 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
1801 */
1802struct dentry *d_obtain_alias(struct inode *inode)
1803{ 1764{
1804 static const struct qstr anonstring = QSTR_INIT("/", 1); 1765 static const struct qstr anonstring = QSTR_INIT("/", 1);
1805 struct dentry *tmp; 1766 struct dentry *tmp;
@@ -1830,7 +1791,10 @@ struct dentry *d_obtain_alias(struct inode *inode)
1830 } 1791 }
1831 1792
1832 /* attach a disconnected dentry */ 1793 /* attach a disconnected dentry */
1833 add_flags = d_flags_for_inode(inode) | DCACHE_DISCONNECTED; 1794 add_flags = d_flags_for_inode(inode);
1795
1796 if (disconnected)
1797 add_flags |= DCACHE_DISCONNECTED;
1834 1798
1835 spin_lock(&tmp->d_lock); 1799 spin_lock(&tmp->d_lock);
1836 tmp->d_inode = inode; 1800 tmp->d_inode = inode;
@@ -1851,59 +1815,51 @@ struct dentry *d_obtain_alias(struct inode *inode)
1851 iput(inode); 1815 iput(inode);
1852 return res; 1816 return res;
1853} 1817}
1854EXPORT_SYMBOL(d_obtain_alias);
1855 1818
1856/** 1819/**
1857 * d_splice_alias - splice a disconnected dentry into the tree if one exists 1820 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
1858 * @inode: the inode which may have a disconnected dentry 1821 * @inode: inode to allocate the dentry for
1859 * @dentry: a negative dentry which we want to point to the inode.
1860 *
1861 * If inode is a directory and has a 'disconnected' dentry (i.e. IS_ROOT and
1862 * DCACHE_DISCONNECTED), then d_move that in place of the given dentry
1863 * and return it, else simply d_add the inode to the dentry and return NULL.
1864 * 1822 *
1865 * This is needed in the lookup routine of any filesystem that is exportable 1823 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
1866 * (via knfsd) so that we can build dcache paths to directories effectively. 1824 * similar open by handle operations. The returned dentry may be anonymous,
1825 * or may have a full name (if the inode was already in the cache).
1867 * 1826 *
1868 * If a dentry was found and moved, then it is returned. Otherwise NULL 1827 * When called on a directory inode, we must ensure that the inode only ever
1869 * is returned. This matches the expected return value of ->lookup. 1828 * has one dentry. If a dentry is found, that is returned instead of
1829 * allocating a new one.
1870 * 1830 *
1871 * Cluster filesystems may call this function with a negative, hashed dentry. 1831 * On successful return, the reference to the inode has been transferred
1872 * In that case, we know that the inode will be a regular file, and also this 1832 * to the dentry. In case of an error the reference on the inode is released.
1873 * will only occur during atomic_open. So we need to check for the dentry 1833 * To make it easier to use in export operations a %NULL or IS_ERR inode may
1874 * being already hashed only in the final case. 1834 * be passed in and the error will be propagated to the return value,
1835 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
1875 */ 1836 */
1876struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) 1837struct dentry *d_obtain_alias(struct inode *inode)
1877{ 1838{
1878 struct dentry *new = NULL; 1839 return __d_obtain_alias(inode, 1);
1879 1840}
1880 if (IS_ERR(inode)) 1841EXPORT_SYMBOL(d_obtain_alias);
1881 return ERR_CAST(inode);
1882 1842
1883 if (inode && S_ISDIR(inode->i_mode)) { 1843/**
1884 spin_lock(&inode->i_lock); 1844 * d_obtain_root - find or allocate a dentry for a given inode
1885 new = __d_find_alias(inode, 1); 1845 * @inode: inode to allocate the dentry for
1886 if (new) { 1846 *
1887 BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED)); 1847 * Obtain an IS_ROOT dentry for the root of a filesystem.
1888 spin_unlock(&inode->i_lock); 1848 *
1889 security_d_instantiate(new, inode); 1849 * We must ensure that directory inodes only ever have one dentry. If a
1890 d_move(new, dentry); 1850 * dentry is found, that is returned instead of allocating a new one.
1891 iput(inode); 1851 *
1892 } else { 1852 * On successful return, the reference to the inode has been transferred
1893 /* already taking inode->i_lock, so d_add() by hand */ 1853 * to the dentry. In case of an error the reference on the inode is
1894 __d_instantiate(dentry, inode); 1854 * released. A %NULL or IS_ERR inode may be passed in and the
1895 spin_unlock(&inode->i_lock); 1855 * error will be propagated to the return value, with a %NULL @inode
1896 security_d_instantiate(dentry, inode); 1856 * replaced by ERR_PTR(-ESTALE).
1897 d_rehash(dentry); 1857 */
1898 } 1858struct dentry *d_obtain_root(struct inode *inode)
1899 } else { 1859{
1900 d_instantiate(dentry, inode); 1860 return __d_obtain_alias(inode, 0);
1901 if (d_unhashed(dentry))
1902 d_rehash(dentry);
1903 }
1904 return new;
1905} 1861}
1906EXPORT_SYMBOL(d_splice_alias); 1862EXPORT_SYMBOL(d_obtain_root);
1907 1863
1908/** 1864/**
1909 * d_add_ci - lookup or allocate new dentry with case-exact name 1865 * d_add_ci - lookup or allocate new dentry with case-exact name
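d_obtain_root() is the new sibling of d_obtain_alias(): the same allocation path, minus DCACHE_DISCONNECTED, since a filesystem root is legitimately IS_ROOT rather than a disconnected subtree. Expected use is in a fill_super implementation, roughly (fragment; myfs_get_root_inode() is hypothetical):

    inode = myfs_get_root_inode(sb);
    sb->s_root = d_obtain_root(inode);      /* consumes the inode reference */
    if (IS_ERR(sb->s_root))
            return PTR_ERR(sb->s_root);

As with d_obtain_alias(), a NULL inode comes back as ERR_PTR(-ESTALE) and an allocation failure as an error pointer too, so a single IS_ERR() check covers the failure modes.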
@@ -2142,10 +2098,10 @@ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
2142 struct dentry *dentry; 2098 struct dentry *dentry;
2143 unsigned seq; 2099 unsigned seq;
2144 2100
2145 do { 2101 do {
2146 seq = read_seqbegin(&rename_lock); 2102 seq = read_seqbegin(&rename_lock);
2147 dentry = __d_lookup(parent, name); 2103 dentry = __d_lookup(parent, name);
2148 if (dentry) 2104 if (dentry)
2149 break; 2105 break;
2150 } while (read_seqretry(&rename_lock, seq)); 2106 } while (read_seqretry(&rename_lock, seq));
2151 return dentry; 2107 return dentry;
@@ -2402,10 +2358,10 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2402} 2358}
2403EXPORT_SYMBOL(dentry_update_name_case); 2359EXPORT_SYMBOL(dentry_update_name_case);
2404 2360
2405static void switch_names(struct dentry *dentry, struct dentry *target) 2361static void swap_names(struct dentry *dentry, struct dentry *target)
2406{ 2362{
2407 if (dname_external(target)) { 2363 if (unlikely(dname_external(target))) {
2408 if (dname_external(dentry)) { 2364 if (unlikely(dname_external(dentry))) {
2409 /* 2365 /*
2410 * Both external: swap the pointers 2366 * Both external: swap the pointers
2411 */ 2367 */
@@ -2421,7 +2377,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2421 target->d_name.name = target->d_iname; 2377 target->d_name.name = target->d_iname;
2422 } 2378 }
2423 } else { 2379 } else {
2424 if (dname_external(dentry)) { 2380 if (unlikely(dname_external(dentry))) {
2425 /* 2381 /*
2426 * dentry:external, target:internal. Give dentry's 2382 * dentry:external, target:internal. Give dentry's
2427 * storage to target and make dentry internal 2383 * storage to target and make dentry internal
@@ -2442,7 +2398,25 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
2442 } 2398 }
2443 } 2399 }
2444 } 2400 }
2445 swap(dentry->d_name.len, target->d_name.len); 2401 swap(dentry->d_name.hash_len, target->d_name.hash_len);
2402}
2403
2404static void copy_name(struct dentry *dentry, struct dentry *target)
2405{
2406 struct external_name *old_name = NULL;
2407 if (unlikely(dname_external(dentry)))
2408 old_name = external_name(dentry);
2409 if (unlikely(dname_external(target))) {
2410 atomic_inc(&external_name(target)->u.count);
2411 dentry->d_name = target->d_name;
2412 } else {
2413 memcpy(dentry->d_iname, target->d_name.name,
2414 target->d_name.len + 1);
2415 dentry->d_name.name = dentry->d_iname;
2416 dentry->d_name.hash_len = target->d_name.hash_len;
2417 }
2418 if (old_name && likely(atomic_dec_and_test(&old_name->u.count)))
2419 kfree_rcu(old_name, u.head);
2446} 2420}
2447 2421
2448static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) 2422static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
@@ -2472,25 +2446,29 @@ static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target)
2472 } 2446 }
2473} 2447}
2474 2448
2475static void dentry_unlock_parents_for_move(struct dentry *dentry, 2449static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
2476 struct dentry *target)
2477{ 2450{
2478 if (target->d_parent != dentry->d_parent) 2451 if (target->d_parent != dentry->d_parent)
2479 spin_unlock(&dentry->d_parent->d_lock); 2452 spin_unlock(&dentry->d_parent->d_lock);
2480 if (target->d_parent != target) 2453 if (target->d_parent != target)
2481 spin_unlock(&target->d_parent->d_lock); 2454 spin_unlock(&target->d_parent->d_lock);
2455 spin_unlock(&target->d_lock);
2456 spin_unlock(&dentry->d_lock);
2482} 2457}
2483 2458
2484/* 2459/*
2485 * When switching names, the actual string doesn't strictly have to 2460 * When switching names, the actual string doesn't strictly have to
2486 * be preserved in the target - because we're dropping the target 2461 * be preserved in the target - because we're dropping the target
2487 * anyway. As such, we can just do a simple memcpy() to copy over 2462 * anyway. As such, we can just do a simple memcpy() to copy over
2488 * the new name before we switch. 2463 * the new name before we switch, unless we are going to rehash
2489 * 2464 * it. Note that if we *do* unhash the target, we are not allowed
2490 * Note that we have to be a lot more careful about getting the hash 2465 * to rehash it without giving it a new name/hash key - whether
2491 * switched - we have to switch the hash value properly even if it 2466 * we swap or overwrite the names here, the resulting name won't match
2492 * then no longer matches the actual (corrupted) string of the target. 2467 * the reality in the filesystem; it's only there for d_path() purposes.
2493 * The hash value has to match the hash queue that the dentry is on.. 2468 * Note that all of this is happening under rename_lock, so the
2469 * any hash lookup seeing it in the middle of manipulations will
2470 * be discarded anyway. So we do not care what happens to the hash
2471 * key in that case.
2494 */ 2472 */
2495/* 2473/*
2496 * __d_move - move a dentry 2474 * __d_move - move a dentry
@@ -2536,36 +2514,33 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
2536 d_hash(dentry->d_parent, dentry->d_name.hash)); 2514 d_hash(dentry->d_parent, dentry->d_name.hash));
2537 } 2515 }
2538 2516
2539 list_del(&dentry->d_u.d_child);
2540 list_del(&target->d_u.d_child);
2541
2542 /* Switch the names.. */ 2517 /* Switch the names.. */
2543 switch_names(dentry, target); 2518 if (exchange)
2544 swap(dentry->d_name.hash, target->d_name.hash); 2519 swap_names(dentry, target);
2520 else
2521 copy_name(dentry, target);
2545 2522
2546 /* ... and switch the parents */ 2523 /* ... and switch them in the tree */
2547 if (IS_ROOT(dentry)) { 2524 if (IS_ROOT(dentry)) {
2525 /* splicing a tree */
2548 dentry->d_parent = target->d_parent; 2526 dentry->d_parent = target->d_parent;
2549 target->d_parent = target; 2527 target->d_parent = target;
2550 INIT_LIST_HEAD(&target->d_u.d_child); 2528 list_del_init(&target->d_u.d_child);
2529 list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2551 } else { 2530 } else {
2531 /* swapping two dentries */
2552 swap(dentry->d_parent, target->d_parent); 2532 swap(dentry->d_parent, target->d_parent);
2553 2533 list_move(&target->d_u.d_child, &target->d_parent->d_subdirs);
2554 /* And add them back to the (new) parent lists */ 2534 list_move(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2555 list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); 2535 if (exchange)
2536 fsnotify_d_move(target);
2537 fsnotify_d_move(dentry);
2556 } 2538 }
2557 2539
2558 list_add(&dentry->d_u.d_child, &dentry->d_parent->d_subdirs);
2559
2560 write_seqcount_end(&target->d_seq); 2540 write_seqcount_end(&target->d_seq);
2561 write_seqcount_end(&dentry->d_seq); 2541 write_seqcount_end(&dentry->d_seq);
2562 2542
2563 dentry_unlock_parents_for_move(dentry, target); 2543 dentry_unlock_for_move(dentry, target);
2564 if (exchange)
2565 fsnotify_d_move(target);
2566 spin_unlock(&target->d_lock);
2567 fsnotify_d_move(dentry);
2568 spin_unlock(&dentry->d_lock);
2569} 2544}
2570 2545
2571/* 2546/*
@@ -2650,10 +2625,8 @@ static struct dentry *__d_unalias(struct inode *inode,
2650 goto out_err; 2625 goto out_err;
2651 m2 = &alias->d_parent->d_inode->i_mutex; 2626 m2 = &alias->d_parent->d_inode->i_mutex;
2652out_unalias: 2627out_unalias:
2653 if (likely(!d_mountpoint(alias))) { 2628 __d_move(alias, dentry, false);
2654 __d_move(alias, dentry, false); 2629 ret = alias;
2655 ret = alias;
2656 }
2657out_err: 2630out_err:
2658 spin_unlock(&inode->i_lock); 2631 spin_unlock(&inode->i_lock);
2659 if (m2) 2632 if (m2)
@@ -2663,38 +2636,71 @@ out_err:
2663 return ret; 2636 return ret;
2664} 2637}
2665 2638
2666/* 2639/**
2667 * Prepare an anonymous dentry for life in the superblock's dentry tree as a 2640 * d_splice_alias - splice a disconnected dentry into the tree if one exists
2668 * named dentry in place of the dentry to be replaced. 2641 * @inode: the inode which may have a disconnected dentry
2669 * returns with anon->d_lock held! 2642 * @dentry: a negative dentry which we want to point to the inode.
2643 *
2644 * If inode is a directory and has an IS_ROOT alias, then d_move that in
2645 * place of the given dentry and return it, else simply d_add the inode
2646 * to the dentry and return NULL.
2647 *
2648 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
2649 * we should error out: directories can't have multiple aliases.
2650 *
2651 * This is needed in the lookup routine of any filesystem that is exportable
2652 * (via knfsd) so that we can build dcache paths to directories effectively.
2653 *
2654 * If a dentry was found and moved, then it is returned. Otherwise NULL
2655 * is returned. This matches the expected return value of ->lookup.
2656 *
2657 * Cluster filesystems may call this function with a negative, hashed dentry.
2658 * In that case, we know that the inode will be a regular file, and also this
2659 * will only occur during atomic_open. So we need to check for the dentry
2660 * being already hashed only in the final case.
2670 */ 2661 */
2671static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) 2662struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
2672{ 2663{
2673 struct dentry *dparent; 2664 struct dentry *new = NULL;
2674
2675 dentry_lock_for_move(anon, dentry);
2676
2677 write_seqcount_begin(&dentry->d_seq);
2678 write_seqcount_begin_nested(&anon->d_seq, DENTRY_D_LOCK_NESTED);
2679
2680 dparent = dentry->d_parent;
2681
2682 switch_names(dentry, anon);
2683 swap(dentry->d_name.hash, anon->d_name.hash);
2684
2685 dentry->d_parent = dentry;
2686 list_del_init(&dentry->d_u.d_child);
2687 anon->d_parent = dparent;
2688 list_move(&anon->d_u.d_child, &dparent->d_subdirs);
2689
2690 write_seqcount_end(&dentry->d_seq);
2691 write_seqcount_end(&anon->d_seq);
2692 2665
2693 dentry_unlock_parents_for_move(anon, dentry); 2666 if (IS_ERR(inode))
2694 spin_unlock(&dentry->d_lock); 2667 return ERR_CAST(inode);
2695 2668
2696 /* anon->d_lock still locked, returns locked */ 2669 if (inode && S_ISDIR(inode->i_mode)) {
2670 spin_lock(&inode->i_lock);
2671 new = __d_find_any_alias(inode);
2672 if (new) {
2673 if (!IS_ROOT(new)) {
2674 spin_unlock(&inode->i_lock);
2675 dput(new);
2676 return ERR_PTR(-EIO);
2677 }
2678 if (d_ancestor(new, dentry)) {
2679 spin_unlock(&inode->i_lock);
2680 dput(new);
2681 return ERR_PTR(-EIO);
2682 }
2683 write_seqlock(&rename_lock);
2684 __d_move(new, dentry, false);
2685 write_sequnlock(&rename_lock);
2686 spin_unlock(&inode->i_lock);
2687 security_d_instantiate(new, inode);
2688 iput(inode);
2689 } else {
2690 /* already taking inode->i_lock, so d_add() by hand */
2691 __d_instantiate(dentry, inode);
2692 spin_unlock(&inode->i_lock);
2693 security_d_instantiate(dentry, inode);
2694 d_rehash(dentry);
2695 }
2696 } else {
2697 d_instantiate(dentry, inode);
2698 if (d_unhashed(dentry))
2699 d_rehash(dentry);
2700 }
2701 return new;
2697} 2702}
2703EXPORT_SYMBOL(d_splice_alias);
2698 2704
2699/** 2705/**
2700 * d_materialise_unique - introduce an inode into the tree 2706 * d_materialise_unique - introduce an inode into the tree
@@ -2724,7 +2730,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2724 struct dentry *alias; 2730 struct dentry *alias;
2725 2731
2726 /* Does an aliased dentry already exist? */ 2732 /* Does an aliased dentry already exist? */
2727 alias = __d_find_alias(inode, 0); 2733 alias = __d_find_alias(inode);
2728 if (alias) { 2734 if (alias) {
2729 actual = alias; 2735 actual = alias;
2730 write_seqlock(&rename_lock); 2736 write_seqlock(&rename_lock);
@@ -2736,9 +2742,8 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2736 } else if (IS_ROOT(alias)) { 2742 } else if (IS_ROOT(alias)) {
2737 /* Is this an anonymous mountpoint that we 2743 /* Is this an anonymous mountpoint that we
2738 * could splice into our tree? */ 2744 * could splice into our tree? */
2739 __d_materialise_dentry(dentry, alias); 2745 __d_move(alias, dentry, false);
2740 write_sequnlock(&rename_lock); 2746 write_sequnlock(&rename_lock);
2741 __d_drop(alias);
2742 goto found; 2747 goto found;
2743 } else { 2748 } else {
2744 /* Nope, but we must(!) avoid directory 2749 /* Nope, but we must(!) avoid directory
@@ -2764,13 +2769,9 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
2764 actual = __d_instantiate_unique(dentry, inode); 2769 actual = __d_instantiate_unique(dentry, inode);
2765 if (!actual) 2770 if (!actual)
2766 actual = dentry; 2771 actual = dentry;
2767 else
2768 BUG_ON(!d_unhashed(actual));
2769 2772
2770 spin_lock(&actual->d_lock); 2773 d_rehash(actual);
2771found: 2774found:
2772 _d_rehash(actual);
2773 spin_unlock(&actual->d_lock);
2774 spin_unlock(&inode->i_lock); 2775 spin_unlock(&inode->i_lock);
2775out_nolock: 2776out_nolock:
2776 if (actual == dentry) { 2777 if (actual == dentry) {
@@ -2807,6 +2808,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
2807 * the beginning of the name. The sequence number check at the caller will 2808 * the beginning of the name. The sequence number check at the caller will
2808 * retry it again when a d_move() does happen. So any garbage in the buffer 2809 * retry it again when a d_move() does happen. So any garbage in the buffer
2809 * due to mismatched pointer and length will be discarded. 2810 * due to mismatched pointer and length will be discarded.
2811 *
2812 * Data dependency barrier is needed to make sure that we see that terminating
2813 * NUL. Alpha strikes again, film at 11...
2810 */ 2814 */
2811static int prepend_name(char **buffer, int *buflen, struct qstr *name) 2815static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2812{ 2816{
@@ -2814,6 +2818,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
2814 u32 dlen = ACCESS_ONCE(name->len); 2818 u32 dlen = ACCESS_ONCE(name->len);
2815 char *p; 2819 char *p;
2816 2820
2821 smp_read_barrier_depends();
2822
2817 *buflen -= dlen + 1; 2823 *buflen -= dlen + 1;
2818 if (*buflen < 0) 2824 if (*buflen < 0)
2819 return -ENAMETOOLONG; 2825 return -ENAMETOOLONG;
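The smp_read_barrier_depends() addition pairs with the lockless way prepend_name() samples the qstr: the name pointer and length are fetched with ACCESS_ONCE() while a concurrent rename may be rewriting them. On most architectures a load through a just-loaded pointer is ordered automatically; Alpha is the exception, so without the barrier the dereference could observe stale bytes, including a missing terminating NUL. The requirement, annotated as a fragment:

    const char *dname = ACCESS_ONCE(name->name);  /* may race with d_move() */
    u32 dlen = ACCESS_ONCE(name->len);

    smp_read_barrier_depends();     /* Alpha: order dependent loads after the
                                       pointer load, so the bytes (and NUL)
                                       stored before the pointer are visible */

    /* ... copy dlen bytes from dname; a torn pointer/length pair is fine,
       the caller's rename_lock sequence retry discards the garbage ... */

As the new comment says, correctness rests on the seqcount retry at the caller, not on the copied bytes being coherent.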
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 63146295153b..76c08c2beb2f 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -451,7 +451,7 @@ static ssize_t read_file_bool(struct file *file, char __user *user_buf,
451{ 451{
452 char buf[3]; 452 char buf[3];
453 u32 *val = file->private_data; 453 u32 *val = file->private_data;
454 454
455 if (*val) 455 if (*val)
456 buf[0] = 'Y'; 456 buf[0] = 'Y';
457 else 457 else
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 8c41b52da358..1e3b99d3db0d 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -66,7 +66,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
66 break; 66 break;
67 } 67 }
68 } 68 }
69 return inode; 69 return inode;
70} 70}
71 71
72/* SMP-safe */ 72/* SMP-safe */
@@ -317,7 +317,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
317 goto exit; 317 goto exit;
318 318
319 /* If the parent is not specified, we create it in the root. 319 /* If the parent is not specified, we create it in the root.
320 * We need the root dentry to do this, which is in the super 320 * We need the root dentry to do this, which is in the super
321 * block. A pointer to that is in the struct vfsmount that we 321 * block. A pointer to that is in the struct vfsmount that we
322 * have around. 322 * have around.
323 */ 323 */
@@ -330,7 +330,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
330 switch (mode & S_IFMT) { 330 switch (mode & S_IFMT) {
331 case S_IFDIR: 331 case S_IFDIR:
332 error = debugfs_mkdir(parent->d_inode, dentry, mode); 332 error = debugfs_mkdir(parent->d_inode, dentry, mode);
333 333
334 break; 334 break;
335 case S_IFLNK: 335 case S_IFLNK:
336 error = debugfs_link(parent->d_inode, dentry, mode, 336 error = debugfs_link(parent->d_inode, dentry, mode,
@@ -534,7 +534,7 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
534 */ 534 */
535void debugfs_remove_recursive(struct dentry *dentry) 535void debugfs_remove_recursive(struct dentry *dentry)
536{ 536{
537 struct dentry *child, *next, *parent; 537 struct dentry *child, *parent;
538 538
539 if (IS_ERR_OR_NULL(dentry)) 539 if (IS_ERR_OR_NULL(dentry))
540 return; 540 return;
@@ -546,30 +546,49 @@ void debugfs_remove_recursive(struct dentry *dentry)
546 parent = dentry; 546 parent = dentry;
547 down: 547 down:
548 mutex_lock(&parent->d_inode->i_mutex); 548 mutex_lock(&parent->d_inode->i_mutex);
549 list_for_each_entry_safe(child, next, &parent->d_subdirs, d_u.d_child) { 549 loop:
550 /*
551 * The parent->d_subdirs is protected by the d_lock. Outside that
552 * lock, the child can be unlinked and queued for freeing, which can
553 * reuse d_u.d_child as the rcu head and corrupt this list.
554 */
555 spin_lock(&parent->d_lock);
556 list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
550 if (!debugfs_positive(child)) 557 if (!debugfs_positive(child))
551 continue; 558 continue;
552 559
553 /* perhaps simple_empty(child) makes more sense */ 560 /* perhaps simple_empty(child) makes more sense */
554 if (!list_empty(&child->d_subdirs)) { 561 if (!list_empty(&child->d_subdirs)) {
562 spin_unlock(&parent->d_lock);
555 mutex_unlock(&parent->d_inode->i_mutex); 563 mutex_unlock(&parent->d_inode->i_mutex);
556 parent = child; 564 parent = child;
557 goto down; 565 goto down;
558 } 566 }
559 up: 567
568 spin_unlock(&parent->d_lock);
569
560 if (!__debugfs_remove(child, parent)) 570 if (!__debugfs_remove(child, parent))
561 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 571 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
572
573 /*
574 * The parent->d_lock protects against the child unlinking
575 * from d_subdirs. When releasing the parent->d_lock we can
576 * no longer trust that the next pointer is valid.
577 * Restart the loop. We'll skip this one with the
578 * debugfs_positive() check.
579 */
580 goto loop;
562 } 581 }
582 spin_unlock(&parent->d_lock);
563 583
564 mutex_unlock(&parent->d_inode->i_mutex); 584 mutex_unlock(&parent->d_inode->i_mutex);
565 child = parent; 585 child = parent;
566 parent = parent->d_parent; 586 parent = parent->d_parent;
567 mutex_lock(&parent->d_inode->i_mutex); 587 mutex_lock(&parent->d_inode->i_mutex);
568 588
569 if (child != dentry) { 589 if (child != dentry)
570 next = list_next_entry(child, d_u.d_child); 590 /* go up */
571 goto up; 591 goto loop;
572 }
573 592
574 if (!__debugfs_remove(child, parent)) 593 if (!__debugfs_remove(child, parent))
575 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 594 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
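The debugfs_remove_recursive() rework is a drop-lock-and-restart pattern: d_subdirs may only be walked under parent->d_lock, the removal work must happen without it, and once the lock is dropped the saved next pointer can be freed under us. The generic shape, with still_pending() and act_on() as hypothetical stand-ins for debugfs_positive() and __debugfs_remove():

    loop:
            spin_lock(&parent->d_lock);
            list_for_each_entry(child, &parent->d_subdirs, d_u.d_child) {
                    if (!still_pending(child))
                            continue;       /* handled on an earlier pass */
                    spin_unlock(&parent->d_lock);
                    act_on(child);          /* may unlink/free list entries */
                    goto loop;              /* next pointer no longer trusted */
            }
            spin_unlock(&parent->d_lock);

Worst case this rescans the list once per removal, which is acceptable for debugfs teardown; the skip check is what prevents reprocessing and guarantees forward progress.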
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 17e39b047de5..e181b6b2e297 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -158,7 +158,7 @@ static inline int dio_refill_pages(struct dio *dio, struct dio_submit *sdio)
158{ 158{
159 ssize_t ret; 159 ssize_t ret;
160 160
161 ret = iov_iter_get_pages(sdio->iter, dio->pages, DIO_PAGES * PAGE_SIZE, 161 ret = iov_iter_get_pages(sdio->iter, dio->pages, LONG_MAX, DIO_PAGES,
162 &sdio->from); 162 &sdio->from);
163 163
164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) { 164 if (ret < 0 && sdio->blocks_available && (dio->rw & WRITE)) {
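The direct-io hunk tracks a widened iov_iter_get_pages() prototype from earlier in this series, which as of this kernel takes both a byte budget and a page-count budget:

    ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
                               size_t maxsize, unsigned maxpages,
                               size_t *start);

dio_refill_pages() wants to be bounded by pages rather than bytes, hence LONG_MAX for maxsize and DIO_PAGES for maxpages; the old call expressed the same limit as DIO_PAGES * PAGE_SIZE bytes.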
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 8d77ba7b1756..1323c568e362 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -718,16 +718,11 @@ static const struct file_operations waiters_fops = {
718 718
719void dlm_delete_debug_file(struct dlm_ls *ls) 719void dlm_delete_debug_file(struct dlm_ls *ls)
720{ 720{
721 if (ls->ls_debug_rsb_dentry) 721 debugfs_remove(ls->ls_debug_rsb_dentry);
722 debugfs_remove(ls->ls_debug_rsb_dentry); 722 debugfs_remove(ls->ls_debug_waiters_dentry);
723 if (ls->ls_debug_waiters_dentry) 723 debugfs_remove(ls->ls_debug_locks_dentry);
724 debugfs_remove(ls->ls_debug_waiters_dentry); 724 debugfs_remove(ls->ls_debug_all_dentry);
725 if (ls->ls_debug_locks_dentry) 725 debugfs_remove(ls->ls_debug_toss_dentry);
726 debugfs_remove(ls->ls_debug_locks_dentry);
727 if (ls->ls_debug_all_dentry)
728 debugfs_remove(ls->ls_debug_all_dentry);
729 if (ls->ls_debug_toss_dentry)
730 debugfs_remove(ls->ls_debug_toss_dentry);
731} 726}
732 727
733int dlm_create_debug_file(struct dlm_ls *ls) 728int dlm_create_debug_file(struct dlm_ls *ls)
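The dlm cleanup leans on debugfs_remove() being a no-op for a NULL (or error-pointer) dentry, so the per-pointer guards were dead weight. The same simplification applies around kfree() and dput(): teardown paths read best when every release call is unconditional and pointers that were never created are simply left NULL.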
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index f704458ea5f5..e0ab3a93eeff 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -30,7 +30,7 @@ struct plock_op {
30 30
31struct plock_xop { 31struct plock_xop {
32 struct plock_op xop; 32 struct plock_op xop;
33 void *callback; 33 int (*callback)(struct file_lock *fl, int result);
34 void *fl; 34 void *fl;
35 void *file; 35 void *file;
36 struct file_lock flc; 36 struct file_lock flc;
@@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op)
190 struct file *file; 190 struct file *file;
191 struct file_lock *fl; 191 struct file_lock *fl;
192 struct file_lock *flc; 192 struct file_lock *flc;
193 int (*notify)(void *, void *, int) = NULL; 193 int (*notify)(struct file_lock *fl, int result) = NULL;
194 struct plock_xop *xop = (struct plock_xop *)op; 194 struct plock_xop *xop = (struct plock_xop *)op;
195 int rv = 0; 195 int rv = 0;
196 196
@@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op)
209 notify = xop->callback; 209 notify = xop->callback;
210 210
211 if (op->info.rv) { 211 if (op->info.rv) {
212 notify(fl, NULL, op->info.rv); 212 notify(fl, op->info.rv);
213 goto out; 213 goto out;
214 } 214 }
215 215
@@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op)
228 (unsigned long long)op->info.number, file, fl); 228 (unsigned long long)op->info.number, file, fl);
229 } 229 }
230 230
231 rv = notify(fl, NULL, 0); 231 rv = notify(fl, 0);
232 if (rv) { 232 if (rv) {
233 /* XXX: We need to cancel the fs lock here: */ 233 /* XXX: We need to cancel the fs lock here: */
234 log_print("dlm_plock_callback: lock granted after lock request " 234 log_print("dlm_plock_callback: lock granted after lock request "
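Typing plock_xop->callback as a real function pointer (and dropping the always-NULL middle argument from the call) turns signature drift into a compile-time error that a void * field silently hides. A runnable user-space sketch of the difference:

    #include <stdio.h>

    struct file_lock { int id; };

    /* typed callback: arity and argument types are checked at assignment */
    typedef int (*lock_notify_t)(struct file_lock *fl, int result);

    static int granted(struct file_lock *fl, int result)
    {
            printf("lock %d: result %d\n", fl->id, result);
            return 0;
    }

    int main(void)
    {
            lock_notify_t notify = granted; /* mismatched function would not compile */
            struct file_lock fl = { .id = 7 };

            return notify(&fl, 0);
    }

With the old void *callback field, any pointer could be stored and the call site had to supply the prototype by hand; the typedef makes the two-argument contract visible wherever the field is used.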
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 9d61947d473a..f3f5e72a29ba 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -206,7 +206,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
206 206
207 rs = (struct rcom_status *)rc_in->rc_buf; 207 rs = (struct rcom_status *)rc_in->rc_buf;
208 208
209 if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) { 209 if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) {
210 status = dlm_recover_status(ls); 210 status = dlm_recover_status(ls);
211 goto do_create; 211 goto do_create;
212 } 212 }
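The rcom change is an endianness fix: rs_flags travels on the wire in little-endian form, so testing DLM_RSF_NEED_SLOTS against the raw field happens to work on little-endian hosts and silently fails on big-endian ones. A self-contained sketch of what le32_to_cpu() does (the flag value here is illustrative):

    #include <stdint.h>

    static inline uint32_t my_le32_to_cpu(uint32_t v)
    {
    #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            return __builtin_bswap32(v);    /* big-endian host: swap bytes */
    #else
            return v;                       /* little-endian host: identity */
    #endif
    }

    #define NEED_SLOTS 0x00000001u

    /* flags arrive little-endian; always convert before testing bits */
    static int need_slots(uint32_t wire_flags)
    {
            return (my_le32_to_cpu(wire_flags) & NEED_SLOTS) != 0;
    }

Bugs of this class are invisible when development and testing happen only on little-endian machines, which is how such omissions slip through.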
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index db0fad3269c0..f5bce9096555 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file)
229 if (rc) { 229 if (rc) {
230 printk(KERN_ERR "%s: Error attempting to initialize " 230 printk(KERN_ERR "%s: Error attempting to initialize "
231 "the lower file for the dentry with name " 231 "the lower file for the dentry with name "
232 "[%s]; rc = [%d]\n", __func__, 232 "[%pd]; rc = [%d]\n", __func__,
233 ecryptfs_dentry->d_name.name, rc); 233 ecryptfs_dentry, rc);
234 goto out_free; 234 goto out_free;
235 } 235 }
236 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) 236 if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE)
@@ -327,7 +327,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
327 struct file *lower_file = ecryptfs_file_to_lower(file); 327 struct file *lower_file = ecryptfs_file_to_lower(file);
328 long rc = -ENOIOCTLCMD; 328 long rc = -ENOIOCTLCMD;
329 329
330 if (lower_file->f_op && lower_file->f_op->compat_ioctl) 330 if (lower_file->f_op->compat_ioctl)
331 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); 331 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
332 return rc; 332 return rc;
333} 333}
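The [%s]/d_name.name to [%pd] conversion uses the vsprintf dentry extension: %pd takes a struct dentry * and prints its name, shortening the call and avoiding a raw d_name.name pointer that a concurrent rename could invalidate mid-print. The call site simply passes the dentry itself, e.g. printk("... [%pd]; rc = [%d]\n", dentry, rc).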
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index d4a9431ec73c..1686dc2da9fd 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir)
53 53
54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) 54static int ecryptfs_inode_test(struct inode *inode, void *lower_inode)
55{ 55{
56 if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) 56 return ecryptfs_inode_to_lower(inode) == lower_inode;
57 return 1;
58 return 0;
59} 57}
60 58
61static int ecryptfs_inode_set(struct inode *inode, void *opaque) 59static int ecryptfs_inode_set(struct inode *inode, void *opaque)
@@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode,
192 190
193 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); 191 lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
194 lower_dir_dentry = lock_parent(lower_dentry); 192 lower_dir_dentry = lock_parent(lower_dentry);
195 if (IS_ERR(lower_dir_dentry)) {
196 ecryptfs_printk(KERN_ERR, "Error locking directory of "
197 "dentry\n");
198 inode = ERR_CAST(lower_dir_dentry);
199 goto out;
200 }
201 rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); 193 rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true);
202 if (rc) { 194 if (rc) {
203 printk(KERN_ERR "%s: Failure to create dentry in lower fs; " 195 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode,
215 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); 207 fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode);
216out_lock: 208out_lock:
217 unlock_dir(lower_dir_dentry); 209 unlock_dir(lower_dir_dentry);
218out:
219 return inode; 210 return inode;
220} 211}
221 212
@@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry,
250 if (rc) { 241 if (rc) {
251 printk(KERN_ERR "%s: Error attempting to initialize " 242 printk(KERN_ERR "%s: Error attempting to initialize "
252 "the lower file for the dentry with name " 243 "the lower file for the dentry with name "
253 "[%s]; rc = [%d]\n", __func__, 244 "[%pd]; rc = [%d]\n", __func__,
254 ecryptfs_dentry->d_name.name, rc); 245 ecryptfs_dentry, rc);
255 goto out; 246 goto out;
256 } 247 }
257 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); 248 rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode);
@@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
313 if (rc) { 304 if (rc) {
314 printk(KERN_ERR "%s: Error attempting to initialize " 305 printk(KERN_ERR "%s: Error attempting to initialize "
315 "the lower file for the dentry with name " 306 "the lower file for the dentry with name "
316 "[%s]; rc = [%d]\n", __func__, 307 "[%pd]; rc = [%d]\n", __func__,
317 dentry->d_name.name, rc); 308 dentry, rc);
318 return rc; 309 return rc;
319 } 310 }
320 311
@@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
418 if (IS_ERR(lower_dentry)) { 409 if (IS_ERR(lower_dentry)) {
419 rc = PTR_ERR(lower_dentry); 410 rc = PTR_ERR(lower_dentry);
420 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " 411 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
421 "[%d] on lower_dentry = [%s]\n", __func__, rc, 412 "[%d] on lower_dentry = [%pd]\n", __func__, rc,
422 ecryptfs_dentry->d_name.name); 413 ecryptfs_dentry);
423 goto out; 414 goto out;
424 } 415 }
425 if (lower_dentry->d_inode) 416 if (lower_dentry->d_inode)
@@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value,
1039 } 1030 }
1040 1031
1041 rc = vfs_setxattr(lower_dentry, name, value, size, flags); 1032 rc = vfs_setxattr(lower_dentry, name, value, size, flags);
1042 if (!rc) 1033 if (!rc && dentry->d_inode)
1043 fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); 1034 fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode);
1044out: 1035out:
1045 return rc; 1036 return rc;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 4725a07f003c..635e8e16a5b7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -26,7 +26,6 @@
26 */ 26 */
27 27
28#include <linux/string.h> 28#include <linux/string.h>
29#include <linux/syscalls.h>
30#include <linux/pagemap.h> 29#include <linux/pagemap.h>
31#include <linux/key.h> 30#include <linux/key.h>
32#include <linux/random.h> 31#include <linux/random.h>
@@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1846 "(Tag 11 not allowed by itself)\n"); 1845 "(Tag 11 not allowed by itself)\n");
1847 rc = -EIO; 1846 rc = -EIO;
1848 goto out_wipe_list; 1847 goto out_wipe_list;
1849 break;
1850 default: 1848 default:
1851 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " 1849 ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] "
1852 "of the file header; hex value of " 1850 "of the file header; hex value of "
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index e57380e5f6bd..286f10b0363b 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void)
434 mutex_lock(&ecryptfs_msg_ctx_lists_mux); 434 mutex_lock(&ecryptfs_msg_ctx_lists_mux);
435 for (i = 0; i < ecryptfs_message_buf_len; i++) { 435 for (i = 0; i < ecryptfs_message_buf_len; i++) {
436 mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); 436 mutex_lock(&ecryptfs_msg_ctx_arr[i].mux);
437 if (ecryptfs_msg_ctx_arr[i].msg) 437 kfree(ecryptfs_msg_ctx_arr[i].msg);
438 kfree(ecryptfs_msg_ctx_arr[i].msg);
439 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); 438 mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux);
440 } 439 }
441 kfree(ecryptfs_msg_ctx_arr); 440 kfree(ecryptfs_msg_ctx_arr);
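The removed check was redundant: kfree() is specified to do nothing when handed NULL. Userspace free() gives the same guarantee, as this runnable sketch shows:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        char *msg = NULL;      /* never allocated, like a ctx->msg left unset */

        free(msg);             /* free(NULL) is a defined no-op, as is kfree(NULL) */
        puts("freeing NULL is safe; no guard needed");
        return 0;
}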
diff --git a/fs/efs/namei.c b/fs/efs/namei.c
index 356c044e2cd3..bbee8f063dfa 100644
--- a/fs/efs/namei.c
+++ b/fs/efs/namei.c
@@ -12,7 +12,8 @@
12#include "efs.h" 12#include "efs.h"
13 13
14 14
15static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { 15static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
16{
16 struct buffer_head *bh; 17 struct buffer_head *bh;
17 18
18 int slot, namelen; 19 int slot, namelen;
@@ -40,10 +41,10 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
40 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { 41 if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) {
41 pr_err("%s(): invalid directory block\n", __func__); 42 pr_err("%s(): invalid directory block\n", __func__);
42 brelse(bh); 43 brelse(bh);
43 return(0); 44 return 0;
44 } 45 }
45 46
46 for(slot = 0; slot < dirblock->slots; slot++) { 47 for (slot = 0; slot < dirblock->slots; slot++) {
47 dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); 48 dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot));
48 49
49 namelen = dirslot->namelen; 50 namelen = dirslot->namelen;
@@ -52,12 +53,12 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len)
52 if ((namelen == len) && (!memcmp(name, nameptr, len))) { 53 if ((namelen == len) && (!memcmp(name, nameptr, len))) {
53 inodenum = be32_to_cpu(dirslot->inode); 54 inodenum = be32_to_cpu(dirslot->inode);
54 brelse(bh); 55 brelse(bh);
55 return(inodenum); 56 return inodenum;
56 } 57 }
57 } 58 }
58 brelse(bh); 59 brelse(bh);
59 } 60 }
60 return(0); 61 return 0;
61} 62}
62 63
63struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 64struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b10b48c2a7af..7bcfff900f05 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1852 goto error_tgt_fput; 1852 goto error_tgt_fput;
1853 1853
1854 /* Check if EPOLLWAKEUP is allowed */ 1854 /* Check if EPOLLWAKEUP is allowed */
1855 ep_take_care_of_epollwakeup(&epds); 1855 if (ep_op_has_event(op))
1856 ep_take_care_of_epollwakeup(&epds);
1856 1857
1857 /* 1858 /*
1858 * We have to check that the file structure underneath the file descriptor 1859 * We have to check that the file structure underneath the file descriptor
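The guard added above exists because EPOLL_CTL_DEL carries no struct epoll_event: nothing was copied in, so epds must not be interpreted for that operation. A runnable userspace reminder that DEL legitimately passes a NULL event:

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int fds[2];
        int epfd = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN };

        if (epfd < 0 || pipe(fds) < 0)
                return 1;
        ev.data.fd = fds[0];
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, fds[0], &ev) < 0)
                return 1;
        /* DEL takes no event payload: NULL is valid here, which is why the
         * kernel must only look at the event for ops that have one. */
        if (epoll_ctl(epfd, EPOLL_CTL_DEL, fds[0], NULL) < 0)
                return 1;
        puts("EPOLL_CTL_DEL with a NULL event succeeded");
        close(fds[0]);
        close(fds[1]);
        close(epfd);
        return 0;
}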
diff --git a/fs/exec.c b/fs/exec.c
index a3d33fe592d6..7302b75a9820 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm)
368 if (!mm) 368 if (!mm)
369 goto err; 369 goto err;
370 370
371 err = init_new_context(current, mm);
372 if (err)
373 goto err;
374
375 err = __bprm_mm_init(bprm); 371 err = __bprm_mm_init(bprm);
376 if (err) 372 if (err)
377 goto err; 373 goto err;
@@ -1216,7 +1212,7 @@ EXPORT_SYMBOL(install_exec_creds);
1216/* 1212/*
1217 * determine how safe it is to execute the proposed program 1213 * determine how safe it is to execute the proposed program
1218 * - the caller must hold ->cred_guard_mutex to protect against 1214 * - the caller must hold ->cred_guard_mutex to protect against
1219 * PTRACE_ATTACH 1215 * PTRACE_ATTACH or seccomp thread-sync
1220 */ 1216 */
1221static void check_unsafe_exec(struct linux_binprm *bprm) 1217static void check_unsafe_exec(struct linux_binprm *bprm)
1222{ 1218{
@@ -1234,7 +1230,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
1234 * This isn't strictly necessary, but it makes it harder for LSMs to 1230 * This isn't strictly necessary, but it makes it harder for LSMs to
1235 * mess up. 1231 * mess up.
1236 */ 1232 */
1237 if (current->no_new_privs) 1233 if (task_no_new_privs(current))
1238 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; 1234 bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;
1239 1235
1240 t = p; 1236 t = p;
@@ -1272,7 +1268,7 @@ int prepare_binprm(struct linux_binprm *bprm)
1272 bprm->cred->egid = current_egid(); 1268 bprm->cred->egid = current_egid();
1273 1269
1274 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && 1270 if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) &&
1275 !current->no_new_privs && 1271 !task_no_new_privs(current) &&
1276 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && 1272 kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) &&
1277 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { 1273 kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) {
1278 /* Set-uid? */ 1274 /* Set-uid? */
@@ -1376,18 +1372,23 @@ int search_binary_handler(struct linux_binprm *bprm)
1376 read_unlock(&binfmt_lock); 1372 read_unlock(&binfmt_lock);
1377 bprm->recursion_depth++; 1373 bprm->recursion_depth++;
1378 retval = fmt->load_binary(bprm); 1374 retval = fmt->load_binary(bprm);
1375 read_lock(&binfmt_lock);
1376 put_binfmt(fmt);
1379 bprm->recursion_depth--; 1377 bprm->recursion_depth--;
1380 if (retval >= 0 || retval != -ENOEXEC || 1378 if (retval < 0 && !bprm->mm) {
1381 bprm->mm == NULL || bprm->file == NULL) { 1379 /* we got to flush_old_exec() and failed after it */
1382 put_binfmt(fmt); 1380 read_unlock(&binfmt_lock);
1381 force_sigsegv(SIGSEGV, current);
1382 return retval;
1383 }
1384 if (retval != -ENOEXEC || !bprm->file) {
1385 read_unlock(&binfmt_lock);
1383 return retval; 1386 return retval;
1384 } 1387 }
1385 read_lock(&binfmt_lock);
1386 put_binfmt(fmt);
1387 } 1388 }
1388 read_unlock(&binfmt_lock); 1389 read_unlock(&binfmt_lock);
1389 1390
1390 if (need_retry && retval == -ENOEXEC) { 1391 if (need_retry) {
1391 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && 1392 if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
1392 printable(bprm->buf[2]) && printable(bprm->buf[3])) 1393 printable(bprm->buf[2]) && printable(bprm->buf[3]))
1393 return retval; 1394 return retval;
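Restated, the reworked loop above distinguishes three outcomes of load_binary(): a failure after flush_old_exec() has destroyed the old image (bprm->mm is gone, so the process must be killed), a hard error or success (return at once), and -ENOEXEC (try the next handler). A compilable toy model of that decision order; every name below is a stand-in, and the bprm->file check is folded away for brevity:

#include <errno.h>
#include <stdio.h>

enum outcome { DECLINE, FAIL_LATE, OK };   /* toy stand-ins */

static int try_load(enum outcome o, int *have_mm)
{
        switch (o) {
        case DECLINE:   return -ENOEXEC;   /* "not my format" */
        case FAIL_LATE: *have_mm = 0;      /* failed after flush_old_exec() */
                        return -EIO;
        default:        return 0;          /* binary loaded */
        }
}

static int search(const enum outcome *o, int n)
{
        int retval = -ENOEXEC, have_mm = 1;

        for (int i = 0; i < n; i++) {
                retval = try_load(o[i], &have_mm);
                if (retval < 0 && !have_mm)
                        return retval;     /* kernel: force_sigsegv() and bail */
                if (retval != -ENOEXEC)
                        return retval;     /* success or a hard error */
                /* -ENOEXEC: this handler declined; try the next one */
        }
        return retval;
}

int main(void)
{
        enum outcome seq[] = { DECLINE, OK };

        printf("result: %d\n", search(seq, 2));
        return 0;
}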
diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c
index 7f20f25c232c..84529b8a331b 100644
--- a/fs/exofs/ore_raid.c
+++ b/fs/exofs/ore_raid.c
@@ -116,7 +116,7 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width,
116 num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, 116 num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa,
117 pages_in_unit - i); 117 pages_in_unit - i);
118 118
119 __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); 119 __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL);
120 if (unlikely(!__a1pa)) { 120 if (unlikely(!__a1pa)) {
121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", 121 ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n",
122 num_a1pa); 122 num_a1pa);
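The kzalloc-to-kcalloc conversion above is about overflow safety: kcalloc(n, size, flags) fails cleanly if n * size would wrap, while the open-coded multiplication can silently request a too-small buffer. Userspace calloc() makes the same promise, as this runnable sketch shows:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        size_t n = ((size_t)-1 / 8) + 2;   /* chosen so n * 8 wraps around */

        printf("n * 8 wraps to just %zu bytes\n", n * 8);

        /* calloc (like kcalloc) checks the multiplication and returns NULL
         * instead of handing back a dangerously short allocation. */
        void *p = calloc(n, 8);
        printf("calloc(n, 8) = %p%s\n", p, p ? "" : "  (overflow caught)");
        free(p);
        return 0;
}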
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 3750031cfa2f..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -161,7 +161,7 @@ static struct kmem_cache * ext2_inode_cachep;
161static struct inode *ext2_alloc_inode(struct super_block *sb) 161static struct inode *ext2_alloc_inode(struct super_block *sb)
162{ 162{
163 struct ext2_inode_info *ei; 163 struct ext2_inode_info *ei;
164 ei = (struct ext2_inode_info *)kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL); 164 ei = kmem_cache_alloc(ext2_inode_cachep, GFP_KERNEL);
165 if (!ei) 165 if (!ei)
166 return NULL; 166 return NULL;
167 ei->i_block_alloc_info = NULL; 167 ei->i_block_alloc_info = NULL;
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); 1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
1068 1068
1069 err = percpu_counter_init(&sbi->s_freeblocks_counter, 1069 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1070 ext2_count_free_blocks(sb)); 1070 ext2_count_free_blocks(sb), GFP_KERNEL);
1071 if (!err) { 1071 if (!err) {
1072 err = percpu_counter_init(&sbi->s_freeinodes_counter, 1072 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1073 ext2_count_free_inodes(sb)); 1073 ext2_count_free_inodes(sb), GFP_KERNEL);
1074 } 1074 }
1075 if (!err) { 1075 if (!err) {
1076 err = percpu_counter_init(&sbi->s_dirs_counter, 1076 err = percpu_counter_init(&sbi->s_dirs_counter,
1077 ext2_count_dirs(sb)); 1077 ext2_count_dirs(sb), GFP_KERNEL);
1078 } 1078 }
1079 if (err) { 1079 if (err) {
1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory"); 1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
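Context for the extra GFP_KERNEL argument here and in the ext3 hunks below: percpu_counter_init() now allocates its per-CPU counters itself, so callers must state their allocation context. A minimal kernel-style sketch of the updated signature (the counter and function names are illustrative, and this is not a standalone program):

#include <linux/types.h>
#include <linux/gfp.h>
#include <linux/percpu_counter.h>

static struct percpu_counter demo_free_blocks;

/* Mount-time setup runs in process context, so blocking GFP_KERNEL
 * allocations are fine; atomic callers would pass GFP_NOWAIT instead. */
static int demo_counters_init(s64 initial_free)
{
        return percpu_counter_init(&demo_free_blocks, initial_free, GFP_KERNEL);
}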
diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h
index e85ff15a060e..fc3cdcf24aed 100644
--- a/fs/ext3/ext3.h
+++ b/fs/ext3/ext3.h
@@ -237,6 +237,8 @@ struct ext3_new_group_data {
237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 237#define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 238#define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
239 239
240/* Number of supported quota types */
241#define EXT3_MAXQUOTAS 2
240 242
241/* 243/*
242 * Mount options 244 * Mount options
@@ -248,7 +250,7 @@ struct ext3_mount_options {
248 unsigned long s_commit_interval; 250 unsigned long s_commit_interval;
249#ifdef CONFIG_QUOTA 251#ifdef CONFIG_QUOTA
250 int s_jquota_fmt; 252 int s_jquota_fmt;
251 char *s_qf_names[MAXQUOTAS]; 253 char *s_qf_names[EXT3_MAXQUOTAS];
252#endif 254#endif
253}; 255};
254 256
@@ -669,7 +671,7 @@ struct ext3_sb_info {
669 unsigned long s_commit_interval; 671 unsigned long s_commit_interval;
670 struct block_device *journal_bdev; 672 struct block_device *journal_bdev;
671#ifdef CONFIG_QUOTA 673#ifdef CONFIG_QUOTA
672 char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ 674 char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */
673 int s_jquota_fmt; /* Format of quota to use */ 675 int s_jquota_fmt; /* Format of quota to use */
674#endif 676#endif
675}; 677};
@@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations;
1183#define EXT3_QUOTA_INIT_BLOCKS(sb) 0 1185#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
1184#define EXT3_QUOTA_DEL_BLOCKS(sb) 0 1186#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
1185#endif 1187#endif
1186#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) 1188#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb))
1187#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) 1189#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb))
1188#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) 1190#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb))
1189 1191
1190int 1192int
1191ext3_mark_iloc_dirty(handle_t *handle, 1193ext3_mark_iloc_dirty(handle_t *handle,
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 08cdfe5461e3..7015db0bafd1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb)
441 percpu_counter_destroy(&sbi->s_dirs_counter); 441 percpu_counter_destroy(&sbi->s_dirs_counter);
442 brelse(sbi->s_sbh); 442 brelse(sbi->s_sbh);
443#ifdef CONFIG_QUOTA 443#ifdef CONFIG_QUOTA
444 for (i = 0; i < MAXQUOTAS; i++) 444 for (i = 0; i < EXT3_MAXQUOTAS; i++)
445 kfree(sbi->s_qf_names[i]); 445 kfree(sbi->s_qf_names[i]);
446#endif 446#endif
447 447
@@ -1555,7 +1555,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1555 /* Needed for iput() to work correctly and not trash data */ 1555 /* Needed for iput() to work correctly and not trash data */
1556 sb->s_flags |= MS_ACTIVE; 1556 sb->s_flags |= MS_ACTIVE;
1557 /* Turn on quotas so that they are updated correctly */ 1557 /* Turn on quotas so that they are updated correctly */
1558 for (i = 0; i < MAXQUOTAS; i++) { 1558 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1559 if (EXT3_SB(sb)->s_qf_names[i]) { 1559 if (EXT3_SB(sb)->s_qf_names[i]) {
1560 int ret = ext3_quota_on_mount(sb, i); 1560 int ret = ext3_quota_on_mount(sb, i);
1561 if (ret < 0) 1561 if (ret < 0)
@@ -1606,7 +1606,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1606 PLURAL(nr_truncates)); 1606 PLURAL(nr_truncates));
1607#ifdef CONFIG_QUOTA 1607#ifdef CONFIG_QUOTA
1608 /* Turn quotas off */ 1608 /* Turn quotas off */
1609 for (i = 0; i < MAXQUOTAS; i++) { 1609 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
1610 if (sb_dqopt(sb)->files[i]) 1610 if (sb_dqopt(sb)->files[i])
1611 dquot_quota_off(sb, i); 1611 dquot_quota_off(sb, i);
1612 } 1612 }
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2039 goto failed_mount2; 2039 goto failed_mount2;
2040 } 2040 }
2041 err = percpu_counter_init(&sbi->s_freeblocks_counter, 2041 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2042 ext3_count_free_blocks(sb)); 2042 ext3_count_free_blocks(sb), GFP_KERNEL);
2043 if (!err) { 2043 if (!err) {
2044 err = percpu_counter_init(&sbi->s_freeinodes_counter, 2044 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2045 ext3_count_free_inodes(sb)); 2045 ext3_count_free_inodes(sb), GFP_KERNEL);
2046 } 2046 }
2047 if (!err) { 2047 if (!err) {
2048 err = percpu_counter_init(&sbi->s_dirs_counter, 2048 err = percpu_counter_init(&sbi->s_dirs_counter,
2049 ext3_count_dirs(sb)); 2049 ext3_count_dirs(sb), GFP_KERNEL);
2050 } 2050 }
2051 if (err) { 2051 if (err) {
2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
@@ -2139,7 +2139,7 @@ failed_mount2:
2139 kfree(sbi->s_group_desc); 2139 kfree(sbi->s_group_desc);
2140failed_mount: 2140failed_mount:
2141#ifdef CONFIG_QUOTA 2141#ifdef CONFIG_QUOTA
2142 for (i = 0; i < MAXQUOTAS; i++) 2142 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2143 kfree(sbi->s_qf_names[i]); 2143 kfree(sbi->s_qf_names[i]);
2144#endif 2144#endif
2145 ext3_blkdev_remove(sbi); 2145 ext3_blkdev_remove(sbi);
@@ -2659,7 +2659,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2659 old_opts.s_commit_interval = sbi->s_commit_interval; 2659 old_opts.s_commit_interval = sbi->s_commit_interval;
2660#ifdef CONFIG_QUOTA 2660#ifdef CONFIG_QUOTA
2661 old_opts.s_jquota_fmt = sbi->s_jquota_fmt; 2661 old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
2662 for (i = 0; i < MAXQUOTAS; i++) 2662 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2663 if (sbi->s_qf_names[i]) { 2663 if (sbi->s_qf_names[i]) {
2664 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], 2664 old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i],
2665 GFP_KERNEL); 2665 GFP_KERNEL);
@@ -2763,7 +2763,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2763 } 2763 }
2764#ifdef CONFIG_QUOTA 2764#ifdef CONFIG_QUOTA
2765 /* Release old quota file names */ 2765 /* Release old quota file names */
2766 for (i = 0; i < MAXQUOTAS; i++) 2766 for (i = 0; i < EXT3_MAXQUOTAS; i++)
2767 kfree(old_opts.s_qf_names[i]); 2767 kfree(old_opts.s_qf_names[i]);
2768#endif 2768#endif
2769 if (enable_quota) 2769 if (enable_quota)
@@ -2777,7 +2777,7 @@ restore_opts:
2777 sbi->s_commit_interval = old_opts.s_commit_interval; 2777 sbi->s_commit_interval = old_opts.s_commit_interval;
2778#ifdef CONFIG_QUOTA 2778#ifdef CONFIG_QUOTA
2779 sbi->s_jquota_fmt = old_opts.s_jquota_fmt; 2779 sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
2780 for (i = 0; i < MAXQUOTAS; i++) { 2780 for (i = 0; i < EXT3_MAXQUOTAS; i++) {
2781 kfree(sbi->s_qf_names[i]); 2781 kfree(sbi->s_qf_names[i]);
2782 sbi->s_qf_names[i] = old_opts.s_qf_names[i]; 2782 sbi->s_qf_names[i] = old_opts.s_qf_names[i];
2783 } 2783 }
@@ -2828,8 +2828,9 @@ static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf)
2828 */ 2828 */
2829 overhead += ngroups * (2 + sbi->s_itb_per_group); 2829 overhead += ngroups * (2 + sbi->s_itb_per_group);
2830 2830
2831 /* Add the journal blocks as well */ 2831 /* Add the internal journal blocks as well */
2832 overhead += sbi->s_journal->j_maxlen; 2832 if (sbi->s_journal && !sbi->journal_bdev)
2833 overhead += sbi->s_journal->j_maxlen;
2833 2834
2834 sbi->s_overhead_last = overhead; 2835 sbi->s_overhead_last = overhead;
2835 smp_wmb(); 2836 smp_wmb();
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index fca382037ddd..581ef40fbe90 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -639,7 +639,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
639 if (!(*errp) && 639 if (!(*errp) &&
640 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { 640 ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) {
641 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 641 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
642 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
643 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 642 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
644 dquot_alloc_block_nofail(inode, 643 dquot_alloc_block_nofail(inode,
645 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); 644 EXT4_C2B(EXT4_SB(inode->i_sb), ar.len));
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ef1bed66c14f..0bb3f9ea0832 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -571,6 +571,31 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
571 return 0; 571 return 0;
572} 572}
573 573
574int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
575 int buf_size)
576{
577 struct ext4_dir_entry_2 *de;
578 int nlen, rlen;
579 unsigned int offset = 0;
580 char *top;
581
582 de = (struct ext4_dir_entry_2 *)buf;
583 top = buf + buf_size;
584 while ((char *) de < top) {
585 if (ext4_check_dir_entry(dir, NULL, de, bh,
586 buf, buf_size, offset))
587 return -EIO;
588 nlen = EXT4_DIR_REC_LEN(de->name_len);
589 rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
590 de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
591 offset += rlen;
592 }
593 if ((char *) de > top)
594 return -EIO;
595
596 return 0;
597}
598
574const struct file_operations ext4_dir_operations = { 599const struct file_operations ext4_dir_operations = {
575 .llseek = ext4_dir_llseek, 600 .llseek = ext4_dir_llseek,
576 .read = generic_read_dir, 601 .read = generic_read_dir,
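ext4_check_all_de() added above walks every ext4_dir_entry_2 in a directory buffer through ext4_check_dir_entry() and reports -EIO on the first malformed record; the fs/ext4/inline.c hunk further down uses it to vet an inline directory before conversion. A hedged kernel-style sketch of a caller (the wrapper name is invented, and fs/ext4/ext4.h context is assumed):

/* Illustrative wrapper: refuse to touch a directory buffer that does
 * not validate, leaving the corruption for e2fsck to repair. */
static int demo_validate_dir_buf(struct inode *dir, struct buffer_head *bh,
                                 void *buf, int buf_size)
{
        int err = ext4_check_all_de(dir, bh, buf, buf_size);

        if (err)   /* -EIO: at least one entry failed ext4_check_dir_entry() */
                ext4_warning(dir->i_sb, "corrupt entries in dir %lu",
                             dir->i_ino);
        return err;
}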
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7cc5a0e23688..b0c225cdb52c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -591,7 +591,6 @@ enum {
591#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 591#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008
592#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 592#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
593#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 593#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020
594#define EXT4_FREE_BLOCKS_RESERVE 0x0040
595 594
596/* 595/*
597 * ioctl commands 596 * ioctl commands
@@ -1826,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
1826/* 1825/*
1827 * Special error return code only used by dx_probe() and its callers. 1826 * Special error return code only used by dx_probe() and its callers.
1828 */ 1827 */
1829#define ERR_BAD_DX_DIR -75000 1828#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
1830 1829
1831/* 1830/*
1832 * Timeout and state flag for lazy initialization inode thread. 1831 * Timeout and state flag for lazy initialization inode thread.
@@ -2029,6 +2028,8 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
2029 2028
2030 return ext4_filetype_table[filetype]; 2029 return ext4_filetype_table[filetype];
2031} 2030}
2031extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
2032 void *buf, int buf_size);
2032 2033
2033/* fsync.c */ 2034/* fsync.c */
2034extern int ext4_sync_file(struct file *, loff_t, loff_t, int); 2035extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
@@ -2144,8 +2145,8 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
2144extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); 2145extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
2145extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); 2146extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
2146extern void ext4_ind_truncate(handle_t *, struct inode *inode); 2147extern void ext4_ind_truncate(handle_t *, struct inode *inode);
2147extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 2148extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
2148 ext4_lblk_t first, ext4_lblk_t stop); 2149 ext4_lblk_t start, ext4_lblk_t end);
2149 2150
2150/* ioctl.c */ 2151/* ioctl.c */
2151extern long ext4_ioctl(struct file *, unsigned int, unsigned long); 2152extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -2453,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
2453 up_write(&EXT4_I(inode)->i_data_sem); 2454 up_write(&EXT4_I(inode)->i_data_sem);
2454} 2455}
2455 2456
2457/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
2458static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
2459{
2460 int changed = 0;
2461
2462 if (newsize > inode->i_size) {
2463 i_size_write(inode, newsize);
2464 changed = 1;
2465 }
2466 if (newsize > EXT4_I(inode)->i_disksize) {
2467 ext4_update_i_disksize(inode, newsize);
2468 changed |= 2;
2469 }
2470 return changed;
2471}
2472
2456struct ext4_group_info { 2473struct ext4_group_info {
2457 unsigned long bb_state; 2474 unsigned long bb_state;
2458 struct rb_root bb_free_root; 2475 struct rb_root bb_free_root;
@@ -2560,7 +2577,6 @@ extern const struct file_operations ext4_file_operations;
2560extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2577extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2561 2578
2562/* inline.c */ 2579/* inline.c */
2563extern int ext4_has_inline_data(struct inode *inode);
2564extern int ext4_get_max_inline_size(struct inode *inode); 2580extern int ext4_get_max_inline_size(struct inode *inode);
2565extern int ext4_find_inline_data_nolock(struct inode *inode); 2581extern int ext4_find_inline_data_nolock(struct inode *inode);
2566extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, 2582extern int ext4_init_inline_data(handle_t *handle, struct inode *inode,
@@ -2626,6 +2642,12 @@ extern void ext4_inline_data_truncate(struct inode *inode, int *has_inline);
2626 2642
2627extern int ext4_convert_inline_data(struct inode *inode); 2643extern int ext4_convert_inline_data(struct inode *inode);
2628 2644
2645static inline int ext4_has_inline_data(struct inode *inode)
2646{
2647 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
2648 EXT4_I(inode)->i_inline_off;
2649}
2650
2629/* namei.c */ 2651/* namei.c */
2630extern const struct inode_operations ext4_dir_inode_operations; 2652extern const struct inode_operations ext4_dir_inode_operations;
2631extern const struct inode_operations ext4_special_inode_operations; 2653extern const struct inode_operations ext4_special_inode_operations;
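The ext4_update_inode_size() helper introduced above returns a small bitmask rather than a boolean: bit 0 reports that i_size grew and bit 1 that i_disksize grew, letting callers skip needless journaling and tell the two cases apart. A hedged kernel-style sketch of the intended calling pattern (the caller name is invented; it assumes i_ctime was just refreshed, as in the extents.c hunk below):

/* Kernel-style sketch, not standalone code.  Must be called with
 * i_mutex held, per the helper's own comment. */
static void demo_grow_inode(handle_t *handle, struct inode *inode,
                            loff_t newsize)
{
        int changed = ext4_update_inode_size(inode, newsize);

        if (changed & 0x1)              /* i_size was raised */
                inode->i_mtime = inode->i_ctime;
        if (changed)                    /* either size moved: journal it */
                ext4_mark_inode_dirty(handle, inode);
}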
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4da228a0e6d0..74292a71b384 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -161,6 +161,8 @@ int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
161 struct inode *inode, struct ext4_ext_path *path) 161 struct inode *inode, struct ext4_ext_path *path)
162{ 162{
163 int err; 163 int err;
164
165 WARN_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
164 if (path->p_bh) { 166 if (path->p_bh) {
165 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh)); 167 ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
166 /* path points to block */ 168 /* path points to block */
@@ -1808,8 +1810,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle,
1808 1810
1809 brelse(path[1].p_bh); 1811 brelse(path[1].p_bh);
1810 ext4_free_blocks(handle, inode, NULL, blk, 1, 1812 ext4_free_blocks(handle, inode, NULL, blk, 1,
1811 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET | 1813 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1812 EXT4_FREE_BLOCKS_RESERVE);
1813} 1814}
1814 1815
1815/* 1816/*
@@ -3253,7 +3254,7 @@ out:
3253 3254
3254fix_extent_len: 3255fix_extent_len:
3255 ex->ee_len = orig_ex.ee_len; 3256 ex->ee_len = orig_ex.ee_len;
3256 ext4_ext_dirty(handle, inode, path + depth); 3257 ext4_ext_dirty(handle, inode, path + path->p_depth);
3257 return err; 3258 return err;
3258} 3259}
3259 3260
@@ -4664,7 +4665,8 @@ retry:
4664} 4665}
4665 4666
4666static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4667static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4667 ext4_lblk_t len, int flags, int mode) 4668 ext4_lblk_t len, loff_t new_size,
4669 int flags, int mode)
4668{ 4670{
4669 struct inode *inode = file_inode(file); 4671 struct inode *inode = file_inode(file);
4670 handle_t *handle; 4672 handle_t *handle;
@@ -4673,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4673 int retries = 0; 4675 int retries = 0;
4674 struct ext4_map_blocks map; 4676 struct ext4_map_blocks map;
4675 unsigned int credits; 4677 unsigned int credits;
4678 loff_t epos;
4676 4679
4677 map.m_lblk = offset; 4680 map.m_lblk = offset;
4681 map.m_len = len;
4678 /* 4682 /*
4679 * Don't normalize the request if it can fit in one extent so 4683 * Don't normalize the request if it can fit in one extent so
4680 * that it doesn't get unnecessarily split into multiple 4684 * that it doesn't get unnecessarily split into multiple
@@ -4689,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
4689 credits = ext4_chunk_trans_blocks(inode, len); 4693 credits = ext4_chunk_trans_blocks(inode, len);
4690 4694
4691retry: 4695retry:
4692 while (ret >= 0 && ret < len) { 4696 while (ret >= 0 && len) {
4693 map.m_lblk = map.m_lblk + ret;
4694 map.m_len = len = len - ret;
4695 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, 4697 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4696 credits); 4698 credits);
4697 if (IS_ERR(handle)) { 4699 if (IS_ERR(handle)) {
@@ -4708,6 +4710,21 @@ retry:
4708 ret2 = ext4_journal_stop(handle); 4710 ret2 = ext4_journal_stop(handle);
4709 break; 4711 break;
4710 } 4712 }
4713 map.m_lblk += ret;
4714 map.m_len = len = len - ret;
4715 epos = (loff_t)map.m_lblk << inode->i_blkbits;
4716 inode->i_ctime = ext4_current_time(inode);
4717 if (new_size) {
4718 if (epos > new_size)
4719 epos = new_size;
4720 if (ext4_update_inode_size(inode, epos) & 0x1)
4721 inode->i_mtime = inode->i_ctime;
4722 } else {
4723 if (epos > inode->i_size)
4724 ext4_set_inode_flag(inode,
4725 EXT4_INODE_EOFBLOCKS);
4726 }
4727 ext4_mark_inode_dirty(handle, inode);
4711 ret2 = ext4_journal_stop(handle); 4728 ret2 = ext4_journal_stop(handle);
4712 if (ret2) 4729 if (ret2)
4713 break; 4730 break;
@@ -4730,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4730 loff_t new_size = 0; 4747 loff_t new_size = 0;
4731 int ret = 0; 4748 int ret = 0;
4732 int flags; 4749 int flags;
4733 int partial; 4750 int credits;
4751 int partial_begin, partial_end;
4734 loff_t start, end; 4752 loff_t start, end;
4735 ext4_lblk_t lblk; 4753 ext4_lblk_t lblk;
4736 struct address_space *mapping = inode->i_mapping; 4754 struct address_space *mapping = inode->i_mapping;
@@ -4770,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4770 4788
4771 if (start < offset || end > offset + len) 4789 if (start < offset || end > offset + len)
4772 return -EINVAL; 4790 return -EINVAL;
4773 partial = (offset + len) & ((1 << blkbits) - 1); 4791 partial_begin = offset & ((1 << blkbits) - 1);
4792 partial_end = (offset + len) & ((1 << blkbits) - 1);
4774 4793
4775 lblk = start >> blkbits; 4794 lblk = start >> blkbits;
4776 max_blocks = (end >> blkbits); 4795 max_blocks = (end >> blkbits);
@@ -4804,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4804 * If we have a partial block after EOF we have to allocate 4823 * If we have a partial block after EOF we have to allocate
4805 * the entire block. 4824 * the entire block.
4806 */ 4825 */
4807 if (partial) 4826 if (partial_end)
4808 max_blocks += 1; 4827 max_blocks += 1;
4809 } 4828 }
4810 4829
@@ -4812,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4812 4831
4813 /* Now release the pages and zero block aligned part of pages*/ 4832 /* Now release the pages and zero block aligned part of pages*/
4814 truncate_pagecache_range(inode, start, end - 1); 4833 truncate_pagecache_range(inode, start, end - 1);
4834 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4815 4835
4816 /* Wait all existing dio workers, newcomers will block on i_mutex */ 4836 /* Wait all existing dio workers, newcomers will block on i_mutex */
4817 ext4_inode_block_unlocked_dio(inode); 4837 ext4_inode_block_unlocked_dio(inode);
@@ -4824,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4824 if (ret) 4844 if (ret)
4825 goto out_dio; 4845 goto out_dio;
4826 4846
4827 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, 4847 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4828 mode); 4848 flags, mode);
4829 if (ret) 4849 if (ret)
4830 goto out_dio; 4850 goto out_dio;
4831 } 4851 }
4852 if (!partial_begin && !partial_end)
4853 goto out_dio;
4832 4854
4833 handle = ext4_journal_start(inode, EXT4_HT_MISC, 4); 4855 /*
 4856 * In the worst case we have to write out two nonadjacent unwritten
4857 * blocks and update the inode
4858 */
4859 credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
4860 if (ext4_should_journal_data(inode))
4861 credits += 2;
4862 handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
4834 if (IS_ERR(handle)) { 4863 if (IS_ERR(handle)) {
4835 ret = PTR_ERR(handle); 4864 ret = PTR_ERR(handle);
4836 ext4_std_error(inode->i_sb, ret); 4865 ext4_std_error(inode->i_sb, ret);
@@ -4838,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4838 } 4867 }
4839 4868
4840 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4869 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4841
4842 if (new_size) { 4870 if (new_size) {
4843 if (new_size > i_size_read(inode)) 4871 ext4_update_inode_size(inode, new_size);
4844 i_size_write(inode, new_size);
4845 if (new_size > EXT4_I(inode)->i_disksize)
4846 ext4_update_i_disksize(inode, new_size);
4847 } else { 4872 } else {
4848 /* 4873 /*
4849 * Mark that we allocate beyond EOF so the subsequent truncate 4874 * Mark that we allocate beyond EOF so the subsequent truncate
@@ -4852,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
4852 if ((offset + len) > i_size_read(inode)) 4877 if ((offset + len) > i_size_read(inode))
4853 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS); 4878 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4854 } 4879 }
4855
4856 ext4_mark_inode_dirty(handle, inode); 4880 ext4_mark_inode_dirty(handle, inode);
4857 4881
4858 /* Zero out partial block at the edges of the range */ 4882 /* Zero out partial block at the edges of the range */
@@ -4879,13 +4903,11 @@ out_mutex:
4879long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4903long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4880{ 4904{
4881 struct inode *inode = file_inode(file); 4905 struct inode *inode = file_inode(file);
4882 handle_t *handle;
4883 loff_t new_size = 0; 4906 loff_t new_size = 0;
4884 unsigned int max_blocks; 4907 unsigned int max_blocks;
4885 int ret = 0; 4908 int ret = 0;
4886 int flags; 4909 int flags;
4887 ext4_lblk_t lblk; 4910 ext4_lblk_t lblk;
4888 struct timespec tv;
4889 unsigned int blkbits = inode->i_blkbits; 4911 unsigned int blkbits = inode->i_blkbits;
4890 4912
4891 /* Return error if mode is not supported */ 4913 /* Return error if mode is not supported */
@@ -4936,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4936 goto out; 4958 goto out;
4937 } 4959 }
4938 4960
4939 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode); 4961 ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
4962 flags, mode);
4940 if (ret) 4963 if (ret)
4941 goto out; 4964 goto out;
4942 4965
4943 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4966 if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
4944 if (IS_ERR(handle)) 4967 ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
4945 goto out; 4968 EXT4_I(inode)->i_sync_tid);
4946
4947 tv = inode->i_ctime = ext4_current_time(inode);
4948
4949 if (new_size) {
4950 if (new_size > i_size_read(inode)) {
4951 i_size_write(inode, new_size);
4952 inode->i_mtime = tv;
4953 }
4954 if (new_size > EXT4_I(inode)->i_disksize)
4955 ext4_update_i_disksize(inode, new_size);
4956 } else {
4957 /*
4958 * Mark that we allocate beyond EOF so the subsequent truncate
4959 * can proceed even if the new size is the same as i_size.
4960 */
4961 if ((offset + len) > i_size_read(inode))
4962 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4963 } 4969 }
4964 ext4_mark_inode_dirty(handle, inode);
4965 if (file->f_flags & O_SYNC)
4966 ext4_handle_sync(handle);
4967
4968 ext4_journal_stop(handle);
4969out: 4970out:
4970 mutex_unlock(&inode->i_mutex); 4971 mutex_unlock(&inode->i_mutex);
4971 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4972 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
@@ -5403,16 +5404,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
5403 int ret; 5404 int ret;
5404 5405
5405 /* Collapse range works only on fs block size aligned offsets. */ 5406 /* Collapse range works only on fs block size aligned offsets. */
5406 if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || 5407 if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
5407 len & (EXT4_BLOCK_SIZE(sb) - 1)) 5408 len & (EXT4_CLUSTER_SIZE(sb) - 1))
5408 return -EINVAL; 5409 return -EINVAL;
5409 5410
5410 if (!S_ISREG(inode->i_mode)) 5411 if (!S_ISREG(inode->i_mode))
5411 return -EINVAL; 5412 return -EINVAL;
5412 5413
5413 if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1)
5414 return -EOPNOTSUPP;
5415
5416 trace_ext4_collapse_range(inode, offset, len); 5414 trace_ext4_collapse_range(inode, offset, len);
5417 5415
5418 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5416 punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);
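The reworked ext4_alloc_file_blocks() above pushes i_size and i_disksize forward inside each transaction instead of once at the end, so an interrupted fallocate can no longer leave allocated blocks beyond the recorded size. A hedged sketch of just that bookkeeping step (the helper name is invented, ext4 headers are assumed, and the !new_size EOFBLOCKS branch from the hunk is omitted):

/* Sketch of the per-iteration update: 'next' is the first unallocated
 * logical block after this transaction's ext4_map_blocks() call. */
static void demo_advance_size(handle_t *handle, struct inode *inode,
                              ext4_lblk_t next, loff_t new_size)
{
        loff_t epos = (loff_t)next << inode->i_blkbits;

        inode->i_ctime = ext4_current_time(inode);
        if (new_size && epos > new_size)
                epos = new_size;                 /* never overshoot the goal */
        if (ext4_update_inode_size(inode, epos) & 0x1)
                inode->i_mtime = inode->i_ctime; /* i_size moved */
        ext4_mark_inode_dirty(handle, inode);
}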
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8695f70af1ef..aca7b24a4432 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -200,10 +200,6 @@ static const struct vm_operations_struct ext4_file_vm_ops = {
200 200
201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
202{ 202{
203 struct address_space *mapping = file->f_mapping;
204
205 if (!mapping->a_ops->readpage)
206 return -ENOEXEC;
207 file_accessed(file); 203 file_accessed(file);
208 vma->vm_ops = &ext4_file_vm_ops; 204 vma->vm_ops = &ext4_file_vm_ops;
209 return 0; 205 return 0;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index fd69da194826..e75f840000a0 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -1295,97 +1295,220 @@ do_indirects:
1295 } 1295 }
1296} 1296}
1297 1297
1298static int free_hole_blocks(handle_t *handle, struct inode *inode, 1298/**
1299 struct buffer_head *parent_bh, __le32 *i_data, 1299 * ext4_ind_remove_space - remove space from the range
1300 int level, ext4_lblk_t first, 1300 * @handle: JBD handle for this transaction
1301 ext4_lblk_t count, int max) 1301 * @inode: inode we are dealing with
1302 * @start: First block to remove
1303 * @end: One block after the last block to remove (exclusive)
1304 *
 1305 * Free the blocks in the defined range (end is the exclusive endpoint of
 1306 * the range). This is used by ext4_punch_hole().
1307 */
1308int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
1309 ext4_lblk_t start, ext4_lblk_t end)
1302{ 1310{
1303 struct buffer_head *bh = NULL; 1311 struct ext4_inode_info *ei = EXT4_I(inode);
1312 __le32 *i_data = ei->i_data;
1304 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1313 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb);
1305 int ret = 0; 1314 ext4_lblk_t offsets[4], offsets2[4];
1306 int i, inc; 1315 Indirect chain[4], chain2[4];
1307 ext4_lblk_t offset; 1316 Indirect *partial, *partial2;
1308 __le32 blk; 1317 ext4_lblk_t max_block;
1309 1318 __le32 nr = 0, nr2 = 0;
1310 inc = 1 << ((EXT4_BLOCK_SIZE_BITS(inode->i_sb) - 2) * level); 1319 int n = 0, n2 = 0;
1311 for (i = 0, offset = 0; i < max; i++, i_data++, offset += inc) { 1320 unsigned blocksize = inode->i_sb->s_blocksize;
1312 if (offset >= count + first)
1313 break;
1314 if (*i_data == 0 || (offset + inc) <= first)
1315 continue;
1316 blk = *i_data;
1317 if (level > 0) {
1318 ext4_lblk_t first2;
1319 ext4_lblk_t count2;
1320 1321
1321 bh = sb_bread(inode->i_sb, le32_to_cpu(blk)); 1322 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
1322 if (!bh) { 1323 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
1323 EXT4_ERROR_INODE_BLOCK(inode, le32_to_cpu(blk), 1324 if (end >= max_block)
1324 "Read failure"); 1325 end = max_block;
1325 return -EIO; 1326 if ((start >= end) || (start > max_block))
1326 } 1327 return 0;
1327 if (first > offset) { 1328
1328 first2 = first - offset; 1329 n = ext4_block_to_path(inode, start, offsets, NULL);
1329 count2 = count; 1330 n2 = ext4_block_to_path(inode, end, offsets2, NULL);
1331
1332 BUG_ON(n > n2);
1333
1334 if ((n == 1) && (n == n2)) {
1335 /* We're punching only within direct block range */
1336 ext4_free_data(handle, inode, NULL, i_data + offsets[0],
1337 i_data + offsets2[0]);
1338 return 0;
1339 } else if (n2 > n) {
1340 /*
 1341 * Start and end are on different levels, so we're going to
 1342 * free a partial block at the start and a partial block at the
 1343 * end of the range. If there are levels in between, the
 1344 * do_indirects label will take care of those.
1345 */
1346
1347 if (n == 1) {
1348 /*
1349 * Start is at the direct block level, free
1350 * everything to the end of the level.
1351 */
1352 ext4_free_data(handle, inode, NULL, i_data + offsets[0],
1353 i_data + EXT4_NDIR_BLOCKS);
1354 goto end_range;
1355 }
1356
1357
1358 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1359 if (nr) {
1360 if (partial == chain) {
1361 /* Shared branch grows from the inode */
1362 ext4_free_branches(handle, inode, NULL,
1363 &nr, &nr+1, (chain+n-1) - partial);
1364 *partial->p = 0;
1330 } else { 1365 } else {
1331 first2 = 0; 1366 /* Shared branch grows from an indirect block */
1332 count2 = count - (offset - first); 1367 BUFFER_TRACE(partial->bh, "get_write_access");
1368 ext4_free_branches(handle, inode, partial->bh,
1369 partial->p,
1370 partial->p+1, (chain+n-1) - partial);
1333 } 1371 }
1334 ret = free_hole_blocks(handle, inode, bh, 1372 }
1335 (__le32 *)bh->b_data, level - 1, 1373
1336 first2, count2, 1374 /*
1337 inode->i_sb->s_blocksize >> 2); 1375 * Clear the ends of indirect blocks on the shared branch
1338 if (ret) { 1376 * at the start of the range
1339 brelse(bh); 1377 */
1340 goto err; 1378 while (partial > chain) {
1379 ext4_free_branches(handle, inode, partial->bh,
1380 partial->p + 1,
1381 (__le32 *)partial->bh->b_data+addr_per_block,
1382 (chain+n-1) - partial);
1383 BUFFER_TRACE(partial->bh, "call brelse");
1384 brelse(partial->bh);
1385 partial--;
1386 }
1387
1388end_range:
1389 partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
1390 if (nr2) {
1391 if (partial2 == chain2) {
1392 /*
 1393 * Remember, end is exclusive, so here we're at
 1394 * the start of the next level, which we're not
 1395 * going to free. Everything was covered by the start
1396 * of the range.
1397 */
1398 return 0;
1399 } else {
1400 /* Shared branch grows from an indirect block */
1401 partial2--;
1341 } 1402 }
1403 } else {
1404 /*
 1405 * ext4_find_shared returns an Indirect structure that
 1406 * points to the last element which should not be
 1407 * removed by truncate. But this is the end of the range
 1408 * in punch_hole, so we need to point to the next element
1409 */
1410 partial2->p++;
1342 } 1411 }
1343 if (level == 0 || 1412
1344 (bh && all_zeroes((__le32 *)bh->b_data, 1413 /*
1345 (__le32 *)bh->b_data + addr_per_block))) { 1414 * Clear the ends of indirect blocks on the shared branch
1346 ext4_free_data(handle, inode, parent_bh, 1415 * at the end of the range
1347 i_data, i_data + 1); 1416 */
1417 while (partial2 > chain2) {
1418 ext4_free_branches(handle, inode, partial2->bh,
1419 (__le32 *)partial2->bh->b_data,
1420 partial2->p,
1421 (chain2+n2-1) - partial2);
1422 BUFFER_TRACE(partial2->bh, "call brelse");
1423 brelse(partial2->bh);
1424 partial2--;
1348 } 1425 }
1349 brelse(bh); 1426 goto do_indirects;
1350 bh = NULL;
1351 } 1427 }
1352 1428
1353err: 1429 /* Punch happened within the same level (n == n2) */
1354 return ret; 1430 partial = ext4_find_shared(inode, n, offsets, chain, &nr);
1355} 1431 partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2);
1356 1432 /*
 1357int ext4_free_hole_blocks(handle_t *handle, struct inode *inode, 1433 * ext4_find_shared returns an Indirect structure that
 1358 ext4_lblk_t first, ext4_lblk_t stop) 1434 * points to the last element which should not be
 1359{ 1435 * removed by truncate. But this is the end of the range
 1360 int addr_per_block = EXT4_ADDR_PER_BLOCK(inode->i_sb); 1436 * in punch_hole, so we need to point to the next element
1361 int level, ret = 0; 1437 */
1362 int num = EXT4_NDIR_BLOCKS; 1438 partial2->p++;
1363 ext4_lblk_t count, max = EXT4_NDIR_BLOCKS; 1439 while ((partial > chain) || (partial2 > chain2)) {
1364 __le32 *i_data = EXT4_I(inode)->i_data; 1440 /* We're at the same block, so we're almost finished */
1365 1441 if ((partial->bh && partial2->bh) &&
1366 count = stop - first; 1442 (partial->bh->b_blocknr == partial2->bh->b_blocknr)) {
1367 for (level = 0; level < 4; level++, max *= addr_per_block) { 1443 if ((partial > chain) && (partial2 > chain2)) {
1368 if (first < max) { 1444 ext4_free_branches(handle, inode, partial->bh,
1369 ret = free_hole_blocks(handle, inode, NULL, i_data, 1445 partial->p + 1,
1370 level, first, count, num); 1446 partial2->p,
1371 if (ret) 1447 (chain+n-1) - partial);
1372 goto err; 1448 BUFFER_TRACE(partial->bh, "call brelse");
1373 if (count > max - first) 1449 brelse(partial->bh);
1374 count -= max - first; 1450 BUFFER_TRACE(partial2->bh, "call brelse");
1375 else 1451 brelse(partial2->bh);
1376 break; 1452 }
1377 first = 0; 1453 return 0;
1378 } else {
1379 first -= max;
1380 } 1454 }
1381 i_data += num; 1455 /*
1382 if (level == 0) { 1456 * Clear the ends of indirect blocks on the shared branch
1383 num = 1; 1457 * at the start of the range
1384 max = 1; 1458 */
1459 if (partial > chain) {
1460 ext4_free_branches(handle, inode, partial->bh,
1461 partial->p + 1,
1462 (__le32 *)partial->bh->b_data+addr_per_block,
1463 (chain+n-1) - partial);
1464 BUFFER_TRACE(partial->bh, "call brelse");
1465 brelse(partial->bh);
1466 partial--;
1467 }
1468 /*
1469 * Clear the ends of indirect blocks on the shared branch
1470 * at the end of the range
1471 */
1472 if (partial2 > chain2) {
1473 ext4_free_branches(handle, inode, partial2->bh,
1474 (__le32 *)partial2->bh->b_data,
1475 partial2->p,
1476 (chain2+n-1) - partial2);
1477 BUFFER_TRACE(partial2->bh, "call brelse");
1478 brelse(partial2->bh);
1479 partial2--;
1385 } 1480 }
1386 } 1481 }
1387 1482
1388err: 1483do_indirects:
1389 return ret; 1484 /* Kill the remaining (whole) subtrees */
1485 switch (offsets[0]) {
1486 default:
1487 if (++n >= n2)
1488 return 0;
1489 nr = i_data[EXT4_IND_BLOCK];
1490 if (nr) {
1491 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
1492 i_data[EXT4_IND_BLOCK] = 0;
1493 }
1494 case EXT4_IND_BLOCK:
1495 if (++n >= n2)
1496 return 0;
1497 nr = i_data[EXT4_DIND_BLOCK];
1498 if (nr) {
1499 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
1500 i_data[EXT4_DIND_BLOCK] = 0;
1501 }
1502 case EXT4_DIND_BLOCK:
1503 if (++n >= n2)
1504 return 0;
1505 nr = i_data[EXT4_TIND_BLOCK];
1506 if (nr) {
1507 ext4_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
1508 i_data[EXT4_TIND_BLOCK] = 0;
1509 }
1510 case EXT4_TIND_BLOCK:
1511 ;
1512 }
1513 return 0;
1390} 1514}
1391
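Since ext4_ind_remove_space() treats end as exclusive, a punch-hole caller maps its byte range to blocks by rounding the start up and the end down, so only blocks that lie wholly inside the hole are freed. A hedged sketch of that conversion (the function name is invented; partial edge blocks are handled by zeroing, not freeing):

/* Byte range -> [start, end) block range for ext4_ind_remove_space(). */
static void demo_hole_to_blocks(struct inode *inode,
                                loff_t offset, loff_t length,
                                ext4_lblk_t *start, ext4_lblk_t *end)
{
        unsigned int blkbits = inode->i_blkbits;

        *start = (offset + ((loff_t)1 << blkbits) - 1) >> blkbits; /* round up */
        *end   = (offset + length) >> blkbits;                     /* exclusive */
}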
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 645205d8ada6..bea662bd0ca6 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -120,12 +120,6 @@ int ext4_get_max_inline_size(struct inode *inode)
120 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE; 120 return max_inline_size + EXT4_MIN_INLINE_DATA_SIZE;
121} 121}
122 122
123int ext4_has_inline_data(struct inode *inode)
124{
125 return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) &&
126 EXT4_I(inode)->i_inline_off;
127}
128
129/* 123/*
130 * this function does not take xattr_sem, which is OK because it is 124 * this function does not take xattr_sem, which is OK because it is
 131 * currently only used in a code path coming from ext4_iget, before 125 * currently only used in a code path coming from ext4_iget, before
@@ -1178,6 +1172,18 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
1178 if (error < 0) 1172 if (error < 0)
1179 goto out; 1173 goto out;
1180 1174
1175 /*
1176 * Make sure the inline directory entries pass checks before we try to
1177 * convert them, so that we avoid touching stuff that needs fsck.
1178 */
1179 if (S_ISDIR(inode->i_mode)) {
1180 error = ext4_check_all_de(inode, iloc->bh,
1181 buf + EXT4_INLINE_DOTDOT_SIZE,
1182 inline_size - EXT4_INLINE_DOTDOT_SIZE);
1183 if (error)
1184 goto out;
1185 }
1186
1181 error = ext4_destroy_inline_data_nolock(handle, inode); 1187 error = ext4_destroy_inline_data_nolock(handle, inode);
1182 if (error) 1188 if (error)
1183 goto out; 1189 goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8a064734e6eb..3aa26e9117c4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -325,18 +325,6 @@ qsize_t *ext4_get_reserved_space(struct inode *inode)
325#endif 325#endif
326 326
327/* 327/*
328 * Calculate the number of metadata blocks need to reserve
329 * to allocate a block located at @lblock
330 */
331static int ext4_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblock)
332{
333 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
334 return ext4_ext_calc_metadata_amount(inode, lblock);
335
336 return ext4_ind_calc_metadata_amount(inode, lblock);
337}
338
339/*
340 * Called with i_data_sem down, which is important since we can call 328 * Called with i_data_sem down, which is important since we can call
341 * ext4_discard_preallocations() from here. 329 * ext4_discard_preallocations() from here.
342 */ 330 */
@@ -357,35 +345,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
357 used = ei->i_reserved_data_blocks; 345 used = ei->i_reserved_data_blocks;
358 } 346 }
359 347
360 if (unlikely(ei->i_allocated_meta_blocks > ei->i_reserved_meta_blocks)) {
361 ext4_warning(inode->i_sb, "ino %lu, allocated %d "
362 "with only %d reserved metadata blocks "
363 "(releasing %d blocks with reserved %d data blocks)",
364 inode->i_ino, ei->i_allocated_meta_blocks,
365 ei->i_reserved_meta_blocks, used,
366 ei->i_reserved_data_blocks);
367 WARN_ON(1);
368 ei->i_allocated_meta_blocks = ei->i_reserved_meta_blocks;
369 }
370
371 /* Update per-inode reservations */ 348 /* Update per-inode reservations */
372 ei->i_reserved_data_blocks -= used; 349 ei->i_reserved_data_blocks -= used;
373 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 350 percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);
374 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
375 used + ei->i_allocated_meta_blocks);
376 ei->i_allocated_meta_blocks = 0;
377 351
378 if (ei->i_reserved_data_blocks == 0) {
379 /*
380 * We can release all of the reserved metadata blocks
381 * only when we have written all of the delayed
382 * allocation blocks.
383 */
384 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
385 ei->i_reserved_meta_blocks);
386 ei->i_reserved_meta_blocks = 0;
387 ei->i_da_metadata_calc_len = 0;
388 }
389 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 352 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
390 353
391 /* Update quota subsystem for data blocks */ 354 /* Update quota subsystem for data blocks */
@@ -1092,27 +1055,11 @@ static int ext4_write_end(struct file *file,
1092 } else 1055 } else
1093 copied = block_write_end(file, mapping, pos, 1056 copied = block_write_end(file, mapping, pos,
1094 len, copied, page, fsdata); 1057 len, copied, page, fsdata);
1095
1096 /* 1058 /*
1097 * No need to use i_size_read() here, the i_size 1059 * it's important to update i_size while still holding page lock:
 1098 * cannot change under us because we hold i_mutex.
1099 *
1100 * But it's important to update i_size while still holding page lock:
1101 * page writeout could otherwise come in and zero beyond i_size. 1060 * page writeout could otherwise come in and zero beyond i_size.
1102 */ 1061 */
1103 if (pos + copied > inode->i_size) { 1062 i_size_changed = ext4_update_inode_size(inode, pos + copied);
1104 i_size_write(inode, pos + copied);
1105 i_size_changed = 1;
1106 }
1107
1108 if (pos + copied > EXT4_I(inode)->i_disksize) {
1109 /* We need to mark inode dirty even if
1110 * new_i_size is less that inode->i_size
1111 * but greater than i_disksize. (hint delalloc)
1112 */
1113 ext4_update_i_disksize(inode, (pos + copied));
1114 i_size_changed = 1;
1115 }
1116 unlock_page(page); 1063 unlock_page(page);
1117 page_cache_release(page); 1064 page_cache_release(page);
1118 1065
@@ -1160,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file,
1160 int ret = 0, ret2; 1107 int ret = 0, ret2;
1161 int partial = 0; 1108 int partial = 0;
1162 unsigned from, to; 1109 unsigned from, to;
1163 loff_t new_i_size; 1110 int size_changed = 0;
1164 1111
1165 trace_ext4_journalled_write_end(inode, pos, len, copied); 1112 trace_ext4_journalled_write_end(inode, pos, len, copied);
1166 from = pos & (PAGE_CACHE_SIZE - 1); 1113 from = pos & (PAGE_CACHE_SIZE - 1);
@@ -1183,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file,
1183 if (!partial) 1130 if (!partial)
1184 SetPageUptodate(page); 1131 SetPageUptodate(page);
1185 } 1132 }
1186 new_i_size = pos + copied; 1133 size_changed = ext4_update_inode_size(inode, pos + copied);
1187 if (new_i_size > inode->i_size)
1188 i_size_write(inode, pos+copied);
1189 ext4_set_inode_state(inode, EXT4_STATE_JDATA); 1134 ext4_set_inode_state(inode, EXT4_STATE_JDATA);
1190 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; 1135 EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
1191 if (new_i_size > EXT4_I(inode)->i_disksize) { 1136 unlock_page(page);
1192 ext4_update_i_disksize(inode, new_i_size); 1137 page_cache_release(page);
1138
1139 if (size_changed) {
1193 ret2 = ext4_mark_inode_dirty(handle, inode); 1140 ret2 = ext4_mark_inode_dirty(handle, inode);
1194 if (!ret) 1141 if (!ret)
1195 ret = ret2; 1142 ret = ret2;
1196 } 1143 }
1197 1144
1198 unlock_page(page);
1199 page_cache_release(page);
1200 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1145 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1201 /* if we have allocated more blocks and copied 1146 /* if we have allocated more blocks and copied
1202 * less. We will have blocks allocated outside 1147 * less. We will have blocks allocated outside
@@ -1222,49 +1167,6 @@ static int ext4_journalled_write_end(struct file *file,
1222} 1167}
1223 1168
1224/* 1169/*
 1225 * Reserve metadata for a single block located at lblock
1226 */
1227static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
1228{
1229 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1230 struct ext4_inode_info *ei = EXT4_I(inode);
1231 unsigned int md_needed;
1232 ext4_lblk_t save_last_lblock;
1233 int save_len;
1234
1235 /*
1236 * recalculate the amount of metadata blocks to reserve
1237 * in order to allocate nrblocks
1238 * worst case is one extent per block
1239 */
1240 spin_lock(&ei->i_block_reservation_lock);
1241 /*
1242 * ext4_calc_metadata_amount() has side effects, which we have
1243 * to be prepared to undo if we fail to claim space.
1244 */
1245 save_len = ei->i_da_metadata_calc_len;
1246 save_last_lblock = ei->i_da_metadata_calc_last_lblock;
1247 md_needed = EXT4_NUM_B2C(sbi,
1248 ext4_calc_metadata_amount(inode, lblock));
1249 trace_ext4_da_reserve_space(inode, md_needed);
1250
1251 /*
1252 * We do still charge estimated metadata to the sb though;
1253 * we cannot afford to run out of free blocks.
1254 */
1255 if (ext4_claim_free_clusters(sbi, md_needed, 0)) {
1256 ei->i_da_metadata_calc_len = save_len;
1257 ei->i_da_metadata_calc_last_lblock = save_last_lblock;
1258 spin_unlock(&ei->i_block_reservation_lock);
1259 return -ENOSPC;
1260 }
1261 ei->i_reserved_meta_blocks += md_needed;
1262 spin_unlock(&ei->i_block_reservation_lock);
1263
1264 return 0; /* success */
1265}
1266
1267/*
1268 * Reserve a single cluster located at lblock 1170 * Reserve a single cluster located at lblock
1269 */ 1171 */
1270static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) 1172static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
@@ -1273,8 +1175,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1273 struct ext4_inode_info *ei = EXT4_I(inode); 1175 struct ext4_inode_info *ei = EXT4_I(inode);
1274 unsigned int md_needed; 1176 unsigned int md_needed;
1275 int ret; 1177 int ret;
1276 ext4_lblk_t save_last_lblock;
1277 int save_len;
1278 1178
1279 /* 1179 /*
1280 * We will charge metadata quota at writeout time; this saves 1180 * We will charge metadata quota at writeout time; this saves
@@ -1295,25 +1195,15 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
1295 * ext4_calc_metadata_amount() has side effects, which we have 1195 * ext4_calc_metadata_amount() has side effects, which we have
1296 * to be prepared to undo if we fail to claim space. 1196 * to be prepared to undo if we fail to claim space.
1297 */ 1197 */
1298 save_len = ei->i_da_metadata_calc_len; 1198 md_needed = 0;
1299 save_last_lblock = ei->i_da_metadata_calc_last_lblock; 1199 trace_ext4_da_reserve_space(inode, 0);
1300 md_needed = EXT4_NUM_B2C(sbi,
1301 ext4_calc_metadata_amount(inode, lblock));
1302 trace_ext4_da_reserve_space(inode, md_needed);
1303 1200
1304 /* 1201 if (ext4_claim_free_clusters(sbi, 1, 0)) {
1305 * We do still charge estimated metadata to the sb though;
1306 * we cannot afford to run out of free blocks.
1307 */
1308 if (ext4_claim_free_clusters(sbi, md_needed + 1, 0)) {
1309 ei->i_da_metadata_calc_len = save_len;
1310 ei->i_da_metadata_calc_last_lblock = save_last_lblock;
1311 spin_unlock(&ei->i_block_reservation_lock); 1202 spin_unlock(&ei->i_block_reservation_lock);
1312 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); 1203 dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
1313 return -ENOSPC; 1204 return -ENOSPC;
1314 } 1205 }
1315 ei->i_reserved_data_blocks++; 1206 ei->i_reserved_data_blocks++;
1316 ei->i_reserved_meta_blocks += md_needed;
1317 spin_unlock(&ei->i_block_reservation_lock); 1207 spin_unlock(&ei->i_block_reservation_lock);
1318 1208
1319 return 0; /* success */ 1209 return 0; /* success */
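With the up-front metadata reservation gone, ext4_da_reserve_space() claims exactly one cluster and the only rollback left is the quota reservation taken just before it. A self-contained userspace model of that claim-or-roll-back ordering; every name below is an illustrative stand-in, not ext4 API:

	#include <stdbool.h>
	#include <stdio.h>

	static long free_clusters = 1;
	static long quota_reserved;

	static bool claim_cluster(void)
	{
		if (free_clusters > 0) {
			free_clusters--;
			return true;
		}
		return false;
	}

	static void quota_reserve(void) { quota_reserved++; }
	static void quota_release(void) { quota_reserved--; }

	static int reserve_one_block(void)
	{
		quota_reserve();
		if (!claim_cluster()) {
			quota_release();	/* roll back in reverse order */
			return -28;		/* stands in for -ENOSPC */
		}
		return 0;
	}

	int main(void)
	{
		int first = reserve_one_block();
		int second = reserve_one_block();

		printf("first=%d second=%d quota=%ld\n", first, second, quota_reserved);
		return 0;
	}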
@@ -1346,20 +1236,6 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1346 } 1236 }
1347 ei->i_reserved_data_blocks -= to_free; 1237 ei->i_reserved_data_blocks -= to_free;
1348 1238
1349 if (ei->i_reserved_data_blocks == 0) {
1350 /*
1351 * We can release all of the reserved metadata blocks
1352 * only when we have written all of the delayed
1353 * allocation blocks.
1354 * Note that in case of bigalloc, i_reserved_meta_blocks,
1355 * i_reserved_data_blocks, etc. refer to number of clusters.
1356 */
1357 percpu_counter_sub(&sbi->s_dirtyclusters_counter,
1358 ei->i_reserved_meta_blocks);
1359 ei->i_reserved_meta_blocks = 0;
1360 ei->i_da_metadata_calc_len = 0;
1361 }
1362
1363 /* update fs dirty data blocks counter */ 1239 /* update fs dirty data blocks counter */
1364 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); 1240 percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);
1365 1241
@@ -1500,10 +1376,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1500 ext4_msg(sb, KERN_CRIT, "Block reservation details"); 1376 ext4_msg(sb, KERN_CRIT, "Block reservation details");
1501 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", 1377 ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
1502 ei->i_reserved_data_blocks); 1378 ei->i_reserved_data_blocks);
1503 ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
1504 ei->i_reserved_meta_blocks);
1505 ext4_msg(sb, KERN_CRIT, "i_allocated_meta_blocks=%u",
1506 ei->i_allocated_meta_blocks);
1507 return; 1379 return;
1508} 1380}
1509 1381
@@ -1620,13 +1492,6 @@ add_delayed:
1620 retval = ret; 1492 retval = ret;
1621 goto out_unlock; 1493 goto out_unlock;
1622 } 1494 }
1623 } else {
1624 ret = ext4_da_reserve_metadata(inode, iblock);
1625 if (ret) {
1626 /* not enough space to reserve */
1627 retval = ret;
1628 goto out_unlock;
1629 }
1630 } 1495 }
1631 1496
1632 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 1497 ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
@@ -2212,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2212 struct ext4_map_blocks *map = &mpd->map; 2077 struct ext4_map_blocks *map = &mpd->map;
2213 int err; 2078 int err;
2214 loff_t disksize; 2079 loff_t disksize;
2080 int progress = 0;
2215 2081
2216 mpd->io_submit.io_end->offset = 2082 mpd->io_submit.io_end->offset =
2217 ((loff_t)map->m_lblk) << inode->i_blkbits; 2083 ((loff_t)map->m_lblk) << inode->i_blkbits;
@@ -2228,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2228 * is non-zero, a commit should free up blocks. 2094 * is non-zero, a commit should free up blocks.
2229 */ 2095 */
2230 if ((err == -ENOMEM) || 2096 if ((err == -ENOMEM) ||
2231 (err == -ENOSPC && ext4_count_free_clusters(sb))) 2097 (err == -ENOSPC && ext4_count_free_clusters(sb))) {
2098 if (progress)
2099 goto update_disksize;
2232 return err; 2100 return err;
2101 }
2233 ext4_msg(sb, KERN_CRIT, 2102 ext4_msg(sb, KERN_CRIT,
2234 "Delayed block allocation failed for " 2103 "Delayed block allocation failed for "
2235 "inode %lu at logical offset %llu with" 2104 "inode %lu at logical offset %llu with"
@@ -2246,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle,
2246 *give_up_on_write = true; 2115 *give_up_on_write = true;
2247 return err; 2116 return err;
2248 } 2117 }
2118 progress = 1;
2249 /* 2119 /*
2250 * Update buffer state, submit mapped pages, and get us new 2120 * Update buffer state, submit mapped pages, and get us new
2251 * extent to map 2121 * extent to map
2252 */ 2122 */
2253 err = mpage_map_and_submit_buffers(mpd); 2123 err = mpage_map_and_submit_buffers(mpd);
2254 if (err < 0) 2124 if (err < 0)
2255 return err; 2125 goto update_disksize;
2256 } while (map->m_len); 2126 } while (map->m_len);
2257 2127
2128update_disksize:
2258 /* 2129 /*
2259 * Update on-disk size after IO is submitted. Races with 2130 * Update on-disk size after IO is submitted. Races with
2260 * truncate are avoided by checking i_size under i_data_sem. 2131 * truncate are avoided by checking i_size under i_data_sem.
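The progress flag threads one fact through the retry loop: once any extent has been mapped and its pages submitted, even an error return must detour through update_disksize so the on-disk size covers the pages already in flight. A compilable toy model of that control flow; names and error values are invented for illustration:

	#include <stdio.h>

	static int map_extents(int fail_at, int nr_extents)
	{
		int err = 0, progress = 0, i;

		for (i = 0; i < nr_extents; i++) {
			if (i == fail_at) {
				err = -28;		/* stands in for -ENOSPC */
				if (progress)
					goto update_disksize;
				return err;		/* nothing submitted yet */
			}
			progress = 1;			/* extent mapped and submitted */
		}
	update_disksize:
		printf("update disksize (err=%d)\n", err);
		return err;
	}

	int main(void)
	{
		map_extents(2, 4);	/* fails after progress: disksize updated */
		map_extents(0, 4);	/* fails immediately: plain early return */
		return 0;
	}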
@@ -2843,8 +2714,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2843{ 2714{
2844 trace_ext4_alloc_da_blocks(inode); 2715 trace_ext4_alloc_da_blocks(inode);
2845 2716
2846 if (!EXT4_I(inode)->i_reserved_data_blocks && 2717 if (!EXT4_I(inode)->i_reserved_data_blocks)
2847 !EXT4_I(inode)->i_reserved_meta_blocks)
2848 return 0; 2718 return 0;
2849 2719
2850 /* 2720 /*
@@ -3624,7 +3494,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3624 ret = ext4_ext_remove_space(inode, first_block, 3494 ret = ext4_ext_remove_space(inode, first_block,
3625 stop_block - 1); 3495 stop_block - 1);
3626 else 3496 else
3627 ret = ext4_free_hole_blocks(handle, inode, first_block, 3497 ret = ext4_ind_remove_space(handle, inode, first_block,
3628 stop_block); 3498 stop_block);
3629 3499
3630 up_write(&EXT4_I(inode)->i_data_sem); 3500 up_write(&EXT4_I(inode)->i_data_sem);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2dcb936be90e..748c9136a60a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1412 int last = first + count - 1; 1412 int last = first + count - 1;
1413 struct super_block *sb = e4b->bd_sb; 1413 struct super_block *sb = e4b->bd_sb;
1414 1414
1415 if (WARN_ON(count == 0))
1416 return;
1415 BUG_ON(last >= (sb->s_blocksize << 3)); 1417 BUG_ON(last >= (sb->s_blocksize << 3));
1416 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1418 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1417 /* Don't bother if the block group is corrupt. */ 1419 /* Don't bother if the block group is corrupt. */
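The new WARN_ON() turns a zero-length free, a caller bug that would compute last = first - 1 and walk into the bitmap code with a bogus range, into a loud no-op. A userspace sketch of the same guard; the WARN_ON macro below is a stand-in, not the kernel's:

	#include <stdio.h>

	#define WARN_ON(cond) \
		((cond) ? (fprintf(stderr, "WARN: %s\n", #cond), 1) : 0)

	static void free_blocks(int first, int count)
	{
		if (WARN_ON(count == 0))
			return;
		printf("freeing blocks [%d, %d]\n", first, first + count - 1);
	}

	int main(void)
	{
		free_blocks(10, 0);	/* warns and does nothing */
		free_blocks(10, 4);	/* frees 10..13 */
		return 0;
	}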
@@ -3075,8 +3077,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3075 (23 - bsbits)) << 23; 3077 (23 - bsbits)) << 23;
3076 size = 8 * 1024 * 1024; 3078 size = 8 * 1024 * 1024;
3077 } else { 3079 } else {
3078 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 3080 start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3079 size = ac->ac_o_ex.fe_len << bsbits; 3081 size = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3082 ac->ac_o_ex.fe_len) << bsbits;
3080 } 3083 }
3081 size = size >> bsbits; 3084 size = size >> bsbits;
3082 start = start_off >> bsbits; 3085 start = start_off >> bsbits;
@@ -3216,8 +3219,30 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3216static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3219static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3217{ 3220{
3218 struct ext4_prealloc_space *pa = ac->ac_pa; 3221 struct ext4_prealloc_space *pa = ac->ac_pa;
3222 struct ext4_buddy e4b;
3223 int err;
3219 3224
3220 if (pa && pa->pa_type == MB_INODE_PA) 3225 if (pa == NULL) {
3226 if (ac->ac_f_ex.fe_len == 0)
3227 return;
3228 err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3229 if (err) {
3230 /*
3231 * This should never happen since we pin the
3232 * pages in the ext4_allocation_context so
3233 * ext4_mb_load_buddy() should never fail.
3234 */
3235 WARN(1, "mb_load_buddy failed (%d)", err);
3236 return;
3237 }
3238 ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3239 mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3240 ac->ac_f_ex.fe_len);
3241 ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3242 ext4_mb_unload_buddy(&e4b);
3243 return;
3244 }
3245 if (pa->pa_type == MB_INODE_PA)
3221 pa->pa_free += ac->ac_b_ex.fe_len; 3246 pa->pa_free += ac->ac_b_ex.fe_len;
3222} 3247}
3223 3248
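When the allocation context has no preallocation to credit (pa == NULL), the discarded range now goes straight back to the buddy allocator. The protocol, condensed from the hunk above; fragment only, error handling trimmed, not standalone:

	struct ext4_buddy e4b;

	if (ext4_mb_load_buddy(sb, group, &e4b) == 0) {
		ext4_lock_group(sb, group);
		mb_free_blocks(inode, &e4b, start, len);
		ext4_unlock_group(sb, group);
		ext4_mb_unload_buddy(&e4b);
	}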
@@ -4109,7 +4134,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4109 * per cpu locality group is to reduce the contention between block 4134 * per cpu locality group is to reduce the contention between block
4110 * request from multiple CPUs. 4135 * request from multiple CPUs.
4111 */ 4136 */
4112 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); 4137 ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4113 4138
4114 /* we're going to use group allocation */ 4139 /* we're going to use group allocation */
4115 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4140 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
@@ -4627,7 +4652,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4627 struct buffer_head *gd_bh; 4652 struct buffer_head *gd_bh;
4628 ext4_group_t block_group; 4653 ext4_group_t block_group;
4629 struct ext4_sb_info *sbi; 4654 struct ext4_sb_info *sbi;
4630 struct ext4_inode_info *ei = EXT4_I(inode);
4631 struct ext4_buddy e4b; 4655 struct ext4_buddy e4b;
4632 unsigned int count_clusters; 4656 unsigned int count_clusters;
4633 int err = 0; 4657 int err = 0;
@@ -4838,19 +4862,7 @@ do_more:
4838 &sbi->s_flex_groups[flex_group].free_clusters); 4862 &sbi->s_flex_groups[flex_group].free_clusters);
4839 } 4863 }
4840 4864
4841 if (flags & EXT4_FREE_BLOCKS_RESERVE && ei->i_reserved_data_blocks) { 4865 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4842 percpu_counter_add(&sbi->s_dirtyclusters_counter,
4843 count_clusters);
4844 spin_lock(&ei->i_block_reservation_lock);
4845 if (flags & EXT4_FREE_BLOCKS_METADATA)
4846 ei->i_reserved_meta_blocks += count_clusters;
4847 else
4848 ei->i_reserved_data_blocks += count_clusters;
4849 spin_unlock(&ei->i_block_reservation_lock);
4850 if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4851 dquot_reclaim_block(inode,
4852 EXT4_C2B(sbi, count_clusters));
4853 } else if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4854 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); 4866 dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4855 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); 4867 percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4856 4868
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index ec092437d3e0..d3567f27bae7 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -39,6 +39,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
39 newext.ee_block = cpu_to_le32(lb->first_block); 39 newext.ee_block = cpu_to_le32(lb->first_block);
40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); 40 newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1);
41 ext4_ext_store_pblock(&newext, lb->first_pblock); 41 ext4_ext_store_pblock(&newext, lb->first_pblock);
42 /* Locking only for convenience since we are operating on a temp inode */
43 down_write(&EXT4_I(inode)->i_data_sem);
42 path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); 44 path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0);
43 45
44 if (IS_ERR(path)) { 46 if (IS_ERR(path)) {
@@ -61,7 +63,9 @@ static int finish_range(handle_t *handle, struct inode *inode,
61 */ 63 */
62 if (needed && ext4_handle_has_enough_credits(handle, 64 if (needed && ext4_handle_has_enough_credits(handle,
63 EXT4_RESERVE_TRANS_BLOCKS)) { 65 EXT4_RESERVE_TRANS_BLOCKS)) {
66 up_write((&EXT4_I(inode)->i_data_sem));
64 retval = ext4_journal_restart(handle, needed); 67 retval = ext4_journal_restart(handle, needed);
68 down_write((&EXT4_I(inode)->i_data_sem));
65 if (retval) 69 if (retval)
66 goto err_out; 70 goto err_out;
67 } else if (needed) { 71 } else if (needed) {
@@ -70,13 +74,16 @@ static int finish_range(handle_t *handle, struct inode *inode,
70 /* 74 /*
71 * If we are not able to extend the journal, restart the journal 75 * If we are not able to extend the journal, restart the journal
72 */ 76 */
77 up_write((&EXT4_I(inode)->i_data_sem));
73 retval = ext4_journal_restart(handle, needed); 78 retval = ext4_journal_restart(handle, needed);
79 down_write((&EXT4_I(inode)->i_data_sem));
74 if (retval) 80 if (retval)
75 goto err_out; 81 goto err_out;
76 } 82 }
77 } 83 }
78 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); 84 retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
79err_out: 85err_out:
86 up_write((&EXT4_I(inode)->i_data_sem));
80 if (path) { 87 if (path) {
81 ext4_ext_drop_refs(path); 88 ext4_ext_drop_refs(path);
82 kfree(path); 89 kfree(path);
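migrate.c now holds the temporary inode's i_data_sem across the extent lookup and insert, but must not sleep in ext4_journal_restart() while holding it, hence the drop/retake bracketing in the hunks above. A runnable userspace analogue of that bracketing using a pthread rwlock; names are illustrative:

	#include <pthread.h>
	#include <stdio.h>

	static pthread_rwlock_t data_sem = PTHREAD_RWLOCK_INITIALIZER;

	static int journal_restart(void)
	{
		/* may block until other transactions drain */
		return 0;
	}

	static int extend_or_restart(void)
	{
		int err;

		pthread_rwlock_unlock(&data_sem);	/* drop before sleeping */
		err = journal_restart();
		pthread_rwlock_wrlock(&data_sem);	/* retake before use */
		return err;
	}

	int main(void)
	{
		pthread_rwlock_wrlock(&data_sem);
		printf("restart returned %d\n", extend_or_restart());
		pthread_rwlock_unlock(&data_sem);
		return 0;
	}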
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 2484c7ec6a72..671a74b14fd7 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1013,10 +1013,11 @@ data_copy:
1013 *err = -EBUSY; 1013 *err = -EBUSY;
1014 goto unlock_pages; 1014 goto unlock_pages;
1015 } 1015 }
1016 1016 ext4_double_down_write_data_sem(orig_inode, donor_inode);
1017 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, 1017 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
1018 orig_blk_offset, 1018 orig_blk_offset,
1019 block_len_in_page, err); 1019 block_len_in_page, err);
1020 ext4_double_up_write_data_sem(orig_inode, donor_inode);
1020 if (*err) { 1021 if (*err) {
1021 if (replaced_count) { 1022 if (replaced_count) {
1022 block_len_in_page = replaced_count; 1023 block_len_in_page = replaced_count;
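mext_replace_branches() now runs with both inodes' data semaphores held, via the ext4_double_down_write_data_sem()/ext4_double_up_write_data_sem() pair. A toy model of the underlying idea, taking the two locks in one stable order so two concurrent moves on the same pair of inodes cannot deadlock; ordering by address is an illustrative choice here, not necessarily what ext4 does:

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	static void double_down_write(pthread_rwlock_t *a, pthread_rwlock_t *b)
	{
		if ((uintptr_t)a > (uintptr_t)b) {
			pthread_rwlock_t *t = a; a = b; b = t;
		}
		pthread_rwlock_wrlock(a);
		pthread_rwlock_wrlock(b);
	}

	static void double_up_write(pthread_rwlock_t *a, pthread_rwlock_t *b)
	{
		pthread_rwlock_unlock(a);
		pthread_rwlock_unlock(b);
	}

	int main(void)
	{
		pthread_rwlock_t orig = PTHREAD_RWLOCK_INITIALIZER;
		pthread_rwlock_t donor = PTHREAD_RWLOCK_INITIALIZER;

		double_down_write(&orig, &donor);
		puts("both data_sems held around the branch swap");
		double_up_write(&orig, &donor);
		return 0;
	}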
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 3520ab8a6639..603e4ebbd0ac 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1227 buffer */ 1227 buffer */
1228 int num = 0; 1228 int num = 0;
1229 ext4_lblk_t nblocks; 1229 ext4_lblk_t nblocks;
1230 int i, err; 1230 int i, err = 0;
1231 int namelen; 1231 int namelen;
1232 1232
1233 *res_dir = NULL; 1233 *res_dir = NULL;
@@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
1264 * return. Otherwise, fall back to doing a search the 1264 * return. Otherwise, fall back to doing a search the
1265 * old fashioned way. 1265 * old fashioned way.
1266 */ 1266 */
1267 if (bh || (err != ERR_BAD_DX_DIR)) 1267 if (err == -ENOENT)
1268 return NULL;
1269 if (err && err != ERR_BAD_DX_DIR)
1270 return ERR_PTR(err);
1271 if (bh)
1268 return bh; 1272 return bh;
1269 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1273 dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
1270 "falling back\n")); 1274 "falling back\n"));
@@ -1295,6 +1299,11 @@ restart:
1295 } 1299 }
1296 num++; 1300 num++;
1297 bh = ext4_getblk(NULL, dir, b++, 0, &err); 1301 bh = ext4_getblk(NULL, dir, b++, 0, &err);
1302 if (unlikely(err)) {
1303 if (ra_max == 0)
1304 return ERR_PTR(err);
1305 break;
1306 }
1298 bh_use[ra_max] = bh; 1307 bh_use[ra_max] = bh;
1299 if (bh) 1308 if (bh)
1300 ll_rw_block(READ | REQ_META | REQ_PRIO, 1309 ll_rw_block(READ | REQ_META | REQ_PRIO,
@@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
1417 return ERR_PTR(-ENAMETOOLONG); 1426 return ERR_PTR(-ENAMETOOLONG);
1418 1427
1419 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 1428 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
1429 if (IS_ERR(bh))
1430 return (struct dentry *) bh;
1420 inode = NULL; 1431 inode = NULL;
1421 if (bh) { 1432 if (bh) {
1422 __u32 ino = le32_to_cpu(de->inode); 1433 __u32 ino = le32_to_cpu(de->inode);
@@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1450 struct buffer_head *bh; 1461 struct buffer_head *bh;
1451 1462
1452 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); 1463 bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
1464 if (IS_ERR(bh))
1465 return (struct dentry *) bh;
1453 if (!bh) 1466 if (!bh)
1454 return ERR_PTR(-ENOENT); 1467 return ERR_PTR(-ENOENT);
1455 ino = le32_to_cpu(de->inode); 1468 ino = le32_to_cpu(de->inode);
@@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
2727 2740
2728 retval = -ENOENT; 2741 retval = -ENOENT;
2729 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2742 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2743 if (IS_ERR(bh))
2744 return PTR_ERR(bh);
2730 if (!bh) 2745 if (!bh)
2731 goto end_rmdir; 2746 goto end_rmdir;
2732 2747
@@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2794 2809
2795 retval = -ENOENT; 2810 retval = -ENOENT;
2796 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); 2811 bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
2812 if (IS_ERR(bh))
2813 return PTR_ERR(bh);
2797 if (!bh) 2814 if (!bh)
2798 goto end_unlink; 2815 goto end_unlink;
2799 2816
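From here on, ext4_find_entry() can hand back ERR_PTR(err) in addition to NULL (entry not found) and a real buffer head, and every caller grows an IS_ERR() check. A self-contained userspace re-creation of that three-way return convention; the helpers below are simplified stand-ins for the kernel's <linux/err.h>:

	#include <stdio.h>

	#define MAX_ERRNO 4095

	static void *ERR_PTR(long err)        { return (void *)err; }
	static long PTR_ERR(const void *ptr)  { return (long)ptr; }
	static int IS_ERR(const void *ptr)
	{
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	static void *find_entry(int simulate_io_error)
	{
		if (simulate_io_error)
			return ERR_PTR(-5);	/* -EIO from a failed block read */
		return NULL;			/* clean "no such entry" */
	}

	int main(void)
	{
		void *bh = find_entry(1);

		if (IS_ERR(bh))
			printf("propagate %ld to the caller\n", PTR_ERR(bh));
		else if (!bh)
			puts("entry not found");
		return 0;
	}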
@@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3121 struct ext4_dir_entry_2 *de; 3138 struct ext4_dir_entry_2 *de;
3122 3139
3123 bh = ext4_find_entry(dir, d_name, &de, NULL); 3140 bh = ext4_find_entry(dir, d_name, &de, NULL);
3141 if (IS_ERR(bh))
3142 return PTR_ERR(bh);
3124 if (bh) { 3143 if (bh) {
3125 retval = ext4_delete_entry(handle, dir, de, bh); 3144 retval = ext4_delete_entry(handle, dir, de, bh);
3126 brelse(bh); 3145 brelse(bh);
@@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
3128 return retval; 3147 return retval;
3129} 3148}
3130 3149
3131static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent) 3150static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
3151 int force_reread)
3132{ 3152{
3133 int retval; 3153 int retval;
3134 /* 3154 /*
@@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
3140 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino || 3160 if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
3141 ent->de->name_len != ent->dentry->d_name.len || 3161 ent->de->name_len != ent->dentry->d_name.len ||
3142 strncmp(ent->de->name, ent->dentry->d_name.name, 3162 strncmp(ent->de->name, ent->dentry->d_name.name,
3143 ent->de->name_len)) { 3163 ent->de->name_len) ||
3164 force_reread) {
3144 retval = ext4_find_delete_entry(handle, ent->dir, 3165 retval = ext4_find_delete_entry(handle, ent->dir,
3145 &ent->dentry->d_name); 3166 &ent->dentry->d_name);
3146 } else { 3167 } else {
@@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3191 .dentry = new_dentry, 3212 .dentry = new_dentry,
3192 .inode = new_dentry->d_inode, 3213 .inode = new_dentry->d_inode,
3193 }; 3214 };
3215 int force_reread;
3194 int retval; 3216 int retval;
3195 3217
3196 dquot_initialize(old.dir); 3218 dquot_initialize(old.dir);
@@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3202 dquot_initialize(new.inode); 3224 dquot_initialize(new.inode);
3203 3225
3204 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL); 3226 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
3227 if (IS_ERR(old.bh))
3228 return PTR_ERR(old.bh);
3205 /* 3229 /*
3206 * Check for inode number is _not_ due to possible IO errors. 3230 * Check for inode number is _not_ due to possible IO errors.
3207 * We might rmdir the source, keep it as pwd of some process 3231 * We might rmdir the source, keep it as pwd of some process
@@ -3214,6 +3238,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3214 3238
3215 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3239 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3216 &new.de, &new.inlined); 3240 &new.de, &new.inlined);
3241 if (IS_ERR(new.bh)) {
3242 retval = PTR_ERR(new.bh);
3243 new.bh = NULL;
3244 goto end_rename;
3245 }
3217 if (new.bh) { 3246 if (new.bh) {
3218 if (!new.inode) { 3247 if (!new.inode) {
3219 brelse(new.bh); 3248 brelse(new.bh);
@@ -3246,6 +3275,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3246 if (retval) 3275 if (retval)
3247 goto end_rename; 3276 goto end_rename;
3248 } 3277 }
3278 /*
3279 * If we're renaming a file within an inline_data dir and adding or
3280 * setting the new dirent causes a conversion from inline_data to
3281 * extents/blockmap, we need to force the dirent delete code to
3282 * re-read the directory, or else we end up trying to delete a dirent
3283 * from what is now the extent tree root (or a block map).
3284 */
3285 force_reread = (new.dir->i_ino == old.dir->i_ino &&
3286 ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
3249 if (!new.bh) { 3287 if (!new.bh) {
3250 retval = ext4_add_entry(handle, new.dentry, old.inode); 3288 retval = ext4_add_entry(handle, new.dentry, old.inode);
3251 if (retval) 3289 if (retval)
@@ -3256,6 +3294,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3256 if (retval) 3294 if (retval)
3257 goto end_rename; 3295 goto end_rename;
3258 } 3296 }
3297 if (force_reread)
3298 force_reread = !ext4_test_inode_flag(new.dir,
3299 EXT4_INODE_INLINE_DATA);
3259 3300
3260 /* 3301 /*
3261 * Like most other Unix systems, set the ctime for inodes on a 3302 * Like most other Unix systems, set the ctime for inodes on a
@@ -3267,7 +3308,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
3267 /* 3308 /*
3268 * ok, that's it 3309 * ok, that's it
3269 */ 3310 */
3270 ext4_rename_delete(handle, &old); 3311 ext4_rename_delete(handle, &old, force_reread);
3271 3312
3272 if (new.inode) { 3313 if (new.inode) {
3273 ext4_dec_count(handle, new.inode); 3314 ext4_dec_count(handle, new.inode);
@@ -3330,6 +3371,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3330 3371
3331 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, 3372 old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
3332 &old.de, &old.inlined); 3373 &old.de, &old.inlined);
3374 if (IS_ERR(old.bh))
3375 return PTR_ERR(old.bh);
3333 /* 3376 /*
3334 * Check for inode number is _not_ due to possible IO errors. 3377 * Check for inode number is _not_ due to possible IO errors.
3335 * We might rmdir the source, keep it as pwd of some process 3378 * We might rmdir the source, keep it as pwd of some process
@@ -3342,6 +3385,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
3342 3385
3343 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, 3386 new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
3344 &new.de, &new.inlined); 3387 &new.de, &new.inlined);
3388 if (IS_ERR(new.bh)) {
3389 retval = PTR_ERR(new.bh);
3390 new.bh = NULL;
3391 goto end_rename;
3392 }
3345 3393
3346 /* RENAME_EXCHANGE case: old *and* new must both exist */ 3394 /* RENAME_EXCHANGE case: old *and* new must both exist */
3347 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino) 3395 if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
@@ -3455,7 +3503,6 @@ const struct inode_operations ext4_dir_inode_operations = {
3455 .rmdir = ext4_rmdir, 3503 .rmdir = ext4_rmdir,
3456 .mknod = ext4_mknod, 3504 .mknod = ext4_mknod,
3457 .tmpfile = ext4_tmpfile, 3505 .tmpfile = ext4_tmpfile,
3458 .rename = ext4_rename,
3459 .rename2 = ext4_rename2, 3506 .rename2 = ext4_rename2,
3460 .setattr = ext4_setattr, 3507 .setattr = ext4_setattr,
3461 .setxattr = generic_setxattr, 3508 .setxattr = generic_setxattr,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f03e2e..1e43b905ff98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@ handle_bb:
575 bh = bclean(handle, sb, block); 575 bh = bclean(handle, sb, block);
576 if (IS_ERR(bh)) { 576 if (IS_ERR(bh)) {
577 err = PTR_ERR(bh); 577 err = PTR_ERR(bh);
578 bh = NULL;
578 goto out; 579 goto out;
579 } 580 }
580 overhead = ext4_group_overhead_blocks(sb, group); 581 overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@ handle_ib:
603 bh = bclean(handle, sb, block); 604 bh = bclean(handle, sb, block);
604 if (IS_ERR(bh)) { 605 if (IS_ERR(bh)) {
605 err = PTR_ERR(bh); 606 err = PTR_ERR(bh);
607 bh = NULL;
606 goto out; 608 goto out;
607 } 609 }
608 610
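Both resize hunks apply the same one-line fix: after harvesting PTR_ERR(bh), the local is reset to NULL so the shared exit path's brelse(bh) does not try to release an errno disguised as a pointer. A compilable miniature of the bug and the fix; stub names are invented, and release() mimics brelse(), which ignores NULL:

	#include <stdio.h>
	#include <stdlib.h>

	static void release(void *p)
	{
		if (p)
			free(p);
	}

	static void *bclean_stub(int fail)
	{
		return fail ? (void *)-12L : malloc(16);	/* -ENOMEM or a buffer */
	}

	int main(void)
	{
		void *bh;
		int err = 0;

		bh = bclean_stub(1);
		if ((long)bh < 0) {		/* stands in for IS_ERR(bh) */
			err = (int)(long)bh;	/* stands in for PTR_ERR(bh) */
			bh = NULL;		/* the fix: never free the error code */
			goto out;
		}
	out:
		release(bh);
		printf("err=%d\n", err);
		return 0;
	}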
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6df7bc611dbd..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2142,10 +2142,6 @@ static int ext4_check_descriptors(struct super_block *sb,
2142 } 2142 }
2143 if (NULL != first_not_zeroed) 2143 if (NULL != first_not_zeroed)
2144 *first_not_zeroed = grp; 2144 *first_not_zeroed = grp;
2145
2146 ext4_free_blocks_count_set(sbi->s_es,
2147 EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
2148 sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
2149 return 1; 2145 return 1;
2150} 2146}
2151 2147
@@ -3185,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3185 3181
3186 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3182 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3187 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { 3183 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
3188 /* journal checksum v2 */ 3184 /* journal checksum v3 */
3189 compat = 0; 3185 compat = 0;
3190 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2; 3186 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
3191 } else { 3187 } else {
3192 /* journal checksum v1 */ 3188 /* journal checksum v1 */
3193 compat = JBD2_FEATURE_COMPAT_CHECKSUM; 3189 compat = JBD2_FEATURE_COMPAT_CHECKSUM;
@@ -3209,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
3209 jbd2_journal_clear_features(sbi->s_journal, 3205 jbd2_journal_clear_features(sbi->s_journal,
3210 JBD2_FEATURE_COMPAT_CHECKSUM, 0, 3206 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
3211 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | 3207 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
3208 JBD2_FEATURE_INCOMPAT_CSUM_V3 |
3212 JBD2_FEATURE_INCOMPAT_CSUM_V2); 3209 JBD2_FEATURE_INCOMPAT_CSUM_V2);
3213 } 3210 }
3214 3211
@@ -3883,13 +3880,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3883 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); 3880 ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
3884 goto failed_mount2; 3881 goto failed_mount2;
3885 } 3882 }
3886 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
3887 if (!ext4_fill_flex_info(sb)) {
3888 ext4_msg(sb, KERN_ERR,
3889 "unable to initialize "
3890 "flex_bg meta info!");
3891 goto failed_mount2;
3892 }
3893 3883
3894 sbi->s_gdb_count = db_count; 3884 sbi->s_gdb_count = db_count;
3895 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3885 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
@@ -3902,22 +3892,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3902 /* Register extent status tree shrinker */ 3892 /* Register extent status tree shrinker */
3903 ext4_es_register_shrinker(sbi); 3893 ext4_es_register_shrinker(sbi);
3904 3894
3905 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3895 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
3906 ext4_count_free_clusters(sb));
3907 if (!err) {
3908 err = percpu_counter_init(&sbi->s_freeinodes_counter,
3909 ext4_count_free_inodes(sb));
3910 }
3911 if (!err) {
3912 err = percpu_counter_init(&sbi->s_dirs_counter,
3913 ext4_count_dirs(sb));
3914 }
3915 if (!err) {
3916 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0);
3917 }
3918 if (!err) {
3919 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0);
3920 }
3921 if (err) { 3896 if (err) {
3922 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3897 ext4_msg(sb, KERN_ERR, "insufficient memory");
3923 goto failed_mount3; 3898 goto failed_mount3;
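Two things happen in this region of ext4_fill_super(): percpu_counter_init() has grown a gfp_t argument tree-wide, and the counters that mirror on-disk state move to after journal recovery (see the hunk below removing the post-replay percpu_counter_set() calls), so they are seeded with post-replay values from the start. The new call shape, as a fragment only, not standalone:

	/* Mount context may sleep, so GFP_KERNEL is appropriate here. */
	err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
	if (err)
		goto failed_mount3;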
@@ -4022,18 +3997,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
4022 3997
4023 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; 3998 sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
4024 3999
4025 /*
4026 * The journal may have updated the bg summary counts, so we
4027 * need to update the global counters.
4028 */
4029 percpu_counter_set(&sbi->s_freeclusters_counter,
4030 ext4_count_free_clusters(sb));
4031 percpu_counter_set(&sbi->s_freeinodes_counter,
4032 ext4_count_free_inodes(sb));
4033 percpu_counter_set(&sbi->s_dirs_counter,
4034 ext4_count_dirs(sb));
4035 percpu_counter_set(&sbi->s_dirtyclusters_counter, 0);
4036
4037no_journal: 4000no_journal:
4038 if (ext4_mballoc_ready) { 4001 if (ext4_mballoc_ready) {
4039 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id); 4002 sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
@@ -4141,6 +4104,36 @@ no_journal:
4141 goto failed_mount5; 4104 goto failed_mount5;
4142 } 4105 }
4143 4106
4107 block = ext4_count_free_clusters(sb);
4108 ext4_free_blocks_count_set(sbi->s_es,
4109 EXT4_C2B(sbi, block));
4110 err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4111 GFP_KERNEL);
4112 if (!err) {
4113 unsigned long freei = ext4_count_free_inodes(sb);
4114 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4115 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4116 GFP_KERNEL);
4117 }
4118 if (!err)
4119 err = percpu_counter_init(&sbi->s_dirs_counter,
4120 ext4_count_dirs(sb), GFP_KERNEL);
4121 if (!err)
4122 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4123 GFP_KERNEL);
4124 if (err) {
4125 ext4_msg(sb, KERN_ERR, "insufficient memory");
4126 goto failed_mount6;
4127 }
4128
4129 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
4130 if (!ext4_fill_flex_info(sb)) {
4131 ext4_msg(sb, KERN_ERR,
4132 "unable to initialize "
4133 "flex_bg meta info!");
4134 goto failed_mount6;
4135 }
4136
4144 err = ext4_register_li_request(sb, first_not_zeroed); 4137 err = ext4_register_li_request(sb, first_not_zeroed);
4145 if (err) 4138 if (err)
4146 goto failed_mount6; 4139 goto failed_mount6;
@@ -4215,6 +4208,12 @@ failed_mount7:
4215 ext4_unregister_li_request(sb); 4208 ext4_unregister_li_request(sb);
4216failed_mount6: 4209failed_mount6:
4217 ext4_mb_release(sb); 4210 ext4_mb_release(sb);
4211 if (sbi->s_flex_groups)
4212 ext4_kvfree(sbi->s_flex_groups);
4213 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4214 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4215 percpu_counter_destroy(&sbi->s_dirs_counter);
4216 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4218failed_mount5: 4217failed_mount5:
4219 ext4_ext_release(sb); 4218 ext4_ext_release(sb);
4220 ext4_release_system_zone(sb); 4219 ext4_release_system_zone(sb);
@@ -4233,12 +4232,6 @@ failed_mount_wq:
4233failed_mount3: 4232failed_mount3:
4234 ext4_es_unregister_shrinker(sbi); 4233 ext4_es_unregister_shrinker(sbi);
4235 del_timer_sync(&sbi->s_err_report); 4234 del_timer_sync(&sbi->s_err_report);
4236 if (sbi->s_flex_groups)
4237 ext4_kvfree(sbi->s_flex_groups);
4238 percpu_counter_destroy(&sbi->s_freeclusters_counter);
4239 percpu_counter_destroy(&sbi->s_freeinodes_counter);
4240 percpu_counter_destroy(&sbi->s_dirs_counter);
4241 percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
4242 percpu_counter_destroy(&sbi->s_extent_cache_cnt); 4235 percpu_counter_destroy(&sbi->s_extent_cache_cnt);
4243 if (sbi->s_mmp_tsk) 4236 if (sbi->s_mmp_tsk)
4244 kthread_stop(sbi->s_mmp_tsk); 4237 kthread_stop(sbi->s_mmp_tsk);
@@ -4556,11 +4549,13 @@ static int ext4_commit_super(struct super_block *sb, int sync)
4556 else 4549 else
4557 es->s_kbytes_written = 4550 es->s_kbytes_written =
4558 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); 4551 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
4559 ext4_free_blocks_count_set(es, 4552 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
4553 ext4_free_blocks_count_set(es,
4560 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( 4554 EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
4561 &EXT4_SB(sb)->s_freeclusters_counter))); 4555 &EXT4_SB(sb)->s_freeclusters_counter)));
4562 es->s_free_inodes_count = 4556 if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter))
4563 cpu_to_le32(percpu_counter_sum_positive( 4557 es->s_free_inodes_count =
4558 cpu_to_le32(percpu_counter_sum_positive(
4564 &EXT4_SB(sb)->s_freeinodes_counter)); 4559 &EXT4_SB(sb)->s_freeinodes_counter));
4565 BUFFER_TRACE(sbh, "marking dirty"); 4560 BUFFER_TRACE(sbh, "marking dirty");
4566 ext4_superblock_csum_set(sb); 4561 ext4_superblock_csum_set(sb);
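Because those counters are now created late in mount, ext4_commit_super() can legitimately run before they exist, since early failure paths still write the superblock; each use is therefore gated on percpu_counter_initialized(). The guarded-read shape, fragment only:

	/* Skip counters that have not been created yet; an early mount
	 * failure can reach this writeback path before they exist. */
	if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter))
		free = percpu_counter_sum_positive(
			&EXT4_SB(sb)->s_freeclusters_counter);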
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 214fe1054fce..736a348509f7 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -23,7 +23,7 @@ config F2FS_STAT_FS
23 mounted as f2fs. Each file shows the whole f2fs information. 23 mounted as f2fs. Each file shows the whole f2fs information.
24 24
25 /sys/kernel/debug/f2fs/status includes: 25 /sys/kernel/debug/f2fs/status includes:
26 - major file system information managed by f2fs currently 26 - major filesystem information managed by f2fs currently
27 - average SIT information about whole segments 27 - average SIT information about whole segments
28 - current memory footprint consumed by f2fs. 28 - current memory footprint consumed by f2fs.
29 29
@@ -68,6 +68,6 @@ config F2FS_CHECK_FS
68 bool "F2FS consistency checking feature" 68 bool "F2FS consistency checking feature"
69 depends on F2FS_FS 69 depends on F2FS_FS
70 help 70 help
71 Enables BUG_ONs which check the file system consistency at runtime. 71 Enables BUG_ONs which check the filesystem consistency at runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index dbe2141d10ad..83b9b5a8d112 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -203,12 +203,6 @@ static int __f2fs_set_acl(struct inode *inode, int type,
203 size_t size = 0; 203 size_t size = 0;
204 int error; 204 int error;
205 205
206 if (acl) {
207 error = posix_acl_valid(acl);
208 if (error < 0)
209 return error;
210 }
211
212 switch (type) { 206 switch (type) {
213 case ACL_TYPE_ACCESS: 207 case ACL_TYPE_ACCESS:
214 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; 208 name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 0b4710c1d370..dd10a031c052 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -22,7 +22,7 @@
22#include "segment.h" 22#include "segment.h"
23#include <trace/events/f2fs.h> 23#include <trace/events/f2fs.h>
24 24
25static struct kmem_cache *orphan_entry_slab; 25static struct kmem_cache *ino_entry_slab;
26static struct kmem_cache *inode_entry_slab; 26static struct kmem_cache *inode_entry_slab;
27 27
28/* 28/*
@@ -72,7 +72,22 @@ out:
72 return page; 72 return page;
73} 73}
74 74
75static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) 75struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index)
76{
77 bool readahead = false;
78 struct page *page;
79
80 page = find_get_page(META_MAPPING(sbi), index);
81 if (!page || !PageUptodate(page))
82 readahead = true;
83 f2fs_put_page(page, 0);
84
85 if (readahead)
86 ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
87 return get_meta_page(sbi, index);
88}
89
90static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
76{ 91{
77 switch (type) { 92 switch (type) {
78 case META_NAT: 93 case META_NAT:
@@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
82 case META_SSA: 97 case META_SSA:
83 case META_CP: 98 case META_CP:
84 return 0; 99 return 0;
100 case META_POR:
101 return MAX_BLKADDR(sbi);
85 default: 102 default:
86 BUG(); 103 BUG();
87 } 104 }
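get_meta_page_ra() peeks at the page cache and only fires a readahead burst when the wanted meta page is absent or stale, then falls through to the normal blocking get_meta_page(). A runnable userspace model of that peek-then-prefetch shape; every function below is a stand-in, not the f2fs API:

	#include <stdbool.h>
	#include <stdio.h>

	static bool cached_and_uptodate(long index)
	{
		return index % 2 == 0;		/* pretend even pages are cached */
	}

	static void readahead(long index, int nr)
	{
		printf("readahead %d pages from %ld\n", nr, index);
	}

	static void get_page(long index)
	{
		printf("blocking read of page %ld\n", index);
	}

	static void get_meta_page_ra(long index)
	{
		if (!cached_and_uptodate(index))
			readahead(index, 16);
		get_page(index);
	}

	int main(void)
	{
		get_meta_page_ra(3);	/* miss: prefetch, then read */
		get_meta_page_ra(4);	/* hit: straight to the read */
		return 0;
	}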
@@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
90/* 107/*
91 * Readahead CP/NAT/SIT/SSA pages 108 * Readahead CP/NAT/SIT/SSA pages
92 */ 109 */
93int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) 110int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
94{ 111{
95 block_t prev_blk_addr = 0; 112 block_t prev_blk_addr = 0;
96 struct page *page; 113 struct page *page;
97 int blkno = start; 114 block_t blkno = start;
98 int max_blks = get_max_meta_blks(sbi, type); 115 block_t max_blks = get_max_meta_blks(sbi, type);
99 116
100 struct f2fs_io_info fio = { 117 struct f2fs_io_info fio = {
101 .type = META, 118 .type = META,
@@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
125 break; 142 break;
126 case META_SSA: 143 case META_SSA:
127 case META_CP: 144 case META_CP:
128 /* get ssa/cp block addr */ 145 case META_POR:
146 if (unlikely(blkno >= max_blks))
147 goto out;
148 if (unlikely(blkno < SEG0_BLKADDR(sbi)))
149 goto out;
129 blk_addr = blkno; 150 blk_addr = blkno;
130 break; 151 break;
131 default: 152 default:
@@ -151,8 +172,7 @@ out:
151static int f2fs_write_meta_page(struct page *page, 172static int f2fs_write_meta_page(struct page *page,
152 struct writeback_control *wbc) 173 struct writeback_control *wbc)
153{ 174{
154 struct inode *inode = page->mapping->host; 175 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
155 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
156 176
157 trace_f2fs_writepage(page, META); 177 trace_f2fs_writepage(page, META);
158 178
@@ -160,14 +180,11 @@ static int f2fs_write_meta_page(struct page *page,
160 goto redirty_out; 180 goto redirty_out;
161 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
162 goto redirty_out; 182 goto redirty_out;
163 183 if (unlikely(f2fs_cp_error(sbi)))
164 /* Should not write any meta pages if any IO error occurred */ 184 goto redirty_out;
165 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG)))
166 goto no_write;
167 185
168 f2fs_wait_on_page_writeback(page, META); 186 f2fs_wait_on_page_writeback(page, META);
169 write_meta_page(sbi, page); 187 write_meta_page(sbi, page);
170no_write:
171 dec_page_count(sbi, F2FS_DIRTY_META); 188 dec_page_count(sbi, F2FS_DIRTY_META);
172 unlock_page(page); 189 unlock_page(page);
173 return 0; 190 return 0;
@@ -180,7 +197,7 @@ redirty_out:
180static int f2fs_write_meta_pages(struct address_space *mapping, 197static int f2fs_write_meta_pages(struct address_space *mapping,
181 struct writeback_control *wbc) 198 struct writeback_control *wbc)
182{ 199{
183 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 200 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
184 long diff, written; 201 long diff, written;
185 202
186 trace_f2fs_writepages(mapping->host, wbc, META); 203 trace_f2fs_writepages(mapping->host, wbc, META);
@@ -262,15 +279,12 @@ continue_unlock:
262 279
263static int f2fs_set_meta_page_dirty(struct page *page) 280static int f2fs_set_meta_page_dirty(struct page *page)
264{ 281{
265 struct address_space *mapping = page->mapping;
266 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
267
268 trace_f2fs_set_page_dirty(page, META); 282 trace_f2fs_set_page_dirty(page, META);
269 283
270 SetPageUptodate(page); 284 SetPageUptodate(page);
271 if (!PageDirty(page)) { 285 if (!PageDirty(page)) {
272 __set_page_dirty_nobuffers(page); 286 __set_page_dirty_nobuffers(page);
273 inc_page_count(sbi, F2FS_DIRTY_META); 287 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
274 return 1; 288 return 1;
275 } 289 }
276 return 0; 290 return 0;
@@ -282,78 +296,126 @@ const struct address_space_operations f2fs_meta_aops = {
282 .set_page_dirty = f2fs_set_meta_page_dirty, 296 .set_page_dirty = f2fs_set_meta_page_dirty,
283}; 297};
284 298
299static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
300{
301 struct ino_entry *e;
302retry:
303 spin_lock(&sbi->ino_lock[type]);
304
305 e = radix_tree_lookup(&sbi->ino_root[type], ino);
306 if (!e) {
307 e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
308 if (!e) {
309 spin_unlock(&sbi->ino_lock[type]);
310 goto retry;
311 }
312 if (radix_tree_insert(&sbi->ino_root[type], ino, e)) {
313 spin_unlock(&sbi->ino_lock[type]);
314 kmem_cache_free(ino_entry_slab, e);
315 goto retry;
316 }
317 memset(e, 0, sizeof(struct ino_entry));
318 e->ino = ino;
319
320 list_add_tail(&e->list, &sbi->ino_list[type]);
321 }
322 spin_unlock(&sbi->ino_lock[type]);
323}
324
325static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
326{
327 struct ino_entry *e;
328
329 spin_lock(&sbi->ino_lock[type]);
330 e = radix_tree_lookup(&sbi->ino_root[type], ino);
331 if (e) {
332 list_del(&e->list);
333 radix_tree_delete(&sbi->ino_root[type], ino);
334 if (type == ORPHAN_INO)
335 sbi->n_orphans--;
336 spin_unlock(&sbi->ino_lock[type]);
337 kmem_cache_free(ino_entry_slab, e);
338 return;
339 }
340 spin_unlock(&sbi->ino_lock[type]);
341}
342
343void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
344{
345 /* add new dirty ino entry into list */
346 __add_ino_entry(sbi, ino, type);
347}
348
349void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
350{
351 /* remove dirty ino entry from list */
352 __remove_ino_entry(sbi, ino, type);
353}
354
355/* mode should be APPEND_INO or UPDATE_INO */
356bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
357{
358 struct ino_entry *e;
359 spin_lock(&sbi->ino_lock[mode]);
360 e = radix_tree_lookup(&sbi->ino_root[mode], ino);
361 spin_unlock(&sbi->ino_lock[mode]);
362 return e ? true : false;
363}
364
365void release_dirty_inode(struct f2fs_sb_info *sbi)
366{
367 struct ino_entry *e, *tmp;
368 int i;
369
370 for (i = APPEND_INO; i <= UPDATE_INO; i++) {
371 spin_lock(&sbi->ino_lock[i]);
372 list_for_each_entry_safe(e, tmp, &sbi->ino_list[i], list) {
373 list_del(&e->list);
374 radix_tree_delete(&sbi->ino_root[i], e->ino);
375 kmem_cache_free(ino_entry_slab, e);
376 }
377 spin_unlock(&sbi->ino_lock[i]);
378 }
379}
380
285int acquire_orphan_inode(struct f2fs_sb_info *sbi) 381int acquire_orphan_inode(struct f2fs_sb_info *sbi)
286{ 382{
287 int err = 0; 383 int err = 0;
288 384
289 spin_lock(&sbi->orphan_inode_lock); 385 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
290 if (unlikely(sbi->n_orphans >= sbi->max_orphans)) 386 if (unlikely(sbi->n_orphans >= sbi->max_orphans))
291 err = -ENOSPC; 387 err = -ENOSPC;
292 else 388 else
293 sbi->n_orphans++; 389 sbi->n_orphans++;
294 spin_unlock(&sbi->orphan_inode_lock); 390 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
295 391
296 return err; 392 return err;
297} 393}
298 394
299void release_orphan_inode(struct f2fs_sb_info *sbi) 395void release_orphan_inode(struct f2fs_sb_info *sbi)
300{ 396{
301 spin_lock(&sbi->orphan_inode_lock); 397 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
302 f2fs_bug_on(sbi->n_orphans == 0); 398 f2fs_bug_on(sbi, sbi->n_orphans == 0);
303 sbi->n_orphans--; 399 sbi->n_orphans--;
304 spin_unlock(&sbi->orphan_inode_lock); 400 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
305} 401}
306 402
307void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 403void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
308{ 404{
309 struct list_head *head; 405 /* add new orphan ino entry into list */
310 struct orphan_inode_entry *new, *orphan; 406 __add_ino_entry(sbi, ino, ORPHAN_INO);
311
312 new = f2fs_kmem_cache_alloc(orphan_entry_slab, GFP_ATOMIC);
313 new->ino = ino;
314
315 spin_lock(&sbi->orphan_inode_lock);
316 head = &sbi->orphan_inode_list;
317 list_for_each_entry(orphan, head, list) {
318 if (orphan->ino == ino) {
319 spin_unlock(&sbi->orphan_inode_lock);
320 kmem_cache_free(orphan_entry_slab, new);
321 return;
322 }
323
324 if (orphan->ino > ino)
325 break;
326 }
327
328 /* add new orphan entry into list which is sorted by inode number */
329 list_add_tail(&new->list, &orphan->list);
330 spin_unlock(&sbi->orphan_inode_lock);
331} 407}
332 408
333void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 409void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
334{ 410{
335 struct list_head *head; 411 /* remove orphan entry from orphan list */
336 struct orphan_inode_entry *orphan; 412 __remove_ino_entry(sbi, ino, ORPHAN_INO);
337
338 spin_lock(&sbi->orphan_inode_lock);
339 head = &sbi->orphan_inode_list;
340 list_for_each_entry(orphan, head, list) {
341 if (orphan->ino == ino) {
342 list_del(&orphan->list);
343 f2fs_bug_on(sbi->n_orphans == 0);
344 sbi->n_orphans--;
345 spin_unlock(&sbi->orphan_inode_lock);
346 kmem_cache_free(orphan_entry_slab, orphan);
347 return;
348 }
349 }
350 spin_unlock(&sbi->orphan_inode_lock);
351} 413}
352 414
353static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) 415static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
354{ 416{
355 struct inode *inode = f2fs_iget(sbi->sb, ino); 417 struct inode *inode = f2fs_iget(sbi->sb, ino);
356 f2fs_bug_on(IS_ERR(inode)); 418 f2fs_bug_on(sbi, IS_ERR(inode));
357 clear_nlink(inode); 419 clear_nlink(inode);
358 420
359 /* truncate all the data during iput */ 421 /* truncate all the data during iput */
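The orphan-list rework above generalizes to per-type ino sets: a radix tree keyed by inode number makes duplicate checks and removals a single lookup instead of a sorted-list walk, while the companion list preserves iteration order for writing orphan blocks at checkpoint time. The insertion discipline, condensed from __add_ino_entry(); fragment only, retrying because both the GFP_ATOMIC allocation and a racing insert can fail under the spinlock:

	retry:
		spin_lock(&sbi->ino_lock[type]);
		e = radix_tree_lookup(&sbi->ino_root[type], ino);
		if (!e) {
			e = kmem_cache_alloc(ino_entry_slab, GFP_ATOMIC);
			if (!e || radix_tree_insert(&sbi->ino_root[type], ino, e)) {
				spin_unlock(&sbi->ino_lock[type]);
				if (e)
					kmem_cache_free(ino_entry_slab, e);
				goto retry;	/* retry rather than sleep under the lock */
			}
			e->ino = ino;
			list_add_tail(&e->list, &sbi->ino_list[type]);
		}
		spin_unlock(&sbi->ino_lock[type]);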
@@ -398,23 +460,23 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
398 struct f2fs_orphan_block *orphan_blk = NULL; 460 struct f2fs_orphan_block *orphan_blk = NULL;
399 unsigned int nentries = 0; 461 unsigned int nentries = 0;
400 unsigned short index; 462 unsigned short index;
401 unsigned short orphan_blocks = (unsigned short)((sbi->n_orphans + 463 unsigned short orphan_blocks =
402 (F2FS_ORPHANS_PER_BLOCK - 1)) / F2FS_ORPHANS_PER_BLOCK); 464 (unsigned short)GET_ORPHAN_BLOCKS(sbi->n_orphans);
403 struct page *page = NULL; 465 struct page *page = NULL;
404 struct orphan_inode_entry *orphan = NULL; 466 struct ino_entry *orphan = NULL;
405 467
406 for (index = 0; index < orphan_blocks; index++) 468 for (index = 0; index < orphan_blocks; index++)
407 grab_meta_page(sbi, start_blk + index); 469 grab_meta_page(sbi, start_blk + index);
408 470
409 index = 1; 471 index = 1;
410 spin_lock(&sbi->orphan_inode_lock); 472 spin_lock(&sbi->ino_lock[ORPHAN_INO]);
411 head = &sbi->orphan_inode_list; 473 head = &sbi->ino_list[ORPHAN_INO];
412 474
413 /* loop for each orphan inode entry and write them in the journal block */ 475 list_for_each_entry(orphan, head, list) {
414 list_for_each_entry(orphan, head, list) { 476 list_for_each_entry(orphan, head, list) {
415 if (!page) { 477 if (!page) {
416 page = find_get_page(META_MAPPING(sbi), start_blk++); 478 page = find_get_page(META_MAPPING(sbi), start_blk++);
417 f2fs_bug_on(!page); 479 f2fs_bug_on(sbi, !page);
418 orphan_blk = 480 orphan_blk =
419 (struct f2fs_orphan_block *)page_address(page); 481 (struct f2fs_orphan_block *)page_address(page);
420 memset(orphan_blk, 0, sizeof(*orphan_blk)); 482 memset(orphan_blk, 0, sizeof(*orphan_blk));
@@ -448,7 +510,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk)
448 f2fs_put_page(page, 1); 510 f2fs_put_page(page, 1);
449 } 511 }
450 512
451 spin_unlock(&sbi->orphan_inode_lock); 513 spin_unlock(&sbi->ino_lock[ORPHAN_INO]);
452} 514}
453 515
454static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, 516static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
@@ -574,7 +636,7 @@ fail_no_cp:
574 636
575static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) 637static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
576{ 638{
577 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
578 640
579 if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) 641 if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
580 return -EEXIST; 642 return -EEXIST;
@@ -586,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
586 return 0; 648 return 0;
587} 649}
588 650
589void set_dirty_dir_page(struct inode *inode, struct page *page) 651void update_dirty_page(struct inode *inode, struct page *page)
590{ 652{
591 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 653 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
592 struct dir_inode_entry *new; 654 struct dir_inode_entry *new;
593 int ret = 0; 655 int ret = 0;
594 656
595 if (!S_ISDIR(inode->i_mode)) 657 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
596 return; 658 return;
597 659
660 if (!S_ISDIR(inode->i_mode)) {
661 inode_inc_dirty_pages(inode);
662 goto out;
663 }
664
598 new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 665 new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
599 new->inode = inode; 666 new->inode = inode;
600 INIT_LIST_HEAD(&new->list); 667 INIT_LIST_HEAD(&new->list);
601 668
602 spin_lock(&sbi->dir_inode_lock); 669 spin_lock(&sbi->dir_inode_lock);
603 ret = __add_dirty_inode(inode, new); 670 ret = __add_dirty_inode(inode, new);
604 inode_inc_dirty_dents(inode); 671 inode_inc_dirty_pages(inode);
605 SetPagePrivate(page);
606 spin_unlock(&sbi->dir_inode_lock); 672 spin_unlock(&sbi->dir_inode_lock);
607 673
608 if (ret) 674 if (ret)
609 kmem_cache_free(inode_entry_slab, new); 675 kmem_cache_free(inode_entry_slab, new);
676out:
677 SetPagePrivate(page);
610} 678}
611 679
612void add_dirty_dir_inode(struct inode *inode) 680void add_dirty_dir_inode(struct inode *inode)
613{ 681{
614 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 682 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
615 struct dir_inode_entry *new = 683 struct dir_inode_entry *new =
616 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); 684 f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
617 int ret = 0; 685 int ret = 0;
@@ -629,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode)
629 697
630void remove_dirty_dir_inode(struct inode *inode) 698void remove_dirty_dir_inode(struct inode *inode)
631{ 699{
632 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 700 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
633 struct dir_inode_entry *entry; 701 struct dir_inode_entry *entry;
634 702
635 if (!S_ISDIR(inode->i_mode)) 703 if (!S_ISDIR(inode->i_mode))
636 return; 704 return;
637 705
638 spin_lock(&sbi->dir_inode_lock); 706 spin_lock(&sbi->dir_inode_lock);
639 if (get_dirty_dents(inode) || 707 if (get_dirty_pages(inode) ||
640 !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { 708 !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
641 spin_unlock(&sbi->dir_inode_lock); 709 spin_unlock(&sbi->dir_inode_lock);
642 return; 710 return;
@@ -689,7 +757,7 @@ retry:
689/* 757/*
690 * Freeze all the FS-operations for checkpoint. 758 * Freeze all the FS-operations for checkpoint.
691 */ 759 */
692static void block_operations(struct f2fs_sb_info *sbi) 760static int block_operations(struct f2fs_sb_info *sbi)
693{ 761{
694 struct writeback_control wbc = { 762 struct writeback_control wbc = {
695 .sync_mode = WB_SYNC_ALL, 763 .sync_mode = WB_SYNC_ALL,
@@ -697,6 +765,7 @@ static void block_operations(struct f2fs_sb_info *sbi)
697 .for_reclaim = 0, 765 .for_reclaim = 0,
698 }; 766 };
699 struct blk_plug plug; 767 struct blk_plug plug;
768 int err = 0;
700 769
701 blk_start_plug(&plug); 770 blk_start_plug(&plug);
702 771
@@ -706,27 +775,38 @@ retry_flush_dents:
706 if (get_pages(sbi, F2FS_DIRTY_DENTS)) { 775 if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
707 f2fs_unlock_all(sbi); 776 f2fs_unlock_all(sbi);
708 sync_dirty_dir_inodes(sbi); 777 sync_dirty_dir_inodes(sbi);
778 if (unlikely(f2fs_cp_error(sbi))) {
779 err = -EIO;
780 goto out;
781 }
709 goto retry_flush_dents; 782 goto retry_flush_dents;
710 } 783 }
711 784
712 /* 785 /*
713 * POR: we should ensure that there is no dirty node pages 786 * POR: we should ensure that there are no dirty node pages
714 * until finishing nat/sit flush. 787 * until finishing nat/sit flush.
715 */ 788 */
716retry_flush_nodes: 789retry_flush_nodes:
717 mutex_lock(&sbi->node_write); 790 down_write(&sbi->node_write);
718 791
719 if (get_pages(sbi, F2FS_DIRTY_NODES)) { 792 if (get_pages(sbi, F2FS_DIRTY_NODES)) {
720 mutex_unlock(&sbi->node_write); 793 up_write(&sbi->node_write);
721 sync_node_pages(sbi, 0, &wbc); 794 sync_node_pages(sbi, 0, &wbc);
795 if (unlikely(f2fs_cp_error(sbi))) {
796 f2fs_unlock_all(sbi);
797 err = -EIO;
798 goto out;
799 }
722 goto retry_flush_nodes; 800 goto retry_flush_nodes;
723 } 801 }
802out:
724 blk_finish_plug(&plug); 803 blk_finish_plug(&plug);
804 return err;
725} 805}
726 806
727static void unblock_operations(struct f2fs_sb_info *sbi) 807static void unblock_operations(struct f2fs_sb_info *sbi)
728{ 808{
729 mutex_unlock(&sbi->node_write); 809 up_write(&sbi->node_write);
730 f2fs_unlock_all(sbi); 810 f2fs_unlock_all(sbi);
731} 811}
732 812
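block_operations() used to loop until the dirty dentry and node pages drained; it now returns int so those flush loops can bail out with -EIO once f2fs_cp_error() reports a dead checkpoint, instead of spinning on pages that can no longer reach the disk. A compilable toy model of that bail-out; all names and values are stand-ins:

	#include <stdio.h>

	static int dirty_pages = 3;
	static int writes_before_error = 2;

	static int flush_once(void)
	{
		if (dirty_pages > 0)
			dirty_pages--;
		return --writes_before_error < 0;	/* nonzero: fs hit an error */
	}

	static int block_operations(void)
	{
		while (dirty_pages > 0) {
			if (flush_once())
				return -5;		/* stands in for -EIO */
		}
		return 0;
	}

	int main(void)
	{
		printf("block_operations() = %d\n", block_operations());
		return 0;
	}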
@@ -745,10 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
745 finish_wait(&sbi->cp_wait, &wait); 825 finish_wait(&sbi->cp_wait, &wait);
746} 826}
747 827
748static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 828static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
749{ 829{
750 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 830 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
751 nid_t last_nid = 0; 831 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
832 struct f2fs_nm_info *nm_i = NM_I(sbi);
833 nid_t last_nid = nm_i->next_scan_nid;
752 block_t start_blk; 834 block_t start_blk;
753 struct page *cp_page; 835 struct page *cp_page;
754 unsigned int data_sum_blocks, orphan_blocks; 836 unsigned int data_sum_blocks, orphan_blocks;
@@ -761,11 +843,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
761 * This avoids to conduct wrong roll-forward operations and uses 843 * This avoids to conduct wrong roll-forward operations and uses
762 * metapages, so should be called prior to sync_meta_pages below. 844 * metapages, so should be called prior to sync_meta_pages below.
763 */ 845 */
764 discard_next_dnode(sbi); 846 discard_next_dnode(sbi, NEXT_FREE_BLKADDR(sbi, curseg));
765 847
766 /* Flush all the NAT/SIT pages */ 848 /* Flush all the NAT/SIT pages */
767 while (get_pages(sbi, F2FS_DIRTY_META)) 849 while (get_pages(sbi, F2FS_DIRTY_META)) {
768 sync_meta_pages(sbi, META, LONG_MAX); 850 sync_meta_pages(sbi, META, LONG_MAX);
851 if (unlikely(f2fs_cp_error(sbi)))
852 return;
853 }
769 854
770 next_free_nid(sbi, &last_nid); 855 next_free_nid(sbi, &last_nid);
771 856
@@ -776,7 +861,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
776 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); 861 ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi));
777 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); 862 ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi));
778 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); 863 ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
779 for (i = 0; i < 3; i++) { 864 for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
780 ckpt->cur_node_segno[i] = 865 ckpt->cur_node_segno[i] =
781 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE)); 866 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_NODE));
782 ckpt->cur_node_blkoff[i] = 867 ckpt->cur_node_blkoff[i] =
@@ -784,7 +869,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
784 ckpt->alloc_type[i + CURSEG_HOT_NODE] = 869 ckpt->alloc_type[i + CURSEG_HOT_NODE] =
785 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE); 870 curseg_alloc_type(sbi, i + CURSEG_HOT_NODE);
786 } 871 }
787 for (i = 0; i < 3; i++) { 872 for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) {
788 ckpt->cur_data_segno[i] = 873 ckpt->cur_data_segno[i] =
789 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA)); 874 cpu_to_le32(curseg_segno(sbi, i + CURSEG_HOT_DATA));
790 ckpt->cur_data_blkoff[i] = 875 ckpt->cur_data_blkoff[i] =
@@ -799,24 +884,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
799 884
800 /* 2 cp + n data seg summary + orphan inode blocks */ 885 /* 2 cp + n data seg summary + orphan inode blocks */
801 data_sum_blocks = npages_for_summary_flush(sbi); 886 data_sum_blocks = npages_for_summary_flush(sbi);
802 if (data_sum_blocks < 3) 887 if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
803 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 888 set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
804 else 889 else
805 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); 890 clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
806 891
807 orphan_blocks = (sbi->n_orphans + F2FS_ORPHANS_PER_BLOCK - 1) 892 orphan_blocks = GET_ORPHAN_BLOCKS(sbi->n_orphans);
808 / F2FS_ORPHANS_PER_BLOCK;
809 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + 893 ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
810 orphan_blocks); 894 orphan_blocks);
811 895
812 if (is_umount) { 896 if (cpc->reason == CP_UMOUNT) {
813 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); 897 set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
814 ckpt->cp_pack_total_block_count = cpu_to_le32(2 + 898 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
815 cp_payload_blks + data_sum_blocks + 899 cp_payload_blks + data_sum_blocks +
816 orphan_blocks + NR_CURSEG_NODE_TYPE); 900 orphan_blocks + NR_CURSEG_NODE_TYPE);
817 } else { 901 } else {
818 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); 902 clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
819 ckpt->cp_pack_total_block_count = cpu_to_le32(2 + 903 ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
820 cp_payload_blks + data_sum_blocks + 904 cp_payload_blks + data_sum_blocks +
821 orphan_blocks); 905 orphan_blocks);
822 } 906 }
@@ -826,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
826 else 910 else
827 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); 911 clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
828 912
913 if (sbi->need_fsck)
914 set_ckpt_flags(ckpt, CP_FSCK_FLAG);
915
829 /* update SIT/NAT bitmap */ 916 /* update SIT/NAT bitmap */
830 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); 917 get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP));
831 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); 918 get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP));
@@ -860,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
860 947
861 write_data_summaries(sbi, start_blk); 948 write_data_summaries(sbi, start_blk);
862 start_blk += data_sum_blocks; 949 start_blk += data_sum_blocks;
863 if (is_umount) { 950 if (cpc->reason == CP_UMOUNT) {
864 write_node_summaries(sbi, start_blk); 951 write_node_summaries(sbi, start_blk);
865 start_blk += NR_CURSEG_NODE_TYPE; 952 start_blk += NR_CURSEG_NODE_TYPE;
866 } 953 }
@@ -875,6 +962,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
875 /* wait for previously submitted node/meta pages writeback */ 962 /* wait for previously submitted node/meta pages writeback */
876 wait_on_all_pages_writeback(sbi); 963 wait_on_all_pages_writeback(sbi);
877 964
965 if (unlikely(f2fs_cp_error(sbi)))
966 return;
967
878 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); 968 filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
879 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); 969 filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
880 970
@@ -885,26 +975,35 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
885 /* Here, we have only one bio containing the CP pack */ 975 /* Here, we have only one bio containing the CP pack */
886 sync_meta_pages(sbi, META_FLUSH, LONG_MAX); 976 sync_meta_pages(sbi, META_FLUSH, LONG_MAX);
887 977
888 if (unlikely(!is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { 978 release_dirty_inode(sbi);
889 clear_prefree_segments(sbi); 979
890 F2FS_RESET_SB_DIRT(sbi); 980 if (unlikely(f2fs_cp_error(sbi)))
891 } 981 return;
982
983 clear_prefree_segments(sbi);
984 F2FS_RESET_SB_DIRT(sbi);
892} 985}
893 986
894/* 987/*
895 * We guarantee that this checkpoint procedure should not fail. 988 * We guarantee that this checkpoint procedure will not fail.
896 */ 989 */
897void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) 990void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
898{ 991{
899 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); 992 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
900 unsigned long long ckpt_ver; 993 unsigned long long ckpt_ver;
901 994
902 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); 995 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
903 996
904 mutex_lock(&sbi->cp_mutex); 997 mutex_lock(&sbi->cp_mutex);
905 block_operations(sbi);
906 998
907 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); 999 if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
1000 goto out;
1001 if (unlikely(f2fs_cp_error(sbi)))
1002 goto out;
1003 if (block_operations(sbi))
1004 goto out;
1005
1006 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
908 1007
909 f2fs_submit_merged_bio(sbi, DATA, WRITE); 1008 f2fs_submit_merged_bio(sbi, DATA, WRITE);
910 f2fs_submit_merged_bio(sbi, NODE, WRITE); 1009 f2fs_submit_merged_bio(sbi, NODE, WRITE);
@@ -920,43 +1019,49 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
920 1019
921 /* write cached NAT/SIT entries to NAT/SIT area */ 1020 /* write cached NAT/SIT entries to NAT/SIT area */
922 flush_nat_entries(sbi); 1021 flush_nat_entries(sbi);
923 flush_sit_entries(sbi); 1022 flush_sit_entries(sbi, cpc);
924 1023
925 /* unlock all the fs_lock[] in do_checkpoint() */ 1024 /* unlock all the fs_lock[] in do_checkpoint() */
926 do_checkpoint(sbi, is_umount); 1025 do_checkpoint(sbi, cpc);
927 1026
928 unblock_operations(sbi); 1027 unblock_operations(sbi);
929 mutex_unlock(&sbi->cp_mutex);
930
931 stat_inc_cp_count(sbi->stat_info); 1028 stat_inc_cp_count(sbi->stat_info);
932 trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); 1029out:
1030 mutex_unlock(&sbi->cp_mutex);
1031 trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
933} 1032}
934 1033
935void init_orphan_info(struct f2fs_sb_info *sbi) 1034void init_ino_entry_info(struct f2fs_sb_info *sbi)
936{ 1035{
937 spin_lock_init(&sbi->orphan_inode_lock); 1036 int i;
938 INIT_LIST_HEAD(&sbi->orphan_inode_list); 1037
939 sbi->n_orphans = 0; 1038 for (i = 0; i < MAX_INO_ENTRY; i++) {
1039 INIT_RADIX_TREE(&sbi->ino_root[i], GFP_ATOMIC);
1040 spin_lock_init(&sbi->ino_lock[i]);
1041 INIT_LIST_HEAD(&sbi->ino_list[i]);
1042 }
1043
940 /* 1044 /*
941 * considering 512 blocks in a segment, 8 blocks are needed for cp 1045 * considering 512 blocks in a segment, 8 blocks are needed for cp
942 * and log segment summaries. Remaining blocks are used to keep 1046 * and log segment summaries. Remaining blocks are used to keep
943 * orphan entries, with the limitation of one reserved segment 1047 * orphan entries, with the limitation of one reserved segment
944 * for the cp pack; we can have at most 1020*504 orphan entries 1048 * for the cp pack; we can have at most 1020*504 orphan entries
945 */ 1049 */
946 sbi->max_orphans = (sbi->blocks_per_seg - 2 - NR_CURSEG_TYPE) 1050 sbi->n_orphans = 0;
947 * F2FS_ORPHANS_PER_BLOCK; 1051 sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
1052 NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK;
948} 1053}
949 1054
950int __init create_checkpoint_caches(void) 1055int __init create_checkpoint_caches(void)
951{ 1056{
952 orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", 1057 ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry",
953 sizeof(struct orphan_inode_entry)); 1058 sizeof(struct ino_entry));
954 if (!orphan_entry_slab) 1059 if (!ino_entry_slab)
955 return -ENOMEM; 1060 return -ENOMEM;
956 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry", 1061 inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
957 sizeof(struct dir_inode_entry)); 1062 sizeof(struct dir_inode_entry));
958 if (!inode_entry_slab) { 1063 if (!inode_entry_slab) {
959 kmem_cache_destroy(orphan_entry_slab); 1064 kmem_cache_destroy(ino_entry_slab);
960 return -ENOMEM; 1065 return -ENOMEM;
961 } 1066 }
962 return 0; 1067 return 0;
@@ -964,6 +1069,6 @@ int __init create_checkpoint_caches(void)
964 1069
965void destroy_checkpoint_caches(void) 1070void destroy_checkpoint_caches(void)
966{ 1071{
967 kmem_cache_destroy(orphan_entry_slab); 1072 kmem_cache_destroy(ino_entry_slab);
968 kmem_cache_destroy(inode_entry_slab); 1073 kmem_cache_destroy(inode_entry_slab);
969} 1074}
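Two themes run through the checkpoint.c hunks above: write_checkpoint() now takes a cp_control describing why the checkpoint was requested instead of a bare is_umount flag, and each blocking phase re-checks f2fs_cp_error() so a dead checkpoint unwinds instead of looping forever. A caller-side sketch of the new interface, using the cp_control fields introduced in the f2fs.h hunk further below; the call site itself is illustrative, not one from this series, and the tie between CP_DISCARD and the trim_* fields is an inference from the struct layout:

	/* was: write_checkpoint(sbi, true); -- the bool carried one bit of intent */
	struct cp_control cpc = {
		.reason = CP_UMOUNT,	/* or CP_SYNC; CP_DISCARD presumably pairs with trim_* */
	};

	write_checkpoint(sbi, &cpc);

Packing the reason into a struct lets future callers add parameters (such as a trim range) without changing the function signature again.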
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index f8cf619edb5f..8e58c4cc2cb9 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -53,7 +53,7 @@ static void f2fs_write_end_io(struct bio *bio, int err)
53 struct page *page = bvec->bv_page; 53 struct page *page = bvec->bv_page;
54 54
55 if (unlikely(err)) { 55 if (unlikely(err)) {
56 SetPageError(page); 56 set_page_dirty(page);
57 set_bit(AS_EIO, &page->mapping->flags); 57 set_bit(AS_EIO, &page->mapping->flags);
58 f2fs_stop_checkpoint(sbi); 58 f2fs_stop_checkpoint(sbi);
59 } 59 }
@@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
85 bio = bio_alloc(GFP_NOIO, npages); 85 bio = bio_alloc(GFP_NOIO, npages);
86 86
87 bio->bi_bdev = sbi->sb->s_bdev; 87 bio->bi_bdev = sbi->sb->s_bdev;
88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); 88 bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; 89 bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
90 bio->bi_private = sbi; 90 bio->bi_private = sbi;
91 91
@@ -139,7 +139,10 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
139 /* change META to META_FLUSH in the checkpoint procedure */ 139 /* change META to META_FLUSH in the checkpoint procedure */
140 if (type >= META_FLUSH) { 140 if (type >= META_FLUSH) {
141 io->fio.type = META_FLUSH; 141 io->fio.type = META_FLUSH;
142 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; 142 if (test_opt(sbi, NOBARRIER))
143 io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO;
144 else
145 io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO;
143 } 146 }
144 __submit_merged_bio(io); 147 __submit_merged_bio(io);
145 up_write(&io->io_rwsem); 148 up_write(&io->io_rwsem);
@@ -190,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
190 __submit_merged_bio(io); 193 __submit_merged_bio(io);
191alloc_new: 194alloc_new:
192 if (io->bio == NULL) { 195 if (io->bio == NULL) {
193 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 196 int bio_blocks = MAX_BIO_BLOCKS(sbi);
194 197
195 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); 198 io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
196 io->fio = *fio; 199 io->fio = *fio;
@@ -233,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
233 236
234int reserve_new_block(struct dnode_of_data *dn) 237int reserve_new_block(struct dnode_of_data *dn)
235{ 238{
236 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
237 240
238 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 241 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
239 return -EPERM; 242 return -EPERM;
@@ -255,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
255 int err; 258 int err;
256 259
257 /* if inode_page exists, index should be zero */ 260 /* if inode_page exists, index should be zero */
258 f2fs_bug_on(!need_put && index); 261 f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index);
259 262
260 err = get_dnode_of_data(dn, index, ALLOC_NODE); 263 err = get_dnode_of_data(dn, index, ALLOC_NODE);
261 if (err) 264 if (err)
@@ -318,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
318 block_t start_blkaddr, end_blkaddr; 321 block_t start_blkaddr, end_blkaddr;
319 int need_update = true; 322 int need_update = true;
320 323
321 f2fs_bug_on(blk_addr == NEW_ADDR); 324 f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
322 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + 325 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
323 dn->ofs_in_node; 326 dn->ofs_in_node;
324 327
@@ -393,7 +396,6 @@ end_update:
393 396
394struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) 397struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
395{ 398{
396 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
397 struct address_space *mapping = inode->i_mapping; 399 struct address_space *mapping = inode->i_mapping;
398 struct dnode_of_data dn; 400 struct dnode_of_data dn;
399 struct page *page; 401 struct page *page;
@@ -426,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
426 return page; 428 return page;
427 } 429 }
428 430
429 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 431 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
430 sync ? READ_SYNC : READA); 432 sync ? READ_SYNC : READA);
431 if (err) 433 if (err)
432 return ERR_PTR(err); 434 return ERR_PTR(err);
@@ -448,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
448 */ 450 */
449struct page *get_lock_data_page(struct inode *inode, pgoff_t index) 451struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
450{ 452{
451 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
452 struct address_space *mapping = inode->i_mapping; 453 struct address_space *mapping = inode->i_mapping;
453 struct dnode_of_data dn; 454 struct dnode_of_data dn;
454 struct page *page; 455 struct page *page;
@@ -487,7 +488,8 @@ repeat:
487 return page; 488 return page;
488 } 489 }
489 490
490 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); 491 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
492 dn.data_blkaddr, READ_SYNC);
491 if (err) 493 if (err)
492 return ERR_PTR(err); 494 return ERR_PTR(err);
493 495
@@ -514,7 +516,6 @@ repeat:
514struct page *get_new_data_page(struct inode *inode, 516struct page *get_new_data_page(struct inode *inode,
515 struct page *ipage, pgoff_t index, bool new_i_size) 517 struct page *ipage, pgoff_t index, bool new_i_size)
516{ 518{
517 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
518 struct address_space *mapping = inode->i_mapping; 519 struct address_space *mapping = inode->i_mapping;
519 struct page *page; 520 struct page *page;
520 struct dnode_of_data dn; 521 struct dnode_of_data dn;
@@ -538,8 +539,8 @@ repeat:
538 zero_user_segment(page, 0, PAGE_CACHE_SIZE); 539 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
539 SetPageUptodate(page); 540 SetPageUptodate(page);
540 } else { 541 } else {
541 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 542 err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
542 READ_SYNC); 543 dn.data_blkaddr, READ_SYNC);
543 if (err) 544 if (err)
544 goto put_err; 545 goto put_err;
545 546
@@ -570,10 +571,12 @@ put_err:
570 571
571static int __allocate_data_block(struct dnode_of_data *dn) 572static int __allocate_data_block(struct dnode_of_data *dn)
572{ 573{
573 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 574 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
575 struct f2fs_inode_info *fi = F2FS_I(dn->inode);
574 struct f2fs_summary sum; 576 struct f2fs_summary sum;
575 block_t new_blkaddr; 577 block_t new_blkaddr;
576 struct node_info ni; 578 struct node_info ni;
579 pgoff_t fofs;
577 int type; 580 int type;
578 581
579 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) 582 if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
@@ -596,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn)
596 update_extent_cache(new_blkaddr, dn); 599 update_extent_cache(new_blkaddr, dn);
597 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); 600 clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
598 601
602 /* update i_size */
603 fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
604 dn->ofs_in_node;
605 if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
606 i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
607
599 dn->data_blkaddr = new_blkaddr; 608 dn->data_blkaddr = new_blkaddr;
600 return 0; 609 return 0;
601} 610}
@@ -611,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn)
611static int __get_data_block(struct inode *inode, sector_t iblock, 620static int __get_data_block(struct inode *inode, sector_t iblock,
612 struct buffer_head *bh_result, int create, bool fiemap) 621 struct buffer_head *bh_result, int create, bool fiemap)
613{ 622{
614 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
615 unsigned int blkbits = inode->i_sb->s_blocksize_bits; 623 unsigned int blkbits = inode->i_sb->s_blocksize_bits;
616 unsigned maxblocks = bh_result->b_size >> blkbits; 624 unsigned maxblocks = bh_result->b_size >> blkbits;
617 struct dnode_of_data dn; 625 struct dnode_of_data dn;
@@ -626,8 +634,10 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
626 if (check_extent_cache(inode, pgofs, bh_result)) 634 if (check_extent_cache(inode, pgofs, bh_result))
627 goto out; 635 goto out;
628 636
629 if (create) 637 if (create) {
630 f2fs_lock_op(sbi); 638 f2fs_balance_fs(F2FS_I_SB(inode));
639 f2fs_lock_op(F2FS_I_SB(inode));
640 }
631 641
632 /* When reading holes, we need its node page */ 642 /* When reading holes, we need its node page */
633 set_new_dnode(&dn, inode, NULL, NULL, 0); 643 set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -686,7 +696,7 @@ get_next:
686 allocated = true; 696 allocated = true;
687 blkaddr = dn.data_blkaddr; 697 blkaddr = dn.data_blkaddr;
688 } 698 }
689 /* Give more consecutive addresses for the read ahead */ 699 /* Give more consecutive addresses for the readahead */
690 if (blkaddr == (bh_result->b_blocknr + ofs)) { 700 if (blkaddr == (bh_result->b_blocknr + ofs)) {
691 ofs++; 701 ofs++;
692 dn.ofs_in_node++; 702 dn.ofs_in_node++;
@@ -702,7 +712,7 @@ put_out:
702 f2fs_put_dnode(&dn); 712 f2fs_put_dnode(&dn);
703unlock_out: 713unlock_out:
704 if (create) 714 if (create)
705 f2fs_unlock_op(sbi); 715 f2fs_unlock_op(F2FS_I_SB(inode));
706out: 716out:
707 trace_f2fs_get_data_block(inode, iblock, bh_result, err); 717 trace_f2fs_get_data_block(inode, iblock, bh_result, err);
708 return err; 718 return err;
@@ -734,7 +744,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page)
734 744
735 trace_f2fs_readpage(page, DATA); 745 trace_f2fs_readpage(page, DATA);
736 746
737 /* If the file has inline data, try to read it directlly */ 747 /* If the file has inline data, try to read it directly */
738 if (f2fs_has_inline_data(inode)) 748 if (f2fs_has_inline_data(inode))
739 ret = f2fs_read_inline_data(inode, page); 749 ret = f2fs_read_inline_data(inode, page);
740 else 750 else
@@ -784,9 +794,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
784 !is_cold_data(page) && 794 !is_cold_data(page) &&
785 need_inplace_update(inode))) { 795 need_inplace_update(inode))) {
786 rewrite_data_page(page, old_blkaddr, fio); 796 rewrite_data_page(page, old_blkaddr, fio);
797 set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
787 } else { 798 } else {
788 write_data_page(page, &dn, &new_blkaddr, fio); 799 write_data_page(page, &dn, &new_blkaddr, fio);
789 update_extent_cache(new_blkaddr, &dn); 800 update_extent_cache(new_blkaddr, &dn);
801 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
790 } 802 }
791out_writepage: 803out_writepage:
792 f2fs_put_dnode(&dn); 804 f2fs_put_dnode(&dn);
@@ -797,7 +809,7 @@ static int f2fs_write_data_page(struct page *page,
797 struct writeback_control *wbc) 809 struct writeback_control *wbc)
798{ 810{
799 struct inode *inode = page->mapping->host; 811 struct inode *inode = page->mapping->host;
800 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 812 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
801 loff_t i_size = i_size_read(inode); 813 loff_t i_size = i_size_read(inode);
802 const pgoff_t end_index = ((unsigned long long) i_size) 814 const pgoff_t end_index = ((unsigned long long) i_size)
803 >> PAGE_CACHE_SHIFT; 815 >> PAGE_CACHE_SHIFT;
@@ -829,10 +841,19 @@ write:
829 841
830 /* Dentry blocks are controlled by checkpoint */ 842 /* Dentry blocks are controlled by checkpoint */
831 if (S_ISDIR(inode->i_mode)) { 843 if (S_ISDIR(inode->i_mode)) {
844 if (unlikely(f2fs_cp_error(sbi)))
845 goto redirty_out;
832 err = do_write_data_page(page, &fio); 846 err = do_write_data_page(page, &fio);
833 goto done; 847 goto done;
834 } 848 }
835 849
850 /* we should bypass data pages to let the kworker jobs proceed */
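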
851 if (unlikely(f2fs_cp_error(sbi))) {
852 SetPageError(page);
853 unlock_page(page);
854 goto out;
855 }
856
836 if (!wbc->for_reclaim) 857 if (!wbc->for_reclaim)
837 need_balance_fs = true; 858 need_balance_fs = true;
838 else if (has_not_enough_free_secs(sbi, 0)) 859 else if (has_not_enough_free_secs(sbi, 0))
@@ -850,7 +871,7 @@ done:
850 871
851 clear_cold_data(page); 872 clear_cold_data(page);
852out: 873out:
853 inode_dec_dirty_dents(inode); 874 inode_dec_dirty_pages(inode);
854 unlock_page(page); 875 unlock_page(page);
855 if (need_balance_fs) 876 if (need_balance_fs)
856 f2fs_balance_fs(sbi); 877 f2fs_balance_fs(sbi);
@@ -876,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
876 struct writeback_control *wbc) 897 struct writeback_control *wbc)
877{ 898{
878 struct inode *inode = mapping->host; 899 struct inode *inode = mapping->host;
879 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 900 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
880 bool locked = false; 901 bool locked = false;
881 int ret; 902 int ret;
882 long diff; 903 long diff;
@@ -888,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
888 return 0; 909 return 0;
889 910
890 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && 911 if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE &&
891 get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && 912 get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
892 available_free_memory(sbi, DIRTY_DENTS)) 913 available_free_memory(sbi, DIRTY_DENTS))
893 goto skip_write; 914 goto skip_write;
894 915
@@ -910,16 +931,26 @@ static int f2fs_write_data_pages(struct address_space *mapping,
910 return ret; 931 return ret;
911 932
912skip_write: 933skip_write:
913 wbc->pages_skipped += get_dirty_dents(inode); 934 wbc->pages_skipped += get_dirty_pages(inode);
914 return 0; 935 return 0;
915} 936}
916 937
938static void f2fs_write_failed(struct address_space *mapping, loff_t to)
939{
940 struct inode *inode = mapping->host;
941
942 if (to > inode->i_size) {
943 truncate_pagecache(inode, inode->i_size);
944 truncate_blocks(inode, inode->i_size, true);
945 }
946}
947
917static int f2fs_write_begin(struct file *file, struct address_space *mapping, 948static int f2fs_write_begin(struct file *file, struct address_space *mapping,
918 loff_t pos, unsigned len, unsigned flags, 949 loff_t pos, unsigned len, unsigned flags,
919 struct page **pagep, void **fsdata) 950 struct page **pagep, void **fsdata)
920{ 951{
921 struct inode *inode = mapping->host; 952 struct inode *inode = mapping->host;
922 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 953 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
923 struct page *page; 954 struct page *page;
924 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; 955 pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
925 struct dnode_of_data dn; 956 struct dnode_of_data dn;
@@ -929,13 +960,15 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
929 960
930 f2fs_balance_fs(sbi); 961 f2fs_balance_fs(sbi);
931repeat: 962repeat:
932 err = f2fs_convert_inline_data(inode, pos + len); 963 err = f2fs_convert_inline_data(inode, pos + len, NULL);
933 if (err) 964 if (err)
934 return err; 965 goto fail;
935 966
936 page = grab_cache_page_write_begin(mapping, index, flags); 967 page = grab_cache_page_write_begin(mapping, index, flags);
937 if (!page) 968 if (!page) {
938 return -ENOMEM; 969 err = -ENOMEM;
970 goto fail;
971 }
939 972
940 /* to avoid latency during memory pressure */ 973 /* to avoid latency during memory pressure */
941 unlock_page(page); 974 unlock_page(page);
@@ -949,10 +982,9 @@ repeat:
949 set_new_dnode(&dn, inode, NULL, NULL, 0); 982 set_new_dnode(&dn, inode, NULL, NULL, 0);
950 err = f2fs_reserve_block(&dn, index); 983 err = f2fs_reserve_block(&dn, index);
951 f2fs_unlock_op(sbi); 984 f2fs_unlock_op(sbi);
952
953 if (err) { 985 if (err) {
954 f2fs_put_page(page, 0); 986 f2fs_put_page(page, 0);
955 return err; 987 goto fail;
956 } 988 }
957inline_data: 989inline_data:
958 lock_page(page); 990 lock_page(page);
@@ -982,19 +1014,20 @@ inline_data:
982 err = f2fs_read_inline_data(inode, page); 1014 err = f2fs_read_inline_data(inode, page);
983 if (err) { 1015 if (err) {
984 page_cache_release(page); 1016 page_cache_release(page);
985 return err; 1017 goto fail;
986 } 1018 }
987 } else { 1019 } else {
988 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, 1020 err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
989 READ_SYNC); 1021 READ_SYNC);
990 if (err) 1022 if (err)
991 return err; 1023 goto fail;
992 } 1024 }
993 1025
994 lock_page(page); 1026 lock_page(page);
995 if (unlikely(!PageUptodate(page))) { 1027 if (unlikely(!PageUptodate(page))) {
996 f2fs_put_page(page, 1); 1028 f2fs_put_page(page, 1);
997 return -EIO; 1029 err = -EIO;
1030 goto fail;
998 } 1031 }
999 if (unlikely(page->mapping != mapping)) { 1032 if (unlikely(page->mapping != mapping)) {
1000 f2fs_put_page(page, 1); 1033 f2fs_put_page(page, 1);
@@ -1005,6 +1038,9 @@ out:
1005 SetPageUptodate(page); 1038 SetPageUptodate(page);
1006 clear_cold_data(page); 1039 clear_cold_data(page);
1007 return 0; 1040 return 0;
1041fail:
1042 f2fs_write_failed(mapping, pos + len);
1043 return err;
1008} 1044}
1009 1045
1010static int f2fs_write_end(struct file *file, 1046static int f2fs_write_end(struct file *file,
@@ -1016,8 +1052,10 @@ static int f2fs_write_end(struct file *file,
1016 1052
1017 trace_f2fs_write_end(inode, pos, len, copied); 1053 trace_f2fs_write_end(inode, pos, len, copied);
1018 1054
1019 SetPageUptodate(page); 1055 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
1020 set_page_dirty(page); 1056 register_inmem_page(inode, page);
1057 else
1058 set_page_dirty(page);
1021 1059
1022 if (pos + copied > i_size_read(inode)) { 1060 if (pos + copied > i_size_read(inode)) {
1023 i_size_write(inode, pos + copied); 1061 i_size_write(inode, pos + copied);
@@ -1050,7 +1088,10 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1050 struct iov_iter *iter, loff_t offset) 1088 struct iov_iter *iter, loff_t offset)
1051{ 1089{
1052 struct file *file = iocb->ki_filp; 1090 struct file *file = iocb->ki_filp;
1053 struct inode *inode = file->f_mapping->host; 1091 struct address_space *mapping = file->f_mapping;
1092 struct inode *inode = mapping->host;
1093 size_t count = iov_iter_count(iter);
1094 int err;
1054 1095
1055 /* Let buffer I/O handle the inline data case. */ 1096 /* Let buffer I/O handle the inline data case. */
1056 if (f2fs_has_inline_data(inode)) 1097 if (f2fs_has_inline_data(inode))
@@ -1059,19 +1100,27 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
1059 if (check_direct_IO(inode, rw, iter, offset)) 1100 if (check_direct_IO(inode, rw, iter, offset))
1060 return 0; 1101 return 0;
1061 1102
1062 /* clear fsync mark to recover these blocks */ 1103 trace_f2fs_direct_IO_enter(inode, offset, count, rw);
1063 fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); 1104
1105 err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
1106 if (err < 0 && (rw & WRITE))
1107 f2fs_write_failed(mapping, offset + count);
1108
1109 trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
1064 1110
1065 return blockdev_direct_IO(rw, iocb, inode, iter, offset, 1111 return err;
1066 get_data_block);
1067} 1112}
1068 1113
1069static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, 1114static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
1070 unsigned int length) 1115 unsigned int length)
1071{ 1116{
1072 struct inode *inode = page->mapping->host; 1117 struct inode *inode = page->mapping->host;
1118
1119 if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
1120 return;
1121
1073 if (PageDirty(page)) 1122 if (PageDirty(page))
1074 inode_dec_dirty_dents(inode); 1123 inode_dec_dirty_pages(inode);
1075 ClearPagePrivate(page); 1124 ClearPagePrivate(page);
1076} 1125}
1077 1126
@@ -1093,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
1093 1142
1094 if (!PageDirty(page)) { 1143 if (!PageDirty(page)) {
1095 __set_page_dirty_nobuffers(page); 1144 __set_page_dirty_nobuffers(page);
1096 set_dirty_dir_page(inode, page); 1145 update_dirty_page(inode, page);
1097 return 1; 1146 return 1;
1098 } 1147 }
1099 return 0; 1148 return 0;
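A pattern worth noting in the data.c hunks: every error path that runs after block space may have been reserved, including the new direct-IO write failure, funnels into f2fs_write_failed(), which trims both the page cache and the allocated blocks back to the current i_size. A minimal sketch of that call shape, with a hypothetical reserve_space() standing in for the real reservation steps:

	static int write_path_sketch(struct address_space *mapping,
					loff_t pos, unsigned len)
	{
		int err;

		err = reserve_space(mapping->host, pos, len);	/* hypothetical step */
		if (err)
			goto fail;
		return 0;
	fail:
		/* one cleanup undoes any allocation beyond the old EOF */
		f2fs_write_failed(mapping, pos + len);
		return err;
	}

Centralizing the undo in one helper is what makes it safe for f2fs_write_begin() to report -ENOMEM, -EIO, or a read failure through the same fail label.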
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index b52c12cf5873..0a91ab813a9e 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -32,7 +32,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
32 struct f2fs_stat_info *si = F2FS_STAT(sbi); 32 struct f2fs_stat_info *si = F2FS_STAT(sbi);
33 int i; 33 int i;
34 34
35 /* valid check of the segment numbers */ 35 /* validation check of the segment numbers */
36 si->hit_ext = sbi->read_hit_ext; 36 si->hit_ext = sbi->read_hit_ext;
37 si->total_ext = sbi->total_hit_ext; 37 si->total_ext = sbi->total_hit_ext;
38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); 38 si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
93 total_vblocks = 0; 93 total_vblocks = 0;
94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); 94 blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
95 hblks_per_sec = blks_per_sec / 2; 95 hblks_per_sec = blks_per_sec / 2;
96 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 96 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
97 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); 97 vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
98 dist = abs(vblocks - hblks_per_sec); 98 dist = abs(vblocks - hblks_per_sec);
99 bimodal += dist * dist; 99 bimodal += dist * dist;
@@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
103 ndirty++; 103 ndirty++;
104 } 104 }
105 } 105 }
106 dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; 106 dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100;
107 si->bimodal = bimodal / dist; 107 si->bimodal = bimodal / dist;
108 if (si->dirty_count) 108 if (si->dirty_count)
109 si->avg_vblocks = total_vblocks / ndirty; 109 si->avg_vblocks = total_vblocks / ndirty;
@@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
131 131
132 /* build sit */ 132 /* build sit */
133 si->base_mem += sizeof(struct sit_info); 133 si->base_mem += sizeof(struct sit_info);
134 si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); 134 si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
135 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); 135 si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
136 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); 136 si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
137 if (sbi->segs_per_sec > 1) 137 if (sbi->segs_per_sec > 1)
138 si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); 138 si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
139 si->base_mem += __bitmap_size(sbi, SIT_BITMAP); 139 si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
140 140
141 /* build free segmap */ 141 /* build free segmap */
142 si->base_mem += sizeof(struct free_segmap_info); 142 si->base_mem += sizeof(struct free_segmap_info);
143 si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); 143 si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
144 si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); 144 si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
145 145
146 /* build curseg */ 146 /* build curseg */
147 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; 147 si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE;
@@ -149,10 +149,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
149 149
150 /* build dirty segmap */ 150 /* build dirty segmap */
151 si->base_mem += sizeof(struct dirty_seglist_info); 151 si->base_mem += sizeof(struct dirty_seglist_info);
152 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); 152 si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi));
153 si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); 153 si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi));
154 154
155 /* buld nm */ 155 /* build nm */
156 si->base_mem += sizeof(struct f2fs_nm_info); 156 si->base_mem += sizeof(struct f2fs_nm_info);
157 si->base_mem += __bitmap_size(sbi, NAT_BITMAP); 157 si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
158 158
@@ -167,7 +167,7 @@ get_cache:
167 si->cache_mem += npages << PAGE_CACHE_SHIFT; 167 si->cache_mem += npages << PAGE_CACHE_SHIFT;
168 npages = META_MAPPING(sbi)->nrpages; 168 npages = META_MAPPING(sbi)->nrpages;
169 si->cache_mem += npages << PAGE_CACHE_SHIFT; 169 si->cache_mem += npages << PAGE_CACHE_SHIFT;
170 si->cache_mem += sbi->n_orphans * sizeof(struct orphan_inode_entry); 170 si->cache_mem += sbi->n_orphans * sizeof(struct ino_entry);
171 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry); 171 si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
172} 172}
173 173
@@ -345,21 +345,14 @@ void __init f2fs_create_root_stats(void)
345 345
346 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); 346 f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
347 if (!f2fs_debugfs_root) 347 if (!f2fs_debugfs_root)
348 goto bail; 348 return;
349 349
350 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, 350 file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
351 NULL, &stat_fops); 351 NULL, &stat_fops);
352 if (!file) 352 if (!file) {
353 goto free_debugfs_dir; 353 debugfs_remove(f2fs_debugfs_root);
354 354 f2fs_debugfs_root = NULL;
355 return; 355 }
356
357free_debugfs_dir:
358 debugfs_remove(f2fs_debugfs_root);
359
360bail:
361 f2fs_debugfs_root = NULL;
362 return;
363} 356}
364 357
365void f2fs_destroy_root_stats(void) 358void f2fs_destroy_root_stats(void)
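The f2fs_create_root_stats() rewrite above is a small idiom change: with only one resource to unwind, the two-label goto ladder collapses into an inline cleanup at the single failure site. The same shape with generic names, a sketch under the debugfs semantics of this kernel era, where both calls return NULL on failure:

	static struct dentry *example_root;
	static const struct file_operations example_fops;	/* placeholder fops */

	void example_create_stats(void)
	{
		struct dentry *file;

		example_root = debugfs_create_dir("example", NULL);
		if (!example_root)
			return;			/* nothing allocated yet */

		file = debugfs_create_file("status", S_IRUGO, example_root,
						NULL, &example_fops);
		if (!file) {
			debugfs_remove(example_root);	/* single resource, inline undo */
			example_root = NULL;
		}
	}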
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index a4addd72ebbd..b54f87149c09 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -77,8 +77,8 @@ static unsigned long dir_block_index(unsigned int level,
77 return bidx; 77 return bidx;
78} 78}
79 79
80static bool early_match_name(const char *name, size_t namelen, 80static bool early_match_name(size_t namelen, f2fs_hash_t namehash,
81 f2fs_hash_t namehash, struct f2fs_dir_entry *de) 81 struct f2fs_dir_entry *de)
82{ 82{
83 if (le16_to_cpu(de->name_len) != namelen) 83 if (le16_to_cpu(de->name_len) != namelen)
84 return false; 84 return false;
@@ -90,7 +90,7 @@ static bool early_match_name(const char *name, size_t namelen,
90} 90}
91 91
92static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, 92static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
93 const char *name, size_t namelen, int *max_slots, 93 struct qstr *name, int *max_slots,
94 f2fs_hash_t namehash, struct page **res_page) 94 f2fs_hash_t namehash, struct page **res_page)
95{ 95{
96 struct f2fs_dir_entry *de; 96 struct f2fs_dir_entry *de;
@@ -109,9 +109,10 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
109 continue; 109 continue;
110 } 110 }
111 de = &dentry_blk->dentry[bit_pos]; 111 de = &dentry_blk->dentry[bit_pos];
112 if (early_match_name(name, namelen, namehash, de)) { 112 if (early_match_name(name->len, namehash, de)) {
113 if (!memcmp(dentry_blk->filename[bit_pos], 113 if (!memcmp(dentry_blk->filename[bit_pos],
114 name, namelen)) { 114 name->name,
115 name->len)) {
115 *res_page = dentry_page; 116 *res_page = dentry_page;
116 goto found; 117 goto found;
117 } 118 }
@@ -120,6 +121,13 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
120 *max_slots = max_len; 121 *max_slots = max_len;
121 max_len = 0; 122 max_len = 0;
122 } 123 }
124
125 /*
126 * For the most part, it should be a bug when name_len is zero.
127 * We stop here to figure out where the bug has occurred.
128 */
129 f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len);
130
123 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); 131 bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len));
124 } 132 }
125 133
@@ -132,10 +140,10 @@ found:
132} 140}
133 141
134static struct f2fs_dir_entry *find_in_level(struct inode *dir, 142static struct f2fs_dir_entry *find_in_level(struct inode *dir,
135 unsigned int level, const char *name, size_t namelen, 143 unsigned int level, struct qstr *name,
136 f2fs_hash_t namehash, struct page **res_page) 144 f2fs_hash_t namehash, struct page **res_page)
137{ 145{
138 int s = GET_DENTRY_SLOTS(namelen); 146 int s = GET_DENTRY_SLOTS(name->len);
139 unsigned int nbucket, nblock; 147 unsigned int nbucket, nblock;
140 unsigned int bidx, end_block; 148 unsigned int bidx, end_block;
141 struct page *dentry_page; 149 struct page *dentry_page;
@@ -143,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
143 bool room = false; 151 bool room = false;
144 int max_slots = 0; 152 int max_slots = 0;
145 153
146 f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); 154 f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
147 155
148 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); 156 nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
149 nblock = bucket_blocks(level); 157 nblock = bucket_blocks(level);
@@ -160,8 +168,8 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
160 continue; 168 continue;
161 } 169 }
162 170
163 de = find_in_block(dentry_page, name, namelen, 171 de = find_in_block(dentry_page, name, &max_slots,
164 &max_slots, namehash, res_page); 172 namehash, res_page);
165 if (de) 173 if (de)
166 break; 174 break;
167 175
@@ -187,8 +195,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
187struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, 195struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
188 struct qstr *child, struct page **res_page) 196 struct qstr *child, struct page **res_page)
189{ 197{
190 const char *name = child->name;
191 size_t namelen = child->len;
192 unsigned long npages = dir_blocks(dir); 198 unsigned long npages = dir_blocks(dir);
193 struct f2fs_dir_entry *de = NULL; 199 struct f2fs_dir_entry *de = NULL;
194 f2fs_hash_t name_hash; 200 f2fs_hash_t name_hash;
@@ -200,12 +206,11 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
200 206
201 *res_page = NULL; 207 *res_page = NULL;
202 208
203 name_hash = f2fs_dentry_hash(name, namelen); 209 name_hash = f2fs_dentry_hash(child);
204 max_depth = F2FS_I(dir)->i_current_depth; 210 max_depth = F2FS_I(dir)->i_current_depth;
205 211
206 for (level = 0; level < max_depth; level++) { 212 for (level = 0; level < max_depth; level++) {
207 de = find_in_level(dir, level, name, 213 de = find_in_level(dir, level, child, name_hash, res_page);
208 namelen, name_hash, res_page);
209 if (de) 214 if (de)
210 break; 215 break;
211 } 216 }
@@ -279,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage)
279 284
280int update_dent_inode(struct inode *inode, const struct qstr *name) 285int update_dent_inode(struct inode *inode, const struct qstr *name)
281{ 286{
282 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
283 struct page *page; 287 struct page *page;
284 288
285 page = get_node_page(sbi, inode->i_ino); 289 page = get_node_page(F2FS_I_SB(inode), inode->i_ino);
286 if (IS_ERR(page)) 290 if (IS_ERR(page))
287 return PTR_ERR(page); 291 return PTR_ERR(page);
288 292
@@ -298,14 +302,13 @@ static int make_empty_dir(struct inode *inode,
298 struct page *dentry_page; 302 struct page *dentry_page;
299 struct f2fs_dentry_block *dentry_blk; 303 struct f2fs_dentry_block *dentry_blk;
300 struct f2fs_dir_entry *de; 304 struct f2fs_dir_entry *de;
301 void *kaddr;
302 305
303 dentry_page = get_new_data_page(inode, page, 0, true); 306 dentry_page = get_new_data_page(inode, page, 0, true);
304 if (IS_ERR(dentry_page)) 307 if (IS_ERR(dentry_page))
305 return PTR_ERR(dentry_page); 308 return PTR_ERR(dentry_page);
306 309
307 kaddr = kmap_atomic(dentry_page); 310
308 dentry_blk = (struct f2fs_dentry_block *)kaddr; 311 dentry_blk = kmap_atomic(dentry_page);
309 312
310 de = &dentry_blk->dentry[0]; 313 de = &dentry_blk->dentry[0];
311 de->name_len = cpu_to_le16(1); 314 de->name_len = cpu_to_le16(1);
@@ -323,7 +326,7 @@ static int make_empty_dir(struct inode *inode,
323 326
324 test_and_set_bit_le(0, &dentry_blk->dentry_bitmap); 327 test_and_set_bit_le(0, &dentry_blk->dentry_bitmap);
325 test_and_set_bit_le(1, &dentry_blk->dentry_bitmap); 328 test_and_set_bit_le(1, &dentry_blk->dentry_bitmap);
326 kunmap_atomic(kaddr); 329 kunmap_atomic(dentry_blk);
327 330
328 set_page_dirty(dentry_page); 331 set_page_dirty(dentry_page);
329 f2fs_put_page(dentry_page, 1); 332 f2fs_put_page(dentry_page, 1);
@@ -337,7 +340,7 @@ static struct page *init_inode_metadata(struct inode *inode,
337 int err; 340 int err;
338 341
339 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { 342 if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) {
340 page = new_inode_page(inode, name); 343 page = new_inode_page(inode);
341 if (IS_ERR(page)) 344 if (IS_ERR(page))
342 return page; 345 return page;
343 346
@@ -355,14 +358,15 @@ static struct page *init_inode_metadata(struct inode *inode,
355 if (err) 358 if (err)
356 goto put_error; 359 goto put_error;
357 } else { 360 } else {
358 page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); 361 page = get_node_page(F2FS_I_SB(dir), inode->i_ino);
359 if (IS_ERR(page)) 362 if (IS_ERR(page))
360 return page; 363 return page;
361 364
362 set_cold_node(inode, page); 365 set_cold_node(inode, page);
363 } 366 }
364 367
365 init_dent_inode(name, page); 368 if (name)
369 init_dent_inode(name, page);
366 370
367 /* 371 /*
368 * This file should be checkpointed during fsync. 372 * This file should be checkpointed during fsync.
@@ -370,6 +374,12 @@ static struct page *init_inode_metadata(struct inode *inode,
370 */ 374 */
371 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { 375 if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) {
372 file_lost_pino(inode); 376 file_lost_pino(inode);
377 /*
378 * If we link the tmpfile to an alias through the linkat path,
379 * we should remove this inode from the orphan list.
380 */
381 if (inode->i_nlink == 0)
382 remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino);
373 inc_nlink(inode); 383 inc_nlink(inode);
374 } 384 }
375 return page; 385 return page;
@@ -379,7 +389,7 @@ put_error:
379error: 389error:
380 /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ 390 /* once the failed inode becomes a bad inode, i_mode is S_IFREG */
381 truncate_inode_pages(&inode->i_data, 0); 391 truncate_inode_pages(&inode->i_data, 0);
382 truncate_blocks(inode, 0); 392 truncate_blocks(inode, 0, false);
383 remove_dirty_dir_inode(inode); 393 remove_dirty_dir_inode(inode);
384 remove_inode_page(inode); 394 remove_inode_page(inode);
385 return ERR_PTR(err); 395 return ERR_PTR(err);
@@ -453,7 +463,7 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
453 int err = 0; 463 int err = 0;
454 int i; 464 int i;
455 465
456 dentry_hash = f2fs_dentry_hash(name->name, name->len); 466 dentry_hash = f2fs_dentry_hash(name);
457 level = 0; 467 level = 0;
458 current_depth = F2FS_I(dir)->i_current_depth; 468 current_depth = F2FS_I(dir)->i_current_depth;
459 if (F2FS_I(dir)->chash == dentry_hash) { 469 if (F2FS_I(dir)->chash == dentry_hash) {
@@ -529,8 +539,29 @@ fail:
529 return err; 539 return err;
530} 540}
531 541
542int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
543{
544 struct page *page;
545 int err = 0;
546
547 down_write(&F2FS_I(inode)->i_sem);
548 page = init_inode_metadata(inode, dir, NULL);
549 if (IS_ERR(page)) {
550 err = PTR_ERR(page);
551 goto fail;
552 }
553 /* we don't need to mark_inode_dirty now */
554 update_inode(inode, page);
555 f2fs_put_page(page, 1);
556
557 clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
558fail:
559 up_write(&F2FS_I(inode)->i_sem);
560 return err;
561}
562
532/* 563/*
533 * It only removes the dentry from the dentry page,corresponding name 564 * It only removes the dentry from the dentry page, corresponding name
534 * entry in name page does not need to be touched during deletion. 565 * entry in name page does not need to be touched during deletion.
535 */ 566 */
536void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, 567void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
@@ -538,17 +569,15 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
538{ 569{
539 struct f2fs_dentry_block *dentry_blk; 570 struct f2fs_dentry_block *dentry_blk;
540 unsigned int bit_pos; 571 unsigned int bit_pos;
541 struct address_space *mapping = page->mapping; 572 struct inode *dir = page->mapping->host;
542 struct inode *dir = mapping->host;
543 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); 573 int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
544 void *kaddr = page_address(page);
545 int i; 574 int i;
546 575
547 lock_page(page); 576 lock_page(page);
548 f2fs_wait_on_page_writeback(page, DATA); 577 f2fs_wait_on_page_writeback(page, DATA);
549 578
550 dentry_blk = (struct f2fs_dentry_block *)kaddr; 579 dentry_blk = page_address(page);
551 bit_pos = dentry - (struct f2fs_dir_entry *)dentry_blk->dentry; 580 bit_pos = dentry - dentry_blk->dentry;
552 for (i = 0; i < slots; i++) 581 for (i = 0; i < slots; i++)
553 test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); 582 test_and_clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap);
554 583
@@ -562,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
562 dir->i_ctime = dir->i_mtime = CURRENT_TIME; 591 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
563 592
564 if (inode) { 593 if (inode) {
565 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 594 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
566 595
567 down_write(&F2FS_I(inode)->i_sem); 596 down_write(&F2FS_I(inode)->i_sem);
568 597
@@ -589,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
589 truncate_hole(dir, page->index, page->index + 1); 618 truncate_hole(dir, page->index, page->index + 1);
590 clear_page_dirty_for_io(page); 619 clear_page_dirty_for_io(page);
591 ClearPageUptodate(page); 620 ClearPageUptodate(page);
592 inode_dec_dirty_dents(dir); 621 inode_dec_dirty_pages(dir);
593 } 622 }
594 f2fs_put_page(page, 1); 623 f2fs_put_page(page, 1);
595} 624}
@@ -603,7 +632,6 @@ bool f2fs_empty_dir(struct inode *dir)
603 unsigned long nblock = dir_blocks(dir); 632 unsigned long nblock = dir_blocks(dir);
604 633
605 for (bidx = 0; bidx < nblock; bidx++) { 634 for (bidx = 0; bidx < nblock; bidx++) {
606 void *kaddr;
607 dentry_page = get_lock_data_page(dir, bidx); 635 dentry_page = get_lock_data_page(dir, bidx);
608 if (IS_ERR(dentry_page)) { 636 if (IS_ERR(dentry_page)) {
609 if (PTR_ERR(dentry_page) == -ENOENT) 637 if (PTR_ERR(dentry_page) == -ENOENT)
@@ -612,8 +640,8 @@ bool f2fs_empty_dir(struct inode *dir)
612 return false; 640 return false;
613 } 641 }
614 642
615 kaddr = kmap_atomic(dentry_page); 643
616 dentry_blk = (struct f2fs_dentry_block *)kaddr; 644 dentry_blk = kmap_atomic(dentry_page);
617 if (bidx == 0) 645 if (bidx == 0)
618 bit_pos = 2; 646 bit_pos = 2;
619 else 647 else
@@ -621,7 +649,7 @@ bool f2fs_empty_dir(struct inode *dir)
621 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, 649 bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap,
622 NR_DENTRY_IN_BLOCK, 650 NR_DENTRY_IN_BLOCK,
623 bit_pos); 651 bit_pos);
624 kunmap_atomic(kaddr); 652 kunmap_atomic(dentry_blk);
625 653
626 f2fs_put_page(dentry_page, 1); 654 f2fs_put_page(dentry_page, 1);
627 655
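The kmap_atomic() cleanups in dir.c all follow one idiom: kmap_atomic() already returns the mapped kernel address, so it can be assigned straight to a typed pointer with no void *kaddr temporary, and kunmap_atomic() must later be handed that same pointer. The idiom in isolation (illustrative fragment):

	struct f2fs_dentry_block *dentry_blk;

	dentry_blk = kmap_atomic(dentry_page);	/* map and type in one step */
	/* ... read or update dentry_blk->dentry_bitmap and dentry[] ... */
	kunmap_atomic(dentry_blk);		/* pass back the address kmap returned */

Dropping the void * intermediate removes a cast and makes it harder to unmap the wrong address.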
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 58df97e174d0..8171e80b2ee9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,10 +21,16 @@
21#include <linux/sched.h> 21#include <linux/sched.h>
22 22
23#ifdef CONFIG_F2FS_CHECK_FS 23#ifdef CONFIG_F2FS_CHECK_FS
24#define f2fs_bug_on(condition) BUG_ON(condition) 24#define f2fs_bug_on(sbi, condition) BUG_ON(condition)
25#define f2fs_down_write(x, y) down_write_nest_lock(x, y) 25#define f2fs_down_write(x, y) down_write_nest_lock(x, y)
26#else 26#else
27#define f2fs_bug_on(condition) 27#define f2fs_bug_on(sbi, condition) \
28 do { \
29 if (unlikely(condition)) { \
30 WARN_ON(1); \
31 sbi->need_fsck = true; \
32 } \
33 } while (0)
28#define f2fs_down_write(x, y) down_write(x) 34#define f2fs_down_write(x, y) down_write(x)
29#endif 35#endif
30 36
@@ -41,6 +47,7 @@
41#define F2FS_MOUNT_INLINE_XATTR 0x00000080 47#define F2FS_MOUNT_INLINE_XATTR 0x00000080
42#define F2FS_MOUNT_INLINE_DATA 0x00000100 48#define F2FS_MOUNT_INLINE_DATA 0x00000100
43#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 49#define F2FS_MOUNT_FLUSH_MERGE 0x00000200
50#define F2FS_MOUNT_NOBARRIER 0x00000400
44 51
45#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) 52#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
46#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) 53#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -89,6 +96,20 @@ enum {
89 SIT_BITMAP 96 SIT_BITMAP
90}; 97};
91 98
99enum {
100 CP_UMOUNT,
101 CP_SYNC,
102 CP_DISCARD,
103};
104
105struct cp_control {
106 int reason;
107 __u64 trim_start;
108 __u64 trim_end;
109 __u64 trim_minlen;
110 __u64 trimmed;
111};
112
92/* 113/*
93 * For CP/NAT/SIT/SSA readahead 114 * For CP/NAT/SIT/SSA readahead
94 */ 115 */
@@ -96,11 +117,19 @@ enum {
96 META_CP, 117 META_CP,
97 META_NAT, 118 META_NAT,
98 META_SIT, 119 META_SIT,
99 META_SSA 120 META_SSA,
121 META_POR,
122};
123
124/* for the list of ino */
125enum {
126 ORPHAN_INO, /* for orphan ino list */
127 APPEND_INO, /* for append ino list */
128 UPDATE_INO, /* for update ino list */
129 MAX_INO_ENTRY, /* max. list */
100}; 130};
101 131
102/* for the list of orphan inodes */ 132struct ino_entry {
103struct orphan_inode_entry {
104 struct list_head list; /* list head */ 133 struct list_head list; /* list head */
105 nid_t ino; /* inode number */ 134 nid_t ino; /* inode number */
106}; 135};
@@ -122,7 +151,9 @@ struct discard_entry {
122struct fsync_inode_entry { 151struct fsync_inode_entry {
123 struct list_head list; /* list head */ 152 struct list_head list; /* list head */
124 struct inode *inode; /* vfs inode pointer */ 153 struct inode *inode; /* vfs inode pointer */
125 block_t blkaddr; /* block address locating the last inode */ 154 block_t blkaddr; /* block address locating the last fsync */
155 block_t last_dentry; /* block address locating the last dentry */
156 block_t last_inode; /* block address locating the last inode */
126}; 157};
127 158
128#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) 159#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats))
@@ -133,6 +164,9 @@ struct fsync_inode_entry {
133#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) 164#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se)
134#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) 165#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno)
135 166
167#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum))
168#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum))
169
136static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) 170static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i)
137{ 171{
138 int before = nats_in_cursum(rs); 172 int before = nats_in_cursum(rs);
@@ -147,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i)
147 return before; 181 return before;
148} 182}
149 183
184static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
185 int type)
186{
187 if (type == NAT_JOURNAL)
188 return size <= MAX_NAT_JENTRIES(sum);
189 return size <= MAX_SIT_JENTRIES(sum);
190}
191
150/* 192/*
151 * ioctl commands 193 * ioctl commands
152 */ 194 */
153#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS 195#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
154#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS 196#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
197
198#define F2FS_IOCTL_MAGIC 0xf5
199#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
200#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
201#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
155 202
156#if defined(__KERNEL__) && defined(CONFIG_COMPAT) 203#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
157/* 204/*
@@ -214,13 +261,16 @@ struct f2fs_inode_info {
214 /* Use below internally in f2fs*/ 261 /* Use below internally in f2fs*/
215 unsigned long flags; /* use to pass per-file flags */ 262 unsigned long flags; /* use to pass per-file flags */
216 struct rw_semaphore i_sem; /* protect fi info */ 263 struct rw_semaphore i_sem; /* protect fi info */
217 atomic_t dirty_dents; /* # of dirty dentry pages */ 264 atomic_t dirty_pages; /* # of dirty pages */
218 f2fs_hash_t chash; /* hash value of given file name */ 265 f2fs_hash_t chash; /* hash value of given file name */
219 unsigned int clevel; /* maximum level of given file name */ 266 unsigned int clevel; /* maximum level of given file name */
220 nid_t i_xattr_nid; /* node id that contains xattrs */ 267 nid_t i_xattr_nid; /* node id that contains xattrs */
221 unsigned long long xattr_ver; /* cp version of xattr modification */ 268 unsigned long long xattr_ver; /* cp version of xattr modification */
222 struct extent_info ext; /* in-memory extent cache entry */ 269 struct extent_info ext; /* in-memory extent cache entry */
223 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ 270 struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
271
272 struct list_head inmem_pages; /* inmemory pages managed by f2fs */
273 struct mutex inmem_lock; /* lock for inmemory pages */
224}; 274};
225 275
226static inline void get_extent_info(struct extent_info *ext, 276static inline void get_extent_info(struct extent_info *ext,
@@ -252,10 +302,11 @@ struct f2fs_nm_info {
252 302
253 /* NAT cache management */ 303 /* NAT cache management */
254 struct radix_tree_root nat_root;/* root of the nat entry cache */ 304 struct radix_tree_root nat_root;/* root of the nat entry cache */
305 struct radix_tree_root nat_set_root;/* root of the nat set cache */
255 rwlock_t nat_tree_lock; /* protect nat_tree_lock */ 306 rwlock_t nat_tree_lock; /* protect nat_tree_lock */
256 unsigned int nat_cnt; /* the # of cached nat entries */
257 struct list_head nat_entries; /* cached nat entry list (clean) */ 307 struct list_head nat_entries; /* cached nat entry list (clean) */
258 struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ 308 unsigned int nat_cnt; /* the # of cached nat entries */
309 unsigned int dirty_nat_cnt; /* total num of nat entries in set */
259 310
260 /* free node ids management */ 311 /* free node ids management */
261 struct radix_tree_root free_nid_root;/* root of the free_nid cache */ 312 struct radix_tree_root free_nid_root;/* root of the free_nid cache */
@@ -322,18 +373,16 @@ enum {
322}; 373};
323 374
324struct flush_cmd { 375struct flush_cmd {
325 struct flush_cmd *next;
326 struct completion wait; 376 struct completion wait;
377 struct llist_node llnode;
327 int ret; 378 int ret;
328}; 379};
329 380
330struct flush_cmd_control { 381struct flush_cmd_control {
331 struct task_struct *f2fs_issue_flush; /* flush thread */ 382 struct task_struct *f2fs_issue_flush; /* flush thread */
332 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ 383 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
333 struct flush_cmd *issue_list; /* list for command issue */ 384 struct llist_head issue_list; /* list for command issue */
334 struct flush_cmd *dispatch_list; /* list for command dispatch */ 385 struct llist_node *dispatch_list; /* list for command dispatch */
335 spinlock_t issue_lock; /* for issue list lock */
336 struct flush_cmd *issue_tail; /* list tail of issue list */
337}; 386};
338 387
339struct f2fs_sm_info { 388struct f2fs_sm_info {
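
The flush_cmd queue drops its spinlock, tail pointer, and hand-rolled list in favour of the kernel's lock-free llist: producers push with a single atomic llist_add(), and the flush thread detaches the entire backlog with llist_del_all(). A simplified sketch of both sides, assuming the kernel llist API; details such as restoring FIFO order with llist_reverse_order() are elided:

	/* Producer: one atomic push per command, no spinlock. */
	static int issue_flush(struct flush_cmd_control *fcc)
	{
		struct flush_cmd cmd;

		init_completion(&cmd.wait);
		llist_add(&cmd.llnode, &fcc->issue_list);
		wake_up(&fcc->flush_wait_queue);
		wait_for_completion(&cmd.wait);
		return cmd.ret;
	}

	/* Consumer (flush thread): drain all pending commands in one shot. */
	static void dispatch_flushes(struct flush_cmd_control *fcc)
	{
		struct flush_cmd *cmd, *next;

		fcc->dispatch_list = llist_del_all(&fcc->issue_list);
		/* submit a single FLUSH bio here, then wake every waiter */
		llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) {
			cmd->ret = 0;
			complete(&cmd->wait);
		}
	}
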
@@ -359,8 +408,11 @@ struct f2fs_sm_info {
359 int nr_discards; /* # of discards in the list */ 408 int nr_discards; /* # of discards in the list */
360 int max_discards; /* max. discards to be issued */ 409 int max_discards; /* max. discards to be issued */
361 410
411 struct list_head sit_entry_set; /* sit entry set list */
412
362 unsigned int ipu_policy; /* in-place-update policy */ 413 unsigned int ipu_policy; /* in-place-update policy */
363 unsigned int min_ipu_util; /* in-place-update threshold */ 414 unsigned int min_ipu_util; /* in-place-update threshold */
415 unsigned int min_fsync_blocks; /* threshold for fsync */
364 416
365 /* for flush command control */ 417 /* for flush command control */
366 struct flush_cmd_control *cmd_control_info; 418 struct flush_cmd_control *cmd_control_info;
@@ -385,7 +437,7 @@ enum count_type {
385}; 437};
386 438
387/* 439/*
388 * The below are the page types of bios used in submti_bio(). 440 * The below are the page types of bios used in submit_bio().
389 * The available types are: 441 * The available types are:
390 * DATA User data pages. It operates as async mode. 442 * DATA User data pages. It operates as async mode.
391 * NODE Node pages. It operates as async mode. 443 * NODE Node pages. It operates as async mode.
@@ -424,6 +476,7 @@ struct f2fs_sb_info {
424 struct buffer_head *raw_super_buf; /* buffer head of raw sb */ 476 struct buffer_head *raw_super_buf; /* buffer head of raw sb */
425 struct f2fs_super_block *raw_super; /* raw super block pointer */ 477 struct f2fs_super_block *raw_super; /* raw super block pointer */
426 int s_dirty; /* dirty flag for checkpoint */ 478 int s_dirty; /* dirty flag for checkpoint */
479 bool need_fsck; /* need fsck.f2fs to fix */
427 480
428 /* for node-related operations */ 481 /* for node-related operations */
429 struct f2fs_nm_info *nm_info; /* node manager */ 482 struct f2fs_nm_info *nm_info; /* node manager */
@@ -442,14 +495,17 @@ struct f2fs_sb_info {
442 struct inode *meta_inode; /* cache meta blocks */ 495 struct inode *meta_inode; /* cache meta blocks */
443 struct mutex cp_mutex; /* checkpoint procedure lock */ 496 struct mutex cp_mutex; /* checkpoint procedure lock */
444 struct rw_semaphore cp_rwsem; /* blocking FS operations */ 497 struct rw_semaphore cp_rwsem; /* blocking FS operations */
445 struct mutex node_write; /* locking node writes */ 498 struct rw_semaphore node_write; /* locking node writes */
446 struct mutex writepages; /* mutex for writepages() */ 499 struct mutex writepages; /* mutex for writepages() */
447 bool por_doing; /* recovery is doing or not */ 500 bool por_doing; /* recovery is doing or not */
448 wait_queue_head_t cp_wait; 501 wait_queue_head_t cp_wait;
449 502
450 /* for orphan inode management */ 503 /* for inode management */
451 struct list_head orphan_inode_list; /* orphan inode list */ 504 struct radix_tree_root ino_root[MAX_INO_ENTRY]; /* ino entry array */
452 spinlock_t orphan_inode_lock; /* for orphan inode list */ 505 spinlock_t ino_lock[MAX_INO_ENTRY]; /* for ino entry lock */
506 struct list_head ino_list[MAX_INO_ENTRY]; /* inode list head */
507
508 /* for orphan inode, use 0'th array */
453 unsigned int n_orphans; /* # of orphan inodes */ 509 unsigned int n_orphans; /* # of orphan inodes */
454 unsigned int max_orphans; /* max orphan inodes */ 510 unsigned int max_orphans; /* max orphan inodes */
455 511
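
Orphan bookkeeping generalizes into per-type ino-entry sets: one radix tree, lock, and list per MAX_INO_ENTRY slot, with slot 0 keeping the orphan semantics and later hunks adding APPEND_INO/UPDATE_INO users. A sketch of the lookup these arrays enable, mirroring what exist_written_data() is declared to do further down; the entry type name is assumed from this series:

	static bool written_since_last_cp(struct f2fs_sb_info *sbi,
						nid_t ino, int type)
	{
		struct ino_entry *e;	/* entry type assumed from this series */
		bool found;

		spin_lock(&sbi->ino_lock[type]);
		e = radix_tree_lookup(&sbi->ino_root[type], ino);
		found = (e != NULL);
		spin_unlock(&sbi->ino_lock[type]);
		return found;	/* true: inode wrote this kind of data since last cp */
	}
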
@@ -457,7 +513,7 @@ struct f2fs_sb_info {
457 struct list_head dir_inode_list; /* dir inode list */ 513 struct list_head dir_inode_list; /* dir inode list */
458 spinlock_t dir_inode_lock; /* for dir inode list lock */ 514 spinlock_t dir_inode_lock; /* for dir inode list lock */
459 515
460 /* basic file system units */ 516 /* basic filesystem units */
461 unsigned int log_sectors_per_block; /* log2 sectors per block */ 517 unsigned int log_sectors_per_block; /* log2 sectors per block */
462 unsigned int log_blocksize; /* log2 block size */ 518 unsigned int log_blocksize; /* log2 block size */
463 unsigned int blocksize; /* block size */ 519 unsigned int blocksize; /* block size */
@@ -526,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb)
526 return sb->s_fs_info; 582 return sb->s_fs_info;
527} 583}
528 584
585static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
586{
587 return F2FS_SB(inode->i_sb);
588}
589
590static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping)
591{
592 return F2FS_I_SB(mapping->host);
593}
594
595static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page)
596{
597 return F2FS_M_SB(page->mapping);
598}
599
529static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) 600static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi)
530{ 601{
531 return (struct f2fs_super_block *)(sbi->raw_super); 602 return (struct f2fs_super_block *)(sbi->raw_super);
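
The three accessors added above exist purely to shorten call sites; the later hunks in this diff that touched F2FS_SB(inode->i_sb) switch over to them. The intended substitution, shown in isolation:

	/* Before: spell out the chain at every call site. */
	struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);

	/* After: pick the accessor matching what the caller already holds. */
	struct f2fs_sb_info *a = F2FS_P_SB(page);	/* from a page */
	struct f2fs_sb_info *b = F2FS_M_SB(mapping);	/* from a mapping */
	struct f2fs_sb_info *c = F2FS_I_SB(inode);	/* from an inode */
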
@@ -690,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
690 blkcnt_t count) 761 blkcnt_t count)
691{ 762{
692 spin_lock(&sbi->stat_lock); 763 spin_lock(&sbi->stat_lock);
693 f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); 764 f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
694 f2fs_bug_on(inode->i_blocks < count); 765 f2fs_bug_on(sbi, inode->i_blocks < count);
695 inode->i_blocks -= count; 766 inode->i_blocks -= count;
696 sbi->total_valid_block_count -= (block_t)count; 767 sbi->total_valid_block_count -= (block_t)count;
697 spin_unlock(&sbi->stat_lock); 768 spin_unlock(&sbi->stat_lock);
@@ -703,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
703 F2FS_SET_SB_DIRT(sbi); 774 F2FS_SET_SB_DIRT(sbi);
704} 775}
705 776
706static inline void inode_inc_dirty_dents(struct inode *inode) 777static inline void inode_inc_dirty_pages(struct inode *inode)
707{ 778{
708 inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 779 atomic_inc(&F2FS_I(inode)->dirty_pages);
709 atomic_inc(&F2FS_I(inode)->dirty_dents); 780 if (S_ISDIR(inode->i_mode))
781 inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
710} 782}
711 783
712static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) 784static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -714,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
714 atomic_dec(&sbi->nr_pages[count_type]); 786 atomic_dec(&sbi->nr_pages[count_type]);
715} 787}
716 788
717static inline void inode_dec_dirty_dents(struct inode *inode) 789static inline void inode_dec_dirty_pages(struct inode *inode)
718{ 790{
719 if (!S_ISDIR(inode->i_mode)) 791 if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
720 return; 792 return;
721 793
722 dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); 794 atomic_dec(&F2FS_I(inode)->dirty_pages);
723 atomic_dec(&F2FS_I(inode)->dirty_dents); 795
796 if (S_ISDIR(inode->i_mode))
797 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
724} 798}
725 799
726static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) 800static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
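
dirty_dents widens into dirty_pages: every inode, regular files included, now tracks its own dirty page count, while the global F2FS_DIRTY_DENTS counter keeps counting directories only. A sketch of the pairing contract the helpers expect; the wrapper names below are illustrative, not from this patch:

	static void mark_data_page_dirty(struct inode *inode, struct page *page)
	{
		if (!PageDirty(page)) {
			__set_page_dirty_nobuffers(page);
			inode_inc_dirty_pages(inode);	/* per-inode atomic count */
		}
	}

	static void clean_data_page(struct inode *inode, struct page *page)
	{
		if (clear_page_dirty_for_io(page))
			inode_dec_dirty_pages(inode);	/* dirs also drop DIRTY_DENTS */
	}
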
@@ -728,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
728 return atomic_read(&sbi->nr_pages[count_type]); 802 return atomic_read(&sbi->nr_pages[count_type]);
729} 803}
730 804
731static inline int get_dirty_dents(struct inode *inode) 805static inline int get_dirty_pages(struct inode *inode)
732{ 806{
733 return atomic_read(&F2FS_I(inode)->dirty_dents); 807 return atomic_read(&F2FS_I(inode)->dirty_pages);
734} 808}
735 809
736static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) 810static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
@@ -768,7 +842,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag)
768 if (flag == NAT_BITMAP) 842 if (flag == NAT_BITMAP)
769 return &ckpt->sit_nat_version_bitmap; 843 return &ckpt->sit_nat_version_bitmap;
770 else 844 else
771 return ((unsigned char *)ckpt + F2FS_BLKSIZE); 845 return (unsigned char *)ckpt + F2FS_BLKSIZE;
772 } else { 846 } else {
773 offset = (flag == NAT_BITMAP) ? 847 offset = (flag == NAT_BITMAP) ?
774 le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; 848 le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0;
@@ -786,7 +860,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
786 860
787 /* 861 /*
788 * odd numbered checkpoint should at cp segment 0 862 * odd numbered checkpoint should at cp segment 0
789 * and even segent must be at cp segment 1 863 * and even segment must be at cp segment 1
790 */ 864 */
791 if (!(ckpt_version & 1)) 865 if (!(ckpt_version & 1))
792 start_addr += sbi->blocks_per_seg; 866 start_addr += sbi->blocks_per_seg;
@@ -835,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
835{ 909{
836 spin_lock(&sbi->stat_lock); 910 spin_lock(&sbi->stat_lock);
837 911
838 f2fs_bug_on(!sbi->total_valid_block_count); 912 f2fs_bug_on(sbi, !sbi->total_valid_block_count);
839 f2fs_bug_on(!sbi->total_valid_node_count); 913 f2fs_bug_on(sbi, !sbi->total_valid_node_count);
840 f2fs_bug_on(!inode->i_blocks); 914 f2fs_bug_on(sbi, !inode->i_blocks);
841 915
842 inode->i_blocks--; 916 inode->i_blocks--;
843 sbi->total_valid_node_count--; 917 sbi->total_valid_node_count--;
@@ -854,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi)
854static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) 928static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
855{ 929{
856 spin_lock(&sbi->stat_lock); 930 spin_lock(&sbi->stat_lock);
857 f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); 931 f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count);
858 sbi->total_valid_inode_count++; 932 sbi->total_valid_inode_count++;
859 spin_unlock(&sbi->stat_lock); 933 spin_unlock(&sbi->stat_lock);
860} 934}
@@ -862,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi)
862static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) 936static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi)
863{ 937{
864 spin_lock(&sbi->stat_lock); 938 spin_lock(&sbi->stat_lock);
865 f2fs_bug_on(!sbi->total_valid_inode_count); 939 f2fs_bug_on(sbi, !sbi->total_valid_inode_count);
866 sbi->total_valid_inode_count--; 940 sbi->total_valid_inode_count--;
867 spin_unlock(&sbi->stat_lock); 941 spin_unlock(&sbi->stat_lock);
868} 942}
@@ -878,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock)
878 return; 952 return;
879 953
880 if (unlock) { 954 if (unlock) {
881 f2fs_bug_on(!PageLocked(page)); 955 f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page));
882 unlock_page(page); 956 unlock_page(page);
883 } 957 }
884 page_cache_release(page); 958 page_cache_release(page);
@@ -983,11 +1057,17 @@ enum {
983 FI_NO_EXTENT, /* not to use the extent cache */ 1057 FI_NO_EXTENT, /* not to use the extent cache */
984 FI_INLINE_XATTR, /* used for inline xattr */ 1058 FI_INLINE_XATTR, /* used for inline xattr */
985 FI_INLINE_DATA, /* used for inline data*/ 1059 FI_INLINE_DATA, /* used for inline data*/
1060 FI_APPEND_WRITE, /* inode has appended data */
1061 FI_UPDATE_WRITE, /* inode has in-place-update data */
1062 FI_NEED_IPU, /* used for ipu per file */
1063 FI_ATOMIC_FILE, /* indicate atomic file */
1064 FI_VOLATILE_FILE, /* indicate volatile file */
986}; 1065};
987 1066
988static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) 1067static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
989{ 1068{
990 set_bit(flag, &fi->flags); 1069 if (!test_bit(flag, &fi->flags))
1070 set_bit(flag, &fi->flags);
991} 1071}
992 1072
993static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) 1073static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
@@ -997,7 +1077,8 @@ static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag)
997 1077
998static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) 1078static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag)
999{ 1079{
1000 clear_bit(flag, &fi->flags); 1080 if (test_bit(flag, &fi->flags))
1081 clear_bit(flag, &fi->flags);
1001} 1082}
1002 1083
1003static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) 1084static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode)
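
Guarding set_bit()/clear_bit() with test_bit() trades a cheap read for the locked read-modify-write, and the exclusive cacheline acquisition it implies, whenever the flag is already in the desired state; with many inodes repeatedly setting the same flags, the hit rate makes this worthwhile. A user-space analogue of the same pattern using C11 atomics:

	#include <stdatomic.h>

	/* Test-before-set: skip the costly RMW when the bit is already set. */
	static void set_flag(atomic_ulong *flags, unsigned int bit)
	{
		unsigned long mask = 1UL << bit;

		/* a plain relaxed load is cheap and usually sufficient */
		if (!(atomic_load_explicit(flags, memory_order_relaxed) & mask))
			atomic_fetch_or(flags, mask);
	}
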
@@ -1067,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode)
1067 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); 1148 return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA);
1068} 1149}
1069 1150
1151static inline bool f2fs_is_atomic_file(struct inode *inode)
1152{
1153 return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE);
1154}
1155
1156static inline bool f2fs_is_volatile_file(struct inode *inode)
1157{
1158 return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
1159}
1160
1070static inline void *inline_data_addr(struct page *page) 1161static inline void *inline_data_addr(struct page *page)
1071{ 1162{
1072 struct f2fs_inode *ri = F2FS_INODE(page); 1163 struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1078,6 +1169,11 @@ static inline int f2fs_readonly(struct super_block *sb)
1078 return sb->s_flags & MS_RDONLY; 1169 return sb->s_flags & MS_RDONLY;
1079} 1170}
1080 1171
1172static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
1173{
1174 return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
1175}
1176
1081static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) 1177static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1082{ 1178{
1083 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); 1179 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
@@ -1099,7 +1195,7 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi)
1099 */ 1195 */
1100int f2fs_sync_file(struct file *, loff_t, loff_t, int); 1196int f2fs_sync_file(struct file *, loff_t, loff_t, int);
1101void truncate_data_blocks(struct dnode_of_data *); 1197void truncate_data_blocks(struct dnode_of_data *);
1102int truncate_blocks(struct inode *, u64); 1198int truncate_blocks(struct inode *, u64, bool);
1103void f2fs_truncate(struct inode *); 1199void f2fs_truncate(struct inode *);
1104int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 1200int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
1105int f2fs_setattr(struct dentry *, struct iattr *); 1201int f2fs_setattr(struct dentry *, struct iattr *);
@@ -1118,6 +1214,7 @@ void update_inode(struct inode *, struct page *);
1118void update_inode_page(struct inode *); 1214void update_inode_page(struct inode *);
1119int f2fs_write_inode(struct inode *, struct writeback_control *); 1215int f2fs_write_inode(struct inode *, struct writeback_control *);
1120void f2fs_evict_inode(struct inode *); 1216void f2fs_evict_inode(struct inode *);
1217void handle_failed_inode(struct inode *);
1121 1218
1122/* 1219/*
1123 * namei.c 1220 * namei.c
@@ -1136,6 +1233,7 @@ void f2fs_set_link(struct inode *, struct f2fs_dir_entry *,
1136int update_dent_inode(struct inode *, const struct qstr *); 1233int update_dent_inode(struct inode *, const struct qstr *);
1137int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); 1234int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *);
1138void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *); 1235void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *);
1236int f2fs_do_tmpfile(struct inode *, struct inode *);
1139int f2fs_make_empty(struct inode *, struct inode *); 1237int f2fs_make_empty(struct inode *, struct inode *);
1140bool f2fs_empty_dir(struct inode *); 1238bool f2fs_empty_dir(struct inode *);
1141 1239
@@ -1155,7 +1253,7 @@ void f2fs_msg(struct super_block *, const char *, const char *, ...);
1155/* 1253/*
1156 * hash.c 1254 * hash.c
1157 */ 1255 */
1158f2fs_hash_t f2fs_dentry_hash(const char *, size_t); 1256f2fs_hash_t f2fs_dentry_hash(const struct qstr *);
1159 1257
1160/* 1258/*
1161 * node.c 1259 * node.c
@@ -1164,16 +1262,16 @@ struct dnode_of_data;
1164struct node_info; 1262struct node_info;
1165 1263
1166bool available_free_memory(struct f2fs_sb_info *, int); 1264bool available_free_memory(struct f2fs_sb_info *, int);
1167int is_checkpointed_node(struct f2fs_sb_info *, nid_t); 1265bool is_checkpointed_node(struct f2fs_sb_info *, nid_t);
1168bool fsync_mark_done(struct f2fs_sb_info *, nid_t); 1266bool has_fsynced_inode(struct f2fs_sb_info *, nid_t);
1169void fsync_mark_clear(struct f2fs_sb_info *, nid_t); 1267bool need_inode_block_update(struct f2fs_sb_info *, nid_t);
1170void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); 1268void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *);
1171int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); 1269int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int);
1172int truncate_inode_blocks(struct inode *, pgoff_t); 1270int truncate_inode_blocks(struct inode *, pgoff_t);
1173int truncate_xattr_node(struct inode *, struct page *); 1271int truncate_xattr_node(struct inode *, struct page *);
1174int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); 1272int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t);
1175void remove_inode_page(struct inode *); 1273void remove_inode_page(struct inode *);
1176struct page *new_inode_page(struct inode *, const struct qstr *); 1274struct page *new_inode_page(struct inode *);
1177struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); 1275struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *);
1178void ra_node_page(struct f2fs_sb_info *, nid_t); 1276void ra_node_page(struct f2fs_sb_info *, nid_t);
1179struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); 1277struct page *get_node_page(struct f2fs_sb_info *, pgoff_t);
@@ -1183,9 +1281,8 @@ int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *);
1183bool alloc_nid(struct f2fs_sb_info *, nid_t *); 1281bool alloc_nid(struct f2fs_sb_info *, nid_t *);
1184void alloc_nid_done(struct f2fs_sb_info *, nid_t); 1282void alloc_nid_done(struct f2fs_sb_info *, nid_t);
1185void alloc_nid_failed(struct f2fs_sb_info *, nid_t); 1283void alloc_nid_failed(struct f2fs_sb_info *, nid_t);
1186void recover_node_page(struct f2fs_sb_info *, struct page *, 1284void recover_inline_xattr(struct inode *, struct page *);
1187 struct f2fs_summary *, struct node_info *, block_t); 1285void recover_xattr_data(struct inode *, struct page *, block_t);
1188bool recover_xattr_data(struct inode *, struct page *, block_t);
1189int recover_inode_page(struct f2fs_sb_info *, struct page *); 1286int recover_inode_page(struct f2fs_sb_info *, struct page *);
1190int restore_node_summary(struct f2fs_sb_info *, unsigned int, 1287int restore_node_summary(struct f2fs_sb_info *, unsigned int,
1191 struct f2fs_summary_block *); 1288 struct f2fs_summary_block *);
@@ -1198,6 +1295,8 @@ void destroy_node_manager_caches(void);
1198/* 1295/*
1199 * segment.c 1296 * segment.c
1200 */ 1297 */
1298void register_inmem_page(struct inode *, struct page *);
1299void commit_inmem_pages(struct inode *, bool);
1201void f2fs_balance_fs(struct f2fs_sb_info *); 1300void f2fs_balance_fs(struct f2fs_sb_info *);
1202void f2fs_balance_fs_bg(struct f2fs_sb_info *); 1301void f2fs_balance_fs_bg(struct f2fs_sb_info *);
1203int f2fs_issue_flush(struct f2fs_sb_info *); 1302int f2fs_issue_flush(struct f2fs_sb_info *);
@@ -1206,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *);
1206void invalidate_blocks(struct f2fs_sb_info *, block_t); 1305void invalidate_blocks(struct f2fs_sb_info *, block_t);
1207void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); 1306void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
1208void clear_prefree_segments(struct f2fs_sb_info *); 1307void clear_prefree_segments(struct f2fs_sb_info *);
1209void discard_next_dnode(struct f2fs_sb_info *); 1308void release_discard_addrs(struct f2fs_sb_info *);
1309void discard_next_dnode(struct f2fs_sb_info *, block_t);
1210int npages_for_summary_flush(struct f2fs_sb_info *); 1310int npages_for_summary_flush(struct f2fs_sb_info *);
1211void allocate_new_segments(struct f2fs_sb_info *); 1311void allocate_new_segments(struct f2fs_sb_info *);
1312int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
1212struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); 1313struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
1213void write_meta_page(struct f2fs_sb_info *, struct page *); 1314void write_meta_page(struct f2fs_sb_info *, struct page *);
1214void write_node_page(struct f2fs_sb_info *, struct page *, 1315void write_node_page(struct f2fs_sb_info *, struct page *,
@@ -1218,8 +1319,6 @@ void write_data_page(struct page *, struct dnode_of_data *, block_t *,
1218void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *); 1319void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
1219void recover_data_page(struct f2fs_sb_info *, struct page *, 1320void recover_data_page(struct f2fs_sb_info *, struct page *,
1220 struct f2fs_summary *, block_t, block_t); 1321 struct f2fs_summary *, block_t, block_t);
1221void rewrite_node_page(struct f2fs_sb_info *, struct page *,
1222 struct f2fs_summary *, block_t, block_t);
1223void allocate_data_block(struct f2fs_sb_info *, struct page *, 1322void allocate_data_block(struct f2fs_sb_info *, struct page *,
1224 block_t, block_t *, struct f2fs_summary *, int); 1323 block_t, block_t *, struct f2fs_summary *, int);
1225void f2fs_wait_on_page_writeback(struct page *, enum page_type); 1324void f2fs_wait_on_page_writeback(struct page *, enum page_type);
@@ -1227,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t);
1227void write_node_summaries(struct f2fs_sb_info *, block_t); 1326void write_node_summaries(struct f2fs_sb_info *, block_t);
1228int lookup_journal_in_cursum(struct f2fs_summary_block *, 1327int lookup_journal_in_cursum(struct f2fs_summary_block *,
1229 int, unsigned int, int); 1328 int, unsigned int, int);
1230void flush_sit_entries(struct f2fs_sb_info *); 1329void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *);
1231int build_segment_manager(struct f2fs_sb_info *); 1330int build_segment_manager(struct f2fs_sb_info *);
1232void destroy_segment_manager(struct f2fs_sb_info *); 1331void destroy_segment_manager(struct f2fs_sb_info *);
1233int __init create_segment_manager_caches(void); 1332int __init create_segment_manager_caches(void);
@@ -1238,20 +1337,25 @@ void destroy_segment_manager_caches(void);
1238 */ 1337 */
1239struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); 1338struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
1240struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); 1339struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
1241int ra_meta_pages(struct f2fs_sb_info *, int, int, int); 1340struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t);
1341int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
1242long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); 1342long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
1343void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1344void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
1345void release_dirty_inode(struct f2fs_sb_info *);
1346bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
1243int acquire_orphan_inode(struct f2fs_sb_info *); 1347int acquire_orphan_inode(struct f2fs_sb_info *);
1244void release_orphan_inode(struct f2fs_sb_info *); 1348void release_orphan_inode(struct f2fs_sb_info *);
1245void add_orphan_inode(struct f2fs_sb_info *, nid_t); 1349void add_orphan_inode(struct f2fs_sb_info *, nid_t);
1246void remove_orphan_inode(struct f2fs_sb_info *, nid_t); 1350void remove_orphan_inode(struct f2fs_sb_info *, nid_t);
1247void recover_orphan_inodes(struct f2fs_sb_info *); 1351void recover_orphan_inodes(struct f2fs_sb_info *);
1248int get_valid_checkpoint(struct f2fs_sb_info *); 1352int get_valid_checkpoint(struct f2fs_sb_info *);
1249void set_dirty_dir_page(struct inode *, struct page *); 1353void update_dirty_page(struct inode *, struct page *);
1250void add_dirty_dir_inode(struct inode *); 1354void add_dirty_dir_inode(struct inode *);
1251void remove_dirty_dir_inode(struct inode *); 1355void remove_dirty_dir_inode(struct inode *);
1252void sync_dirty_dir_inodes(struct f2fs_sb_info *); 1356void sync_dirty_dir_inodes(struct f2fs_sb_info *);
1253void write_checkpoint(struct f2fs_sb_info *, bool); 1357void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
1254void init_orphan_info(struct f2fs_sb_info *); 1358void init_ino_entry_info(struct f2fs_sb_info *);
1255int __init create_checkpoint_caches(void); 1359int __init create_checkpoint_caches(void);
1256void destroy_checkpoint_caches(void); 1360void destroy_checkpoint_caches(void);
1257 1361
@@ -1295,7 +1399,6 @@ bool space_for_roll_forward(struct f2fs_sb_info *);
1295struct f2fs_stat_info { 1399struct f2fs_stat_info {
1296 struct list_head stat_list; 1400 struct list_head stat_list;
1297 struct f2fs_sb_info *sbi; 1401 struct f2fs_sb_info *sbi;
1298 struct mutex stat_lock;
1299 int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; 1402 int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
1300 int main_area_segs, main_area_sections, main_area_zones; 1403 int main_area_segs, main_area_sections, main_area_zones;
1301 int hit_ext, total_ext; 1404 int hit_ext, total_ext;
@@ -1335,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1335#define stat_inc_inline_inode(inode) \ 1438#define stat_inc_inline_inode(inode) \
1336 do { \ 1439 do { \
1337 if (f2fs_has_inline_data(inode)) \ 1440 if (f2fs_has_inline_data(inode)) \
1338 ((F2FS_SB(inode->i_sb))->inline_inode++); \ 1441 ((F2FS_I_SB(inode))->inline_inode++); \
1339 } while (0) 1442 } while (0)
1340#define stat_dec_inline_inode(inode) \ 1443#define stat_dec_inline_inode(inode) \
1341 do { \ 1444 do { \
1342 if (f2fs_has_inline_data(inode)) \ 1445 if (f2fs_has_inline_data(inode)) \
1343 ((F2FS_SB(inode->i_sb))->inline_inode--); \ 1446 ((F2FS_I_SB(inode))->inline_inode--); \
1344 } while (0) 1447 } while (0)
1345 1448
1346#define stat_inc_seg_type(sbi, curseg) \ 1449#define stat_inc_seg_type(sbi, curseg) \
@@ -1417,8 +1520,8 @@ extern const struct inode_operations f2fs_special_inode_operations;
1417 */ 1520 */
1418bool f2fs_may_inline(struct inode *); 1521bool f2fs_may_inline(struct inode *);
1419int f2fs_read_inline_data(struct inode *, struct page *); 1522int f2fs_read_inline_data(struct inode *, struct page *);
1420int f2fs_convert_inline_data(struct inode *, pgoff_t); 1523int f2fs_convert_inline_data(struct inode *, pgoff_t, struct page *);
1421int f2fs_write_inline_data(struct inode *, struct page *, unsigned int); 1524int f2fs_write_inline_data(struct inode *, struct page *, unsigned int);
1422void truncate_inline_data(struct inode *, u64); 1525void truncate_inline_data(struct inode *, u64);
1423int recover_inline_data(struct inode *, struct page *); 1526bool recover_inline_data(struct inode *, struct page *);
1424#endif 1527#endif
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 7d8b96275092..8e68bb64f835 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
33{ 33{
34 struct page *page = vmf->page; 34 struct page *page = vmf->page;
35 struct inode *inode = file_inode(vma->vm_file); 35 struct inode *inode = file_inode(vma->vm_file);
36 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 36 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
37 struct dnode_of_data dn; 37 struct dnode_of_data dn;
38 int err; 38 int err;
39 39
@@ -41,6 +41,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
41 41
42 sb_start_pagefault(inode->i_sb); 42 sb_start_pagefault(inode->i_sb);
43 43
44 /* force to convert with normal data indices */
45 err = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, page);
46 if (err)
47 goto out;
48
44 /* block allocation */ 49 /* block allocation */
45 f2fs_lock_op(sbi); 50 f2fs_lock_op(sbi);
46 set_new_dnode(&dn, inode, NULL, NULL, 0); 51 set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -110,11 +115,31 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
110 return 1; 115 return 1;
111} 116}
112 117
118static inline bool need_do_checkpoint(struct inode *inode)
119{
120 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
121 bool need_cp = false;
122
123 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
124 need_cp = true;
125 else if (file_wrong_pino(inode))
126 need_cp = true;
127 else if (!space_for_roll_forward(sbi))
128 need_cp = true;
129 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
130 need_cp = true;
131 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
132 need_cp = true;
133
134 return need_cp;
135}
136
113int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) 137int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
114{ 138{
115 struct inode *inode = file->f_mapping->host; 139 struct inode *inode = file->f_mapping->host;
116 struct f2fs_inode_info *fi = F2FS_I(inode); 140 struct f2fs_inode_info *fi = F2FS_I(inode);
117 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 141 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
142 nid_t ino = inode->i_ino;
118 int ret = 0; 143 int ret = 0;
119 bool need_cp = false; 144 bool need_cp = false;
120 struct writeback_control wbc = { 145 struct writeback_control wbc = {
@@ -127,32 +152,47 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
127 return 0; 152 return 0;
128 153
129 trace_f2fs_sync_file_enter(inode); 154 trace_f2fs_sync_file_enter(inode);
155
156 /* if fdatasync is triggered, let's do in-place-update */
157 if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
158 set_inode_flag(fi, FI_NEED_IPU);
130 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 159 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
160 clear_inode_flag(fi, FI_NEED_IPU);
161
131 if (ret) { 162 if (ret) {
132 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 163 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
133 return ret; 164 return ret;
134 } 165 }
135 166
167 /*
168 * if there is no written data, don't waste time to write recovery info.
169 */
170 if (!is_inode_flag_set(fi, FI_APPEND_WRITE) &&
171 !exist_written_data(sbi, ino, APPEND_INO)) {
172 struct page *i = find_get_page(NODE_MAPPING(sbi), ino);
173
174 /* But we need to avoid that there are some inode updates */
175 if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) {
176 f2fs_put_page(i, 0);
177 goto go_write;
178 }
179 f2fs_put_page(i, 0);
180
181 if (is_inode_flag_set(fi, FI_UPDATE_WRITE) ||
182 exist_written_data(sbi, ino, UPDATE_INO))
183 goto flush_out;
184 goto out;
185 }
186go_write:
136 /* guarantee free sections for fsync */ 187 /* guarantee free sections for fsync */
137 f2fs_balance_fs(sbi); 188 f2fs_balance_fs(sbi);
138 189
139 down_read(&fi->i_sem);
140
141 /* 190 /*
142 * Both of fdatasync() and fsync() are able to be recovered from 191 * Both of fdatasync() and fsync() are able to be recovered from
143 * sudden-power-off. 192 * sudden-power-off.
144 */ 193 */
145 if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) 194 down_read(&fi->i_sem);
146 need_cp = true; 195 need_cp = need_do_checkpoint(inode);
147 else if (file_wrong_pino(inode))
148 need_cp = true;
149 else if (!space_for_roll_forward(sbi))
150 need_cp = true;
151 else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
152 need_cp = true;
153 else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi)))
154 need_cp = true;
155
156 up_read(&fi->i_sem); 196 up_read(&fi->i_sem);
157 197
158 if (need_cp) { 198 if (need_cp) {
@@ -176,19 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
176 up_write(&fi->i_sem); 216 up_write(&fi->i_sem);
177 } 217 }
178 } else { 218 } else {
179 /* if there is no written node page, write its inode page */ 219sync_nodes:
180 while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { 220 sync_node_pages(sbi, ino, &wbc);
181 if (fsync_mark_done(sbi, inode->i_ino)) 221
182 goto out; 222 if (need_inode_block_update(sbi, ino)) {
183 mark_inode_dirty_sync(inode); 223 mark_inode_dirty_sync(inode);
184 ret = f2fs_write_inode(inode, NULL); 224 ret = f2fs_write_inode(inode, NULL);
185 if (ret) 225 if (ret)
186 goto out; 226 goto out;
227 goto sync_nodes;
187 } 228 }
188 ret = wait_on_node_pages_writeback(sbi, inode->i_ino); 229
230 ret = wait_on_node_pages_writeback(sbi, ino);
189 if (ret) 231 if (ret)
190 goto out; 232 goto out;
191 ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); 233
234 /* once recovery info is written, don't need to track this */
235 remove_dirty_inode(sbi, ino, APPEND_INO);
236 clear_inode_flag(fi, FI_APPEND_WRITE);
237flush_out:
238 remove_dirty_inode(sbi, ino, UPDATE_INO);
239 clear_inode_flag(fi, FI_UPDATE_WRITE);
240 ret = f2fs_issue_flush(F2FS_I_SB(inode));
192 } 241 }
193out: 242out:
194 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 243 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
@@ -206,8 +255,9 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping,
206 255
207 /* find first dirty page index */ 256 /* find first dirty page index */
208 pagevec_init(&pvec, 0); 257 pagevec_init(&pvec, 0);
209 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); 258 nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs,
210 pgofs = nr_pages ? pvec.pages[0]->index: LONG_MAX; 259 PAGECACHE_TAG_DIRTY, 1);
260 pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX;
211 pagevec_release(&pvec); 261 pagevec_release(&pvec);
212 return pgofs; 262 return pgofs;
213} 263}
@@ -262,7 +312,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
262 if (err && err != -ENOENT) { 312 if (err && err != -ENOENT) {
263 goto fail; 313 goto fail;
264 } else if (err == -ENOENT) { 314 } else if (err == -ENOENT) {
265 /* direct node is not exist */ 315 /* direct node does not exist */
266 if (whence == SEEK_DATA) { 316 if (whence == SEEK_DATA) {
267 pgofs = PGOFS_OF_NEXT_DNODE(pgofs, 317 pgofs = PGOFS_OF_NEXT_DNODE(pgofs,
268 F2FS_I(inode)); 318 F2FS_I(inode));
@@ -272,8 +322,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
272 } 322 }
273 } 323 }
274 324
275 end_offset = IS_INODE(dn.node_page) ? 325 end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
276 ADDRS_PER_INODE(F2FS_I(inode)) : ADDRS_PER_BLOCK;
277 326
278 /* find data/hole in dnode block */ 327 /* find data/hole in dnode block */
279 for (; dn.ofs_in_node < end_offset; 328 for (; dn.ofs_in_node < end_offset;
@@ -315,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
315 maxbytes, i_size_read(inode)); 364 maxbytes, i_size_read(inode));
316 case SEEK_DATA: 365 case SEEK_DATA:
317 case SEEK_HOLE: 366 case SEEK_HOLE:
367 if (offset < 0)
368 return -ENXIO;
318 return f2fs_seek_block(file, offset, whence); 369 return f2fs_seek_block(file, offset, whence);
319 } 370 }
320 371
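
The new guard makes SEEK_DATA/SEEK_HOLE reject negative offsets with -ENXIO instead of passing them into f2fs_seek_block(). A small user-space probe of the behaviour, assuming SEEK_DATA support in the headers and running kernel:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("testfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0)
			return 1;
		/* a negative offset must now fail cleanly */
		if (lseek(fd, -1, SEEK_DATA) == (off_t)-1)
			perror("lseek(SEEK_DATA, -1)");	/* expected: ENXIO */
		close(fd);
		return 0;
	}
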
@@ -331,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
331int truncate_data_blocks_range(struct dnode_of_data *dn, int count) 382int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
332{ 383{
333 int nr_free = 0, ofs = dn->ofs_in_node; 384 int nr_free = 0, ofs = dn->ofs_in_node;
334 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 385 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
335 struct f2fs_node *raw_node; 386 struct f2fs_node *raw_node;
336 __le32 *addr; 387 __le32 *addr;
337 388
@@ -380,19 +431,21 @@ static void truncate_partial_data_page(struct inode *inode, u64 from)
380 return; 431 return;
381 432
382 lock_page(page); 433 lock_page(page);
383 if (unlikely(page->mapping != inode->i_mapping)) { 434 if (unlikely(!PageUptodate(page) ||
384 f2fs_put_page(page, 1); 435 page->mapping != inode->i_mapping))
385 return; 436 goto out;
386 } 437
387 f2fs_wait_on_page_writeback(page, DATA); 438 f2fs_wait_on_page_writeback(page, DATA);
388 zero_user(page, offset, PAGE_CACHE_SIZE - offset); 439 zero_user(page, offset, PAGE_CACHE_SIZE - offset);
389 set_page_dirty(page); 440 set_page_dirty(page);
441
442out:
390 f2fs_put_page(page, 1); 443 f2fs_put_page(page, 1);
391} 444}
392 445
393int truncate_blocks(struct inode *inode, u64 from) 446int truncate_blocks(struct inode *inode, u64 from, bool lock)
394{ 447{
395 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 448 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
396 unsigned int blocksize = inode->i_sb->s_blocksize; 449 unsigned int blocksize = inode->i_sb->s_blocksize;
397 struct dnode_of_data dn; 450 struct dnode_of_data dn;
398 pgoff_t free_from; 451 pgoff_t free_from;
@@ -406,14 +459,16 @@ int truncate_blocks(struct inode *inode, u64 from)
406 free_from = (pgoff_t) 459 free_from = (pgoff_t)
407 ((from + blocksize - 1) >> (sbi->log_blocksize)); 460 ((from + blocksize - 1) >> (sbi->log_blocksize));
408 461
409 f2fs_lock_op(sbi); 462 if (lock)
463 f2fs_lock_op(sbi);
410 464
411 set_new_dnode(&dn, inode, NULL, NULL, 0); 465 set_new_dnode(&dn, inode, NULL, NULL, 0);
412 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); 466 err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
413 if (err) { 467 if (err) {
414 if (err == -ENOENT) 468 if (err == -ENOENT)
415 goto free_next; 469 goto free_next;
416 f2fs_unlock_op(sbi); 470 if (lock)
471 f2fs_unlock_op(sbi);
417 trace_f2fs_truncate_blocks_exit(inode, err); 472 trace_f2fs_truncate_blocks_exit(inode, err);
418 return err; 473 return err;
419 } 474 }
@@ -421,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from)
421 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); 476 count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
422 477
423 count -= dn.ofs_in_node; 478 count -= dn.ofs_in_node;
424 f2fs_bug_on(count < 0); 479 f2fs_bug_on(sbi, count < 0);
425 480
426 if (dn.ofs_in_node || IS_INODE(dn.node_page)) { 481 if (dn.ofs_in_node || IS_INODE(dn.node_page)) {
427 truncate_data_blocks_range(&dn, count); 482 truncate_data_blocks_range(&dn, count);
@@ -431,7 +486,8 @@ int truncate_blocks(struct inode *inode, u64 from)
431 f2fs_put_dnode(&dn); 486 f2fs_put_dnode(&dn);
432free_next: 487free_next:
433 err = truncate_inode_blocks(inode, free_from); 488 err = truncate_inode_blocks(inode, free_from);
434 f2fs_unlock_op(sbi); 489 if (lock)
490 f2fs_unlock_op(sbi);
435done: 491done:
436 /* lastly zero out the first data page */ 492 /* lastly zero out the first data page */
437 truncate_partial_data_page(inode, from); 493 truncate_partial_data_page(inode, from);
@@ -448,7 +504,7 @@ void f2fs_truncate(struct inode *inode)
448 504
449 trace_f2fs_truncate(inode); 505 trace_f2fs_truncate(inode);
450 506
451 if (!truncate_blocks(inode, i_size_read(inode))) { 507 if (!truncate_blocks(inode, i_size_read(inode), true)) {
452 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 508 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
453 mark_inode_dirty(inode); 509 mark_inode_dirty(inode);
454 } 510 }
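
truncate_blocks() gains a lock flag so that callers already inside an f2fs_lock_op() region (recovery, for instance) can truncate without re-taking cp_rwsem. Two illustrative callers; the function names here are not from f2fs:

	static void truncate_from_setattr(struct inode *inode)
	{
		/* normal path: let truncate_blocks() take cp_rwsem itself */
		truncate_blocks(inode, i_size_read(inode), true);
	}

	static void truncate_during_recovery(struct f2fs_sb_info *sbi,
						struct inode *inode, u64 from)
	{
		f2fs_lock_op(sbi);			/* held across the larger op */
		truncate_blocks(inode, from, false);	/* must not re-take cp_rwsem */
		f2fs_unlock_op(sbi);
	}
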
@@ -504,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
504 if (err) 560 if (err)
505 return err; 561 return err;
506 562
507 if ((attr->ia_valid & ATTR_SIZE) && 563 if (attr->ia_valid & ATTR_SIZE) {
508 attr->ia_size != i_size_read(inode)) { 564 err = f2fs_convert_inline_data(inode, attr->ia_size, NULL);
509 err = f2fs_convert_inline_data(inode, attr->ia_size);
510 if (err) 565 if (err)
511 return err; 566 return err;
512 567
513 truncate_setsize(inode, attr->ia_size); 568 if (attr->ia_size != i_size_read(inode)) {
514 f2fs_truncate(inode); 569 truncate_setsize(inode, attr->ia_size);
515 f2fs_balance_fs(F2FS_SB(inode->i_sb)); 570 f2fs_truncate(inode);
571 f2fs_balance_fs(F2FS_I_SB(inode));
572 } else {
573 /*
574 * giving a chance to truncate blocks past EOF which
575 * are fallocated with FALLOC_FL_KEEP_SIZE.
576 */
577 f2fs_truncate(inode);
578 }
516 } 579 }
517 580
518 __setattr_copy(inode, attr); 581 __setattr_copy(inode, attr);
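
f2fs_setattr() now always converts inline data for ATTR_SIZE and, when the requested size equals i_size, still calls f2fs_truncate() so that blocks preallocated past EOF with FALLOC_FL_KEEP_SIZE get released. The user-visible case this targets, as a runnable sketch:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("f", O_RDWR | O_CREAT, 0644);

		/* preallocate 1 MiB past EOF without moving i_size */
		fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
		/* truncating to the unchanged size used to be a no-op here;
		 * it now reaches f2fs_truncate() and frees those blocks */
		ftruncate(fd, 0);
		close(fd);
		return 0;
	}
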
@@ -546,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = {
546static void fill_zero(struct inode *inode, pgoff_t index, 609static void fill_zero(struct inode *inode, pgoff_t index,
547 loff_t start, loff_t len) 610 loff_t start, loff_t len)
548{ 611{
549 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 612 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
550 struct page *page; 613 struct page *page;
551 614
552 if (!len) 615 if (!len)
@@ -595,7 +658,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
595 loff_t off_start, off_end; 658 loff_t off_start, off_end;
596 int ret = 0; 659 int ret = 0;
597 660
598 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1); 661 if (!S_ISREG(inode->i_mode))
662 return -EOPNOTSUPP;
663
664 /* skip punching hole beyond i_size */
665 if (offset >= inode->i_size)
666 return ret;
667
668 ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
599 if (ret) 669 if (ret)
600 return ret; 670 return ret;
601 671
@@ -618,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
618 if (pg_start < pg_end) { 688 if (pg_start < pg_end) {
619 struct address_space *mapping = inode->i_mapping; 689 struct address_space *mapping = inode->i_mapping;
620 loff_t blk_start, blk_end; 690 loff_t blk_start, blk_end;
621 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 691 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
622 692
623 f2fs_balance_fs(sbi); 693 f2fs_balance_fs(sbi);
624 694
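
punch_hole() now refuses non-regular files with -EOPNOTSUPP and treats a punch entirely beyond i_size as a successful no-op. From user space, both ends of that contract look like this (illustrative file name, error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("f", O_RDWR | O_CREAT, 0644);

		ftruncate(fd, 4096);
		/* in-range punch: zeroes the range and frees backing blocks */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, 4096))
			perror("punch");
		/* a punch wholly beyond i_size is now a quiet no-op (returns 0) */
		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 8192, 4096);
		close(fd);
		return 0;
	}
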
@@ -639,17 +709,19 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
639static int expand_inode_data(struct inode *inode, loff_t offset, 709static int expand_inode_data(struct inode *inode, loff_t offset,
640 loff_t len, int mode) 710 loff_t len, int mode)
641{ 711{
642 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 712 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
643 pgoff_t index, pg_start, pg_end; 713 pgoff_t index, pg_start, pg_end;
644 loff_t new_size = i_size_read(inode); 714 loff_t new_size = i_size_read(inode);
645 loff_t off_start, off_end; 715 loff_t off_start, off_end;
646 int ret = 0; 716 int ret = 0;
647 717
718 f2fs_balance_fs(sbi);
719
648 ret = inode_newsize_ok(inode, (len + offset)); 720 ret = inode_newsize_ok(inode, (len + offset));
649 if (ret) 721 if (ret)
650 return ret; 722 return ret;
651 723
652 ret = f2fs_convert_inline_data(inode, offset + len); 724 ret = f2fs_convert_inline_data(inode, offset + len, NULL);
653 if (ret) 725 if (ret)
654 return ret; 726 return ret;
655 727
@@ -733,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
733 return flags & F2FS_OTHER_FLMASK; 805 return flags & F2FS_OTHER_FLMASK;
734} 806}
735 807
736long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 808static int f2fs_ioc_getflags(struct file *filp, unsigned long arg)
737{ 809{
738 struct inode *inode = file_inode(filp); 810 struct inode *inode = file_inode(filp);
739 struct f2fs_inode_info *fi = F2FS_I(inode); 811 struct f2fs_inode_info *fi = F2FS_I(inode);
740 unsigned int flags; 812 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
813 return put_user(flags, (int __user *)arg);
814}
815
816static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
817{
818 struct inode *inode = file_inode(filp);
819 struct f2fs_inode_info *fi = F2FS_I(inode);
820 unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE;
821 unsigned int oldflags;
741 int ret; 822 int ret;
742 823
743 switch (cmd) { 824 ret = mnt_want_write_file(filp);
744 case F2FS_IOC_GETFLAGS: 825 if (ret)
745 flags = fi->i_flags & FS_FL_USER_VISIBLE; 826 return ret;
746 return put_user(flags, (int __user *) arg);
747 case F2FS_IOC_SETFLAGS:
748 {
749 unsigned int oldflags;
750 827
751 ret = mnt_want_write_file(filp); 828 if (!inode_owner_or_capable(inode)) {
752 if (ret) 829 ret = -EACCES;
753 return ret; 830 goto out;
831 }
754 832
755 if (!inode_owner_or_capable(inode)) { 833 if (get_user(flags, (int __user *)arg)) {
756 ret = -EACCES; 834 ret = -EFAULT;
757 goto out; 835 goto out;
758 } 836 }
837
838 flags = f2fs_mask_flags(inode->i_mode, flags);
839
840 mutex_lock(&inode->i_mutex);
759 841
760 if (get_user(flags, (int __user *) arg)) { 842 oldflags = fi->i_flags;
761 ret = -EFAULT; 843
844 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
845 if (!capable(CAP_LINUX_IMMUTABLE)) {
846 mutex_unlock(&inode->i_mutex);
847 ret = -EPERM;
762 goto out; 848 goto out;
763 } 849 }
850 }
851
852 flags = flags & FS_FL_USER_MODIFIABLE;
853 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
854 fi->i_flags = flags;
855 mutex_unlock(&inode->i_mutex);
764 856
765 flags = f2fs_mask_flags(inode->i_mode, flags); 857 f2fs_set_inode_flags(inode);
858 inode->i_ctime = CURRENT_TIME;
859 mark_inode_dirty(inode);
860out:
861 mnt_drop_write_file(filp);
862 return ret;
863}
766 864
767 mutex_lock(&inode->i_mutex); 865static int f2fs_ioc_start_atomic_write(struct file *filp)
866{
867 struct inode *inode = file_inode(filp);
868 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
768 869
769 oldflags = fi->i_flags; 870 if (!inode_owner_or_capable(inode))
871 return -EACCES;
770 872
771 if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { 873 f2fs_balance_fs(sbi);
772 if (!capable(CAP_LINUX_IMMUTABLE)) {
773 mutex_unlock(&inode->i_mutex);
774 ret = -EPERM;
775 goto out;
776 }
777 }
778 874
779 flags = flags & FS_FL_USER_MODIFIABLE; 875 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
780 flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
781 fi->i_flags = flags;
782 mutex_unlock(&inode->i_mutex);
783 876
784 f2fs_set_inode_flags(inode); 877 return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL);
785 inode->i_ctime = CURRENT_TIME; 878}
786 mark_inode_dirty(inode); 879
787out: 880static int f2fs_ioc_commit_atomic_write(struct file *filp)
788 mnt_drop_write_file(filp); 881{
882 struct inode *inode = file_inode(filp);
883 int ret;
884
885 if (!inode_owner_or_capable(inode))
886 return -EACCES;
887
888 if (f2fs_is_volatile_file(inode))
889 return 0;
890
891 ret = mnt_want_write_file(filp);
892 if (ret)
789 return ret; 893 return ret;
790 } 894
895 if (f2fs_is_atomic_file(inode))
896 commit_inmem_pages(inode, false);
897
898 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
899 mnt_drop_write_file(filp);
900 return ret;
901}
902
903static int f2fs_ioc_start_volatile_write(struct file *filp)
904{
905 struct inode *inode = file_inode(filp);
906
907 if (!inode_owner_or_capable(inode))
908 return -EACCES;
909
910 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
911 return 0;
912}
913
914static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
915{
916 struct inode *inode = file_inode(filp);
917 struct super_block *sb = inode->i_sb;
918 struct request_queue *q = bdev_get_queue(sb->s_bdev);
919 struct fstrim_range range;
920 int ret;
921
922 if (!capable(CAP_SYS_ADMIN))
923 return -EPERM;
924
925 if (!blk_queue_discard(q))
926 return -EOPNOTSUPP;
927
928 if (copy_from_user(&range, (struct fstrim_range __user *)arg,
929 sizeof(range)))
930 return -EFAULT;
931
932 range.minlen = max((unsigned int)range.minlen,
933 q->limits.discard_granularity);
934 ret = f2fs_trim_fs(F2FS_SB(sb), &range);
935 if (ret < 0)
936 return ret;
937
938 if (copy_to_user((struct fstrim_range __user *)arg, &range,
939 sizeof(range)))
940 return -EFAULT;
941 return 0;
942}
943
944long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
945{
946 switch (cmd) {
947 case F2FS_IOC_GETFLAGS:
948 return f2fs_ioc_getflags(filp, arg);
949 case F2FS_IOC_SETFLAGS:
950 return f2fs_ioc_setflags(filp, arg);
951 case F2FS_IOC_START_ATOMIC_WRITE:
952 return f2fs_ioc_start_atomic_write(filp);
953 case F2FS_IOC_COMMIT_ATOMIC_WRITE:
954 return f2fs_ioc_commit_atomic_write(filp);
955 case F2FS_IOC_START_VOLATILE_WRITE:
956 return f2fs_ioc_start_volatile_write(filp);
957 case FITRIM:
958 return f2fs_ioc_fitrim(filp, arg);
791 default: 959 default:
792 return -ENOTTY; 960 return -ENOTTY;
793 } 961 }
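
The ioctl switch is decomposed into one helper per command, and three f2fs-private commands plus FITRIM are wired up. The atomic-write pair is the interesting protocol: writes issued after START are staged in memory (see register_inmem_pages()/commit_inmem_pages() earlier in this diff) and land all-or-nothing at COMMIT, which also runs f2fs_sync_file(). A minimal user-space sketch, with the ioctl codes copied from the F2FS_IOC_* definitions earlier in this diff and an illustrative file name:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#define F2FS_IOCTL_MAGIC		0xf5
	#define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
	#define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)

	int main(void)
	{
		int fd = open("db-file", O_RDWR);

		/* pages written between start and commit are staged in memory */
		ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE);
		write(fd, "payload", 7);
		/* ... and hit disk all-or-nothing at commit, which also syncs */
		ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE);
		close(fd);
		return 0;
	}
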
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index b90dbe55403a..2a8f4acdb86b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -58,7 +58,7 @@ static int gc_thread_func(void *data)
58 * 3. IO subsystem is idle by checking the # of requests in 58 * 3. IO subsystem is idle by checking the # of requests in
59 * bdev's request list. 59 * bdev's request list.
60 * 60 *
61 * Note) We have to avoid triggering GCs too much frequently. 61 * Note) We have to avoid triggering GCs too frequently.
62 * Because it is possible that some segments can be 62 * Because it is possible that some segments can be
63 * invalidated soon after by user update or deletion. 63 * invalidated soon after by user update or deletion.
64 * So, I'd like to wait some time to collect dirty segments. 64 * So, I'd like to wait some time to collect dirty segments.
@@ -186,7 +186,6 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
186static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) 186static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
187{ 187{
188 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 188 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
189 unsigned int hint = 0;
190 unsigned int secno; 189 unsigned int secno;
191 190
192 /* 191 /*
@@ -194,11 +193,9 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
194 * selected by background GC before. 193 * selected by background GC before.
195 * Those segments guarantee they have small valid blocks. 194 * Those segments guarantee they have small valid blocks.
196 */ 195 */
197next: 196 for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
198 secno = find_next_bit(dirty_i->victim_secmap, TOTAL_SECS(sbi), hint++);
199 if (secno < TOTAL_SECS(sbi)) {
200 if (sec_usage_check(sbi, secno)) 197 if (sec_usage_check(sbi, secno))
201 goto next; 198 continue;
202 clear_bit(secno, dirty_i->victim_secmap); 199 clear_bit(secno, dirty_i->victim_secmap);
203 return secno * sbi->segs_per_sec; 200 return secno * sbi->segs_per_sec;
204 } 201 }
@@ -225,7 +222,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
225 222
226 u = (vblocks * 100) >> sbi->log_blocks_per_seg; 223 u = (vblocks * 100) >> sbi->log_blocks_per_seg;
227 224
228 /* Handle if the system time is changed by user */ 225 /* Handle if the system time has been changed by the user */
229 if (mtime < sit_i->min_mtime) 226 if (mtime < sit_i->min_mtime)
230 sit_i->min_mtime = mtime; 227 sit_i->min_mtime = mtime;
231 if (mtime > sit_i->max_mtime) 228 if (mtime > sit_i->max_mtime)
@@ -266,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
266 unsigned int secno, max_cost; 263 unsigned int secno, max_cost;
267 int nsearched = 0; 264 int nsearched = 0;
268 265
266 mutex_lock(&dirty_i->seglist_lock);
267
269 p.alloc_mode = alloc_mode; 268 p.alloc_mode = alloc_mode;
270 select_policy(sbi, gc_type, type, &p); 269 select_policy(sbi, gc_type, type, &p);
271 270
272 p.min_segno = NULL_SEGNO; 271 p.min_segno = NULL_SEGNO;
273 p.min_cost = max_cost = get_max_cost(sbi, &p); 272 p.min_cost = max_cost = get_max_cost(sbi, &p);
274 273
275 mutex_lock(&dirty_i->seglist_lock);
276
277 if (p.alloc_mode == LFS && gc_type == FG_GC) { 274 if (p.alloc_mode == LFS && gc_type == FG_GC) {
278 p.min_segno = check_bg_victims(sbi); 275 p.min_segno = check_bg_victims(sbi);
279 if (p.min_segno != NULL_SEGNO) 276 if (p.min_segno != NULL_SEGNO)
@@ -284,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
284 unsigned long cost; 281 unsigned long cost;
285 unsigned int segno; 282 unsigned int segno;
286 283
287 segno = find_next_bit(p.dirty_segmap, 284 segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
288 TOTAL_SEGS(sbi), p.offset); 285 if (segno >= MAIN_SEGS(sbi)) {
289 if (segno >= TOTAL_SEGS(sbi)) {
290 if (sbi->last_victim[p.gc_mode]) { 286 if (sbi->last_victim[p.gc_mode]) {
291 sbi->last_victim[p.gc_mode] = 0; 287 sbi->last_victim[p.gc_mode] = 0;
292 p.offset = 0; 288 p.offset = 0;
@@ -426,6 +422,12 @@ next_step:
426 if (IS_ERR(node_page)) 422 if (IS_ERR(node_page))
427 continue; 423 continue;
428 424
425 /* block may become invalid during get_node_page */
426 if (check_valid_map(sbi, segno, off) == 0) {
427 f2fs_put_page(node_page, 1);
428 continue;
429 }
430
429 /* set page dirty and write it */ 431 /* set page dirty and write it */
430 if (gc_type == FG_GC) { 432 if (gc_type == FG_GC) {
431 f2fs_wait_on_page_writeback(node_page, NODE); 433 f2fs_wait_on_page_writeback(node_page, NODE);
@@ -534,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type)
534 f2fs_wait_on_page_writeback(page, DATA); 536 f2fs_wait_on_page_writeback(page, DATA);
535 537
536 if (clear_page_dirty_for_io(page)) 538 if (clear_page_dirty_for_io(page))
537 inode_dec_dirty_dents(inode); 539 inode_dec_dirty_pages(inode);
538 set_cold_data(page); 540 set_cold_data(page);
539 do_write_data_page(page, &fio); 541 do_write_data_page(page, &fio);
540 clear_cold_data(page); 542 clear_cold_data(page);
@@ -596,7 +598,7 @@ next_step:
596 598
597 if (phase == 2) { 599 if (phase == 2) {
598 inode = f2fs_iget(sb, dni.ino); 600 inode = f2fs_iget(sb, dni.ino);
599 if (IS_ERR(inode)) 601 if (IS_ERR(inode) || is_bad_inode(inode))
600 continue; 602 continue;
601 603
602 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); 604 start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
@@ -691,17 +693,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
691 int gc_type = BG_GC; 693 int gc_type = BG_GC;
692 int nfree = 0; 694 int nfree = 0;
693 int ret = -1; 695 int ret = -1;
696 struct cp_control cpc = {
697 .reason = CP_SYNC,
698 };
694 699
695 INIT_LIST_HEAD(&ilist); 700 INIT_LIST_HEAD(&ilist);
696gc_more: 701gc_more:
697 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) 702 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
698 goto stop; 703 goto stop;
699 if (unlikely(is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ERROR_FLAG))) 704 if (unlikely(f2fs_cp_error(sbi)))
700 goto stop; 705 goto stop;
701 706
702 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { 707 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
703 gc_type = FG_GC; 708 gc_type = FG_GC;
704 write_checkpoint(sbi, false); 709 write_checkpoint(sbi, &cpc);
705 } 710 }
706 711
707 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) 712 if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE))
@@ -726,7 +731,7 @@ gc_more:
726 goto gc_more; 731 goto gc_more;
727 732
728 if (gc_type == FG_GC) 733 if (gc_type == FG_GC)
729 write_checkpoint(sbi, false); 734 write_checkpoint(sbi, &cpc);
730stop: 735stop:
731 mutex_unlock(&sbi->gc_mutex); 736 mutex_unlock(&sbi->gc_mutex);
732 737
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 5d5eb6047bf4..16f0b2b22999 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -91,7 +91,7 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
91 block_t invalid_user_blocks = sbi->user_block_count - 91 block_t invalid_user_blocks = sbi->user_block_count -
92 written_block_count(sbi); 92 written_block_count(sbi);
93 /* 93 /*
94 * Background GC is triggered with the following condition. 94 * Background GC is triggered with the following conditions.
95 * 1. There are a number of invalid blocks. 95 * 1. There are a number of invalid blocks.
96 * 2. There is not enough free space. 96 * 2. There is not enough free space.
97 */ 97 */
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c
index 6eb8d269b53b..a844fcfb9a8d 100644
--- a/fs/f2fs/hash.c
+++ b/fs/f2fs/hash.c
@@ -42,7 +42,8 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[])
42 buf[1] += b1; 42 buf[1] += b1;
43} 43}
44 44
45static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) 45static void str2hashbuf(const unsigned char *msg, size_t len,
46 unsigned int *buf, int num)
46{ 47{
47 unsigned pad, val; 48 unsigned pad, val;
48 int i; 49 int i;
@@ -69,12 +70,14 @@ static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num)
69 *buf++ = pad; 70 *buf++ = pad;
70} 71}
71 72
72f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) 73f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info)
73{ 74{
74 __u32 hash; 75 __u32 hash;
75 f2fs_hash_t f2fs_hash; 76 f2fs_hash_t f2fs_hash;
76 const char *p; 77 const unsigned char *p;
77 __u32 in[8], buf[4]; 78 __u32 in[8], buf[4];
79 const unsigned char *name = name_info->name;
80 size_t len = name_info->len;
78 81
79 if ((len <= 2) && (name[0] == '.') && 82 if ((len <= 2) && (name[0] == '.') &&
80 (name[1] == '.' || name[1] == '\0')) 83 (name[1] == '.' || name[1] == '\0'))
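The hash.c change switches f2fs_dentry_hash() from a bare (name, len) pair to a struct qstr and makes the message pointer unsigned. A condensed user-space re-implementation of the same TEA-flavoured scheme is sketched below; the seed constants follow the classic ext3-style dentry hash, and the padding loop only covers the first 16 name bytes, so this is not byte-for-byte the kernel's str2hashbuf():

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Minimal qstr stand-in: the patch changes f2fs_dentry_hash() to
     * take a struct qstr instead of a bare (name, len) pair. */
    struct qstr {
        const unsigned char *name;
        size_t len;
    };

    #define TEA_DELTA 0x9e3779b9u

    /* One 16-round TEA pass: buf[0..1] is the running state, in[0..3]
     * the message-derived key. */
    static void tea_transform(uint32_t buf[4], const uint32_t in[4])
    {
        uint32_t sum = 0, b0 = buf[0], b1 = buf[1];
        int n = 16;

        do {
            sum += TEA_DELTA;
            b0 += ((b1 << 4) + in[0]) ^ (b1 + sum) ^ ((b1 >> 5) + in[1]);
            b1 += ((b0 << 4) + in[2]) ^ (b0 + sum) ^ ((b0 >> 5) + in[3]);
        } while (--n);

        buf[0] += b0;
        buf[1] += b1;
    }

    /* Simplified stand-in for str2hashbuf(): packs up to 16 name bytes,
     * padded with the length, into four 32-bit words. */
    static void str2hashbuf(const unsigned char *msg, size_t len,
                            uint32_t buf[4])
    {
        uint32_t pad = (uint32_t)len | ((uint32_t)len << 8);
        size_t i;

        pad |= pad << 16;
        for (i = 0; i < 4; i++)
            buf[i] = pad;
        for (i = 0; i < len && i < 16; i++)
            buf[i / 4] = msg[i] + (buf[i / 4] << 8);
    }

    static uint32_t dentry_hash(const struct qstr *name_info)
    {
        uint32_t buf[4] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476 };
        uint32_t in[4];
        const unsigned char *name = name_info->name;
        size_t len = name_info->len;

        /* "." and ".." hash to zero, as in the f2fs version */
        if (len <= 2 && name[0] == '.' &&
            (len == 1 || name[1] == '.'))
            return 0;

        str2hashbuf(name, len, in);
        tea_transform(buf, in);
        return buf[0];
    }

    int main(void)
    {
        struct qstr q = { (const unsigned char *)"hello", 5 };

        printf("hash(\"hello\") = 0x%08x\n", dentry_hash(&q));
        return 0;
    }
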
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1bba5228c197..88036fd75797 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -15,11 +15,13 @@
15 15
16bool f2fs_may_inline(struct inode *inode) 16bool f2fs_may_inline(struct inode *inode)
17{ 17{
18 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
19 block_t nr_blocks; 18 block_t nr_blocks;
20 loff_t i_size; 19 loff_t i_size;
21 20
22 if (!test_opt(sbi, INLINE_DATA)) 21 if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
22 return false;
23
24 if (f2fs_is_atomic_file(inode))
23 return false; 25 return false;
24 26
25 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; 27 nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2;
@@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode)
35 37
36int f2fs_read_inline_data(struct inode *inode, struct page *page) 38int f2fs_read_inline_data(struct inode *inode, struct page *page)
37{ 39{
38 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
39 struct page *ipage; 40 struct page *ipage;
40 void *src_addr, *dst_addr; 41 void *src_addr, *dst_addr;
41 42
@@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
44 goto out; 45 goto out;
45 } 46 }
46 47
47 ipage = get_node_page(sbi, inode->i_ino); 48 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
48 if (IS_ERR(ipage)) { 49 if (IS_ERR(ipage)) {
49 unlock_page(page); 50 unlock_page(page);
50 return PTR_ERR(ipage); 51 return PTR_ERR(ipage);
@@ -68,12 +69,12 @@ out:
68 69
69static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) 70static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
70{ 71{
71 int err; 72 int err = 0;
72 struct page *ipage; 73 struct page *ipage;
73 struct dnode_of_data dn; 74 struct dnode_of_data dn;
74 void *src_addr, *dst_addr; 75 void *src_addr, *dst_addr;
75 block_t new_blk_addr; 76 block_t new_blk_addr;
76 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 77 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
77 struct f2fs_io_info fio = { 78 struct f2fs_io_info fio = {
78 .type = DATA, 79 .type = DATA,
79 .rw = WRITE_SYNC | REQ_PRIO, 80 .rw = WRITE_SYNC | REQ_PRIO,
@@ -86,6 +87,10 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page)
86 goto out; 87 goto out;
87 } 88 }
88 89
90 /* someone else converted inline_data already */
91 if (!f2fs_has_inline_data(inode))
92 goto out;
93
89 /* 94 /*
90 * i_addr[0] is not used for inline data, 95 * i_addr[0] is not used for inline data,
91 * so reserving new block will not destroy inline data 96 * so reserving new block will not destroy inline data
@@ -124,9 +129,10 @@ out:
124 return err; 129 return err;
125} 130}
126 131
127int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size) 132int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size,
133 struct page *page)
128{ 134{
129 struct page *page; 135 struct page *new_page = page;
130 int err; 136 int err;
131 137
132 if (!f2fs_has_inline_data(inode)) 138 if (!f2fs_has_inline_data(inode))
@@ -134,17 +140,20 @@ int f2fs_convert_inline_data(struct inode *inode, pgoff_t to_size)
134 else if (to_size <= MAX_INLINE_DATA) 140 else if (to_size <= MAX_INLINE_DATA)
135 return 0; 141 return 0;
136 142
137 page = grab_cache_page(inode->i_mapping, 0); 143 if (!page || page->index != 0) {
138 if (!page) 144 new_page = grab_cache_page(inode->i_mapping, 0);
139 return -ENOMEM; 145 if (!new_page)
146 return -ENOMEM;
147 }
140 148
141 err = __f2fs_convert_inline_data(inode, page); 149 err = __f2fs_convert_inline_data(inode, new_page);
142 f2fs_put_page(page, 1); 150 if (!page || page->index != 0)
151 f2fs_put_page(new_page, 1);
143 return err; 152 return err;
144} 153}
145 154
146int f2fs_write_inline_data(struct inode *inode, 155int f2fs_write_inline_data(struct inode *inode,
147 struct page *page, unsigned size) 156 struct page *page, unsigned size)
148{ 157{
149 void *src_addr, *dst_addr; 158 void *src_addr, *dst_addr;
150 struct page *ipage; 159 struct page *ipage;
@@ -172,6 +181,7 @@ int f2fs_write_inline_data(struct inode *inode,
172 stat_inc_inline_inode(inode); 181 stat_inc_inline_inode(inode);
173 } 182 }
174 183
184 set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
175 sync_inode_page(&dn); 185 sync_inode_page(&dn);
176 f2fs_put_dnode(&dn); 186 f2fs_put_dnode(&dn);
177 187
@@ -180,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode,
180 190
181void truncate_inline_data(struct inode *inode, u64 from) 191void truncate_inline_data(struct inode *inode, u64 from)
182{ 192{
183 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
184 struct page *ipage; 193 struct page *ipage;
185 194
186 if (from >= MAX_INLINE_DATA) 195 if (from >= MAX_INLINE_DATA)
187 return; 196 return;
188 197
189 ipage = get_node_page(sbi, inode->i_ino); 198 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
190 if (IS_ERR(ipage)) 199 if (IS_ERR(ipage))
191 return; 200 return;
192 201
@@ -198,9 +207,9 @@ void truncate_inline_data(struct inode *inode, u64 from)
198 f2fs_put_page(ipage, 1); 207 f2fs_put_page(ipage, 1);
199} 208}
200 209
201int recover_inline_data(struct inode *inode, struct page *npage) 210bool recover_inline_data(struct inode *inode, struct page *npage)
202{ 211{
203 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
204 struct f2fs_inode *ri = NULL; 213 struct f2fs_inode *ri = NULL;
205 void *src_addr, *dst_addr; 214 void *src_addr, *dst_addr;
206 struct page *ipage; 215 struct page *ipage;
@@ -217,10 +226,10 @@ int recover_inline_data(struct inode *inode, struct page *npage)
217 ri = F2FS_INODE(npage); 226 ri = F2FS_INODE(npage);
218 227
219 if (f2fs_has_inline_data(inode) && 228 if (f2fs_has_inline_data(inode) &&
220 ri && ri->i_inline & F2FS_INLINE_DATA) { 229 ri && (ri->i_inline & F2FS_INLINE_DATA)) {
221process_inline: 230process_inline:
222 ipage = get_node_page(sbi, inode->i_ino); 231 ipage = get_node_page(sbi, inode->i_ino);
223 f2fs_bug_on(IS_ERR(ipage)); 232 f2fs_bug_on(sbi, IS_ERR(ipage));
224 233
225 f2fs_wait_on_page_writeback(ipage, NODE); 234 f2fs_wait_on_page_writeback(ipage, NODE);
226 235
@@ -229,22 +238,22 @@ process_inline:
229 memcpy(dst_addr, src_addr, MAX_INLINE_DATA); 238 memcpy(dst_addr, src_addr, MAX_INLINE_DATA);
230 update_inode(inode, ipage); 239 update_inode(inode, ipage);
231 f2fs_put_page(ipage, 1); 240 f2fs_put_page(ipage, 1);
232 return -1; 241 return true;
233 } 242 }
234 243
235 if (f2fs_has_inline_data(inode)) { 244 if (f2fs_has_inline_data(inode)) {
236 ipage = get_node_page(sbi, inode->i_ino); 245 ipage = get_node_page(sbi, inode->i_ino);
237 f2fs_bug_on(IS_ERR(ipage)); 246 f2fs_bug_on(sbi, IS_ERR(ipage));
238 f2fs_wait_on_page_writeback(ipage, NODE); 247 f2fs_wait_on_page_writeback(ipage, NODE);
239 zero_user_segment(ipage, INLINE_DATA_OFFSET, 248 zero_user_segment(ipage, INLINE_DATA_OFFSET,
240 INLINE_DATA_OFFSET + MAX_INLINE_DATA); 249 INLINE_DATA_OFFSET + MAX_INLINE_DATA);
241 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); 250 clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
242 update_inode(inode, ipage); 251 update_inode(inode, ipage);
243 f2fs_put_page(ipage, 1); 252 f2fs_put_page(ipage, 1);
244 } else if (ri && ri->i_inline & F2FS_INLINE_DATA) { 253 } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
245 truncate_blocks(inode, 0); 254 truncate_blocks(inode, 0, false);
246 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); 255 set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
247 goto process_inline; 256 goto process_inline;
248 } 257 }
249 return 0; 258 return false;
250} 259}
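The inline.c hunks above guard against a racing conversion and let callers pass in a page they already hold. The core idea — keep a small file's payload inside the inode until it outgrows MAX_INLINE_DATA, then copy it out to a real block — can be shown in isolation. Everything below (toy_inode, the 60-byte capacity, calloc in place of block allocation) is an illustrative stand-in:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define MAX_INLINE_DATA 60   /* toy capacity; f2fs derives it from the inode layout */
    #define BLOCK_SIZE 4096

    struct toy_inode {
        bool inline_data;
        size_t size;
        unsigned char inline_area[MAX_INLINE_DATA];
        unsigned char *block;    /* stands in for the data block at i_addr[0] */
    };

    /* Mirrors the shape of f2fs_convert_inline_data(): a no-op when the
     * inode is not inline or the new size still fits; otherwise move
     * the payload out to a freshly allocated block. */
    static int convert_inline_data(struct toy_inode *inode, size_t to_size)
    {
        if (!inode->inline_data)
            return 0;
        if (to_size <= MAX_INLINE_DATA)
            return 0;

        /* the kernel re-checks "someone else converted inline_data
         * already" here, under f2fs_lock_op() */
        inode->block = calloc(1, BLOCK_SIZE);
        if (!inode->block)
            return -1;           /* -ENOMEM in the kernel */

        memcpy(inode->block, inode->inline_area, inode->size);
        memset(inode->inline_area, 0, sizeof(inode->inline_area));
        inode->inline_data = false;
        return 0;
    }

    int main(void)
    {
        struct toy_inode inode = { .inline_data = true };

        inode.size = strlen("tiny file");
        memcpy(inode.inline_area, "tiny file", inode.size);

        convert_inline_data(&inode, 200);  /* grows past the inline limit */
        printf("inline=%d data=\"%s\"\n", inode.inline_data, (char *)inode.block);

        free(inode.block);
        return 0;
    }
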
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 2cf6962f6cc8..0deead4505e7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
69 69
70static int do_read_inode(struct inode *inode) 70static int do_read_inode(struct inode *inode)
71{ 71{
72 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 72 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
73 struct f2fs_inode_info *fi = F2FS_I(inode); 73 struct f2fs_inode_info *fi = F2FS_I(inode);
74 struct page *node_page; 74 struct page *node_page;
75 struct f2fs_inode *ri; 75 struct f2fs_inode *ri;
@@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page)
218 218
219void update_inode_page(struct inode *inode) 219void update_inode_page(struct inode *inode)
220{ 220{
221 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 221 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
222 struct page *node_page; 222 struct page *node_page;
223retry: 223retry:
224 node_page = get_node_page(sbi, inode->i_ino); 224 node_page = get_node_page(sbi, inode->i_ino);
@@ -238,7 +238,7 @@ retry:
238 238
239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) 239int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
240{ 240{
241 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 241 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
242 242
243 if (inode->i_ino == F2FS_NODE_INO(sbi) || 243 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
244 inode->i_ino == F2FS_META_INO(sbi)) 244 inode->i_ino == F2FS_META_INO(sbi))
@@ -266,16 +266,21 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
266 */ 266 */
267void f2fs_evict_inode(struct inode *inode) 267void f2fs_evict_inode(struct inode *inode)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
271
 272	/* some remaining atomic pages should be discarded */
273 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
274 commit_inmem_pages(inode, true);
270 275
271 trace_f2fs_evict_inode(inode); 276 trace_f2fs_evict_inode(inode);
272 truncate_inode_pages_final(&inode->i_data); 277 truncate_inode_pages_final(&inode->i_data);
273 278
274 if (inode->i_ino == F2FS_NODE_INO(sbi) || 279 if (inode->i_ino == F2FS_NODE_INO(sbi) ||
275 inode->i_ino == F2FS_META_INO(sbi)) 280 inode->i_ino == F2FS_META_INO(sbi))
276 goto no_delete; 281 goto out_clear;
277 282
278 f2fs_bug_on(get_dirty_dents(inode)); 283 f2fs_bug_on(sbi, get_dirty_pages(inode));
279 remove_dirty_dir_inode(inode); 284 remove_dirty_dir_inode(inode);
280 285
281 if (inode->i_nlink || is_bad_inode(inode)) 286 if (inode->i_nlink || is_bad_inode(inode))
@@ -295,6 +300,36 @@ void f2fs_evict_inode(struct inode *inode)
295 300
296 sb_end_intwrite(inode->i_sb); 301 sb_end_intwrite(inode->i_sb);
297no_delete: 302no_delete:
298 clear_inode(inode);
299 invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); 303 invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
304 if (xnid)
305 invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
306 if (is_inode_flag_set(F2FS_I(inode), FI_APPEND_WRITE))
307 add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
308 if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE))
309 add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
310out_clear:
311 clear_inode(inode);
312}
313
314/* caller should call f2fs_lock_op() */
315void handle_failed_inode(struct inode *inode)
316{
317 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
318
319 clear_nlink(inode);
320 make_bad_inode(inode);
321 unlock_new_inode(inode);
322
323 i_size_write(inode, 0);
324 if (F2FS_HAS_BLOCKS(inode))
325 f2fs_truncate(inode);
326
327 remove_inode_page(inode);
328 stat_dec_inline_inode(inode);
329
330 alloc_nid_failed(sbi, inode->i_ino);
331 f2fs_unlock_op(sbi);
332
333 /* iput will drop the inode object */
334 iput(inode);
300} 335}
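handle_failed_inode() replaces the five-line unwind sequence (clear_nlink, unlock_new_inode, make_bad_inode, iput, alloc_nid_failed) that namei.c used to repeat at every error label, as the following hunks show. A toy illustration of that consolidation, with simple reference counting standing in for iget/iput:

    #include <stdio.h>
    #include <stdlib.h>

    /* Just enough inode state to show the teardown ordering that
     * handle_failed_inode() centralizes. */
    struct toy_inode {
        int nlink;
        int bad;
        int refcount;
    };

    static void toy_iput(struct toy_inode *inode)
    {
        if (--inode->refcount == 0)
            free(inode);          /* "iput will drop the inode object" */
    }

    static void handle_failed_inode(struct toy_inode *inode)
    {
        inode->nlink = 0;         /* clear_nlink() */
        inode->bad = 1;           /* make_bad_inode() */
        /* unlock_new_inode(), truncate, remove_inode_page() and
         * alloc_nid_failed() happen here in the kernel version */
        toy_iput(inode);
    }

    static int toy_create(int fail)
    {
        struct toy_inode *inode = calloc(1, sizeof(*inode));

        if (!inode)
            return -1;
        inode->nlink = 1;
        inode->refcount = 1;

        if (fail) {
            /* every caller funnels through one helper instead of
             * repeating the same five-line unwind sequence */
            handle_failed_inode(inode);
            return -1;
        }

        printf("created inode\n");
        toy_iput(inode);
        return 0;
    }

    int main(void)
    {
        toy_create(1);   /* error path */
        toy_create(0);   /* success path */
        return 0;
    }
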
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a6bdddc33ce2..0d2526e5aa11 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/dcache.h>
16 17
17#include "f2fs.h" 18#include "f2fs.h"
18#include "node.h" 19#include "node.h"
@@ -22,14 +23,13 @@
22 23
23static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) 24static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
24{ 25{
25 struct super_block *sb = dir->i_sb; 26 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
26 struct f2fs_sb_info *sbi = F2FS_SB(sb);
27 nid_t ino; 27 nid_t ino;
28 struct inode *inode; 28 struct inode *inode;
29 bool nid_free = false; 29 bool nid_free = false;
30 int err; 30 int err;
31 31
32 inode = new_inode(sb); 32 inode = new_inode(dir->i_sb);
33 if (!inode) 33 if (!inode)
34 return ERR_PTR(-ENOMEM); 34 return ERR_PTR(-ENOMEM);
35 35
@@ -102,8 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode,
102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, 102static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
103 bool excl) 103 bool excl)
104{ 104{
105 struct super_block *sb = dir->i_sb; 105 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
106 struct f2fs_sb_info *sbi = F2FS_SB(sb);
107 struct inode *inode; 106 struct inode *inode;
108 nid_t ino = 0; 107 nid_t ino = 0;
109 int err; 108 int err;
@@ -124,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
124 123
125 f2fs_lock_op(sbi); 124 f2fs_lock_op(sbi);
126 err = f2fs_add_link(dentry, inode); 125 err = f2fs_add_link(dentry, inode);
127 f2fs_unlock_op(sbi);
128 if (err) 126 if (err)
129 goto out; 127 goto out;
128 f2fs_unlock_op(sbi);
130 129
131 alloc_nid_done(sbi, ino); 130 alloc_nid_done(sbi, ino);
132 131
@@ -134,11 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
134 unlock_new_inode(inode); 133 unlock_new_inode(inode);
135 return 0; 134 return 0;
136out: 135out:
137 clear_nlink(inode); 136 handle_failed_inode(inode);
138 unlock_new_inode(inode);
139 make_bad_inode(inode);
140 iput(inode);
141 alloc_nid_failed(sbi, ino);
142 return err; 137 return err;
143} 138}
144 139
@@ -146,8 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
146 struct dentry *dentry) 141 struct dentry *dentry)
147{ 142{
148 struct inode *inode = old_dentry->d_inode; 143 struct inode *inode = old_dentry->d_inode;
149 struct super_block *sb = dir->i_sb; 144 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
150 struct f2fs_sb_info *sbi = F2FS_SB(sb);
151 int err; 145 int err;
152 146
153 f2fs_balance_fs(sbi); 147 f2fs_balance_fs(sbi);
@@ -158,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
158 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 152 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
159 f2fs_lock_op(sbi); 153 f2fs_lock_op(sbi);
160 err = f2fs_add_link(dentry, inode); 154 err = f2fs_add_link(dentry, inode);
161 f2fs_unlock_op(sbi);
162 if (err) 155 if (err)
163 goto out; 156 goto out;
157 f2fs_unlock_op(sbi);
164 158
165 d_instantiate(dentry, inode); 159 d_instantiate(dentry, inode);
166 return 0; 160 return 0;
167out: 161out:
168 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 162 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
169 iput(inode); 163 iput(inode);
164 f2fs_unlock_op(sbi);
170 return err; 165 return err;
171} 166}
172 167
@@ -207,8 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
207 202
208static int f2fs_unlink(struct inode *dir, struct dentry *dentry) 203static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
209{ 204{
210 struct super_block *sb = dir->i_sb; 205 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
211 struct f2fs_sb_info *sbi = F2FS_SB(sb);
212 struct inode *inode = dentry->d_inode; 206 struct inode *inode = dentry->d_inode;
213 struct f2fs_dir_entry *de; 207 struct f2fs_dir_entry *de;
214 struct page *page; 208 struct page *page;
@@ -232,7 +226,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
232 f2fs_delete_entry(de, page, inode); 226 f2fs_delete_entry(de, page, inode);
233 f2fs_unlock_op(sbi); 227 f2fs_unlock_op(sbi);
234 228
235 /* In order to evict this inode, we set it dirty */ 229 /* In order to evict this inode, we set it dirty */
236 mark_inode_dirty(inode); 230 mark_inode_dirty(inode);
237fail: 231fail:
238 trace_f2fs_unlink_exit(inode, err); 232 trace_f2fs_unlink_exit(inode, err);
@@ -242,8 +236,7 @@ fail:
242static int f2fs_symlink(struct inode *dir, struct dentry *dentry, 236static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
243 const char *symname) 237 const char *symname)
244{ 238{
245 struct super_block *sb = dir->i_sb; 239 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
246 struct f2fs_sb_info *sbi = F2FS_SB(sb);
247 struct inode *inode; 240 struct inode *inode;
248 size_t symlen = strlen(symname) + 1; 241 size_t symlen = strlen(symname) + 1;
249 int err; 242 int err;
@@ -259,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
259 252
260 f2fs_lock_op(sbi); 253 f2fs_lock_op(sbi);
261 err = f2fs_add_link(dentry, inode); 254 err = f2fs_add_link(dentry, inode);
262 f2fs_unlock_op(sbi);
263 if (err) 255 if (err)
264 goto out; 256 goto out;
257 f2fs_unlock_op(sbi);
265 258
266 err = page_symlink(inode, symname, symlen); 259 err = page_symlink(inode, symname, symlen);
267 alloc_nid_done(sbi, inode->i_ino); 260 alloc_nid_done(sbi, inode->i_ino);
@@ -270,17 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
270 unlock_new_inode(inode); 263 unlock_new_inode(inode);
271 return err; 264 return err;
272out: 265out:
273 clear_nlink(inode); 266 handle_failed_inode(inode);
274 unlock_new_inode(inode);
275 make_bad_inode(inode);
276 iput(inode);
277 alloc_nid_failed(sbi, inode->i_ino);
278 return err; 267 return err;
279} 268}
280 269
281static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 270static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
282{ 271{
283 struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); 272 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
284 struct inode *inode; 273 struct inode *inode;
285 int err; 274 int err;
286 275
@@ -298,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
298 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 287 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
299 f2fs_lock_op(sbi); 288 f2fs_lock_op(sbi);
300 err = f2fs_add_link(dentry, inode); 289 err = f2fs_add_link(dentry, inode);
301 f2fs_unlock_op(sbi);
302 if (err) 290 if (err)
303 goto out_fail; 291 goto out_fail;
292 f2fs_unlock_op(sbi);
304 293
305 alloc_nid_done(sbi, inode->i_ino); 294 alloc_nid_done(sbi, inode->i_ino);
306 295
@@ -311,11 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
311 300
312out_fail: 301out_fail:
313 clear_inode_flag(F2FS_I(inode), FI_INC_LINK); 302 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
314 clear_nlink(inode); 303 handle_failed_inode(inode);
315 unlock_new_inode(inode);
316 make_bad_inode(inode);
317 iput(inode);
318 alloc_nid_failed(sbi, inode->i_ino);
319 return err; 304 return err;
320} 305}
321 306
@@ -330,8 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry)
330static int f2fs_mknod(struct inode *dir, struct dentry *dentry, 315static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
331 umode_t mode, dev_t rdev) 316 umode_t mode, dev_t rdev)
332{ 317{
333 struct super_block *sb = dir->i_sb; 318 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
334 struct f2fs_sb_info *sbi = F2FS_SB(sb);
335 struct inode *inode; 319 struct inode *inode;
336 int err = 0; 320 int err = 0;
337 321
@@ -349,28 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
349 333
350 f2fs_lock_op(sbi); 334 f2fs_lock_op(sbi);
351 err = f2fs_add_link(dentry, inode); 335 err = f2fs_add_link(dentry, inode);
352 f2fs_unlock_op(sbi);
353 if (err) 336 if (err)
354 goto out; 337 goto out;
338 f2fs_unlock_op(sbi);
355 339
356 alloc_nid_done(sbi, inode->i_ino); 340 alloc_nid_done(sbi, inode->i_ino);
357 d_instantiate(dentry, inode); 341 d_instantiate(dentry, inode);
358 unlock_new_inode(inode); 342 unlock_new_inode(inode);
359 return 0; 343 return 0;
360out: 344out:
361 clear_nlink(inode); 345 handle_failed_inode(inode);
362 unlock_new_inode(inode);
363 make_bad_inode(inode);
364 iput(inode);
365 alloc_nid_failed(sbi, inode->i_ino);
366 return err; 346 return err;
367} 347}
368 348
369static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, 349static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
370 struct inode *new_dir, struct dentry *new_dentry) 350 struct inode *new_dir, struct dentry *new_dentry)
371{ 351{
372 struct super_block *sb = old_dir->i_sb; 352 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
373 struct f2fs_sb_info *sbi = F2FS_SB(sb);
374 struct inode *old_inode = old_dentry->d_inode; 353 struct inode *old_inode = old_dentry->d_inode;
375 struct inode *new_inode = new_dentry->d_inode; 354 struct inode *new_inode = new_dentry->d_inode;
376 struct page *old_dir_page; 355 struct page *old_dir_page;
@@ -393,8 +372,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
393 goto out_old; 372 goto out_old;
394 } 373 }
395 374
396 f2fs_lock_op(sbi);
397
398 if (new_inode) { 375 if (new_inode) {
399 376
400 err = -ENOTEMPTY; 377 err = -ENOTEMPTY;
@@ -407,6 +384,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
407 if (!new_entry) 384 if (!new_entry)
408 goto out_dir; 385 goto out_dir;
409 386
387 f2fs_lock_op(sbi);
388
410 err = acquire_orphan_inode(sbi); 389 err = acquire_orphan_inode(sbi);
411 if (err) 390 if (err)
412 goto put_out_dir; 391 goto put_out_dir;
@@ -435,9 +414,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
435 update_inode_page(old_inode); 414 update_inode_page(old_inode);
436 update_inode_page(new_inode); 415 update_inode_page(new_inode);
437 } else { 416 } else {
417 f2fs_lock_op(sbi);
418
438 err = f2fs_add_link(new_dentry, old_inode); 419 err = f2fs_add_link(new_dentry, old_inode);
439 if (err) 420 if (err) {
421 f2fs_unlock_op(sbi);
440 goto out_dir; 422 goto out_dir;
423 }
441 424
442 if (old_dir_entry) { 425 if (old_dir_entry) {
443 inc_nlink(new_dir); 426 inc_nlink(new_dir);
@@ -472,6 +455,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
472 return 0; 455 return 0;
473 456
474put_out_dir: 457put_out_dir:
458 f2fs_unlock_op(sbi);
475 kunmap(new_page); 459 kunmap(new_page);
476 f2fs_put_page(new_page, 0); 460 f2fs_put_page(new_page, 0);
477out_dir: 461out_dir:
@@ -479,7 +463,150 @@ out_dir:
479 kunmap(old_dir_page); 463 kunmap(old_dir_page);
480 f2fs_put_page(old_dir_page, 0); 464 f2fs_put_page(old_dir_page, 0);
481 } 465 }
466out_old:
467 kunmap(old_page);
468 f2fs_put_page(old_page, 0);
469out:
470 return err;
471}
472
473static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
474 struct inode *new_dir, struct dentry *new_dentry)
475{
476 struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir);
477 struct inode *old_inode = old_dentry->d_inode;
478 struct inode *new_inode = new_dentry->d_inode;
479 struct page *old_dir_page, *new_dir_page;
480 struct page *old_page, *new_page;
481 struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL;
482 struct f2fs_dir_entry *old_entry, *new_entry;
483 int old_nlink = 0, new_nlink = 0;
484 int err = -ENOENT;
485
486 f2fs_balance_fs(sbi);
487
488 old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
489 if (!old_entry)
490 goto out;
491
492 new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page);
493 if (!new_entry)
494 goto out_old;
495
496 /* prepare for updating ".." directory entry info later */
497 if (old_dir != new_dir) {
498 if (S_ISDIR(old_inode->i_mode)) {
499 err = -EIO;
500 old_dir_entry = f2fs_parent_dir(old_inode,
501 &old_dir_page);
502 if (!old_dir_entry)
503 goto out_new;
504 }
505
506 if (S_ISDIR(new_inode->i_mode)) {
507 err = -EIO;
508 new_dir_entry = f2fs_parent_dir(new_inode,
509 &new_dir_page);
510 if (!new_dir_entry)
511 goto out_old_dir;
512 }
513 }
514
515 /*
 516	 * If this is a cross rename between a file and a directory
 517	 * that are not in the same directory, we will inc nlink of the
 518	 * file's parent later, so we should check the upper boundary of its nlink.
519 */
520 if ((!old_dir_entry || !new_dir_entry) &&
521 old_dir_entry != new_dir_entry) {
522 old_nlink = old_dir_entry ? -1 : 1;
523 new_nlink = -old_nlink;
524 err = -EMLINK;
525 if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) ||
526 (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX))
527 goto out_new_dir;
528 }
529
530 f2fs_lock_op(sbi);
531
532 err = update_dent_inode(old_inode, &new_dentry->d_name);
533 if (err)
534 goto out_unlock;
535
536 err = update_dent_inode(new_inode, &old_dentry->d_name);
537 if (err)
538 goto out_undo;
539
540 /* update ".." directory entry info of old dentry */
541 if (old_dir_entry)
542 f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
543
544 /* update ".." directory entry info of new dentry */
545 if (new_dir_entry)
546 f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir);
547
548 /* update directory entry info of old dir inode */
549 f2fs_set_link(old_dir, old_entry, old_page, new_inode);
550
551 down_write(&F2FS_I(old_inode)->i_sem);
552 file_lost_pino(old_inode);
553 up_write(&F2FS_I(old_inode)->i_sem);
554
555 update_inode_page(old_inode);
556
557 old_dir->i_ctime = CURRENT_TIME;
558 if (old_nlink) {
559 down_write(&F2FS_I(old_dir)->i_sem);
560 if (old_nlink < 0)
561 drop_nlink(old_dir);
562 else
563 inc_nlink(old_dir);
564 up_write(&F2FS_I(old_dir)->i_sem);
565 }
566 mark_inode_dirty(old_dir);
567 update_inode_page(old_dir);
568
569 /* update directory entry info of new dir inode */
570 f2fs_set_link(new_dir, new_entry, new_page, old_inode);
571
572 down_write(&F2FS_I(new_inode)->i_sem);
573 file_lost_pino(new_inode);
574 up_write(&F2FS_I(new_inode)->i_sem);
575
576 update_inode_page(new_inode);
577
578 new_dir->i_ctime = CURRENT_TIME;
579 if (new_nlink) {
580 down_write(&F2FS_I(new_dir)->i_sem);
581 if (new_nlink < 0)
582 drop_nlink(new_dir);
583 else
584 inc_nlink(new_dir);
585 up_write(&F2FS_I(new_dir)->i_sem);
586 }
587 mark_inode_dirty(new_dir);
588 update_inode_page(new_dir);
589
482 f2fs_unlock_op(sbi); 590 f2fs_unlock_op(sbi);
591 return 0;
592out_undo:
 593	/* We may still fail to recover the name info of f2fs_inode here */
594 update_dent_inode(old_inode, &old_dentry->d_name);
595out_unlock:
596 f2fs_unlock_op(sbi);
597out_new_dir:
598 if (new_dir_entry) {
599 kunmap(new_dir_page);
600 f2fs_put_page(new_dir_page, 0);
601 }
602out_old_dir:
603 if (old_dir_entry) {
604 kunmap(old_dir_page);
605 f2fs_put_page(old_dir_page, 0);
606 }
607out_new:
608 kunmap(new_page);
609 f2fs_put_page(new_page, 0);
483out_old: 610out_old:
484 kunmap(old_page); 611 kunmap(old_page);
485 f2fs_put_page(old_page, 0); 612 f2fs_put_page(old_page, 0);
@@ -487,6 +614,66 @@ out:
487 return err; 614 return err;
488} 615}
489 616
617static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
618 struct inode *new_dir, struct dentry *new_dentry,
619 unsigned int flags)
620{
621 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
622 return -EINVAL;
623
624 if (flags & RENAME_EXCHANGE) {
625 return f2fs_cross_rename(old_dir, old_dentry,
626 new_dir, new_dentry);
627 }
628 /*
629 * VFS has already handled the new dentry existence case,
630 * here, we just deal with "RENAME_NOREPLACE" as regular rename.
631 */
632 return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry);
633}
634
635static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
636{
637 struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
638 struct inode *inode;
639 int err;
640
641 inode = f2fs_new_inode(dir, mode);
642 if (IS_ERR(inode))
643 return PTR_ERR(inode);
644
645 inode->i_op = &f2fs_file_inode_operations;
646 inode->i_fop = &f2fs_file_operations;
647 inode->i_mapping->a_ops = &f2fs_dblock_aops;
648
649 f2fs_lock_op(sbi);
650 err = acquire_orphan_inode(sbi);
651 if (err)
652 goto out;
653
654 err = f2fs_do_tmpfile(inode, dir);
655 if (err)
656 goto release_out;
657
658 /*
 659	 * add this non-linked tmpfile to the orphan list, so that we can
 660	 * remove all unused data of the tmpfile after an abnormal power-off.
661 */
662 add_orphan_inode(sbi, inode->i_ino);
663 f2fs_unlock_op(sbi);
664
665 alloc_nid_done(sbi, inode->i_ino);
666 d_tmpfile(dentry, inode);
667 unlock_new_inode(inode);
668 return 0;
669
670release_out:
671 release_orphan_inode(sbi);
672out:
673 handle_failed_inode(inode);
674 return err;
675}
676
490const struct inode_operations f2fs_dir_inode_operations = { 677const struct inode_operations f2fs_dir_inode_operations = {
491 .create = f2fs_create, 678 .create = f2fs_create,
492 .lookup = f2fs_lookup, 679 .lookup = f2fs_lookup,
@@ -496,7 +683,8 @@ const struct inode_operations f2fs_dir_inode_operations = {
496 .mkdir = f2fs_mkdir, 683 .mkdir = f2fs_mkdir,
497 .rmdir = f2fs_rmdir, 684 .rmdir = f2fs_rmdir,
498 .mknod = f2fs_mknod, 685 .mknod = f2fs_mknod,
499 .rename = f2fs_rename, 686 .rename2 = f2fs_rename2,
687 .tmpfile = f2fs_tmpfile,
500 .getattr = f2fs_getattr, 688 .getattr = f2fs_getattr,
501 .setattr = f2fs_setattr, 689 .setattr = f2fs_setattr,
502 .get_acl = f2fs_get_acl, 690 .get_acl = f2fs_get_acl,
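The trickiest part of f2fs_cross_rename() above is the link-count bookkeeping: when RENAME_EXCHANGE swaps a directory with a non-directory across two different parents, one parent gains a ".." reference and the other loses one. The sketch below isolates the gist of that old_nlink/new_nlink computation; F2FS_LINK_MAX is an assumed value, and the overflow check is applied to the parent directories, which is the intent the comment describes:

    #include <stdbool.h>
    #include <stdio.h>

    #define F2FS_LINK_MAX 32000   /* assumed limit, stand-in for the real constant */

    /* Returns 0 on success, -1 (the kernel's -EMLINK case) when the
     * gaining parent would exceed its link limit. */
    static int exchange_nlink_delta(bool old_is_dir, bool new_is_dir,
                                    int old_dir_nlink, int new_dir_nlink)
    {
        int old_nlink, new_nlink;

        if (old_is_dir == new_is_dir)
            return 0;               /* dir<->dir or file<->file: no change */

        /* the parent losing its subdirectory drops a link,
         * the parent gaining one takes it */
        old_nlink = old_is_dir ? -1 : 1;
        new_nlink = -old_nlink;

        if ((old_nlink > 0 && old_dir_nlink >= F2FS_LINK_MAX) ||
            (new_nlink > 0 && new_dir_nlink >= F2FS_LINK_MAX))
            return -1;

        printf("old parent nlink %+d, new parent nlink %+d\n",
               old_nlink, new_nlink);
        return 0;
    }

    int main(void)
    {
        /* exchanging a directory (old) with a regular file (new) */
        exchange_nlink_delta(true, false, 3, 2);
        return 0;
    }
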
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 4b697ccc9b0c..44b8afef43d9 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -25,6 +25,7 @@
25 25
26static struct kmem_cache *nat_entry_slab; 26static struct kmem_cache *nat_entry_slab;
27static struct kmem_cache *free_nid_slab; 27static struct kmem_cache *free_nid_slab;
28static struct kmem_cache *nat_entry_set_slab;
28 29
29bool available_free_memory(struct f2fs_sb_info *sbi, int type) 30bool available_free_memory(struct f2fs_sb_info *sbi, int type)
30{ 31{
@@ -53,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
53static void clear_node_page_dirty(struct page *page) 54static void clear_node_page_dirty(struct page *page)
54{ 55{
55 struct address_space *mapping = page->mapping; 56 struct address_space *mapping = page->mapping;
56 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
57 unsigned int long flags; 57 unsigned int long flags;
58 58
59 if (PageDirty(page)) { 59 if (PageDirty(page)) {
@@ -64,7 +64,7 @@ static void clear_node_page_dirty(struct page *page)
64 spin_unlock_irqrestore(&mapping->tree_lock, flags); 64 spin_unlock_irqrestore(&mapping->tree_lock, flags);
65 65
66 clear_page_dirty_for_io(page); 66 clear_page_dirty_for_io(page);
67 dec_page_count(sbi, F2FS_DIRTY_NODES); 67 dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES);
68 } 68 }
69 ClearPageUptodate(page); 69 ClearPageUptodate(page);
70} 70}
@@ -90,12 +90,8 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
90 90
91 /* get current nat block page with lock */ 91 /* get current nat block page with lock */
92 src_page = get_meta_page(sbi, src_off); 92 src_page = get_meta_page(sbi, src_off);
93
94 /* Dirty src_page means that it is already the new target NAT page. */
95 if (PageDirty(src_page))
96 return src_page;
97
98 dst_page = grab_meta_page(sbi, dst_off); 93 dst_page = grab_meta_page(sbi, dst_off);
94 f2fs_bug_on(sbi, PageDirty(src_page));
99 95
100 src_addr = page_address(src_page); 96 src_addr = page_address(src_page);
101 dst_addr = page_address(dst_page); 97 dst_addr = page_address(dst_page);
@@ -127,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
127 kmem_cache_free(nat_entry_slab, e); 123 kmem_cache_free(nat_entry_slab, e);
128} 124}
129 125
130int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) 126static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
127 struct nat_entry *ne)
128{
129 nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid);
130 struct nat_entry_set *head;
131
132 if (get_nat_flag(ne, IS_DIRTY))
133 return;
134retry:
135 head = radix_tree_lookup(&nm_i->nat_set_root, set);
136 if (!head) {
137 head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC);
138
139 INIT_LIST_HEAD(&head->entry_list);
140 INIT_LIST_HEAD(&head->set_list);
141 head->set = set;
142 head->entry_cnt = 0;
143
144 if (radix_tree_insert(&nm_i->nat_set_root, set, head)) {
145 cond_resched();
146 goto retry;
147 }
148 }
149 list_move_tail(&ne->list, &head->entry_list);
150 nm_i->dirty_nat_cnt++;
151 head->entry_cnt++;
152 set_nat_flag(ne, IS_DIRTY, true);
153}
154
155static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
156 struct nat_entry *ne)
157{
158 nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK;
159 struct nat_entry_set *head;
160
161 head = radix_tree_lookup(&nm_i->nat_set_root, set);
162 if (head) {
163 list_move_tail(&ne->list, &nm_i->nat_entries);
164 set_nat_flag(ne, IS_DIRTY, false);
165 head->entry_cnt--;
166 nm_i->dirty_nat_cnt--;
167 }
168}
169
170static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i,
171 nid_t start, unsigned int nr, struct nat_entry_set **ep)
172{
173 return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep,
174 start, nr);
175}
176
177bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
131{ 178{
132 struct f2fs_nm_info *nm_i = NM_I(sbi); 179 struct f2fs_nm_info *nm_i = NM_I(sbi);
133 struct nat_entry *e; 180 struct nat_entry *e;
134 int is_cp = 1; 181 bool is_cp = true;
135 182
136 read_lock(&nm_i->nat_tree_lock); 183 read_lock(&nm_i->nat_tree_lock);
137 e = __lookup_nat_cache(nm_i, nid); 184 e = __lookup_nat_cache(nm_i, nid);
138 if (e && !e->checkpointed) 185 if (e && !get_nat_flag(e, IS_CHECKPOINTED))
139 is_cp = 0; 186 is_cp = false;
140 read_unlock(&nm_i->nat_tree_lock); 187 read_unlock(&nm_i->nat_tree_lock);
141 return is_cp; 188 return is_cp;
142} 189}
143 190
144bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) 191bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
145{ 192{
146 struct f2fs_nm_info *nm_i = NM_I(sbi); 193 struct f2fs_nm_info *nm_i = NM_I(sbi);
147 struct nat_entry *e; 194 struct nat_entry *e;
148 bool fsync_done = false; 195 bool fsynced = false;
149 196
150 read_lock(&nm_i->nat_tree_lock); 197 read_lock(&nm_i->nat_tree_lock);
151 e = __lookup_nat_cache(nm_i, nid); 198 e = __lookup_nat_cache(nm_i, ino);
152 if (e) 199 if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
153 fsync_done = e->fsync_done; 200 fsynced = true;
154 read_unlock(&nm_i->nat_tree_lock); 201 read_unlock(&nm_i->nat_tree_lock);
155 return fsync_done; 202 return fsynced;
156} 203}
157 204
158void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) 205bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
159{ 206{
160 struct f2fs_nm_info *nm_i = NM_I(sbi); 207 struct f2fs_nm_info *nm_i = NM_I(sbi);
161 struct nat_entry *e; 208 struct nat_entry *e;
209 bool need_update = true;
162 210
163 write_lock(&nm_i->nat_tree_lock); 211 read_lock(&nm_i->nat_tree_lock);
164 e = __lookup_nat_cache(nm_i, nid); 212 e = __lookup_nat_cache(nm_i, ino);
165 if (e) 213 if (e && get_nat_flag(e, HAS_LAST_FSYNC) &&
166 e->fsync_done = false; 214 (get_nat_flag(e, IS_CHECKPOINTED) ||
167 write_unlock(&nm_i->nat_tree_lock); 215 get_nat_flag(e, HAS_FSYNCED_INODE)))
216 need_update = false;
217 read_unlock(&nm_i->nat_tree_lock);
218 return need_update;
168} 219}
169 220
170static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) 221static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
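The __set_nat_cache_dirty() hunk above introduces a second index over the NAT cache: dirty entries are grouped into one nat_entry_set per NAT block, so the flush path can write back whole NAT blocks at a time. A user-space sketch with a small hash table standing in for the radix tree — NAT_ENTRY_PER_BLOCK is an assumed value and error handling is elided:

    #include <stdio.h>
    #include <stdlib.h>

    #define NAT_ENTRY_PER_BLOCK 455   /* assumed; derived from the on-disk layout */
    #define NSETS 16                  /* toy table size; the kernel uses a radix tree */

    struct nat_entry {
        unsigned int nid;
        int dirty;
        struct nat_entry *next;       /* next entry in the same set */
    };

    struct nat_entry_set {
        unsigned int set;             /* NAT block index */
        unsigned int entry_cnt;
        struct nat_entry *entries;
        struct nat_entry_set *next;   /* hash-chain stand-in for the radix tree */
    };

    static struct nat_entry_set *sets[NSETS];

    static struct nat_entry_set *lookup_or_create_set(unsigned int set)
    {
        struct nat_entry_set *head;

        for (head = sets[set % NSETS]; head; head = head->next)
            if (head->set == set)
                return head;

        head = calloc(1, sizeof(*head));  /* allocation failure elided */
        head->set = set;
        head->next = sets[set % NSETS];
        sets[set % NSETS] = head;
        return head;
    }

    /* Mirrors __set_nat_cache_dirty(): idempotent, and files the entry
     * under the set that covers its NAT block. */
    static void set_nat_cache_dirty(struct nat_entry *ne)
    {
        struct nat_entry_set *head;

        if (ne->dirty)
            return;
        head = lookup_or_create_set(ne->nid / NAT_ENTRY_PER_BLOCK);
        ne->next = head->entries;
        head->entries = ne;
        head->entry_cnt++;
        ne->dirty = 1;
    }

    int main(void)
    {
        struct nat_entry a = { .nid = 3 }, b = { .nid = 400 }, c = { .nid = 900 };

        set_nat_cache_dirty(&a);
        set_nat_cache_dirty(&b);   /* same NAT block as a */
        set_nat_cache_dirty(&c);   /* different NAT block */
        printf("set %u has %u dirty entries\n",
               lookup_or_create_set(0)->set, lookup_or_create_set(0)->entry_cnt);
        return 0;
    }
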
@@ -180,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid)
180 } 231 }
181 memset(new, 0, sizeof(struct nat_entry)); 232 memset(new, 0, sizeof(struct nat_entry));
182 nat_set_nid(new, nid); 233 nat_set_nid(new, nid);
183 new->checkpointed = true; 234 nat_reset_flag(new);
184 list_add_tail(&new->list, &nm_i->nat_entries); 235 list_add_tail(&new->list, &nm_i->nat_entries);
185 nm_i->nat_cnt++; 236 nm_i->nat_cnt++;
186 return new; 237 return new;
@@ -219,7 +270,7 @@ retry:
219 goto retry; 270 goto retry;
220 } 271 }
221 e->ni = *ni; 272 e->ni = *ni;
222 f2fs_bug_on(ni->blk_addr == NEW_ADDR); 273 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
223 } else if (new_blkaddr == NEW_ADDR) { 274 } else if (new_blkaddr == NEW_ADDR) {
224 /* 275 /*
225 * when nid is reallocated, 276 * when nid is reallocated,
@@ -227,20 +278,20 @@ retry:
227 * So, reinitialize it with new information. 278 * So, reinitialize it with new information.
228 */ 279 */
229 e->ni = *ni; 280 e->ni = *ni;
230 f2fs_bug_on(ni->blk_addr != NULL_ADDR); 281 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
231 } 282 }
232 283
233 /* sanity check */ 284 /* sanity check */
234 f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); 285 f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
235 f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && 286 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR &&
236 new_blkaddr == NULL_ADDR); 287 new_blkaddr == NULL_ADDR);
237 f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && 288 f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR &&
238 new_blkaddr == NEW_ADDR); 289 new_blkaddr == NEW_ADDR);
239 f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && 290 f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR &&
240 nat_get_blkaddr(e) != NULL_ADDR && 291 nat_get_blkaddr(e) != NULL_ADDR &&
241 new_blkaddr == NEW_ADDR); 292 new_blkaddr == NEW_ADDR);
242 293
243 /* increament version no as node is removed */ 294 /* increment version no as node is removed */
244 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { 295 if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
245 unsigned char version = nat_get_version(e); 296 unsigned char version = nat_get_version(e);
246 nat_set_version(e, inc_node_version(version)); 297 nat_set_version(e, inc_node_version(version));
@@ -248,12 +299,17 @@ retry:
248 299
249 /* change address */ 300 /* change address */
250 nat_set_blkaddr(e, new_blkaddr); 301 nat_set_blkaddr(e, new_blkaddr);
302 if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR)
303 set_nat_flag(e, IS_CHECKPOINTED, false);
251 __set_nat_cache_dirty(nm_i, e); 304 __set_nat_cache_dirty(nm_i, e);
252 305
253 /* update fsync_mark if its inode nat entry is still alive */ 306 /* update fsync_mark if its inode nat entry is still alive */
254 e = __lookup_nat_cache(nm_i, ni->ino); 307 e = __lookup_nat_cache(nm_i, ni->ino);
255 if (e) 308 if (e) {
256 e->fsync_done = fsync_done; 309 if (fsync_done && ni->nid == ni->ino)
310 set_nat_flag(e, HAS_FSYNCED_INODE, true);
311 set_nat_flag(e, HAS_LAST_FSYNC, fsync_done);
312 }
257 write_unlock(&nm_i->nat_tree_lock); 313 write_unlock(&nm_i->nat_tree_lock);
258} 314}
259 315
@@ -277,7 +333,7 @@ int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
277} 333}
278 334
279/* 335/*
280 * This function returns always success 336 * This function always returns success
281 */ 337 */
282void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) 338void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
283{ 339{
@@ -414,7 +470,7 @@ got:
414 */ 470 */
415int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) 471int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
416{ 472{
417 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 473 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
418 struct page *npage[4]; 474 struct page *npage[4];
419 struct page *parent; 475 struct page *parent;
420 int offset[4]; 476 int offset[4];
@@ -507,15 +563,15 @@ release_out:
507 563
508static void truncate_node(struct dnode_of_data *dn) 564static void truncate_node(struct dnode_of_data *dn)
509{ 565{
510 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 566 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
511 struct node_info ni; 567 struct node_info ni;
512 568
513 get_node_info(sbi, dn->nid, &ni); 569 get_node_info(sbi, dn->nid, &ni);
514 if (dn->inode->i_blocks == 0) { 570 if (dn->inode->i_blocks == 0) {
515 f2fs_bug_on(ni.blk_addr != NULL_ADDR); 571 f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR);
516 goto invalidate; 572 goto invalidate;
517 } 573 }
518 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 574 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
519 575
520 /* Deallocate node address */ 576 /* Deallocate node address */
521 invalidate_blocks(sbi, ni.blk_addr); 577 invalidate_blocks(sbi, ni.blk_addr);
@@ -543,14 +599,13 @@ invalidate:
543 599
544static int truncate_dnode(struct dnode_of_data *dn) 600static int truncate_dnode(struct dnode_of_data *dn)
545{ 601{
546 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
547 struct page *page; 602 struct page *page;
548 603
549 if (dn->nid == 0) 604 if (dn->nid == 0)
550 return 1; 605 return 1;
551 606
552 /* get direct node */ 607 /* get direct node */
553 page = get_node_page(sbi, dn->nid); 608 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
554 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) 609 if (IS_ERR(page) && PTR_ERR(page) == -ENOENT)
555 return 1; 610 return 1;
556 else if (IS_ERR(page)) 611 else if (IS_ERR(page))
@@ -567,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn)
567static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, 622static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
568 int ofs, int depth) 623 int ofs, int depth)
569{ 624{
570 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
571 struct dnode_of_data rdn = *dn; 625 struct dnode_of_data rdn = *dn;
572 struct page *page; 626 struct page *page;
573 struct f2fs_node *rn; 627 struct f2fs_node *rn;
@@ -581,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
581 635
582 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); 636 trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr);
583 637
584 page = get_node_page(sbi, dn->nid); 638 page = get_node_page(F2FS_I_SB(dn->inode), dn->nid);
585 if (IS_ERR(page)) { 639 if (IS_ERR(page)) {
586 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); 640 trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page));
587 return PTR_ERR(page); 641 return PTR_ERR(page);
@@ -639,7 +693,6 @@ out_err:
639static int truncate_partial_nodes(struct dnode_of_data *dn, 693static int truncate_partial_nodes(struct dnode_of_data *dn,
640 struct f2fs_inode *ri, int *offset, int depth) 694 struct f2fs_inode *ri, int *offset, int depth)
641{ 695{
642 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb);
643 struct page *pages[2]; 696 struct page *pages[2];
644 nid_t nid[3]; 697 nid_t nid[3];
645 nid_t child_nid; 698 nid_t child_nid;
@@ -653,8 +706,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
653 706
654 /* get indirect nodes in the path */ 707 /* get indirect nodes in the path */
655 for (i = 0; i < idx + 1; i++) { 708 for (i = 0; i < idx + 1; i++) {
656 /* refernece count'll be increased */ 709 /* reference count'll be increased */
657 pages[i] = get_node_page(sbi, nid[i]); 710 pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]);
658 if (IS_ERR(pages[i])) { 711 if (IS_ERR(pages[i])) {
659 err = PTR_ERR(pages[i]); 712 err = PTR_ERR(pages[i]);
660 idx = i - 1; 713 idx = i - 1;
@@ -699,7 +752,7 @@ fail:
699 */ 752 */
700int truncate_inode_blocks(struct inode *inode, pgoff_t from) 753int truncate_inode_blocks(struct inode *inode, pgoff_t from)
701{ 754{
702 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 755 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
703 int err = 0, cont = 1; 756 int err = 0, cont = 1;
704 int level, offset[4], noffset[4]; 757 int level, offset[4], noffset[4];
705 unsigned int nofs = 0; 758 unsigned int nofs = 0;
@@ -795,7 +848,7 @@ fail:
795 848
796int truncate_xattr_node(struct inode *inode, struct page *page) 849int truncate_xattr_node(struct inode *inode, struct page *page)
797{ 850{
798 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 851 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
799 nid_t nid = F2FS_I(inode)->i_xattr_nid; 852 nid_t nid = F2FS_I(inode)->i_xattr_nid;
800 struct dnode_of_data dn; 853 struct dnode_of_data dn;
801 struct page *npage; 854 struct page *npage;
@@ -826,26 +879,31 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
826 */ 879 */
827void remove_inode_page(struct inode *inode) 880void remove_inode_page(struct inode *inode)
828{ 881{
829 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
830 struct page *page;
831 nid_t ino = inode->i_ino;
832 struct dnode_of_data dn; 882 struct dnode_of_data dn;
833 883
834 page = get_node_page(sbi, ino); 884 set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino);
835 if (IS_ERR(page)) 885 if (get_dnode_of_data(&dn, 0, LOOKUP_NODE))
836 return; 886 return;
837 887
838 if (truncate_xattr_node(inode, page)) { 888 if (truncate_xattr_node(inode, dn.inode_page)) {
839 f2fs_put_page(page, 1); 889 f2fs_put_dnode(&dn);
840 return; 890 return;
841 } 891 }
842 /* 0 is possible, after f2fs_new_inode() is failed */ 892
843 f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); 893 /* remove potential inline_data blocks */
844 set_new_dnode(&dn, inode, page, page, ino); 894 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
895 S_ISLNK(inode->i_mode))
896 truncate_data_blocks_range(&dn, 1);
897
898 /* 0 is possible, after f2fs_new_inode() has failed */
899 f2fs_bug_on(F2FS_I_SB(inode),
900 inode->i_blocks != 0 && inode->i_blocks != 1);
901
902 /* will put inode & node pages */
845 truncate_node(&dn); 903 truncate_node(&dn);
846} 904}
847 905
848struct page *new_inode_page(struct inode *inode, const struct qstr *name) 906struct page *new_inode_page(struct inode *inode)
849{ 907{
850 struct dnode_of_data dn; 908 struct dnode_of_data dn;
851 909
@@ -859,7 +917,7 @@ struct page *new_inode_page(struct inode *inode, const struct qstr *name)
859struct page *new_node_page(struct dnode_of_data *dn, 917struct page *new_node_page(struct dnode_of_data *dn,
860 unsigned int ofs, struct page *ipage) 918 unsigned int ofs, struct page *ipage)
861{ 919{
862 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 920 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
863 struct node_info old_ni, new_ni; 921 struct node_info old_ni, new_ni;
864 struct page *page; 922 struct page *page;
865 int err; 923 int err;
@@ -879,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn,
879 get_node_info(sbi, dn->nid, &old_ni); 937 get_node_info(sbi, dn->nid, &old_ni);
880 938
881 /* Reinitialize old_ni with new node page */ 939 /* Reinitialize old_ni with new node page */
882 f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); 940 f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR);
883 new_ni = old_ni; 941 new_ni = old_ni;
884 new_ni.ino = dn->inode->i_ino; 942 new_ni.ino = dn->inode->i_ino;
885 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 943 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
@@ -917,7 +975,7 @@ fail:
917 */ 975 */
918static int read_node_page(struct page *page, int rw) 976static int read_node_page(struct page *page, int rw)
919{ 977{
920 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 978 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
921 struct node_info ni; 979 struct node_info ni;
922 980
923 get_node_info(sbi, page->index, &ni); 981 get_node_info(sbi, page->index, &ni);
@@ -993,7 +1051,7 @@ got_it:
993 */ 1051 */
994struct page *get_node_page_ra(struct page *parent, int start) 1052struct page *get_node_page_ra(struct page *parent, int start)
995{ 1053{
996 struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); 1054 struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
997 struct blk_plug plug; 1055 struct blk_plug plug;
998 struct page *page; 1056 struct page *page;
999 int err, i, end; 1057 int err, i, end;
@@ -1123,17 +1181,24 @@ continue_unlock:
1123 1181
1124 /* called by fsync() */ 1182 /* called by fsync() */
1125 if (ino && IS_DNODE(page)) { 1183 if (ino && IS_DNODE(page)) {
1126 int mark = !is_checkpointed_node(sbi, ino);
1127 set_fsync_mark(page, 1); 1184 set_fsync_mark(page, 1);
1128 if (IS_INODE(page)) 1185 if (IS_INODE(page)) {
1129 set_dentry_mark(page, mark); 1186 if (!is_checkpointed_node(sbi, ino) &&
1187 !has_fsynced_inode(sbi, ino))
1188 set_dentry_mark(page, 1);
1189 else
1190 set_dentry_mark(page, 0);
1191 }
1130 nwritten++; 1192 nwritten++;
1131 } else { 1193 } else {
1132 set_fsync_mark(page, 0); 1194 set_fsync_mark(page, 0);
1133 set_dentry_mark(page, 0); 1195 set_dentry_mark(page, 0);
1134 } 1196 }
1135 NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); 1197
1136 wrote++; 1198 if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc))
1199 unlock_page(page);
1200 else
1201 wrote++;
1137 1202
1138 if (--wbc->nr_to_write == 0) 1203 if (--wbc->nr_to_write == 0)
1139 break; 1204 break;
@@ -1202,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino)
1202static int f2fs_write_node_page(struct page *page, 1267static int f2fs_write_node_page(struct page *page,
1203 struct writeback_control *wbc) 1268 struct writeback_control *wbc)
1204{ 1269{
1205 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1270 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1206 nid_t nid; 1271 nid_t nid;
1207 block_t new_addr; 1272 block_t new_addr;
1208 struct node_info ni; 1273 struct node_info ni;
@@ -1215,12 +1280,14 @@ static int f2fs_write_node_page(struct page *page,
1215 1280
1216 if (unlikely(sbi->por_doing)) 1281 if (unlikely(sbi->por_doing))
1217 goto redirty_out; 1282 goto redirty_out;
1283 if (unlikely(f2fs_cp_error(sbi)))
1284 goto redirty_out;
1218 1285
1219 f2fs_wait_on_page_writeback(page, NODE); 1286 f2fs_wait_on_page_writeback(page, NODE);
1220 1287
1221 /* get old block addr of this node page */ 1288 /* get old block addr of this node page */
1222 nid = nid_of_node(page); 1289 nid = nid_of_node(page);
1223 f2fs_bug_on(page->index != nid); 1290 f2fs_bug_on(sbi, page->index != nid);
1224 1291
1225 get_node_info(sbi, nid, &ni); 1292 get_node_info(sbi, nid, &ni);
1226 1293
@@ -1234,12 +1301,12 @@ static int f2fs_write_node_page(struct page *page,
1234 if (wbc->for_reclaim) 1301 if (wbc->for_reclaim)
1235 goto redirty_out; 1302 goto redirty_out;
1236 1303
1237 mutex_lock(&sbi->node_write); 1304 down_read(&sbi->node_write);
1238 set_page_writeback(page); 1305 set_page_writeback(page);
1239 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1306 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
1240 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); 1307 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
1241 dec_page_count(sbi, F2FS_DIRTY_NODES); 1308 dec_page_count(sbi, F2FS_DIRTY_NODES);
1242 mutex_unlock(&sbi->node_write); 1309 up_read(&sbi->node_write);
1243 unlock_page(page); 1310 unlock_page(page);
1244 return 0; 1311 return 0;
1245 1312
@@ -1251,7 +1318,7 @@ redirty_out:
1251static int f2fs_write_node_pages(struct address_space *mapping, 1318static int f2fs_write_node_pages(struct address_space *mapping,
1252 struct writeback_control *wbc) 1319 struct writeback_control *wbc)
1253{ 1320{
1254 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); 1321 struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
1255 long diff; 1322 long diff;
1256 1323
1257 trace_f2fs_writepages(mapping->host, wbc, NODE); 1324 trace_f2fs_writepages(mapping->host, wbc, NODE);
@@ -1276,15 +1343,12 @@ skip_write:
1276 1343
1277static int f2fs_set_node_page_dirty(struct page *page) 1344static int f2fs_set_node_page_dirty(struct page *page)
1278{ 1345{
1279 struct address_space *mapping = page->mapping;
1280 struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb);
1281
1282 trace_f2fs_set_page_dirty(page, NODE); 1346 trace_f2fs_set_page_dirty(page, NODE);
1283 1347
1284 SetPageUptodate(page); 1348 SetPageUptodate(page);
1285 if (!PageDirty(page)) { 1349 if (!PageDirty(page)) {
1286 __set_page_dirty_nobuffers(page); 1350 __set_page_dirty_nobuffers(page);
1287 inc_page_count(sbi, F2FS_DIRTY_NODES); 1351 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1288 SetPagePrivate(page); 1352 SetPagePrivate(page);
1289 return 1; 1353 return 1;
1290 } 1354 }
@@ -1295,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1295 unsigned int length) 1359 unsigned int length)
1296{ 1360{
1297 struct inode *inode = page->mapping->host; 1361 struct inode *inode = page->mapping->host;
1298 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1299 if (PageDirty(page)) 1362 if (PageDirty(page))
1300 dec_page_count(sbi, F2FS_DIRTY_NODES); 1363 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
1301 ClearPagePrivate(page); 1364 ClearPagePrivate(page);
1302} 1365}
1303 1366
@@ -1350,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
1350 read_lock(&nm_i->nat_tree_lock); 1413 read_lock(&nm_i->nat_tree_lock);
1351 ne = __lookup_nat_cache(nm_i, nid); 1414 ne = __lookup_nat_cache(nm_i, nid);
1352 if (ne && 1415 if (ne &&
1353 (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) 1416 (!get_nat_flag(ne, IS_CHECKPOINTED) ||
1417 nat_get_blkaddr(ne) != NULL_ADDR))
1354 allocated = true; 1418 allocated = true;
1355 read_unlock(&nm_i->nat_tree_lock); 1419 read_unlock(&nm_i->nat_tree_lock);
1356 if (allocated) 1420 if (allocated)
@@ -1407,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
1407 break; 1471 break;
1408 1472
1409 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); 1473 blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr);
1410 f2fs_bug_on(blk_addr == NEW_ADDR); 1474 f2fs_bug_on(sbi, blk_addr == NEW_ADDR);
1411 if (blk_addr == NULL_ADDR) { 1475 if (blk_addr == NULL_ADDR) {
1412 if (add_free_nid(sbi, start_nid, true) < 0) 1476 if (add_free_nid(sbi, start_nid, true) < 0)
1413 break; 1477 break;
@@ -1477,12 +1541,12 @@ retry:
1477 1541
1478 /* We should not use stale free nids created by build_free_nids */ 1542 /* We should not use stale free nids created by build_free_nids */
1479 if (nm_i->fcnt && !on_build_free_nids(nm_i)) { 1543 if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
1480 f2fs_bug_on(list_empty(&nm_i->free_nid_list)); 1544 f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
1481 list_for_each_entry(i, &nm_i->free_nid_list, list) 1545 list_for_each_entry(i, &nm_i->free_nid_list, list)
1482 if (i->state == NID_NEW) 1546 if (i->state == NID_NEW)
1483 break; 1547 break;
1484 1548
1485 f2fs_bug_on(i->state != NID_NEW); 1549 f2fs_bug_on(sbi, i->state != NID_NEW);
1486 *nid = i->nid; 1550 *nid = i->nid;
1487 i->state = NID_ALLOC; 1551 i->state = NID_ALLOC;
1488 nm_i->fcnt--; 1552 nm_i->fcnt--;
@@ -1508,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
1508 1572
1509 spin_lock(&nm_i->free_nid_list_lock); 1573 spin_lock(&nm_i->free_nid_list_lock);
1510 i = __lookup_free_nid_list(nm_i, nid); 1574 i = __lookup_free_nid_list(nm_i, nid);
1511 f2fs_bug_on(!i || i->state != NID_ALLOC); 1575 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1512 __del_from_free_nid_list(nm_i, i); 1576 __del_from_free_nid_list(nm_i, i);
1513 spin_unlock(&nm_i->free_nid_list_lock); 1577 spin_unlock(&nm_i->free_nid_list_lock);
1514 1578
@@ -1529,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1529 1593
1530 spin_lock(&nm_i->free_nid_list_lock); 1594 spin_lock(&nm_i->free_nid_list_lock);
1531 i = __lookup_free_nid_list(nm_i, nid); 1595 i = __lookup_free_nid_list(nm_i, nid);
1532 f2fs_bug_on(!i || i->state != NID_ALLOC); 1596 f2fs_bug_on(sbi, !i || i->state != NID_ALLOC);
1533 if (!available_free_memory(sbi, FREE_NIDS)) { 1597 if (!available_free_memory(sbi, FREE_NIDS)) {
1534 __del_from_free_nid_list(nm_i, i); 1598 __del_from_free_nid_list(nm_i, i);
1535 need_free = true; 1599 need_free = true;
@@ -1543,35 +1607,21 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
1543 kmem_cache_free(free_nid_slab, i); 1607 kmem_cache_free(free_nid_slab, i);
1544} 1608}
1545 1609
1546void recover_node_page(struct f2fs_sb_info *sbi, struct page *page, 1610void recover_inline_xattr(struct inode *inode, struct page *page)
1547 struct f2fs_summary *sum, struct node_info *ni,
1548 block_t new_blkaddr)
1549{
1550 rewrite_node_page(sbi, page, sum, ni->blk_addr, new_blkaddr);
1551 set_node_addr(sbi, ni, new_blkaddr, false);
1552 clear_node_page_dirty(page);
1553}
1554
1555static void recover_inline_xattr(struct inode *inode, struct page *page)
1556{ 1611{
1557 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1558 void *src_addr, *dst_addr; 1612 void *src_addr, *dst_addr;
1559 size_t inline_size; 1613 size_t inline_size;
1560 struct page *ipage; 1614 struct page *ipage;
1561 struct f2fs_inode *ri; 1615 struct f2fs_inode *ri;
1562 1616
1563 if (!f2fs_has_inline_xattr(inode)) 1617 ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
1564 return; 1618 f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage));
1565
1566 if (!IS_INODE(page))
1567 return;
1568 1619
1569 ri = F2FS_INODE(page); 1620 ri = F2FS_INODE(page);
1570 if (!(ri->i_inline & F2FS_INLINE_XATTR)) 1621 if (!(ri->i_inline & F2FS_INLINE_XATTR)) {
1571 return; 1622 clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR);
1572 1623 goto update_inode;
1573 ipage = get_node_page(sbi, inode->i_ino); 1624 }
1574 f2fs_bug_on(IS_ERR(ipage));
1575 1625
1576 dst_addr = inline_xattr_addr(ipage); 1626 dst_addr = inline_xattr_addr(ipage);
1577 src_addr = inline_xattr_addr(page); 1627 src_addr = inline_xattr_addr(page);
@@ -1579,30 +1629,25 @@ static void recover_inline_xattr(struct inode *inode, struct page *page)
1579 1629
1580 f2fs_wait_on_page_writeback(ipage, NODE); 1630 f2fs_wait_on_page_writeback(ipage, NODE);
1581 memcpy(dst_addr, src_addr, inline_size); 1631 memcpy(dst_addr, src_addr, inline_size);
1582 1632update_inode:
1583 update_inode(inode, ipage); 1633 update_inode(inode, ipage);
1584 f2fs_put_page(ipage, 1); 1634 f2fs_put_page(ipage, 1);
1585} 1635}
1586 1636
1587bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) 1637void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1588{ 1638{
1589 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 1639 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1590 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; 1640 nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid;
1591 nid_t new_xnid = nid_of_node(page); 1641 nid_t new_xnid = nid_of_node(page);
1592 struct node_info ni; 1642 struct node_info ni;
1593 1643
1594 recover_inline_xattr(inode, page);
1595
1596 if (!f2fs_has_xattr_block(ofs_of_node(page)))
1597 return false;
1598
1599 /* 1: invalidate the previous xattr nid */ 1644 /* 1: invalidate the previous xattr nid */
1600 if (!prev_xnid) 1645 if (!prev_xnid)
1601 goto recover_xnid; 1646 goto recover_xnid;
1602 1647
1603 /* Deallocate node address */ 1648 /* Deallocate node address */
1604 get_node_info(sbi, prev_xnid, &ni); 1649 get_node_info(sbi, prev_xnid, &ni);
1605 f2fs_bug_on(ni.blk_addr == NULL_ADDR); 1650 f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR);
1606 invalidate_blocks(sbi, ni.blk_addr); 1651 invalidate_blocks(sbi, ni.blk_addr);
1607 dec_valid_node_count(sbi, inode); 1652 dec_valid_node_count(sbi, inode);
1608 set_node_addr(sbi, &ni, NULL_ADDR, false); 1653 set_node_addr(sbi, &ni, NULL_ADDR, false);
@@ -1610,7 +1655,7 @@ bool recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr)
1610recover_xnid: 1655recover_xnid:
1611 /* 2: allocate new xattr nid */ 1656 /* 2: allocate new xattr nid */
1612 if (unlikely(!inc_valid_node_count(sbi, inode))) 1657 if (unlikely(!inc_valid_node_count(sbi, inode)))
1613 f2fs_bug_on(1); 1658 f2fs_bug_on(sbi, 1);
1614 1659
1615 remove_free_nid(NM_I(sbi), new_xnid); 1660 remove_free_nid(NM_I(sbi), new_xnid);
1616 get_node_info(sbi, new_xnid, &ni); 1661 get_node_info(sbi, new_xnid, &ni);
@@ -1623,7 +1668,6 @@ recover_xnid:
1623 set_node_addr(sbi, &ni, blkaddr, false); 1668 set_node_addr(sbi, &ni, blkaddr, false);
1624 1669
1625 update_inode_page(inode); 1670 update_inode_page(inode);
1626 return true;
1627} 1671}
1628 1672
1629int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) 1673int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
@@ -1642,7 +1686,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1642 if (!ipage) 1686 if (!ipage)
1643 return -ENOMEM; 1687 return -ENOMEM;
1644 1688
1645 /* Should not use this inode from free nid list */ 1689 /* Should not use this inode from free nid list */
1646 remove_free_nid(NM_I(sbi), ino); 1690 remove_free_nid(NM_I(sbi), ino);
1647 1691
1648 SetPageUptodate(ipage); 1692 SetPageUptodate(ipage);
@@ -1656,6 +1700,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1656 dst->i_blocks = cpu_to_le64(1); 1700 dst->i_blocks = cpu_to_le64(1);
1657 dst->i_links = cpu_to_le32(1); 1701 dst->i_links = cpu_to_le32(1);
1658 dst->i_xattr_nid = 0; 1702 dst->i_xattr_nid = 0;
1703 dst->i_inline = src->i_inline & F2FS_INLINE_XATTR;
1659 1704
1660 new_ni = old_ni; 1705 new_ni = old_ni;
1661 new_ni.ino = ino; 1706 new_ni.ino = ino;
@@ -1664,13 +1709,14 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1664 WARN_ON(1); 1709 WARN_ON(1);
1665 set_node_addr(sbi, &new_ni, NEW_ADDR, false); 1710 set_node_addr(sbi, &new_ni, NEW_ADDR, false);
1666 inc_valid_inode_count(sbi); 1711 inc_valid_inode_count(sbi);
1712 set_page_dirty(ipage);
1667 f2fs_put_page(ipage, 1); 1713 f2fs_put_page(ipage, 1);
1668 return 0; 1714 return 0;
1669} 1715}
1670 1716
1671/* 1717/*
1672 * ra_sum_pages() merges contiguous pages into one bio and submits it. 1718 * ra_sum_pages() merges contiguous pages into one bio and submits it.
1673 * these pre-readed pages are alloced in bd_inode's mapping tree. 1719 * these pre-read pages are allocated in bd_inode's mapping tree.
1674 */ 1720 */
1675static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages, 1721static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
1676 int start, int nrpages) 1722 int start, int nrpages)
@@ -1702,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1702 struct f2fs_summary *sum_entry; 1748 struct f2fs_summary *sum_entry;
1703 struct inode *inode = sbi->sb->s_bdev->bd_inode; 1749 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1704 block_t addr; 1750 block_t addr;
1705 int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1751 int bio_blocks = MAX_BIO_BLOCKS(sbi);
1706 struct page *pages[bio_blocks]; 1752 struct page *pages[bio_blocks];
1707 int i, idx, last_offset, nrpages, err = 0; 1753 int i, idx, last_offset, nrpages, err = 0;
1708 1754
@@ -1714,7 +1760,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
1714 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { 1760 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
1715 nrpages = min(last_offset - i, bio_blocks); 1761 nrpages = min(last_offset - i, bio_blocks);
1716 1762
1717 /* read ahead node pages */ 1763 /* readahead node pages */
1718 nrpages = ra_sum_pages(sbi, pages, addr, nrpages); 1764 nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
1719 if (!nrpages) 1765 if (!nrpages)
1720 return -ENOMEM; 1766 return -ENOMEM;
@@ -1744,7 +1790,7 @@ skip:
1744 return err; 1790 return err;
1745} 1791}
1746 1792
1747static bool flush_nats_in_journal(struct f2fs_sb_info *sbi) 1793static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
1748{ 1794{
1749 struct f2fs_nm_info *nm_i = NM_I(sbi); 1795 struct f2fs_nm_info *nm_i = NM_I(sbi);
1750 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1796 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
@@ -1752,12 +1798,6 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1752 int i; 1798 int i;
1753 1799
1754 mutex_lock(&curseg->curseg_mutex); 1800 mutex_lock(&curseg->curseg_mutex);
1755
1756 if (nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) {
1757 mutex_unlock(&curseg->curseg_mutex);
1758 return false;
1759 }
1760
1761 for (i = 0; i < nats_in_cursum(sum); i++) { 1801 for (i = 0; i < nats_in_cursum(sum); i++) {
1762 struct nat_entry *ne; 1802 struct nat_entry *ne;
1763 struct f2fs_nat_entry raw_ne; 1803 struct f2fs_nat_entry raw_ne;
@@ -1767,107 +1807,147 @@ static bool flush_nats_in_journal(struct f2fs_sb_info *sbi)
1767retry: 1807retry:
1768 write_lock(&nm_i->nat_tree_lock); 1808 write_lock(&nm_i->nat_tree_lock);
1769 ne = __lookup_nat_cache(nm_i, nid); 1809 ne = __lookup_nat_cache(nm_i, nid);
1770 if (ne) { 1810 if (ne)
1771 __set_nat_cache_dirty(nm_i, ne); 1811 goto found;
1772 write_unlock(&nm_i->nat_tree_lock); 1812
1773 continue;
1774 }
1775 ne = grab_nat_entry(nm_i, nid); 1813 ne = grab_nat_entry(nm_i, nid);
1776 if (!ne) { 1814 if (!ne) {
1777 write_unlock(&nm_i->nat_tree_lock); 1815 write_unlock(&nm_i->nat_tree_lock);
1778 goto retry; 1816 goto retry;
1779 } 1817 }
1780 node_info_from_raw_nat(&ne->ni, &raw_ne); 1818 node_info_from_raw_nat(&ne->ni, &raw_ne);
1819found:
1781 __set_nat_cache_dirty(nm_i, ne); 1820 __set_nat_cache_dirty(nm_i, ne);
1782 write_unlock(&nm_i->nat_tree_lock); 1821 write_unlock(&nm_i->nat_tree_lock);
1783 } 1822 }
1784 update_nats_in_cursum(sum, -i); 1823 update_nats_in_cursum(sum, -i);
1785 mutex_unlock(&curseg->curseg_mutex); 1824 mutex_unlock(&curseg->curseg_mutex);
1786 return true;
1787} 1825}
1788 1826
1789/* 1827static void __adjust_nat_entry_set(struct nat_entry_set *nes,
1790 * This function is called during the checkpointing process. 1828 struct list_head *head, int max)
1791 */ 1829{
1792void flush_nat_entries(struct f2fs_sb_info *sbi) 1830 struct nat_entry_set *cur;
1831
1832 if (nes->entry_cnt >= max)
1833 goto add_out;
1834
1835 list_for_each_entry(cur, head, set_list) {
1836 if (cur->entry_cnt >= nes->entry_cnt) {
1837 list_add(&nes->set_list, cur->set_list.prev);
1838 return;
1839 }
1840 }
1841add_out:
1842 list_add_tail(&nes->set_list, head);
1843}
1844
1845static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
1846 struct nat_entry_set *set)
1793{ 1847{
1794 struct f2fs_nm_info *nm_i = NM_I(sbi);
1795 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1848 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1796 struct f2fs_summary_block *sum = curseg->sum_blk; 1849 struct f2fs_summary_block *sum = curseg->sum_blk;
1850 nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
1851 bool to_journal = true;
1852 struct f2fs_nat_block *nat_blk;
1797 struct nat_entry *ne, *cur; 1853 struct nat_entry *ne, *cur;
1798 struct page *page = NULL; 1854 struct page *page = NULL;
1799 struct f2fs_nat_block *nat_blk = NULL;
1800 nid_t start_nid = 0, end_nid = 0;
1801 bool flushed;
1802 1855
1803 flushed = flush_nats_in_journal(sbi); 1856 /*
1857 * there are two steps to flush nat entries:
 1858 * #1, flush nat entries to the journal in the current hot data summary block.
 1859 * #2, flush nat entries to the nat page.
1860 */
1861 if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL))
1862 to_journal = false;
1804 1863
1805 if (!flushed) 1864 if (to_journal) {
1806 mutex_lock(&curseg->curseg_mutex); 1865 mutex_lock(&curseg->curseg_mutex);
1866 } else {
1867 page = get_next_nat_page(sbi, start_nid);
1868 nat_blk = page_address(page);
1869 f2fs_bug_on(sbi, !nat_blk);
1870 }
1807 1871
1808 /* 1) flush dirty nat caches */ 1872 /* flush dirty nats in nat entry set */
1809 list_for_each_entry_safe(ne, cur, &nm_i->dirty_nat_entries, list) { 1873 list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
1810 nid_t nid; 1874 struct f2fs_nat_entry *raw_ne;
1811 struct f2fs_nat_entry raw_ne; 1875 nid_t nid = nat_get_nid(ne);
1812 int offset = -1; 1876 int offset;
1813 1877
1814 if (nat_get_blkaddr(ne) == NEW_ADDR) 1878 if (nat_get_blkaddr(ne) == NEW_ADDR)
1815 continue; 1879 continue;
1816 1880
1817 nid = nat_get_nid(ne); 1881 if (to_journal) {
1882 offset = lookup_journal_in_cursum(sum,
1883 NAT_JOURNAL, nid, 1);
1884 f2fs_bug_on(sbi, offset < 0);
1885 raw_ne = &nat_in_journal(sum, offset);
1886 nid_in_journal(sum, offset) = cpu_to_le32(nid);
1887 } else {
1888 raw_ne = &nat_blk->entries[nid - start_nid];
1889 }
1890 raw_nat_from_node_info(raw_ne, &ne->ni);
1818 1891
1819 if (flushed) 1892 write_lock(&NM_I(sbi)->nat_tree_lock);
1820 goto to_nat_page; 1893 nat_reset_flag(ne);
1894 __clear_nat_cache_dirty(NM_I(sbi), ne);
1895 write_unlock(&NM_I(sbi)->nat_tree_lock);
1821 1896
1822 /* if there is room for nat enries in curseg->sumpage */ 1897 if (nat_get_blkaddr(ne) == NULL_ADDR)
1823 offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); 1898 add_free_nid(sbi, nid, false);
1824 if (offset >= 0) { 1899 }
1825 raw_ne = nat_in_journal(sum, offset);
1826 goto flush_now;
1827 }
1828to_nat_page:
1829 if (!page || (start_nid > nid || nid > end_nid)) {
1830 if (page) {
1831 f2fs_put_page(page, 1);
1832 page = NULL;
1833 }
1834 start_nid = START_NID(nid);
1835 end_nid = start_nid + NAT_ENTRY_PER_BLOCK - 1;
1836 1900
1837 /* 1901 if (to_journal)
1838 * get nat block with dirty flag, increased reference 1902 mutex_unlock(&curseg->curseg_mutex);
1839 * count, mapped and lock 1903 else
1840 */ 1904 f2fs_put_page(page, 1);
1841 page = get_next_nat_page(sbi, start_nid);
1842 nat_blk = page_address(page);
1843 }
1844 1905
1845 f2fs_bug_on(!nat_blk); 1906 if (!set->entry_cnt) {
1846 raw_ne = nat_blk->entries[nid - start_nid]; 1907 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
1847flush_now: 1908 kmem_cache_free(nat_entry_set_slab, set);
1848 raw_nat_from_node_info(&raw_ne, &ne->ni); 1909 }
1910}
1849 1911
1850 if (offset < 0) { 1912/*
1851 nat_blk->entries[nid - start_nid] = raw_ne; 1913 * This function is called during the checkpointing process.
1852 } else { 1914 */
1853 nat_in_journal(sum, offset) = raw_ne; 1915void flush_nat_entries(struct f2fs_sb_info *sbi)
1854 nid_in_journal(sum, offset) = cpu_to_le32(nid); 1916{
1855 } 1917 struct f2fs_nm_info *nm_i = NM_I(sbi);
1918 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1919 struct f2fs_summary_block *sum = curseg->sum_blk;
1920 struct nat_entry_set *setvec[NATVEC_SIZE];
1921 struct nat_entry_set *set, *tmp;
1922 unsigned int found;
1923 nid_t set_idx = 0;
1924 LIST_HEAD(sets);
1925
1926 /*
 1927 * if there is not enough space in the journal to store dirty nat
 1928 * entries, remove all entries from the journal and merge them
 1929 * into the nat entry set.
1930 */
1931 if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
1932 remove_nats_in_journal(sbi);
1933
1934 if (!nm_i->dirty_nat_cnt)
1935 return;
1856 1936
1857 if (nat_get_blkaddr(ne) == NULL_ADDR && 1937 while ((found = __gang_lookup_nat_set(nm_i,
1858 add_free_nid(sbi, nid, false) <= 0) { 1938 set_idx, NATVEC_SIZE, setvec))) {
1859 write_lock(&nm_i->nat_tree_lock); 1939 unsigned idx;
1860 __del_from_nat_cache(nm_i, ne); 1940 set_idx = setvec[found - 1]->set + 1;
1861 write_unlock(&nm_i->nat_tree_lock); 1941 for (idx = 0; idx < found; idx++)
1862 } else { 1942 __adjust_nat_entry_set(setvec[idx], &sets,
1863 write_lock(&nm_i->nat_tree_lock); 1943 MAX_NAT_JENTRIES(sum));
1864 __clear_nat_cache_dirty(nm_i, ne);
1865 write_unlock(&nm_i->nat_tree_lock);
1866 }
1867 } 1944 }
1868 if (!flushed) 1945
1869 mutex_unlock(&curseg->curseg_mutex); 1946 /* flush dirty nats in nat entry set */
1870 f2fs_put_page(page, 1); 1947 list_for_each_entry_safe(set, tmp, &sets, set_list)
1948 __flush_nat_entry_set(sbi, set);
1949
1950 f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
1871} 1951}
1872 1952
1873static int init_node_manager(struct f2fs_sb_info *sbi) 1953static int init_node_manager(struct f2fs_sb_info *sbi)
@@ -1886,7 +1966,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1886 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; 1966 nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks;
1887 1967
1888 /* not used nids: 0, node, meta, (and root counted as valid node) */ 1968 /* not used nids: 0, node, meta, (and root counted as valid node) */
1889 nm_i->available_nids = nm_i->max_nid - 3; 1969 nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM;
1890 nm_i->fcnt = 0; 1970 nm_i->fcnt = 0;
1891 nm_i->nat_cnt = 0; 1971 nm_i->nat_cnt = 0;
1892 nm_i->ram_thresh = DEF_RAM_THRESHOLD; 1972 nm_i->ram_thresh = DEF_RAM_THRESHOLD;
@@ -1894,8 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
1894 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); 1974 INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
1895 INIT_LIST_HEAD(&nm_i->free_nid_list); 1975 INIT_LIST_HEAD(&nm_i->free_nid_list);
1896 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); 1976 INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC);
1977 INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC);
1897 INIT_LIST_HEAD(&nm_i->nat_entries); 1978 INIT_LIST_HEAD(&nm_i->nat_entries);
1898 INIT_LIST_HEAD(&nm_i->dirty_nat_entries);
1899 1979
1900 mutex_init(&nm_i->build_lock); 1980 mutex_init(&nm_i->build_lock);
1901 spin_lock_init(&nm_i->free_nid_list_lock); 1981 spin_lock_init(&nm_i->free_nid_list_lock);
@@ -1944,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1944 /* destroy free nid list */ 2024 /* destroy free nid list */
1945 spin_lock(&nm_i->free_nid_list_lock); 2025 spin_lock(&nm_i->free_nid_list_lock);
1946 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { 2026 list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
1947 f2fs_bug_on(i->state == NID_ALLOC); 2027 f2fs_bug_on(sbi, i->state == NID_ALLOC);
1948 __del_from_free_nid_list(nm_i, i); 2028 __del_from_free_nid_list(nm_i, i);
1949 nm_i->fcnt--; 2029 nm_i->fcnt--;
1950 spin_unlock(&nm_i->free_nid_list_lock); 2030 spin_unlock(&nm_i->free_nid_list_lock);
1951 kmem_cache_free(free_nid_slab, i); 2031 kmem_cache_free(free_nid_slab, i);
1952 spin_lock(&nm_i->free_nid_list_lock); 2032 spin_lock(&nm_i->free_nid_list_lock);
1953 } 2033 }
1954 f2fs_bug_on(nm_i->fcnt); 2034 f2fs_bug_on(sbi, nm_i->fcnt);
1955 spin_unlock(&nm_i->free_nid_list_lock); 2035 spin_unlock(&nm_i->free_nid_list_lock);
1956 2036
1957 /* destroy nat cache */ 2037 /* destroy nat cache */
@@ -1963,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
1963 for (idx = 0; idx < found; idx++) 2043 for (idx = 0; idx < found; idx++)
1964 __del_from_nat_cache(nm_i, natvec[idx]); 2044 __del_from_nat_cache(nm_i, natvec[idx]);
1965 } 2045 }
1966 f2fs_bug_on(nm_i->nat_cnt); 2046 f2fs_bug_on(sbi, nm_i->nat_cnt);
1967 write_unlock(&nm_i->nat_tree_lock); 2047 write_unlock(&nm_i->nat_tree_lock);
1968 2048
1969 kfree(nm_i->nat_bitmap); 2049 kfree(nm_i->nat_bitmap);
@@ -1976,19 +2056,30 @@ int __init create_node_manager_caches(void)
1976 nat_entry_slab = f2fs_kmem_cache_create("nat_entry", 2056 nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
1977 sizeof(struct nat_entry)); 2057 sizeof(struct nat_entry));
1978 if (!nat_entry_slab) 2058 if (!nat_entry_slab)
1979 return -ENOMEM; 2059 goto fail;
1980 2060
1981 free_nid_slab = f2fs_kmem_cache_create("free_nid", 2061 free_nid_slab = f2fs_kmem_cache_create("free_nid",
1982 sizeof(struct free_nid)); 2062 sizeof(struct free_nid));
1983 if (!free_nid_slab) { 2063 if (!free_nid_slab)
1984 kmem_cache_destroy(nat_entry_slab); 2064 goto destroy_nat_entry;
1985 return -ENOMEM; 2065
1986 } 2066 nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
2067 sizeof(struct nat_entry_set));
2068 if (!nat_entry_set_slab)
2069 goto destroy_free_nid;
1987 return 0; 2070 return 0;
2071
2072destroy_free_nid:
2073 kmem_cache_destroy(free_nid_slab);
2074destroy_nat_entry:
2075 kmem_cache_destroy(nat_entry_slab);
2076fail:
2077 return -ENOMEM;
1988} 2078}
1989 2079
1990void destroy_node_manager_caches(void) 2080void destroy_node_manager_caches(void)
1991{ 2081{
2082 kmem_cache_destroy(nat_entry_set_slab);
1992 kmem_cache_destroy(free_nid_slab); 2083 kmem_cache_destroy(free_nid_slab);
1993 kmem_cache_destroy(nat_entry_slab); 2084 kmem_cache_destroy(nat_entry_slab);
1994} 2085}
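
The reworked flush path above groups dirty NAT entries into per-block sets and flushes the sets smallest-first, so that as many sets as possible land in the journal before the remainder falls back to NAT pages. Below is a minimal userspace sketch of the smallest-first insertion policy from __adjust_nat_entry_set(), using simplified stand-in types (a bare count instead of struct nat_entry_set, and max playing the role of MAX_NAT_JENTRIES(sum)), not f2fs code:

#include <stdio.h>

/* Simplified stand-in for struct nat_entry_set: only the dirty-entry count. */
struct set {
	struct set *next;
	int entry_cnt;
};

/*
 * Insert nes into the singly linked list at *head, keeping it sorted by
 * ascending entry_cnt; a set that can never fit in the journal
 * (entry_cnt >= max) goes straight to the tail, like the add_out path.
 */
static void adjust_set(struct set **head, struct set *nes, int max)
{
	struct set **pp = head;

	if (nes->entry_cnt < max)
		while (*pp && (*pp)->entry_cnt < nes->entry_cnt)
			pp = &(*pp)->next;
	else
		while (*pp)
			pp = &(*pp)->next;
	nes->next = *pp;
	*pp = nes;
}

int main(void)
{
	struct set *head = NULL, pool[4] = {
		{ NULL, 7 }, { NULL, 2 }, { NULL, 9 }, { NULL, 4 },
	};
	int i, max = 8;		/* stand-in for MAX_NAT_JENTRIES(sum) */

	for (i = 0; i < 4; i++)
		adjust_set(&head, &pool[i], max);
	for (; head; head = head->next)
		printf("%d ", head->entry_cnt);	/* prints: 2 4 7 9 */
	printf("\n");
	return 0;
}

The 9-entry set lands at the tail because it exceeds max and can never be journalled, which is why __flush_nat_entry_set() above checks __has_cursum_space() per set before choosing the journal or a NAT page.
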
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7281112cd1c8..8d5e6e0dd840 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -39,10 +39,16 @@ struct node_info {
39 unsigned char version; /* version of the node */ 39 unsigned char version; /* version of the node */
40}; 40};
41 41
42enum {
43 IS_CHECKPOINTED, /* is it checkpointed before? */
44 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
45 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
46 IS_DIRTY, /* is this nat entry dirty? */
47};
48
42struct nat_entry { 49struct nat_entry {
43 struct list_head list; /* for clean or dirty nat list */ 50 struct list_head list; /* for clean or dirty nat list */
44 bool checkpointed; /* whether it is checkpointed or not */ 51 unsigned char flag; /* for node information bits */
45 bool fsync_done; /* whether the latest node has fsync mark */
46 struct node_info ni; /* in-memory node information */ 52 struct node_info ni; /* in-memory node information */
47}; 53};
48 54
@@ -55,18 +61,32 @@ struct nat_entry {
55#define nat_get_version(nat) (nat->ni.version) 61#define nat_get_version(nat) (nat->ni.version)
56#define nat_set_version(nat, v) (nat->ni.version = v) 62#define nat_set_version(nat, v) (nat->ni.version = v)
57 63
58#define __set_nat_cache_dirty(nm_i, ne) \
59 do { \
60 ne->checkpointed = false; \
61 list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \
62 } while (0)
63#define __clear_nat_cache_dirty(nm_i, ne) \
64 do { \
65 ne->checkpointed = true; \
66 list_move_tail(&ne->list, &nm_i->nat_entries); \
67 } while (0)
68#define inc_node_version(version) (++version) 64#define inc_node_version(version) (++version)
69 65
66static inline void set_nat_flag(struct nat_entry *ne,
67 unsigned int type, bool set)
68{
69 unsigned char mask = 0x01 << type;
70 if (set)
71 ne->flag |= mask;
72 else
73 ne->flag &= ~mask;
74}
75
76static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
77{
78 unsigned char mask = 0x01 << type;
79 return ne->flag & mask;
80}
81
82static inline void nat_reset_flag(struct nat_entry *ne)
83{
85 /* these states can be set only after a checkpoint is done */
85 set_nat_flag(ne, IS_CHECKPOINTED, true);
86 set_nat_flag(ne, HAS_FSYNCED_INODE, false);
87 set_nat_flag(ne, HAS_LAST_FSYNC, true);
88}
89
70static inline void node_info_from_raw_nat(struct node_info *ni, 90static inline void node_info_from_raw_nat(struct node_info *ni,
71 struct f2fs_nat_entry *raw_ne) 91 struct f2fs_nat_entry *raw_ne)
72{ 92{
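
The set_nat_flag()/get_nat_flag() helpers above fold the old checkpointed/fsync_done bools into one flag byte indexed by the new enum, the usual one-bit-per-state bitmask pattern. A standalone sketch of the same logic, assuming nothing beyond the enum values shown in this hunk:

#include <assert.h>

enum { IS_CHECKPOINTED, HAS_FSYNCED_INODE, HAS_LAST_FSYNC, IS_DIRTY };

struct entry { unsigned char flag; };	/* stand-in for struct nat_entry */

static void set_flag(struct entry *e, unsigned int type, int set)
{
	unsigned char mask = 0x01 << type;	/* one bit per enum value */

	if (set)
		e->flag |= mask;
	else
		e->flag &= ~mask;
}

static int get_flag(struct entry *e, unsigned int type)
{
	return !!(e->flag & (0x01 << type));
}

int main(void)
{
	struct entry e = { 0 };

	/* mirrors nat_reset_flag(): checkpointed, not fsynced, last fsync */
	set_flag(&e, IS_CHECKPOINTED, 1);
	set_flag(&e, HAS_FSYNCED_INODE, 0);
	set_flag(&e, HAS_LAST_FSYNC, 1);

	assert(get_flag(&e, IS_CHECKPOINTED));
	assert(!get_flag(&e, HAS_FSYNCED_INODE));
	assert(!get_flag(&e, IS_DIRTY));
	return 0;
}
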
@@ -89,6 +109,13 @@ enum mem_type {
89 DIRTY_DENTS /* indicates dirty dentry pages */ 109 DIRTY_DENTS /* indicates dirty dentry pages */
90}; 110};
91 111
112struct nat_entry_set {
113 struct list_head set_list; /* link with other nat sets */
114 struct list_head entry_list; /* link with dirty nat entries */
115 nid_t set; /* set number */
116 unsigned int entry_cnt; /* the # of nat entries in set */
117};
118
92/* 119/*
93 * For free nid management 120
94 */ 121 */
@@ -103,18 +130,19 @@ struct free_nid {
103 int state; /* in use or not: NID_NEW or NID_ALLOC */ 130 int state; /* in use or not: NID_NEW or NID_ALLOC */
104}; 131};
105 132
106static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) 133static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
107{ 134{
108 struct f2fs_nm_info *nm_i = NM_I(sbi); 135 struct f2fs_nm_info *nm_i = NM_I(sbi);
109 struct free_nid *fnid; 136 struct free_nid *fnid;
110 137
111 if (nm_i->fcnt <= 0)
112 return -1;
113 spin_lock(&nm_i->free_nid_list_lock); 138 spin_lock(&nm_i->free_nid_list_lock);
139 if (nm_i->fcnt <= 0) {
140 spin_unlock(&nm_i->free_nid_list_lock);
141 return;
142 }
114 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); 143 fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list);
115 *nid = fnid->nid; 144 *nid = fnid->nid;
116 spin_unlock(&nm_i->free_nid_list_lock); 145 spin_unlock(&nm_i->free_nid_list_lock);
117 return 0;
118} 146}
119 147
120/* 148/*
@@ -190,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src)
190 218
191static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) 219static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr)
192{ 220{
193 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 221 struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page));
194 struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
195 struct f2fs_node *rn = F2FS_NODE(page); 222 struct f2fs_node *rn = F2FS_NODE(page);
196 223
197 rn->footer.cp_ver = ckpt->checkpoint_ver; 224 rn->footer.cp_ver = ckpt->checkpoint_ver;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index a112368a4a86..ebd013225788 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -14,6 +14,37 @@
14#include "node.h" 14#include "node.h"
15#include "segment.h" 15#include "segment.h"
16 16
17/*
18 * Roll forward recovery scenarios.
19 *
20 * [Term] F: fsync_mark, D: dentry_mark
21 *
22 * 1. inode(x) | CP | inode(x) | dnode(F)
23 * -> Update the latest inode(x).
24 *
25 * 2. inode(x) | CP | inode(F) | dnode(F)
26 * -> No problem.
27 *
28 * 3. inode(x) | CP | dnode(F) | inode(x)
29 * -> Recover to the latest dnode(F), and drop the last inode(x)
30 *
31 * 4. inode(x) | CP | dnode(F) | inode(F)
32 * -> No problem.
33 *
34 * 5. CP | inode(x) | dnode(F)
35 * -> The inode(DF) was missing. Should drop this dnode(F).
36 *
37 * 6. CP | inode(DF) | dnode(F)
38 * -> No problem.
39 *
40 * 7. CP | dnode(F) | inode(DF)
41 * -> If f2fs_iget fails, then goto next to find inode(DF).
42 *
43 * 8. CP | dnode(F) | inode(x)
44 * -> If f2fs_iget fails, then goto next to find inode(DF).
45 * But it will fail due to no inode(DF).
46 */
47
17static struct kmem_cache *fsync_entry_slab; 48static struct kmem_cache *fsync_entry_slab;
18 49
19bool space_for_roll_forward(struct f2fs_sb_info *sbi) 50bool space_for_roll_forward(struct f2fs_sb_info *sbi)
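
The scenario table above is what the new last_inode/last_dentry fields later in this diff implement: the scan pass remembers, per inode, the newest block that carried inode metadata and the newest inode block that carried a dentry mark, and the replay pass re-applies recover_inode()/recover_dentry() only at exactly those block addresses. A condensed, hypothetical sketch of that bookkeeping (the field names follow the diff; everything else is simplified):

#include <stdio.h>

typedef unsigned int block_t;

struct fsync_entry {		/* cut-down struct fsync_inode_entry */
	block_t blkaddr;	/* newest fsynced dnode of this inode */
	block_t last_inode;	/* newest block that is an inode block */
	block_t last_dentry;	/* newest inode block with a dentry mark */
};

/* scan pass: called for every warm-node block belonging to this inode */
static void note_block(struct fsync_entry *e, block_t blkaddr,
		       int is_inode, int is_dent)
{
	e->blkaddr = blkaddr;
	if (is_inode) {
		e->last_inode = blkaddr;
		if (is_dent)
			e->last_dentry = blkaddr;
	}
}

int main(void)
{
	struct fsync_entry e = { 0, 0, 0 };

	/* scenario 1: inode(x) | CP | inode(x) | dnode(F) */
	note_block(&e, 100, 1, 0);	/* plain inode block after CP */
	note_block(&e, 101, 0, 0);	/* fsync-marked dnode */

	printf("recover inode at blk %u, dentry at blk %u\n",
	       e.last_inode, e.last_dentry);	/* 100 and 0 (= none) */
	return 0;
}
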
@@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
36 return NULL; 67 return NULL;
37} 68}
38 69
39static int recover_dentry(struct page *ipage, struct inode *inode) 70static int recover_dentry(struct inode *inode, struct page *ipage)
40{ 71{
41 struct f2fs_inode *raw_inode = F2FS_INODE(ipage); 72 struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
42 nid_t pino = le32_to_cpu(raw_inode->i_pino); 73 nid_t pino = le32_to_cpu(raw_inode->i_pino);
@@ -62,8 +93,10 @@ static int recover_dentry(struct page *ipage, struct inode *inode)
62 } 93 }
63retry: 94retry:
64 de = f2fs_find_entry(dir, &name, &page); 95 de = f2fs_find_entry(dir, &name, &page);
65 if (de && inode->i_ino == le32_to_cpu(de->ino)) 96 if (de && inode->i_ino == le32_to_cpu(de->ino)) {
97 clear_inode_flag(F2FS_I(inode), FI_INC_LINK);
66 goto out_unmap_put; 98 goto out_unmap_put;
99 }
67 if (de) { 100 if (de) {
68 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); 101 einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino));
69 if (IS_ERR(einode)) { 102 if (IS_ERR(einode)) {
@@ -73,7 +106,7 @@ retry:
73 err = -EEXIST; 106 err = -EEXIST;
74 goto out_unmap_put; 107 goto out_unmap_put;
75 } 108 }
76 err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); 109 err = acquire_orphan_inode(F2FS_I_SB(inode));
77 if (err) { 110 if (err) {
78 iput(einode); 111 iput(einode);
79 goto out_unmap_put; 112 goto out_unmap_put;
@@ -108,35 +141,28 @@ out:
108 return err; 141 return err;
109} 142}
110 143
111static int recover_inode(struct inode *inode, struct page *node_page) 144static void recover_inode(struct inode *inode, struct page *page)
112{ 145{
113 struct f2fs_inode *raw_inode = F2FS_INODE(node_page); 146 struct f2fs_inode *raw = F2FS_INODE(page);
114
115 if (!IS_INODE(node_page))
116 return 0;
117
118 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
119 i_size_write(inode, le64_to_cpu(raw_inode->i_size));
120 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
121 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
122 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
123 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
124 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
125 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
126 147
127 if (is_dent_dnode(node_page)) 148 inode->i_mode = le16_to_cpu(raw->i_mode);
128 return recover_dentry(node_page, inode); 149 i_size_write(inode, le64_to_cpu(raw->i_size));
150 inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime);
151 inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
152 inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
153 inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
154 inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
155 inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
129 156
130 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", 157 f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
131 ino_of_node(node_page), raw_inode->i_name); 158 ino_of_node(page), F2FS_INODE(page)->i_name);
132 return 0;
133} 159}
134 160
135static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) 161static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
136{ 162{
137 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 163 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
138 struct curseg_info *curseg; 164 struct curseg_info *curseg;
139 struct page *page; 165 struct page *page = NULL;
140 block_t blkaddr; 166 block_t blkaddr;
141 int err = 0; 167 int err = 0;
142 168
@@ -144,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
144 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); 170 curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
145 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 171 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
146 172
147 /* read node page */
148 page = alloc_page(GFP_F2FS_ZERO);
149 if (!page)
150 return -ENOMEM;
151 lock_page(page);
152
153 while (1) { 173 while (1) {
154 struct fsync_inode_entry *entry; 174 struct fsync_inode_entry *entry;
155 175
156 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 176 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
157 if (err) 177 return 0;
158 return err;
159 178
160 lock_page(page); 179 page = get_meta_page_ra(sbi, blkaddr);
161 180
162 if (cp_ver != cpver_of_node(page)) 181 if (cp_ver != cpver_of_node(page))
163 break; 182 break;
@@ -178,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
178 } 197 }
179 198
180 /* add this fsync inode to the list */ 199 /* add this fsync inode to the list */
181 entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); 200 entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
182 if (!entry) { 201 if (!entry) {
183 err = -ENOMEM; 202 err = -ENOMEM;
184 break; 203 break;
185 } 204 }
186 205 /*
206 * CP | dnode(F) | inode(DF)
207 * For this case, we should not give up now.
208 */
187 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); 209 entry->inode = f2fs_iget(sbi->sb, ino_of_node(page));
188 if (IS_ERR(entry->inode)) { 210 if (IS_ERR(entry->inode)) {
189 err = PTR_ERR(entry->inode); 211 err = PTR_ERR(entry->inode);
190 kmem_cache_free(fsync_entry_slab, entry); 212 kmem_cache_free(fsync_entry_slab, entry);
213 if (err == -ENOENT)
214 goto next;
191 break; 215 break;
192 } 216 }
193 list_add_tail(&entry->list, head); 217 list_add_tail(&entry->list, head);
194 } 218 }
195 entry->blkaddr = blkaddr; 219 entry->blkaddr = blkaddr;
196 220
197 err = recover_inode(entry->inode, page); 221 if (IS_INODE(page)) {
198 if (err && err != -ENOENT) 222 entry->last_inode = blkaddr;
199 break; 223 if (is_dent_dnode(page))
224 entry->last_dentry = blkaddr;
225 }
200next: 226next:
201 /* check next segment */ 227 /* check next segment */
202 blkaddr = next_blkaddr_of_node(page); 228 blkaddr = next_blkaddr_of_node(page);
229 f2fs_put_page(page, 1);
203 } 230 }
204 231 f2fs_put_page(page, 1);
205 unlock_page(page);
206 __free_pages(page, 0);
207
208 return err; 232 return err;
209} 233}
210 234
@@ -277,16 +301,30 @@ got_it:
277 ino = ino_of_node(node_page); 301 ino = ino_of_node(node_page);
278 f2fs_put_page(node_page, 1); 302 f2fs_put_page(node_page, 1);
279 303
280 /* Deallocate previous index in the node page */ 304 if (ino != dn->inode->i_ino) {
281 inode = f2fs_iget(sbi->sb, ino); 305 /* Deallocate previous index in the node page */
282 if (IS_ERR(inode)) 306 inode = f2fs_iget(sbi->sb, ino);
283 return PTR_ERR(inode); 307 if (IS_ERR(inode))
308 return PTR_ERR(inode);
309 } else {
310 inode = dn->inode;
311 }
284 312
285 bidx = start_bidx_of_node(offset, F2FS_I(inode)) + 313 bidx = start_bidx_of_node(offset, F2FS_I(inode)) +
286 le16_to_cpu(sum.ofs_in_node); 314 le16_to_cpu(sum.ofs_in_node);
287 315
288 truncate_hole(inode, bidx, bidx + 1); 316 if (ino != dn->inode->i_ino) {
289 iput(inode); 317 truncate_hole(inode, bidx, bidx + 1);
318 iput(inode);
319 } else {
320 struct dnode_of_data tdn;
321 set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0);
322 if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
323 return 0;
324 if (tdn.data_blkaddr != NULL_ADDR)
325 truncate_data_blocks_range(&tdn, 1);
326 f2fs_put_page(tdn.node_page, 1);
327 }
290 return 0; 328 return 0;
291} 329}
292 330
@@ -300,12 +338,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
300 struct node_info ni; 338 struct node_info ni;
301 int err = 0, recovered = 0; 339 int err = 0, recovered = 0;
302 340
303 if (recover_inline_data(inode, page)) 341 /* step 1: recover xattr */
342 if (IS_INODE(page)) {
343 recover_inline_xattr(inode, page);
344 } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
345 recover_xattr_data(inode, page, blkaddr);
304 goto out; 346 goto out;
347 }
305 348
306 if (recover_xattr_data(inode, page, blkaddr)) 349 /* step 2: recover inline data */
350 if (recover_inline_data(inode, page))
307 goto out; 351 goto out;
308 352
353 /* step 3: recover data indices */
309 start = start_bidx_of_node(ofs_of_node(page), fi); 354 start = start_bidx_of_node(ofs_of_node(page), fi);
310 end = start + ADDRS_PER_PAGE(page, fi); 355 end = start + ADDRS_PER_PAGE(page, fi);
311 356
@@ -322,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
322 f2fs_wait_on_page_writeback(dn.node_page, NODE); 367 f2fs_wait_on_page_writeback(dn.node_page, NODE);
323 368
324 get_node_info(sbi, dn.nid, &ni); 369 get_node_info(sbi, dn.nid, &ni);
325 f2fs_bug_on(ni.ino != ino_of_node(page)); 370 f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
326 f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); 371 f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));
327 372
328 for (; start < end; start++) { 373 for (; start < end; start++) {
329 block_t src, dest; 374 block_t src, dest;
@@ -335,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
335 if (src == NULL_ADDR) { 380 if (src == NULL_ADDR) {
336 err = reserve_new_block(&dn); 381 err = reserve_new_block(&dn);
337 /* We should not get -ENOSPC */ 382 /* We should not get -ENOSPC */
338 f2fs_bug_on(err); 383 f2fs_bug_on(sbi, err);
339 } 384 }
340 385
341 /* Check the previous node page having this index */ 386 /* Check the previous node page having this index */
@@ -362,8 +407,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
362 fill_node_footer(dn.node_page, dn.nid, ni.ino, 407 fill_node_footer(dn.node_page, dn.nid, ni.ino,
363 ofs_of_node(page), false); 408 ofs_of_node(page), false);
364 set_page_dirty(dn.node_page); 409 set_page_dirty(dn.node_page);
365
366 recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
367err: 410err:
368 f2fs_put_dnode(&dn); 411 f2fs_put_dnode(&dn);
369 f2fs_unlock_op(sbi); 412 f2fs_unlock_op(sbi);
@@ -379,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
379{ 422{
380 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); 423 unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
381 struct curseg_info *curseg; 424 struct curseg_info *curseg;
382 struct page *page; 425 struct page *page = NULL;
383 int err = 0; 426 int err = 0;
384 block_t blkaddr; 427 block_t blkaddr;
385 428
@@ -387,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi,
387 curseg = CURSEG_I(sbi, type); 430 curseg = CURSEG_I(sbi, type);
388 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 431 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
389 432
390 /* read node page */
391 page = alloc_page(GFP_F2FS_ZERO);
392 if (!page)
393 return -ENOMEM;
394
395 lock_page(page);
396
397 while (1) { 433 while (1) {
398 struct fsync_inode_entry *entry; 434 struct fsync_inode_entry *entry;
399 435
400 err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); 436 if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi))
401 if (err) 437 break;
402 return err;
403 438
404 lock_page(page); 439 page = get_meta_page_ra(sbi, blkaddr);
405 440
406 if (cp_ver != cpver_of_node(page)) 441 if (cp_ver != cpver_of_node(page)) {
442 f2fs_put_page(page, 1);
407 break; 443 break;
444 }
408 445
409 entry = get_fsync_inode(head, ino_of_node(page)); 446 entry = get_fsync_inode(head, ino_of_node(page));
410 if (!entry) 447 if (!entry)
411 goto next; 448 goto next;
412 449 /*
450 * inode(x) | CP | inode(x) | dnode(F)
451 * In this case, we can lose the latest inode(x).
452 * So, call recover_inode for the inode update.
453 */
454 if (entry->last_inode == blkaddr)
455 recover_inode(entry->inode, page);
456 if (entry->last_dentry == blkaddr) {
457 err = recover_dentry(entry->inode, page);
458 if (err) {
459 f2fs_put_page(page, 1);
460 break;
461 }
462 }
413 err = do_recover_data(sbi, entry->inode, page, blkaddr); 463 err = do_recover_data(sbi, entry->inode, page, blkaddr);
414 if (err) 464 if (err) {
465 f2fs_put_page(page, 1);
415 break; 466 break;
467 }
416 468
417 if (entry->blkaddr == blkaddr) { 469 if (entry->blkaddr == blkaddr) {
418 iput(entry->inode); 470 iput(entry->inode);
@@ -422,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi,
422next: 474next:
423 /* check next segment */ 475 /* check next segment */
424 blkaddr = next_blkaddr_of_node(page); 476 blkaddr = next_blkaddr_of_node(page);
477 f2fs_put_page(page, 1);
425 } 478 }
426
427 unlock_page(page);
428 __free_pages(page, 0);
429
430 if (!err) 479 if (!err)
431 allocate_new_segments(sbi); 480 allocate_new_segments(sbi);
432 return err; 481 return err;
@@ -434,7 +483,9 @@ next:
434 483
435int recover_fsync_data(struct f2fs_sb_info *sbi) 484int recover_fsync_data(struct f2fs_sb_info *sbi)
436{ 485{
486 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
437 struct list_head inode_list; 487 struct list_head inode_list;
488 block_t blkaddr;
438 int err; 489 int err;
439 bool need_writecp = false; 490 bool need_writecp = false;
440 491
@@ -447,6 +498,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
447 498
448 /* step #1: find fsynced inode numbers */ 499 /* step #1: find fsynced inode numbers */
449 sbi->por_doing = true; 500 sbi->por_doing = true;
501
502 /* prevent checkpoint */
503 mutex_lock(&sbi->cp_mutex);
504
505 blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
506
450 err = find_fsync_dnodes(sbi, &inode_list); 507 err = find_fsync_dnodes(sbi, &inode_list);
451 if (err) 508 if (err)
452 goto out; 509 goto out;
@@ -458,12 +515,38 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
458 515
459 /* step #2: recover data */ 516 /* step #2: recover data */
460 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); 517 err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
461 f2fs_bug_on(!list_empty(&inode_list)); 518 if (!err)
519 f2fs_bug_on(sbi, !list_empty(&inode_list));
462out: 520out:
463 destroy_fsync_dnodes(&inode_list); 521 destroy_fsync_dnodes(&inode_list);
464 kmem_cache_destroy(fsync_entry_slab); 522 kmem_cache_destroy(fsync_entry_slab);
523
524 /* truncate the meta pages used by the recovery */
525 truncate_inode_pages_range(META_MAPPING(sbi),
526 MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
527
528 if (err) {
529 truncate_inode_pages_final(NODE_MAPPING(sbi));
530 truncate_inode_pages_final(META_MAPPING(sbi));
531 }
532
465 sbi->por_doing = false; 533 sbi->por_doing = false;
466 if (!err && need_writecp) 534 if (err) {
467 write_checkpoint(sbi, false); 535 discard_next_dnode(sbi, blkaddr);
536
537 /* Flush all the NAT/SIT pages */
538 while (get_pages(sbi, F2FS_DIRTY_META))
539 sync_meta_pages(sbi, META, LONG_MAX);
540 set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG);
541 mutex_unlock(&sbi->cp_mutex);
542 } else if (need_writecp) {
543 struct cp_control cpc = {
544 .reason = CP_SYNC,
545 };
546 mutex_unlock(&sbi->cp_mutex);
547 write_checkpoint(sbi, &cpc);
548 } else {
549 mutex_unlock(&sbi->cp_mutex);
550 }
468 return err; 551 return err;
469} 552}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d04613df710a..923cb76fdc46 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -25,6 +25,8 @@
25#define __reverse_ffz(x) __reverse_ffs(~(x)) 25#define __reverse_ffz(x) __reverse_ffs(~(x))
26 26
27static struct kmem_cache *discard_entry_slab; 27static struct kmem_cache *discard_entry_slab;
28static struct kmem_cache *sit_entry_set_slab;
29static struct kmem_cache *inmem_entry_slab;
28 30
29/* 31/*
30 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since 32 * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
@@ -62,7 +64,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
62} 64}
63 65
64/* 66/*
65 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c becasue 67 * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
66 * f2fs_set_bit makes MSB and LSB reversed in a byte. 68 * f2fs_set_bit makes MSB and LSB reversed in a byte.
67 * Example: 69 * Example:
68 * LSB <--> MSB 70 * LSB <--> MSB
@@ -172,6 +174,60 @@ found_middle:
172 return result + __reverse_ffz(tmp); 174 return result + __reverse_ffz(tmp);
173} 175}
174 176
177void register_inmem_page(struct inode *inode, struct page *page)
178{
179 struct f2fs_inode_info *fi = F2FS_I(inode);
180 struct inmem_pages *new;
181
182 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
183
184 /* add atomic page indices to the list */
185 new->page = page;
186 INIT_LIST_HEAD(&new->list);
187
188 /* increase reference count with clean state */
189 mutex_lock(&fi->inmem_lock);
190 get_page(page);
191 list_add_tail(&new->list, &fi->inmem_pages);
192 mutex_unlock(&fi->inmem_lock);
193}
194
195void commit_inmem_pages(struct inode *inode, bool abort)
196{
197 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
198 struct f2fs_inode_info *fi = F2FS_I(inode);
199 struct inmem_pages *cur, *tmp;
200 bool submit_bio = false;
201 struct f2fs_io_info fio = {
202 .type = DATA,
203 .rw = WRITE_SYNC,
204 };
205
206 f2fs_balance_fs(sbi);
207 f2fs_lock_op(sbi);
208
209 mutex_lock(&fi->inmem_lock);
210 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
211 lock_page(cur->page);
212 if (!abort && cur->page->mapping == inode->i_mapping) {
213 f2fs_wait_on_page_writeback(cur->page, DATA);
214 if (clear_page_dirty_for_io(cur->page))
215 inode_dec_dirty_pages(inode);
216 do_write_data_page(cur->page, &fio);
217 submit_bio = true;
218 }
219 f2fs_put_page(cur->page, 1);
220 list_del(&cur->list);
221 kmem_cache_free(inmem_entry_slab, cur);
222 }
223 if (submit_bio)
224 f2fs_submit_merged_bio(sbi, DATA, WRITE);
225 mutex_unlock(&fi->inmem_lock);
226
227 filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
228 f2fs_unlock_op(sbi);
229}
230
175/* 231/*
176 * This function balances dirty node and dentry pages. 232 * This function balances dirty node and dentry pages.
177 * In addition, it controls garbage collection. 233 * In addition, it controls garbage collection.
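
register_inmem_page()/commit_inmem_pages() above add a simple atomic-write buffer: dirty pages are parked on a per-inode list and later either written out in one batch under f2fs_lock_op() or dropped wholesale on abort. A toy userspace analogue of the queue-then-commit pattern, assuming nothing from f2fs beyond the shape of the API (ordering is simplified to LIFO here; the real list keeps FIFO via list_add_tail):

#include <stdio.h>
#include <stdlib.h>

struct pending {		/* stand-in for struct inmem_pages */
	struct pending *next;
	int blk;
};

static struct pending *inmem;	/* stand-in for fi->inmem_pages */

/* register_inmem_page() analogue: queue the update, write nothing yet */
static void register_update(int blk)
{
	struct pending *p = malloc(sizeof(*p));

	if (!p)
		exit(1);
	p->blk = blk;
	p->next = inmem;
	inmem = p;
}

/* commit_inmem_pages() analogue: write everything, or drop everything */
static void commit_updates(int drop)
{
	while (inmem) {
		struct pending *p = inmem;

		inmem = p->next;
		if (!drop)
			printf("write block %d\n", p->blk);
		free(p);
	}
}

int main(void)
{
	register_update(1);
	register_update(2);
	commit_updates(0);	/* both updates land, or neither would */
	return 0;
}
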
@@ -205,24 +261,20 @@ repeat:
205 if (kthread_should_stop()) 261 if (kthread_should_stop())
206 return 0; 262 return 0;
207 263
208 spin_lock(&fcc->issue_lock); 264 if (!llist_empty(&fcc->issue_list)) {
209 if (fcc->issue_list) {
210 fcc->dispatch_list = fcc->issue_list;
211 fcc->issue_list = fcc->issue_tail = NULL;
212 }
213 spin_unlock(&fcc->issue_lock);
214
215 if (fcc->dispatch_list) {
216 struct bio *bio = bio_alloc(GFP_NOIO, 0); 265 struct bio *bio = bio_alloc(GFP_NOIO, 0);
217 struct flush_cmd *cmd, *next; 266 struct flush_cmd *cmd, *next;
218 int ret; 267 int ret;
219 268
269 fcc->dispatch_list = llist_del_all(&fcc->issue_list);
270 fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
271
220 bio->bi_bdev = sbi->sb->s_bdev; 272 bio->bi_bdev = sbi->sb->s_bdev;
221 ret = submit_bio_wait(WRITE_FLUSH, bio); 273 ret = submit_bio_wait(WRITE_FLUSH, bio);
222 274
223 for (cmd = fcc->dispatch_list; cmd; cmd = next) { 275 llist_for_each_entry_safe(cmd, next,
276 fcc->dispatch_list, llnode) {
224 cmd->ret = ret; 277 cmd->ret = ret;
225 next = cmd->next;
226 complete(&cmd->wait); 278 complete(&cmd->wait);
227 } 279 }
228 bio_put(bio); 280 bio_put(bio);
@@ -230,7 +282,7 @@ repeat:
230 } 282 }
231 283
232 wait_event_interruptible(*q, 284 wait_event_interruptible(*q,
233 kthread_should_stop() || fcc->issue_list); 285 kthread_should_stop() || !llist_empty(&fcc->issue_list));
234 goto repeat; 286 goto repeat;
235} 287}
236 288
@@ -239,19 +291,18 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
239 struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; 291 struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
240 struct flush_cmd cmd; 292 struct flush_cmd cmd;
241 293
294 trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER),
295 test_opt(sbi, FLUSH_MERGE));
296
297 if (test_opt(sbi, NOBARRIER))
298 return 0;
299
242 if (!test_opt(sbi, FLUSH_MERGE)) 300 if (!test_opt(sbi, FLUSH_MERGE))
243 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); 301 return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL);
244 302
245 init_completion(&cmd.wait); 303 init_completion(&cmd.wait);
246 cmd.next = NULL;
247 304
248 spin_lock(&fcc->issue_lock); 305 llist_add(&cmd.llnode, &fcc->issue_list);
249 if (fcc->issue_list)
250 fcc->issue_tail->next = &cmd;
251 else
252 fcc->issue_list = &cmd;
253 fcc->issue_tail = &cmd;
254 spin_unlock(&fcc->issue_lock);
255 306
256 if (!fcc->dispatch_list) 307 if (!fcc->dispatch_list)
257 wake_up(&fcc->flush_wait_queue); 308 wake_up(&fcc->flush_wait_queue);
@@ -270,15 +321,15 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
270 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); 321 fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL);
271 if (!fcc) 322 if (!fcc)
272 return -ENOMEM; 323 return -ENOMEM;
273 spin_lock_init(&fcc->issue_lock);
274 init_waitqueue_head(&fcc->flush_wait_queue); 324 init_waitqueue_head(&fcc->flush_wait_queue);
275 sbi->sm_info->cmd_control_info = fcc; 325 init_llist_head(&fcc->issue_list);
326 SM_I(sbi)->cmd_control_info = fcc;
276 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, 327 fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi,
277 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); 328 "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev));
278 if (IS_ERR(fcc->f2fs_issue_flush)) { 329 if (IS_ERR(fcc->f2fs_issue_flush)) {
279 err = PTR_ERR(fcc->f2fs_issue_flush); 330 err = PTR_ERR(fcc->f2fs_issue_flush);
280 kfree(fcc); 331 kfree(fcc);
281 sbi->sm_info->cmd_control_info = NULL; 332 SM_I(sbi)->cmd_control_info = NULL;
282 return err; 333 return err;
283 } 334 }
284 335
@@ -287,13 +338,12 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi)
287 338
288void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) 339void destroy_flush_cmd_control(struct f2fs_sb_info *sbi)
289{ 340{
290 struct flush_cmd_control *fcc = 341 struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info;
291 sbi->sm_info->cmd_control_info;
292 342
293 if (fcc && fcc->f2fs_issue_flush) 343 if (fcc && fcc->f2fs_issue_flush)
294 kthread_stop(fcc->f2fs_issue_flush); 344 kthread_stop(fcc->f2fs_issue_flush);
295 kfree(fcc); 345 kfree(fcc);
296 sbi->sm_info->cmd_control_info = NULL; 346 SM_I(sbi)->cmd_control_info = NULL;
297} 347}
298 348
299static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, 349static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
@@ -312,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
312 struct seg_entry *sentry = get_seg_entry(sbi, segno); 362 struct seg_entry *sentry = get_seg_entry(sbi, segno);
313 enum dirty_type t = sentry->type; 363 enum dirty_type t = sentry->type;
314 364
365 if (unlikely(t >= DIRTY)) {
366 f2fs_bug_on(sbi, 1);
367 return;
368 }
315 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) 369 if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t]))
316 dirty_i->nr_dirty[t]++; 370 dirty_i->nr_dirty[t]++;
317 } 371 }
@@ -371,17 +425,14 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno)
371static int f2fs_issue_discard(struct f2fs_sb_info *sbi, 425static int f2fs_issue_discard(struct f2fs_sb_info *sbi,
372 block_t blkstart, block_t blklen) 426 block_t blkstart, block_t blklen)
373{ 427{
374 sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); 428 sector_t start = SECTOR_FROM_BLOCK(blkstart);
375 sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); 429 sector_t len = SECTOR_FROM_BLOCK(blklen);
376 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); 430 trace_f2fs_issue_discard(sbi->sb, blkstart, blklen);
377 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); 431 return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0);
378} 432}
379 433
380void discard_next_dnode(struct f2fs_sb_info *sbi) 434void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr)
381{ 435{
382 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
383 block_t blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
384
385 if (f2fs_issue_discard(sbi, blkaddr, 1)) { 436 if (f2fs_issue_discard(sbi, blkaddr, 1)) {
386 struct page *page = grab_meta_page(sbi, blkaddr); 437 struct page *page = grab_meta_page(sbi, blkaddr);
387 /* zero-filled page */ 438 /* zero-filled page */
@@ -390,22 +441,48 @@ void discard_next_dnode(struct f2fs_sb_info *sbi)
390 } 441 }
391} 442}
392 443
393static void add_discard_addrs(struct f2fs_sb_info *sbi, 444static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
394 unsigned int segno, struct seg_entry *se)
395{ 445{
396 struct list_head *head = &SM_I(sbi)->discard_list; 446 struct list_head *head = &SM_I(sbi)->discard_list;
397 struct discard_entry *new; 447 struct discard_entry *new;
398 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 448 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
399 int max_blocks = sbi->blocks_per_seg; 449 int max_blocks = sbi->blocks_per_seg;
450 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
400 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 451 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
401 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 452 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
402 unsigned long dmap[entries]; 453 unsigned long dmap[entries];
403 unsigned int start = 0, end = -1; 454 unsigned int start = 0, end = -1;
455 bool force = (cpc->reason == CP_DISCARD);
404 int i; 456 int i;
405 457
406 if (!test_opt(sbi, DISCARD)) 458 if (!force && !test_opt(sbi, DISCARD))
407 return; 459 return;
408 460
461 if (force && !se->valid_blocks) {
462 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
463 /*
464 * if this segment is registered in the prefree list, then
465 * we should skip adding a discard candidate, and let the
466 * checkpoint do that later.
467 */
468 mutex_lock(&dirty_i->seglist_lock);
469 if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) {
470 mutex_unlock(&dirty_i->seglist_lock);
471 cpc->trimmed += sbi->blocks_per_seg;
472 return;
473 }
474 mutex_unlock(&dirty_i->seglist_lock);
475
476 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
477 INIT_LIST_HEAD(&new->list);
478 new->blkaddr = START_BLOCK(sbi, cpc->trim_start);
479 new->len = sbi->blocks_per_seg;
480 list_add_tail(&new->list, head);
481 SM_I(sbi)->nr_discards += sbi->blocks_per_seg;
482 cpc->trimmed += sbi->blocks_per_seg;
483 return;
484 }
485
409 /* zero block will be discarded through the prefree list */ 486 /* zero block will be discarded through the prefree list */
410 if (!se->valid_blocks || se->valid_blocks == max_blocks) 487 if (!se->valid_blocks || se->valid_blocks == max_blocks)
411 return; 488 return;
@@ -414,40 +491,50 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi,
414 for (i = 0; i < entries; i++) 491 for (i = 0; i < entries; i++)
415 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; 492 dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
416 493
417 while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { 494 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
418 start = __find_rev_next_bit(dmap, max_blocks, end + 1); 495 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
419 if (start >= max_blocks) 496 if (start >= max_blocks)
420 break; 497 break;
421 498
422 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); 499 end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
423 500
501 if (end - start < cpc->trim_minlen)
502 continue;
503
424 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); 504 new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS);
425 INIT_LIST_HEAD(&new->list); 505 INIT_LIST_HEAD(&new->list);
426 new->blkaddr = START_BLOCK(sbi, segno) + start; 506 new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start;
427 new->len = end - start; 507 new->len = end - start;
508 cpc->trimmed += end - start;
428 509
429 list_add_tail(&new->list, head); 510 list_add_tail(&new->list, head);
430 SM_I(sbi)->nr_discards += end - start; 511 SM_I(sbi)->nr_discards += end - start;
431 } 512 }
432} 513}
433 514
515void release_discard_addrs(struct f2fs_sb_info *sbi)
516{
517 struct list_head *head = &(SM_I(sbi)->discard_list);
518 struct discard_entry *entry, *this;
519
520 /* drop caches */
521 list_for_each_entry_safe(entry, this, head, list) {
522 list_del(&entry->list);
523 kmem_cache_free(discard_entry_slab, entry);
524 }
525}
526
434/* 527/*
435 * Should call clear_prefree_segments after checkpoint is done. 528 * Should call clear_prefree_segments after checkpoint is done.
436 */ 529 */
437static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) 530static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi)
438{ 531{
439 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 532 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
440 unsigned int segno = -1; 533 unsigned int segno;
441 unsigned int total_segs = TOTAL_SEGS(sbi);
442 534
443 mutex_lock(&dirty_i->seglist_lock); 535 mutex_lock(&dirty_i->seglist_lock);
444 while (1) { 536 for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi))
445 segno = find_next_bit(dirty_i->dirty_segmap[PRE], total_segs,
446 segno + 1);
447 if (segno >= total_segs)
448 break;
449 __set_test_and_free(sbi, segno); 537 __set_test_and_free(sbi, segno);
450 }
451 mutex_unlock(&dirty_i->seglist_lock); 538 mutex_unlock(&dirty_i->seglist_lock);
452} 539}
453 540
@@ -457,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
457 struct discard_entry *entry, *this; 544 struct discard_entry *entry, *this;
458 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 545 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
459 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; 546 unsigned long *prefree_map = dirty_i->dirty_segmap[PRE];
460 unsigned int total_segs = TOTAL_SEGS(sbi);
461 unsigned int start = 0, end = -1; 547 unsigned int start = 0, end = -1;
462 548
463 mutex_lock(&dirty_i->seglist_lock); 549 mutex_lock(&dirty_i->seglist_lock);
464 550
465 while (1) { 551 while (1) {
466 int i; 552 int i;
467 start = find_next_bit(prefree_map, total_segs, end + 1); 553 start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1);
468 if (start >= total_segs) 554 if (start >= MAIN_SEGS(sbi))
469 break; 555 break;
470 end = find_next_zero_bit(prefree_map, total_segs, start + 1); 556 end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi),
557 start + 1);
471 558
472 for (i = start; i < end; i++) 559 for (i = start; i < end; i++)
473 clear_bit(i, prefree_map); 560 clear_bit(i, prefree_map);
@@ -491,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi)
491 } 578 }
492} 579}
493 580
494static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) 581static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno)
495{ 582{
496 struct sit_info *sit_i = SIT_I(sbi); 583 struct sit_info *sit_i = SIT_I(sbi);
497 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) 584
585 if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) {
498 sit_i->dirty_sentries++; 586 sit_i->dirty_sentries++;
587 return false;
588 }
589
590 return true;
499} 591}
500 592
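The new bool return builds on the __test_and_set_bit() contract: the bit is set and its previous value is returned, so __mark_sit_entry_dirty() reports true only when the segment was already dirty, which the caller uses to avoid double-accounting. A plain C stand-in for the non-atomic kernel helper (illustrative only):

#include <stdbool.h>
#include <stdio.h>

static bool test_and_set_bit(unsigned int nr, unsigned long *map)
{
        unsigned long mask = 1UL << nr;
        bool old = *map & mask;

        *map |= mask;
        return old;             /* previous state of the bit */
}

int main(void)
{
        unsigned long dirty_bitmap = 0;

        /* first mark: bit was clear, so "already dirty" is 0 */
        printf("first  mark: already dirty? %d\n",
               test_and_set_bit(5, &dirty_bitmap));
        /* second mark: bit was set, so "already dirty" is 1 */
        printf("second mark: already dirty? %d\n",
               test_and_set_bit(5, &dirty_bitmap));
        return 0;
}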
501static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, 593static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type,
@@ -519,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
519 new_vblocks = se->valid_blocks + del; 611 new_vblocks = se->valid_blocks + del;
520 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); 612 offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
521 613
522 f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || 614 f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) ||
523 (new_vblocks > sbi->blocks_per_seg))); 615 (new_vblocks > sbi->blocks_per_seg)));
524 616
525 se->valid_blocks = new_vblocks; 617 se->valid_blocks = new_vblocks;
@@ -529,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
529 /* Update valid block bitmap */ 621 /* Update valid block bitmap */
530 if (del > 0) { 622 if (del > 0) {
531 if (f2fs_set_bit(offset, se->cur_valid_map)) 623 if (f2fs_set_bit(offset, se->cur_valid_map))
532 BUG(); 624 f2fs_bug_on(sbi, 1);
533 } else { 625 } else {
534 if (!f2fs_clear_bit(offset, se->cur_valid_map)) 626 if (!f2fs_clear_bit(offset, se->cur_valid_map))
535 BUG(); 627 f2fs_bug_on(sbi, 1);
536 } 628 }
537 if (!f2fs_test_bit(offset, se->ckpt_valid_map)) 629 if (!f2fs_test_bit(offset, se->ckpt_valid_map))
538 se->ckpt_valid_blocks += del; 630 se->ckpt_valid_blocks += del;
@@ -561,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
561 unsigned int segno = GET_SEGNO(sbi, addr); 653 unsigned int segno = GET_SEGNO(sbi, addr);
562 struct sit_info *sit_i = SIT_I(sbi); 654 struct sit_info *sit_i = SIT_I(sbi);
563 655
564 f2fs_bug_on(addr == NULL_ADDR); 656 f2fs_bug_on(sbi, addr == NULL_ADDR);
565 if (addr == NEW_ADDR) 657 if (addr == NEW_ADDR)
566 return; 658 return;
567 659
@@ -637,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type)
637 unsigned int segno = curseg->segno + 1; 729 unsigned int segno = curseg->segno + 1;
638 struct free_segmap_info *free_i = FREE_I(sbi); 730 struct free_segmap_info *free_i = FREE_I(sbi);
639 731
640 if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) 732 if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
641 return !test_bit(segno, free_i->free_segmap); 733 return !test_bit(segno, free_i->free_segmap);
642 return 0; 734 return 0;
643} 735}
@@ -651,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
651{ 743{
652 struct free_segmap_info *free_i = FREE_I(sbi); 744 struct free_segmap_info *free_i = FREE_I(sbi);
653 unsigned int segno, secno, zoneno; 745 unsigned int segno, secno, zoneno;
654 unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; 746 unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
655 unsigned int hint = *newseg / sbi->segs_per_sec; 747 unsigned int hint = *newseg / sbi->segs_per_sec;
656 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); 748 unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg);
657 unsigned int left_start = hint; 749 unsigned int left_start = hint;
@@ -663,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
663 755
664 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 756 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
665 segno = find_next_zero_bit(free_i->free_segmap, 757 segno = find_next_zero_bit(free_i->free_segmap,
666 TOTAL_SEGS(sbi), *newseg + 1); 758 MAIN_SEGS(sbi), *newseg + 1);
667 if (segno - *newseg < sbi->segs_per_sec - 759 if (segno - *newseg < sbi->segs_per_sec -
668 (*newseg % sbi->segs_per_sec)) 760 (*newseg % sbi->segs_per_sec))
669 goto got_it; 761 goto got_it;
670 } 762 }
671find_other_zone: 763find_other_zone:
672 secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); 764 secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
673 if (secno >= TOTAL_SECS(sbi)) { 765 if (secno >= MAIN_SECS(sbi)) {
674 if (dir == ALLOC_RIGHT) { 766 if (dir == ALLOC_RIGHT) {
675 secno = find_next_zero_bit(free_i->free_secmap, 767 secno = find_next_zero_bit(free_i->free_secmap,
676 TOTAL_SECS(sbi), 0); 768 MAIN_SECS(sbi), 0);
677 f2fs_bug_on(secno >= TOTAL_SECS(sbi)); 769 f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
678 } else { 770 } else {
679 go_left = 1; 771 go_left = 1;
680 left_start = hint - 1; 772 left_start = hint - 1;
@@ -689,8 +781,8 @@ find_other_zone:
689 continue; 781 continue;
690 } 782 }
691 left_start = find_next_zero_bit(free_i->free_secmap, 783 left_start = find_next_zero_bit(free_i->free_secmap,
692 TOTAL_SECS(sbi), 0); 784 MAIN_SECS(sbi), 0);
693 f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); 785 f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
694 break; 786 break;
695 } 787 }
696 secno = left_start; 788 secno = left_start;
@@ -729,7 +821,7 @@ skip_left:
729 } 821 }
730got_it: 822got_it:
731 /* set it as dirty segment in free segmap */ 823 /* set it as dirty segment in free segmap */
732 f2fs_bug_on(test_bit(segno, free_i->free_segmap)); 824 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
733 __set_inuse(sbi, segno); 825 __set_inuse(sbi, segno);
734 *newseg = segno; 826 *newseg = segno;
735 write_unlock(&free_i->segmap_lock); 827 write_unlock(&free_i->segmap_lock);
@@ -811,7 +903,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi,
811} 903}
812 904
813/* 905/*
814 * This function always allocates a used segment (from dirty seglist) by SSR 906 * This function always allocates a used segment(from dirty seglist) by SSR
815 * manner, so it should recover the existing segment information of valid blocks 907 * manner, so it should recover the existing segment information of valid blocks
816 */ 908 */
817static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) 909static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse)
@@ -901,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = {
901 .allocate_segment = allocate_segment_by_default, 993 .allocate_segment = allocate_segment_by_default,
902}; 994};
903 995
996int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
997{
998 __u64 start = range->start >> sbi->log_blocksize;
999 __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
1000 unsigned int start_segno, end_segno;
1001 struct cp_control cpc;
1002
1003 if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) ||
1004 range->len < sbi->blocksize)
1005 return -EINVAL;
1006
1007 if (end <= MAIN_BLKADDR(sbi))
1008 goto out;
1009
1010 /* start/end segment number in main_area */
1011 start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
1012 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
1013 GET_SEGNO(sbi, end);
1014 cpc.reason = CP_DISCARD;
1015 cpc.trim_start = start_segno;
1016 cpc.trim_end = end_segno;
1017 cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
1018 cpc.trimmed = 0;
1019
1020 /* do checkpoint to issue discard commands safely */
1021 write_checkpoint(sbi, &cpc);
1022out:
1023 range->len = cpc.trimmed << sbi->log_blocksize;
1024 return 0;
1025}
1026
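For context, f2fs_trim_fs() is reached through the generic FITRIM ioctl, which is what fstrim(8) issues; on return, range->len reports the trimmed byte count that f2fs derives from cpc.trimmed. A userspace sketch (the mount point is a placeholder):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct fstrim_range range;
        int fd = open(argc > 1 ? argv[1] : "/mnt/f2fs", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }

        memset(&range, 0, sizeof(range));
        range.len = (__u64)-1;  /* whole filesystem */
        range.minlen = 0;       /* the fs may round this up */

        if (ioctl(fd, FITRIM, &range) < 0)
                perror("FITRIM");
        else
                printf("trimmed %llu bytes\n", (unsigned long long)range.len);

        close(fd);
        return 0;
}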
904static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) 1027static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
905{ 1028{
906 struct curseg_info *curseg = CURSEG_I(sbi, type); 1029 struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -956,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type)
956 1079
957static int __get_segment_type(struct page *page, enum page_type p_type) 1080static int __get_segment_type(struct page *page, enum page_type p_type)
958{ 1081{
959 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); 1082 switch (F2FS_P_SB(page)->active_logs) {
960 switch (sbi->active_logs) {
961 case 2: 1083 case 2:
962 return __get_segment_type_2(page, p_type); 1084 return __get_segment_type_2(page, p_type);
963 case 4: 1085 case 4:
964 return __get_segment_type_4(page, p_type); 1086 return __get_segment_type_4(page, p_type);
965 } 1087 }
966 /* NR_CURSEG_TYPE(6) logs by default */ 1088 /* NR_CURSEG_TYPE(6) logs by default */
967 f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); 1089 f2fs_bug_on(F2FS_P_SB(page),
1090 F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE);
968 return __get_segment_type_6(page, p_type); 1091 return __get_segment_type_6(page, p_type);
969} 1092}
970 1093
@@ -974,14 +1097,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
974{ 1097{
975 struct sit_info *sit_i = SIT_I(sbi); 1098 struct sit_info *sit_i = SIT_I(sbi);
976 struct curseg_info *curseg; 1099 struct curseg_info *curseg;
977 unsigned int old_cursegno;
978 1100
979 curseg = CURSEG_I(sbi, type); 1101 curseg = CURSEG_I(sbi, type);
980 1102
981 mutex_lock(&curseg->curseg_mutex); 1103 mutex_lock(&curseg->curseg_mutex);
982 1104
983 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 1105 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
984 old_cursegno = curseg->segno;
985 1106
986 /* 1107 /*
 987 * __add_sum_entry should reside under the curseg_mutex 1108
@@ -1002,7 +1123,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1002 * since SSR needs latest valid block information. 1123 * since SSR needs latest valid block information.
1003 */ 1124 */
1004 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); 1125 refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
1005 locate_dirty_segment(sbi, old_cursegno);
1006 1126
1007 mutex_unlock(&sit_i->sentry_lock); 1127 mutex_unlock(&sit_i->sentry_lock);
1008 1128
@@ -1047,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
1047void write_data_page(struct page *page, struct dnode_of_data *dn, 1167void write_data_page(struct page *page, struct dnode_of_data *dn,
1048 block_t *new_blkaddr, struct f2fs_io_info *fio) 1168 block_t *new_blkaddr, struct f2fs_io_info *fio)
1049{ 1169{
1050 struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); 1170 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1051 struct f2fs_summary sum; 1171 struct f2fs_summary sum;
1052 struct node_info ni; 1172 struct node_info ni;
1053 1173
1054 f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); 1174 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
1055 get_node_info(sbi, dn->nid, &ni); 1175 get_node_info(sbi, dn->nid, &ni);
1056 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1176 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1057 1177
@@ -1061,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
1061void rewrite_data_page(struct page *page, block_t old_blkaddr, 1181void rewrite_data_page(struct page *page, block_t old_blkaddr,
1062 struct f2fs_io_info *fio) 1182 struct f2fs_io_info *fio)
1063{ 1183{
1064 struct inode *inode = page->mapping->host; 1184 f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
1065 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
1066 f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio);
1067} 1185}
1068 1186
1069void recover_data_page(struct f2fs_sb_info *sbi, 1187void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1109,55 +1227,6 @@ void recover_data_page(struct f2fs_sb_info *sbi,
1109 mutex_unlock(&curseg->curseg_mutex); 1227 mutex_unlock(&curseg->curseg_mutex);
1110} 1228}
1111 1229
1112void rewrite_node_page(struct f2fs_sb_info *sbi,
1113 struct page *page, struct f2fs_summary *sum,
1114 block_t old_blkaddr, block_t new_blkaddr)
1115{
1116 struct sit_info *sit_i = SIT_I(sbi);
1117 int type = CURSEG_WARM_NODE;
1118 struct curseg_info *curseg;
1119 unsigned int segno, old_cursegno;
1120 block_t next_blkaddr = next_blkaddr_of_node(page);
1121 unsigned int next_segno = GET_SEGNO(sbi, next_blkaddr);
1122 struct f2fs_io_info fio = {
1123 .type = NODE,
1124 .rw = WRITE_SYNC,
1125 };
1126
1127 curseg = CURSEG_I(sbi, type);
1128
1129 mutex_lock(&curseg->curseg_mutex);
1130 mutex_lock(&sit_i->sentry_lock);
1131
1132 segno = GET_SEGNO(sbi, new_blkaddr);
1133 old_cursegno = curseg->segno;
1134
1135 /* change the current segment */
1136 if (segno != curseg->segno) {
1137 curseg->next_segno = segno;
1138 change_curseg(sbi, type, true);
1139 }
1140 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
1141 __add_sum_entry(sbi, type, sum);
1142
1143 /* change the current log to the next block addr in advance */
1144 if (next_segno != segno) {
1145 curseg->next_segno = next_segno;
1146 change_curseg(sbi, type, true);
1147 }
1148 curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, next_blkaddr);
1149
1150 /* rewrite node page */
1151 set_page_writeback(page);
1152 f2fs_submit_page_mbio(sbi, page, new_blkaddr, &fio);
1153 f2fs_submit_merged_bio(sbi, NODE, WRITE);
1154 refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
1155 locate_dirty_segment(sbi, old_cursegno);
1156
1157 mutex_unlock(&sit_i->sentry_lock);
1158 mutex_unlock(&curseg->curseg_mutex);
1159}
1160
1161static inline bool is_merged_page(struct f2fs_sb_info *sbi, 1230static inline bool is_merged_page(struct f2fs_sb_info *sbi,
1162 struct page *page, enum page_type type) 1231 struct page *page, enum page_type type)
1163{ 1232{
@@ -1185,8 +1254,9 @@ out:
1185void f2fs_wait_on_page_writeback(struct page *page, 1254void f2fs_wait_on_page_writeback(struct page *page,
1186 enum page_type type) 1255 enum page_type type)
1187{ 1256{
1188 struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb);
1189 if (PageWriteback(page)) { 1257 if (PageWriteback(page)) {
1258 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1259
1190 if (is_merged_page(sbi, page, type)) 1260 if (is_merged_page(sbi, page, type))
1191 f2fs_submit_merged_bio(sbi, type, WRITE); 1261 f2fs_submit_merged_bio(sbi, type, WRITE);
1192 wait_on_page_writeback(page); 1262 wait_on_page_writeback(page);
@@ -1455,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi,
1455 unsigned int segno) 1525 unsigned int segno)
1456{ 1526{
1457 struct sit_info *sit_i = SIT_I(sbi); 1527 struct sit_info *sit_i = SIT_I(sbi);
1458 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); 1528 unsigned int offset = SIT_BLOCK_OFFSET(segno);
1459 block_t blk_addr = sit_i->sit_base_addr + offset; 1529 block_t blk_addr = sit_i->sit_base_addr + offset;
1460 1530
1461 check_seg_range(sbi, segno); 1531 check_seg_range(sbi, segno);
@@ -1481,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1481 /* get current sit block page without lock */ 1551 /* get current sit block page without lock */
1482 src_page = get_meta_page(sbi, src_off); 1552 src_page = get_meta_page(sbi, src_off);
1483 dst_page = grab_meta_page(sbi, dst_off); 1553 dst_page = grab_meta_page(sbi, dst_off);
1484 f2fs_bug_on(PageDirty(src_page)); 1554 f2fs_bug_on(sbi, PageDirty(src_page));
1485 1555
1486 src_addr = page_address(src_page); 1556 src_addr = page_address(src_page);
1487 dst_addr = page_address(dst_page); 1557 dst_addr = page_address(dst_page);
@@ -1495,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi,
1495 return dst_page; 1565 return dst_page;
1496} 1566}
1497 1567
1498static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) 1568static struct sit_entry_set *grab_sit_entry_set(void)
1569{
1570 struct sit_entry_set *ses =
1571 f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC);
1572
1573 ses->entry_cnt = 0;
1574 INIT_LIST_HEAD(&ses->set_list);
1575 return ses;
1576}
1577
1578static void release_sit_entry_set(struct sit_entry_set *ses)
1579{
1580 list_del(&ses->set_list);
1581 kmem_cache_free(sit_entry_set_slab, ses);
1582}
1583
1584static void adjust_sit_entry_set(struct sit_entry_set *ses,
1585 struct list_head *head)
1586{
1587 struct sit_entry_set *next = ses;
1588
1589 if (list_is_last(&ses->set_list, head))
1590 return;
1591
1592 list_for_each_entry_continue(next, head, set_list)
1593 if (ses->entry_cnt <= next->entry_cnt)
1594 break;
1595
1596 list_move_tail(&ses->set_list, &next->set_list);
1597}
1598
1599static void add_sit_entry(unsigned int segno, struct list_head *head)
1600{
1601 struct sit_entry_set *ses;
1602 unsigned int start_segno = START_SEGNO(segno);
1603
1604 list_for_each_entry(ses, head, set_list) {
1605 if (ses->start_segno == start_segno) {
1606 ses->entry_cnt++;
1607 adjust_sit_entry_set(ses, head);
1608 return;
1609 }
1610 }
1611
1612 ses = grab_sit_entry_set();
1613
1614 ses->start_segno = start_segno;
1615 ses->entry_cnt++;
1616 list_add(&ses->set_list, head);
1617}
1618
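add_sit_entry() keys each set by the first segment of the on-disk SIT block the segment falls in, via START_SEGNO(). A small worked example; the value 55 assumes 4KB blocks, i.e. 4096 / sizeof(struct f2fs_sit_entry):

#include <stdio.h>

#define SIT_ENTRY_PER_BLOCK 55  /* assumed: 4096 / sizeof(struct f2fs_sit_entry) */

/* mirrors START_SEGNO(segno) now that the unused sit_i argument is gone */
static unsigned int start_segno(unsigned int segno)
{
        return segno / SIT_ENTRY_PER_BLOCK * SIT_ENTRY_PER_BLOCK;
}

int main(void)
{
        unsigned int samples[] = { 0, 54, 55, 137 };

        /* 0 and 54 share the set keyed at 0; 55 maps to 55; 137 to 110 */
        for (int i = 0; i < 4; i++)
                printf("segno %3u -> sit set starting at %3u\n",
                       samples[i], start_segno(samples[i]));
        return 0;
}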
1619static void add_sits_in_set(struct f2fs_sb_info *sbi)
1620{
1621 struct f2fs_sm_info *sm_info = SM_I(sbi);
1622 struct list_head *set_list = &sm_info->sit_entry_set;
1623 unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap;
1624 unsigned int segno;
1625
1626 for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi))
1627 add_sit_entry(segno, set_list);
1628}
1629
1630static void remove_sits_in_journal(struct f2fs_sb_info *sbi)
1499{ 1631{
1500 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1632 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1501 struct f2fs_summary_block *sum = curseg->sum_blk; 1633 struct f2fs_summary_block *sum = curseg->sum_blk;
1502 int i; 1634 int i;
1503 1635
1504 /* 1636 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) {
1505 * If the journal area in the current summary is full of sit entries, 1637 unsigned int segno;
1506 * all the sit entries will be flushed. Otherwise the sit entries 1638 bool dirtied;
1507 * are not able to replace with newly hot sit entries. 1639
1508 */ 1640 segno = le32_to_cpu(segno_in_journal(sum, i));
1509 if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { 1641 dirtied = __mark_sit_entry_dirty(sbi, segno);
1510 for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { 1642
1511 unsigned int segno; 1643 if (!dirtied)
1512 segno = le32_to_cpu(segno_in_journal(sum, i)); 1644 add_sit_entry(segno, &SM_I(sbi)->sit_entry_set);
1513 __mark_sit_entry_dirty(sbi, segno);
1514 }
1515 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1516 return true;
1517 } 1645 }
1518 return false; 1646 update_sits_in_cursum(sum, -sits_in_cursum(sum));
1519} 1647}
1520 1648
1521/* 1649/*
1522 * CP calls this function, which flushes SIT entries including sit_journal, 1650 * CP calls this function, which flushes SIT entries including sit_journal,
1523 * and moves prefree segs to free segs. 1651 * and moves prefree segs to free segs.
1524 */ 1652 */
1525void flush_sit_entries(struct f2fs_sb_info *sbi) 1653void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1526{ 1654{
1527 struct sit_info *sit_i = SIT_I(sbi); 1655 struct sit_info *sit_i = SIT_I(sbi);
1528 unsigned long *bitmap = sit_i->dirty_sentries_bitmap; 1656 unsigned long *bitmap = sit_i->dirty_sentries_bitmap;
1529 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); 1657 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA);
1530 struct f2fs_summary_block *sum = curseg->sum_blk; 1658 struct f2fs_summary_block *sum = curseg->sum_blk;
1531 unsigned long nsegs = TOTAL_SEGS(sbi); 1659 struct sit_entry_set *ses, *tmp;
1532 struct page *page = NULL; 1660 struct list_head *head = &SM_I(sbi)->sit_entry_set;
1533 struct f2fs_sit_block *raw_sit = NULL; 1661 bool to_journal = true;
1534 unsigned int start = 0, end = 0; 1662 struct seg_entry *se;
1535 unsigned int segno = -1;
1536 bool flushed;
1537 1663
1538 mutex_lock(&curseg->curseg_mutex); 1664 mutex_lock(&curseg->curseg_mutex);
1539 mutex_lock(&sit_i->sentry_lock); 1665 mutex_lock(&sit_i->sentry_lock);
1540 1666
1541 /* 1667 /*
1542 * "flushed" indicates whether sit entries in journal are flushed 1668 * add and account sit entries of dirty bitmap in sit entry
1543 * to the SIT area or not. 1669 * set temporarily
1670 */
1671 add_sits_in_set(sbi);
1672
1673 /*
 1674 * if there is not enough space in the journal to store dirty sit
 1675 * entries, remove all entries from the journal and account for
 1676 * them in the sit entry set.
1544 */ 1677 */
1545 flushed = flush_sits_in_journal(sbi); 1678 if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL))
1679 remove_sits_in_journal(sbi);
1546 1680
1547 while ((segno = find_next_bit(bitmap, nsegs, segno + 1)) < nsegs) { 1681 if (!sit_i->dirty_sentries)
1548 struct seg_entry *se = get_seg_entry(sbi, segno); 1682 goto out;
1549 int sit_offset, offset;
1550 1683
1551 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); 1684 /*
1685 * there are two steps to flush sit entries:
1686 * #1, flush sit entries to journal in current cold data summary block.
1687 * #2, flush sit entries to sit page.
1688 */
1689 list_for_each_entry_safe(ses, tmp, head, set_list) {
1690 struct page *page;
1691 struct f2fs_sit_block *raw_sit = NULL;
1692 unsigned int start_segno = ses->start_segno;
1693 unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK,
1694 (unsigned long)MAIN_SEGS(sbi));
1695 unsigned int segno = start_segno;
1696
1697 if (to_journal &&
1698 !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL))
1699 to_journal = false;
1700
1701 if (!to_journal) {
1702 page = get_next_sit_page(sbi, start_segno);
1703 raw_sit = page_address(page);
1704 }
1552 1705
1553 /* add discard candidates */ 1706 /* flush dirty sit entries in region of current sit set */
1554 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) 1707 for_each_set_bit_from(segno, bitmap, end) {
1555 add_discard_addrs(sbi, segno, se); 1708 int offset, sit_offset;
1556 1709
1557 if (flushed) 1710 se = get_seg_entry(sbi, segno);
1558 goto to_sit_page;
1559 1711
1560 offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); 1712 /* add discard candidates */
1561 if (offset >= 0) { 1713 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
1562 segno_in_journal(sum, offset) = cpu_to_le32(segno); 1714 cpc->trim_start = segno;
1563 seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); 1715 add_discard_addrs(sbi, cpc);
1564 goto flush_done;
1565 }
1566to_sit_page:
1567 if (!page || (start > segno) || (segno > end)) {
1568 if (page) {
1569 f2fs_put_page(page, 1);
1570 page = NULL;
1571 } 1716 }
1572 1717
1573 start = START_SEGNO(sit_i, segno); 1718 if (to_journal) {
1574 end = start + SIT_ENTRY_PER_BLOCK - 1; 1719 offset = lookup_journal_in_cursum(sum,
1720 SIT_JOURNAL, segno, 1);
1721 f2fs_bug_on(sbi, offset < 0);
1722 segno_in_journal(sum, offset) =
1723 cpu_to_le32(segno);
1724 seg_info_to_raw_sit(se,
1725 &sit_in_journal(sum, offset));
1726 } else {
1727 sit_offset = SIT_ENTRY_OFFSET(sit_i, segno);
1728 seg_info_to_raw_sit(se,
1729 &raw_sit->entries[sit_offset]);
1730 }
1575 1731
1576 /* read sit block that will be updated */ 1732 __clear_bit(segno, bitmap);
1577 page = get_next_sit_page(sbi, start); 1733 sit_i->dirty_sentries--;
1578 raw_sit = page_address(page); 1734 ses->entry_cnt--;
1579 } 1735 }
1580 1736
 1581 /* update entry in SIT block */ 1737 if (!to_journal)
1582 seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); 1738 f2fs_put_page(page, 1);
1583flush_done: 1739
1584 __clear_bit(segno, bitmap); 1740 f2fs_bug_on(sbi, ses->entry_cnt);
1585 sit_i->dirty_sentries--; 1741 release_sit_entry_set(ses);
1742 }
1743
1744 f2fs_bug_on(sbi, !list_empty(head));
1745 f2fs_bug_on(sbi, sit_i->dirty_sentries);
1746out:
1747 if (cpc->reason == CP_DISCARD) {
1748 for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++)
1749 add_discard_addrs(sbi, cpc);
1586 } 1750 }
1587 mutex_unlock(&sit_i->sentry_lock); 1751 mutex_unlock(&sit_i->sentry_lock);
1588 mutex_unlock(&curseg->curseg_mutex); 1752 mutex_unlock(&curseg->curseg_mutex);
1589 1753
1590 /* writeout last modified SIT block */
1591 f2fs_put_page(page, 1);
1592
1593 set_prefree_as_free_segments(sbi); 1754 set_prefree_as_free_segments(sbi);
1594} 1755}
1595 1756
@@ -1609,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1609 1770
1610 SM_I(sbi)->sit_info = sit_i; 1771 SM_I(sbi)->sit_info = sit_i;
1611 1772
1612 sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); 1773 sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
1613 if (!sit_i->sentries) 1774 if (!sit_i->sentries)
1614 return -ENOMEM; 1775 return -ENOMEM;
1615 1776
1616 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1777 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1617 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); 1778 sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
1618 if (!sit_i->dirty_sentries_bitmap) 1779 if (!sit_i->dirty_sentries_bitmap)
1619 return -ENOMEM; 1780 return -ENOMEM;
1620 1781
1621 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1782 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1622 sit_i->sentries[start].cur_valid_map 1783 sit_i->sentries[start].cur_valid_map
1623 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); 1784 = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1624 sit_i->sentries[start].ckpt_valid_map 1785 sit_i->sentries[start].ckpt_valid_map
@@ -1629,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1629 } 1790 }
1630 1791
1631 if (sbi->segs_per_sec > 1) { 1792 if (sbi->segs_per_sec > 1) {
1632 sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * 1793 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
1633 sizeof(struct sec_entry)); 1794 sizeof(struct sec_entry));
1634 if (!sit_i->sec_entries) 1795 if (!sit_i->sec_entries)
1635 return -ENOMEM; 1796 return -ENOMEM;
@@ -1664,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1664 1825
1665static int build_free_segmap(struct f2fs_sb_info *sbi) 1826static int build_free_segmap(struct f2fs_sb_info *sbi)
1666{ 1827{
1667 struct f2fs_sm_info *sm_info = SM_I(sbi);
1668 struct free_segmap_info *free_i; 1828 struct free_segmap_info *free_i;
1669 unsigned int bitmap_size, sec_bitmap_size; 1829 unsigned int bitmap_size, sec_bitmap_size;
1670 1830
@@ -1675,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1675 1835
1676 SM_I(sbi)->free_info = free_i; 1836 SM_I(sbi)->free_info = free_i;
1677 1837
1678 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1838 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1679 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); 1839 free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
1680 if (!free_i->free_segmap) 1840 if (!free_i->free_segmap)
1681 return -ENOMEM; 1841 return -ENOMEM;
1682 1842
1683 sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1843 sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1684 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); 1844 free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
1685 if (!free_i->free_secmap) 1845 if (!free_i->free_secmap)
1686 return -ENOMEM; 1846 return -ENOMEM;
@@ -1690,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1690 memset(free_i->free_secmap, 0xff, sec_bitmap_size); 1850 memset(free_i->free_secmap, 0xff, sec_bitmap_size);
1691 1851
1692 /* init free segmap information */ 1852 /* init free segmap information */
1693 free_i->start_segno = 1853 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
1694 (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr);
1695 free_i->free_segments = 0; 1854 free_i->free_segments = 0;
1696 free_i->free_sections = 0; 1855 free_i->free_sections = 0;
1697 rwlock_init(&free_i->segmap_lock); 1856 rwlock_init(&free_i->segmap_lock);
@@ -1703,7 +1862,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
1703 struct curseg_info *array; 1862 struct curseg_info *array;
1704 int i; 1863 int i;
1705 1864
1706 array = kzalloc(sizeof(*array) * NR_CURSEG_TYPE, GFP_KERNEL); 1865 array = kcalloc(NR_CURSEG_TYPE, sizeof(*array), GFP_KERNEL);
1707 if (!array) 1866 if (!array)
1708 return -ENOMEM; 1867 return -ENOMEM;
1709 1868
@@ -1728,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1728 int sit_blk_cnt = SIT_BLK_CNT(sbi); 1887 int sit_blk_cnt = SIT_BLK_CNT(sbi);
1729 unsigned int i, start, end; 1888 unsigned int i, start, end;
1730 unsigned int readed, start_blk = 0; 1889 unsigned int readed, start_blk = 0;
1731 int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 1890 int nrpages = MAX_BIO_BLOCKS(sbi);
1732 1891
1733 do { 1892 do {
1734 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); 1893 readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
@@ -1736,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
1736 start = start_blk * sit_i->sents_per_block; 1895 start = start_blk * sit_i->sents_per_block;
1737 end = (start_blk + readed) * sit_i->sents_per_block; 1896 end = (start_blk + readed) * sit_i->sents_per_block;
1738 1897
1739 for (; start < end && start < TOTAL_SEGS(sbi); start++) { 1898 for (; start < end && start < MAIN_SEGS(sbi); start++) {
1740 struct seg_entry *se = &sit_i->sentries[start]; 1899 struct seg_entry *se = &sit_i->sentries[start];
1741 struct f2fs_sit_block *sit_blk; 1900 struct f2fs_sit_block *sit_blk;
1742 struct f2fs_sit_entry sit; 1901 struct f2fs_sit_entry sit;
@@ -1774,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi)
1774 unsigned int start; 1933 unsigned int start;
1775 int type; 1934 int type;
1776 1935
1777 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 1936 for (start = 0; start < MAIN_SEGS(sbi); start++) {
1778 struct seg_entry *sentry = get_seg_entry(sbi, start); 1937 struct seg_entry *sentry = get_seg_entry(sbi, start);
1779 if (!sentry->valid_blocks) 1938 if (!sentry->valid_blocks)
1780 __set_free(sbi, start); 1939 __set_free(sbi, start);
@@ -1791,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1791{ 1950{
1792 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1951 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1793 struct free_segmap_info *free_i = FREE_I(sbi); 1952 struct free_segmap_info *free_i = FREE_I(sbi);
1794 unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); 1953 unsigned int segno = 0, offset = 0;
1795 unsigned short valid_blocks; 1954 unsigned short valid_blocks;
1796 1955
1797 while (1) { 1956 while (1) {
1798 /* find dirty segment based on free segmap */ 1957 /* find dirty segment based on free segmap */
1799 segno = find_next_inuse(free_i, total_segs, offset); 1958 segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset);
1800 if (segno >= total_segs) 1959 if (segno >= MAIN_SEGS(sbi))
1801 break; 1960 break;
1802 offset = segno + 1; 1961 offset = segno + 1;
1803 valid_blocks = get_valid_blocks(sbi, segno, 0); 1962 valid_blocks = get_valid_blocks(sbi, segno, 0);
1804 if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) 1963 if (valid_blocks == sbi->blocks_per_seg || !valid_blocks)
1964 continue;
1965 if (valid_blocks > sbi->blocks_per_seg) {
1966 f2fs_bug_on(sbi, 1);
1805 continue; 1967 continue;
1968 }
1806 mutex_lock(&dirty_i->seglist_lock); 1969 mutex_lock(&dirty_i->seglist_lock);
1807 __locate_dirty_segment(sbi, segno, DIRTY); 1970 __locate_dirty_segment(sbi, segno, DIRTY);
1808 mutex_unlock(&dirty_i->seglist_lock); 1971 mutex_unlock(&dirty_i->seglist_lock);
@@ -1812,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
1812static int init_victim_secmap(struct f2fs_sb_info *sbi) 1975static int init_victim_secmap(struct f2fs_sb_info *sbi)
1813{ 1976{
1814 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); 1977 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
1815 unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); 1978 unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
1816 1979
1817 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); 1980 dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
1818 if (!dirty_i->victim_secmap) 1981 if (!dirty_i->victim_secmap)
@@ -1833,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
1833 SM_I(sbi)->dirty_info = dirty_i; 1996 SM_I(sbi)->dirty_info = dirty_i;
1834 mutex_init(&dirty_i->seglist_lock); 1997 mutex_init(&dirty_i->seglist_lock);
1835 1998
1836 bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); 1999 bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
1837 2000
1838 for (i = 0; i < NR_DIRTY_TYPE; i++) { 2001 for (i = 0; i < NR_DIRTY_TYPE; i++) {
1839 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); 2002 dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
@@ -1857,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
1857 2020
1858 sit_i->min_mtime = LLONG_MAX; 2021 sit_i->min_mtime = LLONG_MAX;
1859 2022
1860 for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { 2023 for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
1861 unsigned int i; 2024 unsigned int i;
1862 unsigned long long mtime = 0; 2025 unsigned long long mtime = 0;
1863 2026
@@ -1895,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
1895 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); 2058 sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
1896 sm_info->rec_prefree_segments = sm_info->main_segments * 2059 sm_info->rec_prefree_segments = sm_info->main_segments *
1897 DEF_RECLAIM_PREFREE_SEGMENTS / 100; 2060 DEF_RECLAIM_PREFREE_SEGMENTS / 100;
1898 sm_info->ipu_policy = F2FS_IPU_DISABLE; 2061 sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
1899 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; 2062 sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
2063 sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
1900 2064
1901 INIT_LIST_HEAD(&sm_info->discard_list); 2065 INIT_LIST_HEAD(&sm_info->discard_list);
1902 sm_info->nr_discards = 0; 2066 sm_info->nr_discards = 0;
1903 sm_info->max_discards = 0; 2067 sm_info->max_discards = 0;
1904 2068
2069 INIT_LIST_HEAD(&sm_info->sit_entry_set);
2070
1905 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { 2071 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
1906 err = create_flush_cmd_control(sbi); 2072 err = create_flush_cmd_control(sbi);
1907 if (err) 2073 if (err)
@@ -1997,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
1997 return; 2163 return;
1998 2164
1999 if (sit_i->sentries) { 2165 if (sit_i->sentries) {
2000 for (start = 0; start < TOTAL_SEGS(sbi); start++) { 2166 for (start = 0; start < MAIN_SEGS(sbi); start++) {
2001 kfree(sit_i->sentries[start].cur_valid_map); 2167 kfree(sit_i->sentries[start].cur_valid_map);
2002 kfree(sit_i->sentries[start].ckpt_valid_map); 2168 kfree(sit_i->sentries[start].ckpt_valid_map);
2003 } 2169 }
@@ -2031,11 +2197,30 @@ int __init create_segment_manager_caches(void)
2031 discard_entry_slab = f2fs_kmem_cache_create("discard_entry", 2197 discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
2032 sizeof(struct discard_entry)); 2198 sizeof(struct discard_entry));
2033 if (!discard_entry_slab) 2199 if (!discard_entry_slab)
2034 return -ENOMEM; 2200 goto fail;
2201
2202 sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
 2203 sizeof(struct sit_entry_set));
2204 if (!sit_entry_set_slab)
 2205 goto destroy_discard_entry;
2206
2207 inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
2208 sizeof(struct inmem_pages));
2209 if (!inmem_entry_slab)
2210 goto destroy_sit_entry_set;
2035 return 0; 2211 return 0;
2212
2213destroy_sit_entry_set:
2214 kmem_cache_destroy(sit_entry_set_slab);
 2215destroy_discard_entry:
2216 kmem_cache_destroy(discard_entry_slab);
2217fail:
2218 return -ENOMEM;
2036} 2219}
2037 2220
2038void destroy_segment_manager_caches(void) 2221void destroy_segment_manager_caches(void)
2039{ 2222{
2223 kmem_cache_destroy(sit_entry_set_slab);
2040 kmem_cache_destroy(discard_entry_slab); 2224 kmem_cache_destroy(discard_entry_slab);
2225 kmem_cache_destroy(inmem_entry_slab);
2041} 2226}
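The reworked error path above follows the usual kernel goto-unwind idiom: each successful allocation gains a label that releases it, and a later failure jumps to the label of the last step that succeeded. A compact userspace sketch of the same shape (names illustrative):

#include <stdlib.h>

int init_three(void)
{
        void *a, *b, *c;

        a = malloc(16);
        if (!a)
                goto fail;
        b = malloc(16);
        if (!b)
                goto free_a;
        c = malloc(16);
        if (!c)
                goto free_b;

        /* ... use a, b, c ... */
        free(c);
        free(b);
        free(a);
        return 0;

free_b:
        free(b);
free_a:
        free(a);
fail:
        return -1;
}

int main(void)
{
        return init_three() ? 1 : 0;
}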
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7091204680f4..2495bec1c621 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -45,16 +45,26 @@
45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ 45 (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \
46 sbi->segs_per_sec)) \ 46 sbi->segs_per_sec)) \
47 47
48#define START_BLOCK(sbi, segno) \ 48#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr)
49 (SM_I(sbi)->seg0_blkaddr + \ 49#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr)
50
51#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments)
52#define MAIN_SECS(sbi) (sbi->total_sections)
53
54#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count)
55#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg)
56
57#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
58#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \
59 sbi->log_blocks_per_seg))
60
61#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \
50 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) 62 (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg))
63
51#define NEXT_FREE_BLKADDR(sbi, curseg) \ 64#define NEXT_FREE_BLKADDR(sbi, curseg) \
52 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) 65 (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff)
53 66
54#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) 67#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi))
55
56#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \
57 ((blk_addr) - SM_I(sbi)->seg0_blkaddr)
58#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ 68#define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \
59 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) 69 (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg)
60#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ 70#define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \
@@ -77,23 +87,21 @@
77 87
78#define SIT_ENTRY_OFFSET(sit_i, segno) \ 88#define SIT_ENTRY_OFFSET(sit_i, segno) \
79 (segno % sit_i->sents_per_block) 89 (segno % sit_i->sents_per_block)
80#define SIT_BLOCK_OFFSET(sit_i, segno) \ 90#define SIT_BLOCK_OFFSET(segno) \
81 (segno / SIT_ENTRY_PER_BLOCK) 91 (segno / SIT_ENTRY_PER_BLOCK)
82#define START_SEGNO(sit_i, segno) \ 92#define START_SEGNO(segno) \
83 (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) 93 (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK)
84#define SIT_BLK_CNT(sbi) \ 94#define SIT_BLK_CNT(sbi) \
85 ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) 95 ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK)
86#define f2fs_bitmap_size(nr) \ 96#define f2fs_bitmap_size(nr) \
87 (BITS_TO_LONGS(nr) * sizeof(unsigned long)) 97 (BITS_TO_LONGS(nr) * sizeof(unsigned long))
88#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments)
89#define TOTAL_SECS(sbi) (sbi->total_sections)
90 98
91#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ 99#define SECTOR_FROM_BLOCK(blk_addr) \
92 (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) 100 (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
93#define SECTOR_TO_BLOCK(sbi, sectors) \ 101#define SECTOR_TO_BLOCK(sectors) \
94 (sectors >> (sbi)->log_sectors_per_block) 102 (sectors >> F2FS_LOG_SECTORS_PER_BLOCK)
95#define MAX_BIO_BLOCKS(max_hw_blocks) \ 103#define MAX_BIO_BLOCKS(sbi) \
96 (min((int)max_hw_blocks, BIO_MAX_PAGES)) 104 ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES))
97 105
98/* 106/*
99 * indicate a block allocation direction: RIGHT and LEFT. 107 * indicate a block allocation direction: RIGHT and LEFT.
@@ -167,6 +175,11 @@ struct segment_allocation {
167 void (*allocate_segment)(struct f2fs_sb_info *, int, bool); 175 void (*allocate_segment)(struct f2fs_sb_info *, int, bool);
168}; 176};
169 177
178struct inmem_pages {
179 struct list_head list;
180 struct page *page;
181};
182
170struct sit_info { 183struct sit_info {
171 const struct segment_allocation *s_ops; 184 const struct segment_allocation *s_ops;
172 185
@@ -237,6 +250,12 @@ struct curseg_info {
237 unsigned int next_segno; /* preallocated segment */ 250 unsigned int next_segno; /* preallocated segment */
238}; 251};
239 252
253struct sit_entry_set {
254 struct list_head set_list; /* link with all sit sets */
255 unsigned int start_segno; /* start segno of sits in set */
256 unsigned int entry_cnt; /* the # of sit entries in set */
257};
258
240/* 259/*
241 * inline functions 260 * inline functions
242 */ 261 */
@@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
316 clear_bit(segno, free_i->free_segmap); 335 clear_bit(segno, free_i->free_segmap);
317 free_i->free_segments++; 336 free_i->free_segments++;
318 337
319 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); 338 next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno);
320 if (next >= start_segno + sbi->segs_per_sec) { 339 if (next >= start_segno + sbi->segs_per_sec) {
321 clear_bit(secno, free_i->free_secmap); 340 clear_bit(secno, free_i->free_secmap);
322 free_i->free_sections++; 341 free_i->free_sections++;
@@ -347,8 +366,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
347 if (test_and_clear_bit(segno, free_i->free_segmap)) { 366 if (test_and_clear_bit(segno, free_i->free_segmap)) {
348 free_i->free_segments++; 367 free_i->free_segments++;
349 368
350 next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), 369 next = find_next_bit(free_i->free_segmap,
351 start_segno); 370 start_segno + sbi->segs_per_sec, start_segno);
352 if (next >= start_segno + sbi->segs_per_sec) { 371 if (next >= start_segno + sbi->segs_per_sec) {
353 if (test_and_clear_bit(secno, free_i->free_secmap)) 372 if (test_and_clear_bit(secno, free_i->free_secmap))
354 free_i->free_sections++; 373 free_i->free_sections++;
@@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
430 449
431static inline bool need_SSR(struct f2fs_sb_info *sbi) 450static inline bool need_SSR(struct f2fs_sb_info *sbi)
432{ 451{
433 return (prefree_segments(sbi) / sbi->segs_per_sec) 452 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
434 + free_sections(sbi) < overprovision_sections(sbi); 453 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
454 return free_sections(sbi) <= (node_secs + 2 * dent_secs +
455 reserved_sections(sbi) + 1);
435} 456}
436 457
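The new need_SSR() heuristic keys off projected section demand instead of the prefree count. A worked instance with assumed counts (2 node sections, 1 dentry section, 6 reserved sections):

#include <stdbool.h>
#include <stdio.h>

static bool need_ssr(int free_secs, int node_secs, int dent_secs,
                     int reserved_secs)
{
        /* mirrors: free_sections(sbi) <= node_secs + 2 * dent_secs +
         *          reserved_sections(sbi) + 1 */
        return free_secs <= node_secs + 2 * dent_secs + reserved_secs + 1;
}

int main(void)
{
        /* threshold here is 2 + 2*1 + 6 + 1 = 11 free sections */
        printf("free=12 -> need_SSR %d\n", need_ssr(12, 2, 1, 6)); /* 0 */
        printf("free=11 -> need_SSR %d\n", need_ssr(11, 2, 1, 6)); /* 1 */
        return 0;
}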
437static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) 458static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
@@ -466,44 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi)
 466 * F2FS_IPU_UTIL - if FS utilization is over threshold, 487
 467 * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over 488
 468 * threshold, 489
490 * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash
 491 * storages. IPU will be triggered only if the # of dirty
 492 * pages exceeds min_fsync_blocks.
 469 * F2FS_IPU_DISABLE - disable IPU. (=default option) 493
470 */ 494 */
471#define DEF_MIN_IPU_UTIL 70 495#define DEF_MIN_IPU_UTIL 70
496#define DEF_MIN_FSYNC_BLOCKS 8
472 497
473enum { 498enum {
474 F2FS_IPU_FORCE, 499 F2FS_IPU_FORCE,
475 F2FS_IPU_SSR, 500 F2FS_IPU_SSR,
476 F2FS_IPU_UTIL, 501 F2FS_IPU_UTIL,
477 F2FS_IPU_SSR_UTIL, 502 F2FS_IPU_SSR_UTIL,
478 F2FS_IPU_DISABLE, 503 F2FS_IPU_FSYNC,
479}; 504};
480 505
481static inline bool need_inplace_update(struct inode *inode) 506static inline bool need_inplace_update(struct inode *inode)
482{ 507{
483 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 508 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
509 unsigned int policy = SM_I(sbi)->ipu_policy;
484 510
485 /* IPU can be done only for the user data */ 511 /* IPU can be done only for the user data */
486 if (S_ISDIR(inode->i_mode)) 512 if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode))
487 return false; 513 return false;
488 514
489 switch (SM_I(sbi)->ipu_policy) { 515 if (policy & (0x1 << F2FS_IPU_FORCE))
490 case F2FS_IPU_FORCE:
491 return true; 516 return true;
492 case F2FS_IPU_SSR: 517 if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi))
493 if (need_SSR(sbi)) 518 return true;
494 return true; 519 if (policy & (0x1 << F2FS_IPU_UTIL) &&
495 break; 520 utilization(sbi) > SM_I(sbi)->min_ipu_util)
496 case F2FS_IPU_UTIL: 521 return true;
497 if (utilization(sbi) > SM_I(sbi)->min_ipu_util) 522 if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) &&
498 return true; 523 utilization(sbi) > SM_I(sbi)->min_ipu_util)
499 break; 524 return true;
500 case F2FS_IPU_SSR_UTIL: 525
501 if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) 526 /* this is only set during fdatasync */
502 return true; 527 if (policy & (0x1 << F2FS_IPU_FSYNC) &&
503 break; 528 is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU))
504 case F2FS_IPU_DISABLE: 529 return true;
505 break; 530
506 }
507 return false; 531 return false;
508} 532}
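need_inplace_update() now treats ipu_policy as a bitmask rather than a single enum value, so several policies can be active at once, and the new default is expressed as 1 << F2FS_IPU_FSYNC. A minimal stand-in for the combination logic; the flag values mirror the enum above and the inputs are illustrative:

#include <stdbool.h>
#include <stdio.h>

enum {
        F2FS_IPU_FORCE,
        F2FS_IPU_SSR,
        F2FS_IPU_UTIL,
        F2FS_IPU_SSR_UTIL,
        F2FS_IPU_FSYNC,
};

static bool ipu_allowed(unsigned int policy, bool ssr, int util,
                        int min_util, bool fsync_marked)
{
        if (policy & (1 << F2FS_IPU_FORCE))
                return true;
        if (policy & (1 << F2FS_IPU_SSR) && ssr)
                return true;
        if (policy & (1 << F2FS_IPU_UTIL) && util > min_util)
                return true;
        if (policy & (1 << F2FS_IPU_SSR_UTIL) && ssr && util > min_util)
                return true;
        if (policy & (1 << F2FS_IPU_FSYNC) && fsync_marked)
                return true;
        return false;
}

int main(void)
{
        unsigned int policy = 1 << F2FS_IPU_FSYNC;      /* the new default */

        printf("fsync-marked page: %d\n",
               ipu_allowed(policy, false, 50, 70, true));       /* 1 */
        printf("ordinary page:     %d\n",
               ipu_allowed(policy, false, 50, 70, false));      /* 0 */
        return 0;
}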
509 533
@@ -530,28 +554,21 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type)
530#ifdef CONFIG_F2FS_CHECK_FS 554#ifdef CONFIG_F2FS_CHECK_FS
531static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) 555static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
532{ 556{
533 unsigned int end_segno = SM_I(sbi)->segment_count - 1; 557 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
534 BUG_ON(segno > end_segno);
535} 558}
536 559
537static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) 560static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
538{ 561{
539 struct f2fs_sm_info *sm_info = SM_I(sbi); 562 BUG_ON(blk_addr < SEG0_BLKADDR(sbi));
540 block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; 563 BUG_ON(blk_addr >= MAX_BLKADDR(sbi));
541 block_t start_addr = sm_info->seg0_blkaddr;
542 block_t end_addr = start_addr + total_blks - 1;
543 BUG_ON(blk_addr < start_addr);
544 BUG_ON(blk_addr > end_addr);
545} 564}
546 565
547/* 566/*
548 * Summary block is always treated as invalid block 567 * Summary block is always treated as an invalid block
549 */ 568 */
550static inline void check_block_count(struct f2fs_sb_info *sbi, 569static inline void check_block_count(struct f2fs_sb_info *sbi,
551 int segno, struct f2fs_sit_entry *raw_sit) 570 int segno, struct f2fs_sit_entry *raw_sit)
552{ 571{
553 struct f2fs_sm_info *sm_info = SM_I(sbi);
554 unsigned int end_segno = sm_info->segment_count - 1;
555 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; 572 bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false;
556 int valid_blocks = 0; 573 int valid_blocks = 0;
557 int cur_pos = 0, next_pos; 574 int cur_pos = 0, next_pos;
@@ -560,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
560 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); 577 BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg);
561 578
562 /* check boundary of a given segment number */ 579 /* check boundary of a given segment number */
563 BUG_ON(segno > end_segno); 580 BUG_ON(segno > TOTAL_SEGS(sbi) - 1);
564 581
565 /* check bitmap with valid block count */ 582 /* check bitmap with valid block count */
566 do { 583 do {
@@ -579,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
579 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); 596 BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks);
580} 597}
581#else 598#else
582#define check_seg_range(sbi, segno) 599static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
583#define verify_block_addr(sbi, blk_addr) 600{
584#define check_block_count(sbi, segno, raw_sit) 601 if (segno > TOTAL_SEGS(sbi) - 1)
602 sbi->need_fsck = true;
603}
604
605static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
606{
607 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
608 sbi->need_fsck = true;
609}
610
611/*
612 * Summary block is always treated as an invalid block
613 */
614static inline void check_block_count(struct f2fs_sb_info *sbi,
615 int segno, struct f2fs_sit_entry *raw_sit)
616{
617 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true;
620
621 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true;
624}
585#endif 625#endif
586 626
587static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, 627static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi,
588 unsigned int start) 628 unsigned int start)
589{ 629{
590 struct sit_info *sit_i = SIT_I(sbi); 630 struct sit_info *sit_i = SIT_I(sbi);
591 unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); 631 unsigned int offset = SIT_BLOCK_OFFSET(start);
592 block_t blk_addr = sit_i->sit_base_addr + offset; 632 block_t blk_addr = sit_i->sit_base_addr + offset;
593 633
594 check_seg_range(sbi, start); 634 check_seg_range(sbi, start);
@@ -615,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi,
615 655
616static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) 656static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start)
617{ 657{
618 unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); 658 unsigned int block_off = SIT_BLOCK_OFFSET(start);
619 659
620 if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) 660 if (f2fs_test_bit(block_off, sit_i->sit_bitmap))
621 f2fs_clear_bit(block_off, sit_i->sit_bitmap); 661 f2fs_clear_bit(block_off, sit_i->sit_bitmap);
@@ -662,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
662{ 702{
663 struct block_device *bdev = sbi->sb->s_bdev; 703 struct block_device *bdev = sbi->sb->s_bdev;
664 struct request_queue *q = bdev_get_queue(bdev); 704 struct request_queue *q = bdev_get_queue(bdev);
665 return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); 705 return SECTOR_TO_BLOCK(queue_max_sectors(q));
666} 706}
667 707
668/* 708/*
@@ -679,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
679 else if (type == NODE) 719 else if (type == NODE)
680 return 3 * sbi->blocks_per_seg; 720 return 3 * sbi->blocks_per_seg;
681 else if (type == META) 721 else if (type == META)
682 return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 722 return MAX_BIO_BLOCKS(sbi);
683 else 723 else
684 return 0; 724 return 0;
685} 725}
@@ -702,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type,
702 else if (type == NODE) 742 else if (type == NODE)
703 desired = 3 * max_hw_blocks(sbi); 743 desired = 3 * max_hw_blocks(sbi);
704 else 744 else
705 desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); 745 desired = MAX_BIO_BLOCKS(sbi);
706 746
707 wbc->nr_to_write = desired; 747 wbc->nr_to_write = desired;
708 return desired - nr_to_write; 748 return desired - nr_to_write;
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 8f96d9372ade..41d6f700f4ee 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -52,6 +52,7 @@ enum {
52 Opt_inline_xattr, 52 Opt_inline_xattr,
53 Opt_inline_data, 53 Opt_inline_data,
54 Opt_flush_merge, 54 Opt_flush_merge,
55 Opt_nobarrier,
55 Opt_err, 56 Opt_err,
56}; 57};
57 58
@@ -69,6 +70,7 @@ static match_table_t f2fs_tokens = {
69 {Opt_inline_xattr, "inline_xattr"}, 70 {Opt_inline_xattr, "inline_xattr"},
70 {Opt_inline_data, "inline_data"}, 71 {Opt_inline_data, "inline_data"},
71 {Opt_flush_merge, "flush_merge"}, 72 {Opt_flush_merge, "flush_merge"},
73 {Opt_nobarrier, "nobarrier"},
72 {Opt_err, NULL}, 74 {Opt_err, NULL},
73}; 75};
74 76
@@ -188,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
188F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
189F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 191F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
190F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 192F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
191F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); 194F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
192F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); 195F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
193F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); 196F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
@@ -202,6 +205,7 @@ static struct attribute *f2fs_attrs[] = {
202 ATTR_LIST(max_small_discards), 205 ATTR_LIST(max_small_discards),
203 ATTR_LIST(ipu_policy), 206 ATTR_LIST(ipu_policy),
204 ATTR_LIST(min_ipu_util), 207 ATTR_LIST(min_ipu_util),
208 ATTR_LIST(min_fsync_blocks),
205 ATTR_LIST(max_victim_search), 209 ATTR_LIST(max_victim_search),
206 ATTR_LIST(dir_level), 210 ATTR_LIST(dir_level),
207 ATTR_LIST(ram_thresh), 211 ATTR_LIST(ram_thresh),
@@ -339,6 +343,9 @@ static int parse_options(struct super_block *sb, char *options)
339 case Opt_flush_merge: 343 case Opt_flush_merge:
340 set_opt(sbi, FLUSH_MERGE); 344 set_opt(sbi, FLUSH_MERGE);
341 break; 345 break;
346 case Opt_nobarrier:
347 set_opt(sbi, NOBARRIER);
348 break;
342 default: 349 default:
343 f2fs_msg(sb, KERN_ERR, 350 f2fs_msg(sb, KERN_ERR,
344 "Unrecognized mount option \"%s\" or missing value", 351 "Unrecognized mount option \"%s\" or missing value",
@@ -361,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
361 368
362 /* Initialize f2fs-specific inode info */ 369 /* Initialize f2fs-specific inode info */
363 fi->vfs_inode.i_version = 1; 370 fi->vfs_inode.i_version = 1;
364 atomic_set(&fi->dirty_dents, 0); 371 atomic_set(&fi->dirty_pages, 0);
365 fi->i_current_depth = 1; 372 fi->i_current_depth = 1;
366 fi->i_advise = 0; 373 fi->i_advise = 0;
367 rwlock_init(&fi->ext.ext_lock); 374 rwlock_init(&fi->ext.ext_lock);
368 init_rwsem(&fi->i_sem); 375 init_rwsem(&fi->i_sem);
376 INIT_LIST_HEAD(&fi->inmem_pages);
377 mutex_init(&fi->inmem_lock);
369 378
370 set_inode_flag(fi, FI_NEW_INODE); 379 set_inode_flag(fi, FI_NEW_INODE);
371 380
@@ -427,8 +436,19 @@ static void f2fs_put_super(struct super_block *sb)
427 stop_gc_thread(sbi); 436 stop_gc_thread(sbi);
428 437
429 /* We don't need to do checkpoint when it's clean */ 438 /* We don't need to do checkpoint when it's clean */
430 if (sbi->s_dirty && get_pages(sbi, F2FS_DIRTY_NODES)) 439 if (sbi->s_dirty) {
431 write_checkpoint(sbi, true); 440 struct cp_control cpc = {
441 .reason = CP_UMOUNT,
442 };
443 write_checkpoint(sbi, &cpc);
444 }
445
446 /*
 447 * normally the superblock is clean, so we need to release this.
 448 * In addition, EIO will skip the checkpoint, so we need this as well.
449 */
450 release_dirty_inode(sbi);
451 release_discard_addrs(sbi);
432 452
433 iput(sbi->node_inode); 453 iput(sbi->node_inode);
434 iput(sbi->meta_inode); 454 iput(sbi->meta_inode);
@@ -452,12 +472,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
452 472
453 trace_f2fs_sync_fs(sb, sync); 473 trace_f2fs_sync_fs(sb, sync);
454 474
455 if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES))
456 return 0;
457
458 if (sync) { 475 if (sync) {
476 struct cp_control cpc = {
477 .reason = CP_SYNC,
478 };
459 mutex_lock(&sbi->gc_mutex); 479 mutex_lock(&sbi->gc_mutex);
460 write_checkpoint(sbi, false); 480 write_checkpoint(sbi, &cpc);
461 mutex_unlock(&sbi->gc_mutex); 481 mutex_unlock(&sbi->gc_mutex);
462 } else { 482 } else {
463 f2fs_balance_fs(sbi); 483 f2fs_balance_fs(sbi);
@@ -500,8 +520,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
500 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; 520 buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count;
501 buf->f_bavail = user_block_count - valid_user_blocks(sbi); 521 buf->f_bavail = user_block_count - valid_user_blocks(sbi);
502 522
503 buf->f_files = sbi->total_node_count; 523 buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
504 buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); 524 buf->f_ffree = buf->f_files - valid_inode_count(sbi);
505 525
506 buf->f_namelen = F2FS_NAME_LEN; 526 buf->f_namelen = F2FS_NAME_LEN;
507 buf->f_fsid.val[0] = (u32)id; 527 buf->f_fsid.val[0] = (u32)id;
@@ -544,6 +564,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
544 seq_puts(seq, ",inline_data"); 564 seq_puts(seq, ",inline_data");
545 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) 565 if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE))
546 seq_puts(seq, ",flush_merge"); 566 seq_puts(seq, ",flush_merge");
567 if (test_opt(sbi, NOBARRIER))
568 seq_puts(seq, ",nobarrier");
547 seq_printf(seq, ",active_logs=%u", sbi->active_logs); 569 seq_printf(seq, ",active_logs=%u", sbi->active_logs);
548 570
549 return 0; 571 return 0;
@@ -606,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
606 org_mount_opt = sbi->mount_opt; 628 org_mount_opt = sbi->mount_opt;
607 active_logs = sbi->active_logs; 629 active_logs = sbi->active_logs;
608 630
631 sbi->mount_opt.opt = 0;
632 sbi->active_logs = NR_CURSEG_TYPE;
633
609 /* parse mount options */ 634 /* parse mount options */
610 err = parse_options(sb, data); 635 err = parse_options(sb, data);
611 if (err) 636 if (err)
@@ -615,7 +640,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
615 * Previous and new state of filesystem is RO, 640 * Previous and new state of filesystem is RO,
616 * so skip checking GC and FLUSH_MERGE conditions. 641 * so skip checking GC and FLUSH_MERGE conditions.
617 */ 642 */
618 if ((sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) 643 if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
619 goto skip; 644 goto skip;
620 645
621 /* 646 /*
@@ -642,8 +667,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
642 */ 667 */
643 if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { 668 if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) {
644 destroy_flush_cmd_control(sbi); 669 destroy_flush_cmd_control(sbi);
645 } else if (test_opt(sbi, FLUSH_MERGE) && 670 } else if (test_opt(sbi, FLUSH_MERGE) && !SM_I(sbi)->cmd_control_info) {
646 !sbi->sm_info->cmd_control_info) {
647 err = create_flush_cmd_control(sbi); 671 err = create_flush_cmd_control(sbi);
648 if (err) 672 if (err)
649 goto restore_gc; 673 goto restore_gc;
@@ -657,7 +681,7 @@ restore_gc:
657 if (need_restart_gc) { 681 if (need_restart_gc) {
658 if (start_gc_thread(sbi)) 682 if (start_gc_thread(sbi))
659 f2fs_msg(sbi->sb, KERN_WARNING, 683 f2fs_msg(sbi->sb, KERN_WARNING,
660 "background gc thread is stop"); 684 "background gc thread has stopped");
661 } else if (need_stop_gc) { 685 } else if (need_stop_gc) {
662 stop_gc_thread(sbi); 686 stop_gc_thread(sbi);
663 } 687 }
@@ -777,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb,
777 return 1; 801 return 1;
778 } 802 }
779 803
780 if (le32_to_cpu(raw_super->log_sectorsize) != 804 /* Currently, we support 512/1024/2048/4096-byte sector sizes */
781 F2FS_LOG_SECTOR_SIZE) { 805 if (le32_to_cpu(raw_super->log_sectorsize) >
782 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); 806 F2FS_MAX_LOG_SECTOR_SIZE ||
807 le32_to_cpu(raw_super->log_sectorsize) <
808 F2FS_MIN_LOG_SECTOR_SIZE) {
809 f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
810 le32_to_cpu(raw_super->log_sectorsize));
783 return 1; 811 return 1;
784 } 812 }
785 if (le32_to_cpu(raw_super->log_sectors_per_block) != 813 if (le32_to_cpu(raw_super->log_sectors_per_block) +
786 F2FS_LOG_SECTORS_PER_BLOCK) { 814 le32_to_cpu(raw_super->log_sectorsize) !=
787 f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); 815 F2FS_MAX_LOG_SECTOR_SIZE) {
816 f2fs_msg(sb, KERN_INFO,
817 "Invalid log sectors per block(%u) log sectorsize(%u)",
818 le32_to_cpu(raw_super->log_sectors_per_block),
819 le32_to_cpu(raw_super->log_sectorsize));
788 return 1; 820 return 1;
789 } 821 }
790 return 0; 822 return 0;
@@ -806,7 +838,7 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi)
806 if (unlikely(fsmeta >= total)) 838 if (unlikely(fsmeta >= total))
807 return 1; 839 return 1;
808 840
809 if (unlikely(is_set_ckpt_flags(ckpt, CP_ERROR_FLAG))) { 841 if (unlikely(f2fs_cp_error(sbi))) {
810 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); 842 f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck");
811 return 1; 843 return 1;
812 } 844 }
@@ -840,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
840 atomic_set(&sbi->nr_pages[i], 0); 872 atomic_set(&sbi->nr_pages[i], 0);
841 873
842 sbi->dir_level = DEF_DIR_LEVEL; 874 sbi->dir_level = DEF_DIR_LEVEL;
875 sbi->need_fsck = false;
843} 876}
844 877
845/* 878/*
@@ -893,8 +926,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
893 struct buffer_head *raw_super_buf; 926 struct buffer_head *raw_super_buf;
894 struct inode *root; 927 struct inode *root;
895 long err = -EINVAL; 928 long err = -EINVAL;
929 bool retry = true;
896 int i; 930 int i;
897 931
932try_onemore:
898 /* allocate memory for f2fs-specific super block info */ 933 /* allocate memory for f2fs-specific super block info */
899 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); 934 sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
900 if (!sbi) 935 if (!sbi)
@@ -947,7 +982,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
947 mutex_init(&sbi->gc_mutex); 982 mutex_init(&sbi->gc_mutex);
948 mutex_init(&sbi->writepages); 983 mutex_init(&sbi->writepages);
949 mutex_init(&sbi->cp_mutex); 984 mutex_init(&sbi->cp_mutex);
950 mutex_init(&sbi->node_write); 985 init_rwsem(&sbi->node_write);
951 sbi->por_doing = false; 986 sbi->por_doing = false;
952 spin_lock_init(&sbi->stat_lock); 987 spin_lock_init(&sbi->stat_lock);
953 988
@@ -997,7 +1032,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
997 INIT_LIST_HEAD(&sbi->dir_inode_list); 1032 INIT_LIST_HEAD(&sbi->dir_inode_list);
998 spin_lock_init(&sbi->dir_inode_lock); 1033 spin_lock_init(&sbi->dir_inode_lock);
999 1034
1000 init_orphan_info(sbi); 1035 init_ino_entry_info(sbi);
1001 1036
1002 /* setup f2fs internal modules */ 1037 /* setup f2fs internal modules */
1003 err = build_segment_manager(sbi); 1038 err = build_segment_manager(sbi);
@@ -1034,8 +1069,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1034 goto free_node_inode; 1069 goto free_node_inode;
1035 } 1070 }
1036 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 1071 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
1072 iput(root);
1037 err = -EINVAL; 1073 err = -EINVAL;
1038 goto free_root_inode; 1074 goto free_node_inode;
1039 } 1075 }
1040 1076
1041 sb->s_root = d_make_root(root); /* allocate root dentry */ 1077 sb->s_root = d_make_root(root); /* allocate root dentry */
@@ -1070,19 +1106,24 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
1070 if (err) 1106 if (err)
1071 goto free_proc; 1107 goto free_proc;
1072 1108
1109 if (!retry)
1110 sbi->need_fsck = true;
1111
1073 /* recover fsynced data */ 1112 /* recover fsynced data */
1074 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1113 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1075 err = recover_fsync_data(sbi); 1114 err = recover_fsync_data(sbi);
1076 if (err) 1115 if (err) {
1077 f2fs_msg(sb, KERN_ERR, 1116 f2fs_msg(sb, KERN_ERR,
1078 "Cannot recover all fsync data errno=%ld", err); 1117 "Cannot recover all fsync data errno=%ld", err);
1118 goto free_kobj;
1119 }
1079 } 1120 }
1080 1121
1081 /* 1122 /*
1082 * If filesystem is not mounted as read-only then 1123 * If filesystem is not mounted as read-only then
1083 * do start the gc_thread. 1124 * do start the gc_thread.
1084 */ 1125 */
1085 if (!(sb->s_flags & MS_RDONLY)) { 1126 if (!f2fs_readonly(sb)) {
1086 /* After POR, we can run background GC thread.*/ 1127 /* After POR, we can run background GC thread.*/
1087 err = start_gc_thread(sbi); 1128 err = start_gc_thread(sbi);
1088 if (err) 1129 if (err)
@@ -1116,6 +1157,13 @@ free_sb_buf:
1116 brelse(raw_super_buf); 1157 brelse(raw_super_buf);
1117free_sbi: 1158free_sbi:
1118 kfree(sbi); 1159 kfree(sbi);
1160
1161 /* give it only one more chance */
1162 if (retry) {
1163 retry = false;
1164 shrink_dcache_sb(sb);
1165 goto try_onemore;
1166 }
1119 return err; 1167 return err;
1120} 1168}
1121 1169
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 8bea941ee309..deca8728117b 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
266 266
267static void *read_all_xattrs(struct inode *inode, struct page *ipage) 267static void *read_all_xattrs(struct inode *inode, struct page *ipage)
268{ 268{
269 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 269 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
270 struct f2fs_xattr_header *header; 270 struct f2fs_xattr_header *header;
271 size_t size = PAGE_SIZE, inline_size = 0; 271 size_t size = PAGE_SIZE, inline_size = 0;
272 void *txattr_addr; 272 void *txattr_addr;
@@ -325,7 +325,7 @@ fail:
325static inline int write_all_xattrs(struct inode *inode, __u32 hsize, 325static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
326 void *txattr_addr, struct page *ipage) 326 void *txattr_addr, struct page *ipage)
327{ 327{
328 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 328 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
329 size_t inline_size = 0; 329 size_t inline_size = 0;
330 void *xattr_addr; 330 void *xattr_addr;
331 struct page *xpage; 331 struct page *xpage;
@@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
373 alloc_nid_failed(sbi, new_nid); 373 alloc_nid_failed(sbi, new_nid);
374 return PTR_ERR(xpage); 374 return PTR_ERR(xpage);
375 } 375 }
376 f2fs_bug_on(new_nid); 376 f2fs_bug_on(sbi, new_nid);
377 f2fs_wait_on_page_writeback(xpage, NODE); 377 f2fs_wait_on_page_writeback(xpage, NODE);
378 } else { 378 } else {
379 struct dnode_of_data dn; 379 struct dnode_of_data dn;
@@ -528,7 +528,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
528 int free; 528 int free;
529 /* 529 /*
530 * If value is NULL, it is a remove operation. 530 * If value is NULL, it is a remove operation.
531 * In case of update operation, we caculate free. 531 * In case of update operation, we calculate free.
532 */ 532 */
533 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr); 533 free = MIN_OFFSET(inode) - ((char *)last - (char *)base_addr);
534 if (found) 534 if (found)
@@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
596 const void *value, size_t size, 596 const void *value, size_t size,
597 struct page *ipage, int flags) 597 struct page *ipage, int flags)
598{ 598{
599 struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); 599 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
600 int err; 600 int err;
601 601
602 /* this case is only from init_inode_metadata */ 602 /* this case is only from init_inode_metadata */
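
These xattr.c hunks swap F2FS_SB(inode->i_sb) for F2FS_I_SB(inode). The helper itself is not shown in this diff; presumably it is a thin inline in f2fs.h along these lines (an assumption, shown for context):

    static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode)
    {
            return F2FS_SB(inode->i_sb);
    }
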
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 628e22a5a543..d8da2d2e30ae 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -164,8 +164,6 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster)
164 return 0; 164 return 0;
165} 165}
166 166
167extern struct timezone sys_tz;
168
169/* 167/*
170 * The epoch of FAT timestamp is 1980. 168 * The epoch of FAT timestamp is 1980.
171 * : bits : value 169 * : bits : value
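
The removed extern is redundant: sys_tz is already declared in <linux/time.h>, which misc.c pulls in through its existing includes. The hfs/hfs_fs.h hunk further down makes the same cleanup.
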
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 72c82f69b01b..99d440a4a6ba 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -21,6 +21,7 @@
21#include <linux/rcupdate.h> 21#include <linux/rcupdate.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/user_namespace.h> 23#include <linux/user_namespace.h>
24#include <linux/shmem_fs.h>
24 25
25#include <asm/poll.h> 26#include <asm/poll.h>
26#include <asm/siginfo.h> 27#include <asm/siginfo.h>
@@ -97,26 +98,19 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
97 write_unlock_irq(&filp->f_owner.lock); 98 write_unlock_irq(&filp->f_owner.lock);
98} 99}
99 100
100int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, 101void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
101 int force) 102 int force)
102{ 103{
103 int err; 104 security_file_set_fowner(filp);
104
105 err = security_file_set_fowner(filp);
106 if (err)
107 return err;
108
109 f_modown(filp, pid, type, force); 105 f_modown(filp, pid, type, force);
110 return 0;
111} 106}
112EXPORT_SYMBOL(__f_setown); 107EXPORT_SYMBOL(__f_setown);
113 108
114int f_setown(struct file *filp, unsigned long arg, int force) 109void f_setown(struct file *filp, unsigned long arg, int force)
115{ 110{
116 enum pid_type type; 111 enum pid_type type;
117 struct pid *pid; 112 struct pid *pid;
118 int who = arg; 113 int who = arg;
119 int result;
120 type = PIDTYPE_PID; 114 type = PIDTYPE_PID;
121 if (who < 0) { 115 if (who < 0) {
122 type = PIDTYPE_PGID; 116 type = PIDTYPE_PGID;
@@ -124,9 +118,8 @@ int f_setown(struct file *filp, unsigned long arg, int force)
124 } 118 }
125 rcu_read_lock(); 119 rcu_read_lock();
126 pid = find_vpid(who); 120 pid = find_vpid(who);
127 result = __f_setown(filp, pid, type, force); 121 __f_setown(filp, pid, type, force);
128 rcu_read_unlock(); 122 rcu_read_unlock();
129 return result;
130} 123}
131EXPORT_SYMBOL(f_setown); 124EXPORT_SYMBOL(f_setown);
132 125
@@ -180,7 +173,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
180 if (owner.pid && !pid) 173 if (owner.pid && !pid)
181 ret = -ESRCH; 174 ret = -ESRCH;
182 else 175 else
183 ret = __f_setown(filp, pid, type, 1); 176 __f_setown(filp, pid, type, 1);
184 rcu_read_unlock(); 177 rcu_read_unlock();
185 178
186 return ret; 179 return ret;
@@ -301,7 +294,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
301 force_successful_syscall_return(); 294 force_successful_syscall_return();
302 break; 295 break;
303 case F_SETOWN: 296 case F_SETOWN:
304 err = f_setown(filp, arg, 1); 297 f_setown(filp, arg, 1);
298 err = 0;
305 break; 299 break;
306 case F_GETOWN_EX: 300 case F_GETOWN_EX:
307 err = f_getown_ex(filp, arg); 301 err = f_getown_ex(filp, arg);
@@ -336,6 +330,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
336 case F_GETPIPE_SZ: 330 case F_GETPIPE_SZ:
337 err = pipe_fcntl(filp, cmd, arg); 331 err = pipe_fcntl(filp, cmd, arg);
338 break; 332 break;
333 case F_ADD_SEALS:
334 case F_GET_SEALS:
335 err = shmem_fcntl(filp, cmd, arg);
336 break;
339 default: 337 default:
340 break; 338 break;
341 } 339 }
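
Two changes here: __f_setown()/f_setown() become void because security_file_set_fowner() can no longer fail, and the new F_ADD_SEALS/F_GET_SEALS commands are routed to shmem_fcntl(). A userspace sketch of the sealing interface, assuming a kernel and libc that expose memfd_create() (error handling elided):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = memfd_create("demo", MFD_ALLOW_SEALING);

            ftruncate(fd, 4096);
            /* forbid resizing and writes, then read the seal set back */
            fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE);
            return fcntl(fd, F_GET_SEALS) < 0;
    }
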
diff --git a/fs/file.c b/fs/file.c
index 66923fe3176e..ab3eb6a88239 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -367,7 +367,7 @@ static struct fdtable *close_files(struct files_struct * files)
367 struct file * file = xchg(&fdt->fd[i], NULL); 367 struct file * file = xchg(&fdt->fd[i], NULL);
368 if (file) { 368 if (file) {
369 filp_close(file, files); 369 filp_close(file, files);
370 cond_resched(); 370 cond_resched_rcu_qs();
371 } 371 }
372 } 372 }
373 i++; 373 i++;
@@ -750,6 +750,7 @@ bool get_close_on_exec(unsigned int fd)
750 750
751static int do_dup2(struct files_struct *files, 751static int do_dup2(struct files_struct *files,
752 struct file *file, unsigned fd, unsigned flags) 752 struct file *file, unsigned fd, unsigned flags)
753__releases(&files->file_lock)
753{ 754{
754 struct file *tofree; 755 struct file *tofree;
755 struct fdtable *fdt; 756 struct fdtable *fdt;
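
The added __releases(&files->file_lock) is a sparse annotation, not code: under __CHECKER__ it tells sparse that do_dup2() returns with file_lock dropped on every path, and it compiles to nothing otherwise.
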
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..3f85411b03ce 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -150,18 +150,10 @@ over:
150 150
151/** 151/**
152 * alloc_file - allocate and initialize a 'struct file' 152 * alloc_file - allocate and initialize a 'struct file'
153 * @mnt: the vfsmount on which the file will reside 153 *
154 * @dentry: the dentry representing the new file 154 * @path: the (dentry, vfsmount) pair for the new file
155 * @mode: the mode with which the new file will be opened 155 * @mode: the mode with which the new file will be opened
156 * @fop: the 'struct file_operations' for the new file 156 * @fop: the 'struct file_operations' for the new file
157 *
158 * Use this instead of get_empty_filp() to get a new
159 * 'struct file'. Do so because of the same initialization
160 * pitfalls reasons listed for init_file(). This is a
161 * preferred interface to using init_file().
162 *
163 * If all the callers of init_file() are eliminated, its
164 * code should be moved into this function.
165 */ 157 */
166struct file *alloc_file(struct path *path, fmode_t mode, 158struct file *alloc_file(struct path *path, fmode_t mode,
167 const struct file_operations *fop) 159 const struct file_operations *fop)
@@ -331,5 +323,5 @@ void __init files_init(unsigned long mempages)
331 323
332 n = (mempages * (PAGE_SIZE / 1024)) / 10; 324 n = (mempages * (PAGE_SIZE / 1024)) / 10;
333 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 325 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
334 percpu_counter_init(&nr_files, 0); 326 percpu_counter_init(&nr_files, 0, GFP_KERNEL);
335} 327}
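
percpu_counter_init() grew a gfp_t argument in this series because it allocates the per-cpu counters internally; callers now name their allocation context and should, in general, check the int return:

    /* returns 0 on success or -ENOMEM; this early-boot caller ignores it */
    percpu_counter_init(&nr_files, 0, GFP_KERNEL);
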
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index be568b7311d6..ef9bef118342 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -342,7 +342,8 @@ static void __inode_wait_for_writeback(struct inode *inode)
342 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 342 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
343 while (inode->i_state & I_SYNC) { 343 while (inode->i_state & I_SYNC) {
344 spin_unlock(&inode->i_lock); 344 spin_unlock(&inode->i_lock);
345 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 345 __wait_on_bit(wqh, &wq, bit_wait,
346 TASK_UNINTERRUPTIBLE);
346 spin_lock(&inode->i_lock); 347 spin_lock(&inode->i_lock);
347 } 348 }
348} 349}
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
new file mode 100644
index 000000000000..9368236ca100
--- /dev/null
+++ b/fs/fs_pin.c
@@ -0,0 +1,78 @@
1#include <linux/fs.h>
2#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h"
5#include "mount.h"
6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock);
13
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin)
21{
22 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock);
26}
27
28void pin_insert(struct fs_pin *pin, struct vfsmount *m)
29{
30 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock);
34}
35
36void mnt_pin_kill(struct mount *m)
37{
38 while (1) {
39 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) {
44 rcu_read_unlock();
45 break;
46 }
47 pin = hlist_entry(p, struct fs_pin, m_list);
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 }
56}
57
58void sb_pin_kill(struct super_block *sb)
59{
60 while (1) {
61 struct hlist_node *p;
62 struct fs_pin *pin;
63 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first);
65 if (!p) {
66 rcu_read_unlock();
67 break;
68 }
69 pin = hlist_entry(p, struct fs_pin, s_list);
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 }
78}
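
fs_pin.c is a new file. Both kill loops follow the same RCU pattern: peek the first hlist entry under rcu_read_lock(), try to take a reference with atomic_long_inc_not_zero(), and if that fails (the count already hit zero, so the pin is on its way to pin_free_rcu()) spin with cpu_relax() and re-peek. On success the loop drops the RCU lock and calls ->kill(), which is expected to pin_remove() the entry, so the list drains and the loop terminates.
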
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index aec01be91b0a..89acec742e0b 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -160,7 +160,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie,
160 _enter("%p", cookie); 160 _enter("%p", cookie);
161 161
162 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, 162 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
163 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 163 TASK_UNINTERRUPTIBLE);
164 164
165 if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) 165 if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
166 goto out_unlock; 166 goto out_unlock;
@@ -255,7 +255,7 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie)
255 if (!fscache_defer_lookup) { 255 if (!fscache_defer_lookup) {
256 _debug("non-deferred lookup %p", &cookie->flags); 256 _debug("non-deferred lookup %p", &cookie->flags);
257 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, 257 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
258 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 258 TASK_UNINTERRUPTIBLE);
259 _debug("complete"); 259 _debug("complete");
260 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) 260 if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags))
261 goto unavailable; 261 goto unavailable;
@@ -463,7 +463,6 @@ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie)
463 _enter("%p", cookie); 463 _enter("%p", cookie);
464 464
465 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, 465 wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING,
466 fscache_wait_bit_interruptible,
467 TASK_UNINTERRUPTIBLE); 466 TASK_UNINTERRUPTIBLE);
468 467
469 _leave(""); 468 _leave("");
@@ -525,7 +524,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate)
525 } 524 }
526 525
527 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, 526 wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK,
528 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 527 TASK_UNINTERRUPTIBLE);
529 if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) 528 if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags))
530 goto out_unlock_enable; 529 goto out_unlock_enable;
531 530
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index bc6c08fcfddd..7872a62ef30c 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -97,8 +97,6 @@ static inline bool fscache_object_congested(void)
97 return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); 97 return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq);
98} 98}
99 99
100extern int fscache_wait_bit(void *);
101extern int fscache_wait_bit_interruptible(void *);
102extern int fscache_wait_atomic_t(atomic_t *); 100extern int fscache_wait_atomic_t(atomic_t *);
103 101
104/* 102/*
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 63f868e869b9..b39d487ccfb0 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write,
67 return ret; 67 return ret;
68} 68}
69 69
70struct ctl_table fscache_sysctls[] = { 70static struct ctl_table fscache_sysctls[] = {
71 { 71 {
72 .procname = "object_max_active", 72 .procname = "object_max_active",
73 .data = &fscache_object_max_active, 73 .data = &fscache_object_max_active,
@@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = {
87 {} 87 {}
88}; 88};
89 89
90struct ctl_table fscache_sysctls_root[] = { 90static struct ctl_table fscache_sysctls_root[] = {
91 { 91 {
92 .procname = "fscache", 92 .procname = "fscache",
93 .mode = 0555, 93 .mode = 0555,
@@ -197,24 +197,6 @@ static void __exit fscache_exit(void)
197module_exit(fscache_exit); 197module_exit(fscache_exit);
198 198
199/* 199/*
200 * wait_on_bit() sleep function for uninterruptible waiting
201 */
202int fscache_wait_bit(void *flags)
203{
204 schedule();
205 return 0;
206}
207
208/*
209 * wait_on_bit() sleep function for interruptible waiting
210 */
211int fscache_wait_bit_interruptible(void *flags)
212{
213 schedule();
214 return signal_pending(current);
215}
216
217/*
218 * wait_on_atomic_t() sleep function for uninterruptible waiting 200 * wait_on_atomic_t() sleep function for uninterruptible waiting
219 */ 201 */
220int fscache_wait_atomic_t(atomic_t *p) 202int fscache_wait_atomic_t(atomic_t *p)
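
The deleted fscache_wait_bit()/fscache_wait_bit_interruptible() helpers (and the matching gfs2 and fs-writeback hunks in this series) all track one tree-wide change: wait_on_bit() and wait_on_bit_lock() no longer take an action callback, since nearly every caller passed a trivial schedule() wrapper that the core now supplies. The new convention, in sketch form:

    /* old: err = wait_on_bit(&word, BIT, my_schedule_wrapper, TASK_UNINTERRUPTIBLE); */
    err = wait_on_bit(&word, BIT, TASK_UNINTERRUPTIBLE);
    /* 0 on success; nonzero only when a TASK_INTERRUPTIBLE wait is broken by a signal */
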
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index b8179ca6bf9d..51dde817e1f2 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -380,26 +380,14 @@ no_config:
380static int fscache_objlist_open(struct inode *inode, struct file *file) 380static int fscache_objlist_open(struct inode *inode, struct file *file)
381{ 381{
382 struct fscache_objlist_data *data; 382 struct fscache_objlist_data *data;
383 struct seq_file *m;
384 int ret;
385 383
386 ret = seq_open(file, &fscache_objlist_ops); 384 data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data));
387 if (ret < 0) 385 if (!data)
388 return ret;
389
390 m = file->private_data;
391
392 /* buffer for key extraction */
393 data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL);
394 if (!data) {
395 seq_release(inode, file);
396 return -ENOMEM; 386 return -ENOMEM;
397 }
398 387
399 /* get the configuration key */ 388 /* get the configuration key */
400 fscache_objlist_config(data); 389 fscache_objlist_config(data);
401 390
402 m->private = data;
403 return 0; 391 return 0;
404} 392}
405 393
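
fscache_objlist_open() now uses __seq_open_private(), which does seq_open(), allocates a zeroed private buffer of the requested size, and stores it in the seq_file, replacing the open/alloc/assign/unwind sequence:

    data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data));
    if (!data)
            return -ENOMEM;
    /* data == ((struct seq_file *)file->private_data)->private */
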
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
982submit_op_failed: 982submit_op_failed:
983 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); 983 clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
984 spin_unlock(&cookie->lock); 984 spin_unlock(&cookie->lock);
985 fscache_unuse_cookie(object);
985 kfree(op); 986 kfree(op);
986 _leave(" [EIO]"); 987 _leave(" [EIO]");
987 return transit_to(KILL_OBJECT); 988 return transit_to(KILL_OBJECT);
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index ed70714503fa..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -44,6 +44,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
44EXPORT_SYMBOL(__fscache_wait_on_page_write); 44EXPORT_SYMBOL(__fscache_wait_on_page_write);
45 45
46/* 46/*
47 * wait for a page to finish being written to the cache. Put a timeout here
48 * since we might be called recursively via parent fs.
49 */
50static
51bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
52{
53 wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
54
55 return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
56 HZ);
57}
58
59/*
47 * decide whether a page can be released, possibly by cancelling a store to it 60 * decide whether a page can be released, possibly by cancelling a store to it
48 * - we're allowed to sleep if __GFP_WAIT is flagged 61 * - we're allowed to sleep if __GFP_WAIT is flagged
49 */ 62 */
@@ -115,7 +128,10 @@ page_busy:
115 } 128 }
116 129
117 fscache_stat(&fscache_n_store_vmscan_wait); 130 fscache_stat(&fscache_n_store_vmscan_wait);
118 __fscache_wait_on_page_write(cookie, page); 131 if (!release_page_wait_timeout(cookie, page))
132 _debug("fscache writeout timeout page: %p{%lx}",
133 page, page->index);
134
119 gfp &= ~__GFP_WAIT; 135 gfp &= ~__GFP_WAIT;
120 goto try_again; 136 goto try_again;
121} 137}
@@ -182,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
182{ 198{
183 struct fscache_operation *op; 199 struct fscache_operation *op;
184 struct fscache_object *object; 200 struct fscache_object *object;
185 bool wake_cookie; 201 bool wake_cookie = false;
186 202
187 _enter("%p", cookie); 203 _enter("%p", cookie);
188 204
@@ -212,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
212 228
213 __fscache_use_cookie(cookie); 229 __fscache_use_cookie(cookie);
214 if (fscache_submit_exclusive_op(object, op) < 0) 230 if (fscache_submit_exclusive_op(object, op) < 0)
215 goto nobufs; 231 goto nobufs_dec;
216 spin_unlock(&cookie->lock); 232 spin_unlock(&cookie->lock);
217 fscache_stat(&fscache_n_attr_changed_ok); 233 fscache_stat(&fscache_n_attr_changed_ok);
218 fscache_put_operation(op); 234 fscache_put_operation(op);
219 _leave(" = 0"); 235 _leave(" = 0");
220 return 0; 236 return 0;
221 237
222nobufs: 238nobufs_dec:
223 wake_cookie = __fscache_unuse_cookie(cookie); 239 wake_cookie = __fscache_unuse_cookie(cookie);
240nobufs:
224 spin_unlock(&cookie->lock); 241 spin_unlock(&cookie->lock);
225 kfree(op); 242 kfree(op);
226 if (wake_cookie) 243 if (wake_cookie)
@@ -298,7 +315,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie)
298 315
299 jif = jiffies; 316 jif = jiffies;
300 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, 317 if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP,
301 fscache_wait_bit_interruptible,
302 TASK_INTERRUPTIBLE) != 0) { 318 TASK_INTERRUPTIBLE) != 0) {
303 fscache_stat(&fscache_n_retrievals_intr); 319 fscache_stat(&fscache_n_retrievals_intr);
304 _leave(" = -ERESTARTSYS"); 320 _leave(" = -ERESTARTSYS");
@@ -342,7 +358,6 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
342 if (stat_op_waits) 358 if (stat_op_waits)
343 fscache_stat(stat_op_waits); 359 fscache_stat(stat_op_waits);
344 if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, 360 if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
345 fscache_wait_bit_interruptible,
346 TASK_INTERRUPTIBLE) != 0) { 361 TASK_INTERRUPTIBLE) != 0) {
347 ret = fscache_cancel_op(op, do_cancel); 362 ret = fscache_cancel_op(op, do_cancel);
348 if (ret == 0) 363 if (ret == 0)
@@ -351,7 +366,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object,
351 /* it's been removed from the pending queue by another party, 366 /* it's been removed from the pending queue by another party,
352 * so we should get to run shortly */ 367 * so we should get to run shortly */
353 wait_on_bit(&op->flags, FSCACHE_OP_WAITING, 368 wait_on_bit(&op->flags, FSCACHE_OP_WAITING,
354 fscache_wait_bit, TASK_UNINTERRUPTIBLE); 369 TASK_UNINTERRUPTIBLE);
355 } 370 }
356 _debug("<<< GO"); 371 _debug("<<< GO");
357 372
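
Note that wait_event_timeout() returns the remaining jiffies (nonzero) when the condition became true and 0 on timeout, so the !release_page_wait_timeout(cookie, page) test above logs exactly the waits that hit the one-second (HZ) limit.
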
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 0c6048247a34..dbab798f5caf 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -274,9 +274,6 @@ out:
274 274
275invalid: 275invalid:
276 ret = 0; 276 ret = 0;
277
278 if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0)
279 ret = 1;
280 goto out; 277 goto out;
281} 278}
282 279
@@ -845,12 +842,6 @@ static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
845 return err; 842 return err;
846} 843}
847 844
848static int fuse_rename(struct inode *olddir, struct dentry *oldent,
849 struct inode *newdir, struct dentry *newent)
850{
851 return fuse_rename2(olddir, oldent, newdir, newent, 0);
852}
853
854static int fuse_link(struct dentry *entry, struct inode *newdir, 845static int fuse_link(struct dentry *entry, struct inode *newdir,
855 struct dentry *newent) 846 struct dentry *newent)
856{ 847{
@@ -1295,9 +1286,7 @@ static int fuse_direntplus_link(struct file *file,
1295 d_drop(dentry); 1286 d_drop(dentry);
1296 } else if (get_node_id(inode) != o->nodeid || 1287 } else if (get_node_id(inode) != o->nodeid ||
1297 ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { 1288 ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
1298 err = d_invalidate(dentry); 1289 d_invalidate(dentry);
1299 if (err)
1300 goto out;
1301 } else if (is_bad_inode(inode)) { 1290 } else if (is_bad_inode(inode)) {
1302 err = -EIO; 1291 err = -EIO;
1303 goto out; 1292 goto out;
@@ -2024,7 +2013,6 @@ static const struct inode_operations fuse_dir_inode_operations = {
2024 .symlink = fuse_symlink, 2013 .symlink = fuse_symlink,
2025 .unlink = fuse_unlink, 2014 .unlink = fuse_unlink,
2026 .rmdir = fuse_rmdir, 2015 .rmdir = fuse_rmdir,
2027 .rename = fuse_rename,
2028 .rename2 = fuse_rename2, 2016 .rename2 = fuse_rename2,
2029 .link = fuse_link, 2017 .link = fuse_link,
2030 .setattr = fuse_setattr, 2018 .setattr = fuse_setattr,
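
The dropped check_submounts_and_drop() call and the now-void d_invalidate() (here and in the gfs2/dentry.c hunk below) track the VFS change that made d_invalidate() unable to fail: it detaches submounts itself, so ->d_revalidate() implementations no longer need the fallback dance.
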
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 40ac2628ddcf..caa8d95b24e8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1303,10 +1303,11 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
1304 unsigned npages; 1304 unsigned npages;
1305 size_t start; 1305 size_t start;
1306 unsigned n = req->max_pages - req->num_pages;
1307 ssize_t ret = iov_iter_get_pages(ii, 1306 ssize_t ret = iov_iter_get_pages(ii,
1308 &req->pages[req->num_pages], 1307 &req->pages[req->num_pages],
1309 n * PAGE_SIZE, &start); 1308 *nbytesp - nbytes,
1309 req->max_pages - req->num_pages,
1310 &start);
1310 if (ret < 0) 1311 if (ret < 0)
1311 return ret; 1312 return ret;
1312 1313
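
iov_iter_get_pages() now takes separate byte and page budgets instead of a single pre-multiplied byte count; it returns the number of bytes covered by the pinned pages, with *start set to the offset into the first page. The call above therefore bounds the request by both the bytes still wanted (*nbytesp - nbytes) and the free req->pages slots.
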
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6e8d99..f0b945ab853e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
359 * Returns: The length of the extent (minimum of one block) 359 * Returns: The length of the extent (minimum of one block)
360 */ 360 */
361 361
362static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) 362static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
363{ 363{
364 const __be64 *end = (start + len); 364 const __be64 *end = (start + len);
365 const __be64 *first = ptr; 365 const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
449 struct buffer_head *bh_map, struct metapath *mp, 449 struct buffer_head *bh_map, struct metapath *mp,
450 const unsigned int sheight, 450 const unsigned int sheight,
451 const unsigned int height, 451 const unsigned int height,
452 const unsigned int maxlen) 452 const size_t maxlen)
453{ 453{
454 struct gfs2_inode *ip = GFS2_I(inode); 454 struct gfs2_inode *ip = GFS2_I(inode);
455 struct gfs2_sbd *sdp = GFS2_SB(inode); 455 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
483 } else { 483 } else {
484 /* Need to allocate indirect blocks */ 484 /* Need to allocate indirect blocks */
485 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; 485 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
486 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); 486 dblks = min(maxlen, (size_t)(ptrs_per_blk -
487 mp->mp_list[end_of_metadata]));
487 if (height == ip->i_height) { 488 if (height == ip->i_height) {
488 /* Writing into existing tree, extend tree down */ 489 /* Writing into existing tree, extend tree down */
489 iblks = height - sheight; 490 iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
605 struct gfs2_inode *ip = GFS2_I(inode); 606 struct gfs2_inode *ip = GFS2_I(inode);
606 struct gfs2_sbd *sdp = GFS2_SB(inode); 607 struct gfs2_sbd *sdp = GFS2_SB(inode);
607 unsigned int bsize = sdp->sd_sb.sb_bsize; 608 unsigned int bsize = sdp->sd_sb.sb_bsize;
608 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; 609 const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
609 const u64 *arr = sdp->sd_heightsize; 610 const u64 *arr = sdp->sd_heightsize;
610 __be64 *ptr; 611 __be64 *ptr;
611 u64 size; 612 u64 size;
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index d3a5d4e29ba5..589f4ea9381c 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -93,9 +93,6 @@ invalid_gunlock:
93 if (!had_lock) 93 if (!had_lock)
94 gfs2_glock_dq_uninit(&d_gh); 94 gfs2_glock_dq_uninit(&d_gh);
95invalid: 95invalid:
96 if (check_submounts_and_drop(dentry) != 0)
97 goto valid;
98
99 dput(parent); 96 dput(parent);
100 return 0; 97 return 0;
101 98
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9a9685..5d4261ff5d23 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
2100 } 2100 }
2101 if (IS_ERR(dent)) 2101 if (IS_ERR(dent))
2102 return PTR_ERR(dent); 2102 return PTR_ERR(dent);
2103 da->bh = bh; 2103
2104 da->dent = dent; 2104 if (da->save_loc) {
2105 da->bh = bh;
2106 da->dent = dent;
2107 } else {
2108 brelse(bh);
2109 }
2105 return 0; 2110 return 0;
2106} 2111}
2107 2112
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 126c65dda028..e1b309c24dab 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,6 +23,7 @@ struct gfs2_diradd {
23 unsigned nr_blocks; 23 unsigned nr_blocks;
24 struct gfs2_dirent *dent; 24 struct gfs2_dirent *dent;
25 struct buffer_head *bh; 25 struct buffer_head *bh;
26 int save_loc;
26}; 27};
27 28
28extern struct inode *gfs2_dir_search(struct inode *dir, 29extern struct inode *gfs2_dir_search(struct inode *dir,
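
The new save_loc flag controls whether gfs2_diradd_alloc_required() keeps the directory-entry location it found. Callers that will insert an entry right away set it so the buffer head and dirent pointer can be reused; the rename path leaves it clear and the buffer is released immediately. As initialized in the gfs2/inode.c hunks further down:

    struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };     /* create/link */
    struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, }; /* rename */
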
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 26b3f952e6b1..80dd44dca028 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
26#include <linux/dlm.h> 26#include <linux/dlm.h>
27#include <linux/dlm_plock.h> 27#include <linux/dlm_plock.h>
28#include <linux/aio.h> 28#include <linux/aio.h>
29#include <linux/delay.h>
29 30
30#include "gfs2.h" 31#include "gfs2.h"
31#include "incore.h" 32#include "incore.h"
@@ -913,26 +914,6 @@ out_uninit:
913#ifdef CONFIG_GFS2_FS_LOCKING_DLM 914#ifdef CONFIG_GFS2_FS_LOCKING_DLM
914 915
915/** 916/**
916 * gfs2_setlease - acquire/release a file lease
917 * @file: the file pointer
918 * @arg: lease type
919 * @fl: file lock
920 *
921 * We don't currently have a way to enforce a lease across the whole
922 * cluster; until we do, disable leases (by just returning -EINVAL),
923 * unless the administrator has requested purely local locking.
924 *
925 * Locking: called under i_lock
926 *
927 * Returns: errno
928 */
929
930static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
931{
932 return -EINVAL;
933}
934
935/**
936 * gfs2_lock - acquire/release a posix lock on a file 917 * gfs2_lock - acquire/release a posix lock on a file
937 * @file: the file pointer 918 * @file: the file pointer
938 * @cmd: either modify or retrieve lock state, possibly wait 919 * @cmd: either modify or retrieve lock state, possibly wait
@@ -979,9 +960,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
979 unsigned int state; 960 unsigned int state;
980 int flags; 961 int flags;
981 int error = 0; 962 int error = 0;
963 int sleeptime;
982 964
983 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; 965 state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
984 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; 966 flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
985 967
986 mutex_lock(&fp->f_fl_mutex); 968 mutex_lock(&fp->f_fl_mutex);
987 969
@@ -1001,7 +983,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
1001 gfs2_holder_init(gl, state, flags, fl_gh); 983 gfs2_holder_init(gl, state, flags, fl_gh);
1002 gfs2_glock_put(gl); 984 gfs2_glock_put(gl);
1003 } 985 }
1004 error = gfs2_glock_nq(fl_gh); 986 for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
987 error = gfs2_glock_nq(fl_gh);
988 if (error != GLR_TRYFAILED)
989 break;
990 fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
991 fl_gh->gh_error = 0;
992 msleep(sleeptime);
993 }
1005 if (error) { 994 if (error) {
1006 gfs2_holder_uninit(fl_gh); 995 gfs2_holder_uninit(fl_gh);
1007 if (error == GLR_TRYFAILED) 996 if (error == GLR_TRYFAILED)
@@ -1024,7 +1013,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
1024 mutex_lock(&fp->f_fl_mutex); 1013 mutex_lock(&fp->f_fl_mutex);
1025 flock_lock_file_wait(file, fl); 1014 flock_lock_file_wait(file, fl);
1026 if (fl_gh->gh_gl) { 1015 if (fl_gh->gh_gl) {
1027 gfs2_glock_dq_wait(fl_gh); 1016 gfs2_glock_dq(fl_gh);
1028 gfs2_holder_uninit(fl_gh); 1017 gfs2_holder_uninit(fl_gh);
1029 } 1018 }
1030 mutex_unlock(&fp->f_fl_mutex); 1019 mutex_unlock(&fp->f_fl_mutex);
@@ -1069,7 +1058,7 @@ const struct file_operations gfs2_file_fops = {
1069 .flock = gfs2_flock, 1058 .flock = gfs2_flock,
1070 .splice_read = generic_file_splice_read, 1059 .splice_read = generic_file_splice_read,
1071 .splice_write = iter_file_splice_write, 1060 .splice_write = iter_file_splice_write,
1072 .setlease = gfs2_setlease, 1061 .setlease = simple_nosetlease,
1073 .fallocate = gfs2_fallocate, 1062 .fallocate = gfs2_fallocate,
1074}; 1063};
1075 1064
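
do_flock() now makes a bounded series of attempts instead of a single blocking request: the first enqueue uses LM_FLAG_TRY_1CB (a trylock that also sends a callback nudging conflicting holders to demote), and on GLR_TRYFAILED it retries with plain LM_FLAG_TRY after an exponential msleep() backoff of 1, 2 and 4 ms, three tries in total before giving up.
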
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ee4e04fe60fc..8f0c19d1d943 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
811{ 811{
812 INIT_LIST_HEAD(&gh->gh_list); 812 INIT_LIST_HEAD(&gh->gh_list);
813 gh->gh_gl = gl; 813 gh->gh_gl = gl;
814 gh->gh_ip = (unsigned long)__builtin_return_address(0); 814 gh->gh_ip = _RET_IP_;
815 gh->gh_owner_pid = get_pid(task_pid(current)); 815 gh->gh_owner_pid = get_pid(task_pid(current));
816 gh->gh_state = state; 816 gh->gh_state = state;
817 gh->gh_flags = flags; 817 gh->gh_flags = flags;
@@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
835 gh->gh_state = state; 835 gh->gh_state = state;
836 gh->gh_flags = flags; 836 gh->gh_flags = flags;
837 gh->gh_iflags = 0; 837 gh->gh_iflags = 0;
838 gh->gh_ip = (unsigned long)__builtin_return_address(0); 838 gh->gh_ip = _RET_IP_;
839 if (gh->gh_owner_pid) 839 if (gh->gh_owner_pid)
840 put_pid(gh->gh_owner_pid); 840 put_pid(gh->gh_owner_pid);
841 gh->gh_owner_pid = get_pid(task_pid(current)); 841 gh->gh_owner_pid = get_pid(task_pid(current));
@@ -856,27 +856,6 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
856} 856}
857 857
858/** 858/**
859 * gfs2_glock_holder_wait
860 * @word: unused
861 *
862 * This function and gfs2_glock_demote_wait both show up in the WCHAN
863 * field. Thus I've separated these otherwise identical functions in
864 * order to be more informative to the user.
865 */
866
867static int gfs2_glock_holder_wait(void *word)
868{
869 schedule();
870 return 0;
871}
872
873static int gfs2_glock_demote_wait(void *word)
874{
875 schedule();
876 return 0;
877}
878
879/**
880 * gfs2_glock_wait - wait on a glock acquisition 859 * gfs2_glock_wait - wait on a glock acquisition
881 * @gh: the glock holder 860 * @gh: the glock holder
882 * 861 *
@@ -888,7 +867,7 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
888 unsigned long time1 = jiffies; 867 unsigned long time1 = jiffies;
889 868
890 might_sleep(); 869 might_sleep();
891 wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); 870 wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE);
892 if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */ 871 if (time_after(jiffies, time1 + HZ)) /* have we waited > a second? */
893 /* Lengthen the minimum hold time. */ 872 /* Lengthen the minimum hold time. */
894 gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time + 873 gh->gh_gl->gl_hold_time = min(gh->gh_gl->gl_hold_time +
@@ -1128,7 +1107,7 @@ void gfs2_glock_dq_wait(struct gfs2_holder *gh)
1128 struct gfs2_glock *gl = gh->gh_gl; 1107 struct gfs2_glock *gl = gh->gh_gl;
1129 gfs2_glock_dq(gh); 1108 gfs2_glock_dq(gh);
1130 might_sleep(); 1109 might_sleep();
1131 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); 1110 wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
1132} 1111}
1133 1112
1134/** 1113/**
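
_RET_IP_ is the generic shorthand from <linux/kernel.h> for the open-coded caller address; the same substitution recurs in the glops.c and trans.c hunks below:

    #define _RET_IP_ (unsigned long)__builtin_return_address(0)
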
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2ffc67dce87f..1cc0bba6313f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
93 * tr->alloced is not set since the transaction structure is 93 * tr->alloced is not set since the transaction structure is
94 * on the stack */ 94 * on the stack */
95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); 95 tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
96 tr.tr_ip = (unsigned long)__builtin_return_address(0); 96 tr.tr_ip = _RET_IP_;
97 sb_start_intwrite(sdp->sd_vfs); 97 sb_start_intwrite(sdp->sd_vfs);
98 if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { 98 if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
99 sb_end_intwrite(sdp->sd_vfs); 99 sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c9ada3..39e7e9959b74 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
262 unsigned long gh_ip; 262 unsigned long gh_ip;
263}; 263};
264 264
265/* Number of quota types we support */
266#define GFS2_MAXQUOTAS 2
267
265/* Resource group multi-block reservation, in order of appearance: 268/* Resource group multi-block reservation, in order of appearance:
266 269
267 Step 1. Function prepares to write, allocates a mb, sets the size hint. 270 Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
282 u64 rs_inum; /* Inode number for reservation */ 285 u64 rs_inum; /* Inode number for reservation */
283 286
284 /* ancillary quota stuff */ 287 /* ancillary quota stuff */
285 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; 288 struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
286 struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; 289 struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
287 unsigned int rs_qa_qd_num; 290 unsigned int rs_qa_qd_num;
288}; 291};
289 292
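
GFS2_MAXQUOTAS pins gfs2's on-stack reservation arrays at the two quota types gfs2 actually supports (user and group), presumably so they stop scaling with the core MAXQUOTAS constant if it grows to cover additional quota types.
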
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e59477884..c4ed823d150e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
600 int error, free_vfs_inode = 0; 600 int error, free_vfs_inode = 0;
601 u32 aflags = 0; 601 u32 aflags = 0;
602 unsigned blocks = 1; 602 unsigned blocks = 1;
603 struct gfs2_diradd da = { .bh = NULL, }; 603 struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
604 604
605 if (!name->len || name->len > GFS2_FNAMESIZE) 605 if (!name->len || name->len > GFS2_FNAMESIZE)
606 return -ENAMETOOLONG; 606 return -ENAMETOOLONG;
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
626 if (!IS_ERR(inode)) { 626 if (!IS_ERR(inode)) {
627 d = d_splice_alias(inode, dentry); 627 d = d_splice_alias(inode, dentry);
628 error = PTR_ERR(d); 628 error = PTR_ERR(d);
629 if (IS_ERR(d)) 629 if (IS_ERR(d)) {
630 inode = ERR_CAST(d);
630 goto fail_gunlock; 631 goto fail_gunlock;
632 }
631 error = 0; 633 error = 0;
632 if (file) { 634 if (file) {
633 if (S_ISREG(inode->i_mode)) { 635 if (S_ISREG(inode->i_mode)) {
@@ -670,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
670 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 672 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
671 gfs2_set_inode_blocks(inode, 1); 673 gfs2_set_inode_blocks(inode, 1);
672 munge_mode_uid_gid(dip, inode); 674 munge_mode_uid_gid(dip, inode);
675 check_and_update_goal(dip);
673 ip->i_goal = dip->i_goal; 676 ip->i_goal = dip->i_goal;
674 ip->i_diskflags = 0; 677 ip->i_diskflags = 0;
675 ip->i_eattr = 0; 678 ip->i_eattr = 0;
@@ -840,8 +843,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
840 int error; 843 int error;
841 844
842 inode = gfs2_lookupi(dir, &dentry->d_name, 0); 845 inode = gfs2_lookupi(dir, &dentry->d_name, 0);
843 if (!inode) 846 if (inode == NULL) {
847 d_add(dentry, NULL);
844 return NULL; 848 return NULL;
849 }
845 if (IS_ERR(inode)) 850 if (IS_ERR(inode))
846 return ERR_CAST(inode); 851 return ERR_CAST(inode);
847 852
@@ -854,7 +859,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
854 859
855 d = d_splice_alias(inode, dentry); 860 d = d_splice_alias(inode, dentry);
856 if (IS_ERR(d)) { 861 if (IS_ERR(d)) {
857 iput(inode);
858 gfs2_glock_dq_uninit(&gh); 862 gfs2_glock_dq_uninit(&gh);
859 return d; 863 return d;
860 } 864 }
@@ -896,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
896 struct gfs2_inode *ip = GFS2_I(inode); 900 struct gfs2_inode *ip = GFS2_I(inode);
897 struct gfs2_holder ghs[2]; 901 struct gfs2_holder ghs[2];
898 struct buffer_head *dibh; 902 struct buffer_head *dibh;
899 struct gfs2_diradd da = { .bh = NULL, }; 903 struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
900 int error; 904 int error;
901 905
902 if (S_ISDIR(inode->i_mode)) 906 if (S_ISDIR(inode->i_mode))
@@ -1241,6 +1245,9 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
1241 struct dentry *d; 1245 struct dentry *d;
1242 bool excl = !!(flags & O_EXCL); 1246 bool excl = !!(flags & O_EXCL);
1243 1247
1248 if (!d_unhashed(dentry))
1249 goto skip_lookup;
1250
1244 d = __gfs2_lookup(dir, dentry, file, opened); 1251 d = __gfs2_lookup(dir, dentry, file, opened);
1245 if (IS_ERR(d)) 1252 if (IS_ERR(d))
1246 return PTR_ERR(d); 1253 return PTR_ERR(d);
@@ -1257,6 +1264,8 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
1257 } 1264 }
1258 1265
1259 BUG_ON(d != NULL); 1266 BUG_ON(d != NULL);
1267
1268skip_lookup:
1260 if (!(flags & O_CREAT)) 1269 if (!(flags & O_CREAT))
1261 return -ENOENT; 1270 return -ENOENT;
1262 1271
@@ -1334,7 +1343,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
1334 struct gfs2_rgrpd *nrgd; 1343 struct gfs2_rgrpd *nrgd;
1335 unsigned int num_gh; 1344 unsigned int num_gh;
1336 int dir_rename = 0; 1345 int dir_rename = 0;
1337 struct gfs2_diradd da = { .nr_blocks = 0, }; 1346 struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
1338 unsigned int x; 1347 unsigned int x;
1339 int error; 1348 int error;
1340 1349
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 4fafea1c9ecf..641383a9c1bb 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -936,12 +936,6 @@ fail:
936 return error; 936 return error;
937} 937}
938 938
939static int dlm_recovery_wait(void *word)
940{
941 schedule();
942 return 0;
943}
944
945static int control_first_done(struct gfs2_sbd *sdp) 939static int control_first_done(struct gfs2_sbd *sdp)
946{ 940{
947 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 941 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
@@ -976,7 +970,7 @@ restart:
976 fs_info(sdp, "control_first_done wait gen %u\n", start_gen); 970 fs_info(sdp, "control_first_done wait gen %u\n", start_gen);
977 971
978 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, 972 wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY,
979 dlm_recovery_wait, TASK_UNINTERRUPTIBLE); 973 TASK_UNINTERRUPTIBLE);
980 goto restart; 974 goto restart;
981 } 975 }
982 976
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index bc564c0d6d16..d3eae244076e 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1024,20 +1024,13 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp)
1024 lm->lm_unmount(sdp); 1024 lm->lm_unmount(sdp);
1025} 1025}
1026 1026
1027static int gfs2_journalid_wait(void *word)
1028{
1029 if (signal_pending(current))
1030 return -EINTR;
1031 schedule();
1032 return 0;
1033}
1034
1035static int wait_on_journal(struct gfs2_sbd *sdp) 1027static int wait_on_journal(struct gfs2_sbd *sdp)
1036{ 1028{
1037 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1029 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1038 return 0; 1030 return 0;
1039 1031
1040 return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, gfs2_journalid_wait, TASK_INTERRUPTIBLE); 1032 return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE)
1033 ? -EINTR : 0;
1041} 1034}
1042 1035
1043void gfs2_online_uevent(struct gfs2_sbd *sdp) 1036void gfs2_online_uevent(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 94555d4c5698..573bd3b758fa 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -591,12 +591,6 @@ done:
591 wake_up_bit(&jd->jd_flags, JDF_RECOVERY); 591 wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
592} 592}
593 593
594static int gfs2_recovery_wait(void *word)
595{
596 schedule();
597 return 0;
598}
599
600int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) 594int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
601{ 595{
602 int rv; 596 int rv;
@@ -609,7 +603,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait)
609 BUG_ON(!rv); 603 BUG_ON(!rv);
610 604
611 if (wait) 605 if (wait)
612 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, 606 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
613 TASK_UNINTERRUPTIBLE); 607 TASK_UNINTERRUPTIBLE);
614 608
615 return wait ? jd->jd_recover_error : 0; 609 return wait ? jd->jd_recover_error : 0;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f4cb9c0d6bbd..7474c413ffd1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
577 return rgd; 577 return rgd;
578} 578}
579 579
580void check_and_update_goal(struct gfs2_inode *ip)
581{
582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
583 if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
584 ip->i_goal = ip->i_no_addr;
585}
586
580void gfs2_free_clones(struct gfs2_rgrpd *rgd) 587void gfs2_free_clones(struct gfs2_rgrpd *rgd)
581{ 588{
582 int x; 589 int x;
@@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
1910 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { 1917 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1911 rs->rs_rbm.rgd = begin = ip->i_rgd; 1918 rs->rs_rbm.rgd = begin = ip->i_rgd;
1912 } else { 1919 } else {
1920 check_and_update_goal(ip);
1913 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1921 rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1914 } 1922 }
1915 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) 1923 if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
@@ -2089,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2089 u32 blen, unsigned char new_state) 2097 u32 blen, unsigned char new_state)
2090{ 2098{
2091 struct gfs2_rbm rbm; 2099 struct gfs2_rbm rbm;
2092 struct gfs2_bitmap *bi; 2100 struct gfs2_bitmap *bi, *bi_prev = NULL;
2093 2101
2094 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); 2102 rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
2095 if (!rbm.rgd) { 2103 if (!rbm.rgd) {
@@ -2098,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
2098 return NULL; 2106 return NULL;
2099 } 2107 }
2100 2108
2109 gfs2_rbm_from_block(&rbm, bstart);
2101 while (blen--) { 2110 while (blen--) {
2102 gfs2_rbm_from_block(&rbm, bstart);
2103 bi = rbm_bi(&rbm); 2111 bi = rbm_bi(&rbm);
2104 bstart++; 2112 if (bi != bi_prev) {
2105 if (!bi->bi_clone) { 2113 if (!bi->bi_clone) {
2106 bi->bi_clone = kmalloc(bi->bi_bh->b_size, 2114 bi->bi_clone = kmalloc(bi->bi_bh->b_size,
2107 GFP_NOFS | __GFP_NOFAIL); 2115 GFP_NOFS | __GFP_NOFAIL);
2108 memcpy(bi->bi_clone + bi->bi_offset, 2116 memcpy(bi->bi_clone + bi->bi_offset,
2109 bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); 2117 bi->bi_bh->b_data + bi->bi_offset,
2118 bi->bi_len);
2119 }
2120 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2121 bi_prev = bi;
2110 } 2122 }
2111 gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
2112 gfs2_setbit(&rbm, false, new_state); 2123 gfs2_setbit(&rbm, false, new_state);
2124 gfs2_rbm_incr(&rbm);
2113 } 2125 }
2114 2126
2115 return rbm.rgd; 2127 return rbm.rgd;
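
rgblk_free() previously recomputed the rbm from the block number and re-added the bitmap buffer to the transaction on every iteration. It now converts once, steps with gfs2_rbm_incr(), and only clones and journals a bitmap buffer when the walk crosses into a new one (bi != bi_prev), turning per-block overhead into per-bitmap overhead.
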
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 463ab2e95d1c..5d8f085f7ade 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
80 return rs && !RB_EMPTY_NODE(&rs->rs_node); 80 return rs && !RB_EMPTY_NODE(&rs->rs_node);
81} 81}
82 82
83extern void check_and_update_goal(struct gfs2_inode *ip);
83#endif /* __RGRP_DOT_H__ */ 84#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 1319b5c4ec68..a346f56c4c6d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -864,12 +864,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
864 return error; 864 return error;
865} 865}
866 866
867static int gfs2_umount_recovery_wait(void *word)
868{
869 schedule();
870 return 0;
871}
872
873/** 867/**
874 * gfs2_put_super - Unmount the filesystem 868 * gfs2_put_super - Unmount the filesystem
875 * @sb: The VFS superblock 869 * @sb: The VFS superblock
@@ -894,7 +888,7 @@ restart:
894 continue; 888 continue;
895 spin_unlock(&sdp->sd_jindex_spin); 889 spin_unlock(&sdp->sd_jindex_spin);
896 wait_on_bit(&jd->jd_flags, JDF_RECOVERY, 890 wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
897 gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); 891 TASK_UNINTERRUPTIBLE);
898 goto restart; 892 goto restart;
899 } 893 }
900 spin_unlock(&sdp->sd_jindex_spin); 894 spin_unlock(&sdp->sd_jindex_spin);
@@ -1300,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1300 int val; 1294 int val;
1301 1295
1302 if (is_ancestor(root, sdp->sd_master_dir)) 1296 if (is_ancestor(root, sdp->sd_master_dir))
1303 seq_printf(s, ",meta"); 1297 seq_puts(s, ",meta");
1304 if (args->ar_lockproto[0]) 1298 if (args->ar_lockproto[0])
1305 seq_printf(s, ",lockproto=%s", args->ar_lockproto); 1299 seq_printf(s, ",lockproto=%s", args->ar_lockproto);
1306 if (args->ar_locktable[0]) 1300 if (args->ar_locktable[0])
@@ -1308,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1308 if (args->ar_hostdata[0]) 1302 if (args->ar_hostdata[0])
1309 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1303 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1310 if (args->ar_spectator) 1304 if (args->ar_spectator)
1311 seq_printf(s, ",spectator"); 1305 seq_puts(s, ",spectator");
1312 if (args->ar_localflocks) 1306 if (args->ar_localflocks)
1313 seq_printf(s, ",localflocks"); 1307 seq_puts(s, ",localflocks");
1314 if (args->ar_debug) 1308 if (args->ar_debug)
1315 seq_printf(s, ",debug"); 1309 seq_puts(s, ",debug");
1316 if (args->ar_posix_acl) 1310 if (args->ar_posix_acl)
1317 seq_printf(s, ",acl"); 1311 seq_puts(s, ",acl");
1318 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1312 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
1319 char *state; 1313 char *state;
1320 switch (args->ar_quota) { 1314 switch (args->ar_quota) {
@@ -1334,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1334 seq_printf(s, ",quota=%s", state); 1328 seq_printf(s, ",quota=%s", state);
1335 } 1329 }
1336 if (args->ar_suiddir) 1330 if (args->ar_suiddir)
1337 seq_printf(s, ",suiddir"); 1331 seq_puts(s, ",suiddir");
1338 if (args->ar_data != GFS2_DATA_DEFAULT) { 1332 if (args->ar_data != GFS2_DATA_DEFAULT) {
1339 char *state; 1333 char *state;
1340 switch (args->ar_data) { 1334 switch (args->ar_data) {
@@ -1351,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1351 seq_printf(s, ",data=%s", state); 1345 seq_printf(s, ",data=%s", state);
1352 } 1346 }
1353 if (args->ar_discard) 1347 if (args->ar_discard)
1354 seq_printf(s, ",discard"); 1348 seq_puts(s, ",discard");
1355 val = sdp->sd_tune.gt_logd_secs; 1349 val = sdp->sd_tune.gt_logd_secs;
1356 if (val != 30) 1350 if (val != 30)
1357 seq_printf(s, ",commit=%d", val); 1351 seq_printf(s, ",commit=%d", val);
@@ -1382,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
1382 seq_printf(s, ",errors=%s", state); 1376 seq_printf(s, ",errors=%s", state);
1383 } 1377 }
1384 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 1378 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
1385 seq_printf(s, ",nobarrier"); 1379 seq_puts(s, ",nobarrier");
1386 if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) 1380 if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
1387 seq_printf(s, ",demote_interface_used"); 1381 seq_puts(s, ",demote_interface_used");
1388 if (args->ar_rgrplvb) 1382 if (args->ar_rgrplvb)
1389 seq_printf(s, ",rgrplvb"); 1383 seq_puts(s, ",rgrplvb");
1390 return 0; 1384 return 0;
1391} 1385}
1392 1386
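
[Editorial note] The seq_printf() to seq_puts() conversions above follow the usual seq_file rule: literal strings with no conversion specifiers should skip format parsing. A kernel-style fragment showing the split — illustrative only, with hypothetical function and argument names:

    #include <linux/seq_file.h>

    static void show_example_options(struct seq_file *s, const char *proto,
                                     bool spectator)
    {
        if (spectator)
            seq_puts(s, ",spectator");               /* fixed string */
        if (proto && proto[0])
            seq_printf(s, ",lockproto=%s", proto);   /* needs formatting */
    }
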
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 0546ab4e28e8..42bfd3361979 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
44 if (!tr) 44 if (!tr)
45 return -ENOMEM; 45 return -ENOMEM;
46 46
47 tr->tr_ip = (unsigned long)__builtin_return_address(0); 47 tr->tr_ip = _RET_IP_;
48 tr->tr_blocks = blocks; 48 tr->tr_blocks = blocks;
49 tr->tr_revokes = revokes; 49 tr->tr_revokes = revokes;
50 tr->tr_reserved = 1; 50 tr->tr_reserved = 1;
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 0524cda47a6e..95d255219b1e 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -242,8 +242,6 @@ extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *);
242/* super.c */ 242/* super.c */
243extern void hfs_mark_mdb_dirty(struct super_block *sb); 243extern void hfs_mark_mdb_dirty(struct super_block *sb);
244 244
245extern struct timezone sys_tz;
246
247/* 245/*
248 * There are two time systems. Both are based on seconds since 246 * There are two time systems. Both are based on seconds since
249 * a particular time/date. 247 * a particular time/date.
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 9c88da0e855a..4fcd40d6f308 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -89,6 +89,7 @@ extern int do_mknod(const char *file, int mode, unsigned int major,
89extern int link_file(const char *from, const char *to); 89extern int link_file(const char *from, const char *to);
90extern int hostfs_do_readlink(char *file, char *buf, int size); 90extern int hostfs_do_readlink(char *file, char *buf, int size);
91extern int rename_file(char *from, char *to); 91extern int rename_file(char *from, char *to);
92extern int rename2_file(char *from, char *to, unsigned int flags);
92extern int do_statfs(char *root, long *bsize_out, long long *blocks_out, 93extern int do_statfs(char *root, long *bsize_out, long long *blocks_out,
93 long long *bfree_out, long long *bavail_out, 94 long long *bfree_out, long long *bavail_out,
94 long long *files_out, long long *ffree_out, 95 long long *files_out, long long *ffree_out,
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index bb529f3b7f2b..fd62cae0fdcb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -741,21 +741,31 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
741 return err; 741 return err;
742} 742}
743 743
744static int hostfs_rename(struct inode *from_ino, struct dentry *from, 744static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
745 struct inode *to_ino, struct dentry *to) 745 struct inode *new_dir, struct dentry *new_dentry,
746 unsigned int flags)
746{ 747{
747 char *from_name, *to_name; 748 char *old_name, *new_name;
748 int err; 749 int err;
749 750
750 if ((from_name = dentry_name(from)) == NULL) 751 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
752 return -EINVAL;
753
754 old_name = dentry_name(old_dentry);
755 if (old_name == NULL)
751 return -ENOMEM; 756 return -ENOMEM;
752 if ((to_name = dentry_name(to)) == NULL) { 757 new_name = dentry_name(new_dentry);
753 __putname(from_name); 758 if (new_name == NULL) {
759 __putname(old_name);
754 return -ENOMEM; 760 return -ENOMEM;
755 } 761 }
756 err = rename_file(from_name, to_name); 762 if (!flags)
757 __putname(from_name); 763 err = rename_file(old_name, new_name);
758 __putname(to_name); 764 else
765 err = rename2_file(old_name, new_name, flags);
766
767 __putname(old_name);
768 __putname(new_name);
759 return err; 769 return err;
760} 770}
761 771
@@ -867,7 +877,7 @@ static const struct inode_operations hostfs_dir_iops = {
867 .mkdir = hostfs_mkdir, 877 .mkdir = hostfs_mkdir,
868 .rmdir = hostfs_rmdir, 878 .rmdir = hostfs_rmdir,
869 .mknod = hostfs_mknod, 879 .mknod = hostfs_mknod,
870 .rename = hostfs_rename, 880 .rename2 = hostfs_rename2,
871 .permission = hostfs_permission, 881 .permission = hostfs_permission,
872 .setattr = hostfs_setattr, 882 .setattr = hostfs_setattr,
873}; 883};
diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c
index 67838f3aa20a..9765dab95cbd 100644
--- a/fs/hostfs/hostfs_user.c
+++ b/fs/hostfs/hostfs_user.c
@@ -14,6 +14,7 @@
14#include <sys/time.h> 14#include <sys/time.h>
15#include <sys/types.h> 15#include <sys/types.h>
16#include <sys/vfs.h> 16#include <sys/vfs.h>
17#include <sys/syscall.h>
17#include "hostfs.h" 18#include "hostfs.h"
18#include <utime.h> 19#include <utime.h>
19 20
@@ -360,6 +361,33 @@ int rename_file(char *from, char *to)
360 return 0; 361 return 0;
361} 362}
362 363
364int rename2_file(char *from, char *to, unsigned int flags)
365{
366 int err;
367
368#ifndef SYS_renameat2
369# ifdef __x86_64__
370# define SYS_renameat2 316
371# endif
372# ifdef __i386__
373# define SYS_renameat2 353
374# endif
375#endif
376
377#ifdef SYS_renameat2
378 err = syscall(SYS_renameat2, AT_FDCWD, from, AT_FDCWD, to, flags);
379 if (err < 0) {
380 if (errno != ENOSYS)
381 return -errno;
382 else
383 return -EINVAL;
384 }
385 return 0;
386#else
387 return -EINVAL;
388#endif
389}
390
363int do_statfs(char *root, long *bsize_out, long long *blocks_out, 391int do_statfs(char *root, long *bsize_out, long long *blocks_out,
364 long long *bfree_out, long long *bavail_out, 392 long long *bfree_out, long long *bavail_out,
365 long long *files_out, long long *ffree_out, 393 long long *files_out, long long *ffree_out,
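
[Editorial note] The new rename2_file() helper above invokes the renameat2 syscall directly because, at the time of this change, C libraries did not yet wrap it — hence the manual SYS_renameat2 definitions for x86. A minimal user-space sketch of the same call, assuming a kernel that implements renameat2 (on older kernels it fails with ENOSYS, which the helper maps to -EINVAL; if SYS_renameat2 is missing from <sys/syscall.h>, it can be defined by hand exactly as the patch does):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>          /* AT_FDCWD */
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/fs.h>       /* RENAME_NOREPLACE, RENAME_EXCHANGE */

    int main(void)
    {
        /* Fail instead of overwriting if "new.txt" already exists. */
        if (syscall(SYS_renameat2, AT_FDCWD, "old.txt",
                    AT_FDCWD, "new.txt", RENAME_NOREPLACE) < 0) {
            perror("renameat2");
            return 1;
        }
        return 0;
    }
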
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c
index f36fc010fccb..2923a7bd82ac 100644
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -545,12 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
545 struct dnode *d1; 545 struct dnode *d1;
546 struct quad_buffer_head qbh1; 546 struct quad_buffer_head qbh1;
547 if (hpfs_sb(i->i_sb)->sb_chk) 547 if (hpfs_sb(i->i_sb)->sb_chk)
548 if (up != i->i_ino) { 548 if (up != i->i_ino) {
549 hpfs_error(i->i_sb, 549 hpfs_error(i->i_sb,
550 "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", 550 "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx",
551 dno, up, (unsigned long)i->i_ino); 551 dno, up,
552 return; 552 (unsigned long)i->i_ino);
553 } 553 return;
554 }
554 if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { 555 if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) {
555 d1->up = cpu_to_le32(up); 556 d1->up = cpu_to_le32(up);
556 d1->root_dnode = 1; 557 d1->root_dnode = 1;
@@ -1061,8 +1062,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
1061 hpfs_brelse4(qbh); 1062 hpfs_brelse4(qbh);
1062 if (hpfs_sb(s)->sb_chk) 1063 if (hpfs_sb(s)->sb_chk)
1063 if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { 1064 if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) {
1064 kfree(name2); 1065 kfree(name2);
1065 return NULL; 1066 return NULL;
1066 } 1067 }
1067 goto go_down; 1068 goto go_down;
1068 } 1069 }
diff --git a/fs/inode.c b/fs/inode.c
index 6eecb7ff0b9a..26753ba7b6d6 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,6 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
165 mapping->a_ops = &empty_aops; 165 mapping->a_ops = &empty_aops;
166 mapping->host = inode; 166 mapping->host = inode;
167 mapping->flags = 0; 167 mapping->flags = 0;
168 atomic_set(&mapping->i_mmap_writable, 0);
168 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 169 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
169 mapping->private_data = NULL; 170 mapping->private_data = NULL;
170 mapping->backing_dev_info = &default_backing_dev_info; 171 mapping->backing_dev_info = &default_backing_dev_info;
@@ -1695,13 +1696,6 @@ int inode_needs_sync(struct inode *inode)
1695} 1696}
1696EXPORT_SYMBOL(inode_needs_sync); 1697EXPORT_SYMBOL(inode_needs_sync);
1697 1698
1698int inode_wait(void *word)
1699{
1700 schedule();
1701 return 0;
1702}
1703EXPORT_SYMBOL(inode_wait);
1704
1705/* 1699/*
1706 * If we try to find an inode in the inode hash while it is being 1700 * If we try to find an inode in the inode hash while it is being
1707 * deleted, we have to wait until the filesystem completes its 1701 * deleted, we have to wait until the filesystem completes its
diff --git a/fs/internal.h b/fs/internal.h
index 465742407466..9477f8f6aefc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
35#endif 35#endif
36 36
37/* 37/*
38 * buffer.c
39 */
40extern void guard_bio_eod(int rw, struct bio *bio);
41
42/*
38 * char_dev.c 43 * char_dev.c
39 */ 44 */
40extern void __init chrdev_init(void); 45extern void __init chrdev_init(void);
@@ -51,7 +56,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
51 * namespace.c 56 * namespace.c
52 */ 57 */
53extern int copy_mount_options(const void __user *, unsigned long *); 58extern int copy_mount_options(const void __user *, unsigned long *);
54extern int copy_mount_string(const void __user *, char **); 59extern char *copy_mount_string(const void __user *);
55 60
56extern struct vfsmount *lookup_mnt(struct path *); 61extern struct vfsmount *lookup_mnt(struct path *);
57extern int finish_automount(struct vfsmount *, struct path *); 62extern int finish_automount(struct vfsmount *, struct path *);
@@ -131,7 +136,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
131/* 136/*
132 * read_write.c 137 * read_write.c
133 */ 138 */
134extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *);
135extern int rw_verify_area(int, struct file *, const loff_t *, size_t); 139extern int rw_verify_area(int, struct file *, const loff_t *, size_t);
136 140
137/* 141/*
@@ -144,3 +148,9 @@ extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
144 * pipe.c 148 * pipe.c
145 */ 149 */
146extern const struct file_operations pipefifo_fops; 150extern const struct file_operations pipefifo_fops;
151
152/*
153 * fs_pin.c
154 */
155extern void sb_pin_kill(struct super_block *sb);
156extern void mnt_pin_kill(struct mount *m);
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 592e5115a561..f311bf084015 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -158,8 +158,8 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start,
158 "zisofs: zisofs_inflate returned" 158 "zisofs: zisofs_inflate returned"
159 " %d, inode = %lu," 159 " %d, inode = %lu,"
160 " page idx = %d, bh idx = %d," 160 " page idx = %d, bh idx = %d,"
161 " avail_in = %d," 161 " avail_in = %ld,"
162 " avail_out = %d\n", 162 " avail_out = %ld\n",
163 zerr, inode->i_ino, curpage, 163 zerr, inode->i_ino, curpage,
164 curbh, stream.avail_in, 164 curbh, stream.avail_in,
165 stream.avail_out); 165 stream.avail_out);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 4556ce1af5b0..881b3bd0143f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -61,7 +61,7 @@ static void isofs_put_super(struct super_block *sb)
61 return; 61 return;
62} 62}
63 63
64static int isofs_read_inode(struct inode *); 64static int isofs_read_inode(struct inode *, int relocated);
65static int isofs_statfs (struct dentry *, struct kstatfs *); 65static int isofs_statfs (struct dentry *, struct kstatfs *);
66 66
67static struct kmem_cache *isofs_inode_cachep; 67static struct kmem_cache *isofs_inode_cachep;
@@ -247,7 +247,7 @@ static int isofs_dentry_cmp_common(
247 } 247 }
248 if (alen == blen) { 248 if (alen == blen) {
249 if (ci) { 249 if (ci) {
250 if (strnicmp(name->name, str, alen) == 0) 250 if (strncasecmp(name->name, str, alen) == 0)
251 return 0; 251 return 0;
252 } else { 252 } else {
253 if (strncmp(name->name, str, alen) == 0) 253 if (strncmp(name->name, str, alen) == 0)
@@ -1259,7 +1259,7 @@ out_toomany:
1259 goto out; 1259 goto out;
1260} 1260}
1261 1261
1262static int isofs_read_inode(struct inode *inode) 1262static int isofs_read_inode(struct inode *inode, int relocated)
1263{ 1263{
1264 struct super_block *sb = inode->i_sb; 1264 struct super_block *sb = inode->i_sb;
1265 struct isofs_sb_info *sbi = ISOFS_SB(sb); 1265 struct isofs_sb_info *sbi = ISOFS_SB(sb);
@@ -1404,7 +1404,7 @@ static int isofs_read_inode(struct inode *inode)
1404 */ 1404 */
1405 1405
1406 if (!high_sierra) { 1406 if (!high_sierra) {
1407 parse_rock_ridge_inode(de, inode); 1407 parse_rock_ridge_inode(de, inode, relocated);
1408 /* if we want uid/gid set, override the rock ridge setting */ 1408 /* if we want uid/gid set, override the rock ridge setting */
1409 if (sbi->s_uid_set) 1409 if (sbi->s_uid_set)
1410 inode->i_uid = sbi->s_uid; 1410 inode->i_uid = sbi->s_uid;
@@ -1483,9 +1483,10 @@ static int isofs_iget5_set(struct inode *ino, void *data)
1483 * offset that point to the underlying meta-data for the inode. The 1483 * offset that point to the underlying meta-data for the inode. The
1484 * code below is otherwise similar to the iget() code in 1484 * code below is otherwise similar to the iget() code in
1485 * include/linux/fs.h */ 1485 * include/linux/fs.h */
1486struct inode *isofs_iget(struct super_block *sb, 1486struct inode *__isofs_iget(struct super_block *sb,
1487 unsigned long block, 1487 unsigned long block,
1488 unsigned long offset) 1488 unsigned long offset,
1489 int relocated)
1489{ 1490{
1490 unsigned long hashval; 1491 unsigned long hashval;
1491 struct inode *inode; 1492 struct inode *inode;
@@ -1507,7 +1508,7 @@ struct inode *isofs_iget(struct super_block *sb,
1507 return ERR_PTR(-ENOMEM); 1508 return ERR_PTR(-ENOMEM);
1508 1509
1509 if (inode->i_state & I_NEW) { 1510 if (inode->i_state & I_NEW) {
1510 ret = isofs_read_inode(inode); 1511 ret = isofs_read_inode(inode, relocated);
1511 if (ret < 0) { 1512 if (ret < 0) {
1512 iget_failed(inode); 1513 iget_failed(inode);
1513 inode = ERR_PTR(ret); 1514 inode = ERR_PTR(ret);
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 99167238518d..0ac4c1f73fbd 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -107,7 +107,7 @@ extern int iso_date(char *, int);
107 107
108struct inode; /* To make gcc happy */ 108struct inode; /* To make gcc happy */
109 109
110extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *); 110extern int parse_rock_ridge_inode(struct iso_directory_record *, struct inode *, int relocated);
111extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *); 111extern int get_rock_ridge_filename(struct iso_directory_record *, char *, struct inode *);
112extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *); 112extern int isofs_name_translate(struct iso_directory_record *, char *, struct inode *);
113 113
@@ -118,9 +118,24 @@ extern struct dentry *isofs_lookup(struct inode *, struct dentry *, unsigned int
118extern struct buffer_head *isofs_bread(struct inode *, sector_t); 118extern struct buffer_head *isofs_bread(struct inode *, sector_t);
119extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long); 119extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **, unsigned long);
120 120
121extern struct inode *isofs_iget(struct super_block *sb, 121struct inode *__isofs_iget(struct super_block *sb,
122 unsigned long block, 122 unsigned long block,
123 unsigned long offset); 123 unsigned long offset,
124 int relocated);
125
126static inline struct inode *isofs_iget(struct super_block *sb,
127 unsigned long block,
128 unsigned long offset)
129{
130 return __isofs_iget(sb, block, offset, 0);
131}
132
133static inline struct inode *isofs_iget_reloc(struct super_block *sb,
134 unsigned long block,
135 unsigned long offset)
136{
137 return __isofs_iget(sb, block, offset, 1);
138}
124 139
125/* Because the inode number is no longer relevant to finding the 140/* Because the inode number is no longer relevant to finding the
126 * underlying meta-data for an inode, we are free to choose a more 141 * underlying meta-data for an inode, we are free to choose a more
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index c0bf42472e40..f488bbae541a 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -288,12 +288,16 @@ eio:
288 goto out; 288 goto out;
289} 289}
290 290
291#define RR_REGARD_XA 1
292#define RR_RELOC_DE 2
293
291static int 294static int
292parse_rock_ridge_inode_internal(struct iso_directory_record *de, 295parse_rock_ridge_inode_internal(struct iso_directory_record *de,
293 struct inode *inode, int regard_xa) 296 struct inode *inode, int flags)
294{ 297{
295 int symlink_len = 0; 298 int symlink_len = 0;
296 int cnt, sig; 299 int cnt, sig;
300 unsigned int reloc_block;
297 struct inode *reloc; 301 struct inode *reloc;
298 struct rock_ridge *rr; 302 struct rock_ridge *rr;
299 int rootflag; 303 int rootflag;
@@ -305,7 +309,7 @@ parse_rock_ridge_inode_internal(struct iso_directory_record *de,
305 309
306 init_rock_state(&rs, inode); 310 init_rock_state(&rs, inode);
307 setup_rock_ridge(de, inode, &rs); 311 setup_rock_ridge(de, inode, &rs);
308 if (regard_xa) { 312 if (flags & RR_REGARD_XA) {
309 rs.chr += 14; 313 rs.chr += 14;
310 rs.len -= 14; 314 rs.len -= 14;
311 if (rs.len < 0) 315 if (rs.len < 0)
@@ -485,12 +489,22 @@ repeat:
485 "relocated directory\n"); 489 "relocated directory\n");
486 goto out; 490 goto out;
487 case SIG('C', 'L'): 491 case SIG('C', 'L'):
488 ISOFS_I(inode)->i_first_extent = 492 if (flags & RR_RELOC_DE) {
489 isonum_733(rr->u.CL.location); 493 printk(KERN_ERR
490 reloc = 494 "ISOFS: Recursive directory relocation "
491 isofs_iget(inode->i_sb, 495 "is not supported\n");
492 ISOFS_I(inode)->i_first_extent, 496 goto eio;
493 0); 497 }
498 reloc_block = isonum_733(rr->u.CL.location);
499 if (reloc_block == ISOFS_I(inode)->i_iget5_block &&
500 ISOFS_I(inode)->i_iget5_offset == 0) {
501 printk(KERN_ERR
502 "ISOFS: Directory relocation points to "
503 "itself\n");
504 goto eio;
505 }
506 ISOFS_I(inode)->i_first_extent = reloc_block;
507 reloc = isofs_iget_reloc(inode->i_sb, reloc_block, 0);
494 if (IS_ERR(reloc)) { 508 if (IS_ERR(reloc)) {
495 ret = PTR_ERR(reloc); 509 ret = PTR_ERR(reloc);
496 goto out; 510 goto out;
@@ -637,9 +651,11 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit)
637 return rpnt; 651 return rpnt;
638} 652}
639 653
640int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode) 654int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode,
655 int relocated)
641{ 656{
642 int result = parse_rock_ridge_inode_internal(de, inode, 0); 657 int flags = relocated ? RR_RELOC_DE : 0;
658 int result = parse_rock_ridge_inode_internal(de, inode, flags);
643 659
644 /* 660 /*
645 * if rockridge flag was reset and we didn't look for attributes 661 * if rockridge flag was reset and we didn't look for attributes
@@ -647,7 +663,8 @@ int parse_rock_ridge_inode(struct iso_directory_record *de, struct inode *inode)
647 */ 663 */
648 if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1) 664 if ((ISOFS_SB(inode->i_sb)->s_rock_offset == -1)
649 && (ISOFS_SB(inode->i_sb)->s_rock == 2)) { 665 && (ISOFS_SB(inode->i_sb)->s_rock == 2)) {
650 result = parse_rock_ridge_inode_internal(de, inode, 14); 666 result = parse_rock_ridge_inode_internal(de, inode,
667 flags | RR_REGARD_XA);
651 } 668 }
652 return result; 669 return result;
653} 670}
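
[Editorial note] Taken together, the two guards added above close the relocation loop hole: a CL (child link) record is refused when the inode being parsed was itself reached through a relocation (RR_RELOC_DE), and also when it points back at the inode's own location. A self-contained sketch of the same two checks, with hypothetical types standing in for the isofs structures:

    #include <stdio.h>

    #define RR_RELOC_DE 2

    struct dir_pos { unsigned int block; unsigned int offset; };

    static int follow_cl(const struct dir_pos *dir, unsigned int reloc_block,
                         int flags)
    {
        if (flags & RR_RELOC_DE) {          /* already inside a relocation */
            fprintf(stderr, "recursive relocation rejected\n");
            return -1;
        }
        if (reloc_block == dir->block && dir->offset == 0) {
            fprintf(stderr, "self-referencing relocation rejected\n");
            return -1;
        }
        /* the real code now looks the target up with RR_RELOC_DE set,
         * so any nested CL record in it is refused on the next pass */
        return 0;
    }

    int main(void)
    {
        struct dir_pos dir = { 100, 0 };

        follow_cl(&dir, 100, 0);            /* rejected: points at itself */
        follow_cl(&dir, 200, RR_RELOC_DE);  /* rejected: nested relocation */
        return follow_cl(&dir, 200, 0);     /* accepted */
    }
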
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 6fac74349856..b73e0215baa7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
97 struct commit_header *h; 97 struct commit_header *h;
98 __u32 csum; 98 __u32 csum;
99 99
100 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 100 if (!jbd2_journal_has_csum_v2or3(j))
101 return; 101 return;
102 102
103 h = (struct commit_header *)(bh->b_data); 103 h = (struct commit_header *)(bh->b_data);
@@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
313 return checksum; 313 return checksum;
314} 314}
315 315
316static void write_tag_block(int tag_bytes, journal_block_tag_t *tag, 316static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
317 unsigned long long block) 317 unsigned long long block)
318{ 318{
319 tag->t_blocknr = cpu_to_be32(block & (u32)~0); 319 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
320 if (tag_bytes > JBD2_TAG_SIZE32) 320 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1); 321 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
322} 322}
323 323
@@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j,
327 struct jbd2_journal_block_tail *tail; 327 struct jbd2_journal_block_tail *tail;
328 __u32 csum; 328 __u32 csum;
329 329
330 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 330 if (!jbd2_journal_has_csum_v2or3(j))
331 return; 331 return;
332 332
333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - 333 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
@@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j,
340static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, 340static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
341 struct buffer_head *bh, __u32 sequence) 341 struct buffer_head *bh, __u32 sequence)
342{ 342{
343 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
343 struct page *page = bh->b_page; 344 struct page *page = bh->b_page;
344 __u8 *addr; 345 __u8 *addr;
345 __u32 csum32; 346 __u32 csum32;
346 __be32 seq; 347 __be32 seq;
347 348
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 349 if (!jbd2_journal_has_csum_v2or3(j))
349 return; 350 return;
350 351
351 seq = cpu_to_be32(sequence); 352 seq = cpu_to_be32(sequence);
@@ -355,8 +356,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
355 bh->b_size); 356 bh->b_size);
356 kunmap_atomic(addr); 357 kunmap_atomic(addr);
357 358
358 /* We only have space to store the lower 16 bits of the crc32c. */ 359 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
359 tag->t_checksum = cpu_to_be16(csum32); 360 tag3->t_checksum = cpu_to_be32(csum32);
361 else
362 tag->t_checksum = cpu_to_be16(csum32);
360} 363}
361/* 364/*
362 * jbd2_journal_commit_transaction 365 * jbd2_journal_commit_transaction
@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
396 LIST_HEAD(io_bufs); 399 LIST_HEAD(io_bufs);
397 LIST_HEAD(log_bufs); 400 LIST_HEAD(log_bufs);
398 401
399 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 402 if (jbd2_journal_has_csum_v2or3(journal))
400 csum_size = sizeof(struct jbd2_journal_block_tail); 403 csum_size = sizeof(struct jbd2_journal_block_tail);
401 404
402 /* 405 /*
@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
690 tag_flag |= JBD2_FLAG_SAME_UUID; 693 tag_flag |= JBD2_FLAG_SAME_UUID;
691 694
692 tag = (journal_block_tag_t *) tagp; 695 tag = (journal_block_tag_t *) tagp;
693 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr); 696 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
694 tag->t_flags = cpu_to_be16(tag_flag); 697 tag->t_flags = cpu_to_be16(tag_flag);
695 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs], 698 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
696 commit_transaction->t_tid); 699 commit_transaction->t_tid);
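
[Editorial note] The widened tag checksum is the point of the v3 format: a v2 tag only has room for the low 16 bits of the crc32c, while a v3 tag carries the full 32-bit value. A tiny illustrative program showing the truncation difference (endianness conversion omitted; not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t csum32 = 0xdeadbeef;    /* pretend crc32c(seq + block) */

        uint16_t v2 = (uint16_t)csum32;  /* truncated, as cpu_to_be16() above */
        uint32_t v3 = csum32;            /* full width, as cpu_to_be32() above */

        printf("v2 tag checksum: 0x%04x\n", v2);  /* 0xbeef */
        printf("v3 tag checksum: 0x%08x\n", v3);  /* 0xdeadbeef */
        return 0;
    }
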
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 67b8e303946c..19d74d86d99c 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
124/* Checksumming functions */ 124/* Checksumming functions */
125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) 125static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
126{ 126{
127 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 127 if (!jbd2_journal_has_csum_v2or3(j))
128 return 1; 128 return 1;
129 129
130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM; 130 return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
145 145
146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) 146static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
147{ 147{
148 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 148 if (!jbd2_journal_has_csum_v2or3(j))
149 return 1; 149 return 1;
150 150
151 return sb->s_checksum == jbd2_superblock_csum(j, sb); 151 return sb->s_checksum == jbd2_superblock_csum(j, sb);
@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
153 153
154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) 154static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
155{ 155{
156 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 156 if (!jbd2_journal_has_csum_v2or3(j))
157 return; 157 return;
158 158
159 sb->s_checksum = jbd2_superblock_csum(j, sb); 159 sb->s_checksum = jbd2_superblock_csum(j, sb);
@@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal)
1522 goto out; 1522 goto out;
1523 } 1523 }
1524 1524
1525 if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) && 1525 if (jbd2_journal_has_csum_v2or3(journal) &&
1526 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1526 JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
1527 /* Can't have checksum v1 and v2 on at the same time! */ 1527 /* Can't have checksum v1 and v2 on at the same time! */
1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " 1528 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
1529 "at the same time!\n"); 1529 "at the same time!\n");
1530 goto out; 1530 goto out;
1531 } 1531 }
1532 1532
1533 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
1534 JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1535 /* Can't have checksum v2 and v3 at the same time! */
1536 printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
1537 "at the same time!\n");
1538 goto out;
1539 }
1540
1533 if (!jbd2_verify_csum_type(journal, sb)) { 1541 if (!jbd2_verify_csum_type(journal, sb)) {
1534 printk(KERN_ERR "JBD2: Unknown checksum type\n"); 1542 printk(KERN_ERR "JBD2: Unknown checksum type\n");
1535 goto out; 1543 goto out;
1536 } 1544 }
1537 1545
1538 /* Load the checksum driver */ 1546 /* Load the checksum driver */
1539 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1547 if (jbd2_journal_has_csum_v2or3(journal)) {
1540 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); 1548 journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
1541 if (IS_ERR(journal->j_chksum_driver)) { 1549 if (IS_ERR(journal->j_chksum_driver)) {
1542 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n"); 1550 printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
1553 } 1561 }
1554 1562
1555 /* Precompute checksum seed for all metadata */ 1563 /* Precompute checksum seed for all metadata */
1556 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 1564 if (jbd2_journal_has_csum_v2or3(journal))
1557 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, 1565 journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
1558 sizeof(sb->s_uuid)); 1566 sizeof(sb->s_uuid));
1559 1567
@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1813 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat)) 1821 if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
1814 return 0; 1822 return 0;
1815 1823
1816 /* Asking for checksumming v2 and v1? Only give them v2. */ 1824 /* If enabling v2 checksums, turn on v3 instead */
1817 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 && 1825 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
1826 incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
1827 incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
1828 }
1829
1830 /* Asking for checksumming v3 and v1? Only give them v3. */
1831 if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
1818 compat & JBD2_FEATURE_COMPAT_CHECKSUM) 1832 compat & JBD2_FEATURE_COMPAT_CHECKSUM)
1819 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM; 1833 compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
1820 1834
@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1823 1837
1824 sb = journal->j_superblock; 1838 sb = journal->j_superblock;
1825 1839
1826 /* If enabling v2 checksums, update superblock */ 1840 /* If enabling v3 checksums, update superblock */
1827 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) { 1841 if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
1828 sb->s_checksum_type = JBD2_CRC32C_CHKSUM; 1842 sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
1829 sb->s_feature_compat &= 1843 sb->s_feature_compat &=
1830 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); 1844 ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1842 } 1856 }
1843 1857
1844 /* Precompute checksum seed for all metadata */ 1858 /* Precompute checksum seed for all metadata */
1845 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 1859 if (jbd2_journal_has_csum_v2or3(journal))
1846 JBD2_FEATURE_INCOMPAT_CSUM_V2))
1847 journal->j_csum_seed = jbd2_chksum(journal, ~0, 1860 journal->j_csum_seed = jbd2_chksum(journal, ~0,
1848 sb->s_uuid, 1861 sb->s_uuid,
1849 sizeof(sb->s_uuid)); 1862 sizeof(sb->s_uuid));
@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
1852 /* If enabling v1 checksums, downgrade superblock */ 1865 /* If enabling v1 checksums, downgrade superblock */
1853 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM)) 1866 if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
1854 sb->s_feature_incompat &= 1867 sb->s_feature_incompat &=
1855 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2); 1868 ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
1869 JBD2_FEATURE_INCOMPAT_CSUM_V3);
1856 1870
1857 sb->s_feature_compat |= cpu_to_be32(compat); 1871 sb->s_feature_compat |= cpu_to_be32(compat);
1858 sb->s_feature_ro_compat |= cpu_to_be32(ro); 1872 sb->s_feature_ro_compat |= cpu_to_be32(ro);
@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
2165 */ 2179 */
2166size_t journal_tag_bytes(journal_t *journal) 2180size_t journal_tag_bytes(journal_t *journal)
2167{ 2181{
2168 journal_block_tag_t tag; 2182 size_t sz;
2169 size_t x = 0; 2183
2184 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
2185 return sizeof(journal_block_tag3_t);
2186
2187 sz = sizeof(journal_block_tag_t);
2170 2188
2171 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 2189 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
2172 x += sizeof(tag.t_checksum); 2190 sz += sizeof(__u16);
2173 2191
2174 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) 2192 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
2175 return x + JBD2_TAG_SIZE64; 2193 return sz;
2176 else 2194 else
2177 return x + JBD2_TAG_SIZE32; 2195 return sz - sizeof(__u32);
2178} 2196}
2179 2197
2180/* 2198/*
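
[Editorial note] To sanity-check the sizes the rewritten journal_tag_bytes() produces, here is a user-space mirror of the two tag layouts using fixed-width types — an illustrative sketch, assuming the usual alignment so the mirror structs pack exactly like the on-disk layouts; not kernel code. A v3 tag is always 16 bytes; otherwise tags are 8/12 bytes (32/64-bit block numbers) without the v2 checksum and 10/14 bytes with it:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    struct tag_v2 {                  /* mirrors journal_block_tag_t */
        uint32_t t_blocknr;
        uint16_t t_checksum;
        uint16_t t_flags;
        uint32_t t_blocknr_high;
    };

    struct tag_v3 {                  /* mirrors journal_block_tag3_t */
        uint32_t t_blocknr;
        uint32_t t_flags;
        uint32_t t_blocknr_high;
        uint32_t t_checksum;
    };

    static size_t tag_bytes(int csum_v3, int csum_v2, int sixty_four_bit)
    {
        size_t sz;

        if (csum_v3)
            return sizeof(struct tag_v3);        /* always 16 */
        sz = sizeof(struct tag_v2);              /* 12 */
        if (csum_v2)
            sz += sizeof(uint16_t);              /* extra on-disk checksum room */
        return sixty_four_bit ? sz : sz - sizeof(uint32_t);
    }

    int main(void)
    {
        printf("32-bit, no csum : %zu\n", tag_bytes(0, 0, 0));  /* 8 */
        printf("64-bit, no csum : %zu\n", tag_bytes(0, 0, 1));  /* 12 */
        printf("32-bit, csum v2 : %zu\n", tag_bytes(0, 1, 0));  /* 10 */
        printf("64-bit, csum v2 : %zu\n", tag_bytes(0, 1, 1));  /* 14 */
        printf("csum v3         : %zu\n", tag_bytes(1, 0, 0));  /* 16 */
        return 0;
    }
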
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3b6bb19d60b1..9b329b55ffe3 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
181 __be32 provided; 181 __be32 provided;
182 __u32 calculated; 182 __u32 calculated;
183 183
184 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 184 if (!jbd2_journal_has_csum_v2or3(j))
185 return 1; 185 return 1;
186 186
187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - 187 tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
205 int nr = 0, size = journal->j_blocksize; 205 int nr = 0, size = journal->j_blocksize;
206 int tag_bytes = journal_tag_bytes(journal); 206 int tag_bytes = journal_tag_bytes(journal);
207 207
208 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 208 if (jbd2_journal_has_csum_v2or3(journal))
209 size -= sizeof(struct jbd2_journal_block_tail); 209 size -= sizeof(struct jbd2_journal_block_tail);
210 210
211 tagp = &bh->b_data[sizeof(journal_header_t)]; 211 tagp = &bh->b_data[sizeof(journal_header_t)];
@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
338 return err; 338 return err;
339} 339}
340 340
341static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) 341static inline unsigned long long read_tag_block(journal_t *journal,
342 journal_block_tag_t *tag)
342{ 343{
343 unsigned long long block = be32_to_cpu(tag->t_blocknr); 344 unsigned long long block = be32_to_cpu(tag->t_blocknr);
344 if (tag_bytes > JBD2_TAG_SIZE32) 345 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
345 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32; 346 block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
346 return block; 347 return block;
347} 348}
@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
384 __be32 provided; 385 __be32 provided;
385 __u32 calculated; 386 __u32 calculated;
386 387
387 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 388 if (!jbd2_journal_has_csum_v2or3(j))
388 return 1; 389 return 1;
389 390
390 h = buf; 391 h = buf;
@@ -399,17 +400,21 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
399static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, 400static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
400 void *buf, __u32 sequence) 401 void *buf, __u32 sequence)
401{ 402{
403 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
402 __u32 csum32; 404 __u32 csum32;
403 __be32 seq; 405 __be32 seq;
404 406
405 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 407 if (!jbd2_journal_has_csum_v2or3(j))
406 return 1; 408 return 1;
407 409
408 seq = cpu_to_be32(sequence); 410 seq = cpu_to_be32(sequence);
409 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); 411 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
410 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); 412 csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
411 413
412 return tag->t_checksum == cpu_to_be16(csum32); 414 if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
415 return tag3->t_checksum == cpu_to_be32(csum32);
416 else
417 return tag->t_checksum == cpu_to_be16(csum32);
413} 418}
414 419
415static int do_one_pass(journal_t *journal, 420static int do_one_pass(journal_t *journal,
@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
426 int tag_bytes = journal_tag_bytes(journal); 431 int tag_bytes = journal_tag_bytes(journal);
427 __u32 crc32_sum = ~0; /* Transactional Checksums */ 432 __u32 crc32_sum = ~0; /* Transactional Checksums */
428 int descr_csum_size = 0; 433 int descr_csum_size = 0;
434 int block_error = 0;
429 435
430 /* 436 /*
431 * First thing is to establish what we expect to find in the log 437 * First thing is to establish what we expect to find in the log
@@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal,
512 switch(blocktype) { 518 switch(blocktype) {
513 case JBD2_DESCRIPTOR_BLOCK: 519 case JBD2_DESCRIPTOR_BLOCK:
514 /* Verify checksum first */ 520 /* Verify checksum first */
515 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 521 if (jbd2_journal_has_csum_v2or3(journal))
516 JBD2_FEATURE_INCOMPAT_CSUM_V2))
517 descr_csum_size = 522 descr_csum_size =
518 sizeof(struct jbd2_journal_block_tail); 523 sizeof(struct jbd2_journal_block_tail);
519 if (descr_csum_size > 0 && 524 if (descr_csum_size > 0 &&
@@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal,
574 unsigned long long blocknr; 579 unsigned long long blocknr;
575 580
576 J_ASSERT(obh != NULL); 581 J_ASSERT(obh != NULL);
577 blocknr = read_tag_block(tag_bytes, 582 blocknr = read_tag_block(journal,
578 tag); 583 tag);
579 584
580 /* If the block has been 585 /* If the block has been
@@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal,
598 "checksum recovering " 603 "checksum recovering "
599 "block %llu in log\n", 604 "block %llu in log\n",
600 blocknr); 605 blocknr);
601 continue; 606 block_error = 1;
607 goto skip_write;
602 } 608 }
603 609
604 /* Find a buffer for the new 610 /* Find a buffer for the new
@@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal,
797 success = -EIO; 803 success = -EIO;
798 } 804 }
799 } 805 }
800 806 if (block_error && success == 0)
807 success = -EIO;
801 return success; 808 return success;
802 809
803 failed: 810 failed:
@@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
811 __be32 provided; 818 __be32 provided;
812 __u32 calculated; 819 __u32 calculated;
813 820
814 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 821 if (!jbd2_journal_has_csum_v2or3(j))
815 return 1; 822 return 1;
816 823
817 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize - 824 tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 198c9c10276d..d5e95a175c92 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -91,8 +91,8 @@
91#include <linux/list.h> 91#include <linux/list.h>
92#include <linux/init.h> 92#include <linux/init.h>
93#include <linux/bio.h> 93#include <linux/bio.h>
94#endif
95#include <linux/log2.h> 94#include <linux/log2.h>
95#endif
96 96
97static struct kmem_cache *jbd2_revoke_record_cache; 97static struct kmem_cache *jbd2_revoke_record_cache;
98static struct kmem_cache *jbd2_revoke_table_cache; 98static struct kmem_cache *jbd2_revoke_table_cache;
@@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal,
597 offset = *offsetp; 597 offset = *offsetp;
598 598
599 /* Do we need to leave space at the end for a checksum? */ 599 /* Do we need to leave space at the end for a checksum? */
600 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 600 if (jbd2_journal_has_csum_v2or3(journal))
601 csum_size = sizeof(struct jbd2_journal_revoke_tail); 601 csum_size = sizeof(struct jbd2_journal_revoke_tail);
602 602
603 /* Make sure we have a descriptor with space left for the record */ 603 /* Make sure we have a descriptor with space left for the record */
@@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
644 struct jbd2_journal_revoke_tail *tail; 644 struct jbd2_journal_revoke_tail *tail;
645 __u32 csum; 645 __u32 csum;
646 646
647 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) 647 if (!jbd2_journal_has_csum_v2or3(j))
648 return; 648 return;
649 649
650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize - 650 tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6f0f590cc5a3..5f09370c90a8 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -763,12 +763,6 @@ static void warn_dirty_buffer(struct buffer_head *bh)
763 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); 763 bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
764} 764}
765 765
766static int sleep_on_shadow_bh(void *word)
767{
768 io_schedule();
769 return 0;
770}
771
772/* 766/*
773 * If the buffer is already part of the current transaction, then there 767 * If the buffer is already part of the current transaction, then there
774 * is nothing we need to do. If it is already part of a prior 768 * is nothing we need to do. If it is already part of a prior
@@ -906,8 +900,8 @@ repeat:
906 if (buffer_shadow(bh)) { 900 if (buffer_shadow(bh)) {
907 JBUFFER_TRACE(jh, "on shadow: sleep"); 901 JBUFFER_TRACE(jh, "on shadow: sleep");
908 jbd_unlock_bh_state(bh); 902 jbd_unlock_bh_state(bh);
909 wait_on_bit(&bh->b_state, BH_Shadow, 903 wait_on_bit_io(&bh->b_state, BH_Shadow,
910 sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE); 904 TASK_UNINTERRUPTIBLE);
911 goto repeat; 905 goto repeat;
912 } 906 }
913 907
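
[Editorial note] This is part of a tree-wide cleanup — also visible in the gfs2/super.c and fs/inode.c hunks above — in which wait_on_bit() lost its action-callback parameter: callers that just called schedule() use plain wait_on_bit(), and callers that want io_schedule() use the new wait_on_bit_io(). A kernel-style fragment showing the before/after shape of a call site (illustrative only; the flag name is hypothetical and the fragment assumes a tree that already carries this API change):

    #include <linux/wait.h>
    #include <linux/sched.h>

    #define MY_BIT_BUSY 0   /* hypothetical bit within a flags word */

    static void wait_until_idle(unsigned long *flags)
    {
        /* before: wait_on_bit(flags, MY_BIT_BUSY, my_io_action_fn,
         *                     TASK_UNINTERRUPTIBLE);
         * after, for io_schedule()-style waiting: */
        wait_on_bit_io(flags, MY_BIT_BUSY, TASK_UNINTERRUPTIBLE);
    }
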
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 009ec0b5993d..2f7a3c090489 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -202,8 +202,7 @@ struct posix_acl *jffs2_get_acl(struct inode *inode, int type)
202 } else { 202 } else {
203 acl = ERR_PTR(rc); 203 acl = ERR_PTR(rc);
204 } 204 }
205 if (value) 205 kfree(value);
206 kfree(value);
207 if (!IS_ERR(acl)) 206 if (!IS_ERR(acl))
208 set_cached_acl(inode, type, acl); 207 set_cached_acl(inode, type, acl);
209 return acl; 208 return acl;
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index 0b9a1e44e833..5698dae5d92d 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in,
94 94
95 while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { 95 while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) {
96 def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); 96 def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE);
97 def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); 97 def_strm.avail_in = min_t(unsigned long,
98 jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", 98 (*sourcelen-def_strm.total_in), def_strm.avail_out);
99 jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n",
99 def_strm.avail_in, def_strm.avail_out); 100 def_strm.avail_in, def_strm.avail_out);
100 ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); 101 ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH);
101 jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", 102 jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n",
102 def_strm.avail_in, def_strm.avail_out, 103 def_strm.avail_in, def_strm.avail_out,
103 def_strm.total_in, def_strm.total_out); 104 def_strm.total_in, def_strm.total_out);
104 if (ret != Z_OK) { 105 if (ret != Z_OK) {
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 413ef89c2d1b..046fee8b6e9b 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -134,8 +134,6 @@ struct jffs2_sb_info {
134 struct rw_semaphore wbuf_sem; /* Protects the write buffer */ 134 struct rw_semaphore wbuf_sem; /* Protects the write buffer */
135 135
136 struct delayed_work wbuf_dwork; /* write-buffer write-out work */ 136 struct delayed_work wbuf_dwork; /* write-buffer write-out work */
137 int wbuf_queued; /* non-zero delayed work is queued */
138 spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */
139 137
140 unsigned char *oobbuf; 138 unsigned char *oobbuf;
141 int oobavail; /* How many bytes are available for JFFS2 in OOB */ 139 int oobavail; /* How many bytes are available for JFFS2 in OOB */
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a6597d60d76d..09ed55190ee2 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1162,10 +1162,6 @@ static void delayed_wbuf_sync(struct work_struct *work)
1162 struct jffs2_sb_info *c = work_to_sb(work); 1162 struct jffs2_sb_info *c = work_to_sb(work);
1163 struct super_block *sb = OFNI_BS_2SFFJ(c); 1163 struct super_block *sb = OFNI_BS_2SFFJ(c);
1164 1164
1165 spin_lock(&c->wbuf_dwork_lock);
1166 c->wbuf_queued = 0;
1167 spin_unlock(&c->wbuf_dwork_lock);
1168
1169 if (!(sb->s_flags & MS_RDONLY)) { 1165 if (!(sb->s_flags & MS_RDONLY)) {
1170 jffs2_dbg(1, "%s()\n", __func__); 1166 jffs2_dbg(1, "%s()\n", __func__);
1171 jffs2_flush_wbuf_gc(c, 0); 1167 jffs2_flush_wbuf_gc(c, 0);
@@ -1180,14 +1176,9 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c)
1180 if (sb->s_flags & MS_RDONLY) 1176 if (sb->s_flags & MS_RDONLY)
1181 return; 1177 return;
1182 1178
1183 spin_lock(&c->wbuf_dwork_lock); 1179 delay = msecs_to_jiffies(dirty_writeback_interval * 10);
1184 if (!c->wbuf_queued) { 1180 if (queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay))
1185 jffs2_dbg(1, "%s()\n", __func__); 1181 jffs2_dbg(1, "%s()\n", __func__);
1186 delay = msecs_to_jiffies(dirty_writeback_interval * 10);
1187 queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay);
1188 c->wbuf_queued = 1;
1189 }
1190 spin_unlock(&c->wbuf_dwork_lock);
1191} 1182}
1192 1183
1193int jffs2_nand_flash_setup(struct jffs2_sb_info *c) 1184int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
@@ -1211,7 +1202,6 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
1211 1202
1212 /* Initialise write buffer */ 1203 /* Initialise write buffer */
1213 init_rwsem(&c->wbuf_sem); 1204 init_rwsem(&c->wbuf_sem);
1214 spin_lock_init(&c->wbuf_dwork_lock);
1215 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); 1205 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1216 c->wbuf_pagesize = c->mtd->writesize; 1206 c->wbuf_pagesize = c->mtd->writesize;
1217 c->wbuf_ofs = 0xFFFFFFFF; 1207 c->wbuf_ofs = 0xFFFFFFFF;
@@ -1251,7 +1241,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
1251 1241
1252 /* Initialize write buffer */ 1242 /* Initialize write buffer */
1253 init_rwsem(&c->wbuf_sem); 1243 init_rwsem(&c->wbuf_sem);
1254 spin_lock_init(&c->wbuf_dwork_lock);
1255 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); 1244 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1256 c->wbuf_pagesize = c->mtd->erasesize; 1245 c->wbuf_pagesize = c->mtd->erasesize;
1257 1246
@@ -1311,7 +1300,6 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
1311 1300
1312 /* Initialize write buffer */ 1301 /* Initialize write buffer */
1313 init_rwsem(&c->wbuf_sem); 1302 init_rwsem(&c->wbuf_sem);
1314 spin_lock_init(&c->wbuf_dwork_lock);
1315 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); 1303 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1316 1304
1317 c->wbuf_pagesize = c->mtd->writesize; 1305 c->wbuf_pagesize = c->mtd->writesize;
@@ -1346,7 +1334,6 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
1346 return 0; 1334 return 0;
1347 1335
1348 init_rwsem(&c->wbuf_sem); 1336 init_rwsem(&c->wbuf_sem);
1349 spin_lock_init(&c->wbuf_dwork_lock);
1350 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); 1337 INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
1351 1338
1352 c->wbuf_pagesize = c->mtd->writesize; 1339 c->wbuf_pagesize = c->mtd->writesize;
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index ad0f2e2a1700..d72817ac51f6 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -756,8 +756,7 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
756 for (i=0; i < XATTRINDEX_HASHSIZE; i++) { 756 for (i=0; i < XATTRINDEX_HASHSIZE; i++) {
757 list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { 757 list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) {
758 list_del(&xd->xindex); 758 list_del(&xd->xindex);
759 if (xd->xname) 759 kfree(xd->xname);
760 kfree(xd->xname);
761 jffs2_free_xattr_datum(xd); 760 jffs2_free_xattr_datum(xd);
762 } 761 }
763 } 762 }
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 0acddf60af55..bc462dcd7a40 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait)
1585 set_current_state(TASK_UNINTERRUPTIBLE); 1585 set_current_state(TASK_UNINTERRUPTIBLE);
1586 LOGGC_UNLOCK(log); 1586 LOGGC_UNLOCK(log);
1587 schedule(); 1587 schedule();
1588 __set_current_state(TASK_RUNNING);
1589 LOGGC_LOCK(log); 1588 LOGGC_LOCK(log);
1590 remove_wait_queue(&target->gcwait, &__wait); 1589 remove_wait_queue(&target->gcwait, &__wait);
1591 } 1590 }
@@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg)
2359 set_current_state(TASK_INTERRUPTIBLE); 2358 set_current_state(TASK_INTERRUPTIBLE);
2360 spin_unlock_irq(&log_redrive_lock); 2359 spin_unlock_irq(&log_redrive_lock);
2361 schedule(); 2360 schedule();
2362 __set_current_state(TASK_RUNNING);
2363 } 2361 }
2364 } while (!kthread_should_stop()); 2362 } while (!kthread_should_stop());
2365 2363
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 564c4f279ac6..d595856453b2 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
136 set_current_state(TASK_UNINTERRUPTIBLE); 136 set_current_state(TASK_UNINTERRUPTIBLE);
137 TXN_UNLOCK(); 137 TXN_UNLOCK();
138 io_schedule(); 138 io_schedule();
139 __set_current_state(TASK_RUNNING);
140 remove_wait_queue(event, &wait); 139 remove_wait_queue(event, &wait);
141} 140}
142 141
@@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg)
2808 set_current_state(TASK_INTERRUPTIBLE); 2807 set_current_state(TASK_INTERRUPTIBLE);
2809 LAZY_UNLOCK(flags); 2808 LAZY_UNLOCK(flags);
2810 schedule(); 2809 schedule();
2811 __set_current_state(TASK_RUNNING);
2812 remove_wait_queue(&jfs_commit_thread_wait, &wq); 2810 remove_wait_queue(&jfs_commit_thread_wait, &wq);
2813 } 2811 }
2814 } while (!kthread_should_stop()); 2812 } while (!kthread_should_stop());
@@ -2996,7 +2994,6 @@ int jfs_sync(void *arg)
2996 set_current_state(TASK_INTERRUPTIBLE); 2994 set_current_state(TASK_INTERRUPTIBLE);
2997 TXN_UNLOCK(); 2995 TXN_UNLOCK();
2998 schedule(); 2996 schedule();
2999 __set_current_state(TASK_RUNNING);
3000 } 2997 }
3001 } while (!kthread_should_stop()); 2998 } while (!kthread_should_stop());
3002 2999
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index adf8cb045b9e..93e897e588a8 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -550,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
550 inode->i_ino = 0; 550 inode->i_ino = 0;
551 inode->i_size = sb->s_bdev->bd_inode->i_size; 551 inode->i_size = sb->s_bdev->bd_inode->i_size;
552 inode->i_mapping->a_ops = &jfs_metapage_aops; 552 inode->i_mapping->a_ops = &jfs_metapage_aops;
553 insert_inode_hash(inode); 553 hlist_add_fake(&inode->i_hash);
554 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 554 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
555 555
556 sbi->direct_inode = inode; 556 sbi->direct_inode = inode;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a693f5b01ae6..1c771931bb60 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -463,21 +463,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
463 goto out_bad; 463 goto out_bad;
464 464
465 mutex_unlock(&kernfs_mutex); 465 mutex_unlock(&kernfs_mutex);
466out_valid:
467 return 1; 466 return 1;
468out_bad: 467out_bad:
469 mutex_unlock(&kernfs_mutex); 468 mutex_unlock(&kernfs_mutex);
470out_bad_unlocked: 469out_bad_unlocked:
471 /*
472 * @dentry doesn't match the underlying kernfs node, drop the
473 * dentry and force lookup. If we have submounts we must allow the
474 * vfs caches to lie about the state of the filesystem to prevent
475 * leaks and other nasty things, so use check_submounts_and_drop()
476 * instead of d_drop().
477 */
478 if (check_submounts_and_drop(dentry) != 0)
479 goto out_valid;
480
481 return 0; 470 return 0;
482} 471}
483 472
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index d895b4b7b661..4429d6d9217f 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -896,7 +896,7 @@ const struct file_operations kernfs_file_fops = {
896 * @ops: kernfs operations for the file 896 * @ops: kernfs operations for the file
897 * @priv: private data for the file 897 * @priv: private data for the file
898 * @ns: optional namespace tag of the file 898 * @ns: optional namespace tag of the file
899 * @static_name: don't copy file name 899 * @name_is_static: don't copy file name
900 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep 900 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
901 * 901 *
902 * Returns the created node on success, ERR_PTR() value on error. 902 * Returns the created node on success, ERR_PTR() value on error.
diff --git a/fs/libfs.c b/fs/libfs.c
index 88e3e00e2eca..171d2846f2a3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1075,3 +1075,21 @@ struct inode *alloc_anon_inode(struct super_block *s)
1075 return inode; 1075 return inode;
1076} 1076}
1077EXPORT_SYMBOL(alloc_anon_inode); 1077EXPORT_SYMBOL(alloc_anon_inode);
1078
1079/**
1080 * simple_nosetlease - generic helper for prohibiting leases
1081 * @filp: file pointer
1082 * @arg: type of lease to obtain
1083 * @flp: new lease supplied for insertion
1084 * @priv: private data for lm_setup operation
1085 *
1086 * Generic helper for filesystems that do not wish to allow leases to be set.
1087 * All arguments are ignored and it just returns -EINVAL.
1088 */
1089int
1090simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
1091 void **priv)
1092{
1093 return -EINVAL;
1094}
1095EXPORT_SYMBOL(simple_nosetlease);
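
[Editorial note] simple_nosetlease() exists so that filesystems whose file contents are not under the VFS's exclusive control can refuse leases outright rather than hand out promises they cannot keep. Wiring it up is a one-line assignment; a hedged kernel-style fragment (the surrounding operations struct is a hypothetical placeholder, and the declaration is assumed to be in <linux/fs.h> as this series adds it):

    #include <linux/fs.h>

    static const struct file_operations myfs_file_ops = {
        .llseek   = generic_file_llseek,
        .setlease = simple_nosetlease,  /* F_SETLEASE now returns -EINVAL */
    };
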
diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile
index ca58d64374ca..9b320cc2a8cf 100644
--- a/fs/lockd/Makefile
+++ b/fs/lockd/Makefile
@@ -5,6 +5,7 @@
5obj-$(CONFIG_LOCKD) += lockd.o 5obj-$(CONFIG_LOCKD) += lockd.o
6 6
7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ 7lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \
8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o 8 svcshare.o svcproc.o svcsubs.o mon.o xdr.o
9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o 9lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o
10lockd-objs-$(CONFIG_PROC_FS) += procfs.o
10lockd-objs := $(lockd-objs-y) 11lockd-objs := $(lockd-objs-y)
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1812f026960c..9106f42c472c 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
159 159
160 msg.rpc_proc = &clnt->cl_procinfo[proc]; 160 msg.rpc_proc = &clnt->cl_procinfo[proc];
161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); 161 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
162 if (status == -ECONNREFUSED) {
163 dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n",
164 status);
165 rpc_force_rebind(clnt);
166 status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
167 }
162 if (status < 0) 168 if (status < 0)
163 dprintk("lockd: NSM upcall RPC failed, status=%d\n", 169 dprintk("lockd: NSM upcall RPC failed, status=%d\n",
164 status); 170 status);
@@ -306,11 +312,9 @@ static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
 static void nsm_init_private(struct nsm_handle *nsm)
 {
 	u64 *p = (u64 *)&nsm->sm_priv.data;
-	struct timespec ts;
 	s64 ns;
 
-	ktime_get_ts(&ts);
-	ns = timespec_to_ns(&ts);
+	ns = ktime_get_ns();
 	put_unaligned(ns, p);
 	put_unaligned((unsigned long)nsm, p + 1);
 }
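
The nsm_init_private() change is a straight substitution: ktime_get_ns() returns the monotonic clock directly in nanoseconds, so the struct timespec round-trip through ktime_get_ts() and timespec_to_ns() can be dropped. A sketch of the idiom in isolation (function names here are illustrative):

#include <linux/ktime.h>

/* Old idiom: fetch a timespec, then convert it to nanoseconds. */
static s64 example_stamp_old(void)
{
	struct timespec ts;

	ktime_get_ts(&ts);
	return timespec_to_ns(&ts);
}

/* New idiom: ask for nanoseconds directly. */
static s64 example_stamp_new(void)
{
	return ktime_get_ns();
}
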
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 5010b55628b4..097bfa3adb1c 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -11,7 +11,6 @@ struct lockd_net {
 
 	struct delayed_work grace_period_end;
 	struct lock_manager lockd_manager;
-	struct list_head grace_list;
 
 	spinlock_t nsm_clnt_lock;
 	unsigned int nsm_users;
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
new file mode 100644
index 000000000000..2a0a98480e39
--- /dev/null
+++ b/fs/lockd/procfs.c
@@ -0,0 +1,92 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <net/net_namespace.h>
+
+#include "netns.h"
+#include "procfs.h"
+
+/*
+ * We only allow strings that start with 'Y', 'y', or '1'.
+ */
+static ssize_t
+nlm_end_grace_write(struct file *file, const char __user *buf, size_t size,
+		    loff_t *pos)
+{
+	char *data;
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+
+	if (size < 1)
+		return -EINVAL;
+
+	data = simple_transaction_get(file, buf, size);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	switch(data[0]) {
+	case 'Y':
+	case 'y':
+	case '1':
+		locks_end_grace(&ln->lockd_manager);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return size;
+}
+
+static ssize_t
+nlm_end_grace_read(struct file *file, char __user *buf, size_t size,
+		   loff_t *pos)
+{
+	struct lockd_net *ln = net_generic(current->nsproxy->net_ns,
+					   lockd_net_id);
+	char resp[3];
+
+	resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N';
+	resp[1] = '\n';
+	resp[2] = '\0';
+
+	return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp));
+}
+
+static const struct file_operations lockd_end_grace_operations = {
+	.write		= nlm_end_grace_write,
+	.read		= nlm_end_grace_read,
+	.llseek		= default_llseek,
+	.release	= simple_transaction_release,
+	.owner		= THIS_MODULE,
+};
+
+int __init
+lockd_create_procfs(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_mkdir("fs/lockd", NULL);
+	if (!entry)
+		return -ENOMEM;
+	entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry,
+			    &lockd_end_grace_operations);
+	if (!entry) {
+		remove_proc_entry("fs/lockd", NULL);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+void __exit
+lockd_remove_procfs(void)
+{
+	remove_proc_entry("fs/lockd/nlm_end_grace", NULL);
+	remove_proc_entry("fs/lockd", NULL);
+}
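
The new file exposes a per-network-namespace knob at /proc/fs/lockd/nlm_end_grace: reading it reports whether lockd's grace period has ended ("Y" or "N"), and writing 'Y', 'y' or '1' ends it immediately via locks_end_grace(). A hedged userspace sketch, with the path and semantics taken from the code above and error handling kept minimal:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char state[3];
	int fd;

	/* Report whether the grace period is already over. */
	fd = open("/proc/fs/lockd/nlm_end_grace", O_RDONLY);
	if (fd < 0)
		return 1;
	if (read(fd, state, sizeof(state)) > 0)
		printf("grace period ended: %c\n", state[0]);
	close(fd);

	/* Ask lockd to end its grace period right now. */
	fd = open("/proc/fs/lockd/nlm_end_grace", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "Y", 1) != 1)
		return 1;
	close(fd);
	return 0;
}
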
diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h
new file mode 100644
index 000000000000..2257a1311027
--- /dev/null
+++ b/fs/lockd/procfs.h
@@ -0,0 +1,28 @@
+/*
+ * Procfs support for lockd
+ *
+ * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com>
+ */
+#ifndef _LOCKD_PROCFS_H
+#define _LOCKD_PROCFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_ENABLED(CONFIG_PROC_FS)
+int lockd_create_procfs(void);
+void lockd_remove_procfs(void);
+#else
+static inline int
+lockd_create_procfs(void)
+{
+	return 0;
+}
+
+static inline void
+lockd_remove_procfs(void)
+{
+	return;
+}
+#endif /* IS_ENABLED(CONFIG_PROC_FS) */
+
+#endif /* _LOCKD_PROCFS_H */
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93f8d2e..d1bb7ecfd201 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -36,6 +36,7 @@
 #include <linux/nfs.h>
 
 #include "netns.h"
+#include "procfs.h"
 
 #define NLMDBG_FACILITY		NLMDBG_SVC
 #define LOCKD_BUFSIZE		(1024 + NLMSVC_XDRSIZE)
@@ -253,13 +254,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
 
 	error = make_socks(serv, net);
 	if (error < 0)
-		goto err_socks;
+		goto err_bind;
 	set_grace_period(net);
 	dprintk("lockd_up_net: per-net data created; net=%p\n", net);
 	return 0;
 
-err_socks:
-	svc_rpcb_cleanup(serv, net);
 err_bind:
 	ln->nlmsvc_users--;
 	return error;
@@ -306,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv)
 	svc_sock_update_bufs(serv);
 	serv->sv_maxconn = nlm_max_connections;
 
-	nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name);
+	nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
 	if (IS_ERR(nlmsvc_task)) {
 		error = PTR_ERR(nlmsvc_task);
 		printk(KERN_WARNING
 			"lockd_up: kthread_run failed, error=%d\n", error);
 		goto out_task;
 	}
+	nlmsvc_rqst->rq_task = nlmsvc_task;
+	wake_up_process(nlmsvc_task);
+
 	dprintk("lockd_up: service started\n");
 	return 0;
 
@@ -583,7 +585,7 @@ static int lockd_init_net(struct net *net)
 	struct lockd_net *ln = net_generic(net, lockd_net_id);
 
 	INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
-	INIT_LIST_HEAD(&ln->grace_list);
+	INIT_LIST_HEAD(&ln->lockd_manager.list);
 	spin_lock_init(&ln->nsm_clnt_lock);
 	return 0;
 }
@@ -617,8 +619,15 @@ static int __init init_nlm(void)
 	err = register_pernet_subsys(&lockd_net_ops);
 	if (err)
 		goto err_pernet;
+
+	err = lockd_create_procfs();
+	if (err)
+		goto err_procfs;
+
 	return 0;
 
+err_procfs:
+	unregister_pernet_subsys(&lockd_net_ops);
 err_pernet:
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(nlm_sysctl_table);
@@ -631,6 +640,7 @@ static void __exit exit_nlm(void)
 {
 	/* FIXME: delete all NLM clients */
 	nlm_shutdown_hosts();
+	lockd_remove_procfs();
 	unregister_pernet_subsys(&lockd_net_ops);
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(nlm_sysctl_table);
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index ab798a88ec1d..13db95f54176 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -245,7 +245,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
 	block->b_daemon = rqstp->rq_server;
 	block->b_host	= host;
 	block->b_file	= file;
-	block->b_fl = NULL;
 	file->f_count++;
 
 	/* Add to file's list of blocks */
@@ -295,7 +294,6 @@ static void nlmsvc_free_block(struct kref *kref)
 	nlmsvc_freegrantargs(block->b_call);
 	nlmsvc_release_call(block->b_call);
 	nlm_release_file(block->b_file);
-	kfree(block->b_fl);
 	kfree(block);
 }
 
@@ -508,7 +506,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		struct nlm_host *host, struct nlm_lock *lock,
 		struct nlm_lock *conflock, struct nlm_cookie *cookie)
 {
-	struct nlm_block	*block = NULL;
 	int			error;
 	__be32			ret;
 
@@ -519,63 +516,26 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		(long long)lock->fl.fl_start,
 		(long long)lock->fl.fl_end);
 
-	/* Get existing block (in case client is busy-waiting) */
-	block = nlmsvc_lookup_block(file, lock);
-
-	if (block == NULL) {
-		struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-
-		if (conf == NULL)
-			return nlm_granted;
-		block = nlmsvc_create_block(rqstp, host, file, lock, cookie);
-		if (block == NULL) {
-			kfree(conf);
-			return nlm_granted;
-		}
-		block->b_fl = conf;
-	}
-	if (block->b_flags & B_QUEUED) {
-		dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n",
-			block, block->b_flags, block->b_fl);
-		if (block->b_flags & B_TIMED_OUT) {
-			nlmsvc_unlink_block(block);
-			ret = nlm_lck_denied;
-			goto out;
-		}
-		if (block->b_flags & B_GOT_CALLBACK) {
-			nlmsvc_unlink_block(block);
-			if (block->b_fl != NULL
-					&& block->b_fl->fl_type != F_UNLCK) {
-				lock->fl = *block->b_fl;
-				goto conf_lock;
-			} else {
-				ret = nlm_granted;
-				goto out;
-			}
-		}
-		ret = nlm_drop_reply;
-		goto out;
-	}
-
 	if (locks_in_grace(SVC_NET(rqstp))) {
 		ret = nlm_lck_denied_grace_period;
 		goto out;
 	}
+
 	error = vfs_test_lock(file->f_file, &lock->fl);
-	if (error == FILE_LOCK_DEFERRED) {
-		ret = nlmsvc_defer_lock_rqst(rqstp, block);
-		goto out;
-	}
 	if (error) {
+		/* We can't currently deal with deferred test requests */
+		if (error == FILE_LOCK_DEFERRED)
+			WARN_ON_ONCE(1);
+
 		ret = nlm_lck_denied_nolocks;
 		goto out;
 	}
+
 	if (lock->fl.fl_type == F_UNLCK) {
 		ret = nlm_granted;
 		goto out;
 	}
 
-conf_lock:
 	dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
 		lock->fl.fl_type, (long long)lock->fl.fl_start,
 		(long long)lock->fl.fl_end);
@@ -586,10 +546,9 @@ conf_lock:
 	conflock->fl.fl_type = lock->fl.fl_type;
 	conflock->fl.fl_start = lock->fl.fl_start;
 	conflock->fl.fl_end = lock->fl.fl_end;
+	locks_release_private(&lock->fl);
 	ret = nlm_lck_denied;
 out:
-	if (block)
-		nlmsvc_release_block(block);
 	return ret;
 }
 
@@ -660,29 +619,22 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
  * This is a callback from the filesystem for VFS file lock requests.
  * It will be used if lm_grant is defined and the filesystem can not
  * respond to the request immediately.
- * For GETLK request it will copy the reply to the nlm_block.
  * For SETLK or SETLKW request it will get the local posix lock.
  * In all cases it will move the block to the head of nlm_blocked q where
  * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the
  * deferred rpc for GETLK and SETLK.
  */
 static void
-nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf,
-			     int result)
+nlmsvc_update_deferred_block(struct nlm_block *block, int result)
 {
 	block->b_flags |= B_GOT_CALLBACK;
 	if (result == 0)
 		block->b_granted = 1;
 	else
 		block->b_flags |= B_TIMED_OUT;
-	if (conf) {
-		if (block->b_fl)
-			__locks_copy_lock(block->b_fl, conf);
-	}
 }
 
-static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
-				 int result)
+static int nlmsvc_grant_deferred(struct file_lock *fl, int result)
 {
 	struct nlm_block *block;
 	int rc = -ENOENT;
@@ -697,7 +649,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf,
 			rc = -ENOLCK;
 			break;
 		}
-		nlmsvc_update_deferred_block(block, conf, result);
+		nlmsvc_update_deferred_block(block, result);
 	} else if (result == 0)
 		block->b_granted = 1;
 
diff --git a/fs/locks.c b/fs/locks.c
index 717fbc404e6b..735b8d3fa78c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -230,8 +230,12 @@ void locks_release_private(struct file_lock *fl)
 		fl->fl_ops->fl_release_private(fl);
 		fl->fl_ops = NULL;
 	}
-	fl->fl_lmops = NULL;
 
+	if (fl->fl_lmops) {
+		if (fl->fl_lmops->lm_put_owner)
+			fl->fl_lmops->lm_put_owner(fl);
+		fl->fl_lmops = NULL;
+	}
 }
 EXPORT_SYMBOL_GPL(locks_release_private);
 
@@ -247,6 +251,18 @@ void locks_free_lock(struct file_lock *fl)
 }
 EXPORT_SYMBOL(locks_free_lock);
 
+static void
+locks_dispose_list(struct list_head *dispose)
+{
+	struct file_lock *fl;
+
+	while (!list_empty(dispose)) {
+		fl = list_first_entry(dispose, struct file_lock, fl_block);
+		list_del_init(&fl->fl_block);
+		locks_free_lock(fl);
+	}
+}
+
 void locks_init_lock(struct file_lock *fl)
 {
 	memset(fl, 0, sizeof(struct file_lock));
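
locks_dispose_list() is the heart of this fs/locks.c series: locks are now unlinked from the inode's list while the i_lock spinlock is held, collected on a local list, and only freed after the lock is dropped, since locks_free_lock() can invoke fl_release_private and lm_put_owner methods that may block. Every converted call site in the hunks below has the same shape, sketched here with hypothetical names:

/*
 * Sketch of the deferred-free pattern; "some_inode" and the loop body
 * are placeholders, not code from this file.
 */
static void example_delete_locks(struct inode *some_inode)
{
	LIST_HEAD(dispose);

	spin_lock(&some_inode->i_lock);
	/* ... unlink locks, passing &dispose to locks_delete_lock() ... */
	spin_unlock(&some_inode->i_lock);

	/* Free outside the spinlock, where blocking is allowed. */
	locks_dispose_list(&dispose);
}
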
@@ -255,21 +271,10 @@ void locks_init_lock(struct file_lock *fl)
 
 EXPORT_SYMBOL(locks_init_lock);
 
-static void locks_copy_private(struct file_lock *new, struct file_lock *fl)
-{
-	if (fl->fl_ops) {
-		if (fl->fl_ops->fl_copy_lock)
-			fl->fl_ops->fl_copy_lock(new, fl);
-		new->fl_ops = fl->fl_ops;
-	}
-	if (fl->fl_lmops)
-		new->fl_lmops = fl->fl_lmops;
-}
-
 /*
  * Initialize a new lock from an existing file_lock structure.
  */
-void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
+void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
 {
 	new->fl_owner = fl->fl_owner;
 	new->fl_pid = fl->fl_pid;
@@ -278,21 +283,30 @@ void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
 	new->fl_type = fl->fl_type;
 	new->fl_start = fl->fl_start;
 	new->fl_end = fl->fl_end;
+	new->fl_lmops = fl->fl_lmops;
 	new->fl_ops = NULL;
-	new->fl_lmops = NULL;
+
+	if (fl->fl_lmops) {
+		if (fl->fl_lmops->lm_get_owner)
+			fl->fl_lmops->lm_get_owner(new, fl);
+	}
 }
-EXPORT_SYMBOL(__locks_copy_lock);
+EXPORT_SYMBOL(locks_copy_conflock);
 
 void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
-	locks_release_private(new);
+	/* "new" must be a freshly-initialized lock */
+	WARN_ON_ONCE(new->fl_ops);
+
+	locks_copy_conflock(new, fl);
 
-	__locks_copy_lock(new, fl);
 	new->fl_file = fl->fl_file;
 	new->fl_ops = fl->fl_ops;
-	new->fl_lmops = fl->fl_lmops;
 
-	locks_copy_private(new, fl);
+	if (fl->fl_ops) {
+		if (fl->fl_ops->fl_copy_lock)
+			fl->fl_ops->fl_copy_lock(new, fl);
+	}
 }
 
 EXPORT_SYMBOL(locks_copy_lock);
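
locks_copy_conflock() (the renamed __locks_copy_lock()) and locks_release_private() above now bracket the lock owner with lm_get_owner/lm_put_owner callbacks, giving lock managers a reference-counting hook for fl_owner. A hedged sketch of what such a pair might look like for an owner structure carrying a kref; the struct and ops here are hypothetical, not from any in-tree lock manager:

struct example_owner {
	struct kref ref;
	/* ... */
};

static void example_owner_release(struct kref *ref)
{
	kfree(container_of(ref, struct example_owner, ref));
}

/* Called from locks_copy_conflock(): take a reference for the copy. */
static void example_get_owner(struct file_lock *new, struct file_lock *fl)
{
	struct example_owner *owner = fl->fl_owner;

	kref_get(&owner->ref);
	new->fl_owner = owner;
}

/* Called from locks_release_private(): drop the copy's reference. */
static void example_put_owner(struct file_lock *fl)
{
	struct example_owner *owner = fl->fl_owner;

	kref_put(&owner->ref, example_owner_release);
}

static const struct lock_manager_operations example_lm_ops = {
	.lm_get_owner	= example_get_owner,
	.lm_put_owner	= example_put_owner,
};
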
@@ -312,27 +326,27 @@ static inline int flock_translate_cmd(int cmd) {
 }
 
 /* Fill in a file_lock structure with an appropriate FLOCK lock. */
-static int flock_make_lock(struct file *filp, struct file_lock **lock,
-		unsigned int cmd)
+static struct file_lock *
+flock_make_lock(struct file *filp, unsigned int cmd)
 {
 	struct file_lock *fl;
 	int type = flock_translate_cmd(cmd);
+
 	if (type < 0)
-		return type;
+		return ERR_PTR(type);
 
 	fl = locks_alloc_lock();
 	if (fl == NULL)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	fl->fl_file = filp;
-	fl->fl_owner = (fl_owner_t)filp;
+	fl->fl_owner = filp;
 	fl->fl_pid = current->tgid;
 	fl->fl_flags = FL_FLOCK;
 	fl->fl_type = type;
 	fl->fl_end = OFFSET_MAX;
 
-	*lock = fl;
-	return 0;
+	return fl;
 }
 
 static int assign_type(struct file_lock *fl, long type)
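
flock_make_lock() now returns the lock, or an ERR_PTR-encoded errno, instead of an out-parameter plus a status int, which drops a local variable at the call site; the flock() syscall hunk later in this file shows the real caller. The calling convention in brief, as a sketch with an illustrative wrapper function:

#include <linux/err.h>

/* Caller side of the ERR_PTR convention (hypothetical caller). */
static int example_caller(struct file *filp, unsigned int cmd)
{
	struct file_lock *lock;

	lock = flock_make_lock(filp, cmd);
	if (IS_ERR(lock))
		return PTR_ERR(lock);	/* -EINVAL or -ENOMEM here */
	/* ... use lock ... */
	locks_free_lock(lock);
	return 0;
}
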
@@ -413,14 +427,34 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
 }
 
 /* default lease lock manager operations */
-static void lease_break_callback(struct file_lock *fl)
+static bool
+lease_break_callback(struct file_lock *fl)
 {
 	kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
+	return false;
+}
+
+static void
+lease_setup(struct file_lock *fl, void **priv)
+{
+	struct file *filp = fl->fl_file;
+	struct fasync_struct *fa = *priv;
+
+	/*
+	 * fasync_insert_entry() returns the old entry if any. If there was no
+	 * old entry, then it used "priv" and inserted it into the fasync list.
+	 * Clear the pointer to indicate that it shouldn't be freed.
+	 */
+	if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa))
+		*priv = NULL;
+
+	__f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 }
 
 static const struct lock_manager_operations lease_manager_ops = {
 	.lm_break = lease_break_callback,
 	.lm_change = lease_modify,
+	.lm_setup = lease_setup,
 };
 
 /*
@@ -431,7 +465,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
 	if (assign_type(fl, type) != 0)
 		return -EINVAL;
 
-	fl->fl_owner = (fl_owner_t)current->files;
+	fl->fl_owner = filp;
 	fl->fl_pid = current->tgid;
 
 	fl->fl_file = filp;
@@ -650,12 +684,16 @@ static void locks_unlink_lock(struct file_lock **thisfl_p)
  *
  * Must be called with i_lock held!
  */
-static void locks_delete_lock(struct file_lock **thisfl_p)
+static void locks_delete_lock(struct file_lock **thisfl_p,
+			      struct list_head *dispose)
 {
 	struct file_lock *fl = *thisfl_p;
 
 	locks_unlink_lock(thisfl_p);
-	locks_free_lock(fl);
+	if (dispose)
+		list_add(&fl->fl_block, dispose);
+	else
+		locks_free_lock(fl);
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. Common functionality
@@ -718,7 +756,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 			break;
 	}
 	if (cfl) {
-		__locks_copy_lock(fl, cfl);
+		locks_copy_conflock(fl, cfl);
 		if (cfl->fl_nspid)
 			fl->fl_pid = pid_vnr(cfl->fl_nspid);
 	} else
@@ -811,6 +849,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	struct inode * inode = file_inode(filp);
 	int error = 0;
 	int found = 0;
+	LIST_HEAD(dispose);
 
 	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
 		new_fl = locks_alloc_lock();
@@ -833,7 +872,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		if (request->fl_type == fl->fl_type)
 			goto out;
 		found = 1;
-		locks_delete_lock(before);
+		locks_delete_lock(before, &dispose);
 		break;
 	}
 
@@ -880,6 +919,7 @@ out:
 	spin_unlock(&inode->i_lock);
 	if (new_fl)
 		locks_free_lock(new_fl);
+	locks_dispose_list(&dispose);
 	return error;
 }
 
@@ -893,6 +933,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	struct file_lock **before;
 	int error;
 	bool added = false;
+	LIST_HEAD(dispose);
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -921,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		if (!posix_locks_conflict(request, fl))
 			continue;
 		if (conflock)
-			__locks_copy_lock(conflock, fl);
+			locks_copy_conflock(conflock, fl);
 		error = -EAGAIN;
 		if (!(request->fl_flags & FL_SLEEP))
 			goto out;
@@ -988,7 +1029,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			else
 				request->fl_end = fl->fl_end;
 			if (added) {
-				locks_delete_lock(before);
+				locks_delete_lock(before, &dispose);
 				continue;
 			}
 			request = fl;
@@ -1018,21 +1059,24 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			 * one (This may happen several times).
 			 */
 			if (added) {
-				locks_delete_lock(before);
+				locks_delete_lock(before, &dispose);
 				continue;
 			}
-			/* Replace the old lock with the new one.
-			 * Wake up anybody waiting for the old one,
-			 * as the change in lock type might satisfy
-			 * their needs.
+			/*
+			 * Replace the old lock with new_fl, and
+			 * remove the old one. It's safe to do the
+			 * insert here since we know that we won't be
+			 * using new_fl later, and that the lock is
+			 * just replacing an existing lock.
 			 */
-			locks_wake_up_blocks(fl);
-			fl->fl_start = request->fl_start;
-			fl->fl_end = request->fl_end;
-			fl->fl_type = request->fl_type;
-			locks_release_private(fl);
-			locks_copy_private(fl, request);
-			request = fl;
+			error = -ENOLCK;
+			if (!new_fl)
+				goto out;
+			locks_copy_lock(new_fl, request);
+			request = new_fl;
+			new_fl = NULL;
+			locks_delete_lock(before, &dispose);
+			locks_insert_lock(before, request);
 			added = true;
 		}
 	}
@@ -1093,6 +1137,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 		locks_free_lock(new_fl);
 	if (new_fl2)
 		locks_free_lock(new_fl2);
+	locks_dispose_list(&dispose);
 	return error;
 }
 
@@ -1155,7 +1200,6 @@ EXPORT_SYMBOL(posix_lock_file_wait);
 int locks_mandatory_locked(struct file *file)
 {
 	struct inode *inode = file_inode(file);
-	fl_owner_t owner = current->files;
 	struct file_lock *fl;
 
 	/*
@@ -1165,7 +1209,8 @@ int locks_mandatory_locked(struct file *file)
 	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
 		if (!IS_POSIX(fl))
 			continue;
-		if (fl->fl_owner != owner && fl->fl_owner != (fl_owner_t)file)
+		if (fl->fl_owner != current->files &&
+		    fl->fl_owner != file)
 			break;
 	}
 	spin_unlock(&inode->i_lock);
@@ -1205,7 +1250,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 
 	for (;;) {
 		if (filp) {
-			fl.fl_owner = (fl_owner_t)filp;
+			fl.fl_owner = filp;
 			fl.fl_flags &= ~FL_SLEEP;
 			error = __posix_lock_file(inode, &fl, NULL);
 			if (!error)
@@ -1249,7 +1294,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
 }
 
 /* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock **before, int arg)
+int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
 {
 	struct file_lock *fl = *before;
 	int error = assign_type(fl, arg);
@@ -1268,11 +1313,10 @@ int lease_modify(struct file_lock **before, int arg)
 			printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
 			fl->fl_fasync = NULL;
 		}
-		locks_delete_lock(before);
+		locks_delete_lock(before, dispose);
 	}
 	return 0;
 }
-
 EXPORT_SYMBOL(lease_modify);
 
 static bool past_time(unsigned long then)
@@ -1283,18 +1327,20 @@ static bool past_time(unsigned long then)
 	return time_after(jiffies, then);
 }
 
-static void time_out_leases(struct inode *inode)
+static void time_out_leases(struct inode *inode, struct list_head *dispose)
 {
 	struct file_lock **before;
 	struct file_lock *fl;
 
+	lockdep_assert_held(&inode->i_lock);
+
 	before = &inode->i_flock;
 	while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
 		trace_time_out_leases(inode, fl);
 		if (past_time(fl->fl_downgrade_time))
-			lease_modify(before, F_RDLCK);
+			lease_modify(before, F_RDLCK, dispose);
 		if (past_time(fl->fl_break_time))
-			lease_modify(before, F_UNLCK);
+			lease_modify(before, F_UNLCK, dispose);
 		if (fl == *before)	/* lease_modify may have freed fl */
 			before = &fl->fl_next;
 	}
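
time_out_leases() now states its locking contract with lockdep_assert_held() rather than a comment: with CONFIG_LOCKDEP enabled the kernel warns if the function is entered without inode->i_lock held, and with lockdep disabled the check compiles away. The general idiom, sketched with an illustrative function:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

/* Sketch: enforce (not merely document) a "caller holds the lock" rule. */
static void example_must_hold(spinlock_t *lock)
{
	lockdep_assert_held(lock);	/* no-op unless CONFIG_LOCKDEP */
	/* ... work that relies on the lock being held ... */
}
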
@@ -1307,6 +1353,20 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 	return locks_conflict(breaker, lease);
 }
 
+static bool
+any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+{
+	struct file_lock *fl;
+
+	lockdep_assert_held(&inode->i_lock);
+
+	for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) {
+		if (leases_conflict(fl, breaker))
+			return true;
+	}
+	return false;
+}
+
 /**
  * __break_lease - revoke all outstanding leases on file
  * @inode: the inode of the file to return
@@ -1323,12 +1383,11 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
 int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	int error = 0;
-	struct file_lock *new_fl, *flock;
-	struct file_lock *fl;
+	struct file_lock *new_fl;
+	struct file_lock *fl, **before;
 	unsigned long break_time;
-	int i_have_this_lease = 0;
-	bool lease_conflict = false;
 	int want_write = (mode & O_ACCMODE) != O_RDONLY;
+	LIST_HEAD(dispose);
 
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 	if (IS_ERR(new_fl))
@@ -1337,20 +1396,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 
 	spin_lock(&inode->i_lock);
 
-	time_out_leases(inode);
-
-	flock = inode->i_flock;
-	if ((flock == NULL) || !IS_LEASE(flock))
-		goto out;
+	time_out_leases(inode, &dispose);
 
-	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
-		if (leases_conflict(fl, new_fl)) {
-			lease_conflict = true;
-			if (fl->fl_owner == current->files)
-				i_have_this_lease = 1;
-		}
-	}
-	if (!lease_conflict)
+	if (!any_leases_conflict(inode, new_fl))
 		goto out;
 
 	break_time = 0;
@@ -1360,7 +1408,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 		break_time++;	/* so that 0 means no break time */
 	}
 
-	for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) {
+	for (before = &inode->i_flock;
+			((fl = *before) != NULL) && IS_LEASE(fl);
+			before = &fl->fl_next) {
 		if (!leases_conflict(fl, new_fl))
 			continue;
 		if (want_write) {
@@ -1369,51 +1419,56 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 			fl->fl_flags |= FL_UNLOCK_PENDING;
 			fl->fl_break_time = break_time;
 		} else {
-			if (lease_breaking(flock))
+			if (lease_breaking(inode->i_flock))
 				continue;
 			fl->fl_flags |= FL_DOWNGRADE_PENDING;
 			fl->fl_downgrade_time = break_time;
 		}
-		fl->fl_lmops->lm_break(fl);
+		if (fl->fl_lmops->lm_break(fl))
+			locks_delete_lock(before, &dispose);
 	}
 
-	if (i_have_this_lease || (mode & O_NONBLOCK)) {
+	fl = inode->i_flock;
+	if (!fl || !IS_LEASE(fl))
+		goto out;
+
+	if (mode & O_NONBLOCK) {
 		trace_break_lease_noblock(inode, new_fl);
 		error = -EWOULDBLOCK;
 		goto out;
 	}
 
 restart:
-	break_time = flock->fl_break_time;
+	break_time = inode->i_flock->fl_break_time;
 	if (break_time != 0)
 		break_time -= jiffies;
 	if (break_time == 0)
 		break_time++;
-	locks_insert_block(flock, new_fl);
+	locks_insert_block(inode->i_flock, new_fl);
 	trace_break_lease_block(inode, new_fl);
 	spin_unlock(&inode->i_lock);
+	locks_dispose_list(&dispose);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
 	spin_lock(&inode->i_lock);
 	trace_break_lease_unblock(inode, new_fl);
 	locks_delete_block(new_fl);
 	if (error >= 0) {
-		if (error == 0)
-			time_out_leases(inode);
 		/*
 		 * Wait for the next conflicting lease that has not been
 		 * broken yet
 		 */
-		for (flock = inode->i_flock; flock && IS_LEASE(flock);
-				flock = flock->fl_next) {
-			if (leases_conflict(new_fl, flock))
+		if (error == 0)
+			time_out_leases(inode, &dispose);
+		if (any_leases_conflict(inode, new_fl))
 			goto restart;
-		}
+
 		error = 0;
 	}
 
 out:
 	spin_unlock(&inode->i_lock);
+	locks_dispose_list(&dispose);
 	locks_free_lock(new_fl);
 	return error;
 }
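
Two semantic changes hide in this hunk. First, lm_break() now returns a bool: returning true tells __break_lease() to delete the lease on the spot via the dispose list, while the default lease_break_callback() earlier in this file returns false to keep the signal-and-wait behaviour. Second, the i_have_this_lease special case is gone, consistent with leases now being owned by the file rather than current->files. A sketch of a lock manager that opts for immediate removal; the ops structure is hypothetical:

/* Sketch: an lm_break that asks for the lease to be removed at once. */
static bool example_lm_break(struct file_lock *fl)
{
	/* ... notify whoever holds the lease ... */
	return true;	/* __break_lease() moves it to the dispose list */
}

static const struct lock_manager_operations example_break_ops = {
	.lm_break	= example_lm_break,
};
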
@@ -1431,8 +1486,18 @@ EXPORT_SYMBOL(__break_lease);
  */
 void lease_get_mtime(struct inode *inode, struct timespec *time)
 {
-	struct file_lock *flock = inode->i_flock;
-	if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
+	bool has_lease = false;
+	struct file_lock *flock;
+
+	if (inode->i_flock) {
+		spin_lock(&inode->i_lock);
+		flock = inode->i_flock;
+		if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
+			has_lease = true;
+		spin_unlock(&inode->i_lock);
+	}
+
+	if (has_lease)
 		*time = current_fs_time(inode->i_sb);
 	else
 		*time = inode->i_mtime;
@@ -1468,9 +1533,10 @@ int fcntl_getlease(struct file *filp)
 	struct file_lock *fl;
 	struct inode *inode = file_inode(filp);
 	int type = F_UNLCK;
+	LIST_HEAD(dispose);
 
 	spin_lock(&inode->i_lock);
-	time_out_leases(file_inode(filp));
+	time_out_leases(file_inode(filp), &dispose);
 	for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
 			fl = fl->fl_next) {
 		if (fl->fl_file == filp) {
@@ -1479,6 +1545,7 @@ int fcntl_getlease(struct file *filp)
 		}
 	}
 	spin_unlock(&inode->i_lock);
+	locks_dispose_list(&dispose);
 	return type;
 }
 
@@ -1508,13 +1575,15 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
 	return ret;
 }
 
-static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp)
+static int
+generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
+	LIST_HEAD(dispose);
 
 	lease = *flp;
 	trace_generic_add_lease(inode, lease);
@@ -1537,6 +1606,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 		return -EINVAL;
 	}
 
+	spin_lock(&inode->i_lock);
+	time_out_leases(inode, &dispose);
 	error = check_conflicting_open(dentry, arg);
 	if (error)
 		goto out;
@@ -1572,10 +1643,11 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 	}
 
 	if (my_before != NULL) {
-		error = lease->fl_lmops->lm_change(my_before, arg);
-		if (!error)
-			*flp = *my_before;
-		goto out;
+		lease = *my_before;
+		error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
+		if (error)
+			goto out;
+		goto out_setup;
 	}
 
 	error = -EINVAL;
@@ -1595,43 +1667,61 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 	smp_mb();
 	error = check_conflicting_open(dentry, arg);
 	if (error)
-		locks_unlink_lock(flp);
+		goto out_unlink;
+
+out_setup:
+	if (lease->fl_lmops->lm_setup)
+		lease->fl_lmops->lm_setup(lease, priv);
 out:
+	spin_unlock(&inode->i_lock);
+	locks_dispose_list(&dispose);
 	if (is_deleg)
 		mutex_unlock(&inode->i_mutex);
+	if (!error && !my_before)
+		*flp = NULL;
 	return error;
+out_unlink:
+	locks_unlink_lock(before);
+	goto out;
 }
 
-static int generic_delete_lease(struct file *filp, struct file_lock **flp)
+static int generic_delete_lease(struct file *filp)
 {
+	int error = -EAGAIN;
 	struct file_lock *fl, **before;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
+	LIST_HEAD(dispose);
 
-	trace_generic_delete_lease(inode, *flp);
-
+	spin_lock(&inode->i_lock);
+	time_out_leases(inode, &dispose);
 	for (before = &inode->i_flock;
 			((fl = *before) != NULL) && IS_LEASE(fl);
 			before = &fl->fl_next) {
-		if (fl->fl_file != filp)
-			continue;
-		return (*flp)->fl_lmops->lm_change(before, F_UNLCK);
+		if (fl->fl_file == filp)
+			break;
 	}
-	return -EAGAIN;
+	trace_generic_delete_lease(inode, fl);
+	if (fl)
+		error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose);
+	spin_unlock(&inode->i_lock);
+	locks_dispose_list(&dispose);
+	return error;
 }
 
 /**
  * generic_setlease - sets a lease on an open file
  * @filp: file pointer
  * @arg: type of lease to obtain
  * @flp: input - file_lock to use, output - file_lock inserted
+ * @priv: private data for lm_setup (may be NULL if lm_setup
+ *        doesn't require it)
  *
  * The (input) flp->fl_lmops->lm_break function is required
  * by break_lease().
- *
- * Called with inode->i_lock held.
  */
-int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
+int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
+			void **priv)
 {
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
@@ -1645,83 +1735,52 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (error)
 		return error;
 
-	time_out_leases(inode);
-
-	BUG_ON(!(*flp)->fl_lmops->lm_break);
-
 	switch (arg) {
 	case F_UNLCK:
-		return generic_delete_lease(filp, flp);
+		return generic_delete_lease(filp);
 	case F_RDLCK:
 	case F_WRLCK:
-		return generic_add_lease(filp, arg, flp);
+		if (!(*flp)->fl_lmops->lm_break) {
+			WARN_ON_ONCE(1);
+			return -ENOLCK;
+		}
+		return generic_add_lease(filp, arg, flp, priv);
 	default:
 		return -EINVAL;
 	}
 }
 EXPORT_SYMBOL(generic_setlease);
 
-static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
-{
-	if (filp->f_op->setlease)
-		return filp->f_op->setlease(filp, arg, lease);
-	else
-		return generic_setlease(filp, arg, lease);
-}
-
 /**
  * vfs_setlease - sets a lease on an open file
  * @filp: file pointer
  * @arg: type of lease to obtain
- * @lease: file_lock to use
- *
- * Call this to establish a lease on the file.
- * The (*lease)->fl_lmops->lm_break operation must be set; if not,
- * break_lease will oops!
- *
- * This will call the filesystem's setlease file method, if
- * defined. Note that there is no getlease method; instead, the
- * filesystem setlease method should call back to setlease() to
- * add a lease to the inode's lease list, where fcntl_getlease() can
- * find it. Since fcntl_getlease() only reports whether the current
- * task holds a lease, a cluster filesystem need only do this for
- * leases held by processes on this node.
- *
- * There is also no break_lease method; filesystems that
- * handle their own leases should break leases themselves from the
- * filesystem's open, create, and (on truncate) setattr methods.
- *
- * Warning: the only current setlease methods exist only to disable
- * leases in certain cases. More vfs changes may be required to
- * allow a full filesystem lease implementation.
+ * @lease: file_lock to use when adding a lease
+ * @priv: private info for lm_setup when adding a lease (may be
+ *	  NULL if lm_setup doesn't require it)
+ *
+ * Call this to establish a lease on the file. The "lease" argument is not
+ * used for F_UNLCK requests and may be NULL. For commands that set or alter
+ * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set;
+ * if not, this function will return -ENOLCK (and generate a scary-looking
+ * stack trace).
+ *
+ * The "priv" pointer is passed directly to the lm_setup function as-is. It
+ * may be NULL if the lm_setup operation doesn't require it.
  */
-
-int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
+int
+vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv)
 {
-	struct inode *inode = file_inode(filp);
-	int error;
-
-	spin_lock(&inode->i_lock);
-	error = __vfs_setlease(filp, arg, lease);
-	spin_unlock(&inode->i_lock);
-
-	return error;
+	if (filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease, priv);
+	else
+		return generic_setlease(filp, arg, lease, priv);
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
-static int do_fcntl_delete_lease(struct file *filp)
-{
-	struct file_lock fl, *flp = &fl;
-
-	lease_init(filp, F_UNLCK, flp);
-
-	return vfs_setlease(filp, F_UNLCK, &flp);
-}
-
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 {
-	struct file_lock *fl, *ret;
-	struct inode *inode = file_inode(filp);
+	struct file_lock *fl;
 	struct fasync_struct *new;
 	int error;
 
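
Taken together, the setlease plumbing now looks like this from a caller's point of view: the file_lock is consumed on success (*lease set to NULL unless an existing lease was modified), F_UNLCK takes NULL for both lease and priv, and the priv cookie is handed through untouched to lm_setup. A hedged sketch of the two call shapes, mirroring fcntl_setlease() and do_fcntl_add_lease() below; the wrapper name is illustrative, and the fa_fd setup is elided:

static int example_set_lease(struct file *filp, long arg)
{
	struct file_lock *fl;
	struct fasync_struct *new;	/* priv cookie for lease_setup() */
	int error;

	if (arg == F_UNLCK)		/* removal needs no lock or priv */
		return vfs_setlease(filp, F_UNLCK, NULL, NULL);

	fl = lease_alloc(filp, arg);
	if (IS_ERR(fl))
		return PTR_ERR(fl);

	new = fasync_alloc();
	if (!new) {
		locks_free_lock(fl);
		return -ENOMEM;
	}
	/* new->fa_fd would be set here, as do_fcntl_add_lease() does */

	error = vfs_setlease(filp, arg, &fl, (void **)&new);
	if (fl)				/* not consumed: free it ourselves */
		locks_free_lock(fl);
	if (new)			/* lm_setup didn't take it */
		fasync_free(new);
	return error;
}
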
@@ -1734,30 +1793,11 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
 		locks_free_lock(fl);
 		return -ENOMEM;
 	}
-	ret = fl;
-	spin_lock(&inode->i_lock);
-	error = __vfs_setlease(filp, arg, &ret);
-	if (error) {
-		spin_unlock(&inode->i_lock);
-		locks_free_lock(fl);
-		goto out_free_fasync;
-	}
-	if (ret != fl)
-		locks_free_lock(fl);
-
-	/*
-	 * fasync_insert_entry() returns the old entry if any.
-	 * If there was no old entry, then it used 'new' and
-	 * inserted it into the fasync list. Clear new so that
-	 * we don't release it here.
-	 */
-	if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new))
-		new = NULL;
-
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	spin_unlock(&inode->i_lock);
+	new->fa_fd = fd;
 
-out_free_fasync:
+	error = vfs_setlease(filp, arg, &fl, (void **)&new);
+	if (fl)
+		locks_free_lock(fl);
 	if (new)
 		fasync_free(new);
 	return error;
@@ -1776,7 +1816,7 @@ out_free_fasync:
 int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
 {
 	if (arg == F_UNLCK)
-		return do_fcntl_delete_lease(filp);
+		return vfs_setlease(filp, F_UNLCK, NULL, NULL);
 	return do_fcntl_add_lease(fd, filp, arg);
 }
 
@@ -1845,9 +1885,12 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	    !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
-	error = flock_make_lock(f.file, &lock, cmd);
-	if (error)
+	lock = flock_make_lock(f.file, cmd);
+	if (IS_ERR(lock)) {
+		error = PTR_ERR(lock);
 		goto out_putf;
+	}
+
 	if (can_sleep)
 		lock->fl_flags |= FL_SLEEP;
 
@@ -1948,7 +1991,7 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 
 		cmd = F_GETLK;
 		file_lock.fl_flags |= FL_OFDLCK;
-		file_lock.fl_owner = (fl_owner_t)filp;
+		file_lock.fl_owner = filp;
 	}
 
 	error = vfs_test_lock(filp, &file_lock);
@@ -1959,11 +2002,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 	if (file_lock.fl_type != F_UNLCK) {
 		error = posix_lock_to_flock(&flock, &file_lock);
 		if (error)
-			goto out;
+			goto rel_priv;
 	}
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
 		error = 0;
+rel_priv:
+	locks_release_private(&file_lock);
 out:
 	return error;
 }
@@ -2103,7 +2148,7 @@ again:
 
 		cmd = F_SETLK;
 		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = (fl_owner_t)filp;
+		file_lock->fl_owner = filp;
 		break;
 	case F_OFD_SETLKW:
 		error = -EINVAL;
@@ -2112,7 +2157,7 @@ again:
 
 		cmd = F_SETLKW;
 		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = (fl_owner_t)filp;
+		file_lock->fl_owner = filp;
 		/* Fallthrough */
 	case F_SETLKW:
 		file_lock->fl_flags |= FL_SLEEP;
@@ -2170,7 +2215,7 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 
 		cmd = F_GETLK64;
 		file_lock.fl_flags |= FL_OFDLCK;
-		file_lock.fl_owner = (fl_owner_t)filp;
+		file_lock.fl_owner = filp;
 	}
 
 	error = vfs_test_lock(filp, &file_lock);
@@ -2184,7 +2229,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 	error = -EFAULT;
 	if (!copy_to_user(l, &flock, sizeof(flock)))
 		error = 0;
 
+	locks_release_private(&file_lock);
 out:
 	return error;
 }
@@ -2242,7 +2288,7 @@ again:
 
 		cmd = F_SETLK64;
 		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = (fl_owner_t)filp;
+		file_lock->fl_owner = filp;
 		break;
 	case F_OFD_SETLKW:
 		error = -EINVAL;
@@ -2251,7 +2297,7 @@ again:
 
 		cmd = F_SETLKW64;
 		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = (fl_owner_t)filp;
+		file_lock->fl_owner = filp;
 		/* Fallthrough */
 	case F_SETLKW64:
 		file_lock->fl_flags |= FL_SLEEP;
@@ -2320,15 +2366,16 @@ void locks_remove_file(struct file *filp)
 	struct inode * inode = file_inode(filp);
 	struct file_lock *fl;
 	struct file_lock **before;
+	LIST_HEAD(dispose);
 
 	if (!inode->i_flock)
 		return;
 
-	locks_remove_posix(filp, (fl_owner_t)filp);
+	locks_remove_posix(filp, filp);
 
 	if (filp->f_op->flock) {
 		struct file_lock fl = {
-			.fl_owner = (fl_owner_t)filp,
+			.fl_owner = filp,
 			.fl_pid = current->tgid,
 			.fl_file = filp,
 			.fl_flags = FL_FLOCK,
@@ -2346,7 +2393,7 @@ void locks_remove_file(struct file *filp)
 	while ((fl = *before) != NULL) {
 		if (fl->fl_file == filp) {
 			if (IS_LEASE(fl)) {
-				lease_modify(before, F_UNLCK);
+				lease_modify(before, F_UNLCK, &dispose);
 				continue;
 			}
 
@@ -2365,12 +2412,13 @@ void locks_remove_file(struct file *filp)
 				fl->fl_type, fl->fl_flags,
 				fl->fl_start, fl->fl_end);
 
-			locks_delete_lock(before);
+			locks_delete_lock(before, &dispose);
 			continue;
 		}
 		before = &fl->fl_next;
 	}
 	spin_unlock(&inode->i_lock);
+	locks_dispose_list(&dispose);
 }
 
 /**
@@ -2452,7 +2500,11 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 			seq_puts(f, "FLOCK  ADVISORY  ");
 		}
 	} else if (IS_LEASE(fl)) {
-		seq_puts(f, "LEASE  ");
+		if (fl->fl_flags & FL_DELEG)
+			seq_puts(f, "DELEG  ");
+		else
+			seq_puts(f, "LEASE  ");
+
 		if (lease_breaking(fl))
 			seq_puts(f, "BREAKING ");
 		else if (fl->fl_file)
@@ -2565,86 +2617,6 @@ static int __init proc_locks_init(void)
 module_init(proc_locks_init);
 #endif
 
-/**
- * lock_may_read - checks that the region is free of locks
- * @inode: the inode that is being read
- * @start: the first byte to read
- * @len: the number of bytes to read
- *
- * Emulates Windows locking requirements.  Whole-file
- * mandatory locks (share modes) can prohibit a read and
- * byte-range POSIX locks can prohibit a read if they overlap.
- *
- * N.B. this function is only ever called
- * from knfsd and ownership of locks is never checked.
- */
-int lock_may_read(struct inode *inode, loff_t start, unsigned long len)
-{
-	struct file_lock *fl;
-	int result = 1;
-
-	spin_lock(&inode->i_lock);
-	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
-		if (IS_POSIX(fl)) {
-			if (fl->fl_type == F_RDLCK)
-				continue;
-			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
-				continue;
-		} else if (IS_FLOCK(fl)) {
-			if (!(fl->fl_type & LOCK_MAND))
-				continue;
-			if (fl->fl_type & LOCK_READ)
-				continue;
-		} else
-			continue;
-		result = 0;
-		break;
-	}
-	spin_unlock(&inode->i_lock);
-	return result;
-}
-
-EXPORT_SYMBOL(lock_may_read);
-
-/**
- * lock_may_write - checks that the region is free of locks
- * @inode: the inode that is being written
- * @start: the first byte to write
- * @len: the number of bytes to write
- *
- * Emulates Windows locking requirements.  Whole-file
- * mandatory locks (share modes) can prohibit a write and
- * byte-range POSIX locks can prohibit a write if they overlap.
- *
- * N.B. this function is only ever called
- * from knfsd and ownership of locks is never checked.
- */
-int lock_may_write(struct inode *inode, loff_t start, unsigned long len)
-{
-	struct file_lock *fl;
-	int result = 1;
-
-	spin_lock(&inode->i_lock);
-	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
-		if (IS_POSIX(fl)) {
-			if ((fl->fl_end < start) || (fl->fl_start > (start + len)))
-				continue;
-		} else if (IS_FLOCK(fl)) {
-			if (!(fl->fl_type & LOCK_MAND))
-				continue;
-			if (fl->fl_type & LOCK_WRITE)
-				continue;
-		} else
-			continue;
-		result = 0;
-		break;
-	}
-	spin_unlock(&inode->i_lock);
-	return result;
-}
-
-EXPORT_SYMBOL(lock_may_write);
-
 static int __init filelock_init(void)
 {
 	int i;
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 48140315f627..380d86e1ab45 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
 /**
  * logfs_is_valid_block - check whether this block is still valid
  *
- * @sb	- superblock
- * @ofs	- block physical offset
- * @ino	- block inode number
- * @bix	- block index
- * @level - block level
+ * @sb:		superblock
+ * @ofs:	block physical offset
+ * @ino:	block inode number
+ * @bix:	block index
+ * @gc_level:	block level
  *
  * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
  * become invalid once the journal is written.
@@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block)
2226 * 2226 *
2227 * @inode: parent inode (ifile or directory) 2227 * @inode: parent inode (ifile or directory)
2228 * @buf: object to write (inode or dentry) 2228 * @buf: object to write (inode or dentry)
2229 * @n: object size 2229 * @count: object size
2230 * @_pos: object number (file position in blocks/objects) 2230 * @bix: block index
2231 * @flags: write flags 2231 * @flags: write flags
2232 * @lock: 0 if write lock is already taken, 1 otherwise
2233 * @shadow_tree: shadow below this inode 2232 * @shadow_tree: shadow below this inode
2234 * 2233 *
2235 * FIXME: All callers of this put a 200-300 byte variable on the stack, 2234 * FIXME: All callers of this put a 200-300 byte variable on the stack,
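Both logfs hunks are kernel-doc repairs: the "@param - text" form becomes the "@param: text" form that scripts/kernel-doc parses, parameter names are synced to the actual signatures, and a stale @lock entry is dropped. The expected shape, as a hypothetical example:

    /**
     * my_func - one-line summary (hypothetical function, for illustration)
     * @sb:  superblock being operated on
     * @ofs: byte offset within the device
     *
     * Entries must use "@name: description" and match the real parameter
     * list, otherwise scripts/kernel-doc emits warnings.
     */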
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 4bc50dac8e97..742942a983be 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -96,7 +96,7 @@ int minix_new_block(struct inode * inode)
96unsigned long minix_count_free_blocks(struct super_block *sb) 96unsigned long minix_count_free_blocks(struct super_block *sb)
97{ 97{
98 struct minix_sb_info *sbi = minix_sb(sb); 98 struct minix_sb_info *sbi = minix_sb(sb);
99 u32 bits = sbi->s_nzones - (sbi->s_firstdatazone + 1); 99 u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1;
100 100
101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits) 101 return (count_free(sbi->s_zmap, sb->s_blocksize, bits)
102 << sbi->s_log_zone_size); 102 << sbi->s_log_zone_size);
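The arithmetic change is easiest to see with numbers. The zone bitmap needs one bit per data zone plus, by minix convention, the reserved bit 0 (that reserved bit is an assumption stated here, not visible in the hunk), so the count is s_nzones - s_firstdatazone + 1; the old expression undercounted by two:

    #include <stdio.h>

    int main(void)
    {
            unsigned s_nzones = 1000, s_firstdatazone = 100;      /* illustrative values */
            unsigned old_bits = s_nzones - (s_firstdatazone + 1); /* 899: two short */
            unsigned new_bits = s_nzones - s_firstdatazone + 1;   /* 901: 900 zones + bit 0 */
            printf("old=%u new=%u\n", old_bits, new_bits);
            return 0;
    }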
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f007a3355570..3f57af196a7d 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -267,12 +267,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
267 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize); 267 block = minix_blocks_needed(sbi->s_ninodes, s->s_blocksize);
268 if (sbi->s_imap_blocks < block) { 268 if (sbi->s_imap_blocks < block) {
269 printk("MINIX-fs: file system does not have enough " 269 printk("MINIX-fs: file system does not have enough "
270 "imap blocks allocated. Refusing to mount\n"); 270 "imap blocks allocated. Refusing to mount.\n");
271 goto out_no_bitmap; 271 goto out_no_bitmap;
272 } 272 }
273 273
274 block = minix_blocks_needed( 274 block = minix_blocks_needed(
275 (sbi->s_nzones - (sbi->s_firstdatazone + 1)), 275 (sbi->s_nzones - sbi->s_firstdatazone + 1),
276 s->s_blocksize); 276 s->s_blocksize);
277 if (sbi->s_zmap_blocks < block) { 277 if (sbi->s_zmap_blocks < block) {
278 printk("MINIX-fs: file system does not have enough " 278 printk("MINIX-fs: file system does not have enough "
diff --git a/fs/mount.h b/fs/mount.h
index d55297f2fa05..f82c62840905 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -21,6 +21,7 @@ struct mnt_pcp {
21struct mountpoint { 21struct mountpoint {
22 struct hlist_node m_hash; 22 struct hlist_node m_hash;
23 struct dentry *m_dentry; 23 struct dentry *m_dentry;
24 struct hlist_head m_list;
24 int m_count; 25 int m_count;
25}; 26};
26 27
@@ -29,7 +30,10 @@ struct mount {
29 struct mount *mnt_parent; 30 struct mount *mnt_parent;
30 struct dentry *mnt_mountpoint; 31 struct dentry *mnt_mountpoint;
31 struct vfsmount mnt; 32 struct vfsmount mnt;
32 struct rcu_head mnt_rcu; 33 union {
34 struct rcu_head mnt_rcu;
35 struct llist_node mnt_llist;
36 };
33#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
34 struct mnt_pcp __percpu *mnt_pcp; 38 struct mnt_pcp __percpu *mnt_pcp;
35#else 39#else
@@ -48,6 +52,7 @@ struct mount {
48 struct mount *mnt_master; /* slave is on master->mnt_slave_list */ 52 struct mount *mnt_master; /* slave is on master->mnt_slave_list */
49 struct mnt_namespace *mnt_ns; /* containing namespace */ 53 struct mnt_namespace *mnt_ns; /* containing namespace */
50 struct mountpoint *mnt_mp; /* where is it mounted */ 54 struct mountpoint *mnt_mp; /* where is it mounted */
55 struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */
51#ifdef CONFIG_FSNOTIFY 56#ifdef CONFIG_FSNOTIFY
52 struct hlist_head mnt_fsnotify_marks; 57 struct hlist_head mnt_fsnotify_marks;
53 __u32 mnt_fsnotify_mask; 58 __u32 mnt_fsnotify_mask;
@@ -55,7 +60,7 @@ struct mount {
55 int mnt_id; /* mount identifier */ 60 int mnt_id; /* mount identifier */
56 int mnt_group_id; /* peer group identifier */ 61 int mnt_group_id; /* peer group identifier */
57 int mnt_expiry_mark; /* true if marked for expiry */ 62 int mnt_expiry_mark; /* true if marked for expiry */
58 int mnt_pinned; 63 struct hlist_head mnt_pins;
59 struct path mnt_ex_mountpoint; 64 struct path mnt_ex_mountpoint;
60}; 65};
61 66
@@ -82,6 +87,15 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *);
82 87
83extern bool legitimize_mnt(struct vfsmount *, unsigned); 88extern bool legitimize_mnt(struct vfsmount *, unsigned);
84 89
90extern void __detach_mounts(struct dentry *dentry);
91
92static inline void detach_mounts(struct dentry *dentry)
93{
94 if (!d_mountpoint(dentry))
95 return;
96 __detach_mounts(dentry);
97}
98
85static inline void get_mnt_ns(struct mnt_namespace *ns) 99static inline void get_mnt_ns(struct mnt_namespace *ns)
86{ 100{
87 atomic_inc(&ns->count); 101 atomic_inc(&ns->count);
@@ -112,3 +126,12 @@ struct proc_mounts {
112#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) 126#define proc_mounts(p) (container_of((p), struct proc_mounts, m))
113 127
114extern const struct seq_operations mounts_op; 128extern const struct seq_operations mounts_op;
129
130extern bool __is_local_mountpoint(struct dentry *dentry);
131static inline bool is_local_mountpoint(struct dentry *dentry)
132{
133 if (!d_mountpoint(dentry))
134 return false;
135
136 return __is_local_mountpoint(dentry);
137}
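is_local_mountpoint() follows the same shape as detach_mounts() above: an inline test of a dentry flag handles the overwhelmingly common "not a mountpoint" case with no locking, and only real candidates call out of line into code that takes namespace_sem. A userspace analogue of that fast-path/slow-path split (every name below is invented for illustration):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct node {
            bool maybe_mounted;     /* cheap, possibly stale hint */
            int id;
    };

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static int mounted_ids[8];
    static int nr_mounted;

    static bool slow_is_mounted(const struct node *n)
    {
            bool hit = false;
            pthread_mutex_lock(&table_lock);        /* authoritative, locked walk */
            for (int i = 0; i < nr_mounted; i++)
                    if (mounted_ids[i] == n->id)
                            hit = true;
            pthread_mutex_unlock(&table_lock);
            return hit;
    }

    static inline bool is_mounted(const struct node *n)
    {
            if (!n->maybe_mounted)                  /* fast path: no lock taken */
                    return false;
            return slow_is_mounted(n);
    }

    int main(void)
    {
            struct node n = { .maybe_mounted = false, .id = 7 };
            printf("%d\n", is_mounted(&n));         /* 0, lock never touched */
            return 0;
    }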
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f9ed622274f..3e79220babac 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
28#include <linux/backing-dev.h> 28#include <linux/backing-dev.h>
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/cleancache.h> 30#include <linux/cleancache.h>
31#include "internal.h"
31 32
32/* 33/*
33 * I/O completion handler for multipage BIOs. 34 * I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
57static struct bio *mpage_bio_submit(int rw, struct bio *bio) 58static struct bio *mpage_bio_submit(int rw, struct bio *bio)
58{ 59{
59 bio->bi_end_io = mpage_end_io; 60 bio->bi_end_io = mpage_end_io;
61 guard_bio_eod(rw, bio);
60 submit_bio(rw, bio); 62 submit_bio(rw, bio);
61 return NULL; 63 return NULL;
62} 64}
diff --git a/fs/namei.c b/fs/namei.c
index 9eb787e5c167..43927d14db67 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
34#include <linux/device_cgroup.h> 34#include <linux/device_cgroup.h>
35#include <linux/fs_struct.h> 35#include <linux/fs_struct.h>
36#include <linux/posix_acl.h> 36#include <linux/posix_acl.h>
37#include <linux/hash.h>
37#include <asm/uaccess.h> 38#include <asm/uaccess.h>
38 39
39#include "internal.h" 40#include "internal.h"
@@ -643,24 +644,22 @@ static int complete_walk(struct nameidata *nd)
643 644
644static __always_inline void set_root(struct nameidata *nd) 645static __always_inline void set_root(struct nameidata *nd)
645{ 646{
646 if (!nd->root.mnt) 647 get_fs_root(current->fs, &nd->root);
647 get_fs_root(current->fs, &nd->root);
648} 648}
649 649
650static int link_path_walk(const char *, struct nameidata *); 650static int link_path_walk(const char *, struct nameidata *);
651 651
652static __always_inline void set_root_rcu(struct nameidata *nd) 652static __always_inline unsigned set_root_rcu(struct nameidata *nd)
653{ 653{
654 if (!nd->root.mnt) { 654 struct fs_struct *fs = current->fs;
655 struct fs_struct *fs = current->fs; 655 unsigned seq, res;
656 unsigned seq;
657 656
658 do { 657 do {
659 seq = read_seqcount_begin(&fs->seq); 658 seq = read_seqcount_begin(&fs->seq);
660 nd->root = fs->root; 659 nd->root = fs->root;
661 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); 660 res = __read_seqcount_begin(&nd->root.dentry->d_seq);
662 } while (read_seqcount_retry(&fs->seq, seq)); 661 } while (read_seqcount_retry(&fs->seq, seq));
663 } 662 return res;
664} 663}
665 664
666static void path_put_conditional(struct path *path, struct nameidata *nd) 665static void path_put_conditional(struct path *path, struct nameidata *nd)
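set_root_rcu() now returns the sampled d_seq value instead of storing it into nd->seq itself, leaving the caller (see the path_init() hunk below) to decide what to do with it. The do/while around fs->seq is the standard seqcount read pattern; a minimal userspace model of it (an illustration of the pattern, not the kernel implementation):

    #include <stdatomic.h>
    #include <stdio.h>

    struct seq_protected {
            atomic_uint seq;        /* odd while a writer is mid-update */
            int data;
    };

    static int read_stable(struct seq_protected *p)
    {
            unsigned s;
            int copy;
            do {
                    while ((s = atomic_load(&p->seq)) & 1)
                            ;                       /* writer in progress, spin */
                    copy = p->data;                 /* read the protected data */
            } while (atomic_load(&p->seq) != s);    /* retry if a writer ran */
            return copy;
    }

    static void write_update(struct seq_protected *p, int v)
    {
            atomic_fetch_add(&p->seq, 1);           /* begin write (count goes odd) */
            p->data = v;
            atomic_fetch_add(&p->seq, 1);           /* end write (count goes even) */
    }

    int main(void)
    {
            struct seq_protected p = { .seq = 0, .data = 41 };
            write_update(&p, 42);
            printf("%d\n", read_stable(&p));
            return 0;
    }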
@@ -860,7 +859,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
860 return PTR_ERR(s); 859 return PTR_ERR(s);
861 } 860 }
862 if (*s == '/') { 861 if (*s == '/') {
863 set_root(nd); 862 if (!nd->root.mnt)
863 set_root(nd);
864 path_put(&nd->path); 864 path_put(&nd->path);
865 nd->path = nd->root; 865 nd->path = nd->root;
866 path_get(&nd->root); 866 path_get(&nd->root);
@@ -1091,10 +1091,10 @@ int follow_down_one(struct path *path)
1091} 1091}
1092EXPORT_SYMBOL(follow_down_one); 1092EXPORT_SYMBOL(follow_down_one);
1093 1093
1094static inline bool managed_dentry_might_block(struct dentry *dentry) 1094static inline int managed_dentry_rcu(struct dentry *dentry)
1095{ 1095{
1096 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT && 1096 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
1097 dentry->d_op->d_manage(dentry, true) < 0); 1097 dentry->d_op->d_manage(dentry, true) : 0;
1098} 1098}
1099 1099
1100/* 1100/*
@@ -1110,11 +1110,18 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1110 * Don't forget we might have a non-mountpoint managed dentry 1110 * Don't forget we might have a non-mountpoint managed dentry
1111 * that wants to block transit. 1111 * that wants to block transit.
1112 */ 1112 */
1113 if (unlikely(managed_dentry_might_block(path->dentry))) 1113 switch (managed_dentry_rcu(path->dentry)) {
1114 case -ECHILD:
1115 default:
1114 return false; 1116 return false;
1117 case -EISDIR:
1118 return true;
1119 case 0:
1120 break;
1121 }
1115 1122
1116 if (!d_mountpoint(path->dentry)) 1123 if (!d_mountpoint(path->dentry))
1117 return true; 1124 return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1118 1125
1119 mounted = __lookup_mnt(path->mnt, path->dentry); 1126 mounted = __lookup_mnt(path->mnt, path->dentry);
1120 if (!mounted) 1127 if (!mounted)
@@ -1130,12 +1137,15 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1130 */ 1137 */
1131 *inode = path->dentry->d_inode; 1138 *inode = path->dentry->d_inode;
1132 } 1139 }
1133 return read_seqretry(&mount_lock, nd->m_seq); 1140 return !read_seqretry(&mount_lock, nd->m_seq) &&
1141 !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
1134} 1142}
1135 1143
1136static int follow_dotdot_rcu(struct nameidata *nd) 1144static int follow_dotdot_rcu(struct nameidata *nd)
1137{ 1145{
1138 set_root_rcu(nd); 1146 struct inode *inode = nd->inode;
1147 if (!nd->root.mnt)
1148 set_root_rcu(nd);
1139 1149
1140 while (1) { 1150 while (1) {
1141 if (nd->path.dentry == nd->root.dentry && 1151 if (nd->path.dentry == nd->root.dentry &&
@@ -1147,6 +1157,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1147 struct dentry *parent = old->d_parent; 1157 struct dentry *parent = old->d_parent;
1148 unsigned seq; 1158 unsigned seq;
1149 1159
1160 inode = parent->d_inode;
1150 seq = read_seqcount_begin(&parent->d_seq); 1161 seq = read_seqcount_begin(&parent->d_seq);
1151 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1162 if (read_seqcount_retry(&old->d_seq, nd->seq))
1152 goto failed; 1163 goto failed;
@@ -1156,6 +1167,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1156 } 1167 }
1157 if (!follow_up_rcu(&nd->path)) 1168 if (!follow_up_rcu(&nd->path))
1158 break; 1169 break;
1170 inode = nd->path.dentry->d_inode;
1159 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1171 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1160 } 1172 }
1161 while (d_mountpoint(nd->path.dentry)) { 1173 while (d_mountpoint(nd->path.dentry)) {
@@ -1165,11 +1177,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1165 break; 1177 break;
1166 nd->path.mnt = &mounted->mnt; 1178 nd->path.mnt = &mounted->mnt;
1167 nd->path.dentry = mounted->mnt.mnt_root; 1179 nd->path.dentry = mounted->mnt.mnt_root;
1180 inode = nd->path.dentry->d_inode;
1168 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); 1181 nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
1169 if (!read_seqretry(&mount_lock, nd->m_seq)) 1182 if (read_seqretry(&mount_lock, nd->m_seq))
1170 goto failed; 1183 goto failed;
1171 } 1184 }
1172 nd->inode = nd->path.dentry->d_inode; 1185 nd->inode = inode;
1173 return 0; 1186 return 0;
1174 1187
1175failed: 1188failed:
@@ -1248,7 +1261,8 @@ static void follow_mount(struct path *path)
1248 1261
1249static void follow_dotdot(struct nameidata *nd) 1262static void follow_dotdot(struct nameidata *nd)
1250{ 1263{
1251 set_root(nd); 1264 if (!nd->root.mnt)
1265 set_root(nd);
1252 1266
1253 while(1) { 1267 while(1) {
1254 struct dentry *old = nd->path.dentry; 1268 struct dentry *old = nd->path.dentry;
@@ -1292,7 +1306,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir,
1292 if (error < 0) { 1306 if (error < 0) {
1293 dput(dentry); 1307 dput(dentry);
1294 return ERR_PTR(error); 1308 return ERR_PTR(error);
1295 } else if (!d_invalidate(dentry)) { 1309 } else {
1310 d_invalidate(dentry);
1296 dput(dentry); 1311 dput(dentry);
1297 dentry = NULL; 1312 dentry = NULL;
1298 } 1313 }
@@ -1402,11 +1417,8 @@ static int lookup_fast(struct nameidata *nd,
1402 } 1417 }
1403 path->mnt = mnt; 1418 path->mnt = mnt;
1404 path->dentry = dentry; 1419 path->dentry = dentry;
1405 if (unlikely(!__follow_mount_rcu(nd, path, inode))) 1420 if (likely(__follow_mount_rcu(nd, path, inode)))
1406 goto unlazy; 1421 return 0;
1407 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
1408 goto unlazy;
1409 return 0;
1410unlazy: 1422unlazy:
1411 if (unlazy_walk(nd, dentry)) 1423 if (unlazy_walk(nd, dentry))
1412 return -ECHILD; 1424 return -ECHILD;
@@ -1424,10 +1436,9 @@ unlazy:
1424 dput(dentry); 1436 dput(dentry);
1425 return status; 1437 return status;
1426 } 1438 }
1427 if (!d_invalidate(dentry)) { 1439 d_invalidate(dentry);
1428 dput(dentry); 1440 dput(dentry);
1429 goto need_lookup; 1441 goto need_lookup;
1430 }
1431 } 1442 }
1432 1443
1433 path->mnt = mnt; 1444 path->mnt = mnt;
@@ -1629,8 +1640,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1629 1640
1630static inline unsigned int fold_hash(unsigned long hash) 1641static inline unsigned int fold_hash(unsigned long hash)
1631{ 1642{
1632 hash += hash >> (8*sizeof(int)); 1643 return hash_64(hash, 32);
1633 return hash;
1634} 1644}
1635 1645
1636#else /* 32-bit case */ 1646#else /* 32-bit case */
@@ -1664,9 +1674,9 @@ EXPORT_SYMBOL(full_name_hash);
1664 1674
1665/* 1675/*
1666 * Calculate the length and hash of the path component, and 1676 * Calculate the length and hash of the path component, and
1667 * return the length of the component; 1677 * return the "hash_len" as the result.
1668 */ 1678 */
1669static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1679static inline u64 hash_name(const char *name)
1670{ 1680{
1671 unsigned long a, b, adata, bdata, mask, hash, len; 1681 unsigned long a, b, adata, bdata, mask, hash, len;
1672 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; 1682 const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
@@ -1686,9 +1696,8 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1686 mask = create_zero_mask(adata | bdata); 1696 mask = create_zero_mask(adata | bdata);
1687 1697
1688 hash += a & zero_bytemask(mask); 1698 hash += a & zero_bytemask(mask);
1689 *hashp = fold_hash(hash); 1699 len += find_zero(mask);
1690 1700 return hashlen_create(fold_hash(hash), len);
1691 return len + find_zero(mask);
1692} 1701}
1693 1702
1694#else 1703#else
@@ -1706,7 +1715,7 @@ EXPORT_SYMBOL(full_name_hash);
1706 * We know there's a real path component here of at least 1715 * We know there's a real path component here of at least
1707 * one character. 1716 * one character.
1708 */ 1717 */
1709static inline unsigned long hash_name(const char *name, unsigned int *hashp) 1718static inline u64 hash_name(const char *name)
1710{ 1719{
1711 unsigned long hash = init_name_hash(); 1720 unsigned long hash = init_name_hash();
1712 unsigned long len = 0, c; 1721 unsigned long len = 0, c;
@@ -1717,8 +1726,7 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
1717 hash = partial_name_hash(c, hash); 1726 hash = partial_name_hash(c, hash);
1718 c = (unsigned char)name[len]; 1727 c = (unsigned char)name[len];
1719 } while (c && c != '/'); 1728 } while (c && c != '/');
1720 *hashp = end_name_hash(hash); 1729 return hashlen_create(end_name_hash(hash), len);
1721 return len;
1722} 1730}
1723 1731
1724#endif 1732#endif
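Both variants of hash_name() now return a single u64 "hash_len". Judging by the hashlen_len() and hashlen_create() uses in the following hunks, the component length rides in the high 32 bits and the hash in the low 32; a userspace restatement of that packing (the macro bodies below are assumptions matching that usage, not copied from the kernel headers):

    #include <stdint.h>
    #include <stdio.h>

    #define hashlen_create(hash, len) (((uint64_t)(len) << 32) | (uint32_t)(hash))
    #define hashlen_hash(hl)          ((uint32_t)(hl))
    #define hashlen_len(hl)           ((uint32_t)((hl) >> 32))

    int main(void)
    {
            uint64_t hl = hashlen_create(0xdeadbeefu, 11);  /* hash, component length */
            printf("hash=%#x len=%u\n", hashlen_hash(hl), hashlen_len(hl));
            return 0;
    }

Packing both into one register lets link_path_walk() carry a component's hash and length as a single value, which is exactly what the nd->last.hash_len assignment in the hunk below does.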
@@ -1743,20 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1743 1751
1744 /* At this point we know we have a real path component. */ 1752 /* At this point we know we have a real path component. */
1745 for(;;) { 1753 for(;;) {
1746 struct qstr this; 1754 u64 hash_len;
1747 long len;
1748 int type; 1755 int type;
1749 1756
1750 err = may_lookup(nd); 1757 err = may_lookup(nd);
1751 if (err) 1758 if (err)
1752 break; 1759 break;
1753 1760
1754 len = hash_name(name, &this.hash); 1761 hash_len = hash_name(name);
1755 this.name = name;
1756 this.len = len;
1757 1762
1758 type = LAST_NORM; 1763 type = LAST_NORM;
1759 if (name[0] == '.') switch (len) { 1764 if (name[0] == '.') switch (hashlen_len(hash_len)) {
1760 case 2: 1765 case 2:
1761 if (name[1] == '.') { 1766 if (name[1] == '.') {
1762 type = LAST_DOTDOT; 1767 type = LAST_DOTDOT;
@@ -1770,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1770 struct dentry *parent = nd->path.dentry; 1775 struct dentry *parent = nd->path.dentry;
1771 nd->flags &= ~LOOKUP_JUMPED; 1776 nd->flags &= ~LOOKUP_JUMPED;
1772 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { 1777 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1778 struct qstr this = { { .hash_len = hash_len }, .name = name };
1773 err = parent->d_op->d_hash(parent, &this); 1779 err = parent->d_op->d_hash(parent, &this);
1774 if (err < 0) 1780 if (err < 0)
1775 break; 1781 break;
1782 hash_len = this.hash_len;
1783 name = this.name;
1776 } 1784 }
1777 } 1785 }
1778 1786
1779 nd->last = this; 1787 nd->last.hash_len = hash_len;
1788 nd->last.name = name;
1780 nd->last_type = type; 1789 nd->last_type = type;
1781 1790
1782 if (!name[len]) 1791 name += hashlen_len(hash_len);
1792 if (!*name)
1783 return 0; 1793 return 0;
1784 /* 1794 /*
1785 * If it wasn't NUL, we know it was '/'. Skip that 1795 * If it wasn't NUL, we know it was '/'. Skip that
1786 * slash, and continue until no more slashes. 1796 * slash, and continue until no more slashes.
1787 */ 1797 */
1788 do { 1798 do {
1789 len++; 1799 name++;
1790 } while (unlikely(name[len] == '/')); 1800 } while (unlikely(*name == '/'));
1791 if (!name[len]) 1801 if (!*name)
1792 return 0; 1802 return 0;
1793 1803
1794 name += len;
1795
1796 err = walk_component(nd, &next, LOOKUP_FOLLOW); 1804 err = walk_component(nd, &next, LOOKUP_FOLLOW);
1797 if (err < 0) 1805 if (err < 0)
1798 return err; 1806 return err;
@@ -1847,7 +1855,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1847 if (*name=='/') { 1855 if (*name=='/') {
1848 if (flags & LOOKUP_RCU) { 1856 if (flags & LOOKUP_RCU) {
1849 rcu_read_lock(); 1857 rcu_read_lock();
1850 set_root_rcu(nd); 1858 nd->seq = set_root_rcu(nd);
1851 } else { 1859 } else {
1852 set_root(nd); 1860 set_root(nd);
1853 path_get(&nd->root); 1861 path_get(&nd->root);
@@ -1898,7 +1906,14 @@ static int path_init(int dfd, const char *name, unsigned int flags,
1898 } 1906 }
1899 1907
1900 nd->inode = nd->path.dentry->d_inode; 1908 nd->inode = nd->path.dentry->d_inode;
1901 return 0; 1909 if (!(flags & LOOKUP_RCU))
1910 return 0;
1911 if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
1912 return 0;
1913 if (!(nd->flags & LOOKUP_ROOT))
1914 nd->root.mnt = NULL;
1915 rcu_read_unlock();
1916 return -ECHILD;
1902} 1917}
1903 1918
1904static inline int lookup_last(struct nameidata *nd, struct path *path) 1919static inline int lookup_last(struct nameidata *nd, struct path *path)
@@ -1935,7 +1950,7 @@ static int path_lookupat(int dfd, const char *name,
1935 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); 1950 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1936 1951
1937 if (unlikely(err)) 1952 if (unlikely(err))
1938 return err; 1953 goto out;
1939 1954
1940 current->total_link_count = 0; 1955 current->total_link_count = 0;
1941 err = link_path_walk(name, nd); 1956 err = link_path_walk(name, nd);
@@ -1967,6 +1982,7 @@ static int path_lookupat(int dfd, const char *name,
1967 } 1982 }
1968 } 1983 }
1969 1984
1985out:
1970 if (base) 1986 if (base)
1971 fput(base); 1987 fput(base);
1972 1988
@@ -2286,7 +2302,7 @@ path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags
2286 2302
2287 err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); 2303 err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
2288 if (unlikely(err)) 2304 if (unlikely(err))
2289 return err; 2305 goto out;
2290 2306
2291 current->total_link_count = 0; 2307 current->total_link_count = 0;
2292 err = link_path_walk(name, &nd); 2308 err = link_path_walk(name, &nd);
@@ -3059,7 +3075,7 @@ opened:
3059 error = open_check_o_direct(file); 3075 error = open_check_o_direct(file);
3060 if (error) 3076 if (error)
3061 goto exit_fput; 3077 goto exit_fput;
3062 error = ima_file_check(file, op->acc_mode); 3078 error = ima_file_check(file, op->acc_mode, *opened);
3063 if (error) 3079 if (error)
3064 goto exit_fput; 3080 goto exit_fput;
3065 3081
@@ -3550,7 +3566,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3550 mutex_lock(&dentry->d_inode->i_mutex); 3566 mutex_lock(&dentry->d_inode->i_mutex);
3551 3567
3552 error = -EBUSY; 3568 error = -EBUSY;
3553 if (d_mountpoint(dentry)) 3569 if (is_local_mountpoint(dentry))
3554 goto out; 3570 goto out;
3555 3571
3556 error = security_inode_rmdir(dir, dentry); 3572 error = security_inode_rmdir(dir, dentry);
@@ -3564,6 +3580,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
3564 3580
3565 dentry->d_inode->i_flags |= S_DEAD; 3581 dentry->d_inode->i_flags |= S_DEAD;
3566 dont_mount(dentry); 3582 dont_mount(dentry);
3583 detach_mounts(dentry);
3567 3584
3568out: 3585out:
3569 mutex_unlock(&dentry->d_inode->i_mutex); 3586 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -3666,7 +3683,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
3666 return -EPERM; 3683 return -EPERM;
3667 3684
3668 mutex_lock(&target->i_mutex); 3685 mutex_lock(&target->i_mutex);
3669 if (d_mountpoint(dentry)) 3686 if (is_local_mountpoint(dentry))
3670 error = -EBUSY; 3687 error = -EBUSY;
3671 else { 3688 else {
3672 error = security_inode_unlink(dir, dentry); 3689 error = security_inode_unlink(dir, dentry);
@@ -3675,8 +3692,10 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
3675 if (error) 3692 if (error)
3676 goto out; 3693 goto out;
3677 error = dir->i_op->unlink(dir, dentry); 3694 error = dir->i_op->unlink(dir, dentry);
3678 if (!error) 3695 if (!error) {
3679 dont_mount(dentry); 3696 dont_mount(dentry);
3697 detach_mounts(dentry);
3698 }
3680 } 3699 }
3681 } 3700 }
3682out: 3701out:
@@ -4019,7 +4038,7 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
4019 * The worst of all namespace operations - renaming directory. "Perverted" 4038 * The worst of all namespace operations - renaming directory. "Perverted"
4020 * doesn't even start to describe it. Somebody in UCB had a heck of a trip... 4039 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
4021 * Problems: 4040 * Problems:
4022 * a) we can get into loop creation. Check is done in is_subdir(). 4041 * a) we can get into loop creation.
4023 * b) race potential - two innocent renames can create a loop together. 4042 * b) race potential - two innocent renames can create a loop together.
4024 * That's where 4.4 screws up. Current fix: serialization on 4043 * That's where 4.4 screws up. Current fix: serialization on
4025 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 4044 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
@@ -4075,7 +4094,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4075 if (error) 4094 if (error)
4076 return error; 4095 return error;
4077 4096
4078 if (!old_dir->i_op->rename) 4097 if (!old_dir->i_op->rename && !old_dir->i_op->rename2)
4079 return -EPERM; 4098 return -EPERM;
4080 4099
4081 if (flags && !old_dir->i_op->rename2) 4100 if (flags && !old_dir->i_op->rename2)
@@ -4111,7 +4130,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4111 mutex_lock(&target->i_mutex); 4130 mutex_lock(&target->i_mutex);
4112 4131
4113 error = -EBUSY; 4132 error = -EBUSY;
4114 if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) 4133 if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
4115 goto out; 4134 goto out;
4116 4135
4117 if (max_links && new_dir != old_dir) { 4136 if (max_links && new_dir != old_dir) {
@@ -4134,10 +4153,11 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4134 if (error) 4153 if (error)
4135 goto out; 4154 goto out;
4136 } 4155 }
4137 if (!flags) { 4156 if (!old_dir->i_op->rename2) {
4138 error = old_dir->i_op->rename(old_dir, old_dentry, 4157 error = old_dir->i_op->rename(old_dir, old_dentry,
4139 new_dir, new_dentry); 4158 new_dir, new_dentry);
4140 } else { 4159 } else {
4160 WARN_ON(old_dir->i_op->rename != NULL);
4141 error = old_dir->i_op->rename2(old_dir, old_dentry, 4161 error = old_dir->i_op->rename2(old_dir, old_dentry,
4142 new_dir, new_dentry, flags); 4162 new_dir, new_dentry, flags);
4143 } 4163 }
@@ -4148,6 +4168,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
4148 if (is_dir) 4168 if (is_dir)
4149 target->i_flags |= S_DEAD; 4169 target->i_flags |= S_DEAD;
4150 dont_mount(new_dentry); 4170 dont_mount(new_dentry);
4171 detach_mounts(new_dentry);
4151 } 4172 }
4152 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { 4173 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
4153 if (!(flags & RENAME_EXCHANGE)) 4174 if (!(flags & RENAME_EXCHANGE))
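Note the dispatch change in vfs_rename(): ->rename2 now wins whenever it is defined, even for flags == 0, and the new WARN_ON flags filesystems that define both ops. A filesystem can therefore wire up only ->rename2 and still serve ordinary renames, along the lines of this sketch (the myfs_* names are invented; error handling is elided):

    static int myfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
                            struct inode *new_dir, struct dentry *new_dentry,
                            unsigned int flags)
    {
            if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
                    return -EINVAL;
            /* ... perform the rename, honouring any flags that are set ... */
            return 0;
    }

    static const struct inode_operations myfs_dir_iops = {
            .rename2 = myfs_rename2,   /* .rename deliberately left NULL */
    };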
diff --git a/fs/namespace.c b/fs/namespace.c
index 182bc41cd887..fbba8b17330d 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -16,7 +16,6 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/security.h> 17#include <linux/security.h>
18#include <linux/idr.h> 18#include <linux/idr.h>
19#include <linux/acct.h> /* acct_auto_close_mnt */
20#include <linux/init.h> /* init_rootfs */ 19#include <linux/init.h> /* init_rootfs */
21#include <linux/fs_struct.h> /* get_fs_root et.al. */ 20#include <linux/fs_struct.h> /* get_fs_root et.al. */
22#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ 21#include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */
@@ -24,6 +23,7 @@
24#include <linux/proc_ns.h> 23#include <linux/proc_ns.h>
25#include <linux/magic.h> 24#include <linux/magic.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
26#include <linux/task_work.h>
27#include "pnode.h" 27#include "pnode.h"
28#include "internal.h" 28#include "internal.h"
29 29
@@ -225,6 +225,7 @@ static struct mount *alloc_vfsmnt(const char *name)
225 INIT_LIST_HEAD(&mnt->mnt_share); 225 INIT_LIST_HEAD(&mnt->mnt_share);
226 INIT_LIST_HEAD(&mnt->mnt_slave_list); 226 INIT_LIST_HEAD(&mnt->mnt_slave_list);
227 INIT_LIST_HEAD(&mnt->mnt_slave); 227 INIT_LIST_HEAD(&mnt->mnt_slave);
228 INIT_HLIST_NODE(&mnt->mnt_mp_list);
228#ifdef CONFIG_FSNOTIFY 229#ifdef CONFIG_FSNOTIFY
229 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
230#endif 231#endif
@@ -667,11 +668,45 @@ struct vfsmount *lookup_mnt(struct path *path)
667 return m; 668 return m;
668} 669}
669 670
670static struct mountpoint *new_mountpoint(struct dentry *dentry) 671/*
672 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
673 * current mount namespace.
674 *
675 * The common case is that dentries are not mountpoints at all, and that

676 * test is handled inline. For the slow case when we are actually
677 * dealing with a mountpoint of some kind, walk through all of the
678 * mounts in the current mount namespace and test to see if the dentry
679 * is a mountpoint.
680 *
681 * The mount_hashtable is not usable in this context because we
682 * need to identify all mounts that may be in the current mount
683 * namespace, not just a mount that happens to have some specified
684 * parent mount.
685 */
686bool __is_local_mountpoint(struct dentry *dentry)
687{
688 struct mnt_namespace *ns = current->nsproxy->mnt_ns;
689 struct mount *mnt;
690 bool is_covered = false;
691
692 if (!d_mountpoint(dentry))
693 goto out;
694
695 down_read(&namespace_sem);
696 list_for_each_entry(mnt, &ns->list, mnt_list) {
697 is_covered = (mnt->mnt_mountpoint == dentry);
698 if (is_covered)
699 break;
700 }
701 up_read(&namespace_sem);
702out:
703 return is_covered;
704}
705
706static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
671{ 707{
672 struct hlist_head *chain = mp_hash(dentry); 708 struct hlist_head *chain = mp_hash(dentry);
673 struct mountpoint *mp; 709 struct mountpoint *mp;
674 int ret;
675 710
676 hlist_for_each_entry(mp, chain, m_hash) { 711 hlist_for_each_entry(mp, chain, m_hash) {
677 if (mp->m_dentry == dentry) { 712 if (mp->m_dentry == dentry) {
@@ -682,6 +717,14 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
682 return mp; 717 return mp;
683 } 718 }
684 } 719 }
720 return NULL;
721}
722
723static struct mountpoint *new_mountpoint(struct dentry *dentry)
724{
725 struct hlist_head *chain = mp_hash(dentry);
726 struct mountpoint *mp;
727 int ret;
685 728
686 mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); 729 mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
687 if (!mp) 730 if (!mp)
@@ -696,6 +739,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry)
696 mp->m_dentry = dentry; 739 mp->m_dentry = dentry;
697 mp->m_count = 1; 740 mp->m_count = 1;
698 hlist_add_head(&mp->m_hash, chain); 741 hlist_add_head(&mp->m_hash, chain);
742 INIT_HLIST_HEAD(&mp->m_list);
699 return mp; 743 return mp;
700} 744}
701 745
@@ -703,6 +747,7 @@ static void put_mountpoint(struct mountpoint *mp)
703{ 747{
704 if (!--mp->m_count) { 748 if (!--mp->m_count) {
705 struct dentry *dentry = mp->m_dentry; 749 struct dentry *dentry = mp->m_dentry;
750 BUG_ON(!hlist_empty(&mp->m_list));
706 spin_lock(&dentry->d_lock); 751 spin_lock(&dentry->d_lock);
707 dentry->d_flags &= ~DCACHE_MOUNTED; 752 dentry->d_flags &= ~DCACHE_MOUNTED;
708 spin_unlock(&dentry->d_lock); 753 spin_unlock(&dentry->d_lock);
@@ -749,6 +794,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
749 mnt->mnt_mountpoint = mnt->mnt.mnt_root; 794 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
750 list_del_init(&mnt->mnt_child); 795 list_del_init(&mnt->mnt_child);
751 hlist_del_init_rcu(&mnt->mnt_hash); 796 hlist_del_init_rcu(&mnt->mnt_hash);
797 hlist_del_init(&mnt->mnt_mp_list);
752 put_mountpoint(mnt->mnt_mp); 798 put_mountpoint(mnt->mnt_mp);
753 mnt->mnt_mp = NULL; 799 mnt->mnt_mp = NULL;
754} 800}
@@ -765,6 +811,7 @@ void mnt_set_mountpoint(struct mount *mnt,
765 child_mnt->mnt_mountpoint = dget(mp->m_dentry); 811 child_mnt->mnt_mountpoint = dget(mp->m_dentry);
766 child_mnt->mnt_parent = mnt; 812 child_mnt->mnt_parent = mnt;
767 child_mnt->mnt_mp = mp; 813 child_mnt->mnt_mp = mp;
814 hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
768} 815}
769 816
770/* 817/*
@@ -779,6 +826,20 @@ static void attach_mnt(struct mount *mnt,
779 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); 826 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
780} 827}
781 828
829static void attach_shadowed(struct mount *mnt,
830 struct mount *parent,
831 struct mount *shadows)
832{
833 if (shadows) {
834 hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
835 list_add(&mnt->mnt_child, &shadows->mnt_child);
836 } else {
837 hlist_add_head_rcu(&mnt->mnt_hash,
838 m_hash(&parent->mnt, mnt->mnt_mountpoint));
839 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
840 }
841}
842
782/* 843/*
783 * vfsmount lock must be held for write 844 * vfsmount lock must be held for write
784 */ 845 */
@@ -797,12 +858,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows)
797 858
798 list_splice(&head, n->list.prev); 859 list_splice(&head, n->list.prev);
799 860
800 if (shadows) 861 attach_shadowed(mnt, parent, shadows);
801 hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash);
802 else
803 hlist_add_head_rcu(&mnt->mnt_hash,
804 m_hash(&parent->mnt, mnt->mnt_mountpoint));
805 list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
806 touch_mnt_namespace(n); 862 touch_mnt_namespace(n);
807} 863}
808 864
@@ -890,8 +946,21 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
890 946
891 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); 947 mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
892 /* Don't allow unprivileged users to change mount flags */ 948 /* Don't allow unprivileged users to change mount flags */
893 if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) 949 if (flag & CL_UNPRIVILEGED) {
894 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; 950 mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
951
952 if (mnt->mnt.mnt_flags & MNT_READONLY)
953 mnt->mnt.mnt_flags |= MNT_LOCK_READONLY;
954
955 if (mnt->mnt.mnt_flags & MNT_NODEV)
956 mnt->mnt.mnt_flags |= MNT_LOCK_NODEV;
957
958 if (mnt->mnt.mnt_flags & MNT_NOSUID)
959 mnt->mnt.mnt_flags |= MNT_LOCK_NOSUID;
960
961 if (mnt->mnt.mnt_flags & MNT_NOEXEC)
962 mnt->mnt.mnt_flags |= MNT_LOCK_NOEXEC;
963 }
895 964
896 /* Don't allow unprivileged users to reveal what is under a mount */ 965 /* Don't allow unprivileged users to reveal what is under a mount */
897 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) 966 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire))
@@ -936,9 +1005,48 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
936 return ERR_PTR(err); 1005 return ERR_PTR(err);
937} 1006}
938 1007
1008static void cleanup_mnt(struct mount *mnt)
1009{
1010 /*
1011 * This probably indicates that somebody messed
1012 * up a mnt_want/drop_write() pair. If this
1013 * happens, the filesystem was probably unable
1014 * to make r/w->r/o transitions.
1015 */
1016 /*
1017 * The locking used to deal with mnt_count decrement provides barriers,
1018 * so mnt_get_writers() below is safe.
1019 */
1020 WARN_ON(mnt_get_writers(mnt));
1021 if (unlikely(mnt->mnt_pins.first))
1022 mnt_pin_kill(mnt);
1023 fsnotify_vfsmount_delete(&mnt->mnt);
1024 dput(mnt->mnt.mnt_root);
1025 deactivate_super(mnt->mnt.mnt_sb);
1026 mnt_free_id(mnt);
1027 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1028}
1029
1030static void __cleanup_mnt(struct rcu_head *head)
1031{
1032 cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1033}
1034
1035static LLIST_HEAD(delayed_mntput_list);
1036static void delayed_mntput(struct work_struct *unused)
1037{
1038 struct llist_node *node = llist_del_all(&delayed_mntput_list);
1039 struct llist_node *next;
1040
1041 for (; node; node = next) {
1042 next = llist_next(node);
1043 cleanup_mnt(llist_entry(node, struct mount, mnt_llist));
1044 }
1045}
1046static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
1047
939static void mntput_no_expire(struct mount *mnt) 1048static void mntput_no_expire(struct mount *mnt)
940{ 1049{
941put_again:
942 rcu_read_lock(); 1050 rcu_read_lock();
943 mnt_add_count(mnt, -1); 1051 mnt_add_count(mnt, -1);
944 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */ 1052 if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
@@ -951,14 +1059,6 @@ put_again:
951 unlock_mount_hash(); 1059 unlock_mount_hash();
952 return; 1060 return;
953 } 1061 }
954 if (unlikely(mnt->mnt_pinned)) {
955 mnt_add_count(mnt, mnt->mnt_pinned + 1);
956 mnt->mnt_pinned = 0;
957 rcu_read_unlock();
958 unlock_mount_hash();
959 acct_auto_close_mnt(&mnt->mnt);
960 goto put_again;
961 }
962 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 1062 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
963 rcu_read_unlock(); 1063 rcu_read_unlock();
964 unlock_mount_hash(); 1064 unlock_mount_hash();
@@ -970,22 +1070,18 @@ put_again:
970 list_del(&mnt->mnt_instance); 1070 list_del(&mnt->mnt_instance);
971 unlock_mount_hash(); 1071 unlock_mount_hash();
972 1072
973 /* 1073 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
974 * This probably indicates that somebody messed 1074 struct task_struct *task = current;
975 * up a mnt_want/drop_write() pair. If this 1075 if (likely(!(task->flags & PF_KTHREAD))) {
976 * happens, the filesystem was probably unable 1076 init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
977 * to make r/w->r/o transitions. 1077 if (!task_work_add(task, &mnt->mnt_rcu, true))
978 */ 1078 return;
979 /* 1079 }
980 * The locking used to deal with mnt_count decrement provides barriers, 1080 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
981 * so mnt_get_writers() below is safe. 1081 schedule_delayed_work(&delayed_mntput_work, 1);
982 */ 1082 return;
983 WARN_ON(mnt_get_writers(mnt)); 1083 }
984 fsnotify_vfsmount_delete(&mnt->mnt); 1084 cleanup_mnt(mnt);
985 dput(mnt->mnt.mnt_root);
986 deactivate_super(mnt->mnt.mnt_sb);
987 mnt_free_id(mnt);
988 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
989} 1085}
990 1086
991void mntput(struct vfsmount *mnt) 1087void mntput(struct vfsmount *mnt)
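The rework above moves the final teardown out of mntput_no_expire() itself: for ordinary process context it queues a task_work callback that runs on return to userspace, where blocking in deactivate_super() is safe, and for kthreads it falls back to a lock-free llist drained by delayed work. Reduced to a sketch (kernel APIs as used in the hunk; obj, pending_list and drain_work are invented placeholders):

    if (!(current->flags & PF_KTHREAD)) {
            init_task_work(&obj->cb, cleanup_cb);
            if (!task_work_add(current, &obj->cb, true))
                    return;                 /* cleanup runs before userspace resumes */
    }
    if (llist_add(&obj->node, &pending_list))   /* true: list was empty */
            schedule_delayed_work(&drain_work, 1);

llist_add() returning true means this producer saw an empty list, so only the first entry schedules the drain; the worker then llist_del_all()s and cleans every queued entry. The mnt_rcu/mnt_llist union added to fs/mount.h earlier in this diff exists so the same words can serve as RCU head, task_work callback (struct rcu_head and struct callback_head share one layout), and llist node.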
@@ -1008,25 +1104,15 @@ struct vfsmount *mntget(struct vfsmount *mnt)
1008} 1104}
1009EXPORT_SYMBOL(mntget); 1105EXPORT_SYMBOL(mntget);
1010 1106
1011void mnt_pin(struct vfsmount *mnt) 1107struct vfsmount *mnt_clone_internal(struct path *path)
1012{
1013 lock_mount_hash();
1014 real_mount(mnt)->mnt_pinned++;
1015 unlock_mount_hash();
1016}
1017EXPORT_SYMBOL(mnt_pin);
1018
1019void mnt_unpin(struct vfsmount *m)
1020{ 1108{
1021 struct mount *mnt = real_mount(m); 1109 struct mount *p;
1022 lock_mount_hash(); 1110 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1023 if (mnt->mnt_pinned) { 1111 if (IS_ERR(p))
1024 mnt_add_count(mnt, 1); 1112 return ERR_CAST(p);
1025 mnt->mnt_pinned--; 1113 p->mnt.mnt_flags |= MNT_INTERNAL;
1026 } 1114 return &p->mnt;
1027 unlock_mount_hash();
1028} 1115}
1029EXPORT_SYMBOL(mnt_unpin);
1030 1116
1031static inline void mangle(struct seq_file *m, const char *s) 1117static inline void mangle(struct seq_file *m, const char *s)
1032{ 1118{
@@ -1213,6 +1299,11 @@ static void namespace_unlock(void)
1213 head.first->pprev = &head.first; 1299 head.first->pprev = &head.first;
1214 INIT_HLIST_HEAD(&unmounted); 1300 INIT_HLIST_HEAD(&unmounted);
1215 1301
1302 /* undo decrements we'd done in umount_tree() */
1303 hlist_for_each_entry(mnt, &head, mnt_hash)
1304 if (mnt->mnt_ex_mountpoint.mnt)
1305 mntget(mnt->mnt_ex_mountpoint.mnt);
1306
1216 up_write(&namespace_sem); 1307 up_write(&namespace_sem);
1217 1308
1218 synchronize_rcu(); 1309 synchronize_rcu();
@@ -1249,6 +1340,9 @@ void umount_tree(struct mount *mnt, int how)
1249 hlist_add_head(&p->mnt_hash, &tmp_list); 1340 hlist_add_head(&p->mnt_hash, &tmp_list);
1250 } 1341 }
1251 1342
1343 hlist_for_each_entry(p, &tmp_list, mnt_hash)
1344 list_del_init(&p->mnt_child);
1345
1252 if (how) 1346 if (how)
1253 propagate_umount(&tmp_list); 1347 propagate_umount(&tmp_list);
1254 1348
@@ -1259,9 +1353,10 @@ void umount_tree(struct mount *mnt, int how)
1259 p->mnt_ns = NULL; 1353 p->mnt_ns = NULL;
1260 if (how < 2) 1354 if (how < 2)
1261 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1355 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1262 list_del_init(&p->mnt_child);
1263 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1357 hlist_del_init(&p->mnt_mp_list);
1264 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1359 mnt_add_count(p->mnt_parent, -1);
1265 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* move the reference to mountpoint into ->mnt_ex_mountpoint */
1266 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
1267 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt; 1362 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
@@ -1344,6 +1439,8 @@ static int do_umount(struct mount *mnt, int flags)
1344 * Special case for "unmounting" root ... 1439 * Special case for "unmounting" root ...
1345 * we just try to remount it readonly. 1440 * we just try to remount it readonly.
1346 */ 1441 */
1442 if (!capable(CAP_SYS_ADMIN))
1443 return -EPERM;
1347 down_write(&sb->s_umount); 1444 down_write(&sb->s_umount);
1348 if (!(sb->s_flags & MS_RDONLY)) 1445 if (!(sb->s_flags & MS_RDONLY))
1349 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); 1446 retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
@@ -1373,6 +1470,37 @@ static int do_umount(struct mount *mnt, int flags)
1373 return retval; 1470 return retval;
1374} 1471}
1375 1472
1473/*
1474 * __detach_mounts - lazily unmount all mounts on the specified dentry
1475 *
1476 * During unlink, rmdir, and d_drop it is possible to lose the path
1477 * to an existing mountpoint, and wind up leaking the mount.
1478 * detach_mounts allows lazily unmounting those mounts instead of
1479 * leaking them.
1480 *
1481 * The caller may hold dentry->d_inode->i_mutex.
1482 */
1483void __detach_mounts(struct dentry *dentry)
1484{
1485 struct mountpoint *mp;
1486 struct mount *mnt;
1487
1488 namespace_lock();
1489 mp = lookup_mountpoint(dentry);
1490 if (!mp)
1491 goto out_unlock;
1492
1493 lock_mount_hash();
1494 while (!hlist_empty(&mp->m_list)) {
1495 mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
1496 umount_tree(mnt, 2);
1497 }
1498 unlock_mount_hash();
1499 put_mountpoint(mp);
1500out_unlock:
1501 namespace_unlock();
1502}
1503
1376/* 1504/*
1377 * Is the caller allowed to modify his namespace? 1505 * Is the caller allowed to modify his namespace?
1378 */ 1506 */
@@ -1492,6 +1620,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1492 continue; 1620 continue;
1493 1621
1494 for (s = r; s; s = next_mnt(s, r)) { 1622 for (s = r; s; s = next_mnt(s, r)) {
1623 struct mount *t = NULL;
1495 if (!(flag & CL_COPY_UNBINDABLE) && 1624 if (!(flag & CL_COPY_UNBINDABLE) &&
1496 IS_MNT_UNBINDABLE(s)) { 1625 IS_MNT_UNBINDABLE(s)) {
1497 s = skip_mnt_tree(s); 1626 s = skip_mnt_tree(s);
@@ -1513,7 +1642,14 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1513 goto out; 1642 goto out;
1514 lock_mount_hash(); 1643 lock_mount_hash();
1515 list_add_tail(&q->mnt_list, &res->mnt_list); 1644 list_add_tail(&q->mnt_list, &res->mnt_list);
1516 attach_mnt(q, parent, p->mnt_mp); 1645 mnt_set_mountpoint(parent, p->mnt_mp, q);
1646 if (!list_empty(&parent->mnt_mounts)) {
1647 t = list_last_entry(&parent->mnt_mounts,
1648 struct mount, mnt_child);
1649 if (t->mnt_mp != p->mnt_mp)
1650 t = NULL;
1651 }
1652 attach_shadowed(q, parent, t);
1517 unlock_mount_hash(); 1653 unlock_mount_hash();
1518 } 1654 }
1519 } 1655 }
@@ -1722,7 +1858,9 @@ retry:
1722 namespace_lock(); 1858 namespace_lock();
1723 mnt = lookup_mnt(path); 1859 mnt = lookup_mnt(path);
1724 if (likely(!mnt)) { 1860 if (likely(!mnt)) {
1725 struct mountpoint *mp = new_mountpoint(dentry); 1861 struct mountpoint *mp = lookup_mountpoint(dentry);
1862 if (!mp)
1863 mp = new_mountpoint(dentry);
1726 if (IS_ERR(mp)) { 1864 if (IS_ERR(mp)) {
1727 namespace_unlock(); 1865 namespace_unlock();
1728 mutex_unlock(&dentry->d_inode->i_mutex); 1866 mutex_unlock(&dentry->d_inode->i_mutex);
@@ -1896,9 +2034,6 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
1896 if (readonly_request == __mnt_is_readonly(mnt)) 2034 if (readonly_request == __mnt_is_readonly(mnt))
1897 return 0; 2035 return 0;
1898 2036
1899 if (mnt->mnt_flags & MNT_LOCK_READONLY)
1900 return -EPERM;
1901
1902 if (readonly_request) 2037 if (readonly_request)
1903 error = mnt_make_readonly(real_mount(mnt)); 2038 error = mnt_make_readonly(real_mount(mnt));
1904 else 2039 else
@@ -1924,6 +2059,33 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1924 if (path->dentry != path->mnt->mnt_root) 2059 if (path->dentry != path->mnt->mnt_root)
1925 return -EINVAL; 2060 return -EINVAL;
1926 2061
2062 /* Don't allow changing of locked mnt flags.
2063 *
2064 * No locks need to be held here while testing the various
2065 * MNT_LOCK flags because those flags can never be cleared
2066 * once they are set.
2067 */
2068 if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) &&
2069 !(mnt_flags & MNT_READONLY)) {
2070 return -EPERM;
2071 }
2072 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2073 !(mnt_flags & MNT_NODEV)) {
2074 return -EPERM;
2075 }
2076 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2077 !(mnt_flags & MNT_NOSUID)) {
2078 return -EPERM;
2079 }
2080 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOEXEC) &&
2081 !(mnt_flags & MNT_NOEXEC)) {
2082 return -EPERM;
2083 }
2084 if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) &&
2085 ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) {
2086 return -EPERM;
2087 }
2088
1927 err = security_sb_remount(sb, data); 2089 err = security_sb_remount(sb, data);
1928 if (err) 2090 if (err)
1929 return err; 2091 return err;
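Seen from userspace, a remount in a namespace where (say) nodev was locked in must now keep MS_NODEV or it fails with EPERM. A hedged demonstration (whether a given flag is actually locked depends on how the mount entered the namespace, e.g. via an unprivileged user namespace; /mnt is a placeholder target):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* Attempting to drop a locked nodev: expected to fail with EPERM. */
            if (mount(NULL, "/mnt", NULL, MS_REMOUNT, NULL) < 0)
                    perror("remount without MS_NODEV");

            /* Keeping the locked flag makes the same remount permissible. */
            if (mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_NODEV, NULL) < 0)
                    perror("remount keeping MS_NODEV");
            return 0;
    }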
@@ -1937,7 +2099,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1937 err = do_remount_sb(sb, flags, data, 0); 2099 err = do_remount_sb(sb, flags, data, 0);
1938 if (!err) { 2100 if (!err) {
1939 lock_mount_hash(); 2101 lock_mount_hash();
1940 mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK; 2102 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
1941 mnt->mnt.mnt_flags = mnt_flags; 2103 mnt->mnt.mnt_flags = mnt_flags;
1942 touch_mnt_namespace(mnt->mnt_ns); 2104 touch_mnt_namespace(mnt->mnt_ns);
1943 unlock_mount_hash(); 2105 unlock_mount_hash();
@@ -2122,7 +2284,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
2122 */ 2284 */
2123 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) { 2285 if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2124 flags |= MS_NODEV; 2286 flags |= MS_NODEV;
2125 mnt_flags |= MNT_NODEV; 2287 mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
2126 } 2288 }
2127 } 2289 }
2128 2290
@@ -2354,21 +2516,9 @@ int copy_mount_options(const void __user * data, unsigned long *where)
2354 return 0; 2516 return 0;
2355} 2517}
2356 2518
2357int copy_mount_string(const void __user *data, char **where) 2519char *copy_mount_string(const void __user *data)
2358{ 2520{
2359 char *tmp; 2521 return data ? strndup_user(data, PAGE_SIZE) : NULL;
2360
2361 if (!data) {
2362 *where = NULL;
2363 return 0;
2364 }
2365
2366 tmp = strndup_user(data, PAGE_SIZE);
2367 if (IS_ERR(tmp))
2368 return PTR_ERR(tmp);
2369
2370 *where = tmp;
2371 return 0;
2372} 2522}
2373 2523
2374/* 2524/*
@@ -2385,7 +2535,7 @@ int copy_mount_string(const void __user *data, char **where)
2385 * Therefore, if this magic number is present, it carries no information 2535 * Therefore, if this magic number is present, it carries no information
2386 * and must be discarded. 2536 * and must be discarded.
2387 */ 2537 */
2388long do_mount(const char *dev_name, const char *dir_name, 2538long do_mount(const char *dev_name, const char __user *dir_name,
2389 const char *type_page, unsigned long flags, void *data_page) 2539 const char *type_page, unsigned long flags, void *data_page)
2390{ 2540{
2391 struct path path; 2541 struct path path;
@@ -2397,15 +2547,11 @@ long do_mount(const char *dev_name, const char *dir_name,
2397 flags &= ~MS_MGC_MSK; 2547 flags &= ~MS_MGC_MSK;
2398 2548
2399 /* Basic sanity checks */ 2549 /* Basic sanity checks */
2400
2401 if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE))
2402 return -EINVAL;
2403
2404 if (data_page) 2550 if (data_page)
2405 ((char *)data_page)[PAGE_SIZE - 1] = 0; 2551 ((char *)data_page)[PAGE_SIZE - 1] = 0;
2406 2552
2407 /* ... and get the mountpoint */ 2553 /* ... and get the mountpoint */
2408 retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); 2554 retval = user_path(dir_name, &path);
2409 if (retval) 2555 if (retval)
2410 return retval; 2556 return retval;
2411 2557
@@ -2436,6 +2582,14 @@ long do_mount(const char *dev_name, const char *dir_name,
2436 if (flags & MS_RDONLY) 2582 if (flags & MS_RDONLY)
2437 mnt_flags |= MNT_READONLY; 2583 mnt_flags |= MNT_READONLY;
2438 2584
2585 /* The default atime for remount is preservation */
2586 if ((flags & MS_REMOUNT) &&
2587 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
2588 MS_STRICTATIME)) == 0)) {
2589 mnt_flags &= ~MNT_ATIME_MASK;
2590 mnt_flags |= path.mnt->mnt_flags & MNT_ATIME_MASK;
2591 }
2592
2439 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN | 2593 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_BORN |
2440 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT | 2594 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT |
2441 MS_STRICTATIME); 2595 MS_STRICTATIME);
@@ -2622,37 +2776,30 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
2622{ 2776{
2623 int ret; 2777 int ret;
2624 char *kernel_type; 2778 char *kernel_type;
2625 struct filename *kernel_dir;
2626 char *kernel_dev; 2779 char *kernel_dev;
2627 unsigned long data_page; 2780 unsigned long data_page;
2628 2781
2629 ret = copy_mount_string(type, &kernel_type); 2782 kernel_type = copy_mount_string(type);
2630 if (ret < 0) 2783 ret = PTR_ERR(kernel_type);
2784 if (IS_ERR(kernel_type))
2631 goto out_type; 2785 goto out_type;
2632 2786
2633 kernel_dir = getname(dir_name); 2787 kernel_dev = copy_mount_string(dev_name);
2634 if (IS_ERR(kernel_dir)) { 2788 ret = PTR_ERR(kernel_dev);
2635 ret = PTR_ERR(kernel_dir); 2789 if (IS_ERR(kernel_dev))
2636 goto out_dir;
2637 }
2638
2639 ret = copy_mount_string(dev_name, &kernel_dev);
2640 if (ret < 0)
2641 goto out_dev; 2790 goto out_dev;
2642 2791
2643 ret = copy_mount_options(data, &data_page); 2792 ret = copy_mount_options(data, &data_page);
2644 if (ret < 0) 2793 if (ret < 0)
2645 goto out_data; 2794 goto out_data;
2646 2795
2647 ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, 2796 ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
2648 (void *) data_page); 2797 (void *) data_page);
2649 2798
2650 free_page(data_page); 2799 free_page(data_page);
2651out_data: 2800out_data:
2652 kfree(kernel_dev); 2801 kfree(kernel_dev);
2653out_dev: 2802out_dev:
2654 putname(kernel_dir);
2655out_dir:
2656 kfree(kernel_type); 2803 kfree(kernel_type);
2657out_type: 2804out_type:
2658 return ret; 2805 return ret;
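The caller-side churn above is the payoff of the new copy_mount_string() contract: failure is encoded in the returned pointer, so IS_ERR()/PTR_ERR() replace the separate status integer and out-parameter. The ERR_PTR idiom, restated in userspace for illustration (the kernel's versions live in include/linux/err.h; toy_copy_string is invented):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095
    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
            /* top MAX_ERRNO addresses are never valid pointers */
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    /* Toy copy_mount_string(): NULL input is a valid "no string" result,
     * failures come back folded into the pointer itself. */
    static char *toy_copy_string(const char *user)
    {
            if (!user)
                    return NULL;
            return ERR_PTR(-ENOMEM);        /* pretend the allocation failed */
    }

    int main(void)
    {
            char *s = toy_copy_string("ext4");
            if (IS_ERR(s))
                    printf("failed: %ld\n", PTR_ERR(s));
            return 0;
    }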
@@ -2768,6 +2915,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2768 /* make sure we can reach put_old from new_root */ 2915 /* make sure we can reach put_old from new_root */
2769 if (!is_path_reachable(old_mnt, old.dentry, &new)) 2916 if (!is_path_reachable(old_mnt, old.dentry, &new))
2770 goto out4; 2917 goto out4;
2918 /* make certain new is below the root */
2919 if (!is_path_reachable(new_mnt, new.dentry, &root))
2920 goto out4;
2771 root_mp->m_count++; /* pin it so it won't go away */ 2921 root_mp->m_count++; /* pin it so it won't go away */
2772 lock_mount_hash(); 2922 lock_mount_hash();
2773 detach_mnt(new_mnt, &parent_path); 2923 detach_mnt(new_mnt, &parent_path);
@@ -2972,13 +3122,13 @@ static void *mntns_get(struct task_struct *task)
2972 struct mnt_namespace *ns = NULL; 3122 struct mnt_namespace *ns = NULL;
2973 struct nsproxy *nsproxy; 3123 struct nsproxy *nsproxy;
2974 3124
2975 rcu_read_lock(); 3125 task_lock(task);
2976 nsproxy = task_nsproxy(task); 3126 nsproxy = task->nsproxy;
2977 if (nsproxy) { 3127 if (nsproxy) {
2978 ns = nsproxy->mnt_ns; 3128 ns = nsproxy->mnt_ns;
2979 get_mnt_ns(ns); 3129 get_mnt_ns(ns);
2980 } 3130 }
2981 rcu_read_unlock(); 3131 task_unlock(task);
2982 3132
2983 return ns; 3133 return ns;
2984} 3134}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 08b8ea8c353e..7cb751dfbeef 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -388,7 +388,6 @@ static struct dentry *
388ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) 388ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
389{ 389{
390 struct dentry *dent = dentry; 390 struct dentry *dent = dentry;
391 struct list_head *next;
392 391
393 if (d_validate(dent, parent)) { 392 if (d_validate(dent, parent)) {
394 if (dent->d_name.len <= NCP_MAXPATHLEN && 393 if (dent->d_name.len <= NCP_MAXPATHLEN &&
@@ -404,9 +403,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
404 403
405 /* If a pointer is invalid, we search the dentry. */ 404 /* If a pointer is invalid, we search the dentry. */
406 spin_lock(&parent->d_lock); 405 spin_lock(&parent->d_lock);
407 next = parent->d_subdirs.next; 406 list_for_each_entry(dent, &parent->d_subdirs, d_u.d_child) {
408 while (next != &parent->d_subdirs) {
409 dent = list_entry(next, struct dentry, d_u.d_child);
410 if ((unsigned long)dent->d_fsdata == fpos) { 407 if ((unsigned long)dent->d_fsdata == fpos) {
411 if (dent->d_inode) 408 if (dent->d_inode)
412 dget(dent); 409 dget(dent);
@@ -415,7 +412,6 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
415 spin_unlock(&parent->d_lock); 412 spin_unlock(&parent->d_lock);
416 goto out; 413 goto out;
417 } 414 }
418 next = next->next;
419 } 415 }
420 spin_unlock(&parent->d_lock); 416 spin_unlock(&parent->d_lock);
421 return NULL; 417 return NULL;
@@ -1182,9 +1178,6 @@ static int day_n[] =
1182{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; 1178{0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0};
1183/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ 1179/* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */
1184 1180
1185
1186extern struct timezone sys_tz;
1187
1188static int utc2local(int time) 1181static int utc2local(int time)
1189{ 1182{
1190 return time - sys_tz.tz_minuteswest * 60; 1183 return time - sys_tz.tz_minuteswest * 60;
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index 32c06587351a..52cb19d66ecb 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -188,20 +188,14 @@ static inline void
188ncp_renew_dentries(struct dentry *parent) 188ncp_renew_dentries(struct dentry *parent)
189{ 189{
190 struct ncp_server *server = NCP_SERVER(parent->d_inode); 190 struct ncp_server *server = NCP_SERVER(parent->d_inode);
191 struct list_head *next;
192 struct dentry *dentry; 191 struct dentry *dentry;
193 192
194 spin_lock(&parent->d_lock); 193 spin_lock(&parent->d_lock);
195 next = parent->d_subdirs.next; 194 list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) {
196 while (next != &parent->d_subdirs) {
197 dentry = list_entry(next, struct dentry, d_u.d_child);
198
199 if (dentry->d_fsdata == NULL) 195 if (dentry->d_fsdata == NULL)
200 ncp_age_dentry(server, dentry); 196 ncp_age_dentry(server, dentry);
201 else 197 else
202 ncp_new_dentry(dentry); 198 ncp_new_dentry(dentry);
203
204 next = next->next;
205 } 199 }
206 spin_unlock(&parent->d_lock); 200 spin_unlock(&parent->d_lock);
207} 201}
@@ -210,16 +204,12 @@ static inline void
210ncp_invalidate_dircache_entries(struct dentry *parent) 204ncp_invalidate_dircache_entries(struct dentry *parent)
211{ 205{
212 struct ncp_server *server = NCP_SERVER(parent->d_inode); 206 struct ncp_server *server = NCP_SERVER(parent->d_inode);
213 struct list_head *next;
214 struct dentry *dentry; 207 struct dentry *dentry;
215 208
216 spin_lock(&parent->d_lock); 209 spin_lock(&parent->d_lock);
217 next = parent->d_subdirs.next; 210 list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) {
218 while (next != &parent->d_subdirs) {
219 dentry = list_entry(next, struct dentry, d_u.d_child);
220 dentry->d_fsdata = NULL; 211 dentry->d_fsdata = NULL;
221 ncp_age_dentry(server, dentry); 212 ncp_age_dentry(server, dentry);
222 next = next->next;
223 } 213 }
224 spin_unlock(&parent->d_lock); 214 spin_unlock(&parent->d_lock);
225} 215}
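Both ncplib_kernel.h helpers get the same treatment as ncp_dget_fpos() above: the next cursor variable disappears and list_for_each_entry() walks parent->d_subdirs directly. None of these loops unlink entries while iterating, so the plain iterator suffices and the _safe variant is not needed.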
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 4782e0840dcc..04cb830fa09f 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -28,6 +28,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
31 32
32obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ 33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
33obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
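With NFSv4.2 support the nfsv4 module now conditionally pulls in nfs42proc.o, following the same nfsv4-$(CONFIG_...) pattern already used for the legacy-DNS, sysctl and v4.1 objects above it.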
diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
index d5815505c020..3ca14c36d08b 100644
--- a/fs/nfs/blocklayout/Makefile
+++ b/fs/nfs/blocklayout/Makefile
@@ -2,4 +2,5 @@
2# Makefile for the pNFS block layout driver kernel module 2# Makefile for the pNFS block layout driver kernel module
3# 3#
4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o 4obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
5blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o 5
6blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
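The blocklayout module's object list switches from the -objs form to the composite -y form and swaps the old extents.o/blocklayoutdev.o/blocklayoutdm.o trio for the new dev.o, extent_tree.o and rpc_pipefs.o files, matching the rewrite of the extent and device handling in the driver below.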
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9b431f44fad9..5228f201d3d5 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -35,7 +35,6 @@
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/namei.h> 36#include <linux/namei.h>
37#include <linux/bio.h> /* struct bio */ 37#include <linux/bio.h> /* struct bio */
38#include <linux/buffer_head.h> /* various write calls */
39#include <linux/prefetch.h> 38#include <linux/prefetch.h>
40#include <linux/pagevec.h> 39#include <linux/pagevec.h>
41 40
@@ -50,40 +49,16 @@ MODULE_LICENSE("GPL");
50MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); 49MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
51MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); 50MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");
52 51
53static void print_page(struct page *page) 52static bool is_hole(struct pnfs_block_extent *be)
54{ 53{
55 dprintk("PRINTPAGE page %p\n", page); 54 switch (be->be_state) {
56 dprintk(" PagePrivate %d\n", PagePrivate(page)); 55 case PNFS_BLOCK_NONE_DATA:
57 dprintk(" PageUptodate %d\n", PageUptodate(page)); 56 return true;
58 dprintk(" PageError %d\n", PageError(page)); 57 case PNFS_BLOCK_INVALID_DATA:
59 dprintk(" PageDirty %d\n", PageDirty(page)); 58 return be->be_tag ? false : true;
60 dprintk(" PageReferenced %d\n", PageReferenced(page)); 59 default:
61 dprintk(" PageLocked %d\n", PageLocked(page)); 60 return false;
62 dprintk(" PageWriteback %d\n", PageWriteback(page)); 61 }
63 dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
64 dprintk("\n");
65}
66
67/* Given the be associated with isect, determine if page data needs to be
68 * initialized.
69 */
70static int is_hole(struct pnfs_block_extent *be, sector_t isect)
71{
72 if (be->be_state == PNFS_BLOCK_NONE_DATA)
73 return 1;
74 else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
75 return 0;
76 else
77 return !bl_is_sector_init(be->be_inval, isect);
78}
79
80/* Given the be associated with isect, determine if page data can be
81 * written to disk.
82 */
83static int is_writable(struct pnfs_block_extent *be, sector_t isect)
84{
85 return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
86 be->be_state == PNFS_BLOCK_INVALID_DATA);
87} 62}
88 63
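The rewritten is_hole() no longer needs a sector argument: PNFS_BLOCK_NONE_DATA always reads as a hole, and an INVALID_DATA extent reads as a hole only until its be_tag is set (the tag presumably records, in the new extent-tree code, that the range has since been initialized by a write). This replaces the old per-sector bl_is_sector_init() bitmap lookup.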
89/* The data we are handed might be spread across several bios. We need 64/* The data we are handed might be spread across several bios. We need
@@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
91 */ 66 */
92struct parallel_io { 67struct parallel_io {
93 struct kref refcnt; 68 struct kref refcnt;
94 void (*pnfs_callback) (void *data, int num_se); 69 void (*pnfs_callback) (void *data);
95 void *data; 70 void *data;
96 int bse_count;
97}; 71};
98 72
99static inline struct parallel_io *alloc_parallel(void *data) 73static inline struct parallel_io *alloc_parallel(void *data)
@@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data)
104 if (rv) { 78 if (rv) {
105 rv->data = data; 79 rv->data = data;
106 kref_init(&rv->refcnt); 80 kref_init(&rv->refcnt);
107 rv->bse_count = 0;
108 } 81 }
109 return rv; 82 return rv;
110} 83}
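A parallel_io carries one kref per outstanding bio plus one for the submitting thread, and its completion callback has lost the num_se argument along with the short-extent counting. A sketch of the companion helpers, assuming the usual kref idiom (the driver's own definitions should be equivalent):

    static void get_parallel(struct parallel_io *p)
    {
            kref_get(&p->refcnt);
    }

    static void put_parallel(struct parallel_io *p)
    {
            /* destroy_parallel() runs pnfs_callback and frees p once
             * the last reference is gone */
            kref_put(&p->refcnt, destroy_parallel);
    }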
@@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref)
119 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); 92 struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
120 93
121 dprintk("%s enter\n", __func__); 94 dprintk("%s enter\n", __func__);
122 p->pnfs_callback(p->data, p->bse_count); 95 p->pnfs_callback(p->data);
123 kfree(p); 96 kfree(p);
124} 97}
125 98
@@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio)
141 return NULL; 114 return NULL;
142} 115}
143 116
144static struct bio *bl_alloc_init_bio(int npg, sector_t isect, 117static struct bio *
145 struct pnfs_block_extent *be, 118bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
146 void (*end_io)(struct bio *, int err), 119 void (*end_io)(struct bio *, int err), struct parallel_io *par)
147 struct parallel_io *par)
148{ 120{
149 struct bio *bio; 121 struct bio *bio;
150 122
@@ -156,67 +128,73 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
156 } 128 }
157 129
158 if (bio) { 130 if (bio) {
159 bio->bi_iter.bi_sector = isect - be->be_f_offset + 131 bio->bi_iter.bi_sector = disk_sector;
160 be->be_v_offset; 132 bio->bi_bdev = bdev;
161 bio->bi_bdev = be->be_mdev;
162 bio->bi_end_io = end_io; 133 bio->bi_end_io = end_io;
163 bio->bi_private = par; 134 bio->bi_private = par;
164 } 135 }
165 return bio; 136 return bio;
166} 137}
167 138
168static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, 139static struct bio *
169 sector_t isect, struct page *page, 140do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
170 struct pnfs_block_extent *be, 141 struct page *page, struct pnfs_block_dev_map *map,
171 void (*end_io)(struct bio *, int err), 142 struct pnfs_block_extent *be,
172 struct parallel_io *par, 143 void (*end_io)(struct bio *, int err),
173 unsigned int offset, int len) 144 struct parallel_io *par, unsigned int offset, int *len)
174{ 145{
175 isect = isect + (offset >> SECTOR_SHIFT); 146 struct pnfs_block_dev *dev =
147 container_of(be->be_device, struct pnfs_block_dev, node);
148 u64 disk_addr, end;
149
176 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, 150 dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
177 npg, rw, (unsigned long long)isect, offset, len); 151 npg, rw, (unsigned long long)isect, offset, *len);
152
153 /* translate to device offset */
154 isect += be->be_v_offset;
155 isect -= be->be_f_offset;
156
157 /* translate to physical disk offset */
158 disk_addr = (u64)isect << SECTOR_SHIFT;
159 if (disk_addr < map->start || disk_addr >= map->start + map->len) {
160 if (!dev->map(dev, disk_addr, map))
161 return ERR_PTR(-EIO);
162 bio = bl_submit_bio(rw, bio);
163 }
164 disk_addr += map->disk_offset;
165 disk_addr -= map->start;
166
167 /* limit length to what the device mapping allows */
168 end = disk_addr + *len;
169 if (end >= map->start + map->len)
170 *len = map->start + map->len - disk_addr;
171
178retry: 172retry:
179 if (!bio) { 173 if (!bio) {
180 bio = bl_alloc_init_bio(npg, isect, be, end_io, par); 174 bio = bl_alloc_init_bio(npg, map->bdev,
175 disk_addr >> SECTOR_SHIFT, end_io, par);
181 if (!bio) 176 if (!bio)
182 return ERR_PTR(-ENOMEM); 177 return ERR_PTR(-ENOMEM);
183 } 178 }
184 if (bio_add_page(bio, page, len, offset) < len) { 179 if (bio_add_page(bio, page, *len, offset) < *len) {
185 bio = bl_submit_bio(rw, bio); 180 bio = bl_submit_bio(rw, bio);
186 goto retry; 181 goto retry;
187 } 182 }
188 return bio; 183 return bio;
189} 184}
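The interesting part of the new do_add_page_to_bio() is the two-step translation: file sector to volume sector through the extent (be_f_offset/be_v_offset), then volume byte offset to a physical device offset through the cached pnfs_block_dev_map window. A worked example with made-up numbers:

    /* A request at file sector isect = 4096 hitting an extent with
     * be_f_offset = 4000 and be_v_offset = 10000:
     *
     *      isect     = 4096 + 10000 - 4000 = 10096   (volume sector)
     *      disk_addr = 10096 << SECTOR_SHIFT         (volume bytes)
     *
     * If disk_addr falls outside [map->start, map->start + map->len),
     * dev->map() refreshes the window, and any bio built against the
     * old window is submitted first since it may target a different
     * block device. Finally *len is clipped so the I/O never crosses
     * the end of the window.
     */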
190 185
191static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
192 sector_t isect, struct page *page,
193 struct pnfs_block_extent *be,
194 void (*end_io)(struct bio *, int err),
195 struct parallel_io *par)
196{
197 return do_add_page_to_bio(bio, npg, rw, isect, page, be,
198 end_io, par, 0, PAGE_CACHE_SIZE);
199}
200
201/* This is basically copied from mpage_end_io_read */
202static void bl_end_io_read(struct bio *bio, int err) 186static void bl_end_io_read(struct bio *bio, int err)
203{ 187{
204 struct parallel_io *par = bio->bi_private; 188 struct parallel_io *par = bio->bi_private;
205 struct bio_vec *bvec;
206 int i;
207
208 if (!err)
209 bio_for_each_segment_all(bvec, bio, i)
210 SetPageUptodate(bvec->bv_page);
211 189
212 if (err) { 190 if (err) {
213 struct nfs_pgio_data *rdata = par->data; 191 struct nfs_pgio_header *header = par->data;
214 struct nfs_pgio_header *header = rdata->header;
215 192
216 if (!header->pnfs_error) 193 if (!header->pnfs_error)
217 header->pnfs_error = -EIO; 194 header->pnfs_error = -EIO;
218 pnfs_set_lo_fail(header->lseg); 195 pnfs_set_lo_fail(header->lseg);
219 } 196 }
197
220 bio_put(bio); 198 bio_put(bio);
221 put_parallel(par); 199 put_parallel(par);
222} 200}
@@ -224,104 +202,96 @@ static void bl_end_io_read(struct bio *bio, int err)
224static void bl_read_cleanup(struct work_struct *work) 202static void bl_read_cleanup(struct work_struct *work)
225{ 203{
226 struct rpc_task *task; 204 struct rpc_task *task;
227 struct nfs_pgio_data *rdata; 205 struct nfs_pgio_header *hdr;
228 dprintk("%s enter\n", __func__); 206 dprintk("%s enter\n", __func__);
229 task = container_of(work, struct rpc_task, u.tk_work); 207 task = container_of(work, struct rpc_task, u.tk_work);
230 rdata = container_of(task, struct nfs_pgio_data, task); 208 hdr = container_of(task, struct nfs_pgio_header, task);
231 pnfs_ld_read_done(rdata); 209 pnfs_ld_read_done(hdr);
232} 210}
233 211
234static void 212static void
235bl_end_par_io_read(void *data, int unused) 213bl_end_par_io_read(void *data)
236{ 214{
237 struct nfs_pgio_data *rdata = data; 215 struct nfs_pgio_header *hdr = data;
238 216
239 rdata->task.tk_status = rdata->header->pnfs_error; 217 hdr->task.tk_status = hdr->pnfs_error;
240 INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup); 218 INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
241 schedule_work(&rdata->task.u.tk_work); 219 schedule_work(&hdr->task.u.tk_work);
242} 220}
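The completion path keeps its old shape for a reason: the bio end_io handler may run in interrupt context, so bl_end_par_io_read() only records the status and bounces the final pnfs_ld_read_done() call into process context through the rpc_task's work item.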
243 221
244static enum pnfs_try_status 222static enum pnfs_try_status
245bl_read_pagelist(struct nfs_pgio_data *rdata) 223bl_read_pagelist(struct nfs_pgio_header *header)
246{ 224{
247 struct nfs_pgio_header *header = rdata->header; 225 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
248 int i, hole; 226 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
249 struct bio *bio = NULL; 227 struct bio *bio = NULL;
250 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 228 struct pnfs_block_extent be;
251 sector_t isect, extent_length = 0; 229 sector_t isect, extent_length = 0;
252 struct parallel_io *par; 230 struct parallel_io *par;
253 loff_t f_offset = rdata->args.offset; 231 loff_t f_offset = header->args.offset;
254 size_t bytes_left = rdata->args.count; 232 size_t bytes_left = header->args.count;
255 unsigned int pg_offset, pg_len; 233 unsigned int pg_offset, pg_len;
256 struct page **pages = rdata->args.pages; 234 struct page **pages = header->args.pages;
257 int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT; 235 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
258 const bool is_dio = (header->dreq != NULL); 236 const bool is_dio = (header->dreq != NULL);
237 struct blk_plug plug;
238 int i;
259 239
260 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, 240 dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
261 rdata->pages.npages, f_offset, (unsigned int)rdata->args.count); 241 header->page_array.npages, f_offset,
242 (unsigned int)header->args.count);
262 243
263 par = alloc_parallel(rdata); 244 par = alloc_parallel(header);
264 if (!par) 245 if (!par)
265 goto use_mds; 246 return PNFS_NOT_ATTEMPTED;
266 par->pnfs_callback = bl_end_par_io_read; 247 par->pnfs_callback = bl_end_par_io_read;
267 /* At this point, we can no longer jump to use_mds */ 248
249 blk_start_plug(&plug);
268 250
269 isect = (sector_t) (f_offset >> SECTOR_SHIFT); 251 isect = (sector_t) (f_offset >> SECTOR_SHIFT);
270 /* Code assumes extents are page-aligned */ 252 /* Code assumes extents are page-aligned */
271 for (i = pg_index; i < rdata->pages.npages; i++) { 253 for (i = pg_index; i < header->page_array.npages; i++) {
272 if (!extent_length) { 254 if (extent_length <= 0) {
273 /* We've used up the previous extent */ 255 /* We've used up the previous extent */
274 bl_put_extent(be);
275 bl_put_extent(cow_read);
276 bio = bl_submit_bio(READ, bio); 256 bio = bl_submit_bio(READ, bio);
257
277 /* Get the next one */ 258 /* Get the next one */
278 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 259 if (!ext_tree_lookup(bl, isect, &be, false)) {
279 isect, &cow_read);
280 if (!be) {
281 header->pnfs_error = -EIO; 260 header->pnfs_error = -EIO;
282 goto out; 261 goto out;
283 } 262 }
284 extent_length = be->be_length - 263 extent_length = be.be_length - (isect - be.be_f_offset);
285 (isect - be->be_f_offset);
286 if (cow_read) {
287 sector_t cow_length = cow_read->be_length -
288 (isect - cow_read->be_f_offset);
289 extent_length = min(extent_length, cow_length);
290 }
291 } 264 }
292 265
266 pg_offset = f_offset & ~PAGE_CACHE_MASK;
293 if (is_dio) { 267 if (is_dio) {
294 pg_offset = f_offset & ~PAGE_CACHE_MASK;
295 if (pg_offset + bytes_left > PAGE_CACHE_SIZE) 268 if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
296 pg_len = PAGE_CACHE_SIZE - pg_offset; 269 pg_len = PAGE_CACHE_SIZE - pg_offset;
297 else 270 else
298 pg_len = bytes_left; 271 pg_len = bytes_left;
299
300 f_offset += pg_len;
301 bytes_left -= pg_len;
302 isect += (pg_offset >> SECTOR_SHIFT);
303 } else { 272 } else {
304 pg_offset = 0; 273 BUG_ON(pg_offset != 0);
305 pg_len = PAGE_CACHE_SIZE; 274 pg_len = PAGE_CACHE_SIZE;
306 } 275 }
307 276
308 hole = is_hole(be, isect); 277 isect += (pg_offset >> SECTOR_SHIFT);
309 if (hole && !cow_read) { 278 extent_length -= (pg_offset >> SECTOR_SHIFT);
279
280 if (is_hole(&be)) {
310 bio = bl_submit_bio(READ, bio); 281 bio = bl_submit_bio(READ, bio);
311 /* Fill hole w/ zeroes w/o accessing device */ 282 /* Fill hole w/ zeroes w/o accessing device */
312 dprintk("%s Zeroing page for hole\n", __func__); 283 dprintk("%s Zeroing page for hole\n", __func__);
313 zero_user_segment(pages[i], pg_offset, pg_len); 284 zero_user_segment(pages[i], pg_offset, pg_len);
314 print_page(pages[i]);
315 SetPageUptodate(pages[i]);
316 } else {
317 struct pnfs_block_extent *be_read;
318 285
319 be_read = (hole && cow_read) ? cow_read : be; 286 /* invalidate map */
320 bio = do_add_page_to_bio(bio, rdata->pages.npages - i, 287 map.start = NFS4_MAX_UINT64;
288 } else {
289 bio = do_add_page_to_bio(bio,
290 header->page_array.npages - i,
321 READ, 291 READ,
322 isect, pages[i], be_read, 292 isect, pages[i], &map, &be,
323 bl_end_io_read, par, 293 bl_end_io_read, par,
324 pg_offset, pg_len); 294 pg_offset, &pg_len);
325 if (IS_ERR(bio)) { 295 if (IS_ERR(bio)) {
326 header->pnfs_error = PTR_ERR(bio); 296 header->pnfs_error = PTR_ERR(bio);
327 bio = NULL; 297 bio = NULL;
@@ -329,84 +299,28 @@ bl_read_pagelist(struct nfs_pgio_data *rdata)
329 } 299 }
330 } 300 }
331 isect += (pg_len >> SECTOR_SHIFT); 301 isect += (pg_len >> SECTOR_SHIFT);
332 extent_length -= PAGE_CACHE_SECTORS; 302 extent_length -= (pg_len >> SECTOR_SHIFT);
303 f_offset += pg_len;
304 bytes_left -= pg_len;
333 } 305 }
334 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { 306 if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
335 rdata->res.eof = 1; 307 header->res.eof = 1;
336 rdata->res.count = header->inode->i_size - rdata->args.offset; 308 header->res.count = header->inode->i_size - header->args.offset;
337 } else { 309 } else {
338 rdata->res.count = (isect << SECTOR_SHIFT) - rdata->args.offset; 310 header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
339 } 311 }
340out: 312out:
341 bl_put_extent(be);
342 bl_put_extent(cow_read);
343 bl_submit_bio(READ, bio); 313 bl_submit_bio(READ, bio);
314 blk_finish_plug(&plug);
344 put_parallel(par); 315 put_parallel(par);
345 return PNFS_ATTEMPTED; 316 return PNFS_ATTEMPTED;
346
347 use_mds:
348 dprintk("Giving up and using normal NFS\n");
349 return PNFS_NOT_ATTEMPTED;
350}
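The read path now brackets bio submission with blk_start_plug()/blk_finish_plug(), letting the block layer batch the per-page bios before they hit the request queue. The use_mds fallback label is gone because the only failure before submission, alloc_parallel(), can return PNFS_NOT_ATTEMPTED directly.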
351
352static void mark_extents_written(struct pnfs_block_layout *bl,
353 __u64 offset, __u32 count)
354{
355 sector_t isect, end;
356 struct pnfs_block_extent *be;
357 struct pnfs_block_short_extent *se;
358
359 dprintk("%s(%llu, %u)\n", __func__, offset, count);
360 if (count == 0)
361 return;
362 isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
363 end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
364 end >>= SECTOR_SHIFT;
365 while (isect < end) {
366 sector_t len;
367 be = bl_find_get_extent(bl, isect, NULL);
368 BUG_ON(!be); /* FIXME */
369 len = min(end, be->be_f_offset + be->be_length) - isect;
370 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
371 se = bl_pop_one_short_extent(be->be_inval);
372 BUG_ON(!se);
373 bl_mark_for_commit(be, isect, len, se);
374 }
375 isect += len;
376 bl_put_extent(be);
377 }
378}
379
380static void bl_end_io_write_zero(struct bio *bio, int err)
381{
382 struct parallel_io *par = bio->bi_private;
383 struct bio_vec *bvec;
384 int i;
385
386 bio_for_each_segment_all(bvec, bio, i) {
387 /* This is the zeroing page we added */
388 end_page_writeback(bvec->bv_page);
389 page_cache_release(bvec->bv_page);
390 }
391
392 if (unlikely(err)) {
393 struct nfs_pgio_data *data = par->data;
394 struct nfs_pgio_header *header = data->header;
395
396 if (!header->pnfs_error)
397 header->pnfs_error = -EIO;
398 pnfs_set_lo_fail(header->lseg);
399 }
400 bio_put(bio);
401 put_parallel(par);
402} 317}
403 318
404static void bl_end_io_write(struct bio *bio, int err) 319static void bl_end_io_write(struct bio *bio, int err)
405{ 320{
406 struct parallel_io *par = bio->bi_private; 321 struct parallel_io *par = bio->bi_private;
407 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 322 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
408 struct nfs_pgio_data *data = par->data; 323 struct nfs_pgio_header *header = par->data;
409 struct nfs_pgio_header *header = data->header;
410 324
411 if (!uptodate) { 325 if (!uptodate) {
412 if (!header->pnfs_error) 326 if (!header->pnfs_error)
@@ -422,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err)
422 */ 336 */
423static void bl_write_cleanup(struct work_struct *work) 337static void bl_write_cleanup(struct work_struct *work)
424{ 338{
425 struct rpc_task *task; 339 struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
426 struct nfs_pgio_data *wdata; 340 struct nfs_pgio_header *hdr =
427 dprintk("%s enter\n", __func__); 341 container_of(task, struct nfs_pgio_header, task);
428 task = container_of(work, struct rpc_task, u.tk_work);
429 wdata = container_of(task, struct nfs_pgio_data, task);
430 if (likely(!wdata->header->pnfs_error)) {
431 /* Marks for LAYOUTCOMMIT */
432 mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
433 wdata->args.offset, wdata->args.count);
434 }
435 pnfs_ld_write_done(wdata);
436}
437
438/* Called when last of bios associated with a bl_write_pagelist call finishes */
439static void bl_end_par_io_write(void *data, int num_se)
440{
441 struct nfs_pgio_data *wdata = data;
442
443 if (unlikely(wdata->header->pnfs_error)) {
444 bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
445 num_se);
446 }
447
448 wdata->task.tk_status = wdata->header->pnfs_error;
449 wdata->verf.committed = NFS_FILE_SYNC;
450 INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
451 schedule_work(&wdata->task.u.tk_work);
452}
453
454/* FIXME STUB - mark intersection of layout and page as bad, so is not
455 * used again.
456 */
457static void mark_bad_read(void)
458{
459 return;
460}
461
462/*
463 * map_block: map a requested I/0 block (isect) into an offset in the LVM
464 * block_device
465 */
466static void
467map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
468{
469 dprintk("%s enter be=%p\n", __func__, be);
470
471 set_buffer_mapped(bh);
472 bh->b_bdev = be->be_mdev;
473 bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
474 (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);
475
476 dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
477 __func__, (unsigned long long)isect, (long)bh->b_blocknr,
478 bh->b_size);
479 return;
480}
481
482static void
483bl_read_single_end_io(struct bio *bio, int error)
484{
485 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
486 struct page *page = bvec->bv_page;
487
488 /* Only one page in bvec */
489 unlock_page(page);
490}
491
492static int
493bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be,
494 unsigned int offset, unsigned int len)
495{
496 struct bio *bio;
497 struct page *shadow_page;
498 sector_t isect;
499 char *kaddr, *kshadow_addr;
500 int ret = 0;
501 342
502 dprintk("%s: offset %u len %u\n", __func__, offset, len); 343 dprintk("%s enter\n", __func__);
503
504 shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
505 if (shadow_page == NULL)
506 return -ENOMEM;
507
508 bio = bio_alloc(GFP_NOIO, 1);
509 if (bio == NULL)
510 return -ENOMEM;
511
512 isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) +
513 (offset / SECTOR_SIZE);
514
515 bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset;
516 bio->bi_bdev = be->be_mdev;
517 bio->bi_end_io = bl_read_single_end_io;
518
519 lock_page(shadow_page);
520 if (bio_add_page(bio, shadow_page,
521 SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) {
522 unlock_page(shadow_page);
523 bio_put(bio);
524 return -EIO;
525 }
526
527 submit_bio(READ, bio);
528 wait_on_page_locked(shadow_page);
529 if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) {
530 ret = -EIO;
531 } else {
532 kaddr = kmap_atomic(page);
533 kshadow_addr = kmap_atomic(shadow_page);
534 memcpy(kaddr + offset, kshadow_addr + offset, len);
535 kunmap_atomic(kshadow_addr);
536 kunmap_atomic(kaddr);
537 }
538 __free_page(shadow_page);
539 bio_put(bio);
540
541 return ret;
542}
543
544static int
545bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be,
546 unsigned int dirty_offset, unsigned int dirty_len,
547 bool full_page)
548{
549 int ret = 0;
550 unsigned int start, end;
551 344
552 if (full_page) { 345 if (likely(!hdr->pnfs_error)) {
553 start = 0; 346 struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
554 end = PAGE_CACHE_SIZE; 347 u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK;
555 } else { 348 u64 end = (hdr->args.offset + hdr->args.count +
556 start = round_down(dirty_offset, SECTOR_SIZE); 349 PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK;
557 end = round_up(dirty_offset + dirty_len, SECTOR_SIZE);
558 }
559 350
560 dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); 351 ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
561 if (!be) { 352 (end - start) >> SECTOR_SHIFT);
562 zero_user_segments(page, start, dirty_offset,
563 dirty_offset + dirty_len, end);
564 if (start == 0 && end == PAGE_CACHE_SIZE &&
565 trylock_page(page)) {
566 SetPageUptodate(page);
567 unlock_page(page);
568 }
569 return ret;
570 } 353 }
571 354
572 if (start != dirty_offset) 355 pnfs_ld_write_done(hdr);
573 ret = bl_do_readpage_sync(page, be, start, dirty_offset - start);
574
575 if (!ret && (dirty_offset + dirty_len < end))
576 ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len,
577 end - dirty_offset - dirty_len);
578
579 return ret;
580} 356}
581 357
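bl_write_cleanup() rounds the committed range out to page boundaries before handing it to ext_tree_mark_written(), consistent with the whole-page write policy below. For example, assuming PAGE_CACHE_SIZE is 4096:

    /* hdr->args.offset = 5000, hdr->args.count = 1000:
     *
     *      start = 5000 & PAGE_CACHE_MASK                 = 4096
     *      end   = (5000 + 1000 + 4095) & PAGE_CACHE_MASK = 8192
     *
     * so sectors 8..15 -- exactly one page -- are marked written. */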
582/* Given an unmapped page, zero it or read in page for COW, page is locked 358/* Called when last of bios associated with a bl_write_pagelist call finishes */
583 * by caller. 359static void bl_end_par_io_write(void *data)
584 */
585static int
586init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
587{ 360{
588 struct buffer_head *bh = NULL; 361 struct nfs_pgio_header *hdr = data;
589 int ret = 0;
590 sector_t isect;
591
592 dprintk("%s enter, %p\n", __func__, page);
593 BUG_ON(PageUptodate(page));
594 if (!cow_read) {
595 zero_user_segment(page, 0, PAGE_SIZE);
596 SetPageUptodate(page);
597 goto cleanup;
598 }
599
600 bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
601 if (!bh) {
602 ret = -ENOMEM;
603 goto cleanup;
604 }
605 362
606 isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; 363 hdr->task.tk_status = hdr->pnfs_error;
607 map_block(bh, isect, cow_read); 364 hdr->verf.committed = NFS_FILE_SYNC;
608 if (!bh_uptodate_or_lock(bh)) 365 INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
609 ret = bh_submit_read(bh); 366 schedule_work(&hdr->task.u.tk_work);
610 if (ret)
611 goto cleanup;
612 SetPageUptodate(page);
613
614cleanup:
615 if (bh)
616 free_buffer_head(bh);
617 if (ret) {
618 /* Need to mark layout with bad read...should now
619 * just use nfs4 for reads and writes.
620 */
621 mark_bad_read();
622 }
623 return ret;
624}
625
626/* Find or create a zeroing page marked being writeback.
627 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
628 * to indicate write out.
629 */
630static struct page *
631bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
632 struct pnfs_block_extent *cow_read)
633{
634 struct page *page;
635 int locked = 0;
636 page = find_get_page(inode->i_mapping, index);
637 if (page)
638 goto check_page;
639
640 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
641 if (unlikely(!page)) {
642 dprintk("%s oom\n", __func__);
643 return ERR_PTR(-ENOMEM);
644 }
645 locked = 1;
646
647check_page:
648 /* PageDirty: Other will write this out
649 * PageWriteback: Other is writing this out
650 * PageUptodate: It was read before
651 */
652 if (PageDirty(page) || PageWriteback(page)) {
653 print_page(page);
654 if (locked)
655 unlock_page(page);
656 page_cache_release(page);
657 return NULL;
658 }
659
660 if (!locked) {
661 lock_page(page);
662 locked = 1;
663 goto check_page;
664 }
665 if (!PageUptodate(page)) {
666 /* New page, readin or zero it */
667 init_page_for_write(page, cow_read);
668 }
669 set_page_writeback(page);
670 unlock_page(page);
671
672 return page;
673} 367}
674 368
675static enum pnfs_try_status 369static enum pnfs_try_status
676bl_write_pagelist(struct nfs_pgio_data *wdata, int sync) 370bl_write_pagelist(struct nfs_pgio_header *header, int sync)
677{ 371{
678 struct nfs_pgio_header *header = wdata->header; 372 struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
679 int i, ret, npg_zero, pg_index, last = 0; 373 struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
680 struct bio *bio = NULL; 374 struct bio *bio = NULL;
681 struct pnfs_block_extent *be = NULL, *cow_read = NULL; 375 struct pnfs_block_extent be;
682 sector_t isect, last_isect = 0, extent_length = 0; 376 sector_t isect, extent_length = 0;
683 struct parallel_io *par = NULL; 377 struct parallel_io *par = NULL;
684 loff_t offset = wdata->args.offset; 378 loff_t offset = header->args.offset;
685 size_t count = wdata->args.count; 379 size_t count = header->args.count;
686 unsigned int pg_offset, pg_len, saved_len; 380 struct page **pages = header->args.pages;
687 struct page **pages = wdata->args.pages; 381 int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
688 struct page *page; 382 unsigned int pg_len;
689 pgoff_t index; 383 struct blk_plug plug;
690 u64 temp; 384 int i;
691 int npg_per_block =
692 NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;
693 385
694 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); 386 dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
695 387
696 if (header->dreq != NULL && 388 /* At this point, header->page_array is a (sequential) list of nfs_pages.
697 (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) ||
698 !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) {
699 dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n");
700 goto out_mds;
701 }
702 /* At this point, wdata->pages is a (sequential) list of nfs_pages.
703 * We want to write each, and if there is an error set pnfs_error 389 * We want to write each, and if there is an error set pnfs_error
704 * to have it redone using nfs. 390 * to have it redone using nfs.
705 */ 391 */
706 par = alloc_parallel(wdata); 392 par = alloc_parallel(header);
707 if (!par) 393 if (!par)
708 goto out_mds; 394 return PNFS_NOT_ATTEMPTED;
709 par->pnfs_callback = bl_end_par_io_write; 395 par->pnfs_callback = bl_end_par_io_write;
710 /* At this point, have to be more careful with error handling */
711 396
712 isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); 397 blk_start_plug(&plug);
713 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
714 if (!be || !is_writable(be, isect)) {
715 dprintk("%s no matching extents!\n", __func__);
716 goto out_mds;
717 }
718 398
719 /* First page inside INVALID extent */ 399 /* we always write out the whole page */
720 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 400 offset = offset & (loff_t)PAGE_CACHE_MASK;
721 if (likely(!bl_push_one_short_extent(be->be_inval))) 401 isect = offset >> SECTOR_SHIFT;
722 par->bse_count++;
723 else
724 goto out_mds;
725 temp = offset >> PAGE_CACHE_SHIFT;
726 npg_zero = do_div(temp, npg_per_block);
727 isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
728 (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
729 extent_length = be->be_length - (isect - be->be_f_offset);
730
731fill_invalid_ext:
732 dprintk("%s need to zero %d pages\n", __func__, npg_zero);
733 for (;npg_zero > 0; npg_zero--) {
734 if (bl_is_sector_init(be->be_inval, isect)) {
735 dprintk("isect %llu already init\n",
736 (unsigned long long)isect);
737 goto next_page;
738 }
739 /* page ref released in bl_end_io_write_zero */
740 index = isect >> PAGE_CACHE_SECTOR_SHIFT;
741 dprintk("%s zero %dth page: index %lu isect %llu\n",
742 __func__, npg_zero, index,
743 (unsigned long long)isect);
744 page = bl_find_get_zeroing_page(header->inode, index,
745 cow_read);
746 if (unlikely(IS_ERR(page))) {
747 header->pnfs_error = PTR_ERR(page);
748 goto out;
749 } else if (page == NULL)
750 goto next_page;
751
752 ret = bl_mark_sectors_init(be->be_inval, isect,
753 PAGE_CACHE_SECTORS);
754 if (unlikely(ret)) {
755 dprintk("%s bl_mark_sectors_init fail %d\n",
756 __func__, ret);
757 end_page_writeback(page);
758 page_cache_release(page);
759 header->pnfs_error = ret;
760 goto out;
761 }
762 if (likely(!bl_push_one_short_extent(be->be_inval)))
763 par->bse_count++;
764 else {
765 end_page_writeback(page);
766 page_cache_release(page);
767 header->pnfs_error = -ENOMEM;
768 goto out;
769 }
770 /* FIXME: This should be done in bi_end_io */
771 mark_extents_written(BLK_LSEG2EXT(header->lseg),
772 page->index << PAGE_CACHE_SHIFT,
773 PAGE_CACHE_SIZE);
774
775 bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
776 isect, page, be,
777 bl_end_io_write_zero, par);
778 if (IS_ERR(bio)) {
779 header->pnfs_error = PTR_ERR(bio);
780 bio = NULL;
781 goto out;
782 }
783next_page:
784 isect += PAGE_CACHE_SECTORS;
785 extent_length -= PAGE_CACHE_SECTORS;
786 }
787 if (last)
788 goto write_done;
789 }
790 bio = bl_submit_bio(WRITE, bio);
791 402
792 /* Middle pages */ 403 for (i = pg_index; i < header->page_array.npages; i++) {
793 pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT; 404 if (extent_length <= 0) {
794 for (i = pg_index; i < wdata->pages.npages; i++) {
795 if (!extent_length) {
796 /* We've used up the previous extent */ 405 /* We've used up the previous extent */
797 bl_put_extent(be);
798 bl_put_extent(cow_read);
799 bio = bl_submit_bio(WRITE, bio); 406 bio = bl_submit_bio(WRITE, bio);
800 /* Get the next one */ 407 /* Get the next one */
801 be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), 408 if (!ext_tree_lookup(bl, isect, &be, true)) {
802 isect, &cow_read);
803 if (!be || !is_writable(be, isect)) {
804 header->pnfs_error = -EINVAL; 409 header->pnfs_error = -EINVAL;
805 goto out; 410 goto out;
806 } 411 }
807 if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
808 if (likely(!bl_push_one_short_extent(
809 be->be_inval)))
810 par->bse_count++;
811 else {
812 header->pnfs_error = -ENOMEM;
813 goto out;
814 }
815 }
816 extent_length = be->be_length -
817 (isect - be->be_f_offset);
818 }
819 412
820 dprintk("%s offset %lld count %Zu\n", __func__, offset, count); 413 extent_length = be.be_length - (isect - be.be_f_offset);
821 pg_offset = offset & ~PAGE_CACHE_MASK;
822 if (pg_offset + count > PAGE_CACHE_SIZE)
823 pg_len = PAGE_CACHE_SIZE - pg_offset;
824 else
825 pg_len = count;
826
827 saved_len = pg_len;
828 if (be->be_state == PNFS_BLOCK_INVALID_DATA &&
829 !bl_is_sector_init(be->be_inval, isect)) {
830 ret = bl_read_partial_page_sync(pages[i], cow_read,
831 pg_offset, pg_len, true);
832 if (ret) {
833 dprintk("%s bl_read_partial_page_sync fail %d\n",
834 __func__, ret);
835 header->pnfs_error = ret;
836 goto out;
837 }
838
839 ret = bl_mark_sectors_init(be->be_inval, isect,
840 PAGE_CACHE_SECTORS);
841 if (unlikely(ret)) {
842 dprintk("%s bl_mark_sectors_init fail %d\n",
843 __func__, ret);
844 header->pnfs_error = ret;
845 goto out;
846 }
847
848 /* Expand to full page write */
849 pg_offset = 0;
850 pg_len = PAGE_CACHE_SIZE;
851 } else if ((pg_offset & (SECTOR_SIZE - 1)) ||
852 (pg_len & (SECTOR_SIZE - 1))){
853 /* ahh, nasty case. We have to do sync full sector
854 * read-modify-write cycles.
855 */
856 unsigned int saved_offset = pg_offset;
857 ret = bl_read_partial_page_sync(pages[i], be, pg_offset,
858 pg_len, false);
859 pg_offset = round_down(pg_offset, SECTOR_SIZE);
860 pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE)
861 - pg_offset;
862 } 414 }
863 415
864 416 pg_len = PAGE_CACHE_SIZE;
865 bio = do_add_page_to_bio(bio, wdata->pages.npages - i, WRITE, 417 bio = do_add_page_to_bio(bio, header->page_array.npages - i,
866 isect, pages[i], be, 418 WRITE, isect, pages[i], &map, &be,
867 bl_end_io_write, par, 419 bl_end_io_write, par,
868 pg_offset, pg_len); 420 0, &pg_len);
869 if (IS_ERR(bio)) { 421 if (IS_ERR(bio)) {
870 header->pnfs_error = PTR_ERR(bio); 422 header->pnfs_error = PTR_ERR(bio);
871 bio = NULL; 423 bio = NULL;
872 goto out; 424 goto out;
873 } 425 }
874 offset += saved_len;
875 count -= saved_len;
876 isect += PAGE_CACHE_SECTORS;
877 last_isect = isect;
878 extent_length -= PAGE_CACHE_SECTORS;
879 }
880 426
881 /* Last page inside INVALID extent */ 427 offset += pg_len;
882 if (be->be_state == PNFS_BLOCK_INVALID_DATA) { 428 count -= pg_len;
883 bio = bl_submit_bio(WRITE, bio); 429 isect += (pg_len >> SECTOR_SHIFT);
884 temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; 430 extent_length -= (pg_len >> SECTOR_SHIFT);
885 npg_zero = npg_per_block - do_div(temp, npg_per_block);
886 if (npg_zero < npg_per_block) {
887 last = 1;
888 goto fill_invalid_ext;
889 }
890 } 431 }
891 432
892write_done: 433 header->res.count = header->args.count;
893 wdata->res.count = wdata->args.count;
894out: 434out:
895 bl_put_extent(be);
896 bl_put_extent(cow_read);
897 bl_submit_bio(WRITE, bio); 435 bl_submit_bio(WRITE, bio);
436 blk_finish_plug(&plug);
898 put_parallel(par); 437 put_parallel(par);
899 return PNFS_ATTEMPTED; 438 return PNFS_ATTEMPTED;
900out_mds:
901 bl_put_extent(be);
902 bl_put_extent(cow_read);
903 kfree(par);
904 return PNFS_NOT_ATTEMPTED;
905}
906
907/* FIXME - range ignored */
908static void
909release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
910{
911 int i;
912 struct pnfs_block_extent *be;
913
914 spin_lock(&bl->bl_ext_lock);
915 for (i = 0; i < EXTENT_LISTS; i++) {
916 while (!list_empty(&bl->bl_extents[i])) {
917 be = list_first_entry(&bl->bl_extents[i],
918 struct pnfs_block_extent,
919 be_node);
920 list_del(&be->be_node);
921 bl_put_extent(be);
922 }
923 }
924 spin_unlock(&bl->bl_ext_lock);
925}
926
927static void
928release_inval_marks(struct pnfs_inval_markings *marks)
929{
930 struct pnfs_inval_tracking *pos, *temp;
931 struct pnfs_block_short_extent *se, *stemp;
932
933 list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
934 list_del(&pos->it_link);
935 kfree(pos);
936 }
937
938 list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
939 list_del(&se->bse_node);
940 kfree(se);
941 }
942 return;
943} 439}
944 440
945static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) 441static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
946{ 442{
947 struct pnfs_block_layout *bl = BLK_LO2EXT(lo); 443 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
444 int err;
948 445
949 dprintk("%s enter\n", __func__); 446 dprintk("%s enter\n", __func__);
950 release_extents(bl, NULL); 447
951 release_inval_marks(&bl->bl_inval); 448 err = ext_tree_remove(bl, true, 0, LLONG_MAX);
449 WARN_ON(err);
450
952 kfree(bl); 451 kfree(bl);
953} 452}
954 453
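Freeing the layout header collapses to a single ext_tree_remove() call spanning the whole byte range (0..LLONG_MAX), replacing the old release_extents()/release_inval_marks() pair; the WARN_ON flags the case where extents could not be dropped.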
@@ -961,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
961 bl = kzalloc(sizeof(*bl), gfp_flags); 460 bl = kzalloc(sizeof(*bl), gfp_flags);
962 if (!bl) 461 if (!bl)
963 return NULL; 462 return NULL;
463
464 bl->bl_ext_rw = RB_ROOT;
465 bl->bl_ext_ro = RB_ROOT;
964 spin_lock_init(&bl->bl_ext_lock); 466 spin_lock_init(&bl->bl_ext_lock);
965 INIT_LIST_HEAD(&bl->bl_extents[0]); 467
966 INIT_LIST_HEAD(&bl->bl_extents[1]);
967 INIT_LIST_HEAD(&bl->bl_commit);
968 INIT_LIST_HEAD(&bl->bl_committing);
969 bl->bl_count = 0;
970 bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
971 BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
972 return &bl->bl_layout; 468 return &bl->bl_layout;
973} 469}
974 470
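The layout header now carries two extent trees, bl_ext_rw and bl_ext_ro, initialized to RB_ROOT. The per-iomode extent lists, the commit/committing bookkeeping and the inval-marks bitmap are all gone; that state presumably lives in the rb-tree nodes managed by the new extent_tree.o code.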
@@ -978,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg)
978 kfree(lseg); 474 kfree(lseg);
979} 475}
980 476
981/* We pretty much ignore lseg, and store all data layout wide, so we 477/* Tracks info needed to ensure extents in layout obey constraints of spec */
982 * can correctly merge. 478struct layout_verification {
983 */ 479 u32 mode; /* R or RW */
984static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, 480 u64 start; /* Expected start of next non-COW extent */
985 struct nfs4_layoutget_res *lgr, 481 u64 inval; /* Start of INVAL coverage */
986 gfp_t gfp_flags) 482 u64 cowread; /* End of COW read coverage */
987{ 483};
988 struct pnfs_layout_segment *lseg;
989 int status;
990 484
991 dprintk("%s enter\n", __func__); 485/* Verify the extent meets the layout requirements of the pnfs-block draft,
992 lseg = kzalloc(sizeof(*lseg), gfp_flags); 486 * section 2.3.1.
993 if (!lseg) 487 */
994 return ERR_PTR(-ENOMEM); 488static int verify_extent(struct pnfs_block_extent *be,
995 status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); 489 struct layout_verification *lv)
996 if (status) { 490{
997 /* We don't want to call the full-blown bl_free_lseg, 491 if (lv->mode == IOMODE_READ) {
998 * since on error extents were not touched. 492 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
999 */ 493 be->be_state == PNFS_BLOCK_INVALID_DATA)
1000 kfree(lseg); 494 return -EIO;
1001 return ERR_PTR(status); 495 if (be->be_f_offset != lv->start)
496 return -EIO;
497 lv->start += be->be_length;
498 return 0;
1002 } 499 }
1003 return lseg; 500 /* lv->mode == IOMODE_RW */
501 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
502 if (be->be_f_offset != lv->start)
503 return -EIO;
504 if (lv->cowread > lv->start)
505 return -EIO;
506 lv->start += be->be_length;
507 lv->inval = lv->start;
508 return 0;
509 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
510 if (be->be_f_offset != lv->start)
511 return -EIO;
512 lv->start += be->be_length;
513 return 0;
514 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
515 if (be->be_f_offset > lv->start)
516 return -EIO;
517 if (be->be_f_offset < lv->inval)
518 return -EIO;
519 if (be->be_f_offset < lv->cowread)
520 return -EIO;
521 /* It looks like you might want to min this with lv->start,
522 * but you really don't.
523 */
524 lv->inval = lv->inval + be->be_length;
525 lv->cowread = be->be_f_offset + be->be_length;
526 return 0;
527 } else
528 return -EIO;
1004} 529}
1005 530
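verify_extent() enforces the draft's stacking rules incrementally: lv.start must advance contiguously through the non-COW extents, while READ_DATA (COW source) extents may trail lv.start but never reach back before lv.inval or an earlier COW extent. An illustrative RW-mode sequence:

    /* lv starts with start = inval = cowread = 0.
     *
     *      RW    [0, N)   -> ok: start = N, inval = N
     *      INVAL [N, 2N)  -> ok: start = 2N (inval stays N)
     *      READ  [N, 2N)  -> ok: COW source for the INVAL range;
     *                        inval = 2N, cowread = 2N
     *      READ  [0, N)   -> -EIO: be_f_offset (0) < lv.inval
     *
     * bl_alloc_lseg() additionally rejects a layout whose final
     * lv.start is below lv.cowread, i.e. a trailing COW read with no
     * matching writable extent. */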
1006static void 531static int decode_sector_number(__be32 **rp, sector_t *sp)
1007bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
1008 const struct nfs4_layoutcommit_args *arg)
1009{ 532{
1010 dprintk("%s enter\n", __func__); 533 uint64_t s;
1011 encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); 534
535 *rp = xdr_decode_hyper(*rp, &s);
536 if (s & 0x1ff) {
537 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
538 return -1;
539 }
540 *sp = s >> SECTOR_SHIFT;
541 return 0;
1012} 542}
1013 543
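decode_sector_number() insists that the server hands out 512-byte-aligned byte values: the 0x1ff mask is exactly the low SECTOR_SHIFT (9) bits, so e.g. an offset of 1 MiB decodes to sector 1048576 >> 9 = 2048, while any value with a nonzero remainder is rejected.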
1014static void 544static int
1015bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) 545bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
546 struct layout_verification *lv, struct list_head *extents,
547 gfp_t gfp_mask)
1016{ 548{
1017 struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; 549 struct pnfs_block_extent *be;
550 struct nfs4_deviceid id;
551 int error;
552 __be32 *p;
1018 553
1019 dprintk("%s enter\n", __func__); 554 p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
1020 clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); 555 if (!p)
1021} 556 return -EIO;
1022 557
1023static void free_blk_mountid(struct block_mount_id *mid) 558 be = kzalloc(sizeof(*be), GFP_NOFS);
1024{ 559 if (!be)
1025 if (mid) { 560 return -ENOMEM;
1026 struct pnfs_block_dev *dev, *tmp;
1027 561
1028 /* No need to take bm_lock as we are last user freeing bm_devlist */ 562 memcpy(&id, p, NFS4_DEVICEID4_SIZE);
1029 list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { 563 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
1030 list_del(&dev->bm_node); 564
1031 bl_free_block_dev(dev); 565 error = -EIO;
1032 } 566 be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
1033 kfree(mid); 567 lo->plh_lc_cred, gfp_mask);
568 if (!be->be_device)
569 goto out_free_be;
570
571 /*
572 * The next three values are read in as bytes, but stored in the
573 * extent structure in 512-byte granularity.
574 */
575 if (decode_sector_number(&p, &be->be_f_offset) < 0)
576 goto out_put_deviceid;
577 if (decode_sector_number(&p, &be->be_length) < 0)
578 goto out_put_deviceid;
579 if (decode_sector_number(&p, &be->be_v_offset) < 0)
580 goto out_put_deviceid;
581 be->be_state = be32_to_cpup(p++);
582
583 error = verify_extent(be, lv);
584 if (error) {
585 dprintk("%s: extent verification failed\n", __func__);
586 goto out_put_deviceid;
1034 } 587 }
588
589 list_add_tail(&be->be_list, extents);
590 return 0;
591
592out_put_deviceid:
593 nfs4_put_deviceid_node(be->be_device);
594out_free_be:
595 kfree(be);
596 return error;
1035} 597}
1036 598
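The xdr_inline_decode() length is worth decoding itself: 28 bytes covers the three 64-bit sector values (3 x 8) plus the 32-bit extent state, and NFS4_DEVICEID4_SIZE (16 bytes) covers the device ID in front of them, so a single inline decode pulls one complete on-the-wire extent.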
1037/* This is mostly copied from the filelayout_get_device_info function. 599static struct pnfs_layout_segment *
1038 * It seems much of this should be at the generic pnfs level. 600bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
1039 */ 601 gfp_t gfp_mask)
1040static struct pnfs_block_dev *
1041nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
1042 struct nfs4_deviceid *d_id)
1043{ 602{
1044 struct pnfs_device *dev; 603 struct layout_verification lv = {
1045 struct pnfs_block_dev *rv; 604 .mode = lgr->range.iomode,
1046 u32 max_resp_sz; 605 .start = lgr->range.offset >> SECTOR_SHIFT,
1047 int max_pages; 606 .inval = lgr->range.offset >> SECTOR_SHIFT,
1048 struct page **pages = NULL; 607 .cowread = lgr->range.offset >> SECTOR_SHIFT,
1049 int i, rc; 608 };
609 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
610 struct pnfs_layout_segment *lseg;
611 struct xdr_buf buf;
612 struct xdr_stream xdr;
613 struct page *scratch;
614 int status, i;
615 uint32_t count;
616 __be32 *p;
617 LIST_HEAD(extents);
618
619 dprintk("---> %s\n", __func__);
620
621 lseg = kzalloc(sizeof(*lseg), gfp_mask);
622 if (!lseg)
623 return ERR_PTR(-ENOMEM);
624
625 status = -ENOMEM;
626 scratch = alloc_page(gfp_mask);
627 if (!scratch)
628 goto out;
629
630 xdr_init_decode_pages(&xdr, &buf,
631 lgr->layoutp->pages, lgr->layoutp->len);
632 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
633
634 status = -EIO;
635 p = xdr_inline_decode(&xdr, 4);
636 if (unlikely(!p))
637 goto out_free_scratch;
638
639 count = be32_to_cpup(p++);
640 dprintk("%s: number of extents %d\n", __func__, count);
1050 641
1051 /* 642 /*
1052 * Use the session max response size as the basis for setting 643 * Decode individual extents, putting them in temporary staging area
1053 * GETDEVICEINFO's maxcount 644 * until whole layout is decoded to make error recovery easier.
1054 */ 645 */
1055 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; 646 for (i = 0; i < count; i++) {
1056 max_pages = nfs_page_array_len(0, max_resp_sz); 647 status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
1057 dprintk("%s max_resp_sz %u max_pages %d\n", 648 if (status)
1058 __func__, max_resp_sz, max_pages); 649 goto process_extents;
1059
1060 dev = kmalloc(sizeof(*dev), GFP_NOFS);
1061 if (!dev) {
1062 dprintk("%s kmalloc failed\n", __func__);
1063 return ERR_PTR(-ENOMEM);
1064 } 650 }
1065 651
1066 pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS); 652 if (lgr->range.offset + lgr->range.length !=
1067 if (pages == NULL) { 653 lv.start << SECTOR_SHIFT) {
1068 kfree(dev); 654 dprintk("%s Final length mismatch\n", __func__);
1069 return ERR_PTR(-ENOMEM); 655 status = -EIO;
656 goto process_extents;
1070 } 657 }
1071 for (i = 0; i < max_pages; i++) { 658
1072 pages[i] = alloc_page(GFP_NOFS); 659 if (lv.start < lv.cowread) {
1073 if (!pages[i]) { 660 dprintk("%s Final uncovered COW extent\n", __func__);
1074 rv = ERR_PTR(-ENOMEM); 661 status = -EIO;
1075 goto out_free;
1076 }
1077 } 662 }
1078 663
1079 memcpy(&dev->dev_id, d_id, sizeof(*d_id)); 664process_extents:
1080 dev->layout_type = LAYOUT_BLOCK_VOLUME; 665 while (!list_empty(&extents)) {
1081 dev->pages = pages; 666 struct pnfs_block_extent *be =
1082 dev->pgbase = 0; 667 list_first_entry(&extents, struct pnfs_block_extent,
1083 dev->pglen = PAGE_SIZE * max_pages; 668 be_list);
1084 dev->mincount = 0; 669 list_del(&be->be_list);
1085 dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; 670
1086 671 if (!status)
1087 dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); 672 status = ext_tree_insert(bl, be);
1088 rc = nfs4_proc_getdeviceinfo(server, dev, NULL); 673
1089 dprintk("%s getdevice info returns %d\n", __func__, rc); 674 if (status) {
1090 if (rc) { 675 nfs4_put_deviceid_node(be->be_device);
1091 rv = ERR_PTR(rc); 676 kfree(be);
1092 goto out_free; 677 }
1093 } 678 }
1094 679
1095 rv = nfs4_blk_decode_device(server, dev); 680out_free_scratch:
1096 out_free: 681 __free_page(scratch);
1097 for (i = 0; i < max_pages; i++) 682out:
1098 __free_page(pages[i]); 683 dprintk("%s returns %d\n", __func__, status);
1099 kfree(pages); 684 if (status) {
1100 kfree(dev); 685 kfree(lseg);
1101 return rv; 686 return ERR_PTR(status);
687 }
688 return lseg;
1102} 689}
1103 690
1104static int 691static void
1105bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) 692bl_return_range(struct pnfs_layout_hdr *lo,
693 struct pnfs_layout_range *range)
1106{ 694{
1107 struct block_mount_id *b_mt_id = NULL; 695 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
1108 struct pnfs_devicelist *dlist = NULL; 696 sector_t offset = range->offset >> SECTOR_SHIFT, end;
1109 struct pnfs_block_dev *bdev;
1110 LIST_HEAD(block_disklist);
1111 int status, i;
1112
1113 dprintk("%s enter\n", __func__);
1114 697
1115 if (server->pnfs_blksize == 0) { 698 if (range->offset % 8) {
1116 dprintk("%s Server did not return blksize\n", __func__); 699 dprintk("%s: offset %lld not block size aligned\n",
1117 return -EINVAL; 700 __func__, range->offset);
1118 } 701 return;
1119 b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
1120 if (!b_mt_id) {
1121 status = -ENOMEM;
1122 goto out_error;
1123 }
1124 /* Initialize nfs4 block layout mount id */
1125 spin_lock_init(&b_mt_id->bm_lock);
1126 INIT_LIST_HEAD(&b_mt_id->bm_devlist);
1127
1128 dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
1129 if (!dlist) {
1130 status = -ENOMEM;
1131 goto out_error;
1132 } 702 }
1133 dlist->eof = 0; 703
1134 while (!dlist->eof) { 704 if (range->length != NFS4_MAX_UINT64) {
1135 status = nfs4_proc_getdevicelist(server, fh, dlist); 705 if (range->length % 8) {
1136 if (status) 706 dprintk("%s: length %lld not block size aligned\n",
1137 goto out_error; 707 __func__, range->length);
1138 dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", 708 return;
1139 __func__, dlist->num_devs, dlist->eof);
1140 for (i = 0; i < dlist->num_devs; i++) {
1141 bdev = nfs4_blk_get_deviceinfo(server, fh,
1142 &dlist->dev_id[i]);
1143 if (IS_ERR(bdev)) {
1144 status = PTR_ERR(bdev);
1145 goto out_error;
1146 }
1147 spin_lock(&b_mt_id->bm_lock);
1148 list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
1149 spin_unlock(&b_mt_id->bm_lock);
1150 } 709 }
1151 }
1152 dprintk("%s SUCCESS\n", __func__);
1153 server->pnfs_ld_data = b_mt_id;
1154 710
1155 out_return: 711 end = offset + (range->length >> SECTOR_SHIFT);
1156 kfree(dlist); 712 } else {
1157 return status; 713 end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
714 }
1158 715
1159 out_error: 716 ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
1160 free_blk_mountid(b_mt_id);
1161 goto out_return;
1162} 717}
1163 718
1164static int 719static int
1165bl_clear_layoutdriver(struct nfs_server *server) 720bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
1166{ 721{
1167 struct block_mount_id *b_mt_id = server->pnfs_ld_data; 722 return ext_tree_prepare_commit(arg);
723}
1168 724
725static void
726bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
727{
728 ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
729}
730
731static int
732bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
733{
1169 dprintk("%s enter\n", __func__); 734 dprintk("%s enter\n", __func__);
1170 free_blk_mountid(b_mt_id); 735
1171 dprintk("%s RETURNS\n", __func__); 736 if (server->pnfs_blksize == 0) {
737 dprintk("%s Server did not return blksize\n", __func__);
738 return -EINVAL;
739 }
740 if (server->pnfs_blksize > PAGE_SIZE) {
741 printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
742 __func__, server->pnfs_blksize);
743 return -EINVAL;
744 }
745
1172 return 0; 746 return 0;
1173} 747}
1174 748
1175static bool 749static bool
1176is_aligned_req(struct nfs_page *req, unsigned int alignment) 750is_aligned_req(struct nfs_pageio_descriptor *pgio,
751 struct nfs_page *req, unsigned int alignment)
1177{ 752{
1178 return IS_ALIGNED(req->wb_offset, alignment) && 753 /*
1179 IS_ALIGNED(req->wb_bytes, alignment); 754 * Always accept buffered writes, higher layers take care of the
755 * right alignment.
756 */
757 if (pgio->pg_dreq == NULL)
758 return true;
759
760 if (!IS_ALIGNED(req->wb_offset, alignment))
761 return false;
762
763 if (IS_ALIGNED(req->wb_bytes, alignment))
764 return true;
765
766 if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) {
767 /*
768 * If the write goes up to the inode size, just write
769 * the full page. Data past the inode size is
770 * guaranteed to be zeroed by the higher level client
771 * code, and this behaviour is mandated by RFC 5663
772 * section 2.3.2.
773 */
774 return true;
775 }
776
777 return false;
1180} 778}
1181 779
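The alignment check now takes the pageio descriptor: buffered I/O is always accepted, since the higher layers guarantee page granularity, while direct I/O must be aligned unless the request's tail lands exactly on i_size. That end-of-file case is the one RFC 5663 section 2.3.2 lets the client round up to a full page, because the overhang is guaranteed to read back as zeroes.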
1182static void 780static void
1183bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 781bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1184{ 782{
1185 if (pgio->pg_dreq != NULL && 783 if (!is_aligned_req(pgio, req, SECTOR_SIZE)) {
1186 !is_aligned_req(req, SECTOR_SIZE))
1187 nfs_pageio_reset_read_mds(pgio); 784 nfs_pageio_reset_read_mds(pgio);
1188 else 785 return;
1189 pnfs_generic_pg_init_read(pgio, req); 786 }
787
788 pnfs_generic_pg_init_read(pgio, req);
1190} 789}
1191 790
1192/* 791/*
@@ -1197,10 +796,8 @@ static size_t
1197bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 796bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1198 struct nfs_page *req) 797 struct nfs_page *req)
1199{ 798{
1200 if (pgio->pg_dreq != NULL && 799 if (!is_aligned_req(pgio, req, SECTOR_SIZE))
1201 !is_aligned_req(req, SECTOR_SIZE))
1202 return 0; 800 return 0;
1203
1204 return pnfs_generic_pg_test(pgio, prev, req); 801 return pnfs_generic_pg_test(pgio, prev, req);
1205} 802}
1206 803
@@ -1230,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
1230static void 827static void
1231bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 828bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1232{ 829{
1233 if (pgio->pg_dreq != NULL && 830 u64 wb_size;
1234 !is_aligned_req(req, PAGE_CACHE_SIZE)) { 831
832 if (!is_aligned_req(pgio, req, PAGE_SIZE)) {
1235 nfs_pageio_reset_write_mds(pgio); 833 nfs_pageio_reset_write_mds(pgio);
1236 } else { 834 return;
1237 u64 wb_size;
1238 if (pgio->pg_dreq == NULL)
1239 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
1240 req->wb_index);
1241 else
1242 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1243
1244 pnfs_generic_pg_init_write(pgio, req, wb_size);
1245 } 835 }
836
837 if (pgio->pg_dreq == NULL)
838 wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
839 req->wb_index);
840 else
841 wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
842
843 pnfs_generic_pg_init_write(pgio, req, wb_size);
1246} 844}
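For writes the descriptor still has to estimate how much contiguous data will follow: buffered writeback asks pnfs_num_cont_bytes() how far the dirty run extends from req->wb_index, while direct I/O simply uses the bytes remaining in the dreq.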
1247 845
1248/* 846/*
@@ -1253,10 +851,8 @@ static size_t
1253bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 851bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1254 struct nfs_page *req) 852 struct nfs_page *req)
1255{ 853{
1256 if (pgio->pg_dreq != NULL && 854 if (!is_aligned_req(pgio, req, PAGE_SIZE))
1257 !is_aligned_req(req, PAGE_CACHE_SIZE))
1258 return 0; 855 return 0;
1259
1260 return pnfs_generic_pg_test(pgio, prev, req); 856 return pnfs_generic_pg_test(pgio, prev, req);
1261} 857}
1262 858
@@ -1276,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = {
1276 .id = LAYOUT_BLOCK_VOLUME, 872 .id = LAYOUT_BLOCK_VOLUME,
1277 .name = "LAYOUT_BLOCK_VOLUME", 873 .name = "LAYOUT_BLOCK_VOLUME",
1278 .owner = THIS_MODULE, 874 .owner = THIS_MODULE,
875 .flags = PNFS_LAYOUTRET_ON_SETATTR |
876 PNFS_READ_WHOLE_PAGE,
1279 .read_pagelist = bl_read_pagelist, 877 .read_pagelist = bl_read_pagelist,
1280 .write_pagelist = bl_write_pagelist, 878 .write_pagelist = bl_write_pagelist,
1281 .alloc_layout_hdr = bl_alloc_layout_hdr, 879 .alloc_layout_hdr = bl_alloc_layout_hdr,
1282 .free_layout_hdr = bl_free_layout_hdr, 880 .free_layout_hdr = bl_free_layout_hdr,
1283 .alloc_lseg = bl_alloc_lseg, 881 .alloc_lseg = bl_alloc_lseg,
1284 .free_lseg = bl_free_lseg, 882 .free_lseg = bl_free_lseg,
1285 .encode_layoutcommit = bl_encode_layoutcommit, 883 .return_range = bl_return_range,
884 .prepare_layoutcommit = bl_prepare_layoutcommit,
1286 .cleanup_layoutcommit = bl_cleanup_layoutcommit, 885 .cleanup_layoutcommit = bl_cleanup_layoutcommit,
1287 .set_layoutdriver = bl_set_layoutdriver, 886 .set_layoutdriver = bl_set_layoutdriver,
1288 .clear_layoutdriver = bl_clear_layoutdriver, 887 .alloc_deviceid_node = bl_alloc_deviceid_node,
888 .free_deviceid_node = bl_free_deviceid_node,
1289 .pg_read_ops = &bl_pg_read_ops, 889 .pg_read_ops = &bl_pg_read_ops,
1290 .pg_write_ops = &bl_pg_write_ops, 890 .pg_write_ops = &bl_pg_write_ops,
1291}; 891};
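Two new layoutdriver flags encode the policies seen above: PNFS_LAYOUTRET_ON_SETATTR returns the layout on truncate-like setattrs, and PNFS_READ_WHOLE_PAGE presumably makes the generic pNFS code expand reads to full pages to match the driver's whole-page write behaviour. The rpc_pipefs plumbing removed below moves behind bl_init_pipefs(), presumably into the new rpc_pipefs.c added to the Makefile earlier.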
1292 892
1293static const struct rpc_pipe_ops bl_upcall_ops = {
1294 .upcall = rpc_pipe_generic_upcall,
1295 .downcall = bl_pipe_downcall,
1296 .destroy_msg = bl_pipe_destroy_msg,
1297};
1298
1299static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
1300 struct rpc_pipe *pipe)
1301{
1302 struct dentry *dir, *dentry;
1303
1304 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
1305 if (dir == NULL)
1306 return ERR_PTR(-ENOENT);
1307 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
1308 dput(dir);
1309 return dentry;
1310}
1311
1312static void nfs4blocklayout_unregister_sb(struct super_block *sb,
1313 struct rpc_pipe *pipe)
1314{
1315 if (pipe->dentry)
1316 rpc_unlink(pipe->dentry);
1317}
1318
1319static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
1320 void *ptr)
1321{
1322 struct super_block *sb = ptr;
1323 struct net *net = sb->s_fs_info;
1324 struct nfs_net *nn = net_generic(net, nfs_net_id);
1325 struct dentry *dentry;
1326 int ret = 0;
1327
1328 if (!try_module_get(THIS_MODULE))
1329 return 0;
1330
1331 if (nn->bl_device_pipe == NULL) {
1332 module_put(THIS_MODULE);
1333 return 0;
1334 }
1335
1336 switch (event) {
1337 case RPC_PIPEFS_MOUNT:
1338 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
1339 if (IS_ERR(dentry)) {
1340 ret = PTR_ERR(dentry);
1341 break;
1342 }
1343 nn->bl_device_pipe->dentry = dentry;
1344 break;
1345 case RPC_PIPEFS_UMOUNT:
1346 if (nn->bl_device_pipe->dentry)
1347 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
1348 break;
1349 default:
1350 ret = -ENOTSUPP;
1351 break;
1352 }
1353 module_put(THIS_MODULE);
1354 return ret;
1355}
1356
1357static struct notifier_block nfs4blocklayout_block = {
1358 .notifier_call = rpc_pipefs_event,
1359};
1360
1361static struct dentry *nfs4blocklayout_register_net(struct net *net,
1362 struct rpc_pipe *pipe)
1363{
1364 struct super_block *pipefs_sb;
1365 struct dentry *dentry;
1366
1367 pipefs_sb = rpc_get_sb_net(net);
1368 if (!pipefs_sb)
1369 return NULL;
1370 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
1371 rpc_put_sb_net(net);
1372 return dentry;
1373}
1374
1375static void nfs4blocklayout_unregister_net(struct net *net,
1376 struct rpc_pipe *pipe)
1377{
1378 struct super_block *pipefs_sb;
1379
1380 pipefs_sb = rpc_get_sb_net(net);
1381 if (pipefs_sb) {
1382 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
1383 rpc_put_sb_net(net);
1384 }
1385}
1386
1387static int nfs4blocklayout_net_init(struct net *net)
1388{
1389 struct nfs_net *nn = net_generic(net, nfs_net_id);
1390 struct dentry *dentry;
1391
1392 init_waitqueue_head(&nn->bl_wq);
1393 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
1394 if (IS_ERR(nn->bl_device_pipe))
1395 return PTR_ERR(nn->bl_device_pipe);
1396 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
1397 if (IS_ERR(dentry)) {
1398 rpc_destroy_pipe_data(nn->bl_device_pipe);
1399 return PTR_ERR(dentry);
1400 }
1401 nn->bl_device_pipe->dentry = dentry;
1402 return 0;
1403}
1404
1405static void nfs4blocklayout_net_exit(struct net *net)
1406{
1407 struct nfs_net *nn = net_generic(net, nfs_net_id);
1408
1409 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
1410 rpc_destroy_pipe_data(nn->bl_device_pipe);
1411 nn->bl_device_pipe = NULL;
1412}
1413
1414static struct pernet_operations nfs4blocklayout_net_ops = {
1415 .init = nfs4blocklayout_net_init,
1416 .exit = nfs4blocklayout_net_exit,
1417};
1418
1419static int __init nfs4blocklayout_init(void) 893static int __init nfs4blocklayout_init(void)
1420{ 894{
1421 int ret; 895 int ret;
@@ -1425,20 +899,14 @@ static int __init nfs4blocklayout_init(void)
1425 ret = pnfs_register_layoutdriver(&blocklayout_type); 899 ret = pnfs_register_layoutdriver(&blocklayout_type);
1426 if (ret) 900 if (ret)
1427 goto out; 901 goto out;
1428 902 ret = bl_init_pipefs();
1429 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
1430 if (ret) 903 if (ret)
1431 goto out_remove; 904 goto out_unregister;
1432 ret = register_pernet_subsys(&nfs4blocklayout_net_ops); 905 return 0;
1433 if (ret)
1434 goto out_notifier;
1435out:
1436 return ret;
1437 906
1438out_notifier: 907out_unregister:
1439 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
1440out_remove:
1441 pnfs_unregister_layoutdriver(&blocklayout_type); 908 pnfs_unregister_layoutdriver(&blocklayout_type);
909out:
1442 return ret; 910 return ret;
1443} 911}
1444 912
@@ -1447,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void)
1447 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", 915 dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
1448 __func__); 916 __func__);
1449 917
1450 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); 918 bl_cleanup_pipefs();
1451 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
1452 pnfs_unregister_layoutdriver(&blocklayout_type); 919 pnfs_unregister_layoutdriver(&blocklayout_type);
1453} 920}
1454 921
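With the rpc_pipefs plumbing moved behind bl_init_pipefs()/bl_cleanup_pipefs(), module init collapses to the usual goto-unwind shape; assembled from the added lines:

	static int __init nfs4blocklayout_init(void)
	{
		int ret;

		ret = pnfs_register_layoutdriver(&blocklayout_type);
		if (ret)
			goto out;
		ret = bl_init_pipefs();
		if (ret)
			goto out_unregister;
		return 0;

	out_unregister:
		pnfs_unregister_layoutdriver(&blocklayout_type);
	out:
		return ret;
	}

Each failure label undoes exactly the steps that succeeded before it, and the exit path (bl_cleanup_pipefs, then pnfs_unregister_layoutdriver) mirrors init in reverse.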
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 9838fb020473..92dca9e90d8d 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -44,105 +44,112 @@
44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) 44#define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
45#define SECTOR_SIZE (1 << SECTOR_SHIFT) 45#define SECTOR_SIZE (1 << SECTOR_SHIFT)
46 46
47struct block_mount_id { 47struct pnfs_block_dev;
48 spinlock_t bm_lock; /* protects list */
49 struct list_head bm_devlist; /* holds pnfs_block_dev */
50};
51 48
52struct pnfs_block_dev { 49enum pnfs_block_volume_type {
53 struct list_head bm_node; 50 PNFS_BLOCK_VOLUME_SIMPLE = 0,
54 struct nfs4_deviceid bm_mdevid; /* associated devid */ 51 PNFS_BLOCK_VOLUME_SLICE = 1,
55 struct block_device *bm_mdev; /* meta device itself */ 52 PNFS_BLOCK_VOLUME_CONCAT = 2,
56 struct net *net; 53 PNFS_BLOCK_VOLUME_STRIPE = 3,
57}; 54};
58 55
59enum exstate4 { 56#define PNFS_BLOCK_MAX_UUIDS 4
60 PNFS_BLOCK_READWRITE_DATA = 0, 57#define PNFS_BLOCK_MAX_DEVICES 64
61 PNFS_BLOCK_READ_DATA = 1, 58
62 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ 59/*
63 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ 60 * Random upper cap for the uuid length to avoid unbounded allocation.
61 * Not actually limited by the protocol.
62 */
63#define PNFS_BLOCK_UUID_LEN 128
64
65
66struct pnfs_block_volume {
67 enum pnfs_block_volume_type type;
68 union {
69 struct {
70 int len;
71 int nr_sigs;
72 struct {
73 u64 offset;
74 u32 sig_len;
75 u8 sig[PNFS_BLOCK_UUID_LEN];
76 } sigs[PNFS_BLOCK_MAX_UUIDS];
77 } simple;
78 struct {
79 u64 start;
80 u64 len;
81 u32 volume;
82 } slice;
83 struct {
84 u32 volumes_count;
85 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
86 } concat;
87 struct {
88 u64 chunk_size;
89 u32 volumes_count;
90 u32 volumes[PNFS_BLOCK_MAX_DEVICES];
91 } stripe;
92 };
64}; 93};
65 94
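Composite volume types (slice, concat, stripe) reference other entries of the decoded volume array by index, so a device description forms a small tree over the array. A hypothetical three-entry array, a stripe over two simple volumes, would look like this (values invented for illustration):

	struct pnfs_block_volume volumes[3] = {
		[0] = { .type = PNFS_BLOCK_VOLUME_SIMPLE },	/* sigs omitted */
		[1] = { .type = PNFS_BLOCK_VOLUME_SIMPLE },	/* sigs omitted */
		[2] = { .type = PNFS_BLOCK_VOLUME_STRIPE,
			.stripe = { .chunk_size	   = 64,
				    .volumes_count = 2,
				    .volumes	   = { 0, 1 } } },
	};

bl_parse_deviceid() (added in dev.c below) starts from the last entry and recurses down the referenced indices.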
66#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ 95struct pnfs_block_dev_map {
96 sector_t start;
97 sector_t len;
67 98
68struct my_tree { 99 sector_t disk_offset;
69 sector_t mtt_step_size; /* Internal sector alignment */ 100 struct block_device *bdev;
70 struct list_head mtt_stub; /* Should be a radix tree */
71}; 101};
72 102
73struct pnfs_inval_markings { 103struct pnfs_block_dev {
74 spinlock_t im_lock; 104 struct nfs4_deviceid_node node;
75 struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ 105
76 sector_t im_block_size; /* Server blocksize in sectors */ 106 u64 start;
77 struct list_head im_extents; /* Short extents for INVAL->RW conversion */ 107 u64 len;
108
109 u32 nr_children;
110 struct pnfs_block_dev *children;
111 u64 chunk_size;
112
113 struct block_device *bdev;
114 u64 disk_offset;
115
116 bool (*map)(struct pnfs_block_dev *dev, u64 offset,
117 struct pnfs_block_dev_map *map);
78}; 118};
79 119
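Each node of the resulting pnfs_block_dev tree carries a map callback that resolves an offset within that node to a physical (bdev, disk_offset) window. Caller-side use is roughly the following sketch (the real I/O paths live in blocklayout.c; byte units are an assumption here, matching the i_size-based lengths set in dev.c below):

	struct pnfs_block_dev_map map;

	if (!dev->map(dev, offset, &map))
		return -EIO;
	/* issue the bio to map.bdev, starting at
	 * map.disk_offset + (offset - map.start),
	 * for at most map.len bytes of this window */

Leaf (simple) devices fill the window directly; concat nodes delegate to the child covering the offset, and stripe nodes additionally adjust start and disk_offset for the chunk.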
80struct pnfs_inval_tracking { 120enum exstate4 {
81 struct list_head it_link; 121 PNFS_BLOCK_READWRITE_DATA = 0,
82 int it_sector; 122 PNFS_BLOCK_READ_DATA = 1,
83 int it_tags; 123 PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
124 PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
84}; 125};
85 126
86/* sector_t fields are all in 512-byte sectors */ 127/* sector_t fields are all in 512-byte sectors */
87struct pnfs_block_extent { 128struct pnfs_block_extent {
88 struct kref be_refcnt; 129 union {
89 struct list_head be_node; /* link into lseg list */ 130 struct rb_node be_node;
90 struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ 131 struct list_head be_list;
91 struct block_device *be_mdev; 132 };
133 struct nfs4_deviceid_node *be_device;
92 sector_t be_f_offset; /* the starting offset in the file */ 134 sector_t be_f_offset; /* the starting offset in the file */
93 sector_t be_length; /* the size of the extent */ 135 sector_t be_length; /* the size of the extent */
94 sector_t be_v_offset; /* the starting offset in the volume */ 136 sector_t be_v_offset; /* the starting offset in the volume */
95 enum exstate4 be_state; /* the state of this extent */ 137 enum exstate4 be_state; /* the state of this extent */
96 struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ 138#define EXTENT_WRITTEN 1
139#define EXTENT_COMMITTING 2
140 unsigned int be_tag;
97}; 141};
98 142
99/* Shortened extent used by LAYOUTCOMMIT */ 143/* on the wire size of the extent */
100struct pnfs_block_short_extent { 144#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
101 struct list_head bse_node;
102 struct nfs4_deviceid bse_devid;
103 struct block_device *bse_mdev;
104 sector_t bse_f_offset; /* the starting offset in the file */
105 sector_t bse_length; /* the size of the extent */
106};
107
108static inline void
109BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
110{
111 spin_lock_init(&marks->im_lock);
112 INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
113 INIT_LIST_HEAD(&marks->im_extents);
114 marks->im_block_size = blocksize;
115 marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
116 blocksize);
117}
118
119enum extentclass4 {
 120	RW_EXTENT = 0, /* READWRITE and INVAL */
121 RO_EXTENT = 1, /* READ and NONE */
122 EXTENT_LISTS = 2,
123};
124
125static inline int bl_choose_list(enum exstate4 state)
126{
127 if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA)
128 return RO_EXTENT;
129 else
130 return RW_EXTENT;
131}
132 145
133struct pnfs_block_layout { 146struct pnfs_block_layout {
134 struct pnfs_layout_hdr bl_layout; 147 struct pnfs_layout_hdr bl_layout;
135 struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ 148 struct rb_root bl_ext_rw;
149 struct rb_root bl_ext_ro;
136 spinlock_t bl_ext_lock; /* Protects list manipulation */ 150 spinlock_t bl_ext_lock; /* Protects list manipulation */
137 struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */
138 struct list_head bl_commit; /* Needs layout commit */
139 struct list_head bl_committing; /* Layout committing */
140 unsigned int bl_count; /* entries in bl_commit */
141 sector_t bl_blocksize; /* Server blocksize in sectors */
142}; 151};
143 152
144#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data))
145
146static inline struct pnfs_block_layout * 153static inline struct pnfs_block_layout *
147BLK_LO2EXT(struct pnfs_layout_hdr *lo) 154BLK_LO2EXT(struct pnfs_layout_hdr *lo)
148{ 155{
@@ -171,41 +178,27 @@ struct bl_msg_hdr {
171#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ 178#define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */
172#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ 179#define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */
173 180
174/* blocklayoutdev.c */ 181/* dev.c */
175ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); 182struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
176void bl_pipe_destroy_msg(struct rpc_pipe_msg *); 183 struct pnfs_device *pdev, gfp_t gfp_mask);
177void nfs4_blkdev_put(struct block_device *bdev); 184void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
178struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, 185
179 struct pnfs_device *dev); 186/* extent_tree.c */
180int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, 187int ext_tree_insert(struct pnfs_block_layout *bl,
181 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); 188 struct pnfs_block_extent *new);
182 189int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start,
183/* blocklayoutdm.c */ 190 sector_t end);
184void bl_free_block_dev(struct pnfs_block_dev *bdev); 191int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
185 192 sector_t len);
186/* extents.c */ 193bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
187struct pnfs_block_extent * 194 struct pnfs_block_extent *ret, bool rw);
188bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, 195int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
189 struct pnfs_block_extent **cow_read); 196void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
190int bl_mark_sectors_init(struct pnfs_inval_markings *marks, 197
191 sector_t offset, sector_t length); 198/* rpc_pipefs.c */
192void bl_put_extent(struct pnfs_block_extent *be); 199dev_t bl_resolve_deviceid(struct nfs_server *server,
193struct pnfs_block_extent *bl_alloc_extent(void); 200 struct pnfs_block_volume *b, gfp_t gfp_mask);
194int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); 201int __init bl_init_pipefs(void);
195int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, 202void __exit bl_cleanup_pipefs(void);
196 struct xdr_stream *xdr,
197 const struct nfs4_layoutcommit_args *arg);
198void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
199 const struct nfs4_layoutcommit_args *arg,
200 int status);
201int bl_add_merge_extent(struct pnfs_block_layout *bl,
202 struct pnfs_block_extent *new);
203int bl_mark_for_commit(struct pnfs_block_extent *be,
204 sector_t offset, sector_t length,
205 struct pnfs_block_short_extent *new);
206int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
207struct pnfs_block_short_extent *
208bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
209void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
210 203
211#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ 204#endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c
deleted file mode 100644
index 04303b5c9361..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdev.c
+++ /dev/null
@@ -1,384 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdev.c
3 *
4 * Device operations for the pnfs nfs4 file layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32#include <linux/module.h>
33#include <linux/buffer_head.h> /* __bread */
34
35#include <linux/genhd.h>
36#include <linux/blkdev.h>
37#include <linux/hash.h>
38
39#include "blocklayout.h"
40
41#define NFSDBG_FACILITY NFSDBG_PNFS_LD
42
43static int decode_sector_number(__be32 **rp, sector_t *sp)
44{
45 uint64_t s;
46
47 *rp = xdr_decode_hyper(*rp, &s);
48 if (s & 0x1ff) {
49 printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
50 return -1;
51 }
52 *sp = s >> SECTOR_SHIFT;
53 return 0;
54}
55
56/*
57 * Release the block device
58 */
59void nfs4_blkdev_put(struct block_device *bdev)
60{
61 dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev),
62 MINOR(bdev->bd_dev));
63 blkdev_put(bdev, FMODE_READ);
64}
65
66ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
67 size_t mlen)
68{
69 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
70 nfs_net_id);
71
72 if (mlen != sizeof (struct bl_dev_msg))
73 return -EINVAL;
74
75 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
76 return -EFAULT;
77
78 wake_up(&nn->bl_wq);
79
80 return mlen;
81}
82
83void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
84{
85 struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg);
86
87 if (msg->errno >= 0)
88 return;
89 wake_up(bl_pipe_msg->bl_wq);
90}
91
92/*
93 * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
94 */
95struct pnfs_block_dev *
96nfs4_blk_decode_device(struct nfs_server *server,
97 struct pnfs_device *dev)
98{
99 struct pnfs_block_dev *rv;
100 struct block_device *bd = NULL;
101 struct bl_pipe_msg bl_pipe_msg;
102 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
103 struct bl_msg_hdr bl_msg = {
104 .type = BL_DEVICE_MOUNT,
105 .totallen = dev->mincount,
106 };
107 uint8_t *dataptr;
108 DECLARE_WAITQUEUE(wq, current);
109 int offset, len, i, rc;
110 struct net *net = server->nfs_client->cl_net;
111 struct nfs_net *nn = net_generic(net, nfs_net_id);
112 struct bl_dev_msg *reply = &nn->bl_mount_reply;
113
114 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
115 dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
116 dev->mincount);
117
118 bl_pipe_msg.bl_wq = &nn->bl_wq;
119 memset(msg, 0, sizeof(*msg));
120 msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS);
121 if (!msg->data) {
122 rv = ERR_PTR(-ENOMEM);
123 goto out;
124 }
125
126 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
127 dataptr = (uint8_t *) msg->data;
128 len = dev->mincount;
129 offset = sizeof(bl_msg);
130 for (i = 0; len > 0; i++) {
131 memcpy(&dataptr[offset], page_address(dev->pages[i]),
132 len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
133 len -= PAGE_CACHE_SIZE;
134 offset += PAGE_CACHE_SIZE;
135 }
136 msg->len = sizeof(bl_msg) + dev->mincount;
137
138 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
139 add_wait_queue(&nn->bl_wq, &wq);
140 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
141 if (rc < 0) {
142 remove_wait_queue(&nn->bl_wq, &wq);
143 rv = ERR_PTR(rc);
144 goto out;
145 }
146
147 set_current_state(TASK_UNINTERRUPTIBLE);
148 schedule();
149 __set_current_state(TASK_RUNNING);
150 remove_wait_queue(&nn->bl_wq, &wq);
151
152 if (reply->status != BL_DEVICE_REQUEST_PROC) {
153 dprintk("%s failed to open device: %d\n",
154 __func__, reply->status);
155 rv = ERR_PTR(-EINVAL);
156 goto out;
157 }
158
159 bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
160 FMODE_READ, NULL);
161 if (IS_ERR(bd)) {
162 dprintk("%s failed to open device : %ld\n", __func__,
163 PTR_ERR(bd));
164 rv = ERR_CAST(bd);
165 goto out;
166 }
167
168 rv = kzalloc(sizeof(*rv), GFP_NOFS);
169 if (!rv) {
170 rv = ERR_PTR(-ENOMEM);
171 goto out;
172 }
173
174 rv->bm_mdev = bd;
175 memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid));
176 rv->net = net;
177 dprintk("%s Created device %s with bd_block_size %u\n",
178 __func__,
179 bd->bd_disk->disk_name,
180 bd->bd_block_size);
181
182out:
183 kfree(msg->data);
184 return rv;
185}
186
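The function above is the canonical rpc_pipefs upcall pattern that the rest of the removed code shares: marshal a bl_msg_hdr plus payload into msg->data, queue it to the userspace daemon, and sleep until the daemon's downcall (bl_pipe_downcall, above) fills nn->bl_mount_reply and wakes the queue. Stripped to its skeleton:

	add_wait_queue(&nn->bl_wq, &wq);
	rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
	if (rc < 0) {
		remove_wait_queue(&nn->bl_wq, &wq);
		return rc;
	}
	set_current_state(TASK_UNINTERRUPTIBLE);
	schedule();			/* woken by bl_pipe_downcall() */
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&nn->bl_wq, &wq);

The replacement code keeps this mechanism but isolates it in rpc_pipefs.c behind bl_resolve_deviceid().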
187/* Map deviceid returned by the server to constructed block_device */
188static struct block_device *translate_devid(struct pnfs_layout_hdr *lo,
189 struct nfs4_deviceid *id)
190{
191 struct block_device *rv = NULL;
192 struct block_mount_id *mid;
193 struct pnfs_block_dev *dev;
194
195 dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id);
196 mid = BLK_ID(lo);
197 spin_lock(&mid->bm_lock);
198 list_for_each_entry(dev, &mid->bm_devlist, bm_node) {
199 if (memcmp(id->data, dev->bm_mdevid.data,
200 NFS4_DEVICEID4_SIZE) == 0) {
201 rv = dev->bm_mdev;
202 goto out;
203 }
204 }
205 out:
206 spin_unlock(&mid->bm_lock);
207 dprintk("%s returning %p\n", __func__, rv);
208 return rv;
209}
210
211/* Tracks info needed to ensure extents in layout obey constraints of spec */
212struct layout_verification {
213 u32 mode; /* R or RW */
214 u64 start; /* Expected start of next non-COW extent */
215 u64 inval; /* Start of INVAL coverage */
216 u64 cowread; /* End of COW read coverage */
217};
218
219/* Verify the extent meets the layout requirements of the pnfs-block draft,
220 * section 2.3.1.
221 */
222static int verify_extent(struct pnfs_block_extent *be,
223 struct layout_verification *lv)
224{
225 if (lv->mode == IOMODE_READ) {
226 if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
227 be->be_state == PNFS_BLOCK_INVALID_DATA)
228 return -EIO;
229 if (be->be_f_offset != lv->start)
230 return -EIO;
231 lv->start += be->be_length;
232 return 0;
233 }
234 /* lv->mode == IOMODE_RW */
235 if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
236 if (be->be_f_offset != lv->start)
237 return -EIO;
238 if (lv->cowread > lv->start)
239 return -EIO;
240 lv->start += be->be_length;
241 lv->inval = lv->start;
242 return 0;
243 } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
244 if (be->be_f_offset != lv->start)
245 return -EIO;
246 lv->start += be->be_length;
247 return 0;
248 } else if (be->be_state == PNFS_BLOCK_READ_DATA) {
249 if (be->be_f_offset > lv->start)
250 return -EIO;
251 if (be->be_f_offset < lv->inval)
252 return -EIO;
253 if (be->be_f_offset < lv->cowread)
254 return -EIO;
255 /* It looks like you might want to min this with lv->start,
256 * but you really don't.
257 */
258 lv->inval = lv->inval + be->be_length;
259 lv->cowread = be->be_f_offset + be->be_length;
260 return 0;
261 } else
262 return -EIO;
263}
264
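As a worked example, an RW layout for sectors [0,100) given as INVALID_DATA [0,50), READ_DATA [0,50) (the copy-on-write read source), then INVALID_DATA [50,100) verifies cleanly: the first extent advances lv->start to 50; the READ_DATA extent may begin at or before lv->start but not before lv->inval, and pushes lv->cowread to 50; the final extent advances lv->start to 100, which must both cover lv->cowread and match the granted range exactly (the two checks in the caller below).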
265/* XDR decode pnfs_block_layout4 structure */
266int
267nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo,
268 struct nfs4_layoutget_res *lgr, gfp_t gfp_flags)
269{
270 struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
271 int i, status = -EIO;
272 uint32_t count;
273 struct pnfs_block_extent *be = NULL, *save;
274 struct xdr_stream stream;
275 struct xdr_buf buf;
276 struct page *scratch;
277 __be32 *p;
278 struct layout_verification lv = {
279 .mode = lgr->range.iomode,
280 .start = lgr->range.offset >> SECTOR_SHIFT,
281 .inval = lgr->range.offset >> SECTOR_SHIFT,
282 .cowread = lgr->range.offset >> SECTOR_SHIFT,
283 };
284 LIST_HEAD(extents);
285
286 dprintk("---> %s\n", __func__);
287
288 scratch = alloc_page(gfp_flags);
289 if (!scratch)
290 return -ENOMEM;
291
292 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
293 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
294
295 p = xdr_inline_decode(&stream, 4);
296 if (unlikely(!p))
297 goto out_err;
298
299 count = be32_to_cpup(p++);
300
301 dprintk("%s enter, number of extents %i\n", __func__, count);
302 p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count);
303 if (unlikely(!p))
304 goto out_err;
305
306 /* Decode individual extents, putting them in temporary
307 * staging area until whole layout is decoded to make error
308 * recovery easier.
309 */
310 for (i = 0; i < count; i++) {
311 be = bl_alloc_extent();
312 if (!be) {
313 status = -ENOMEM;
314 goto out_err;
315 }
316 memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE);
317 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
318 be->be_mdev = translate_devid(lo, &be->be_devid);
319 if (!be->be_mdev)
320 goto out_err;
321
322 /* The next three values are read in as bytes,
323 * but stored as 512-byte sector lengths
324 */
325 if (decode_sector_number(&p, &be->be_f_offset) < 0)
326 goto out_err;
327 if (decode_sector_number(&p, &be->be_length) < 0)
328 goto out_err;
329 if (decode_sector_number(&p, &be->be_v_offset) < 0)
330 goto out_err;
331 be->be_state = be32_to_cpup(p++);
332 if (be->be_state == PNFS_BLOCK_INVALID_DATA)
333 be->be_inval = &bl->bl_inval;
334 if (verify_extent(be, &lv)) {
335 dprintk("%s verify failed\n", __func__);
336 goto out_err;
337 }
338 list_add_tail(&be->be_node, &extents);
339 }
340 if (lgr->range.offset + lgr->range.length !=
341 lv.start << SECTOR_SHIFT) {
342 dprintk("%s Final length mismatch\n", __func__);
343 be = NULL;
344 goto out_err;
345 }
346 if (lv.start < lv.cowread) {
347 dprintk("%s Final uncovered COW extent\n", __func__);
348 be = NULL;
349 goto out_err;
350 }
351 /* Extents decoded properly, now try to merge them in to
352 * existing layout extents.
353 */
354 spin_lock(&bl->bl_ext_lock);
355 list_for_each_entry_safe(be, save, &extents, be_node) {
356 list_del(&be->be_node);
357 status = bl_add_merge_extent(bl, be);
358 if (status) {
359 spin_unlock(&bl->bl_ext_lock);
360 /* This is a fairly catastrophic error, as the
361 * entire layout extent lists are now corrupted.
362 * We should have some way to distinguish this.
363 */
364 be = NULL;
365 goto out_err;
366 }
367 }
368 spin_unlock(&bl->bl_ext_lock);
369 status = 0;
370 out:
371 __free_page(scratch);
372 dprintk("%s returns %i\n", __func__, status);
373 return status;
374
375 out_err:
376 bl_put_extent(be);
377 while (!list_empty(&extents)) {
378 be = list_first_entry(&extents, struct pnfs_block_extent,
379 be_node);
380 list_del(&be->be_node);
381 bl_put_extent(be);
382 }
383 goto out;
384}
diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c
deleted file mode 100644
index 8999cfddd866..000000000000
--- a/fs/nfs/blocklayout/blocklayoutdm.c
+++ /dev/null
@@ -1,108 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayoutdm.c
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2007 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Fred Isaman <iisaman@umich.edu>
10 * Andy Adamson <andros@citi.umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include <linux/genhd.h> /* gendisk - used in a dprintk*/
34#include <linux/sched.h>
35#include <linux/hash.h>
36
37#include "blocklayout.h"
38
39#define NFSDBG_FACILITY NFSDBG_PNFS_LD
40
41static void dev_remove(struct net *net, dev_t dev)
42{
43 struct bl_pipe_msg bl_pipe_msg;
44 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
45 struct bl_dev_msg bl_umount_request;
46 struct bl_msg_hdr bl_msg = {
47 .type = BL_DEVICE_UMOUNT,
48 .totallen = sizeof(bl_umount_request),
49 };
50 uint8_t *dataptr;
51 DECLARE_WAITQUEUE(wq, current);
52 struct nfs_net *nn = net_generic(net, nfs_net_id);
53
54 dprintk("Entering %s\n", __func__);
55
56 bl_pipe_msg.bl_wq = &nn->bl_wq;
57 memset(msg, 0, sizeof(*msg));
58 msg->len = sizeof(bl_msg) + bl_msg.totallen;
59 msg->data = kzalloc(msg->len, GFP_NOFS);
60 if (!msg->data)
61 goto out;
62
63 memset(&bl_umount_request, 0, sizeof(bl_umount_request));
64 bl_umount_request.major = MAJOR(dev);
65 bl_umount_request.minor = MINOR(dev);
66
67 memcpy(msg->data, &bl_msg, sizeof(bl_msg));
68 dataptr = (uint8_t *) msg->data;
69 memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
70
71 add_wait_queue(&nn->bl_wq, &wq);
72 if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
73 remove_wait_queue(&nn->bl_wq, &wq);
74 goto out;
75 }
76
77 set_current_state(TASK_UNINTERRUPTIBLE);
78 schedule();
79 __set_current_state(TASK_RUNNING);
80 remove_wait_queue(&nn->bl_wq, &wq);
81
82out:
83 kfree(msg->data);
84}
85
86/*
87 * Release meta device
88 */
89static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev)
90{
91 dprintk("%s Releasing\n", __func__);
92 nfs4_blkdev_put(bdev->bm_mdev);
93 dev_remove(bdev->net, bdev->bm_mdev->bd_dev);
94}
95
96void bl_free_block_dev(struct pnfs_block_dev *bdev)
97{
98 if (bdev) {
99 if (bdev->bm_mdev) {
100 dprintk("%s Removing DM device: %d:%d\n",
101 __func__,
102 MAJOR(bdev->bm_mdev->bd_dev),
103 MINOR(bdev->bm_mdev->bd_dev));
104 nfs4_blk_metadev_release(bdev);
105 }
106 kfree(bdev);
107 }
108}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
new file mode 100644
index 000000000000..5aed4f98df41
--- /dev/null
+++ b/fs/nfs/blocklayout/dev.c
@@ -0,0 +1,363 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/blkdev.h>
6#include <linux/nfs4.h>
7#include <linux/nfs_fs.h>
8#include <linux/nfs_xdr.h>
9
10#include "blocklayout.h"
11
12#define NFSDBG_FACILITY NFSDBG_PNFS_LD
13
14static void
15bl_free_device(struct pnfs_block_dev *dev)
16{
17 if (dev->nr_children) {
18 int i;
19
20 for (i = 0; i < dev->nr_children; i++)
21 bl_free_device(&dev->children[i]);
22 kfree(dev->children);
23 } else {
24 if (dev->bdev)
25 blkdev_put(dev->bdev, FMODE_READ);
26 }
27}
28
29void
30bl_free_deviceid_node(struct nfs4_deviceid_node *d)
31{
32 struct pnfs_block_dev *dev =
33 container_of(d, struct pnfs_block_dev, node);
34
35 bl_free_device(dev);
36 kfree(dev);
37}
38
39static int
40nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
41{
42 __be32 *p;
43 int i;
44
45 p = xdr_inline_decode(xdr, 4);
46 if (!p)
47 return -EIO;
48 b->type = be32_to_cpup(p++);
49
50 switch (b->type) {
51 case PNFS_BLOCK_VOLUME_SIMPLE:
52 p = xdr_inline_decode(xdr, 4);
53 if (!p)
54 return -EIO;
55 b->simple.nr_sigs = be32_to_cpup(p++);
56 if (!b->simple.nr_sigs) {
57 dprintk("no signature\n");
58 return -EIO;
59 }
60
61 b->simple.len = 4 + 4;
62 for (i = 0; i < b->simple.nr_sigs; i++) {
63 p = xdr_inline_decode(xdr, 8 + 4);
64 if (!p)
65 return -EIO;
66 p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
67 b->simple.sigs[i].sig_len = be32_to_cpup(p++);
68
69 p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
70 if (!p)
71 return -EIO;
72 memcpy(&b->simple.sigs[i].sig, p,
73 b->simple.sigs[i].sig_len);
74
75 b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
76 }
77 break;
78 case PNFS_BLOCK_VOLUME_SLICE:
79 p = xdr_inline_decode(xdr, 8 + 8 + 4);
80 if (!p)
81 return -EIO;
82 p = xdr_decode_hyper(p, &b->slice.start);
83 p = xdr_decode_hyper(p, &b->slice.len);
84 b->slice.volume = be32_to_cpup(p++);
85 break;
86 case PNFS_BLOCK_VOLUME_CONCAT:
87 p = xdr_inline_decode(xdr, 4);
88 if (!p)
89 return -EIO;
90 b->concat.volumes_count = be32_to_cpup(p++);
91
92 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
93 if (!p)
94 return -EIO;
95 for (i = 0; i < b->concat.volumes_count; i++)
96 b->concat.volumes[i] = be32_to_cpup(p++);
97 break;
98 case PNFS_BLOCK_VOLUME_STRIPE:
99 p = xdr_inline_decode(xdr, 8 + 4);
100 if (!p)
101 return -EIO;
102 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
103 b->stripe.volumes_count = be32_to_cpup(p++);
104
105 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
106 if (!p)
107 return -EIO;
108 for (i = 0; i < b->stripe.volumes_count; i++)
109 b->stripe.volumes[i] = be32_to_cpup(p++);
110 break;
111 default:
112 dprintk("unknown volume type!\n");
113 return -EIO;
114 }
115
116 return 0;
117}
118
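The byte accounting in the simple case mirrors the wire format: a simple volume is a 4-byte type, a 4-byte signature count, then per signature an 8-byte offset, a 4-byte length, and sig_len opaque bytes, which is exactly what b->simple.len accumulates (4 + 4, plus 8 + 4 + sig_len per signature). The other variants are fixed-size apart from their u32 volume-index arrays, so no length bookkeeping is needed for them.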
119static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
120 struct pnfs_block_dev_map *map)
121{
122 map->start = dev->start;
123 map->len = dev->len;
124 map->disk_offset = dev->disk_offset;
125 map->bdev = dev->bdev;
126 return true;
127}
128
129static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
130 struct pnfs_block_dev_map *map)
131{
132 int i;
133
134 for (i = 0; i < dev->nr_children; i++) {
135 struct pnfs_block_dev *child = &dev->children[i];
136
137 if (child->start > offset ||
138 child->start + child->len <= offset)
139 continue;
140
141 child->map(child, offset - child->start, map);
142 return true;
143 }
144
145 dprintk("%s: ran off loop!\n", __func__);
146 return false;
147}
148
149static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
150 struct pnfs_block_dev_map *map)
151{
152 struct pnfs_block_dev *child;
153 u64 chunk;
154 u32 chunk_idx;
155 u64 disk_offset;
156
157 chunk = div_u64(offset, dev->chunk_size);
158 div_u64_rem(chunk, dev->nr_children, &chunk_idx);
159
 160	if (chunk_idx >= dev->nr_children) {
161 dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
162 __func__, chunk_idx, offset, dev->chunk_size);
163 /* error, should not happen */
164 return false;
165 }
166
167 /* truncate offset to the beginning of the stripe */
168 offset = chunk * dev->chunk_size;
169
170 /* disk offset of the stripe */
171 disk_offset = div_u64(offset, dev->nr_children);
172
173 child = &dev->children[chunk_idx];
174 child->map(child, disk_offset, map);
175
176 map->start += offset;
177 map->disk_offset += disk_offset;
178 map->len = dev->chunk_size;
179 return true;
180}
181
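A quick userspace check of the stripe arithmetic with hypothetical numbers (plain integer math, mirroring the div_u64/div_u64_rem sequence above):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t chunk_size = 8, nr_children = 3, offset = 35;

		uint64_t chunk = offset / chunk_size;			/* 4 */
		uint64_t chunk_idx = chunk % nr_children;		/* 1 */
		uint64_t stripe_start = chunk * chunk_size;		/* 32 */
		uint64_t disk_offset = stripe_start / nr_children;	/* 10 */

		assert(chunk_idx == 1 && disk_offset == 10);
		return 0;
	}

Offset 35 lands in chunk 4, which lives on child 1; on that child's disk the stripe begins at 32 / 3 = 10, since each child stores every third chunk.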
182static int
183bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
184 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
185
186
187static int
188bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
189 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
190{
191 struct pnfs_block_volume *v = &volumes[idx];
192 dev_t dev;
193
194 dev = bl_resolve_deviceid(server, v, gfp_mask);
195 if (!dev)
196 return -EIO;
197
198 d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
199 if (IS_ERR(d->bdev)) {
200 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
201 MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
202 return PTR_ERR(d->bdev);
203 }
204
205
206 d->len = i_size_read(d->bdev->bd_inode);
207 d->map = bl_map_simple;
208
209 printk(KERN_INFO "pNFS: using block device %s\n",
210 d->bdev->bd_disk->disk_name);
211 return 0;
212}
213
214static int
215bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
216 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
217{
218 struct pnfs_block_volume *v = &volumes[idx];
219 int ret;
220
221 ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
222 if (ret)
223 return ret;
224
225 d->disk_offset = v->slice.start;
226 d->len = v->slice.len;
227 return 0;
228}
229
230static int
231bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
232 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
233{
234 struct pnfs_block_volume *v = &volumes[idx];
235 u64 len = 0;
236 int ret, i;
237
238 d->children = kcalloc(v->concat.volumes_count,
239 sizeof(struct pnfs_block_dev), GFP_KERNEL);
240 if (!d->children)
241 return -ENOMEM;
242
243 for (i = 0; i < v->concat.volumes_count; i++) {
244 ret = bl_parse_deviceid(server, &d->children[i],
245 volumes, v->concat.volumes[i], gfp_mask);
246 if (ret)
247 return ret;
248
249 d->nr_children++;
250 d->children[i].start += len;
251 len += d->children[i].len;
252 }
253
254 d->len = len;
255 d->map = bl_map_concat;
256 return 0;
257}
258
259static int
260bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
261 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
262{
263 struct pnfs_block_volume *v = &volumes[idx];
264 u64 len = 0;
265 int ret, i;
266
267 d->children = kcalloc(v->stripe.volumes_count,
268 sizeof(struct pnfs_block_dev), GFP_KERNEL);
269 if (!d->children)
270 return -ENOMEM;
271
272 for (i = 0; i < v->stripe.volumes_count; i++) {
273 ret = bl_parse_deviceid(server, &d->children[i],
274 volumes, v->stripe.volumes[i], gfp_mask);
275 if (ret)
276 return ret;
277
278 d->nr_children++;
279 len += d->children[i].len;
280 }
281
282 d->len = len;
283 d->chunk_size = v->stripe.chunk_size;
284 d->map = bl_map_stripe;
285 return 0;
286}
287
288static int
289bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
290 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
291{
292 switch (volumes[idx].type) {
293 case PNFS_BLOCK_VOLUME_SIMPLE:
294 return bl_parse_simple(server, d, volumes, idx, gfp_mask);
295 case PNFS_BLOCK_VOLUME_SLICE:
296 return bl_parse_slice(server, d, volumes, idx, gfp_mask);
297 case PNFS_BLOCK_VOLUME_CONCAT:
298 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
299 case PNFS_BLOCK_VOLUME_STRIPE:
300 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
301 default:
302 dprintk("unsupported volume type: %d\n", volumes[idx].type);
303 return -EIO;
304 }
305}
306
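The dispatch recurses through bl_parse_slice/concat/stripe back into bl_parse_deviceid. Termination relies on the block layout protocol's layering rule that composite volumes reference only entries appearing earlier in the volume array, so every step moves to a smaller index and eventually reaches a simple volume; the decoder does not itself enforce this, so a malformed array could in principle recurse on a forward reference.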
307struct nfs4_deviceid_node *
308bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
309 gfp_t gfp_mask)
310{
311 struct nfs4_deviceid_node *node = NULL;
312 struct pnfs_block_volume *volumes;
313 struct pnfs_block_dev *top;
314 struct xdr_stream xdr;
315 struct xdr_buf buf;
316 struct page *scratch;
317 int nr_volumes, ret, i;
318 __be32 *p;
319
320 scratch = alloc_page(gfp_mask);
321 if (!scratch)
322 goto out;
323
324 xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
325 xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
326
327 p = xdr_inline_decode(&xdr, sizeof(__be32));
328 if (!p)
329 goto out_free_scratch;
330 nr_volumes = be32_to_cpup(p++);
331
332 volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
333 gfp_mask);
334 if (!volumes)
335 goto out_free_scratch;
336
337 for (i = 0; i < nr_volumes; i++) {
338 ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
339 if (ret < 0)
340 goto out_free_volumes;
341 }
342
343 top = kzalloc(sizeof(*top), gfp_mask);
344 if (!top)
345 goto out_free_volumes;
346
347 ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
348 if (ret) {
349 bl_free_device(top);
350 kfree(top);
351 goto out_free_volumes;
352 }
353
354 node = &top->node;
355 nfs4_init_deviceid_node(node, server, &pdev->dev_id);
356
357out_free_volumes:
358 kfree(volumes);
359out_free_scratch:
360 __free_page(scratch);
361out:
362 return node;
363}
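Two conventions are worth noting here: the root of the volume tree is the last entry of the decoded array (nr_volumes - 1), and every failure path returns NULL rather than an ERR_PTR, so callers only need a NULL check. The nfs4_deviceid_node embedded in the top-level pnfs_block_dev is what the generic pNFS deviceid cache holds on to; bl_free_deviceid_node() above tears the whole tree back down.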
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
new file mode 100644
index 000000000000..31d0b5e53dfd
--- /dev/null
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -0,0 +1,602 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4
5#include <linux/vmalloc.h>
6
7#include "blocklayout.h"
8
9#define NFSDBG_FACILITY NFSDBG_PNFS_LD
10
11static inline struct pnfs_block_extent *
12ext_node(struct rb_node *node)
13{
14 return rb_entry(node, struct pnfs_block_extent, be_node);
15}
16
17static struct pnfs_block_extent *
18ext_tree_first(struct rb_root *root)
19{
20 struct rb_node *node = rb_first(root);
21 return node ? ext_node(node) : NULL;
22}
23
24static struct pnfs_block_extent *
25ext_tree_prev(struct pnfs_block_extent *be)
26{
27 struct rb_node *node = rb_prev(&be->be_node);
28 return node ? ext_node(node) : NULL;
29}
30
31static struct pnfs_block_extent *
32ext_tree_next(struct pnfs_block_extent *be)
33{
34 struct rb_node *node = rb_next(&be->be_node);
35 return node ? ext_node(node) : NULL;
36}
37
38static inline sector_t
39ext_f_end(struct pnfs_block_extent *be)
40{
41 return be->be_f_offset + be->be_length;
42}
43
44static struct pnfs_block_extent *
45__ext_tree_search(struct rb_root *root, sector_t start)
46{
47 struct rb_node *node = root->rb_node;
48 struct pnfs_block_extent *be = NULL;
49
50 while (node) {
51 be = ext_node(node);
52 if (start < be->be_f_offset)
53 node = node->rb_left;
54 else if (start >= ext_f_end(be))
55 node = node->rb_right;
56 else
57 return be;
58 }
59
60 if (be) {
61 if (start < be->be_f_offset)
62 return be;
63
64 if (start >= ext_f_end(be))
65 return ext_tree_next(be);
66 }
67
68 return NULL;
69}
70
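__ext_tree_search() returns the leftmost extent whose end lies beyond start, whether or not that extent actually contains start, and NULL if no extent does. With extents [0,8) and [16,24) in the tree: a search for 4 returns [0,8), a search for 10 returns [16,24) (the first extent past the hole), and a search for 30 returns NULL. Callers therefore still have to compare be_f_offset against their range, as the loops below do.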
71static bool
72ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
73{
74 if (be1->be_state != be2->be_state)
75 return false;
76 if (be1->be_device != be2->be_device)
77 return false;
78
79 if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
80 return false;
81
82 if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
83 (be1->be_v_offset + be1->be_length != be2->be_v_offset))
84 return false;
85
86 if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
87 be1->be_tag != be2->be_tag)
88 return false;
89
90 return true;
91}
92
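Two extents merge only when they are contiguous in both file space and (except for holes) volume space, and agree on state and device, plus, for invalid extents, on tag; a merged extent is therefore always an exact equivalent of the pair it replaces.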
93static struct pnfs_block_extent *
94ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
95{
96 struct pnfs_block_extent *left = ext_tree_prev(be);
97
98 if (left && ext_can_merge(left, be)) {
99 left->be_length += be->be_length;
100 rb_erase(&be->be_node, root);
101 nfs4_put_deviceid_node(be->be_device);
102 kfree(be);
103 return left;
104 }
105
106 return be;
107}
108
109static struct pnfs_block_extent *
110ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
111{
112 struct pnfs_block_extent *right = ext_tree_next(be);
113
114 if (right && ext_can_merge(be, right)) {
115 be->be_length += right->be_length;
116 rb_erase(&right->be_node, root);
117 nfs4_put_deviceid_node(right->be_device);
118 kfree(right);
119 }
120
121 return be;
122}
123
124static void
125__ext_tree_insert(struct rb_root *root,
126 struct pnfs_block_extent *new, bool merge_ok)
127{
128 struct rb_node **p = &root->rb_node, *parent = NULL;
129 struct pnfs_block_extent *be;
130
131 while (*p) {
132 parent = *p;
133 be = ext_node(parent);
134
135 if (new->be_f_offset < be->be_f_offset) {
136 if (merge_ok && ext_can_merge(new, be)) {
137 be->be_f_offset = new->be_f_offset;
138 if (be->be_state != PNFS_BLOCK_NONE_DATA)
139 be->be_v_offset = new->be_v_offset;
140 be->be_length += new->be_length;
141 be = ext_try_to_merge_left(root, be);
142 goto free_new;
143 }
144 p = &(*p)->rb_left;
145 } else if (new->be_f_offset >= ext_f_end(be)) {
146 if (merge_ok && ext_can_merge(be, new)) {
147 be->be_length += new->be_length;
148 be = ext_try_to_merge_right(root, be);
149 goto free_new;
150 }
151 p = &(*p)->rb_right;
152 } else {
153 BUG();
154 }
155 }
156
157 rb_link_node(&new->be_node, parent, p);
158 rb_insert_color(&new->be_node, root);
159 return;
160free_new:
161 nfs4_put_deviceid_node(new->be_device);
162 kfree(new);
163}
164
165static int
166__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
167{
168 struct pnfs_block_extent *be;
169 sector_t len1 = 0, len2 = 0;
170 sector_t orig_v_offset;
171 sector_t orig_len;
172
173 be = __ext_tree_search(root, start);
174 if (!be)
175 return 0;
176 if (be->be_f_offset >= end)
177 return 0;
178
179 orig_v_offset = be->be_v_offset;
180 orig_len = be->be_length;
181
182 if (start > be->be_f_offset)
183 len1 = start - be->be_f_offset;
184 if (ext_f_end(be) > end)
185 len2 = ext_f_end(be) - end;
186
187 if (len2 > 0) {
188 if (len1 > 0) {
189 struct pnfs_block_extent *new;
190
191 new = kzalloc(sizeof(*new), GFP_ATOMIC);
192 if (!new)
193 return -ENOMEM;
194
195 be->be_length = len1;
196
197 new->be_f_offset = end;
198 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
199 new->be_v_offset =
200 orig_v_offset + orig_len - len2;
201 }
202 new->be_length = len2;
203 new->be_state = be->be_state;
204 new->be_tag = be->be_tag;
205 new->be_device = nfs4_get_deviceid(be->be_device);
206
207 __ext_tree_insert(root, new, true);
208 } else {
209 be->be_f_offset = end;
210 if (be->be_state != PNFS_BLOCK_NONE_DATA) {
211 be->be_v_offset =
212 orig_v_offset + orig_len - len2;
213 }
214 be->be_length = len2;
215 }
216 } else {
217 if (len1 > 0) {
218 be->be_length = len1;
219 be = ext_tree_next(be);
220 }
221
222 while (be && ext_f_end(be) <= end) {
223 struct pnfs_block_extent *next = ext_tree_next(be);
224
225 rb_erase(&be->be_node, root);
226 nfs4_put_deviceid_node(be->be_device);
227 kfree(be);
228 be = next;
229 }
230
231 if (be && be->be_f_offset < end) {
232 len1 = ext_f_end(be) - end;
233 be->be_f_offset = end;
234 if (be->be_state != PNFS_BLOCK_NONE_DATA)
235 be->be_v_offset += be->be_length - len1;
236 be->be_length = len1;
237 }
238 }
239
240 return 0;
241}
242
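As a worked example, removing [30,40) from a lone extent [0,100) takes the len1 > 0 && len2 > 0 branch: the existing extent is trimmed to [0,30) (len1 = 30), and a new extent [40,100) is inserted with be_v_offset advanced by orig_len - len2 = 40 sectors, keeping the file-to-volume mapping of the surviving tail intact.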
243int
244ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
245{
246 struct pnfs_block_extent *be;
247 struct rb_root *root;
248 int err = 0;
249
250 switch (new->be_state) {
251 case PNFS_BLOCK_READWRITE_DATA:
252 case PNFS_BLOCK_INVALID_DATA:
253 root = &bl->bl_ext_rw;
254 break;
255 case PNFS_BLOCK_READ_DATA:
256 case PNFS_BLOCK_NONE_DATA:
257 root = &bl->bl_ext_ro;
258 break;
259 default:
260 dprintk("invalid extent type\n");
261 return -EINVAL;
262 }
263
264 spin_lock(&bl->bl_ext_lock);
265retry:
266 be = __ext_tree_search(root, new->be_f_offset);
267 if (!be || be->be_f_offset >= ext_f_end(new)) {
268 __ext_tree_insert(root, new, true);
269 } else if (new->be_f_offset >= be->be_f_offset) {
270 if (ext_f_end(new) <= ext_f_end(be)) {
271 nfs4_put_deviceid_node(new->be_device);
272 kfree(new);
273 } else {
274 sector_t new_len = ext_f_end(new) - ext_f_end(be);
275 sector_t diff = new->be_length - new_len;
276
277 new->be_f_offset += diff;
278 new->be_v_offset += diff;
279 new->be_length = new_len;
280 goto retry;
281 }
282 } else if (ext_f_end(new) <= ext_f_end(be)) {
283 new->be_length = be->be_f_offset - new->be_f_offset;
284 __ext_tree_insert(root, new, true);
285 } else {
286 struct pnfs_block_extent *split;
287 sector_t new_len = ext_f_end(new) - ext_f_end(be);
288 sector_t diff = new->be_length - new_len;
289
290 split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
291 if (!split) {
 292			err = -ENOMEM;
293 goto out;
294 }
295
296 split->be_length = be->be_f_offset - split->be_f_offset;
297 split->be_device = nfs4_get_deviceid(new->be_device);
298 __ext_tree_insert(root, split, true);
299
300 new->be_f_offset += diff;
301 new->be_v_offset += diff;
302 new->be_length = new_len;
303 goto retry;
304 }
305out:
306 spin_unlock(&bl->bl_ext_lock);
307 return err;
308}
309
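Existing extents always win here: any part of the new extent that overlaps one already in the tree is dropped, and only the non-overlapping head and/or tail survive. Inserting [0,100) into a tree that holds [40,60), for instance, hits the final else branch: a duplicate trimmed to [0,40) is inserted, then new is advanced by 60 sectors (both be_f_offset and be_v_offset) and the retry loop inserts the remaining [60,100).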
310static bool
311__ext_tree_lookup(struct rb_root *root, sector_t isect,
312 struct pnfs_block_extent *ret)
313{
314 struct rb_node *node;
315 struct pnfs_block_extent *be;
316
317 node = root->rb_node;
318 while (node) {
319 be = ext_node(node);
320 if (isect < be->be_f_offset)
321 node = node->rb_left;
322 else if (isect >= ext_f_end(be))
323 node = node->rb_right;
324 else {
325 *ret = *be;
326 return true;
327 }
328 }
329
330 return false;
331}
332
333bool
334ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
335 struct pnfs_block_extent *ret, bool rw)
336{
337 bool found = false;
338
339 spin_lock(&bl->bl_ext_lock);
340 if (!rw)
341 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
342 if (!found)
343 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
344 spin_unlock(&bl->bl_ext_lock);
345
346 return found;
347}
348
349int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
350 sector_t start, sector_t end)
351{
352 int err, err2;
353
354 spin_lock(&bl->bl_ext_lock);
355 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
356 if (rw) {
357 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
358 if (!err)
359 err = err2;
360 }
361 spin_unlock(&bl->bl_ext_lock);
362
363 return err;
364}
365
366static int
367ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
368 sector_t split)
369{
370 struct pnfs_block_extent *new;
371 sector_t orig_len = be->be_length;
372
373 new = kzalloc(sizeof(*new), GFP_ATOMIC);
374 if (!new)
375 return -ENOMEM;
376
377 be->be_length = split - be->be_f_offset;
378
379 new->be_f_offset = split;
380 if (be->be_state != PNFS_BLOCK_NONE_DATA)
381 new->be_v_offset = be->be_v_offset + be->be_length;
382 new->be_length = orig_len - be->be_length;
383 new->be_state = be->be_state;
384 new->be_tag = be->be_tag;
385 new->be_device = nfs4_get_deviceid(be->be_device);
386
387 __ext_tree_insert(root, new, false);
388 return 0;
389}
390
391int
392ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
393 sector_t len)
394{
395 struct rb_root *root = &bl->bl_ext_rw;
396 sector_t end = start + len;
397 struct pnfs_block_extent *be;
398 int err = 0;
399
400 spin_lock(&bl->bl_ext_lock);
401 /*
 402	 * First remove all COW extents or holes from the written-to range.
403 */
404 err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
405 if (err)
406 goto out;
407
408 /*
409 * Then mark all invalid extents in the range as written to.
410 */
411 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
412 if (be->be_f_offset >= end)
413 break;
414
415 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
416 continue;
417
418 if (be->be_f_offset < start) {
419 struct pnfs_block_extent *left = ext_tree_prev(be);
420
421 if (left && ext_can_merge(left, be)) {
422 sector_t diff = start - be->be_f_offset;
423
424 left->be_length += diff;
425
426 be->be_f_offset += diff;
427 be->be_v_offset += diff;
428 be->be_length -= diff;
429 } else {
430 err = ext_tree_split(root, be, start);
431 if (err)
432 goto out;
433 }
434 }
435
436 if (ext_f_end(be) > end) {
437 struct pnfs_block_extent *right = ext_tree_next(be);
438
439 if (right && ext_can_merge(be, right)) {
440 sector_t diff = end - be->be_f_offset;
441
442 be->be_length -= diff;
443
444 right->be_f_offset -= diff;
445 right->be_v_offset -= diff;
446 right->be_length += diff;
447 } else {
448 err = ext_tree_split(root, be, end);
449 if (err)
450 goto out;
451 }
452 }
453
454 if (be->be_f_offset >= start && ext_f_end(be) <= end) {
455 be->be_tag = EXTENT_WRITTEN;
456 be = ext_try_to_merge_left(root, be);
457 be = ext_try_to_merge_right(root, be);
458 }
459 }
460out:
461 spin_unlock(&bl->bl_ext_lock);
462 return err;
463}
464
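In short, ext_tree_mark_written() first punches the range out of the read-only tree (dropping COW sources that are now stale), then walks the RW tree splitting any invalid extent that straddles start or end, so that only whole extents inside [start, end) get be_tag = EXTENT_WRITTEN; where possible the prev/next merge checks fold the boundary fragments into compatible neighbours instead of splitting.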
465static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
466 size_t buffer_size)
467{
468 if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
469 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;
470
471 for (i = 0; i < nr_pages; i++)
472 put_page(arg->layoutupdate_pages[i]);
473 kfree(arg->layoutupdate_pages);
474 } else {
475 put_page(arg->layoutupdate_page);
476 }
477}
478
479static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
480 size_t buffer_size, size_t *count)
481{
482 struct pnfs_block_extent *be;
483 int ret = 0;
484
485 spin_lock(&bl->bl_ext_lock);
486 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
487 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
488 be->be_tag != EXTENT_WRITTEN)
489 continue;
490
491 (*count)++;
492 if (*count * BL_EXTENT_SIZE > buffer_size) {
493 /* keep counting.. */
494 ret = -ENOSPC;
495 continue;
496 }
497
498 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
499 NFS4_DEVICEID4_SIZE);
500 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
501 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
502 p = xdr_encode_hyper(p, 0LL);
503 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
504
505 be->be_tag = EXTENT_COMMITTING;
506 }
507 spin_unlock(&bl->bl_ext_lock);
508
509 return ret;
510}
511
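The buffer arithmetic: each extent encodes as BL_EXTENT_SIZE = 7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE = 28 + 16 = 44 bytes, i.e. a 16-byte deviceid, three 8-byte hypers for file offset, length and storage offset, and a 4-byte state. On overflow the loop deliberately keeps counting instead of bailing out, so *count tells the caller exactly how large a buffer the retry needs.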
512int
513ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
514{
515 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
516 size_t count = 0, buffer_size = PAGE_SIZE;
517 __be32 *start_p;
518 int ret;
519
520 dprintk("%s enter\n", __func__);
521
522 arg->layoutupdate_page = alloc_page(GFP_NOFS);
523 if (!arg->layoutupdate_page)
524 return -ENOMEM;
525 start_p = page_address(arg->layoutupdate_page);
526 arg->layoutupdate_pages = &arg->layoutupdate_page;
527
528retry:
529 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
530 if (unlikely(ret)) {
531 ext_tree_free_commitdata(arg, buffer_size);
532
533 buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
534 count = 0;
535
536 arg->layoutupdate_pages =
537 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
538 sizeof(struct page *), GFP_NOFS);
539 if (!arg->layoutupdate_pages)
540 return -ENOMEM;
541
542 start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
543 if (!start_p) {
544 kfree(arg->layoutupdate_pages);
545 return -ENOMEM;
546 }
547
548 goto retry;
549 }
550
551 *start_p = cpu_to_be32(count);
552 arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
553
 554	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 555		void *p = start_p, *end = p + arg->layoutupdate_len;
 556		int i = 0;
 557
 558		/* walk the vmalloc'ed buffer in byte units, filling in
 559		 * one layoutupdate_pages slot per page */
 560		for ( ; p < end; p += PAGE_SIZE) {
 561			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
 562		}
 563	}
564
565 dprintk("%s found %zu ranges\n", __func__, count);
566 return 0;
567}
568
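This is a two-pass pattern: an optimistic first encode into a single page, then, if ext_tree_encode_commit() reported -ENOSPC, another pass into a vmalloc'ed buffer sized precisely from the count gathered during the failed pass (sizeof(__be32) + BL_EXTENT_SIZE * count). Because extents can keep changing under bl_ext_lock between passes, the retry loops until an encode finally fits.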
569void
570ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
571{
572 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
573 struct rb_root *root = &bl->bl_ext_rw;
574 struct pnfs_block_extent *be;
575
576 dprintk("%s status %d\n", __func__, status);
577
578 ext_tree_free_commitdata(arg, arg->layoutupdate_len);
579
580 spin_lock(&bl->bl_ext_lock);
581 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
582 if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
583 be->be_tag != EXTENT_COMMITTING)
584 continue;
585
586 if (status) {
587 /*
588 * Mark as written and try again.
589 *
590 * XXX: some real error handling here wouldn't hurt..
591 */
592 be->be_tag = EXTENT_WRITTEN;
593 } else {
594 be->be_state = PNFS_BLOCK_READWRITE_DATA;
595 be->be_tag = 0;
596 }
597
598 be = ext_try_to_merge_left(root, be);
599 be = ext_try_to_merge_right(root, be);
600 }
601 spin_unlock(&bl->bl_ext_lock);
602}
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
deleted file mode 100644
index 4d0161442565..000000000000
--- a/fs/nfs/blocklayout/extents.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
2 * linux/fs/nfs/blocklayout/blocklayout.h
3 *
4 * Module for the NFSv4.1 pNFS block layout driver.
5 *
6 * Copyright (c) 2006 The Regents of the University of Michigan.
7 * All rights reserved.
8 *
9 * Andy Adamson <andros@citi.umich.edu>
10 * Fred Isaman <iisaman@umich.edu>
11 *
12 * permission is granted to use, copy, create derivative works and
13 * redistribute this software and such derivative works for any purpose,
14 * so long as the name of the university of michigan is not used in
15 * any advertising or publicity pertaining to the use or distribution
16 * of this software without specific, written prior authorization. if
17 * the above copyright notice or any other identification of the
18 * university of michigan is included in any copy of any portion of
19 * this software, then the disclaimer below must also be included.
20 *
21 * this software is provided as is, without representation from the
22 * university of michigan as to its fitness for any purpose, and without
23 * warranty by the university of michigan of any kind, either express
24 * or implied, including without limitation the implied warranties of
25 * merchantability and fitness for a particular purpose. the regents
26 * of the university of michigan shall not be liable for any damages,
27 * including special, indirect, incidental, or consequential damages,
28 * with respect to any claim arising out or in connection with the use
29 * of the software, even if it has been or is hereafter advised of the
30 * possibility of such damages.
31 */
32
33#include "blocklayout.h"
34#define NFSDBG_FACILITY NFSDBG_PNFS_LD
35
36/* Bit numbers */
37#define EXTENT_INITIALIZED 0
38#define EXTENT_WRITTEN 1
39#define EXTENT_IN_COMMIT 2
40#define INTERNAL_EXISTS MY_MAX_TAGS
41#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1)
42
43/* Returns largest t<=s s.t. t%base==0 */
44static inline sector_t normalize(sector_t s, int base)
45{
46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - sector_div(tmp, base);
48}
49
50static inline sector_t normalize_up(sector_t s, int base)
51{
52 return normalize(s + base - 1, base);
53}
54
55/* Complete stub using list while determine API wanted */
56
57/* Returns tags, or negative */
58static int32_t _find_entry(struct my_tree *tree, u64 s)
59{
60 struct pnfs_inval_tracking *pos;
61
62 dprintk("%s(%llu) enter\n", __func__, s);
63 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
64 if (pos->it_sector > s)
65 continue;
66 else if (pos->it_sector == s)
67 return pos->it_tags & INTERNAL_MASK;
68 else
69 break;
70 }
71 return -ENOENT;
72}
73
74static inline
75int _has_tag(struct my_tree *tree, u64 s, int32_t tag)
76{
77 int32_t tags;
78
79 dprintk("%s(%llu, %i) enter\n", __func__, s, tag);
80 s = normalize(s, tree->mtt_step_size);
81 tags = _find_entry(tree, s);
82 if ((tags < 0) || !(tags & (1 << tag)))
83 return 0;
84 else
85 return 1;
86}
87
88/* Creates entry with tag, or if entry already exists, unions tag to it.
89 * If storage is not NULL, newly created entry will use it.
90 * Returns number of entries added, or negative on error.
91 */
92static int _add_entry(struct my_tree *tree, u64 s, int32_t tag,
93 struct pnfs_inval_tracking *storage)
94{
95 int found = 0;
96 struct pnfs_inval_tracking *pos;
97
98 dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage);
99 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
100 if (pos->it_sector > s)
101 continue;
102 else if (pos->it_sector == s) {
103 found = 1;
104 break;
105 } else
106 break;
107 }
108 if (found) {
109 pos->it_tags |= (1 << tag);
110 return 0;
111 } else {
112 struct pnfs_inval_tracking *new;
113 new = storage;
114 new->it_sector = s;
115 new->it_tags = (1 << tag);
116 list_add(&new->it_link, &pos->it_link);
117 return 1;
118 }
119}
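The reverse scan plus list_add() above is what keeps mtt_stub sorted: walking backwards stops at the last entry whose it_sector is below s, and list_add(), which inserts after the given node, therefore drops the new entry into its ascending position (after the list head itself if the list is empty). The same idea in miniature, as a user-space sketch of an intrusive ring list:

    struct node {
        unsigned long long sector;
        struct node *prev, *next;   /* circular, with a head sentinel */
    };

    /* insert n immediately after pos, as the kernel's list_add() does */
    static void list_add_after(struct node *n, struct node *pos)
    {
        n->prev = pos;
        n->next = pos->next;
        pos->next->prev = n;
        pos->next = n;
    }

    static void insert_sorted(struct node *head, struct node *n)
    {
        struct node *pos;

        /* walk backwards; stop at the last entry not above n->sector */
        for (pos = head->prev; pos != head; pos = pos->prev)
            if (pos->sector <= n->sector)
                break;
        list_add_after(n, pos);     /* pos is head if the list was empty */
    }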
120
121/* XXXX Really want option to not create */
122/* Over range, unions tag with existing entries, else creates entry with tag */
123static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length)
124{
125 u64 i;
126
127 dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length);
128 for (i = normalize(s, tree->mtt_step_size); i < s + length;
129 i += tree->mtt_step_size)
130 if (_add_entry(tree, i, tag, NULL))
131 return -ENOMEM;
132 return 0;
133}
134
135/* Ensure that future operations on given range of tree will not malloc */
136static int _preload_range(struct pnfs_inval_markings *marks,
137 u64 offset, u64 length)
138{
139 u64 start, end, s;
140 int count, i, used = 0, status = -ENOMEM;
141 struct pnfs_inval_tracking **storage;
142 struct my_tree *tree = &marks->im_tree;
143
144 dprintk("%s(%llu, %llu) enter\n", __func__, offset, length);
145 start = normalize(offset, tree->mtt_step_size);
146 end = normalize_up(offset + length, tree->mtt_step_size);
147 count = (int)(end - start) / (int)tree->mtt_step_size;
148
149 /* Pre-malloc what memory we might need */
150 storage = kcalloc(count, sizeof(*storage), GFP_NOFS);
151 if (!storage)
152 return -ENOMEM;
153 for (i = 0; i < count; i++) {
154 storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking),
155 GFP_NOFS);
156 if (!storage[i])
157 goto out_cleanup;
158 }
159
160 spin_lock_bh(&marks->im_lock);
161 for (s = start; s < end; s += tree->mtt_step_size)
162 used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
163 spin_unlock_bh(&marks->im_lock);
164
165 status = 0;
166
167 out_cleanup:
168 for (i = used; i < count; i++) {
169 if (!storage[i])
170 break;
171 kfree(storage[i]);
172 }
173 kfree(storage);
174 return status;
175}
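_preload_range() is a pattern worth spelling out: allocate the worst case with GFP_NOFS before taking the spinlock, hand entries over one at a time under the lock, then free whatever was not consumed. A stripped-down user-space sketch of that shape, where consume() is a hypothetical stand-in for _add_entry()'s 0-or-1 return:

    #include <stdlib.h>

    /* hypothetical stand-in for _add_entry(): keeps every other entry
     * (a real consumer takes ownership of what it keeps, as the tree
     * does above; this demo mirrors that transfer of ownership)
     */
    static int consume(void *storage)
    {
        static int flip;
        (void)storage;
        return flip ^= 1;
    }

    static int preload(int count)
    {
        void **storage = calloc(count, sizeof(*storage));
        int i, used = 0;

        if (!storage)
            return -1;
        for (i = 0; i < count; i++) {
            storage[i] = malloc(64);
            if (!storage[i])
                goto out_cleanup;   /* remaining slots stay NULL */
        }
        /* the lock would be taken here */
        for (i = 0; i < count; i++)
            used += consume(storage[used]);
        /* lock dropped; consumed entries are owned elsewhere now */
    out_cleanup:
        for (i = used; i < count; i++)
            free(storage[i]);       /* free(NULL) is a no-op */
        free(storage);
        return 0;
    }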
176
177/* We are relying on page lock to serialize this */
178int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
179{
180 int rv;
181
182 spin_lock_bh(&marks->im_lock);
183 rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
184 spin_unlock_bh(&marks->im_lock);
185 return rv;
186}
187
188/* Assume start, end already sector aligned */
189static int
190_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag)
191{
192 struct pnfs_inval_tracking *pos;
193 u64 expect = 0;
194
195 dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag);
196 list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) {
197 if (pos->it_sector >= end)
198 continue;
199 if (!expect) {
200 if ((pos->it_sector == end - tree->mtt_step_size) &&
201 (pos->it_tags & (1 << tag))) {
202 expect = pos->it_sector - tree->mtt_step_size;
203 if (pos->it_sector < tree->mtt_step_size || expect < start)
204 return 1;
205 continue;
206 } else {
207 return 0;
208 }
209 }
210 if (pos->it_sector != expect || !(pos->it_tags & (1 << tag)))
211 return 0;
212 expect -= tree->mtt_step_size;
213 if (expect < start)
214 return 1;
215 }
216 return 0;
217}
218
219static int is_range_written(struct pnfs_inval_markings *marks,
220 sector_t start, sector_t end)
221{
222 int rv;
223
224 spin_lock_bh(&marks->im_lock);
225 rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
226 spin_unlock_bh(&marks->im_lock);
227 return rv;
228}
229
230/* Marks sectors in [offset, offset+length) as having been initialized.
231 * All lengths are step-aligned, where step is min(pagesize, blocksize).
232 * Currently assumes offset is page-aligned
233 */
234int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
235 sector_t offset, sector_t length)
236{
237 sector_t start, end;
238
239 dprintk("%s(offset=%llu,len=%llu) enter\n",
240 __func__, (u64)offset, (u64)length);
241
242 start = normalize(offset, marks->im_block_size);
243 end = normalize_up(offset + length, marks->im_block_size);
244 if (_preload_range(marks, start, end - start))
245 goto outerr;
246
247 spin_lock_bh(&marks->im_lock);
248 if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
249 goto out_unlock;
250 spin_unlock_bh(&marks->im_lock);
251
252 return 0;
253
254out_unlock:
255 spin_unlock_bh(&marks->im_lock);
256outerr:
257 return -ENOMEM;
258}
259
260/* Marks sectors in [offset, offset+length) as having been written to disk.
261 * All lengths should be block aligned.
262 */
263static int mark_written_sectors(struct pnfs_inval_markings *marks,
264 sector_t offset, sector_t length)
265{
266 int status;
267
268 dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
269 (u64)offset, (u64)length);
270 spin_lock_bh(&marks->im_lock);
271 status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
272 spin_unlock_bh(&marks->im_lock);
273 return status;
274}
275
276static void print_short_extent(struct pnfs_block_short_extent *be)
277{
278 dprintk("PRINT SHORT EXTENT extent %p\n", be);
279 if (be) {
280 dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset);
281 dprintk(" be_length %llu\n", (u64)be->bse_length);
282 }
283}
284
285static void print_clist(struct list_head *list, unsigned int count)
286{
287 struct pnfs_block_short_extent *be;
288 unsigned int i = 0;
289
290 ifdebug(FACILITY) {
291 printk(KERN_DEBUG "****************\n");
292 printk(KERN_DEBUG "Extent list looks like:\n");
293 list_for_each_entry(be, list, bse_node) {
294 i++;
295 print_short_extent(be);
296 }
297 if (i != count)
298 printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count);
299 printk(KERN_DEBUG "****************\n");
300 }
301}
302
303/* Note: In theory, we should do more checking that devids match between
304 * old and new, but if they don't, the lists are too corrupt to salvage anyway.
305 */
306/* Note this is very similar to bl_add_merge_extent */
307static void add_to_commitlist(struct pnfs_block_layout *bl,
308 struct pnfs_block_short_extent *new)
309{
310 struct list_head *clist = &bl->bl_commit;
311 struct pnfs_block_short_extent *old, *save;
312 sector_t end = new->bse_f_offset + new->bse_length;
313
314 dprintk("%s enter\n", __func__);
315 print_short_extent(new);
316 print_clist(clist, bl->bl_count);
317 bl->bl_count++;
318 /* Scan for proper place to insert, extending new to the left
319 * as much as possible.
320 */
321 list_for_each_entry_safe(old, save, clist, bse_node) {
322 if (new->bse_f_offset < old->bse_f_offset)
323 break;
324 if (end <= old->bse_f_offset + old->bse_length) {
325 /* Range is already in list */
326 bl->bl_count--;
327 kfree(new);
328 return;
329 } else if (new->bse_f_offset <=
330 old->bse_f_offset + old->bse_length) {
331 /* new overlaps or abuts existing be */
332 if (new->bse_mdev == old->bse_mdev) {
333 /* extend new to fully replace old */
334 new->bse_length += new->bse_f_offset -
335 old->bse_f_offset;
336 new->bse_f_offset = old->bse_f_offset;
337 list_del(&old->bse_node);
338 bl->bl_count--;
339 kfree(old);
340 }
341 }
342 }
343 /* Note that if we never hit the above break, old will not point to a
344 * valid extent. However, in that case &old->bse_node==list.
345 */
346 list_add_tail(&new->bse_node, &old->bse_node);
347 /* Scan forward for overlaps. If we find any, extend new and
348 * remove the overlapped extent.
349 */
350 old = list_prepare_entry(new, clist, bse_node);
351 list_for_each_entry_safe_continue(old, save, clist, bse_node) {
352 if (end < old->bse_f_offset)
353 break;
354 /* new overlaps or abuts old */
355 if (new->bse_mdev == old->bse_mdev) {
356 if (end < old->bse_f_offset + old->bse_length) {
357 /* extend new to fully cover old */
358 end = old->bse_f_offset + old->bse_length;
359 new->bse_length = end - new->bse_f_offset;
360 }
361 list_del(&old->bse_node);
362 bl->bl_count--;
363 kfree(old);
364 }
365 }
366 dprintk("%s: after merging\n", __func__);
367 print_clist(clist, bl->bl_count);
368}
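The first scan's extend-left step is the subtle part: when new overlaps or abuts an earlier extent on the same device, new grows backwards to old's start before old is deleted. A worked example of just that arithmetic (illustration only):

    #include <assert.h>

    int main(void)
    {
        /* old covers sectors [100, 108); new covers [104, 112) */
        unsigned long long old_off = 100;
        unsigned long long new_off = 104, new_len = 8;

        /* the two statements from add_to_commitlist() */
        new_len += new_off - old_off;   /* 8 + 4 = 12 */
        new_off = old_off;              /* new now covers [100, 112) */

        assert(new_off == 100 && new_len == 12);
        return 0;
    }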
369
370/* Note the range described by offset, length is guaranteed to be contained
371 * within be.
372 * new will be freed, either by this function or add_to_commitlist if they
373 * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
374 */
375int bl_mark_for_commit(struct pnfs_block_extent *be,
376 sector_t offset, sector_t length,
377 struct pnfs_block_short_extent *new)
378{
379 sector_t new_end, end = offset + length;
380 struct pnfs_block_layout *bl = container_of(be->be_inval,
381 struct pnfs_block_layout,
382 bl_inval);
383
384 mark_written_sectors(be->be_inval, offset, length);
385 /* We want to add the range to commit list, but it must be
386 * block-normalized, and verified that the normalized range has
387 * been entirely written to disk.
388 */
389 new->bse_f_offset = offset;
390 offset = normalize(offset, bl->bl_blocksize);
391 if (offset < new->bse_f_offset) {
392 if (is_range_written(be->be_inval, offset, new->bse_f_offset))
393 new->bse_f_offset = offset;
394 else
395 new->bse_f_offset = offset + bl->bl_blocksize;
396 }
397 new_end = normalize_up(end, bl->bl_blocksize);
398 if (end < new_end) {
399 if (is_range_written(be->be_inval, end, new_end))
400 end = new_end;
401 else
402 end = new_end - bl->bl_blocksize;
403 }
404 if (end <= new->bse_f_offset) {
405 kfree(new);
406 return 0;
407 }
408 new->bse_length = end - new->bse_f_offset;
409 new->bse_devid = be->be_devid;
410 new->bse_mdev = be->be_mdev;
411
412 spin_lock(&bl->bl_ext_lock);
413 add_to_commitlist(bl, new);
414 spin_unlock(&bl->bl_ext_lock);
415 return 0;
416}
417
418static void print_bl_extent(struct pnfs_block_extent *be)
419{
420 dprintk("PRINT EXTENT extent %p\n", be);
421 if (be) {
422 dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset);
423 dprintk(" be_length %llu\n", (u64)be->be_length);
424 dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset);
425 dprintk(" be_state %d\n", be->be_state);
426 }
427}
428
429static void
430destroy_extent(struct kref *kref)
431{
432 struct pnfs_block_extent *be;
433
434 be = container_of(kref, struct pnfs_block_extent, be_refcnt);
435 dprintk("%s be=%p\n", __func__, be);
436 kfree(be);
437}
438
439void
440bl_put_extent(struct pnfs_block_extent *be)
441{
442 if (be) {
443 dprintk("%s enter %p (%i)\n", __func__, be,
444 atomic_read(&be->be_refcnt.refcount));
445 kref_put(&be->be_refcnt, destroy_extent);
446 }
447}
448
449struct pnfs_block_extent *bl_alloc_extent(void)
450{
451 struct pnfs_block_extent *be;
452
453 be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS);
454 if (!be)
455 return NULL;
456 INIT_LIST_HEAD(&be->be_node);
457 kref_init(&be->be_refcnt);
458 be->be_inval = NULL;
459 return be;
460}
461
462static void print_elist(struct list_head *list)
463{
464 struct pnfs_block_extent *be;
465 dprintk("****************\n");
466 dprintk("Extent list looks like:\n");
467 list_for_each_entry(be, list, be_node) {
468 print_bl_extent(be);
469 }
470 dprintk("****************\n");
471}
472
473static inline int
474extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new)
475{
476 /* Note this assumes new->be_f_offset >= old->be_f_offset */
477 return (new->be_state == old->be_state) &&
478 ((new->be_state == PNFS_BLOCK_NONE_DATA) ||
479 ((new->be_v_offset - old->be_v_offset ==
480 new->be_f_offset - old->be_f_offset) &&
481 new->be_mdev == old->be_mdev));
482}
483
484/* Adds new to appropriate list in bl, modifying new and removing existing
485 * extents as appropriate to deal with overlaps.
486 *
487 * See bl_find_get_extent for list constraints.
488 *
489 * Refcount on new is already set. If we end up not using it, or we error
490 * out, we need to put the reference.
491 *
492 * bl->bl_ext_lock is held by caller.
493 */
494int
495bl_add_merge_extent(struct pnfs_block_layout *bl,
496 struct pnfs_block_extent *new)
497{
498 struct pnfs_block_extent *be, *tmp;
499 sector_t end = new->be_f_offset + new->be_length;
500 struct list_head *list;
501
502 dprintk("%s enter with be=%p\n", __func__, new);
503 print_bl_extent(new);
504 list = &bl->bl_extents[bl_choose_list(new->be_state)];
505 print_elist(list);
506
507 /* Scan for proper place to insert, extending new to the left
508 * as much as possible.
509 */
510 list_for_each_entry_safe_reverse(be, tmp, list, be_node) {
511 if (new->be_f_offset >= be->be_f_offset + be->be_length)
512 break;
513 if (new->be_f_offset >= be->be_f_offset) {
514 if (end <= be->be_f_offset + be->be_length) {
515 /* new is a subset of existing be */
516 if (extents_consistent(be, new)) {
517 dprintk("%s: new is subset, ignoring\n",
518 __func__);
519 bl_put_extent(new);
520 return 0;
521 } else {
522 goto out_err;
523 }
524 } else {
525 /* |<-- be -->|
526 * |<-- new -->| */
527 if (extents_consistent(be, new)) {
528 /* extend new to fully replace be */
529 new->be_length += new->be_f_offset -
530 be->be_f_offset;
531 new->be_f_offset = be->be_f_offset;
532 new->be_v_offset = be->be_v_offset;
533 dprintk("%s: removing %p\n", __func__, be);
534 list_del(&be->be_node);
535 bl_put_extent(be);
536 } else {
537 goto out_err;
538 }
539 }
540 } else if (end >= be->be_f_offset + be->be_length) {
541 /* new extent overlaps existing be */
542 if (extents_consistent(be, new)) {
543 /* extend new to fully replace be */
544 dprintk("%s: removing %p\n", __func__, be);
545 list_del(&be->be_node);
546 bl_put_extent(be);
547 } else {
548 goto out_err;
549 }
550 } else if (end > be->be_f_offset) {
551 /* |<-- be -->|
552 *|<-- new -->| */
553 if (extents_consistent(new, be)) {
554 /* extend new to fully replace be */
555 new->be_length += be->be_f_offset + be->be_length -
556 new->be_f_offset - new->be_length;
557 dprintk("%s: removing %p\n", __func__, be);
558 list_del(&be->be_node);
559 bl_put_extent(be);
560 } else {
561 goto out_err;
562 }
563 }
564 }
565 /* Note that if we never hit the above break, be will not point to a
566 * valid extent. However, in that case &be->be_node==list.
567 */
568 list_add(&new->be_node, &be->be_node);
569 dprintk("%s: inserting new\n", __func__);
570 print_elist(list);
571 /* FIXME - The per-list consistency checks have all been done,
572 * should now check cross-list consistency.
573 */
574 return 0;
575
576 out_err:
577 bl_put_extent(new);
578 return -EIO;
579}
580
581/* Returns extent, or NULL. If a second READ extent exists, it is returned
582 * in cow_read, if given.
583 *
584 * The extents are kept in two separate ordered lists, one for READ and NONE,
585 * one for READWRITE and INVALID. Within each list, we assume:
586 * 1. Extents are ordered by file offset.
587 * 2. For any given isect, there is at most one extent that matches.
588 */
589struct pnfs_block_extent *
590bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect,
591 struct pnfs_block_extent **cow_read)
592{
593 struct pnfs_block_extent *be, *cow, *ret;
594 int i;
595
596 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
597 cow = ret = NULL;
598 spin_lock(&bl->bl_ext_lock);
599 for (i = 0; i < EXTENT_LISTS; i++) {
600 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
601 if (isect >= be->be_f_offset + be->be_length)
602 break;
603 if (isect >= be->be_f_offset) {
604 /* We have found an extent */
605 dprintk("%s Get %p (%i)\n", __func__, be,
606 atomic_read(&be->be_refcnt.refcount));
607 kref_get(&be->be_refcnt);
608 if (!ret)
609 ret = be;
610 else if (be->be_state != PNFS_BLOCK_READ_DATA)
611 bl_put_extent(be);
612 else
613 cow = be;
614 break;
615 }
616 }
617 if (ret &&
618 (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA))
619 break;
620 }
621 spin_unlock(&bl->bl_ext_lock);
622 if (cow_read)
623 *cow_read = cow;
624 print_bl_extent(ret);
625 return ret;
626}
627
628/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */
629static struct pnfs_block_extent *
630bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect)
631{
632 struct pnfs_block_extent *be, *ret = NULL;
633 int i;
634
635 dprintk("%s enter with isect %llu\n", __func__, (u64)isect);
636 for (i = 0; i < EXTENT_LISTS; i++) {
637 if (ret)
638 break;
639 list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) {
640 if (isect >= be->be_f_offset + be->be_length)
641 break;
642 if (isect >= be->be_f_offset) {
643 /* We have found an extent */
644 dprintk("%s Get %p (%i)\n", __func__, be,
645 atomic_read(&be->be_refcnt.refcount));
646 kref_get(&be->be_refcnt);
647 ret = be;
648 break;
649 }
650 }
651 }
652 print_bl_extent(ret);
653 return ret;
654}
655
656int
657encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
658 struct xdr_stream *xdr,
659 const struct nfs4_layoutcommit_args *arg)
660{
661 struct pnfs_block_short_extent *lce, *save;
662 unsigned int count = 0;
663 __be32 *p, *xdr_start;
664
665 dprintk("%s enter\n", __func__);
666 /* BUG - creation of bl_commit is buggy - need to wait for
667 * entire block to be marked WRITTEN before it can be added.
668 */
669 spin_lock(&bl->bl_ext_lock);
670 /* Want to adjust for possible truncate */
671 /* We now want to adjust argument range */
672
673 /* XDR encode the ranges found */
674 xdr_start = xdr_reserve_space(xdr, 8);
675 if (!xdr_start)
676 goto out;
677 list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) {
678 p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data));
679 if (!p)
680 break;
681 p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE);
682 p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT);
683 p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT);
684 p = xdr_encode_hyper(p, 0LL);
685 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
686 list_move_tail(&lce->bse_node, &bl->bl_committing);
687 bl->bl_count--;
688 count++;
689 }
690 xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4);
691 xdr_start[1] = cpu_to_be32(count);
692out:
693 spin_unlock(&bl->bl_ext_lock);
694 dprintk("%s found %i ranges\n", __func__, count);
695 return 0;
696}
697
698/* Helper function to set_to_rw that initializes a new extent */
699static void
700_prep_new_extent(struct pnfs_block_extent *new,
701 struct pnfs_block_extent *orig,
702 sector_t offset, sector_t length, int state)
703{
704 kref_init(&new->be_refcnt);
705 /* don't need to INIT_LIST_HEAD(&new->be_node) */
706 memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid));
707 new->be_mdev = orig->be_mdev;
708 new->be_f_offset = offset;
709 new->be_length = length;
710 new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset;
711 new->be_state = state;
712 new->be_inval = orig->be_inval;
713}
714
715/* Tries to merge be with extent in front of it in list.
716 * Frees storage if not used.
717 */
718static struct pnfs_block_extent *
719_front_merge(struct pnfs_block_extent *be, struct list_head *head,
720 struct pnfs_block_extent *storage)
721{
722 struct pnfs_block_extent *prev;
723
724 if (!storage)
725 goto no_merge;
726 if (&be->be_node == head || be->be_node.prev == head)
727 goto no_merge;
728 prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node);
729 if ((prev->be_f_offset + prev->be_length != be->be_f_offset) ||
730 !extents_consistent(prev, be))
731 goto no_merge;
732 _prep_new_extent(storage, prev, prev->be_f_offset,
733 prev->be_length + be->be_length, prev->be_state);
734 list_replace(&prev->be_node, &storage->be_node);
735 bl_put_extent(prev);
736 list_del(&be->be_node);
737 bl_put_extent(be);
738 return storage;
739
740 no_merge:
741 kfree(storage);
742 return be;
743}
744
745static u64
746set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length)
747{
748 u64 rv = offset + length;
749 struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old;
750 struct pnfs_block_extent *children[3];
751 struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL;
752 int i = 0, j;
753
754 dprintk("%s(%llu, %llu)\n", __func__, offset, length);
755 /* Create storage for up to three new extents e1, e2, e3 */
756 e1 = kmalloc(sizeof(*e1), GFP_ATOMIC);
757 e2 = kmalloc(sizeof(*e2), GFP_ATOMIC);
758 e3 = kmalloc(sizeof(*e3), GFP_ATOMIC);
759 /* BUG - we are ignoring any failure */
760 if (!e1 || !e2 || !e3)
761 goto out_nosplit;
762
763 spin_lock(&bl->bl_ext_lock);
764 be = bl_find_get_extent_locked(bl, offset);
765 rv = be->be_f_offset + be->be_length;
766 if (be->be_state != PNFS_BLOCK_INVALID_DATA) {
767 spin_unlock(&bl->bl_ext_lock);
768 goto out_nosplit;
769 }
770 /* Add e* to children, bumping e*'s krefs */
771 if (be->be_f_offset != offset) {
772 _prep_new_extent(e1, be, be->be_f_offset,
773 offset - be->be_f_offset,
774 PNFS_BLOCK_INVALID_DATA);
775 children[i++] = e1;
776 print_bl_extent(e1);
777 } else
778 merge1 = e1;
779 _prep_new_extent(e2, be, offset,
780 min(length, be->be_f_offset + be->be_length - offset),
781 PNFS_BLOCK_READWRITE_DATA);
782 children[i++] = e2;
783 print_bl_extent(e2);
784 if (offset + length < be->be_f_offset + be->be_length) {
785 _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length,
786 be->be_f_offset + be->be_length -
787 offset - length,
788 PNFS_BLOCK_INVALID_DATA);
789 children[i++] = e3;
790 print_bl_extent(e3);
791 } else
792 merge2 = e3;
793
794 /* Remove be from list, and insert the e* */
795 /* We don't get refs on e*, since this list is the base reference
796 * set when init'ed.
797 */
798 if (i < 3)
799 children[i] = NULL;
800 new = children[0];
801 list_replace(&be->be_node, &new->be_node);
802 bl_put_extent(be);
803 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1);
804 for (j = 1; j < i; j++) {
805 old = new;
806 new = children[j];
807 list_add(&new->be_node, &old->be_node);
808 }
809 if (merge2) {
810 /* This is a HACK, should just create a _back_merge function */
811 new = list_entry(new->be_node.next,
812 struct pnfs_block_extent, be_node);
813 new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2);
814 }
815 spin_unlock(&bl->bl_ext_lock);
816
817 /* Since we removed the base reference above, be is now scheduled for
818 * destruction.
819 */
820 bl_put_extent(be);
821 dprintk("%s returns %llu after split\n", __func__, rv);
822 return rv;
823
824 out_nosplit:
825 kfree(e1);
826 kfree(e2);
827 kfree(e3);
828 dprintk("%s returns %llu without splitting\n", __func__, rv);
829 return rv;
830}
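set_to_rw() carves a READWRITE window out of a single INVALID extent, so a split produces at most three pieces. With be covering sectors [0, 100) and a request of offset 10, length 20, the pieces would be (offsets only, as a sketch):

    /* e1: INVALID head  [be->be_f_offset, offset)   -> [0, 10)
     * e2: READWRITE     [offset, offset + length)   -> [10, 30)
     * e3: INVALID tail  [offset + length, be end)   -> [30, 100)
     *
     * When offset == be->be_f_offset there is no head and e1 is reused
     * as merge1 storage; when the window reaches the end of be there is
     * no tail and e3 becomes merge2 storage for _front_merge().
     */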
831
832void
833clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
834 const struct nfs4_layoutcommit_args *arg,
835 int status)
836{
837 struct pnfs_block_short_extent *lce, *save;
838
839 dprintk("%s status %d\n", __func__, status);
840 list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) {
841 if (likely(!status)) {
842 u64 offset = lce->bse_f_offset;
843 u64 end = offset + lce->bse_length;
844
845 do {
846 offset = set_to_rw(bl, offset, end - offset);
847 } while (offset < end);
848 list_del(&lce->bse_node);
849
850 kfree(lce);
851 } else {
852 list_del(&lce->bse_node);
853 spin_lock(&bl->bl_ext_lock);
854 add_to_commitlist(bl, lce);
855 spin_unlock(&bl->bl_ext_lock);
856 }
857 }
858}
859
860int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
861{
862 struct pnfs_block_short_extent *new;
863
864 new = kmalloc(sizeof(*new), GFP_NOFS);
865 if (unlikely(!new))
866 return -ENOMEM;
867
868 spin_lock_bh(&marks->im_lock);
869 list_add(&new->bse_node, &marks->im_extents);
870 spin_unlock_bh(&marks->im_lock);
871
872 return 0;
873}
874
875struct pnfs_block_short_extent *
876bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
877{
878 struct pnfs_block_short_extent *rv = NULL;
879
880 spin_lock_bh(&marks->im_lock);
881 if (!list_empty(&marks->im_extents)) {
882 rv = list_entry((&marks->im_extents)->next,
883 struct pnfs_block_short_extent, bse_node);
884 list_del_init(&rv->bse_node);
885 }
886 spin_unlock_bh(&marks->im_lock);
887
888 return rv;
889}
890
891void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
892{
893 struct pnfs_block_short_extent *se = NULL, *tmp;
894
895 if (num_to_free <= 0)
896 return;
897
898 spin_lock(&marks->im_lock);
899 list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
900 list_del(&se->bse_node);
901 kfree(se);
902 if (--num_to_free == 0)
903 break;
904 }
905 spin_unlock(&marks->im_lock);
906
907 BUG_ON(num_to_free > 0);
908}
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
new file mode 100644
index 000000000000..e966c023b1b7
--- /dev/null
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -0,0 +1,284 @@
1/*
2 * Copyright (c) 2006,2007 The Regents of the University of Michigan.
3 * All rights reserved.
4 *
5 * Andy Adamson <andros@citi.umich.edu>
6 * Fred Isaman <iisaman@umich.edu>
7 *
8 * permission is granted to use, copy, create derivative works and
9 * redistribute this software and such derivative works for any purpose,
10 * so long as the name of the university of michigan is not used in
11 * any advertising or publicity pertaining to the use or distribution
12 * of this software without specific, written prior authorization. if
13 * the above copyright notice or any other identification of the
14 * university of michigan is included in any copy of any portion of
15 * this software, then the disclaimer below must also be included.
16 *
17 * this software is provided as is, without representation from the
18 * university of michigan as to its fitness for any purpose, and without
19 * warranty by the university of michigan of any kind, either express
20 * or implied, including without limitation the implied warranties of
21 * merchantability and fitness for a particular purpose. the regents
22 * of the university of michigan shall not be liable for any damages,
23 * including special, indirect, incidental, or consequential damages,
24 * with respect to any claim arising out or in connection with the use
25 * of the software, even if it has been or is hereafter advised of the
26 * possibility of such damages.
27 */
28
29#include <linux/module.h>
30#include <linux/genhd.h>
31#include <linux/blkdev.h>
32
33#include "blocklayout.h"
34
35#define NFSDBG_FACILITY NFSDBG_PNFS_LD
36
37static void
38nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
39{
40 int i;
41
42 *p++ = cpu_to_be32(1);
43 *p++ = cpu_to_be32(b->type);
44 *p++ = cpu_to_be32(b->simple.nr_sigs);
45 for (i = 0; i < b->simple.nr_sigs; i++) {
46 p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
47 p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
48 b->simple.sigs[i].sig_len);
49 }
50}
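For a sense of the wire format: with one volume carrying a single 8-byte signature at offset 0, the XDR words emitted by nfs4_encode_simple() would lay out roughly as below (an illustration; xdr_encode_opaque() writes a length word and pads the bytes to a 4-byte boundary):

    /* word 0      1            number of volumes
     * word 1      b->type      volume type (simple)
     * word 2      1            simple.nr_sigs
     * words 3-4   0            sigs[0].offset, as a 64-bit hyper
     * word 5      8            sigs[0].sig_len
     * words 6-7   sig bytes    sigs[0].sig, zero-padded to a word multiple
     */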
51
52dev_t
53bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
54 gfp_t gfp_mask)
55{
56 struct net *net = server->nfs_client->cl_net;
57 struct nfs_net *nn = net_generic(net, nfs_net_id);
58 struct bl_dev_msg *reply = &nn->bl_mount_reply;
59 struct bl_pipe_msg bl_pipe_msg;
60 struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
61 struct bl_msg_hdr *bl_msg;
62 DECLARE_WAITQUEUE(wq, current);
63 dev_t dev = 0;
64 int rc;
65
66 dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
67
68 bl_pipe_msg.bl_wq = &nn->bl_wq;
69
70 b->simple.len += 4; /* single volume */
71 if (b->simple.len > PAGE_SIZE)
72 return -EIO;
73
74 memset(msg, 0, sizeof(*msg));
75 msg->len = sizeof(*bl_msg) + b->simple.len;
76 msg->data = kzalloc(msg->len, gfp_mask);
77 if (!msg->data)
78 goto out;
79
80 bl_msg = msg->data;
81 bl_msg->type = BL_DEVICE_MOUNT;
82 bl_msg->totallen = b->simple.len;
83 nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
84
85 dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
86 add_wait_queue(&nn->bl_wq, &wq);
87 rc = rpc_queue_upcall(nn->bl_device_pipe, msg);
88 if (rc < 0) {
89 remove_wait_queue(&nn->bl_wq, &wq);
90 goto out;
91 }
92
93 set_current_state(TASK_UNINTERRUPTIBLE);
94 schedule();
95 remove_wait_queue(&nn->bl_wq, &wq);
96
97 if (reply->status != BL_DEVICE_REQUEST_PROC) {
98 printk(KERN_WARNING "%s failed to decode device: %d\n",
99 __func__, reply->status);
100 goto out;
101 }
102
103 dev = MKDEV(reply->major, reply->minor);
104out:
105 kfree(msg->data);
106 return dev;
107}
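bl_resolve_deviceid() is a synchronous upcall: the kernel queues a message on the rpc_pipefs pipe, sleeps on nn->bl_wq, and the answer comes back through bl_pipe_downcall() below, which fills nn->bl_mount_reply and wakes the queue. Roughly, assuming the usual rpc_pipefs mount point and the nfs-utils block layout daemon:

    /* kernel                             blkmapd (userspace)
     * ------                             -------------------
     * add_wait_queue(&nn->bl_wq, &wq)
     * rpc_queue_upcall(pipe, msg)  --->  reads the pipe, e.g.
     *                                    /var/lib/nfs/rpc_pipefs/nfs/blocklayout
     * schedule()                         resolves the signature to a device
     * (sleeping)                   <---  writes a bl_dev_msg reply
     * bl_pipe_downcall() copies the reply, wake_up(&nn->bl_wq)
     * dev = MKDEV(reply->major, reply->minor)
     */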
108
109static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,
110 size_t mlen)
111{
112 struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
113 nfs_net_id);
114
115 if (mlen != sizeof (struct bl_dev_msg))
116 return -EINVAL;
117
118 if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0)
119 return -EFAULT;
120
121 wake_up(&nn->bl_wq);
122
123 return mlen;
124}
125
126static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg)
127{
128 struct bl_pipe_msg *bl_pipe_msg =
129 container_of(msg, struct bl_pipe_msg, msg);
130
131 if (msg->errno >= 0)
132 return;
133 wake_up(bl_pipe_msg->bl_wq);
134}
135
136static const struct rpc_pipe_ops bl_upcall_ops = {
137 .upcall = rpc_pipe_generic_upcall,
138 .downcall = bl_pipe_downcall,
139 .destroy_msg = bl_pipe_destroy_msg,
140};
141
142static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
143 struct rpc_pipe *pipe)
144{
145 struct dentry *dir, *dentry;
146
147 dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
148 if (dir == NULL)
149 return ERR_PTR(-ENOENT);
150 dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
151 dput(dir);
152 return dentry;
153}
154
155static void nfs4blocklayout_unregister_sb(struct super_block *sb,
156 struct rpc_pipe *pipe)
157{
158 if (pipe->dentry)
159 rpc_unlink(pipe->dentry);
160}
161
162static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
163 void *ptr)
164{
165 struct super_block *sb = ptr;
166 struct net *net = sb->s_fs_info;
167 struct nfs_net *nn = net_generic(net, nfs_net_id);
168 struct dentry *dentry;
169 int ret = 0;
170
171 if (!try_module_get(THIS_MODULE))
172 return 0;
173
174 if (nn->bl_device_pipe == NULL) {
175 module_put(THIS_MODULE);
176 return 0;
177 }
178
179 switch (event) {
180 case RPC_PIPEFS_MOUNT:
181 dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
182 if (IS_ERR(dentry)) {
183 ret = PTR_ERR(dentry);
184 break;
185 }
186 nn->bl_device_pipe->dentry = dentry;
187 break;
188 case RPC_PIPEFS_UMOUNT:
189 if (nn->bl_device_pipe->dentry)
190 nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
191 break;
192 default:
193 ret = -ENOTSUPP;
194 break;
195 }
196 module_put(THIS_MODULE);
197 return ret;
198}
199
200static struct notifier_block nfs4blocklayout_block = {
201 .notifier_call = rpc_pipefs_event,
202};
203
204static struct dentry *nfs4blocklayout_register_net(struct net *net,
205 struct rpc_pipe *pipe)
206{
207 struct super_block *pipefs_sb;
208 struct dentry *dentry;
209
210 pipefs_sb = rpc_get_sb_net(net);
211 if (!pipefs_sb)
212 return NULL;
213 dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
214 rpc_put_sb_net(net);
215 return dentry;
216}
217
218static void nfs4blocklayout_unregister_net(struct net *net,
219 struct rpc_pipe *pipe)
220{
221 struct super_block *pipefs_sb;
222
223 pipefs_sb = rpc_get_sb_net(net);
224 if (pipefs_sb) {
225 nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
226 rpc_put_sb_net(net);
227 }
228}
229
230static int nfs4blocklayout_net_init(struct net *net)
231{
232 struct nfs_net *nn = net_generic(net, nfs_net_id);
233 struct dentry *dentry;
234
235 init_waitqueue_head(&nn->bl_wq);
236 nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
237 if (IS_ERR(nn->bl_device_pipe))
238 return PTR_ERR(nn->bl_device_pipe);
239 dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
240 if (IS_ERR(dentry)) {
241 rpc_destroy_pipe_data(nn->bl_device_pipe);
242 return PTR_ERR(dentry);
243 }
244 nn->bl_device_pipe->dentry = dentry;
245 return 0;
246}
247
248static void nfs4blocklayout_net_exit(struct net *net)
249{
250 struct nfs_net *nn = net_generic(net, nfs_net_id);
251
252 nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
253 rpc_destroy_pipe_data(nn->bl_device_pipe);
254 nn->bl_device_pipe = NULL;
255}
256
257static struct pernet_operations nfs4blocklayout_net_ops = {
258 .init = nfs4blocklayout_net_init,
259 .exit = nfs4blocklayout_net_exit,
260};
261
262int __init bl_init_pipefs(void)
263{
264 int ret;
265
266 ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
267 if (ret)
268 goto out;
269 ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
270 if (ret)
271 goto out_unregister_notifier;
272 return 0;
273
274out_unregister_notifier:
275 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
276out:
277 return ret;
278}
279
280void __exit bl_cleanup_pipefs(void)
281{
282 rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
283 unregister_pernet_subsys(&nfs4blocklayout_net_ops);
284}
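bl_init_pipefs() uses the standard register-in-order, unwind-in-reverse error shape. Reduced to its skeleton (register_a/register_b are hypothetical placeholders, not kernel APIs):

    int init(void)
    {
        int ret;

        ret = register_a();             /* the pipefs notifier */
        if (ret)
            goto out;
        ret = register_b();             /* the pernet subsystem */
        if (ret)
            goto out_unregister_a;
        return 0;

    out_unregister_a:
        unregister_a();
    out:
        return ret;
    }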
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 073b4cf67ed9..b8fb3a4ef649 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 
         cb_info->serv = serv;
         cb_info->rqst = rqstp;
-        cb_info->task = kthread_run(callback_svc, cb_info->rqst,
+        cb_info->task = kthread_create(callback_svc, cb_info->rqst,
                         "nfsv4.%u-svc", minorversion);
         if (IS_ERR(cb_info->task)) {
                 ret = PTR_ERR(cb_info->task);
@@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
                 cb_info->task = NULL;
                 return ret;
         }
+        rqstp->rq_task = cb_info->task;
+        wake_up_process(cb_info->task);
         dprintk("nfs_callback_up: service started\n");
         return 0;
 }
@@ -428,6 +430,18 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
         if (p == NULL)
                 return 0;
 
+        /*
+         * Did we get the acceptor from userland during the SETCLIENTID
+         * negotiation?
+         */
+        if (clp->cl_acceptor)
+                return !strcmp(p, clp->cl_acceptor);
+
+        /*
+         * Otherwise try to verify it using the cl_hostname. Note that this
+         * doesn't work if a non-canonical hostname was used in the devname.
+         */
+
         /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
 
         if (memcmp(p, "nfs@", 4) != 0)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 41db5258e7a7..73466b934090 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp,
                 goto out;
 
         ino = lo->plh_inode;
+
+        spin_lock(&ino->i_lock);
+        pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+        spin_unlock(&ino->i_lock);
+
+        pnfs_layoutcommit_inode(ino, false);
+
         spin_lock(&ino->i_lock);
         if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
             pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-                                        &args->cbl_range))
+                                        &args->cbl_range)) {
                 rv = NFS4ERR_DELAY;
-        else
-                rv = NFS4ERR_NOMATCHING_LAYOUT;
-        pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+                goto unlock;
+        }
+
+        if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
+                NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
+                        &args->cbl_range);
+        }
+unlock:
         spin_unlock(&ino->i_lock);
         pnfs_free_lseg_list(&free_me_list);
         pnfs_put_layout_hdr(lo);
@@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args,
         }
 
   found:
-        if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE)
-                dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, "
-                        "deleting instead\n", __func__);
         nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
         }
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1d09289c8f0e..f9f4845db989 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -110,8 +110,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version)
                 mutex_unlock(&nfs_version_mutex);
         }
 
-        if (!IS_ERR(nfs))
-                try_module_get(nfs->owner);
+        if (!IS_ERR(nfs) && !try_module_get(nfs->owner))
+                return ERR_PTR(-EAGAIN);
         return nfs;
 }
 
@@ -158,7 +158,8 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
                 goto error_0;
 
         clp->cl_nfs_mod = cl_init->nfs_mod;
-        try_module_get(clp->cl_nfs_mod->owner);
+        if (!try_module_get(clp->cl_nfs_mod->owner))
+                goto error_dealloc;
 
         clp->rpc_ops = clp->cl_nfs_mod->rpc_ops;
 
@@ -190,6 +191,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
 
 error_cleanup:
         put_nfs_version(clp->cl_nfs_mod);
+error_dealloc:
         kfree(clp);
 error_0:
         return ERR_PTR(err);
@@ -252,6 +254,7 @@ void nfs_free_client(struct nfs_client *clp)
         put_net(clp->cl_net);
         put_nfs_version(clp->cl_nfs_mod);
         kfree(clp->cl_hostname);
+        kfree(clp->cl_acceptor);
         kfree(clp);
 
         dprintk("<-- nfs_free_client()\n");
@@ -482,8 +485,13 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
         struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id);
         const struct nfs_rpc_ops *rpc_ops = cl_init->nfs_mod->rpc_ops;
 
+        if (cl_init->hostname == NULL) {
+                WARN_ON(1);
+                return NULL;
+        }
+
         dprintk("--> nfs_get_client(%s,v%u)\n",
-                cl_init->hostname ?: "", rpc_ops->version);
+                cl_init->hostname, rpc_ops->version);
 
         /* see if the client already exists */
         do {
@@ -510,7 +518,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
         } while (!IS_ERR(new));
 
         dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n",
-                cl_init->hostname ?: "", PTR_ERR(new));
+                cl_init->hostname, PTR_ERR(new));
         return new;
 }
 EXPORT_SYMBOL_GPL(nfs_get_client);
@@ -1205,7 +1213,7 @@ static const struct file_operations nfs_server_list_fops = {
         .open           = nfs_server_list_open,
         .read           = seq_read,
         .llseek         = seq_lseek,
-        .release        = seq_release,
+        .release        = seq_release_net,
         .owner          = THIS_MODULE,
 };
 
@@ -1226,7 +1234,7 @@ static const struct file_operations nfs_volume_list_fops = {
         .open           = nfs_volume_list_open,
         .read           = seq_read,
         .llseek         = seq_lseek,
-        .release        = seq_release,
+        .release        = seq_release_net,
         .owner          = THIS_MODULE,
 };
 
@@ -1236,27 +1244,17 @@ static const struct file_operations nfs_volume_list_fops = {
  */
 static int nfs_server_list_open(struct inode *inode, struct file *file)
 {
-        struct seq_file *m;
-        int ret;
-        struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
-        struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
-        ret = seq_open(file, &nfs_server_list_ops);
-        if (ret < 0)
-                return ret;
-
-        m = file->private_data;
-        m->private = net;
-
-        return 0;
+        return seq_open_net(inode, file, &nfs_server_list_ops,
+                           sizeof(struct seq_net_private));
 }
 
 /*
  * set up the iterator to start reading from the server list and return the first item
  */
 static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
+        __acquires(&nn->nfs_client_lock)
 {
-        struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
         /* lock the list against modification */
         spin_lock(&nn->nfs_client_lock);
@@ -1268,7 +1266,7 @@ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-        struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
         return seq_list_next(v, &nn->nfs_client_list, pos);
 }
@@ -1277,8 +1275,9 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_server_list_stop(struct seq_file *p, void *v)
+        __releases(&nn->nfs_client_lock)
 {
-        struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
         spin_unlock(&nn->nfs_client_lock);
 }
@@ -1289,7 +1288,7 @@ static void nfs_server_list_stop(struct seq_file *p, void *v)
 static int nfs_server_list_show(struct seq_file *m, void *v)
 {
         struct nfs_client *clp;
-        struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
         /* display header on line 1 */
         if (v == &nn->nfs_client_list) {
@@ -1321,27 +1320,17 @@ static int nfs_server_list_show(struct seq_file *m, void *v)
  */
 static int nfs_volume_list_open(struct inode *inode, struct file *file)
 {
-        struct seq_file *m;
-        int ret;
-        struct pid_namespace *pid_ns = file->f_dentry->d_sb->s_fs_info;
-        struct net *net = pid_ns->child_reaper->nsproxy->net_ns;
-
-        ret = seq_open(file, &nfs_volume_list_ops);
-        if (ret < 0)
-                return ret;
-
-        m = file->private_data;
-        m->private = net;
-
-        return 0;
+        return seq_open_net(inode, file, &nfs_volume_list_ops,
+                           sizeof(struct seq_net_private));
 }
 
 /*
  * set up the iterator to start reading from the volume list and return the first item
  */
 static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
+        __acquires(&nn->nfs_client_lock)
 {
-        struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
         /* lock the list against modification */
         spin_lock(&nn->nfs_client_lock);
@@ -1353,7 +1342,7 @@ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos)
  */
 static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
 {
-        struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
         return seq_list_next(v, &nn->nfs_volume_list, pos);
 }
@@ -1362,8 +1351,9 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos)
  * clean up after reading from the transports list
  */
 static void nfs_volume_list_stop(struct seq_file *p, void *v)
+        __releases(&nn->nfs_client_lock)
 {
-        struct nfs_net *nn = net_generic(p->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id);
 
         spin_unlock(&nn->nfs_client_lock);
 }
@@ -1376,7 +1366,7 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
         struct nfs_server *server;
         struct nfs_client *clp;
         char dev[8], fsid[17];
-        struct nfs_net *nn = net_generic(m->private, nfs_net_id);
+        struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id);
 
         /* display header on line 1 */
         if (v == &nn->nfs_volume_list) {
@@ -1407,6 +1397,39 @@ static int nfs_volume_list_show(struct seq_file *m, void *v)
         return 0;
 }
 
+int nfs_fs_proc_net_init(struct net *net)
+{
+        struct nfs_net *nn = net_generic(net, nfs_net_id);
+        struct proc_dir_entry *p;
+
+        nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
+        if (!nn->proc_nfsfs)
+                goto error_0;
+
+        /* a file of servers with which we're dealing */
+        p = proc_create("servers", S_IFREG|S_IRUGO,
+                        nn->proc_nfsfs, &nfs_server_list_fops);
+        if (!p)
+                goto error_1;
+
+        /* a file of volumes that we have mounted */
+        p = proc_create("volumes", S_IFREG|S_IRUGO,
+                        nn->proc_nfsfs, &nfs_volume_list_fops);
+        if (!p)
+                goto error_1;
+        return 0;
+
+error_1:
+        remove_proc_subtree("nfsfs", net->proc_net);
+error_0:
+        return -ENOMEM;
+}
+
+void nfs_fs_proc_net_exit(struct net *net)
+{
+        remove_proc_subtree("nfsfs", net->proc_net);
+}
+
 /*
  * initialise the /proc/fs/nfsfs/ directory
  */
@@ -1419,14 +1442,12 @@ int __init nfs_fs_proc_init(void)
                 goto error_0;
 
         /* a file of servers with which we're dealing */
-        p = proc_create("servers", S_IFREG|S_IRUGO,
-                        proc_fs_nfs, &nfs_server_list_fops);
+        p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers");
         if (!p)
                 goto error_1;
 
         /* a file of volumes that we have mounted */
-        p = proc_create("volumes", S_IFREG|S_IRUGO,
-                        proc_fs_nfs, &nfs_volume_list_fops);
+        p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes");
         if (!p)
                 goto error_2;
         return 0;
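The conversion above removes the hand-rolled private-data plumbing: seq_open_net() allocates a struct seq_net_private that records the opener's network namespace, and the iterator callbacks recover it with seq_file_net() instead of dereferencing m->private directly. The resulting shape, sketched with a hypothetical foo_ prefix:

    static int foo_list_open(struct inode *inode, struct file *file)
    {
        /* records the net namespace of the opener in the seq_file */
        return seq_open_net(inode, file, &foo_seq_ops,
                            sizeof(struct seq_net_private));
    }

    static void *foo_list_start(struct seq_file *m, loff_t *pos)
    {
        /* recover the namespace captured at open time */
        struct foo_net *fn = net_generic(seq_file_net(m), foo_net_id);

        spin_lock(&fn->lock);
        return seq_list_start_head(&fn->list, *pos);
    }

Pairing .release = seq_release_net with seq_open_net, as the fops hunks above do, is what frees that private structure again.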
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5d8ccecf5f5c..5853f53db732 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,14 +41,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
         set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
-/**
- * nfs_have_delegation - check if inode has a delegation
- * @inode: inode to check
- * @flags: delegation types to check for
- *
- * Returns one if inode has the indicated delegation, otherwise zero.
- */
-int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+static int
+nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
         struct nfs_delegation *delegation;
         int ret = 0;
@@ -58,12 +52,34 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags)
         delegation = rcu_dereference(NFS_I(inode)->delegation);
         if (delegation != NULL && (delegation->type & flags) == flags &&
             !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
-                nfs_mark_delegation_referenced(delegation);
+                if (mark)
+                        nfs_mark_delegation_referenced(delegation);
                 ret = 1;
         }
         rcu_read_unlock();
         return ret;
 }
+/**
+ * nfs_have_delegation - check if inode has a delegation, mark it
+ * NFS_DELEGATION_REFERENCED if there is one.
+ * @inode: inode to check
+ * @flags: delegation types to check for
+ *
+ * Returns one if inode has the indicated delegation, otherwise zero.
+ */
+int nfs4_have_delegation(struct inode *inode, fmode_t flags)
+{
+        return nfs4_do_check_delegation(inode, flags, true);
+}
+
+/*
+ * nfs4_check_delegation - check if inode has a delegation, do not mark
+ * NFS_DELEGATION_REFERENCED if it has one.
+ */
+int nfs4_check_delegation(struct inode *inode, fmode_t flags)
+{
+        return nfs4_do_check_delegation(inode, flags, false);
+}
 
 static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
 {
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9a79c7a99d6d..5c1cce39297f 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -59,6 +59,7 @@ bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_
 
 void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
+int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 
 #endif
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4a3d4ef76127..06e8cfcbb670 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -486,8 +486,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
                         nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label);
                         goto out;
                 } else {
-                        if (d_invalidate(dentry) != 0)
-                                goto out;
+                        d_invalidate(dentry);
                         dput(dentry);
                 }
         }
@@ -988,9 +987,13 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate);
  * A check for whether or not the parent directory has changed.
  * In the case it has, we assume that the dentries are untrustworthy
  * and may need to be looked up again.
+ * If rcu_walk prevents us from performing a full check, return 0.
  */
-static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
+static int nfs_check_verifier(struct inode *dir, struct dentry *dentry,
+                              int rcu_walk)
 {
+        int ret;
+
         if (IS_ROOT(dentry))
                 return 1;
         if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
@@ -998,7 +1001,11 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
         if (!nfs_verify_change_attribute(dir, dentry->d_time))
                 return 0;
         /* Revalidate nfsi->cache_change_attribute before we declare a match */
-        if (nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0)
+        if (rcu_walk)
+                ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir);
+        else
+                ret = nfs_revalidate_inode(NFS_SERVER(dir), dir);
+        if (ret < 0)
                 return 0;
         if (!nfs_verify_change_attribute(dir, dentry->d_time))
                 return 0;
@@ -1042,6 +1049,8 @@ int nfs_lookup_verify_inode(struct inode *inode, unsigned int flags)
 out:
         return (inode->i_nlink == 0) ? -ENOENT : 0;
 out_force:
+        if (flags & LOOKUP_RCU)
+                return -ECHILD;
         ret = __nfs_revalidate_inode(server, inode);
         if (ret != 0)
                 return ret;
@@ -1054,6 +1063,9 @@ out_force:
  *
  * If parent mtime has changed, we revalidate, else we wait for a
  * period corresponding to the parent's attribute cache timeout value.
+ *
+ * If LOOKUP_RCU prevents us from performing a full check, return 1
+ * suggesting a reval is needed.
  */
 static inline
 int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
@@ -1064,7 +1076,7 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry,
                 return 0;
         if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG)
                 return 1;
-        return !nfs_check_verifier(dir, dentry);
+        return !nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU);
 }
 
 /*
@@ -1088,21 +1100,30 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
         struct nfs4_label *label = NULL;
         int error;
 
-        if (flags & LOOKUP_RCU)
-                return -ECHILD;
-
-        parent = dget_parent(dentry);
-        dir = parent->d_inode;
+        if (flags & LOOKUP_RCU) {
+                parent = ACCESS_ONCE(dentry->d_parent);
+                dir = ACCESS_ONCE(parent->d_inode);
+                if (!dir)
+                        return -ECHILD;
+        } else {
+                parent = dget_parent(dentry);
+                dir = parent->d_inode;
+        }
         nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE);
         inode = dentry->d_inode;
 
         if (!inode) {
-                if (nfs_neg_need_reval(dir, dentry, flags))
+                if (nfs_neg_need_reval(dir, dentry, flags)) {
+                        if (flags & LOOKUP_RCU)
+                                return -ECHILD;
                         goto out_bad;
+                }
                 goto out_valid_noent;
         }
 
         if (is_bad_inode(inode)) {
+                if (flags & LOOKUP_RCU)
+                        return -ECHILD;
                 dfprintk(LOOKUPCACHE, "%s: %pd2 has dud inode\n",
                         __func__, dentry);
                 goto out_bad;
@@ -1112,12 +1133,20 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags)
                 goto out_set_verifier;
 
         /* Force a full look up iff the parent directory has changed */
-        if (!nfs_is_exclusive_create(dir, flags) && nfs_check_verifier(dir, dentry)) {
-                if (nfs_lookup_verify_inode(inode, flags))
+        if (!nfs_is_exclusive_create(dir, flags) &&
+            nfs_check_verifier(dir, dentry, flags & LOOKUP_RCU)) {
+
+                if (nfs_lookup_verify_inode(inode, flags)) {
+                        if (flags & LOOKUP_RCU)
+                                return -ECHILD;
                         goto out_zap_parent;
+                }
                 goto out_valid;
         }
 
+        if (flags & LOOKUP_RCU)
+                return -ECHILD;
+
         if (NFS_STALE(inode))
                 goto out_bad;
 
@@ -1153,13 +1182,18 @@ out_set_verifier:
1153 /* Success: notify readdir to use READDIRPLUS */ 1182 /* Success: notify readdir to use READDIRPLUS */
1154 nfs_advise_use_readdirplus(dir); 1183 nfs_advise_use_readdirplus(dir);
1155 out_valid_noent: 1184 out_valid_noent:
1156 dput(parent); 1185 if (flags & LOOKUP_RCU) {
1186 if (parent != ACCESS_ONCE(dentry->d_parent))
1187 return -ECHILD;
1188 } else
1189 dput(parent);
1157 dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n", 1190 dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is valid\n",
1158 __func__, dentry); 1191 __func__, dentry);
1159 return 1; 1192 return 1;
1160out_zap_parent: 1193out_zap_parent:
1161 nfs_zap_caches(dir); 1194 nfs_zap_caches(dir);
1162 out_bad: 1195 out_bad:
1196 WARN_ON(flags & LOOKUP_RCU);
1163 nfs_free_fattr(fattr); 1197 nfs_free_fattr(fattr);
1164 nfs_free_fhandle(fhandle); 1198 nfs_free_fhandle(fhandle);
1165 nfs4_label_free(label); 1199 nfs4_label_free(label);
@@ -1176,15 +1210,12 @@ out_zap_parent:
1176 if (IS_ROOT(dentry)) 1210 if (IS_ROOT(dentry))
1177 goto out_valid; 1211 goto out_valid;
1178 } 1212 }
1179 /* If we have submounts, don't unhash ! */
1180 if (check_submounts_and_drop(dentry) != 0)
1181 goto out_valid;
1182
1183 dput(parent); 1213 dput(parent);
1184 dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", 1214 dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n",
1185 __func__, dentry); 1215 __func__, dentry);
1186 return 0; 1216 return 0;
1187out_error: 1217out_error:
1218 WARN_ON(flags & LOOKUP_RCU);
1188 nfs_free_fattr(fattr); 1219 nfs_free_fattr(fattr);
1189 nfs_free_fhandle(fhandle); 1220 nfs_free_fhandle(fhandle);
1190 nfs4_label_free(label); 1221 nfs4_label_free(label);
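The conversion above follows one pattern throughout: in RCU walk, never take references or sleep; sample the parent racily, bail out with -ECHILD the moment blocking work would be needed, and validate the racy read by re-checking d_parent before declaring success. A generic skeleton of that shape (the cheap_checks_fail() helper is a placeholder, not an NFS function):

static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	struct dentry *parent;
	struct inode *dir;

	if (flags & LOOKUP_RCU) {
		/* No refcounts may be taken: sample the parent racily
		 * and bail out with -ECHILD whenever blocking work
		 * (RPC, allocation, d_drop) would be required. */
		parent = ACCESS_ONCE(dentry->d_parent);
		dir = ACCESS_ONCE(parent->d_inode);
		if (!dir)
			return -ECHILD;
	} else {
		parent = dget_parent(dentry);
		dir = parent->d_inode;
	}

	if (cheap_checks_fail(dir, dentry))	/* placeholder */
		goto needs_slow_path;

	if (flags & LOOKUP_RCU) {
		/* Re-check that the parent did not change under us;
		 * this validates the earlier racy read. */
		if (parent != ACCESS_ONCE(dentry->d_parent))
			return -ECHILD;
	} else
		dput(parent);
	return 1;				/* still valid */

needs_slow_path:
	if (flags & LOOKUP_RCU)
		return -ECHILD;			/* retry in ref-walk mode */
	/* full revalidation, then dput(parent), elided */
	return 0;
}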
@@ -1529,14 +1560,9 @@ EXPORT_SYMBOL_GPL(nfs_atomic_open);
1529 1560
1530static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) 1561static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1531{ 1562{
1532 struct dentry *parent = NULL;
1533 struct inode *inode; 1563 struct inode *inode;
1534 struct inode *dir;
1535 int ret = 0; 1564 int ret = 0;
1536 1565
1537 if (flags & LOOKUP_RCU)
1538 return -ECHILD;
1539
1540 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY)) 1566 if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
1541 goto no_open; 1567 goto no_open;
1542 if (d_mountpoint(dentry)) 1568 if (d_mountpoint(dentry))
@@ -1545,34 +1571,47 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags)
1545 goto no_open; 1571 goto no_open;
1546 1572
1547 inode = dentry->d_inode; 1573 inode = dentry->d_inode;
1548 parent = dget_parent(dentry);
1549 dir = parent->d_inode;
1550 1574
1551 /* We can't create new files in nfs_open_revalidate(), so we 1575 /* We can't create new files in nfs_open_revalidate(), so we
1552 * optimize away revalidation of negative dentries. 1576 * optimize away revalidation of negative dentries.
1553 */ 1577 */
1554 if (inode == NULL) { 1578 if (inode == NULL) {
1579 struct dentry *parent;
1580 struct inode *dir;
1581
1582 if (flags & LOOKUP_RCU) {
1583 parent = ACCESS_ONCE(dentry->d_parent);
1584 dir = ACCESS_ONCE(parent->d_inode);
1585 if (!dir)
1586 return -ECHILD;
1587 } else {
1588 parent = dget_parent(dentry);
1589 dir = parent->d_inode;
1590 }
1555 if (!nfs_neg_need_reval(dir, dentry, flags)) 1591 if (!nfs_neg_need_reval(dir, dentry, flags))
1556 ret = 1; 1592 ret = 1;
1593 else if (flags & LOOKUP_RCU)
1594 ret = -ECHILD;
1595 if (!(flags & LOOKUP_RCU))
1596 dput(parent);
1597 else if (parent != ACCESS_ONCE(dentry->d_parent))
1598 return -ECHILD;
1557 goto out; 1599 goto out;
1558 } 1600 }
1559 1601
1560 /* NFS only supports OPEN on regular files */ 1602 /* NFS only supports OPEN on regular files */
1561 if (!S_ISREG(inode->i_mode)) 1603 if (!S_ISREG(inode->i_mode))
1562 goto no_open_dput; 1604 goto no_open;
1563 /* We cannot do exclusive creation on a positive dentry */ 1605 /* We cannot do exclusive creation on a positive dentry */
1564 if (flags & LOOKUP_EXCL) 1606 if (flags & LOOKUP_EXCL)
1565 goto no_open_dput; 1607 goto no_open;
1566 1608
1567 /* Let f_op->open() actually open (and revalidate) the file */ 1609 /* Let f_op->open() actually open (and revalidate) the file */
1568 ret = 1; 1610 ret = 1;
1569 1611
1570out: 1612out:
1571 dput(parent);
1572 return ret; 1613 return ret;
1573 1614
1574no_open_dput:
1575 dput(parent);
1576no_open: 1615no_open:
1577 return nfs_lookup_revalidate(dentry, flags); 1616 return nfs_lookup_revalidate(dentry, flags);
1578} 1617}
@@ -2028,10 +2067,14 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
2028static LIST_HEAD(nfs_access_lru_list); 2067static LIST_HEAD(nfs_access_lru_list);
2029static atomic_long_t nfs_access_nr_entries; 2068static atomic_long_t nfs_access_nr_entries;
2030 2069
2070static unsigned long nfs_access_max_cachesize = ULONG_MAX;
2071module_param(nfs_access_max_cachesize, ulong, 0644);
2072MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
2073
2031static void nfs_access_free_entry(struct nfs_access_entry *entry) 2074static void nfs_access_free_entry(struct nfs_access_entry *entry)
2032{ 2075{
2033 put_rpccred(entry->cred); 2076 put_rpccred(entry->cred);
2034 kfree(entry); 2077 kfree_rcu(entry, rcu_head);
2035 smp_mb__before_atomic(); 2078 smp_mb__before_atomic();
2036 atomic_long_dec(&nfs_access_nr_entries); 2079 atomic_long_dec(&nfs_access_nr_entries);
2037 smp_mb__after_atomic(); 2080 smp_mb__after_atomic();
@@ -2048,19 +2091,14 @@ static void nfs_access_free_list(struct list_head *head)
2048 } 2091 }
2049} 2092}
2050 2093
2051unsigned long 2094static unsigned long
2052nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc) 2095nfs_do_access_cache_scan(unsigned int nr_to_scan)
2053{ 2096{
2054 LIST_HEAD(head); 2097 LIST_HEAD(head);
2055 struct nfs_inode *nfsi, *next; 2098 struct nfs_inode *nfsi, *next;
2056 struct nfs_access_entry *cache; 2099 struct nfs_access_entry *cache;
2057 int nr_to_scan = sc->nr_to_scan;
2058 gfp_t gfp_mask = sc->gfp_mask;
2059 long freed = 0; 2100 long freed = 0;
2060 2101
2061 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
2062 return SHRINK_STOP;
2063
2064 spin_lock(&nfs_access_lru_lock); 2102 spin_lock(&nfs_access_lru_lock);
2065 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) { 2103 list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
2066 struct inode *inode; 2104 struct inode *inode;
@@ -2094,11 +2132,39 @@ remove_lru_entry:
2094} 2132}
2095 2133
2096unsigned long 2134unsigned long
2135nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
2136{
2137 int nr_to_scan = sc->nr_to_scan;
2138 gfp_t gfp_mask = sc->gfp_mask;
2139
2140 if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
2141 return SHRINK_STOP;
2142 return nfs_do_access_cache_scan(nr_to_scan);
2143}
2144
2145
2146unsigned long
2097nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc) 2147nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
2098{ 2148{
2099 return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries)); 2149 return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
2100} 2150}
2101 2151
2152static void
2153nfs_access_cache_enforce_limit(void)
2154{
2155 long nr_entries = atomic_long_read(&nfs_access_nr_entries);
2156 unsigned long diff;
2157 unsigned int nr_to_scan;
2158
2159 if (nr_entries < 0 || nr_entries <= nfs_access_max_cachesize)
2160 return;
2161 nr_to_scan = 100;
2162 diff = nr_entries - nfs_access_max_cachesize;
2163 if (diff < nr_to_scan)
2164 nr_to_scan = diff;
2165 nfs_do_access_cache_scan(nr_to_scan);
2166}
2167
2102static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head) 2168static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
2103{ 2169{
2104 struct rb_root *root_node = &nfsi->access_cache; 2170 struct rb_root *root_node = &nfsi->access_cache;
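Splitting the shrinker into a GFP-gated wrapper and a reusable scan core is what lets the new cache-size cap piggyback on the same reclaim code. A minimal sketch of the shape with generic names (the batch size of 100 mirrors the hunk above):

static unsigned long cache_do_scan(unsigned int nr_to_scan)
{
	long freed = 0;
	/* walk the LRU and free up to nr_to_scan entries, elided */
	return freed;
}

/* Shrinker entry point: refuse reclaim from contexts that cannot
 * make GFP_KERNEL allocations (e.g. fs/io reclaim paths). */
static unsigned long cache_scan(struct shrinker *s, struct shrink_control *sc)
{
	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
		return SHRINK_STOP;
	return cache_do_scan(sc->nr_to_scan);
}

/* Hard cap, enforced at insertion time: scan just enough entries
 * (at most one fixed batch) to move back toward the limit. */
static void cache_enforce_limit(long nr_entries, long limit)
{
	unsigned long diff;

	if (nr_entries <= limit)
		return;
	diff = nr_entries - limit;
	cache_do_scan(min_t(unsigned long, diff, 100));
}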
@@ -2186,6 +2252,38 @@ out_zap:
2186 return -ENOENT; 2252 return -ENOENT;
2187} 2253}
2188 2254
2255static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
2256{
2257 /* Only check the most recently returned cache entry,
2258 * but do it without locking.
2259 */
2260 struct nfs_inode *nfsi = NFS_I(inode);
2261 struct nfs_access_entry *cache;
2262 int err = -ECHILD;
2263 struct list_head *lh;
2264
2265 rcu_read_lock();
2266 if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
2267 goto out;
2268 lh = rcu_dereference(nfsi->access_cache_entry_lru.prev);
2269 cache = list_entry(lh, struct nfs_access_entry, lru);
2270 if (lh == &nfsi->access_cache_entry_lru ||
2271 cred != cache->cred)
2272 cache = NULL;
2273 if (cache == NULL)
2274 goto out;
2275 if (!nfs_have_delegated_attributes(inode) &&
2276 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
2277 goto out;
2278 res->jiffies = cache->jiffies;
2279 res->cred = cache->cred;
2280 res->mask = cache->mask;
2281 err = 0;
2282out:
2283 rcu_read_unlock();
2284 return err;
2285}
2286
2189static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set) 2287static void nfs_access_add_rbtree(struct inode *inode, struct nfs_access_entry *set)
2190{ 2288{
2191 struct nfs_inode *nfsi = NFS_I(inode); 2289 struct nfs_inode *nfsi = NFS_I(inode);
@@ -2229,6 +2327,11 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2229 cache->cred = get_rpccred(set->cred); 2327 cache->cred = get_rpccred(set->cred);
2230 cache->mask = set->mask; 2328 cache->mask = set->mask;
2231 2329
2330 /* The above field assignments must be visible
2331 * before this item appears on the lru. We cannot easily
2332 * use rcu_assign_pointer, so just force the memory barrier.
2333 */
2334 smp_wmb();
2232 nfs_access_add_rbtree(inode, cache); 2335 nfs_access_add_rbtree(inode, cache);
2233 2336
2234 /* Update accounting */ 2337 /* Update accounting */
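Taken together, the access-cache hunks form an RCU publish/read/reclaim triad: the writer fully initializes the entry and issues smp_wmb() before linking it (the rculist helpers would fold this barrier in, but the entry also lives in an rb-tree, hence the explicit barrier), the lockless reader samples the LRU tail inside rcu_read_lock(), and teardown defers the free with kfree_rcu() so a concurrent reader never touches freed memory. In miniature, with generic names:

struct entry {
	struct list_head lru;
	struct rcu_head rcu_head;
	int payload;
};

static void publish(struct entry *e, struct list_head *lru, int val)
{
	e->payload = val;
	smp_wmb();		/* init must be visible before linkage */
	list_add_tail(&e->lru, lru);
}

static int read_newest(struct list_head *lru, int *out)
{
	struct entry *e;
	int err = -ECHILD;

	rcu_read_lock();
	e = list_entry(rcu_dereference(lru->prev), struct entry, lru);
	if (&e->lru != lru) {	/* list was not empty */
		*out = e->payload;
		err = 0;
	}
	rcu_read_unlock();
	return err;
}

static void retire(struct entry *e)
{
	list_del(&e->lru);	/* caller holds the writer-side lock */
	kfree_rcu(e, rcu_head);	/* free only after a grace period */
}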
@@ -2244,6 +2347,7 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
2244 &nfs_access_lru_list); 2347 &nfs_access_lru_list);
2245 spin_unlock(&nfs_access_lru_lock); 2348 spin_unlock(&nfs_access_lru_lock);
2246 } 2349 }
2350 nfs_access_cache_enforce_limit();
2247} 2351}
2248EXPORT_SYMBOL_GPL(nfs_access_add_cache); 2352EXPORT_SYMBOL_GPL(nfs_access_add_cache);
2249 2353
@@ -2267,10 +2371,16 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
2267 2371
2268 trace_nfs_access_enter(inode); 2372 trace_nfs_access_enter(inode);
2269 2373
2270 status = nfs_access_get_cached(inode, cred, &cache); 2374 status = nfs_access_get_cached_rcu(inode, cred, &cache);
2375 if (status != 0)
2376 status = nfs_access_get_cached(inode, cred, &cache);
2271 if (status == 0) 2377 if (status == 0)
2272 goto out_cached; 2378 goto out_cached;
2273 2379
2380 status = -ECHILD;
2381 if (mask & MAY_NOT_BLOCK)
2382 goto out;
2383
2274 /* Be clever: ask server to check for all possible rights */ 2384 /* Be clever: ask server to check for all possible rights */
2275 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; 2385 cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ;
2276 cache.cred = cred; 2386 cache.cred = cred;
@@ -2321,9 +2431,6 @@ int nfs_permission(struct inode *inode, int mask)
2321 struct rpc_cred *cred; 2431 struct rpc_cred *cred;
2322 int res = 0; 2432 int res = 0;
2323 2433
2324 if (mask & MAY_NOT_BLOCK)
2325 return -ECHILD;
2326
2327 nfs_inc_stats(inode, NFSIOS_VFSACCESS); 2434 nfs_inc_stats(inode, NFSIOS_VFSACCESS);
2328 2435
2329 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) 2436 if ((mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
@@ -2350,12 +2457,23 @@ force_lookup:
2350 if (!NFS_PROTO(inode)->access) 2457 if (!NFS_PROTO(inode)->access)
2351 goto out_notsup; 2458 goto out_notsup;
2352 2459
2353 cred = rpc_lookup_cred(); 2460 /* Always try fast lookups first */
2354 if (!IS_ERR(cred)) { 2461 rcu_read_lock();
2355 res = nfs_do_access(inode, cred, mask); 2462 cred = rpc_lookup_cred_nonblock();
2356 put_rpccred(cred); 2463 if (!IS_ERR(cred))
2357 } else 2464 res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
2465 else
2358 res = PTR_ERR(cred); 2466 res = PTR_ERR(cred);
2467 rcu_read_unlock();
2468 if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
2469 /* Fast lookup failed, try the slow way */
2470 cred = rpc_lookup_cred();
2471 if (!IS_ERR(cred)) {
2472 res = nfs_do_access(inode, cred, mask);
2473 put_rpccred(cred);
2474 } else
2475 res = PTR_ERR(cred);
2476 }
2359out: 2477out:
2360 if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) 2478 if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
2361 res = -EACCES; 2479 res = -EACCES;
@@ -2364,6 +2482,9 @@ out:
2364 inode->i_sb->s_id, inode->i_ino, mask, res); 2482 inode->i_sb->s_id, inode->i_ino, mask, res);
2365 return res; 2483 return res;
2366out_notsup: 2484out_notsup:
2485 if (mask & MAY_NOT_BLOCK)
2486 return -ECHILD;
2487
2367 res = nfs_revalidate_inode(NFS_SERVER(inode), inode); 2488 res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
2368 if (res == 0) 2489 if (res == 0)
2369 res = generic_permission(inode, mask); 2490 res = generic_permission(inode, mask);
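The permission path now embodies a two-phase idiom: attempt everything with non-blocking primitives first (so the same code serves RCU-walk callers), and repeat with the sleeping variants only when the caller is allowed to block. Condensed from the hunk above:

/* Sketch of the two-phase credential lookup used above. */
static int do_access_two_phase(struct inode *inode, int mask)
{
	struct rpc_cred *cred;
	int res;

	rcu_read_lock();
	cred = rpc_lookup_cred_nonblock();	/* must not sleep */
	if (!IS_ERR(cred))
		res = nfs_do_access(inode, cred, mask | MAY_NOT_BLOCK);
	else
		res = PTR_ERR(cred);
	rcu_read_unlock();

	if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
		/* Caller may sleep: retry with the blocking lookup,
		 * which returns a reference that must be dropped. */
		cred = rpc_lookup_cred();
		if (!IS_ERR(cred)) {
			res = nfs_do_access(inode, cred, mask);
			put_rpccred(cred);
		} else
			res = PTR_ERR(cred);
	}
	return res;
}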
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f11b9eed0de1..20cffc830468 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -148,8 +148,8 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 148{
149 struct nfs_writeverf *verfp; 149 struct nfs_writeverf *verfp;
150 150
151 verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, 151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
152 hdr->data->ds_idx); 152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 153 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 155 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +169,8 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 169{
170 struct nfs_writeverf *verfp; 170 struct nfs_writeverf *verfp;
171 171
172 verfp = nfs_direct_select_verf(dreq, hdr->data->ds_clp, 172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
173 hdr->data->ds_idx); 173 hdr->ds_idx);
174 if (verfp->committed < 0) { 174 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 175 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 176 return 0;
@@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
178 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 178 return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
179} 179}
180 180
181#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
182/* 181/*
183 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data 182 * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
184 * @dreq - direct request possibly spanning multiple servers 183 * @dreq - direct request possibly spanning multiple servers
@@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
197 WARN_ON_ONCE(verfp->committed < 0); 196 WARN_ON_ONCE(verfp->committed < 0);
198 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
199} 198}
200#endif
201 199
202/** 200/**
203 * nfs_direct_IO - NFS address space operation for direct I/O 201 * nfs_direct_IO - NFS address space operation for direct I/O
@@ -222,11 +220,9 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t
222#else 220#else
223 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); 221 VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
224 222
225 if (rw == READ || rw == KERNEL_READ) 223 if (rw == READ)
226 return nfs_file_direct_read(iocb, iter, pos, 224 return nfs_file_direct_read(iocb, iter, pos);
227 rw == READ ? true : false); 225 return nfs_file_direct_write(iocb, iter, pos);
228 return nfs_file_direct_write(iocb, iter, pos,
229 rw == WRITE ? true : false);
230#endif /* CONFIG_NFS_SWAP */ 226#endif /* CONFIG_NFS_SWAP */
231} 227}
232 228
@@ -512,7 +508,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
512 * cache. 508 * cache.
513 */ 509 */
514ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, 510ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
515 loff_t pos, bool uio) 511 loff_t pos)
516{ 512{
517 struct file *file = iocb->ki_filp; 513 struct file *file = iocb->ki_filp;
518 struct address_space *mapping = file->f_mapping; 514 struct address_space *mapping = file->f_mapping;
@@ -576,7 +572,6 @@ out:
576 return result; 572 return result;
577} 573}
578 574
579#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
580static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 575static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
581{ 576{
582 struct nfs_pageio_descriptor desc; 577 struct nfs_pageio_descriptor desc;
@@ -700,22 +695,11 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
700 schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ 695 schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */
701} 696}
702 697
703#else
704static void nfs_direct_write_schedule_work(struct work_struct *work)
705{
706}
707
708static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
709{
710 nfs_direct_complete(dreq, true);
711}
712#endif
713
714static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) 698static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
715{ 699{
716 struct nfs_direct_req *dreq = hdr->dreq; 700 struct nfs_direct_req *dreq = hdr->dreq;
717 struct nfs_commit_info cinfo; 701 struct nfs_commit_info cinfo;
718 int bit = -1; 702 bool request_commit = false;
719 struct nfs_page *req = nfs_list_entry(hdr->pages.next); 703 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
720 704
721 if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) 705 if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
@@ -729,27 +713,20 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
729 dreq->flags = 0; 713 dreq->flags = 0;
730 dreq->error = hdr->error; 714 dreq->error = hdr->error;
731 } 715 }
732 if (dreq->error != 0) 716 if (dreq->error == 0) {
733 bit = NFS_IOHDR_ERROR;
734 else {
735 dreq->count += hdr->good_bytes; 717 dreq->count += hdr->good_bytes;
736 if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) { 718 if (nfs_write_need_commit(hdr)) {
737 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
738 bit = NFS_IOHDR_NEED_RESCHED;
739 } else if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
740 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 719 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
741 bit = NFS_IOHDR_NEED_RESCHED; 720 request_commit = true;
742 else if (dreq->flags == 0) { 721 else if (dreq->flags == 0) {
743 nfs_direct_set_hdr_verf(dreq, hdr); 722 nfs_direct_set_hdr_verf(dreq, hdr);
744 bit = NFS_IOHDR_NEED_COMMIT; 723 request_commit = true;
745 dreq->flags = NFS_ODIRECT_DO_COMMIT; 724 dreq->flags = NFS_ODIRECT_DO_COMMIT;
746 } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { 725 } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
747 if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr)) { 726 request_commit = true;
727 if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
748 dreq->flags = 728 dreq->flags =
749 NFS_ODIRECT_RESCHED_WRITES; 729 NFS_ODIRECT_RESCHED_WRITES;
750 bit = NFS_IOHDR_NEED_RESCHED;
751 } else
752 bit = NFS_IOHDR_NEED_COMMIT;
753 } 730 }
754 } 731 }
755 } 732 }
@@ -759,9 +736,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
759 736
760 req = nfs_list_entry(hdr->pages.next); 737 req = nfs_list_entry(hdr->pages.next);
761 nfs_list_remove_request(req); 738 nfs_list_remove_request(req);
762 switch (bit) { 739 if (request_commit) {
763 case NFS_IOHDR_NEED_RESCHED:
764 case NFS_IOHDR_NEED_COMMIT:
765 kref_get(&req->wb_kref); 740 kref_get(&req->wb_kref);
766 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 741 nfs_mark_request_commit(req, hdr->lseg, &cinfo);
767 } 742 }
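The completion rework trades the tristate 'bit' plus switch for a single request_commit boolean: every outcome now answers one question, whether this request goes on a commit list. Condensed control flow of the patched handler:

/* Condensed from nfs_direct_write_completion() above. */
bool request_commit = false;

if (dreq->error == 0) {
	dreq->count += hdr->good_bytes;
	if (nfs_write_need_commit(hdr)) {
		/* Verifier bookkeeping decides whether a later COMMIT
		 * can be trusted or the writes must be rescheduled. */
		request_commit = true;
	}
}
/* per-request loop elided */
if (request_commit) {
	kref_get(&req->wb_kref);
	nfs_mark_request_commit(req, hdr->lseg, &cinfo);
}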
@@ -902,7 +877,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
902 * is no atomic O_APPEND write facility in the NFS protocol. 877 * is no atomic O_APPEND write facility in the NFS protocol.
903 */ 878 */
904ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, 879ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
905 loff_t pos, bool uio) 880 loff_t pos)
906{ 881{
907 ssize_t result = -EINVAL; 882 ssize_t result = -EINVAL;
908 struct file *file = iocb->ki_filp; 883 struct file *file = iocb->ki_filp;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 4042ff58fe3f..2ab6f00dba5b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -36,6 +36,7 @@
36#include "internal.h" 36#include "internal.h"
37#include "iostat.h" 37#include "iostat.h"
38#include "fscache.h" 38#include "fscache.h"
39#include "pnfs.h"
39 40
40#include "nfstrace.h" 41#include "nfstrace.h"
41 42
@@ -171,7 +172,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
171 ssize_t result; 172 ssize_t result;
172 173
173 if (iocb->ki_filp->f_flags & O_DIRECT) 174 if (iocb->ki_filp->f_flags & O_DIRECT)
174 return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); 175 return nfs_file_direct_read(iocb, to, iocb->ki_pos);
175 176
176 dprintk("NFS: read(%pD2, %zu@%lu)\n", 177 dprintk("NFS: read(%pD2, %zu@%lu)\n",
177 iocb->ki_filp, 178 iocb->ki_filp,
@@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page,
327 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); 328 unsigned int offset = pos & (PAGE_CACHE_SIZE - 1);
328 unsigned int end = offset + len; 329 unsigned int end = offset + len;
329 330
331 if (pnfs_ld_read_whole_page(file->f_mapping->host)) {
332 if (!PageUptodate(page))
333 return 1;
334 return 0;
335 }
336
330 if ((file->f_mode & FMODE_READ) && /* open for read? */ 337 if ((file->f_mode & FMODE_READ) && /* open for read? */
331 !PageUptodate(page) && /* Uptodate? */ 338 !PageUptodate(page) && /* Uptodate? */
332 !PagePrivate(page) && /* i/o request already? */ 339 !PagePrivate(page) && /* i/o request already? */
@@ -361,8 +368,8 @@ start:
361 * Prevent starvation issues if someone is doing a consistency 368 * Prevent starvation issues if someone is doing a consistency
362 * sync-to-disk 369 * sync-to-disk
363 */ 370 */
364 ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING, 371 ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
365 nfs_wait_bit_killable, TASK_KILLABLE); 372 nfs_wait_bit_killable, TASK_KILLABLE);
366 if (ret) 373 if (ret)
367 return ret; 374 return ret;
368 375
@@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
468 475
469 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 476 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
470 477
471 /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not 478 /* Always try to initiate a 'commit' if relevant, but only
472 * doing this memory reclaim for a fs-related allocation. 479 * wait for it if __GFP_WAIT is set. Even then, only wait 1
480 * second and only if the 'bdi' is not congested.
481 * Waiting indefinitely can cause deadlocks when the NFS
482 * server is on this machine, when a new TCP connection is
483 * needed and in other rare cases. There is no particular
484 * need to wait extensively here. A short wait has the
485 * benefit that someone else can worry about the freezer.
473 */ 486 */
474 if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && 487 if (mapping) {
475 !(current->flags & PF_FSTRANS)) { 488 struct nfs_server *nfss = NFS_SERVER(mapping->host);
476 int how = FLUSH_SYNC; 489 nfs_commit_inode(mapping->host, 0);
477 490 if ((gfp & __GFP_WAIT) &&
478 /* Don't let kswapd deadlock waiting for OOM RPC calls */ 491 !bdi_write_congested(&nfss->backing_dev_info)) {
479 if (current_is_kswapd()) 492 wait_on_page_bit_killable_timeout(page, PG_private,
480 how = 0; 493 HZ);
481 nfs_commit_inode(mapping->host, how); 494 if (PagePrivate(page))
495 set_bdi_congested(&nfss->backing_dev_info,
496 BLK_RW_ASYNC);
497 }
482 } 498 }
483 /* If PagePrivate() is set, then the page is not freeable */ 499 /* If PagePrivate() is set, then the page is not freeable */
484 if (PagePrivate(page)) 500 if (PagePrivate(page))
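The mechanism behind the comment block is a 'kick, bounded wait, back off' sequence: start the commit unconditionally but asynchronously, wait at most one second for PG_private to clear (and only when the allocation may block at all), and mark the bdi congested if the page is still busy so reclaim backs off. Reduced to its skeleton:

nfs_commit_inode(mapping->host, 0);	/* async kick, never waits */
if ((gfp & __GFP_WAIT) &&
    !bdi_write_congested(&nfss->backing_dev_info)) {
	/* Bounded, killable wait: up to HZ jiffies, then give up
	 * rather than risk deadlocking against our own server. */
	wait_on_page_bit_killable_timeout(page, PG_private, HZ);
	if (PagePrivate(page))		/* still busy: tell reclaim */
		set_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
}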
@@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page)
539static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, 555static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
540 sector_t *span) 556 sector_t *span)
541{ 557{
558 int ret;
559 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
560
542 *span = sis->pages; 561 *span = sis->pages;
543 return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); 562
563 rcu_read_lock();
564 ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1);
565 rcu_read_unlock();
566
567 return ret;
544} 568}
545 569
546static void nfs_swap_deactivate(struct file *file) 570static void nfs_swap_deactivate(struct file *file)
547{ 571{
548 xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); 572 struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host);
573
574 rcu_read_lock();
575 xs_swapper(rcu_dereference(clnt->cl_xprt), 0);
576 rcu_read_unlock();
549} 577}
550#endif 578#endif
551 579
@@ -648,7 +676,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
648 return result; 676 return result;
649 677
650 if (file->f_flags & O_DIRECT) 678 if (file->f_flags & O_DIRECT)
651 return nfs_file_direct_write(iocb, from, pos, true); 679 return nfs_file_direct_write(iocb, from, pos);
652 680
653 dprintk("NFS: write(%pD2, %zu@%Ld)\n", 681 dprintk("NFS: write(%pD2, %zu@%Ld)\n",
654 file, count, (long long) pos); 682 file, count, (long long) pos);
@@ -891,17 +919,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
891} 919}
892EXPORT_SYMBOL_GPL(nfs_flock); 920EXPORT_SYMBOL_GPL(nfs_flock);
893 921
894/*
895 * There is no protocol support for leases, so we have no way to implement
896 * them correctly in the face of opens by other clients.
897 */
898int nfs_setlease(struct file *file, long arg, struct file_lock **fl)
899{
900 dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg);
901 return -EINVAL;
902}
903EXPORT_SYMBOL_GPL(nfs_setlease);
904
905const struct file_operations nfs_file_operations = { 922const struct file_operations nfs_file_operations = {
906 .llseek = nfs_file_llseek, 923 .llseek = nfs_file_llseek,
907 .read = new_sync_read, 924 .read = new_sync_read,
@@ -918,6 +935,6 @@ const struct file_operations nfs_file_operations = {
918 .splice_read = nfs_file_splice_read, 935 .splice_read = nfs_file_splice_read,
919 .splice_write = iter_file_splice_write, 936 .splice_write = iter_file_splice_write,
920 .check_flags = nfs_check_flags, 937 .check_flags = nfs_check_flags,
921 .setlease = nfs_setlease, 938 .setlease = simple_nosetlease,
922}; 939};
923EXPORT_SYMBOL_GPL(nfs_file_operations); 940EXPORT_SYMBOL_GPL(nfs_file_operations);
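Replacing the local nfs_setlease() stub with the VFS helper is the idiomatic way to refuse leases; any filesystem whose protocol cannot honour them can wire up the same helper (sketch):

/* simple_nosetlease() unconditionally returns -EINVAL, declining
 * leases that the protocol has no way to enforce against other
 * clients. */
const struct file_operations example_fops = {
	.llseek   = generic_file_llseek,
	/* other methods elided */
	.setlease = simple_nosetlease,
};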
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index d2eba1c13b7e..46fab1cb455a 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -84,45 +84,37 @@ filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
84 BUG(); 84 BUG();
85} 85}
86 86
87static void filelayout_reset_write(struct nfs_pgio_data *data) 87static void filelayout_reset_write(struct nfs_pgio_header *hdr)
88{ 88{
89 struct nfs_pgio_header *hdr = data->header; 89 struct rpc_task *task = &hdr->task;
90 struct rpc_task *task = &data->task;
91 90
92 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 91 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
93 dprintk("%s Reset task %5u for i/o through MDS " 92 dprintk("%s Reset task %5u for i/o through MDS "
94 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, 93 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
95 data->task.tk_pid, 94 hdr->task.tk_pid,
96 hdr->inode->i_sb->s_id, 95 hdr->inode->i_sb->s_id,
97 (unsigned long long)NFS_FILEID(hdr->inode), 96 (unsigned long long)NFS_FILEID(hdr->inode),
98 data->args.count, 97 hdr->args.count,
99 (unsigned long long)data->args.offset); 98 (unsigned long long)hdr->args.offset);
100 99
101 task->tk_status = pnfs_write_done_resend_to_mds(hdr->inode, 100 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
102 &hdr->pages,
103 hdr->completion_ops,
104 hdr->dreq);
105 } 101 }
106} 102}
107 103
108static void filelayout_reset_read(struct nfs_pgio_data *data) 104static void filelayout_reset_read(struct nfs_pgio_header *hdr)
109{ 105{
110 struct nfs_pgio_header *hdr = data->header; 106 struct rpc_task *task = &hdr->task;
111 struct rpc_task *task = &data->task;
112 107
113 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 108 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
114 dprintk("%s Reset task %5u for i/o through MDS " 109 dprintk("%s Reset task %5u for i/o through MDS "
115 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, 110 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
116 data->task.tk_pid, 111 hdr->task.tk_pid,
117 hdr->inode->i_sb->s_id, 112 hdr->inode->i_sb->s_id,
118 (unsigned long long)NFS_FILEID(hdr->inode), 113 (unsigned long long)NFS_FILEID(hdr->inode),
119 data->args.count, 114 hdr->args.count,
120 (unsigned long long)data->args.offset); 115 (unsigned long long)hdr->args.offset);
121 116
122 task->tk_status = pnfs_read_done_resend_to_mds(hdr->inode, 117 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
123 &hdr->pages,
124 hdr->completion_ops,
125 hdr->dreq);
126 } 118 }
127} 119}
128 120
@@ -243,18 +235,17 @@ wait_on_recovery:
243/* NFS_PROTO call done callback routines */ 235/* NFS_PROTO call done callback routines */
244 236
245static int filelayout_read_done_cb(struct rpc_task *task, 237static int filelayout_read_done_cb(struct rpc_task *task,
246 struct nfs_pgio_data *data) 238 struct nfs_pgio_header *hdr)
247{ 239{
248 struct nfs_pgio_header *hdr = data->header;
249 int err; 240 int err;
250 241
251 trace_nfs4_pnfs_read(data, task->tk_status); 242 trace_nfs4_pnfs_read(hdr, task->tk_status);
252 err = filelayout_async_handle_error(task, data->args.context->state, 243 err = filelayout_async_handle_error(task, hdr->args.context->state,
253 data->ds_clp, hdr->lseg); 244 hdr->ds_clp, hdr->lseg);
254 245
255 switch (err) { 246 switch (err) {
256 case -NFS4ERR_RESET_TO_MDS: 247 case -NFS4ERR_RESET_TO_MDS:
257 filelayout_reset_read(data); 248 filelayout_reset_read(hdr);
258 return task->tk_status; 249 return task->tk_status;
259 case -EAGAIN: 250 case -EAGAIN:
260 rpc_restart_call_prepare(task); 251 rpc_restart_call_prepare(task);
@@ -270,15 +261,14 @@ static int filelayout_read_done_cb(struct rpc_task *task,
270 * rfc5661 is not clear about which credential should be used. 261 * rfc5661 is not clear about which credential should be used.
271 */ 262 */
272static void 263static void
273filelayout_set_layoutcommit(struct nfs_pgio_data *wdata) 264filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
274{ 265{
275 struct nfs_pgio_header *hdr = wdata->header;
276 266
277 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || 267 if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
278 wdata->res.verf->committed == NFS_FILE_SYNC) 268 hdr->res.verf->committed != NFS_DATA_SYNC)
279 return; 269 return;
280 270
281 pnfs_set_layoutcommit(wdata); 271 pnfs_set_layoutcommit(hdr);
282 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, 272 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
283 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); 273 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
284} 274}
@@ -305,83 +295,82 @@ filelayout_reset_to_mds(struct pnfs_layout_segment *lseg)
305 */ 295 */
306static void filelayout_read_prepare(struct rpc_task *task, void *data) 296static void filelayout_read_prepare(struct rpc_task *task, void *data)
307{ 297{
308 struct nfs_pgio_data *rdata = data; 298 struct nfs_pgio_header *hdr = data;
309 299
310 if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { 300 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
311 rpc_exit(task, -EIO); 301 rpc_exit(task, -EIO);
312 return; 302 return;
313 } 303 }
314 if (filelayout_reset_to_mds(rdata->header->lseg)) { 304 if (filelayout_reset_to_mds(hdr->lseg)) {
315 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); 305 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
316 filelayout_reset_read(rdata); 306 filelayout_reset_read(hdr);
317 rpc_exit(task, 0); 307 rpc_exit(task, 0);
318 return; 308 return;
319 } 309 }
320 rdata->pgio_done_cb = filelayout_read_done_cb; 310 hdr->pgio_done_cb = filelayout_read_done_cb;
321 311
322 if (nfs41_setup_sequence(rdata->ds_clp->cl_session, 312 if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
323 &rdata->args.seq_args, 313 &hdr->args.seq_args,
324 &rdata->res.seq_res, 314 &hdr->res.seq_res,
325 task)) 315 task))
326 return; 316 return;
327 if (nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, 317 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
328 rdata->args.lock_context, FMODE_READ) == -EIO) 318 hdr->args.lock_context, FMODE_READ) == -EIO)
329 rpc_exit(task, -EIO); /* lost lock, terminate I/O */ 319 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
330} 320}
331 321
332static void filelayout_read_call_done(struct rpc_task *task, void *data) 322static void filelayout_read_call_done(struct rpc_task *task, void *data)
333{ 323{
334 struct nfs_pgio_data *rdata = data; 324 struct nfs_pgio_header *hdr = data;
335 325
336 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); 326 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
337 327
338 if (test_bit(NFS_IOHDR_REDO, &rdata->header->flags) && 328 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
339 task->tk_status == 0) { 329 task->tk_status == 0) {
340 nfs41_sequence_done(task, &rdata->res.seq_res); 330 nfs41_sequence_done(task, &hdr->res.seq_res);
341 return; 331 return;
342 } 332 }
343 333
344 /* Note this may cause RPC to be resent */ 334 /* Note this may cause RPC to be resent */
345 rdata->header->mds_ops->rpc_call_done(task, data); 335 hdr->mds_ops->rpc_call_done(task, data);
346} 336}
347 337
348static void filelayout_read_count_stats(struct rpc_task *task, void *data) 338static void filelayout_read_count_stats(struct rpc_task *task, void *data)
349{ 339{
350 struct nfs_pgio_data *rdata = data; 340 struct nfs_pgio_header *hdr = data;
351 341
352 rpc_count_iostats(task, NFS_SERVER(rdata->header->inode)->client->cl_metrics); 342 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
353} 343}
354 344
355static void filelayout_read_release(void *data) 345static void filelayout_read_release(void *data)
356{ 346{
357 struct nfs_pgio_data *rdata = data; 347 struct nfs_pgio_header *hdr = data;
358 struct pnfs_layout_hdr *lo = rdata->header->lseg->pls_layout; 348 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
359 349
360 filelayout_fenceme(lo->plh_inode, lo); 350 filelayout_fenceme(lo->plh_inode, lo);
361 nfs_put_client(rdata->ds_clp); 351 nfs_put_client(hdr->ds_clp);
362 rdata->header->mds_ops->rpc_release(data); 352 hdr->mds_ops->rpc_release(data);
363} 353}
364 354
365static int filelayout_write_done_cb(struct rpc_task *task, 355static int filelayout_write_done_cb(struct rpc_task *task,
366 struct nfs_pgio_data *data) 356 struct nfs_pgio_header *hdr)
367{ 357{
368 struct nfs_pgio_header *hdr = data->header;
369 int err; 358 int err;
370 359
371 trace_nfs4_pnfs_write(data, task->tk_status); 360 trace_nfs4_pnfs_write(hdr, task->tk_status);
372 err = filelayout_async_handle_error(task, data->args.context->state, 361 err = filelayout_async_handle_error(task, hdr->args.context->state,
373 data->ds_clp, hdr->lseg); 362 hdr->ds_clp, hdr->lseg);
374 363
375 switch (err) { 364 switch (err) {
376 case -NFS4ERR_RESET_TO_MDS: 365 case -NFS4ERR_RESET_TO_MDS:
377 filelayout_reset_write(data); 366 filelayout_reset_write(hdr);
378 return task->tk_status; 367 return task->tk_status;
379 case -EAGAIN: 368 case -EAGAIN:
380 rpc_restart_call_prepare(task); 369 rpc_restart_call_prepare(task);
381 return -EAGAIN; 370 return -EAGAIN;
382 } 371 }
383 372
384 filelayout_set_layoutcommit(data); 373 filelayout_set_layoutcommit(hdr);
385 return 0; 374 return 0;
386} 375}
387 376
@@ -414,62 +403,65 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
414 return -EAGAIN; 403 return -EAGAIN;
415 } 404 }
416 405
406 if (data->verf.committed == NFS_UNSTABLE)
407 pnfs_commit_set_layoutcommit(data);
408
417 return 0; 409 return 0;
418} 410}
419 411
420static void filelayout_write_prepare(struct rpc_task *task, void *data) 412static void filelayout_write_prepare(struct rpc_task *task, void *data)
421{ 413{
422 struct nfs_pgio_data *wdata = data; 414 struct nfs_pgio_header *hdr = data;
423 415
424 if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { 416 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
425 rpc_exit(task, -EIO); 417 rpc_exit(task, -EIO);
426 return; 418 return;
427 } 419 }
428 if (filelayout_reset_to_mds(wdata->header->lseg)) { 420 if (filelayout_reset_to_mds(hdr->lseg)) {
429 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); 421 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
430 filelayout_reset_write(wdata); 422 filelayout_reset_write(hdr);
431 rpc_exit(task, 0); 423 rpc_exit(task, 0);
432 return; 424 return;
433 } 425 }
434 if (nfs41_setup_sequence(wdata->ds_clp->cl_session, 426 if (nfs41_setup_sequence(hdr->ds_clp->cl_session,
435 &wdata->args.seq_args, 427 &hdr->args.seq_args,
436 &wdata->res.seq_res, 428 &hdr->res.seq_res,
437 task)) 429 task))
438 return; 430 return;
439 if (nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, 431 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
440 wdata->args.lock_context, FMODE_WRITE) == -EIO) 432 hdr->args.lock_context, FMODE_WRITE) == -EIO)
441 rpc_exit(task, -EIO); /* lost lock, terminate I/O */ 433 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
442} 434}
443 435
444static void filelayout_write_call_done(struct rpc_task *task, void *data) 436static void filelayout_write_call_done(struct rpc_task *task, void *data)
445{ 437{
446 struct nfs_pgio_data *wdata = data; 438 struct nfs_pgio_header *hdr = data;
447 439
448 if (test_bit(NFS_IOHDR_REDO, &wdata->header->flags) && 440 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
449 task->tk_status == 0) { 441 task->tk_status == 0) {
450 nfs41_sequence_done(task, &wdata->res.seq_res); 442 nfs41_sequence_done(task, &hdr->res.seq_res);
451 return; 443 return;
452 } 444 }
453 445
454 /* Note this may cause RPC to be resent */ 446 /* Note this may cause RPC to be resent */
455 wdata->header->mds_ops->rpc_call_done(task, data); 447 hdr->mds_ops->rpc_call_done(task, data);
456} 448}
457 449
458static void filelayout_write_count_stats(struct rpc_task *task, void *data) 450static void filelayout_write_count_stats(struct rpc_task *task, void *data)
459{ 451{
460 struct nfs_pgio_data *wdata = data; 452 struct nfs_pgio_header *hdr = data;
461 453
462 rpc_count_iostats(task, NFS_SERVER(wdata->header->inode)->client->cl_metrics); 454 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
463} 455}
464 456
465static void filelayout_write_release(void *data) 457static void filelayout_write_release(void *data)
466{ 458{
467 struct nfs_pgio_data *wdata = data; 459 struct nfs_pgio_header *hdr = data;
468 struct pnfs_layout_hdr *lo = wdata->header->lseg->pls_layout; 460 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
469 461
470 filelayout_fenceme(lo->plh_inode, lo); 462 filelayout_fenceme(lo->plh_inode, lo);
471 nfs_put_client(wdata->ds_clp); 463 nfs_put_client(hdr->ds_clp);
472 wdata->header->mds_ops->rpc_release(data); 464 hdr->mds_ops->rpc_release(data);
473} 465}
474 466
475static void filelayout_commit_prepare(struct rpc_task *task, void *data) 467static void filelayout_commit_prepare(struct rpc_task *task, void *data)
@@ -529,19 +521,18 @@ static const struct rpc_call_ops filelayout_commit_call_ops = {
529}; 521};
530 522
531static enum pnfs_try_status 523static enum pnfs_try_status
532filelayout_read_pagelist(struct nfs_pgio_data *data) 524filelayout_read_pagelist(struct nfs_pgio_header *hdr)
533{ 525{
534 struct nfs_pgio_header *hdr = data->header;
535 struct pnfs_layout_segment *lseg = hdr->lseg; 526 struct pnfs_layout_segment *lseg = hdr->lseg;
536 struct nfs4_pnfs_ds *ds; 527 struct nfs4_pnfs_ds *ds;
537 struct rpc_clnt *ds_clnt; 528 struct rpc_clnt *ds_clnt;
538 loff_t offset = data->args.offset; 529 loff_t offset = hdr->args.offset;
539 u32 j, idx; 530 u32 j, idx;
540 struct nfs_fh *fh; 531 struct nfs_fh *fh;
541 532
542 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", 533 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
543 __func__, hdr->inode->i_ino, 534 __func__, hdr->inode->i_ino,
544 data->args.pgbase, (size_t)data->args.count, offset); 535 hdr->args.pgbase, (size_t)hdr->args.count, offset);
545 536
546 /* Retrieve the correct rpc_client for the byte range */ 537 /* Retrieve the correct rpc_client for the byte range */
547 j = nfs4_fl_calc_j_index(lseg, offset); 538 j = nfs4_fl_calc_j_index(lseg, offset);
@@ -559,30 +550,29 @@ filelayout_read_pagelist(struct nfs_pgio_data *data)
559 550
560 /* No multipath support. Use first DS */ 551 /* No multipath support. Use first DS */
561 atomic_inc(&ds->ds_clp->cl_count); 552 atomic_inc(&ds->ds_clp->cl_count);
562 data->ds_clp = ds->ds_clp; 553 hdr->ds_clp = ds->ds_clp;
563 data->ds_idx = idx; 554 hdr->ds_idx = idx;
564 fh = nfs4_fl_select_ds_fh(lseg, j); 555 fh = nfs4_fl_select_ds_fh(lseg, j);
565 if (fh) 556 if (fh)
566 data->args.fh = fh; 557 hdr->args.fh = fh;
567 558
568 data->args.offset = filelayout_get_dserver_offset(lseg, offset); 559 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
569 data->mds_offset = offset; 560 hdr->mds_offset = offset;
570 561
571 /* Perform an asynchronous read to ds */ 562 /* Perform an asynchronous read to ds */
572 nfs_initiate_pgio(ds_clnt, data, 563 nfs_initiate_pgio(ds_clnt, hdr,
573 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 564 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
574 return PNFS_ATTEMPTED; 565 return PNFS_ATTEMPTED;
575} 566}
576 567
577/* Perform async writes. */ 568/* Perform async writes. */
578static enum pnfs_try_status 569static enum pnfs_try_status
579filelayout_write_pagelist(struct nfs_pgio_data *data, int sync) 570filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
580{ 571{
581 struct nfs_pgio_header *hdr = data->header;
582 struct pnfs_layout_segment *lseg = hdr->lseg; 572 struct pnfs_layout_segment *lseg = hdr->lseg;
583 struct nfs4_pnfs_ds *ds; 573 struct nfs4_pnfs_ds *ds;
584 struct rpc_clnt *ds_clnt; 574 struct rpc_clnt *ds_clnt;
585 loff_t offset = data->args.offset; 575 loff_t offset = hdr->args.offset;
586 u32 j, idx; 576 u32 j, idx;
587 struct nfs_fh *fh; 577 struct nfs_fh *fh;
588 578
@@ -598,21 +588,20 @@ filelayout_write_pagelist(struct nfs_pgio_data *data, int sync)
598 return PNFS_NOT_ATTEMPTED; 588 return PNFS_NOT_ATTEMPTED;
599 589
600 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n", 590 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d\n",
601 __func__, hdr->inode->i_ino, sync, (size_t) data->args.count, 591 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
602 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); 592 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count));
603 593
604 data->pgio_done_cb = filelayout_write_done_cb; 594 hdr->pgio_done_cb = filelayout_write_done_cb;
605 atomic_inc(&ds->ds_clp->cl_count); 595 atomic_inc(&ds->ds_clp->cl_count);
606 data->ds_clp = ds->ds_clp; 596 hdr->ds_clp = ds->ds_clp;
607 data->ds_idx = idx; 597 hdr->ds_idx = idx;
608 fh = nfs4_fl_select_ds_fh(lseg, j); 598 fh = nfs4_fl_select_ds_fh(lseg, j);
609 if (fh) 599 if (fh)
610 data->args.fh = fh; 600 hdr->args.fh = fh;
611 601 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
612 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
613 602
614 /* Perform an asynchronous write */ 603 /* Perform an asynchronous write */
615 nfs_initiate_pgio(ds_clnt, data, 604 nfs_initiate_pgio(ds_clnt, hdr,
616 &filelayout_write_call_ops, sync, 605 &filelayout_write_call_ops, sync,
617 RPC_TASK_SOFTCONN); 606 RPC_TASK_SOFTCONN);
618 return PNFS_ATTEMPTED; 607 return PNFS_ATTEMPTED;
@@ -660,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
660 } 649 }
661 650
662 /* find and reference the deviceid */ 651 /* find and reference the deviceid */
663 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, 652 d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id,
664 NFS_SERVER(lo->plh_inode)->nfs_client, id); 653 lo->plh_lc_cred, gfp_flags);
665 if (d == NULL) { 654 if (d == NULL)
666 dsaddr = filelayout_get_device_info(lo->plh_inode, id, 655 goto out;
667 lo->plh_lc_cred, gfp_flags); 656
668 if (dsaddr == NULL) 657 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
669 goto out;
670 } else
671 dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
672 /* Found deviceid is unavailable */ 658 /* Found deviceid is unavailable */
673 if (filelayout_test_devid_unavailable(&dsaddr->id_node)) 659 if (filelayout_test_devid_unavailable(&dsaddr->id_node))
674 goto out_put; 660 goto out_put;
675 661
676 fl->dsaddr = dsaddr; 662 fl->dsaddr = dsaddr;
677 663
@@ -1023,6 +1009,7 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1023 1009
1024/* The generic layer is about to remove the req from the commit list. 1010/* The generic layer is about to remove the req from the commit list.
1025 * If this will make the bucket empty, it will need to put the lseg reference. 1011 * If this will make the bucket empty, it will need to put the lseg reference.
1012 * Note this must be called holding the inode (/cinfo) lock
1026 */ 1013 */
1027static void 1014static void
1028filelayout_clear_request_commit(struct nfs_page *req, 1015filelayout_clear_request_commit(struct nfs_page *req,
@@ -1030,7 +1017,6 @@ filelayout_clear_request_commit(struct nfs_page *req,
1030{ 1017{
1031 struct pnfs_layout_segment *freeme = NULL; 1018 struct pnfs_layout_segment *freeme = NULL;
1032 1019
1033 spin_lock(cinfo->lock);
1034 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags)) 1020 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1035 goto out; 1021 goto out;
1036 cinfo->ds->nwritten--; 1022 cinfo->ds->nwritten--;
@@ -1045,22 +1031,25 @@ filelayout_clear_request_commit(struct nfs_page *req,
1045 } 1031 }
1046out: 1032out:
1047 nfs_request_remove_commit_list(req, cinfo); 1033 nfs_request_remove_commit_list(req, cinfo);
1048 spin_unlock(cinfo->lock); 1034 pnfs_put_lseg_locked(freeme);
1049 pnfs_put_lseg(freeme);
1050} 1035}
1051 1036
1052static struct list_head * 1037static void
1053filelayout_choose_commit_list(struct nfs_page *req, 1038filelayout_mark_request_commit(struct nfs_page *req,
1054 struct pnfs_layout_segment *lseg, 1039 struct pnfs_layout_segment *lseg,
1055 struct nfs_commit_info *cinfo) 1040 struct nfs_commit_info *cinfo)
1041
1056{ 1042{
1057 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 1043 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
1058 u32 i, j; 1044 u32 i, j;
1059 struct list_head *list; 1045 struct list_head *list;
1060 struct pnfs_commit_bucket *buckets; 1046 struct pnfs_commit_bucket *buckets;
1061 1047
1062 if (fl->commit_through_mds) 1048 if (fl->commit_through_mds) {
1063 return &cinfo->mds->list; 1049 list = &cinfo->mds->list;
1050 spin_lock(cinfo->lock);
1051 goto mds_commit;
1052 }
1064 1053
1065 /* Note that we are calling nfs4_fl_calc_j_index on each page 1054 /* Note that we are calling nfs4_fl_calc_j_index on each page
1066 * that ends up being committed to a data server. An attractive 1055 * that ends up being committed to a data server. An attractive
@@ -1084,19 +1073,22 @@ filelayout_choose_commit_list(struct nfs_page *req,
1084 } 1073 }
1085 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 1074 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1086 cinfo->ds->nwritten++; 1075 cinfo->ds->nwritten++;
1087 spin_unlock(cinfo->lock);
1088 return list;
1089}
1090
1091static void
1092filelayout_mark_request_commit(struct nfs_page *req,
1093 struct pnfs_layout_segment *lseg,
1094 struct nfs_commit_info *cinfo)
1095{
1096 struct list_head *list;
1097 1076
1098 list = filelayout_choose_commit_list(req, lseg, cinfo); 1077mds_commit:
1099 nfs_request_add_commit_list(req, list, cinfo); 1078 /* nfs_request_add_commit_list(). We need to add req to the list without
1079 * dropping the cinfo lock.
1080 */
1081 set_bit(PG_CLEAN, &(req)->wb_flags);
1082 nfs_list_add_request(req, list);
1083 cinfo->mds->ncommit++;
1084 spin_unlock(cinfo->lock);
1085 if (!cinfo->dreq) {
1086 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1087 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1088 BDI_RECLAIMABLE);
1089 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1090 I_DIRTY_DATASYNC);
1091 }
1100} 1092}
1101 1093
1102static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) 1094static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
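The mds fallback open-codes nfs_request_add_commit_list() because the bucket selection and the list insertion must happen under the same cinfo->lock acquisition. The essential sequence, condensed (the DS-bucket path bumps nwritten instead of ncommit, which this sketch glosses over):

spin_lock(cinfo->lock);
/* choose 'list' (mds list or a DS commit bucket) under the lock */
set_bit(PG_CLEAN, &req->wb_flags);
nfs_list_add_request(req, list);
cinfo->mds->ncommit++;
spin_unlock(cinfo->lock);
if (!cinfo->dreq) {
	/* account the page as unstable/reclaimable and dirty the
	 * inode so a COMMIT gets scheduled later */
	inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
	inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
		     BDI_RECLAIMABLE);
	__mark_inode_dirty(req->wb_context->dentry->d_inode,
			   I_DIRTY_DATASYNC);
}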
@@ -1244,15 +1236,64 @@ restart:
1244 spin_unlock(cinfo->lock); 1236 spin_unlock(cinfo->lock);
1245} 1237}
1246 1238
1239/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
1240 * for @page
1241 * @cinfo - commit info for current inode
1242 * @page - page to search for matching head request
1243 *
1244 * Returns the head request if one is found; otherwise returns NULL.
1245 */
1246static struct nfs_page *
1247filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1248{
1249 struct nfs_page *freq, *t;
1250 struct pnfs_commit_bucket *b;
1251 int i;
1252
1253 /* Linearly search the commit lists for each bucket until a matching
1254 * request is found */
1255 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1256 list_for_each_entry_safe(freq, t, &b->written, wb_list) {
1257 if (freq->wb_page == page)
1258 return freq->wb_head;
1259 }
1260 list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
1261 if (freq->wb_page == page)
1262 return freq->wb_head;
1263 }
1264 }
1265
1266 return NULL;
1267}
1268
1269static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1270{
1271 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1272 struct pnfs_commit_bucket *bucket;
1273 struct pnfs_layout_segment *freeme;
1274 int i;
1275
1276 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1277 bucket = &fl_cinfo->buckets[i];
1278 if (list_empty(&bucket->committing))
1279 continue;
1280 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1281 spin_lock(cinfo->lock);
1282 freeme = bucket->clseg;
1283 bucket->clseg = NULL;
1284 spin_unlock(cinfo->lock);
1285 pnfs_put_lseg(freeme);
1286 }
1287}
1288
1247static unsigned int 1289static unsigned int
1248alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list) 1290alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1249{ 1291{
1250 struct pnfs_ds_commit_info *fl_cinfo; 1292 struct pnfs_ds_commit_info *fl_cinfo;
1251 struct pnfs_commit_bucket *bucket; 1293 struct pnfs_commit_bucket *bucket;
1252 struct nfs_commit_data *data; 1294 struct nfs_commit_data *data;
1253 int i, j; 1295 int i;
1254 unsigned int nreq = 0; 1296 unsigned int nreq = 0;
1255 struct pnfs_layout_segment *freeme;
1256 1297
1257 fl_cinfo = cinfo->ds; 1298 fl_cinfo = cinfo->ds;
1258 bucket = fl_cinfo->buckets; 1299 bucket = fl_cinfo->buckets;
@@ -1272,16 +1313,7 @@ alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1272 } 1313 }
1273 1314
1274 /* Clean up on error */ 1315 /* Clean up on error */
1275 for (j = i; j < fl_cinfo->nbuckets; j++, bucket++) { 1316 filelayout_retry_commit(cinfo, i);
1276 if (list_empty(&bucket->committing))
1277 continue;
1278 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1279 spin_lock(cinfo->lock);
1280 freeme = bucket->clseg;
1281 bucket->clseg = NULL;
1282 spin_unlock(cinfo->lock);
1283 pnfs_put_lseg(freeme);
1284 }
1285 /* Caller will clean up entries put on list */ 1317 /* Caller will clean up entries put on list */
1286 return nreq; 1318 return nreq;
1287} 1319}
@@ -1301,8 +1333,12 @@ filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1301 data->lseg = NULL; 1333 data->lseg = NULL;
1302 list_add(&data->pages, &list); 1334 list_add(&data->pages, &list);
1303 nreq++; 1335 nreq++;
1304 } else 1336 } else {
1305 nfs_retry_commit(mds_pages, NULL, cinfo); 1337 nfs_retry_commit(mds_pages, NULL, cinfo);
1338 filelayout_retry_commit(cinfo, 0);
1339 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1340 return -ENOMEM;
1341 }
1306 } 1342 }
1307 1343
1308 nreq += alloc_ds_commits(cinfo, &list); 1344 nreq += alloc_ds_commits(cinfo, &list);
@@ -1332,6 +1368,17 @@ out:
1332 cinfo->ds->ncommitting = 0; 1368 cinfo->ds->ncommitting = 0;
1333 return PNFS_ATTEMPTED; 1369 return PNFS_ATTEMPTED;
1334} 1370}
1371static struct nfs4_deviceid_node *
1372filelayout_alloc_deviceid_node(struct nfs_server *server,
1373 struct pnfs_device *pdev, gfp_t gfp_flags)
1374{
1375 struct nfs4_file_layout_dsaddr *dsaddr;
1376
1377 dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags);
1378 if (!dsaddr)
1379 return NULL;
1380 return &dsaddr->id_node;
1381}
1335 1382
1336static void 1383static void
1337filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) 1384filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d)
@@ -1380,9 +1427,11 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1380 .clear_request_commit = filelayout_clear_request_commit, 1427 .clear_request_commit = filelayout_clear_request_commit,
1381 .scan_commit_lists = filelayout_scan_commit_lists, 1428 .scan_commit_lists = filelayout_scan_commit_lists,
1382 .recover_commit_reqs = filelayout_recover_commit_reqs, 1429 .recover_commit_reqs = filelayout_recover_commit_reqs,
1430 .search_commit_reqs = filelayout_search_commit_reqs,
1383 .commit_pagelist = filelayout_commit_pagelist, 1431 .commit_pagelist = filelayout_commit_pagelist,
1384 .read_pagelist = filelayout_read_pagelist, 1432 .read_pagelist = filelayout_read_pagelist,
1385 .write_pagelist = filelayout_write_pagelist, 1433 .write_pagelist = filelayout_write_pagelist,
1434 .alloc_deviceid_node = filelayout_alloc_deviceid_node,
1386 .free_deviceid_node = filelayout_free_deveiceid_node, 1435 .free_deviceid_node = filelayout_free_deveiceid_node,
1387}; 1436};
1388 1437
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index ffbddf2219ea..7c9f800c49d7 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
149 u32 ds_idx); 149 u32 ds_idx);
150
151extern struct nfs4_file_layout_dsaddr *
152nfs4_fl_alloc_deviceid_node(struct nfs_server *server,
153 struct pnfs_device *pdev, gfp_t gfp_flags);
150extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 154extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
151extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); 155extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
152struct nfs4_file_layout_dsaddr *
153filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id,
154 struct rpc_cred *cred, gfp_t gfp_flags);
155 156
156#endif /* FS_NFS_NFS4FILELAYOUT_H */ 157#endif /* FS_NFS_NFS4FILELAYOUT_H */
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 44bf0140a4c7..9bb806a76d99 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -484,8 +484,9 @@ out_err:
484} 484}
485 485
486/* Decode opaque device data and return the result */ 486/* Decode opaque device data and return the result */
487static struct nfs4_file_layout_dsaddr* 487struct nfs4_file_layout_dsaddr *
488decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) 488nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
489 gfp_t gfp_flags)
489{ 490{
490 int i; 491 int i;
491 u32 cnt, num; 492 u32 cnt, num;
@@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
570 dsaddr->stripe_indices = stripe_indices; 571 dsaddr->stripe_indices = stripe_indices;
571 stripe_indices = NULL; 572 stripe_indices = NULL;
572 dsaddr->ds_num = num; 573 dsaddr->ds_num = num;
573 nfs4_init_deviceid_node(&dsaddr->id_node, 574 nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id);
574 NFS_SERVER(ino)->pnfs_curr_ld,
575 NFS_SERVER(ino)->nfs_client,
576 &pdev->dev_id);
577 575
578 INIT_LIST_HEAD(&dsaddrs); 576 INIT_LIST_HEAD(&dsaddrs);
579 577
@@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags)
587 585
588 mp_count = be32_to_cpup(p); /* multipath count */ 586 mp_count = be32_to_cpup(p); /* multipath count */
589 for (j = 0; j < mp_count; j++) { 587 for (j = 0; j < mp_count; j++) {
590 da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, 588 da = decode_ds_addr(server->nfs_client->cl_net,
591 &stream, gfp_flags); 589 &stream, gfp_flags);
592 if (da) 590 if (da)
593 list_add_tail(&da->da_node, &dsaddrs); 591 list_add_tail(&da->da_node, &dsaddrs);
@@ -637,102 +635,6 @@ out_err:
637 return NULL; 635 return NULL;
638} 636}
639 637
640/*
641 * Decode the opaque device specified in 'dev' and add it to the cache of
642 * available devices.
643 */
644static struct nfs4_file_layout_dsaddr *
645decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags)
646{
647 struct nfs4_deviceid_node *d;
648 struct nfs4_file_layout_dsaddr *n, *new;
649
650 new = decode_device(inode, dev, gfp_flags);
651 if (!new) {
652 printk(KERN_WARNING "NFS: %s: Could not decode or add device\n",
653 __func__);
654 return NULL;
655 }
656
657 d = nfs4_insert_deviceid_node(&new->id_node);
658 n = container_of(d, struct nfs4_file_layout_dsaddr, id_node);
659 if (n != new) {
660 nfs4_fl_free_deviceid(new);
661 return n;
662 }
663
664 return new;
665}
666
667/*
668 * Retrieve the information for dev_id, add it to the list
669 * of available devices, and return it.
670 */
671struct nfs4_file_layout_dsaddr *
672filelayout_get_device_info(struct inode *inode,
673 struct nfs4_deviceid *dev_id,
674 struct rpc_cred *cred,
675 gfp_t gfp_flags)
676{
677 struct pnfs_device *pdev = NULL;
678 u32 max_resp_sz;
679 int max_pages;
680 struct page **pages = NULL;
681 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
682 int rc, i;
683 struct nfs_server *server = NFS_SERVER(inode);
684
685 /*
686 * Use the session max response size as the basis for setting
687 * GETDEVICEINFO's maxcount
688 */
689 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
690 max_pages = nfs_page_array_len(0, max_resp_sz);
691 dprintk("%s inode %p max_resp_sz %u max_pages %d\n",
692 __func__, inode, max_resp_sz, max_pages);
693
694 pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags);
695 if (pdev == NULL)
696 return NULL;
697
698 pages = kzalloc(max_pages * sizeof(struct page *), gfp_flags);
699 if (pages == NULL) {
700 kfree(pdev);
701 return NULL;
702 }
703 for (i = 0; i < max_pages; i++) {
704 pages[i] = alloc_page(gfp_flags);
705 if (!pages[i])
706 goto out_free;
707 }
708
709 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
710 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
711 pdev->pages = pages;
712 pdev->pgbase = 0;
713 pdev->pglen = max_resp_sz;
714 pdev->mincount = 0;
715 pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
716
717 rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
718 dprintk("%s getdevice info returns %d\n", __func__, rc);
719 if (rc)
720 goto out_free;
721
722 /*
723 * Found new device, need to decode it and then add it to the
724 * list of known devices for this mountpoint.
725 */
726 dsaddr = decode_and_add_device(inode, pdev, gfp_flags);
727out_free:
728 for (i = 0; i < max_pages; i++)
729 __free_page(pages[i]);
730 kfree(pages);
731 kfree(pdev);
732 dprintk("<-- %s dsaddr %p\n", __func__, dsaddr);
733 return dsaddr;
734}
735
736void 638void
737nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 639nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
738{ 640{
@@ -783,8 +685,8 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
783static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) 685static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
784{ 686{
785 might_sleep(); 687 might_sleep();
786 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, 688 wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
787 nfs_wait_bit_killable, TASK_KILLABLE); 689 nfs_wait_bit_killable, TASK_KILLABLE);
788} 690}
789 691
790static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) 692static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
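Note how nfs4_init_deviceid_node() shrinks from four arguments to three in the hunk above: struct nfs_server already carries both the current layout driver and the nfs_client, so passing the server is enough. Inferred from the call sites here (an assumption, not quoted from the header), the updated helper presumably looks like:

	void nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
				     struct nfs_server *server,
				     const struct nfs4_deviceid *id);
	/* internally it can derive what the old parameters supplied,
	 * roughly: d->ld = server->pnfs_curr_ld and
	 * d->nfs_client = server->nfs_client (field names assumed). */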
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 7cf2c4699b08..777b055063f6 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data,
74 struct nfs_server_key *key = buffer; 74 struct nfs_server_key *key = buffer;
75 uint16_t len = sizeof(struct nfs_server_key); 75 uint16_t len = sizeof(struct nfs_server_key);
76 76
77 memset(key, 0, len);
77 key->nfsversion = clp->rpc_ops->version; 78 key->nfsversion = clp->rpc_ops->version;
78 key->family = clp->cl_addr.ss_family; 79 key->family = clp->cl_addr.ss_family;
79 80
80 memset(key, 0, len);
81
82 switch (clp->cl_addr.ss_family) { 81 switch (clp->cl_addr.ss_family) {
83 case AF_INET: 82 case AF_INET:
84 key->port = sin->sin_port; 83 key->port = sin->sin_port;
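The reordering above matters because the old sequence zeroed the buffer after nfsversion and family had been filled in, silently discarding both assignments. A minimal userspace illustration of the same bug pattern (hypothetical struct; only the memset placement is the point):

	#include <string.h>

	struct k { int version; int family; char pad[8]; };

	static void broken(struct k *key)
	{
		key->version = 4;
		memset(key, 0, sizeof(*key));	/* wipes the line above */
	}

	static void fixed(struct k *key)
	{
		memset(key, 0, sizeof(*key));	/* zero padding first ...     */
		key->version = 4;		/* ... then fill in the fields */
	}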
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b94f80420a58..880618a8b048 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -112,7 +112,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
112 * if the dentry tree reaches them; however if the dentry already 112 * if the dentry tree reaches them; however if the dentry already
113 * exists, we'll pick it up at this point and use it as the root 113 * exists, we'll pick it up at this point and use it as the root
114 */ 114 */
115 ret = d_obtain_alias(inode); 115 ret = d_obtain_root(inode);
116 if (IS_ERR(ret)) { 116 if (IS_ERR(ret)) {
117 dprintk("nfs_get_root: get root dentry failed\n"); 117 dprintk("nfs_get_root: get root dentry failed\n");
118 goto out; 118 goto out;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 567983d2c0eb..2f5db844c172 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -174,8 +174,9 @@ static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
174 174
175static struct key_type key_type_id_resolver = { 175static struct key_type key_type_id_resolver = {
176 .name = "id_resolver", 176 .name = "id_resolver",
177 .instantiate = user_instantiate, 177 .preparse = user_preparse,
178 .match = user_match, 178 .free_preparse = user_free_preparse,
179 .instantiate = generic_key_instantiate,
179 .revoke = user_revoke, 180 .revoke = user_revoke,
180 .destroy = user_destroy, 181 .destroy = user_destroy,
181 .describe = user_describe, 182 .describe = user_describe,
@@ -282,6 +283,8 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen,
282 desc, "", 0, idmap); 283 desc, "", 0, idmap);
283 mutex_unlock(&idmap->idmap_mutex); 284 mutex_unlock(&idmap->idmap_mutex);
284 } 285 }
286 if (!IS_ERR(rkey))
287 set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags);
285 288
286 kfree(desc); 289 kfree(desc);
287 return rkey; 290 return rkey;
@@ -394,8 +397,9 @@ static const struct rpc_pipe_ops idmap_upcall_ops = {
394 397
395static struct key_type key_type_id_resolver_legacy = { 398static struct key_type key_type_id_resolver_legacy = {
396 .name = "id_legacy", 399 .name = "id_legacy",
397 .instantiate = user_instantiate, 400 .preparse = user_preparse,
398 .match = user_match, 401 .free_preparse = user_free_preparse,
402 .instantiate = generic_key_instantiate,
399 .revoke = user_revoke, 403 .revoke = user_revoke,
400 .destroy = user_destroy, 404 .destroy = user_destroy,
401 .describe = user_describe, 405 .describe = user_describe,
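Setting KEY_FLAG_ROOT_CAN_INVAL on the upcall result means an administrator can force a stale idmapper mapping out of the kernel keyring instead of waiting for it to expire. A sketch using the raw syscall (the key serial number would come from /proc/keys; KEYCTL_INVALIDATE is from <linux/keyctl.h>):

	#include <linux/keyctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* invalidate one cached id_resolver key by serial number */
	static long key_invalidate(long key_id)
	{
		return syscall(__NR_keyctl, KEYCTL_INVALIDATE, key_id);
	}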
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9927913c97c2..6388a59f2add 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -75,7 +75,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
75 * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks 75 * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
76 * @word: long word containing the bit lock 76 * @word: long word containing the bit lock
77 */ 77 */
78int nfs_wait_bit_killable(void *word) 78int nfs_wait_bit_killable(struct wait_bit_key *key)
79{ 79{
80 if (fatal_signal_pending(current)) 80 if (fatal_signal_pending(current))
81 return -ERESTARTSYS; 81 return -ERESTARTSYS;
@@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
505 attr->ia_valid &= ~ATTR_MODE; 505 attr->ia_valid &= ~ATTR_MODE;
506 506
507 if (attr->ia_valid & ATTR_SIZE) { 507 if (attr->ia_valid & ATTR_SIZE) {
508 if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) 508 BUG_ON(!S_ISREG(inode->i_mode));
509
510 if (attr->ia_size == i_size_read(inode))
509 attr->ia_valid &= ~ATTR_SIZE; 511 attr->ia_valid &= ~ATTR_SIZE;
510 } 512 }
511 513
@@ -716,6 +718,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
716 kfree(new); 718 kfree(new);
717 return res; 719 return res;
718} 720}
721EXPORT_SYMBOL_GPL(nfs_get_lock_context);
719 722
720void nfs_put_lock_context(struct nfs_lock_context *l_ctx) 723void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
721{ 724{
@@ -728,6 +731,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx)
728 spin_unlock(&inode->i_lock); 731 spin_unlock(&inode->i_lock);
729 kfree(l_ctx); 732 kfree(l_ctx);
730} 733}
734EXPORT_SYMBOL_GPL(nfs_put_lock_context);
731 735
732/** 736/**
733 * nfs_close_context - Common close_context() routine NFSv2/v3 737 * nfs_close_context - Common close_context() routine NFSv2/v3
@@ -1002,6 +1006,15 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
1002} 1006}
1003EXPORT_SYMBOL_GPL(nfs_revalidate_inode); 1007EXPORT_SYMBOL_GPL(nfs_revalidate_inode);
1004 1008
1009int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode)
1010{
1011 if (!(NFS_I(inode)->cache_validity &
1012 (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL))
1013 && !nfs_attribute_cache_expired(inode))
1014 return NFS_STALE(inode) ? -ESTALE : 0;
1015 return -ECHILD;
1016}
1017
1005static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) 1018static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping)
1006{ 1019{
1007 struct nfs_inode *nfsi = NFS_I(inode); 1020 struct nfs_inode *nfsi = NFS_I(inode);
@@ -1074,8 +1087,8 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
1074 * the bit lock here if it looks like we're going to be doing that. 1087 * the bit lock here if it looks like we're going to be doing that.
1075 */ 1088 */
1076 for (;;) { 1089 for (;;) {
1077 ret = wait_on_bit(bitlock, NFS_INO_INVALIDATING, 1090 ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING,
1078 nfs_wait_bit_killable, TASK_KILLABLE); 1091 nfs_wait_bit_killable, TASK_KILLABLE);
1079 if (ret) 1092 if (ret)
1080 goto out; 1093 goto out;
1081 spin_lock(&inode->i_lock); 1094 spin_lock(&inode->i_lock);
@@ -1840,11 +1853,12 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
1840static int nfs_net_init(struct net *net) 1853static int nfs_net_init(struct net *net)
1841{ 1854{
1842 nfs_clients_init(net); 1855 nfs_clients_init(net);
1843 return 0; 1856 return nfs_fs_proc_net_init(net);
1844} 1857}
1845 1858
1846static void nfs_net_exit(struct net *net) 1859static void nfs_net_exit(struct net *net)
1847{ 1860{
1861 nfs_fs_proc_net_exit(net);
1848 nfs_cleanup_cb_ident_idr(net); 1862 nfs_cleanup_cb_ident_idr(net);
1849} 1863}
1850 1864
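The nfs_wait_bit_killable() signature change from void * to struct wait_bit_key * tracks a kernel-wide rework in which sleeping helpers become explicit "actions" passed to wait_on_bit_action(). The contract, sketched with an illustrative body (SOME_BIT is a placeholder, and this is not the NFS implementation):

	static int example_wait_action(struct wait_bit_key *key)
	{
		if (fatal_signal_pending(current))
			return -ERESTARTSYS;	/* abort the wait */
		schedule();			/* otherwise sleep ...      */
		return 0;			/* ... and re-test the bit  */
	}

	static int wait_for_bit(unsigned long *word)
	{
		return wait_on_bit_action(word, SOME_BIT,
					  example_wait_action, TASK_KILLABLE);
	}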
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f415cbf9f6c3..efaa31c70fbe 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -195,7 +195,16 @@ extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
195#ifdef CONFIG_PROC_FS 195#ifdef CONFIG_PROC_FS
196extern int __init nfs_fs_proc_init(void); 196extern int __init nfs_fs_proc_init(void);
197extern void nfs_fs_proc_exit(void); 197extern void nfs_fs_proc_exit(void);
198extern int nfs_fs_proc_net_init(struct net *net);
199extern void nfs_fs_proc_net_exit(struct net *net);
198#else 200#else
201static inline int nfs_fs_proc_net_init(struct net *net)
202{
203 return 0;
204}
205static inline void nfs_fs_proc_net_exit(struct net *net)
206{
207}
199static inline int nfs_fs_proc_init(void) 208static inline int nfs_fs_proc_init(void)
200{ 209{
201 return 0; 210 return 0;
@@ -209,13 +218,6 @@ static inline void nfs_fs_proc_exit(void)
209int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); 218int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
210#endif 219#endif
211 220
212/* nfs3client.c */
213#if IS_ENABLED(CONFIG_NFS_V3)
214struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
215struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
216 struct nfs_fattr *, rpc_authflavor_t);
217#endif
218
219/* callback_xdr.c */ 221/* callback_xdr.c */
220extern struct svc_version nfs4_callback_version1; 222extern struct svc_version nfs4_callback_version1;
221extern struct svc_version nfs4_callback_version4; 223extern struct svc_version nfs4_callback_version4;
@@ -238,11 +240,11 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
238int nfs_iocounter_wait(struct nfs_io_counter *c); 240int nfs_iocounter_wait(struct nfs_io_counter *c);
239 241
240extern const struct nfs_pageio_ops nfs_pgio_rw_ops; 242extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
241struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *); 243struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
242void nfs_rw_header_free(struct nfs_pgio_header *); 244void nfs_pgio_header_free(struct nfs_pgio_header *);
243void nfs_pgio_data_release(struct nfs_pgio_data *); 245void nfs_pgio_data_destroy(struct nfs_pgio_header *);
244int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); 246int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
245int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_data *, 247int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
246 const struct rpc_call_ops *, int, int); 248 const struct rpc_call_ops *, int, int);
247void nfs_free_request(struct nfs_page *req); 249void nfs_free_request(struct nfs_page *req);
248 250
@@ -337,7 +339,6 @@ int nfs_file_release(struct inode *, struct file *);
337int nfs_lock(struct file *, int, struct file_lock *); 339int nfs_lock(struct file *, int, struct file_lock *);
338int nfs_flock(struct file *, int, struct file_lock *); 340int nfs_flock(struct file *, int, struct file_lock *);
339int nfs_check_flags(int); 341int nfs_check_flags(int);
340int nfs_setlease(struct file *, long, struct file_lock **);
341 342
342/* inode.c */ 343/* inode.c */
343extern struct workqueue_struct *nfsiod_workqueue; 344extern struct workqueue_struct *nfsiod_workqueue;
@@ -348,7 +349,7 @@ extern int nfs_drop_inode(struct inode *);
348extern void nfs_clear_inode(struct inode *); 349extern void nfs_clear_inode(struct inode *);
349extern void nfs_evict_inode(struct inode *); 350extern void nfs_evict_inode(struct inode *);
350void nfs_zap_acl_cache(struct inode *inode); 351void nfs_zap_acl_cache(struct inode *inode);
351extern int nfs_wait_bit_killable(void *word); 352extern int nfs_wait_bit_killable(struct wait_bit_key *key);
352 353
353/* super.c */ 354/* super.c */
354extern const struct super_operations nfs_sops; 355extern const struct super_operations nfs_sops;
@@ -442,6 +443,7 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
442void nfs_mark_request_commit(struct nfs_page *req, 443void nfs_mark_request_commit(struct nfs_page *req,
443 struct pnfs_layout_segment *lseg, 444 struct pnfs_layout_segment *lseg,
444 struct nfs_commit_info *cinfo); 445 struct nfs_commit_info *cinfo);
446int nfs_write_need_commit(struct nfs_pgio_header *);
445int nfs_generic_commit_list(struct inode *inode, struct list_head *head, 447int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
446 int how, struct nfs_commit_info *cinfo); 448 int how, struct nfs_commit_info *cinfo);
447void nfs_retry_commit(struct list_head *page_list, 449void nfs_retry_commit(struct list_head *page_list,
@@ -482,7 +484,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
482extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); 484extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
483 485
484/* nfs4proc.c */ 486/* nfs4proc.c */
485extern void __nfs4_read_done_cb(struct nfs_pgio_data *); 487extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
486extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, 488extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
487 const struct rpc_timeout *timeparms, 489 const struct rpc_timeout *timeparms,
488 const char *ip_addr); 490 const char *ip_addr);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index 8ee1fab83268..ef221fb8a183 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,6 +29,9 @@ struct nfs_net {
29#endif 29#endif
30 spinlock_t nfs_client_lock; 30 spinlock_t nfs_client_lock;
31 struct timespec boot_time; 31 struct timespec boot_time;
32#ifdef CONFIG_PROC_FS
33 struct proc_dir_entry *proc_nfsfs;
34#endif
32}; 35};
33 36
34extern int nfs_net_id; 37extern int nfs_net_id;
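The new proc_nfsfs pointer gives each network namespace its own directory for the NFS client's proc entries, paired with the nfs_fs_proc_net_init()/exit() hooks declared in internal.h above. One plausible shape for those hooks, sketched under the assumption that only the directory itself is handled here (the real patch may create further entries beneath it):

	#include <net/netns/generic.h>

	int nfs_fs_proc_net_init(struct net *net)
	{
		struct nfs_net *nn = net_generic(net, nfs_net_id);

		nn->proc_nfsfs = proc_net_mkdir(net, "nfsfs", net->proc_net);
		return nn->proc_nfsfs ? 0 : -ENOMEM;
	}

	void nfs_fs_proc_net_exit(struct net *net)
	{
		remove_proc_subtree("nfsfs", net->proc_net);
	}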
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
new file mode 100644
index 000000000000..333ae4068506
--- /dev/null
+++ b/fs/nfs/nfs3_fs.h
@@ -0,0 +1,34 @@
1/*
2 * Copyright (C) 2014 Anna Schumaker.
3 *
4 * NFSv3-specific filesystem definitions and declarations
5 */
6#ifndef __LINUX_FS_NFS_NFS3_FS_H
7#define __LINUX_FS_NFS_NFS3_FS_H
8
9/*
10 * nfs3acl.c
11 */
12#ifdef CONFIG_NFS_V3_ACL
13extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type);
14extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type);
15extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
16 struct posix_acl *dfacl);
17extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
18extern const struct xattr_handler *nfs3_xattr_handlers[];
19#else
20static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
21 struct posix_acl *dfacl)
22{
23 return 0;
24}
25#define nfs3_listxattr NULL
26#endif /* CONFIG_NFS_V3_ACL */
27
28/* nfs3client.c */
29struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *);
30struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
31 struct nfs_fattr *, rpc_authflavor_t);
32
33
34#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 8f854dde4150..658e586ca438 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -7,6 +7,7 @@
7#include <linux/nfsacl.h> 7#include <linux/nfsacl.h>
8 8
9#include "internal.h" 9#include "internal.h"
10#include "nfs3_fs.h"
10 11
11#define NFSDBG_FACILITY NFSDBG_PROC 12#define NFSDBG_FACILITY NFSDBG_PROC
12 13
@@ -129,7 +130,10 @@ static int __nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
129 .rpc_argp = &args, 130 .rpc_argp = &args,
130 .rpc_resp = &fattr, 131 .rpc_resp = &fattr,
131 }; 132 };
132 int status; 133 int status = 0;
134
135 if (acl == NULL && (!S_ISDIR(inode->i_mode) || dfacl == NULL))
136 goto out;
133 137
134 status = -EOPNOTSUPP; 138 status = -EOPNOTSUPP;
135 if (!nfs_server_capable(inode, NFS_CAP_ACLS)) 139 if (!nfs_server_capable(inode, NFS_CAP_ACLS))
@@ -256,7 +260,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data,
256 char *p = data + *result; 260 char *p = data + *result;
257 261
258 acl = get_acl(inode, type); 262 acl = get_acl(inode, type);
259 if (!acl) 263 if (IS_ERR_OR_NULL(acl))
260 return 0; 264 return 0;
261 265
262 posix_acl_release(acl); 266 posix_acl_release(acl);
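The widened check matters because get_acl() has a three-way return: NULL for "no ACL", a valid pointer, or ERR_PTR(-errno); the old !acl test let error pointers fall through to posix_acl_release(). The helper, roughly as defined in <linux/err.h>:

	static inline bool IS_ERR_OR_NULL(const void *ptr)
	{
		return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr);
	}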
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b3fc65ef39ca..8c1b437c5403 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,6 +1,7 @@
1#include <linux/nfs_fs.h> 1#include <linux/nfs_fs.h>
2#include <linux/nfs_mount.h> 2#include <linux/nfs_mount.h>
3#include "internal.h" 3#include "internal.h"
4#include "nfs3_fs.h"
4 5
5#ifdef CONFIG_NFS_V3_ACL 6#ifdef CONFIG_NFS_V3_ACL
6static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; 7static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program };
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index f0afa291fd58..524f9f837408 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -22,6 +22,7 @@
22 22
23#include "iostat.h" 23#include "iostat.h"
24#include "internal.h" 24#include "internal.h"
25#include "nfs3_fs.h"
25 26
26#define NFSDBG_FACILITY NFSDBG_PROC 27#define NFSDBG_FACILITY NFSDBG_PROC
27 28
@@ -795,41 +796,44 @@ nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
795 return status; 796 return status;
796} 797}
797 798
798static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 799static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
799{ 800{
800 struct inode *inode = data->header->inode; 801 struct inode *inode = hdr->inode;
801 802
802 if (nfs3_async_handle_jukebox(task, inode)) 803 if (nfs3_async_handle_jukebox(task, inode))
803 return -EAGAIN; 804 return -EAGAIN;
804 805
805 nfs_invalidate_atime(inode); 806 nfs_invalidate_atime(inode);
806 nfs_refresh_inode(inode, &data->fattr); 807 nfs_refresh_inode(inode, &hdr->fattr);
807 return 0; 808 return 0;
808} 809}
809 810
810static void nfs3_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 811static void nfs3_proc_read_setup(struct nfs_pgio_header *hdr,
812 struct rpc_message *msg)
811{ 813{
812 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ]; 814 msg->rpc_proc = &nfs3_procedures[NFS3PROC_READ];
813} 815}
814 816
815static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 817static int nfs3_proc_pgio_rpc_prepare(struct rpc_task *task,
818 struct nfs_pgio_header *hdr)
816{ 819{
817 rpc_call_start(task); 820 rpc_call_start(task);
818 return 0; 821 return 0;
819} 822}
820 823
821static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 824static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
822{ 825{
823 struct inode *inode = data->header->inode; 826 struct inode *inode = hdr->inode;
824 827
825 if (nfs3_async_handle_jukebox(task, inode)) 828 if (nfs3_async_handle_jukebox(task, inode))
826 return -EAGAIN; 829 return -EAGAIN;
827 if (task->tk_status >= 0) 830 if (task->tk_status >= 0)
828 nfs_post_op_update_inode_force_wcc(inode, data->res.fattr); 831 nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
829 return 0; 832 return 0;
830} 833}
831 834
832static void nfs3_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 835static void nfs3_proc_write_setup(struct nfs_pgio_header *hdr,
836 struct rpc_message *msg)
833{ 837{
834 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE]; 838 msg->rpc_proc = &nfs3_procedures[NFS3PROC_WRITE];
835} 839}
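The mechanical data-to-hdr churn in this file reflects the merge of struct nfs_pgio_data into struct nfs_pgio_header visible in the internal.h hunk earlier; the practical effect at every call site is one less indirection (sketch):

	static struct inode *old_way(struct nfs_pgio_data *data)
	{
		return data->header->inode;	/* two objects, extra hop */
	}

	static struct inode *new_way(struct nfs_pgio_header *hdr)
	{
		return hdr->inode;		/* one object holds args, res, fattr */
	}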
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index d6a98949af19..6af29c2da352 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -4,6 +4,7 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/nfs_fs.h> 5#include <linux/nfs_fs.h>
6#include "internal.h" 6#include "internal.h"
7#include "nfs3_fs.h"
7#include "nfs.h" 8#include "nfs.h"
8 9
9static struct nfs_subversion nfs_v3 = { 10static struct nfs_subversion nfs_v3 = {
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
new file mode 100644
index 000000000000..d10333a197bf
--- /dev/null
+++ b/fs/nfs/nfs42.h
@@ -0,0 +1,14 @@
1/*
2 * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
3 */
4
5#ifndef __LINUX_FS_NFS_NFS4_2_H
6#define __LINUX_FS_NFS_NFS4_2_H
7
8/* nfs4.2proc.c */
9loff_t nfs42_proc_llseek(struct file *, loff_t, int);
10
11/* nfs4.2xdr.h */
12extern struct rpc_procinfo nfs4_2_procedures[];
13
14#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
new file mode 100644
index 000000000000..0886f1db5917
--- /dev/null
+++ b/fs/nfs/nfs42proc.c
@@ -0,0 +1,69 @@
1/*
2 * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
3 */
4#include <linux/fs.h>
5#include <linux/sunrpc/sched.h>
6#include <linux/nfs.h>
7#include <linux/nfs3.h>
8#include <linux/nfs4.h>
9#include <linux/nfs_xdr.h>
10#include <linux/nfs_fs.h>
11#include "nfs4_fs.h"
12#include "nfs42.h"
13
14static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
15 fmode_t fmode)
16{
17 struct nfs_open_context *open;
18 struct nfs_lock_context *lock;
19 int ret;
20
21 open = get_nfs_open_context(nfs_file_open_context(file));
22 lock = nfs_get_lock_context(open);
23 if (IS_ERR(lock)) {
24 put_nfs_open_context(open);
25 return PTR_ERR(lock);
26 }
27
28 ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
29
30 nfs_put_lock_context(lock);
31 put_nfs_open_context(open);
32 return ret;
33}
34
35loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
36{
37 struct inode *inode = file_inode(filep);
38 struct nfs42_seek_args args = {
39 .sa_fh = NFS_FH(inode),
40 .sa_offset = offset,
41 .sa_what = (whence == SEEK_HOLE) ?
42 NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA,
43 };
44 struct nfs42_seek_res res;
45 struct rpc_message msg = {
46 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK],
47 .rpc_argp = &args,
48 .rpc_resp = &res,
49 };
50 struct nfs_server *server = NFS_SERVER(inode);
51 int status;
52
53 if (!(server->caps & NFS_CAP_SEEK))
54 return -ENOTSUPP;
55
56 status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
57 if (status)
58 return status;
59
60 nfs_wb_all(inode);
61 status = nfs4_call_sync(server->client, server, &msg,
62 &args.seq_args, &res.seq_res, 0);
63 if (status == -ENOTSUPP)
64 server->caps &= ~NFS_CAP_SEEK;
65 if (status)
66 return status;
67
68 return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes);
69}
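Seen from userspace, this proc routine is what backs lseek(2) with SEEK_HOLE/SEEK_DATA on an NFSv4.2 mount. A minimal probe (the path is a placeholder; _GNU_SOURCE is needed for the SEEK_* constants on glibc):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/nfs/sparse.bin", O_RDONLY);
		if (fd < 0)
			return perror("open"), 1;

		off_t data = lseek(fd, 0, SEEK_DATA);	 /* first data at/after 0 */
		off_t hole = lseek(fd, data, SEEK_HOLE); /* end of that extent    */
		printf("data at %lld, hole at %lld\n",
		       (long long)data, (long long)hole);
		close(fd);
		return 0;
	}

If the server lacks SEEK, the -ENOTSUPP result clears NFS_CAP_SEEK so later calls skip the RPC, and the caller falls back to the generic llseek, as wired up in nfs4file.c later in this patch.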
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
new file mode 100644
index 000000000000..c90469b604b8
--- /dev/null
+++ b/fs/nfs/nfs42xdr.c
@@ -0,0 +1,98 @@
1/*
2 * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com>
3 */
4#ifndef __LINUX_FS_NFS_NFS4_2XDR_H
5#define __LINUX_FS_NFS_NFS4_2XDR_H
6
7#define encode_seek_maxsz (op_encode_hdr_maxsz + \
8 encode_stateid_maxsz + \
9 2 /* offset */ + \
10 1 /* whence */)
11#define decode_seek_maxsz (op_decode_hdr_maxsz + \
12 1 /* eof */ + \
13 1 /* whence */ + \
14 2 /* offset */ + \
15 2 /* length */)
16
17#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \
18 encode_putfh_maxsz + \
19 encode_seek_maxsz)
20#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \
21 decode_putfh_maxsz + \
22 decode_seek_maxsz)
23
24
25static void encode_seek(struct xdr_stream *xdr,
26 struct nfs42_seek_args *args,
27 struct compound_hdr *hdr)
28{
29 encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr);
30 encode_nfs4_stateid(xdr, &args->sa_stateid);
31 encode_uint64(xdr, args->sa_offset);
32 encode_uint32(xdr, args->sa_what);
33}
34
35/*
36 * Encode SEEK request
37 */
38static void nfs4_xdr_enc_seek(struct rpc_rqst *req,
39 struct xdr_stream *xdr,
40 struct nfs42_seek_args *args)
41{
42 struct compound_hdr hdr = {
43 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
44 };
45
46 encode_compound_hdr(xdr, req, &hdr);
47 encode_sequence(xdr, &args->seq_args, &hdr);
48 encode_putfh(xdr, args->sa_fh, &hdr);
49 encode_seek(xdr, args, &hdr);
50 encode_nops(&hdr);
51}
52
53static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res)
54{
55 int status;
56 __be32 *p;
57
58 status = decode_op_hdr(xdr, OP_SEEK);
59 if (status)
60 return status;
61
62 p = xdr_inline_decode(xdr, 4 + 8);
63 if (unlikely(!p))
64 goto out_overflow;
65
66 res->sr_eof = be32_to_cpup(p++);
67 p = xdr_decode_hyper(p, &res->sr_offset);
68 return 0;
69
70out_overflow:
71 print_overflow_msg(__func__, xdr);
72 return -EIO;
73}
74
75/*
76 * Decode SEEK reply
77 */
78static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp,
79 struct xdr_stream *xdr,
80 struct nfs42_seek_res *res)
81{
82 struct compound_hdr hdr;
83 int status;
84
85 status = decode_compound_hdr(xdr, &hdr);
86 if (status)
87 goto out;
88 status = decode_sequence(xdr, &res->seq_res, rqstp);
89 if (status)
90 goto out;
91 status = decode_putfh(xdr);
92 if (status)
93 goto out;
94 status = decode_seek(xdr, res);
95out:
96 return status;
97}
98#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
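The *_maxsz constants above are counted in 32-bit XDR words. Assuming the usual nfs4xdr.c values (op_encode_hdr_maxsz == 1, op_decode_hdr_maxsz == 2, encode_stateid_maxsz == XDR_QUADLEN(NFS4_STATEID_SIZE) == 4), the per-op budgets work out as:

	/* encode_seek_maxsz = 1 (op hdr) + 4 (stateid) + 2 (offset)
	 *                     + 1 (whence)            = 8 words = 32 bytes
	 * decode_seek_maxsz = 2 (op hdr) + 1 (eof) + 1 (whence)
	 *                     + 2 (offset) + 2 (length) = 8 words = 32 bytes
	 *
	 * decode_seek() pulls eof and offset in one shot, matching:
	 *   xdr_inline_decode(xdr, 4 + 8)   -> one word + one hyper
	 */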
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ba2affa51941..be6cac37ea10 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -54,7 +54,7 @@ struct nfs4_minor_version_ops {
54 const nfs4_stateid *); 54 const nfs4_stateid *);
55 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *, 55 int (*find_root_sec)(struct nfs_server *, struct nfs_fh *,
56 struct nfs_fsinfo *); 56 struct nfs_fsinfo *);
57 int (*free_lock_state)(struct nfs_server *, 57 void (*free_lock_state)(struct nfs_server *,
58 struct nfs4_lock_state *); 58 struct nfs4_lock_state *);
59 const struct rpc_call_ops *call_sync_ops; 59 const struct rpc_call_ops *call_sync_ops;
60 const struct nfs4_state_recovery_ops *reboot_recovery_ops; 60 const struct nfs4_state_recovery_ops *reboot_recovery_ops;
@@ -129,17 +129,6 @@ enum {
129 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) 129 * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
130 */ 130 */
131 131
132struct nfs4_lock_owner {
133 unsigned int lo_type;
134#define NFS4_ANY_LOCK_TYPE (0U)
135#define NFS4_FLOCK_LOCK_TYPE (1U << 0)
136#define NFS4_POSIX_LOCK_TYPE (1U << 1)
137 union {
138 fl_owner_t posix_owner;
139 pid_t flock_owner;
140 } lo_u;
141};
142
143struct nfs4_lock_state { 132struct nfs4_lock_state {
144 struct list_head ls_locks; /* Other lock stateids */ 133 struct list_head ls_locks; /* Other lock stateids */
145 struct nfs4_state * ls_state; /* Pointer to open state */ 134 struct nfs4_state * ls_state; /* Pointer to open state */
@@ -149,7 +138,7 @@ struct nfs4_lock_state {
149 struct nfs_seqid_counter ls_seqid; 138 struct nfs_seqid_counter ls_seqid;
150 nfs4_stateid ls_stateid; 139 nfs4_stateid ls_stateid;
151 atomic_t ls_count; 140 atomic_t ls_count;
152 struct nfs4_lock_owner ls_owner; 141 fl_owner_t ls_owner;
153}; 142};
154 143
155/* bits for nfs4_state->flags */ 144/* bits for nfs4_state->flags */
@@ -237,6 +226,9 @@ int nfs4_replace_transport(struct nfs_server *server,
237 const struct nfs4_fs_locations *locations); 226 const struct nfs4_fs_locations *locations);
238 227
239/* nfs4proc.c */ 228/* nfs4proc.c */
229extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *,
230 struct rpc_message *, struct nfs4_sequence_args *,
231 struct nfs4_sequence_res *, int);
240extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 232extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
241extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 233extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
242extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); 234extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool);
@@ -337,11 +329,11 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
337 */ 329 */
338static inline void 330static inline void
339nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, 331nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
340 struct rpc_message *msg, struct nfs_pgio_data *wdata) 332 struct rpc_message *msg, struct nfs_pgio_header *hdr)
341{ 333{
342 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) && 334 if (_nfs4_state_protect(clp, NFS_SP4_MACH_CRED_WRITE, clntp, msg) &&
343 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags)) 335 !test_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags))
344 wdata->args.stable = NFS_FILE_SYNC; 336 hdr->args.stable = NFS_FILE_SYNC;
345} 337}
346#else /* CONFIG_NFS_v4_1 */ 338#else /* CONFIG_NFS_v4_1 */
347static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 339static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
@@ -369,7 +361,7 @@ nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_flags,
369 361
370static inline void 362static inline void
371nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp, 363nfs4_state_protect_write(struct nfs_client *clp, struct rpc_clnt **clntp,
372 struct rpc_message *msg, struct nfs_pgio_data *wdata) 364 struct rpc_message *msg, struct nfs_pgio_header *hdr)
373{ 365{
374} 366}
375#endif /* CONFIG_NFS_V4_1 */ 367#endif /* CONFIG_NFS_V4_1 */
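Dropping struct nfs4_lock_owner is possible because flock and POSIX locks are now keyed by the same fl_owner_t, so matching a lock state collapses from a tagged-union compare to a plain pointer compare. A sketch of the before/after shape:

	/* before: tag + union */
	static bool old_match(const struct nfs4_lock_owner *lo, fl_owner_t owner)
	{
		return lo->lo_type == NFS4_POSIX_LOCK_TYPE &&
		       lo->lo_u.posix_owner == owner;
	}

	/* after: one field */
	static bool new_match(const struct nfs4_lock_state *ls, fl_owner_t owner)
	{
		return ls->ls_owner == owner;
	}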
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index aa9ef4876046..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
482 482
483 spin_lock(&nn->nfs_client_lock); 483 spin_lock(&nn->nfs_client_lock);
484 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { 484 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
485
486 if (pos->rpc_ops != new->rpc_ops)
487 continue;
488
489 if (pos->cl_proto != new->cl_proto)
490 continue;
491
492 if (pos->cl_minorversion != new->cl_minorversion)
493 continue;
494
485 /* If "pos" isn't marked ready, we can't trust the 495 /* If "pos" isn't marked ready, we can't trust the
486 * remaining fields in "pos" */ 496 * remaining fields in "pos" */
487 if (pos->cl_cons_state > NFS_CS_READY) { 497 if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
501 if (pos->cl_cons_state != NFS_CS_READY) 511 if (pos->cl_cons_state != NFS_CS_READY)
502 continue; 512 continue;
503 513
504 if (pos->rpc_ops != new->rpc_ops)
505 continue;
506
507 if (pos->cl_proto != new->cl_proto)
508 continue;
509
510 if (pos->cl_minorversion != new->cl_minorversion)
511 continue;
512
513 if (pos->cl_clientid != new->cl_clientid) 514 if (pos->cl_clientid != new->cl_clientid)
514 continue; 515 continue;
515 516
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
622 623
623 spin_lock(&nn->nfs_client_lock); 624 spin_lock(&nn->nfs_client_lock);
624 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { 625 list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
626
627 if (pos->rpc_ops != new->rpc_ops)
628 continue;
629
630 if (pos->cl_proto != new->cl_proto)
631 continue;
632
633 if (pos->cl_minorversion != new->cl_minorversion)
634 continue;
635
625 /* If "pos" isn't marked ready, we can't trust the 636 /* If "pos" isn't marked ready, we can't trust the
626 * remaining fields in "pos", especially the client 637 * remaining fields in "pos", especially the client
627 * ID and serverowner fields. Wait for CREATE_SESSION 638 * ID and serverowner fields. Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
647 if (pos->cl_cons_state != NFS_CS_READY) 658 if (pos->cl_cons_state != NFS_CS_READY)
648 continue; 659 continue;
649 660
650 if (pos->rpc_ops != new->rpc_ops)
651 continue;
652
653 if (pos->cl_proto != new->cl_proto)
654 continue;
655
656 if (pos->cl_minorversion != new->cl_minorversion)
657 continue;
658
659 if (!nfs4_match_clientids(pos, new)) 661 if (!nfs4_match_clientids(pos, new))
660 continue; 662 continue;
661 663
@@ -855,6 +857,11 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
855 }; 857 };
856 struct rpc_timeout ds_timeout; 858 struct rpc_timeout ds_timeout;
857 struct nfs_client *clp; 859 struct nfs_client *clp;
860 char buf[INET6_ADDRSTRLEN + 1];
861
862 if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
863 return ERR_PTR(-EINVAL);
864 cl_init.hostname = buf;
858 865
859 /* 866 /*
860 * Set an authflavor equal to the MDS value. Use the MDS nfs_client 867 * Set an authflavor equal to the MDS value. Use the MDS nfs_client
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index a816f0627a6c..c51fb4db9bfe 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -8,6 +8,10 @@
8#include "fscache.h" 8#include "fscache.h"
9#include "pnfs.h" 9#include "pnfs.h"
10 10
11#ifdef CONFIG_NFS_V4_2
12#include "nfs42.h"
13#endif
14
11#define NFSDBG_FACILITY NFSDBG_FILE 15#define NFSDBG_FACILITY NFSDBG_FILE
12 16
13static int 17static int
@@ -115,8 +119,29 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
115 return ret; 119 return ret;
116} 120}
117 121
122#ifdef CONFIG_NFS_V4_2
123static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence)
124{
125 loff_t ret;
126
127 switch (whence) {
128 case SEEK_HOLE:
129 case SEEK_DATA:
130 ret = nfs42_proc_llseek(filep, offset, whence);
131 if (ret != -ENOTSUPP)
132 return ret;
133 default:
134 return nfs_file_llseek(filep, offset, whence);
135 }
136}
137#endif /* CONFIG_NFS_V4_2 */
138
118const struct file_operations nfs4_file_operations = { 139const struct file_operations nfs4_file_operations = {
140#ifdef CONFIG_NFS_V4_2
141 .llseek = nfs4_file_llseek,
142#else
119 .llseek = nfs_file_llseek, 143 .llseek = nfs_file_llseek,
144#endif
120 .read = new_sync_read, 145 .read = new_sync_read,
121 .write = new_sync_write, 146 .write = new_sync_write,
122 .read_iter = nfs_file_read, 147 .read_iter = nfs_file_read,
@@ -131,5 +156,5 @@ const struct file_operations nfs4_file_operations = {
131 .splice_read = nfs_file_splice_read, 156 .splice_read = nfs_file_splice_read,
132 .splice_write = iter_file_splice_write, 157 .splice_write = iter_file_splice_write,
133 .check_flags = nfs_check_flags, 158 .check_flags = nfs_check_flags,
134 .setlease = nfs_setlease, 159 .setlease = simple_nosetlease,
135}; 160};
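Two details worth noting in this file: the llseek wrapper deliberately falls through from the SEEK_HOLE/SEEK_DATA cases into default when the v4.2 SEEK answers -ENOTSUPP, and leases are now refused via the generic VFS helper rather than an NFS-private one. Consistent with the fs/libfs.c naming (an assumption, not quoted from that file), simple_nosetlease presumably amounts to:

	int simple_nosetlease(struct file *filp, long arg,
			      struct file_lock **flp)
	{
		return -EINVAL;		/* leases not supported */
	}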
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 4bf3d97cc5a0..405bd95c1f58 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -77,7 +77,7 @@ struct nfs4_opendata;
77static int _nfs4_proc_open(struct nfs4_opendata *data); 77static int _nfs4_proc_open(struct nfs4_opendata *data);
78static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 78static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 79static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 80static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); 81static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); 82static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); 83static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
314 kunmap_atomic(start); 314 kunmap_atomic(start);
315} 315}
316 316
317static long nfs4_update_delay(long *timeout)
318{
319 long ret;
320 if (!timeout)
321 return NFS4_POLL_RETRY_MAX;
322 if (*timeout <= 0)
323 *timeout = NFS4_POLL_RETRY_MIN;
324 if (*timeout > NFS4_POLL_RETRY_MAX)
325 *timeout = NFS4_POLL_RETRY_MAX;
326 ret = *timeout;
327 *timeout <<= 1;
328 return ret;
329}
330
317static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) 331static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
318{ 332{
319 int res = 0; 333 int res = 0;
320 334
321 might_sleep(); 335 might_sleep();
322 336
323 if (*timeout <= 0) 337 freezable_schedule_timeout_killable_unsafe(
324 *timeout = NFS4_POLL_RETRY_MIN; 338 nfs4_update_delay(timeout));
325 if (*timeout > NFS4_POLL_RETRY_MAX)
326 *timeout = NFS4_POLL_RETRY_MAX;
327 freezable_schedule_timeout_killable_unsafe(*timeout);
328 if (fatal_signal_pending(current)) 339 if (fatal_signal_pending(current))
329 res = -ERESTARTSYS; 340 res = -ERESTARTSYS;
330 *timeout <<= 1;
331 return res; 341 return res;
332} 342}
333 343
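Factoring out nfs4_update_delay() keeps the doubling-with-clamp retry behaviour: each call returns the current timeout and doubles the stored value for next time, bounded by NFS4_POLL_RETRY_MIN/MAX (0.1 s and 15 s worth of jiffies in this file, if unchanged). Worked through with HZ == 1000:

	/* NFS4_POLL_RETRY_MIN = 100, NFS4_POLL_RETRY_MAX = 15000 jiffies
	 *   *timeout in :     0    200    400 ...  25600   30000
	 *   returned    :   100    200    400 ...  15000   15000
	 *   *timeout out:   200    400    800 ...  30000   30000
	 * a NULL timeout pointer simply yields the 15 s maximum. */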
@@ -875,7 +885,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
875 return ret; 885 return ret;
876} 886}
877 887
878static
879int nfs4_call_sync(struct rpc_clnt *clnt, 888int nfs4_call_sync(struct rpc_clnt *clnt,
880 struct nfs_server *server, 889 struct nfs_server *server,
881 struct rpc_message *msg, 890 struct rpc_message *msg,
@@ -1307,15 +1316,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
1307 int ret = -EAGAIN; 1316 int ret = -EAGAIN;
1308 1317
1309 for (;;) { 1318 for (;;) {
1319 spin_lock(&state->owner->so_lock);
1310 if (can_open_cached(state, fmode, open_mode)) { 1320 if (can_open_cached(state, fmode, open_mode)) {
1311 spin_lock(&state->owner->so_lock); 1321 update_open_stateflags(state, fmode);
1312 if (can_open_cached(state, fmode, open_mode)) {
1313 update_open_stateflags(state, fmode);
1314 spin_unlock(&state->owner->so_lock);
1315 goto out_return_state;
1316 }
1317 spin_unlock(&state->owner->so_lock); 1322 spin_unlock(&state->owner->so_lock);
1323 goto out_return_state;
1318 } 1324 }
1325 spin_unlock(&state->owner->so_lock);
1319 rcu_read_lock(); 1326 rcu_read_lock();
1320 delegation = rcu_dereference(nfsi->delegation); 1327 delegation = rcu_dereference(nfsi->delegation);
1321 if (!can_open_delegated(delegation, fmode)) { 1328 if (!can_open_delegated(delegation, fmode)) {
@@ -1952,6 +1959,14 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
1952 return status; 1959 return status;
1953} 1960}
1954 1961
1962/*
1963 * Additional permission checks in order to distinguish between an
1964 * open for read, and an open for execute. This works around the
1965 * fact that NFSv4 OPEN treats read and execute permissions as being
1966 * the same.
1967 * Note that in the non-execute case, we want to turn off permission
1968 * checking if we just created a new file (POSIX open() semantics).
1969 */
1955static int nfs4_opendata_access(struct rpc_cred *cred, 1970static int nfs4_opendata_access(struct rpc_cred *cred,
1956 struct nfs4_opendata *opendata, 1971 struct nfs4_opendata *opendata,
1957 struct nfs4_state *state, fmode_t fmode, 1972 struct nfs4_state *state, fmode_t fmode,
@@ -1966,14 +1981,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
1966 return 0; 1981 return 0;
1967 1982
1968 mask = 0; 1983 mask = 0;
1969 /* don't check MAY_WRITE - a newly created file may not have 1984 /*
1970 * write mode bits, but POSIX allows the creating process to write. 1985 * Use openflags to check for exec, because fmode won't
1971 * use openflags to check for exec, because fmode won't 1986 * always have FMODE_EXEC set when file open for exec.
1972 * always have FMODE_EXEC set when file open for exec. */ 1987 */
1973 if (openflags & __FMODE_EXEC) { 1988 if (openflags & __FMODE_EXEC) {
1974 /* ONLY check for exec rights */ 1989 /* ONLY check for exec rights */
1975 mask = MAY_EXEC; 1990 mask = MAY_EXEC;
1976 } else if (fmode & FMODE_READ) 1991 } else if ((fmode & FMODE_READ) && !opendata->file_created)
1977 mask = MAY_READ; 1992 mask = MAY_READ;
1978 1993
1979 cache.cred = cred; 1994 cache.cred = cred;
@@ -2216,8 +2231,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
2216 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); 2231 seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
2217 2232
2218 ret = _nfs4_proc_open(opendata); 2233 ret = _nfs4_proc_open(opendata);
2219 if (ret != 0) 2234 if (ret != 0) {
2235 if (ret == -ENOENT) {
2236 dentry = opendata->dentry;
2237 if (dentry->d_inode)
2238 d_delete(dentry);
2239 else if (d_unhashed(dentry))
2240 d_add(dentry, NULL);
2241
2242 nfs_set_verifier(dentry,
2243 nfs_save_change_attribute(opendata->dir->d_inode));
2244 }
2220 goto out; 2245 goto out;
2246 }
2221 2247
2222 state = nfs4_opendata_to_nfs4_state(opendata); 2248 state = nfs4_opendata_to_nfs4_state(opendata);
2223 ret = PTR_ERR(state); 2249 ret = PTR_ERR(state);
@@ -2545,6 +2571,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2545 struct nfs4_closedata *calldata = data; 2571 struct nfs4_closedata *calldata = data;
2546 struct nfs4_state *state = calldata->state; 2572 struct nfs4_state *state = calldata->state;
2547 struct nfs_server *server = NFS_SERVER(calldata->inode); 2573 struct nfs_server *server = NFS_SERVER(calldata->inode);
2574 nfs4_stateid *res_stateid = NULL;
2548 2575
2549 dprintk("%s: begin!\n", __func__); 2576 dprintk("%s: begin!\n", __func__);
2550 if (!nfs4_sequence_done(task, &calldata->res.seq_res)) 2577 if (!nfs4_sequence_done(task, &calldata->res.seq_res))
@@ -2555,12 +2582,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2555 */ 2582 */
2556 switch (task->tk_status) { 2583 switch (task->tk_status) {
2557 case 0: 2584 case 0:
2558 if (calldata->roc) 2585 res_stateid = &calldata->res.stateid;
2586 if (calldata->arg.fmode == 0 && calldata->roc)
2559 pnfs_roc_set_barrier(state->inode, 2587 pnfs_roc_set_barrier(state->inode,
2560 calldata->roc_barrier); 2588 calldata->roc_barrier);
2561 nfs_clear_open_stateid(state, &calldata->res.stateid, 0);
2562 renew_lease(server, calldata->timestamp); 2589 renew_lease(server, calldata->timestamp);
2563 goto out_release; 2590 break;
2564 case -NFS4ERR_ADMIN_REVOKED: 2591 case -NFS4ERR_ADMIN_REVOKED:
2565 case -NFS4ERR_STALE_STATEID: 2592 case -NFS4ERR_STALE_STATEID:
2566 case -NFS4ERR_OLD_STATEID: 2593 case -NFS4ERR_OLD_STATEID:
@@ -2569,12 +2596,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2569 if (calldata->arg.fmode == 0) 2596 if (calldata->arg.fmode == 0)
2570 break; 2597 break;
2571 default: 2598 default:
2572 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { 2599 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) {
2573 rpc_restart_call_prepare(task); 2600 rpc_restart_call_prepare(task);
2574 goto out_release; 2601 goto out_release;
2575 } 2602 }
2576 } 2603 }
2577 nfs_clear_open_stateid(state, NULL, calldata->arg.fmode); 2604 nfs_clear_open_stateid(state, res_stateid, calldata->arg.fmode);
2578out_release: 2605out_release:
2579 nfs_release_seqid(calldata->arg.seqid); 2606 nfs_release_seqid(calldata->arg.seqid);
2580 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2607 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
@@ -2586,6 +2613,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2586 struct nfs4_closedata *calldata = data; 2613 struct nfs4_closedata *calldata = data;
2587 struct nfs4_state *state = calldata->state; 2614 struct nfs4_state *state = calldata->state;
2588 struct inode *inode = calldata->inode; 2615 struct inode *inode = calldata->inode;
2616 bool is_rdonly, is_wronly, is_rdwr;
2589 int call_close = 0; 2617 int call_close = 0;
2590 2618
2591 dprintk("%s: begin!\n", __func__); 2619 dprintk("%s: begin!\n", __func__);
@@ -2593,21 +2621,27 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2593 goto out_wait; 2621 goto out_wait;
2594 2622
2595 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; 2623 task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
2596 calldata->arg.fmode = FMODE_READ|FMODE_WRITE;
2597 spin_lock(&state->owner->so_lock); 2624 spin_lock(&state->owner->so_lock);
2625 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2626 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2627 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2598 /* Calculate the change in open mode */ 2628 /* Calculate the change in open mode */
2629 calldata->arg.fmode = 0;
2599 if (state->n_rdwr == 0) { 2630 if (state->n_rdwr == 0) {
2600 if (state->n_rdonly == 0) { 2631 if (state->n_rdonly == 0)
2601 call_close |= test_bit(NFS_O_RDONLY_STATE, &state->flags); 2632 call_close |= is_rdonly;
2602 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); 2633 else if (is_rdonly)
2603 calldata->arg.fmode &= ~FMODE_READ; 2634 calldata->arg.fmode |= FMODE_READ;
2604 } 2635 if (state->n_wronly == 0)
2605 if (state->n_wronly == 0) { 2636 call_close |= is_wronly;
2606 call_close |= test_bit(NFS_O_WRONLY_STATE, &state->flags); 2637 else if (is_wronly)
2607 call_close |= test_bit(NFS_O_RDWR_STATE, &state->flags); 2638 calldata->arg.fmode |= FMODE_WRITE;
2608 calldata->arg.fmode &= ~FMODE_WRITE; 2639 } else if (is_rdwr)
2609 } 2640 calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
2610 } 2641
2642 if (calldata->arg.fmode == 0)
2643 call_close |= is_rdwr;
2644
2611 if (!nfs4_valid_open_stateid(state)) 2645 if (!nfs4_valid_open_stateid(state))
2612 call_close = 0; 2646 call_close = 0;
2613 spin_unlock(&state->owner->so_lock); 2647 spin_unlock(&state->owner->so_lock);
@@ -2647,6 +2681,48 @@ static const struct rpc_call_ops nfs4_close_ops = {
2647 .rpc_release = nfs4_free_closedata, 2681 .rpc_release = nfs4_free_closedata,
2648}; 2682};
2649 2683
2684static bool nfs4_state_has_opener(struct nfs4_state *state)
2685{
2686 /* first check existing openers */
2687 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2688 state->n_rdonly != 0)
2689 return true;
2690
2691 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2692 state->n_wronly != 0)
2693 return true;
2694
2695 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2696 state->n_rdwr != 0)
2697 return true;
2698
2699 return false;
2700}
2701
2702static bool nfs4_roc(struct inode *inode)
2703{
2704 struct nfs_inode *nfsi = NFS_I(inode);
2705 struct nfs_open_context *ctx;
2706 struct nfs4_state *state;
2707
2708 spin_lock(&inode->i_lock);
2709 list_for_each_entry(ctx, &nfsi->open_files, list) {
2710 state = ctx->state;
2711 if (state == NULL)
2712 continue;
2713 if (nfs4_state_has_opener(state)) {
2714 spin_unlock(&inode->i_lock);
2715 return false;
2716 }
2717 }
2718 spin_unlock(&inode->i_lock);
2719
2720 if (nfs4_check_delegation(inode, FMODE_READ))
2721 return false;
2722
2723 return pnfs_roc(inode);
2724}
2725
2650/* 2726/*
2651 * It is possible for data to be read/written from a mem-mapped file 2727 * It is possible for data to be read/written from a mem-mapped file
2652 * after the sys_close call (which hits the vfs layer as a flush). 2728 * after the sys_close call (which hits the vfs layer as a flush).
@@ -2697,7 +2773,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2697 calldata->res.fattr = &calldata->fattr; 2773 calldata->res.fattr = &calldata->fattr;
2698 calldata->res.seqid = calldata->arg.seqid; 2774 calldata->res.seqid = calldata->arg.seqid;
2699 calldata->res.server = server; 2775 calldata->res.server = server;
2700 calldata->roc = pnfs_roc(state->inode); 2776 calldata->roc = nfs4_roc(state->inode);
2701 nfs_sb_active(calldata->inode->i_sb); 2777 nfs_sb_active(calldata->inode->i_sb);
2702 2778
2703 msg.rpc_argp = &calldata->arg; 2779 msg.rpc_argp = &calldata->arg;
@@ -3148,7 +3224,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
3148 struct nfs4_label *label = NULL; 3224 struct nfs4_label *label = NULL;
3149 int status; 3225 int status;
3150 3226
3151 if (pnfs_ld_layoutret_on_setattr(inode)) 3227 if (pnfs_ld_layoutret_on_setattr(inode) &&
3228 sattr->ia_valid & ATTR_SIZE &&
3229 sattr->ia_size < i_size_read(inode))
3152 pnfs_commit_and_return_layout(inode); 3230 pnfs_commit_and_return_layout(inode);
3153 3231
3154 nfs_fattr_init(fattr); 3232 nfs_fattr_init(fattr);
@@ -3507,7 +3585,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
3507 3585
3508 if (!nfs4_sequence_done(task, &res->seq_res)) 3586 if (!nfs4_sequence_done(task, &res->seq_res))
3509 return 0; 3587 return 0;
3510 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3588 if (nfs4_async_handle_error(task, res->server, NULL,
3589 &data->timeout) == -EAGAIN)
3511 return 0; 3590 return 0;
3512 update_changeattr(dir, &res->cinfo); 3591 update_changeattr(dir, &res->cinfo);
3513 return 1; 3592 return 1;
@@ -3540,7 +3619,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir,
3540 3619
3541 if (!nfs4_sequence_done(task, &res->seq_res)) 3620 if (!nfs4_sequence_done(task, &res->seq_res))
3542 return 0; 3621 return 0;
3543 if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) 3622 if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN)
3544 return 0; 3623 return 0;
3545 3624
3546 update_changeattr(old_dir, &res->old_cinfo); 3625 update_changeattr(old_dir, &res->old_cinfo);
@@ -4033,24 +4112,26 @@ static bool nfs4_error_stateid_expired(int err)
4033 return false; 4112 return false;
4034} 4113}
4035 4114
4036void __nfs4_read_done_cb(struct nfs_pgio_data *data) 4115void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
4037{ 4116{
4038 nfs_invalidate_atime(data->header->inode); 4117 nfs_invalidate_atime(hdr->inode);
4039} 4118}
4040 4119
4041static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) 4120static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
4042{ 4121{
4043 struct nfs_server *server = NFS_SERVER(data->header->inode); 4122 struct nfs_server *server = NFS_SERVER(hdr->inode);
4044 4123
4045 trace_nfs4_read(data, task->tk_status); 4124 trace_nfs4_read(hdr, task->tk_status);
4046 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 4125 if (nfs4_async_handle_error(task, server,
4126 hdr->args.context->state,
4127 NULL) == -EAGAIN) {
4047 rpc_restart_call_prepare(task); 4128 rpc_restart_call_prepare(task);
4048 return -EAGAIN; 4129 return -EAGAIN;
4049 } 4130 }
4050 4131
4051 __nfs4_read_done_cb(data); 4132 __nfs4_read_done_cb(hdr);
4052 if (task->tk_status > 0) 4133 if (task->tk_status > 0)
4053 renew_lease(server, data->timestamp); 4134 renew_lease(server, hdr->timestamp);
4054 return 0; 4135 return 0;
4055} 4136}
4056 4137
@@ -4068,54 +4149,60 @@ static bool nfs4_read_stateid_changed(struct rpc_task *task,
4068 return true; 4149 return true;
4069} 4150}
4070 4151
4071static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_data *data) 4152static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
4072{ 4153{
4073 4154
4074 dprintk("--> %s\n", __func__); 4155 dprintk("--> %s\n", __func__);
4075 4156
4076 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4157 if (!nfs4_sequence_done(task, &hdr->res.seq_res))
4077 return -EAGAIN; 4158 return -EAGAIN;
4078 if (nfs4_read_stateid_changed(task, &data->args)) 4159 if (nfs4_read_stateid_changed(task, &hdr->args))
4079 return -EAGAIN; 4160 return -EAGAIN;
4080 return data->pgio_done_cb ? data->pgio_done_cb(task, data) : 4161 return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
4081 nfs4_read_done_cb(task, data); 4162 nfs4_read_done_cb(task, hdr);
4082} 4163}
4083 4164
4084static void nfs4_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 4165static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
4166 struct rpc_message *msg)
4085{ 4167{
4086 data->timestamp = jiffies; 4168 hdr->timestamp = jiffies;
4087 data->pgio_done_cb = nfs4_read_done_cb; 4169 hdr->pgio_done_cb = nfs4_read_done_cb;
4088 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 4170 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
4089 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 4171 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
4090} 4172}
4091 4173
4092static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data) 4174static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task,
4175 struct nfs_pgio_header *hdr)
4093{ 4176{
4094 if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), 4177 if (nfs4_setup_sequence(NFS_SERVER(hdr->inode),
4095 &data->args.seq_args, 4178 &hdr->args.seq_args,
4096 &data->res.seq_res, 4179 &hdr->res.seq_res,
4097 task)) 4180 task))
4098 return 0; 4181 return 0;
4099 if (nfs4_set_rw_stateid(&data->args.stateid, data->args.context, 4182 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
4100 data->args.lock_context, data->header->rw_ops->rw_mode) == -EIO) 4183 hdr->args.lock_context,
4184 hdr->rw_ops->rw_mode) == -EIO)
4101 return -EIO; 4185 return -EIO;
4102 if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) 4186 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags)))
4103 return -EIO; 4187 return -EIO;
4104 return 0; 4188 return 0;
4105} 4189}
4106 4190
4107static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_data *data) 4191static int nfs4_write_done_cb(struct rpc_task *task,
4192 struct nfs_pgio_header *hdr)
4108{ 4193{
4109 struct inode *inode = data->header->inode; 4194 struct inode *inode = hdr->inode;
4110 4195
4111 trace_nfs4_write(data, task->tk_status); 4196 trace_nfs4_write(hdr, task->tk_status);
4112 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 4197 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4198 hdr->args.context->state,
4199 NULL) == -EAGAIN) {
4113 rpc_restart_call_prepare(task); 4200 rpc_restart_call_prepare(task);
4114 return -EAGAIN; 4201 return -EAGAIN;
4115 } 4202 }
4116 if (task->tk_status >= 0) { 4203 if (task->tk_status >= 0) {
4117 renew_lease(NFS_SERVER(inode), data->timestamp); 4204 renew_lease(NFS_SERVER(inode), hdr->timestamp);
4118 nfs_post_op_update_inode_force_wcc(inode, &data->fattr); 4205 nfs_post_op_update_inode_force_wcc(inode, &hdr->fattr);
4119 } 4206 }
4120 return 0; 4207 return 0;
4121} 4208}
@@ -4134,23 +4221,21 @@ static bool nfs4_write_stateid_changed(struct rpc_task *task,
4134 return true; 4221 return true;
4135} 4222}
4136 4223
4137static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_data *data) 4224static int nfs4_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
4138{ 4225{
4139 if (!nfs4_sequence_done(task, &data->res.seq_res)) 4226 if (!nfs4_sequence_done(task, &hdr->res.seq_res))
4140 return -EAGAIN; 4227 return -EAGAIN;
4141 if (nfs4_write_stateid_changed(task, &data->args)) 4228 if (nfs4_write_stateid_changed(task, &hdr->args))
4142 return -EAGAIN; 4229 return -EAGAIN;
4143 return data->pgio_done_cb ? data->pgio_done_cb(task, data) : 4230 return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
4144 nfs4_write_done_cb(task, data); 4231 nfs4_write_done_cb(task, hdr);
4145} 4232}
4146 4233
4147static 4234static
4148bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data) 4235bool nfs4_write_need_cache_consistency_data(struct nfs_pgio_header *hdr)
4149{ 4236{
4150 const struct nfs_pgio_header *hdr = data->header;
4151
4152 /* Don't request attributes for pNFS or O_DIRECT writes */ 4237 /* Don't request attributes for pNFS or O_DIRECT writes */
4153 if (data->ds_clp != NULL || hdr->dreq != NULL) 4238 if (hdr->ds_clp != NULL || hdr->dreq != NULL)
4154 return false; 4239 return false;
4155 /* Otherwise, request attributes if and only if we don't hold 4240 /* Otherwise, request attributes if and only if we don't hold
4156 * a delegation 4241 * a delegation
@@ -4158,23 +4243,24 @@ bool nfs4_write_need_cache_consistency_data(const struct nfs_pgio_data *data)
4158 return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0; 4243 return nfs4_have_delegation(hdr->inode, FMODE_READ) == 0;
4159} 4244}
4160 4245
4161static void nfs4_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg) 4246static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
4247 struct rpc_message *msg)
4162{ 4248{
4163 struct nfs_server *server = NFS_SERVER(data->header->inode); 4249 struct nfs_server *server = NFS_SERVER(hdr->inode);
4164 4250
4165 if (!nfs4_write_need_cache_consistency_data(data)) { 4251 if (!nfs4_write_need_cache_consistency_data(hdr)) {
4166 data->args.bitmask = NULL; 4252 hdr->args.bitmask = NULL;
4167 data->res.fattr = NULL; 4253 hdr->res.fattr = NULL;
4168 } else 4254 } else
4169 data->args.bitmask = server->cache_consistency_bitmask; 4255 hdr->args.bitmask = server->cache_consistency_bitmask;
4170 4256
4171 if (!data->pgio_done_cb) 4257 if (!hdr->pgio_done_cb)
4172 data->pgio_done_cb = nfs4_write_done_cb; 4258 hdr->pgio_done_cb = nfs4_write_done_cb;
4173 data->res.server = server; 4259 hdr->res.server = server;
4174 data->timestamp = jiffies; 4260 hdr->timestamp = jiffies;
4175 4261
4176 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 4262 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
4177 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 4263 nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1);
4178} 4264}
4179 4265
4180static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) 4266static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
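The rewritten nfs4_write_need_cache_consistency_data() now reads everything from the header. Its policy is small enough to restate as a self-contained model (parameter names invented for illustration):

#include <stdbool.h>

/* Should a WRITE also request post-op attributes? (illustrative) */
static bool write_needs_cache_consistency(bool pnfs_ds_io,
                                          bool odirect_io,
                                          bool holds_delegation)
{
    /* pNFS data-server and O_DIRECT writes never ask for attributes */
    if (pnfs_ds_io || odirect_io)
        return false;
    /* otherwise ask only when no delegation vouches for the cache */
    return !holds_delegation;
}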
@@ -4190,7 +4276,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da
4190 struct inode *inode = data->inode; 4276 struct inode *inode = data->inode;
4191 4277
4192 trace_nfs4_commit(data, task->tk_status); 4278 trace_nfs4_commit(data, task->tk_status);
4193 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 4279 if (nfs4_async_handle_error(task, NFS_SERVER(inode),
4280 NULL, NULL) == -EAGAIN) {
4194 rpc_restart_call_prepare(task); 4281 rpc_restart_call_prepare(task);
4195 return -EAGAIN; 4282 return -EAGAIN;
4196 } 4283 }
@@ -4743,7 +4830,8 @@ out:
4743 4830
4744 4831
4745static int 4832static int
4746nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) 4833nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4834 struct nfs4_state *state, long *timeout)
4747{ 4835{
4748 struct nfs_client *clp = server->nfs_client; 4836 struct nfs_client *clp = server->nfs_client;
4749 4837
@@ -4793,6 +4881,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4793#endif /* CONFIG_NFS_V4_1 */ 4881#endif /* CONFIG_NFS_V4_1 */
4794 case -NFS4ERR_DELAY: 4882 case -NFS4ERR_DELAY:
4795 nfs_inc_server_stats(server, NFSIOS_DELAY); 4883 nfs_inc_server_stats(server, NFSIOS_DELAY);
4884 rpc_delay(task, nfs4_update_delay(timeout));
4885 goto restart_call;
4796 case -NFS4ERR_GRACE: 4886 case -NFS4ERR_GRACE:
4797 rpc_delay(task, NFS4_POLL_RETRY_MAX); 4887 rpc_delay(task, NFS4_POLL_RETRY_MAX);
4798 case -NFS4ERR_RETRY_UNCACHED_REP: 4888 case -NFS4ERR_RETRY_UNCACHED_REP:
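The new long *timeout argument is what the unlink/rename hunks earlier thread through as &data->timeout: instead of always sleeping NFS4_POLL_RETRY_MAX on NFS4ERR_DELAY, callers that keep per-operation state get a growing delay. nfs4_update_delay() itself is not shown in this diff; a user-space model consistent with how it is called here (a clamped exponential backoff, where a NULL pointer preserves the old fixed maximum delay) would be:

#include <stdio.h>

#define HZ                  100
#define NFS4_POLL_RETRY_MIN (HZ / 10)   /* 0.1s, as in fs/nfs/nfs4proc.c */
#define NFS4_POLL_RETRY_MAX (15 * HZ)   /* 15s */

static long update_delay(long *timeout)
{
    long ret;

    if (timeout == NULL)
        return NFS4_POLL_RETRY_MAX;     /* NULL callers keep the old behaviour */
    if (*timeout <= 0)
        *timeout = NFS4_POLL_RETRY_MIN;
    if (*timeout > NFS4_POLL_RETRY_MAX)
        *timeout = NFS4_POLL_RETRY_MAX;
    ret = *timeout;
    *timeout <<= 1;                     /* back off further on the next retry */
    return ret;
}

int main(void)
{
    long timeout = 0;

    for (int i = 0; i < 10; i++)
        printf("retry %d: sleep %ld jiffies\n", i, update_delay(&timeout));
    return 0;
}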
@@ -4881,6 +4971,18 @@ nfs4_init_callback_netid(const struct nfs_client *clp, char *buf, size_t len)
4881 return scnprintf(buf, len, "tcp"); 4971 return scnprintf(buf, len, "tcp");
4882} 4972}
4883 4973
4974static void nfs4_setclientid_done(struct rpc_task *task, void *calldata)
4975{
4976 struct nfs4_setclientid *sc = calldata;
4977
4978 if (task->tk_status == 0)
4979 sc->sc_cred = get_rpccred(task->tk_rqstp->rq_cred);
4980}
4981
4982static const struct rpc_call_ops nfs4_setclientid_ops = {
4983 .rpc_call_done = nfs4_setclientid_done,
4984};
4985
4884/** 4986/**
4885 * nfs4_proc_setclientid - Negotiate client ID 4987 * nfs4_proc_setclientid - Negotiate client ID
4886 * @clp: state data structure 4988 * @clp: state data structure
@@ -4907,6 +5009,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4907 .rpc_resp = res, 5009 .rpc_resp = res,
4908 .rpc_cred = cred, 5010 .rpc_cred = cred,
4909 }; 5011 };
5012 struct rpc_task *task;
5013 struct rpc_task_setup task_setup_data = {
5014 .rpc_client = clp->cl_rpcclient,
5015 .rpc_message = &msg,
5016 .callback_ops = &nfs4_setclientid_ops,
5017 .callback_data = &setclientid,
5018 .flags = RPC_TASK_TIMEOUT,
5019 };
4910 int status; 5020 int status;
4911 5021
4912 /* nfs_client_id4 */ 5022 /* nfs_client_id4 */
@@ -4933,7 +5043,18 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
4933 dprintk("NFS call setclientid auth=%s, '%.*s'\n", 5043 dprintk("NFS call setclientid auth=%s, '%.*s'\n",
4934 clp->cl_rpcclient->cl_auth->au_ops->au_name, 5044 clp->cl_rpcclient->cl_auth->au_ops->au_name,
4935 setclientid.sc_name_len, setclientid.sc_name); 5045 setclientid.sc_name_len, setclientid.sc_name);
4936 status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); 5046 task = rpc_run_task(&task_setup_data);
5047 if (IS_ERR(task)) {
5048 status = PTR_ERR(task);
5049 goto out;
5050 }
5051 status = task->tk_status;
5052 if (setclientid.sc_cred) {
5053 clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
5054 put_rpccred(setclientid.sc_cred);
5055 }
5056 rpc_put_task(task);
5057out:
4937 trace_nfs4_setclientid(clp, status); 5058 trace_nfs4_setclientid(clp, status);
4938 dprintk("NFS reply setclientid: %d\n", status); 5059 dprintk("NFS reply setclientid: %d\n", status);
4939 return status; 5060 return status;
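SETCLIENTID switches from rpc_call_sync() to rpc_run_task() purely so a done callback can run while the request's credential is still attached: on success it pins the RPC cred, and the caller then records the GSS acceptor (the server principal) in clp->cl_acceptor before dropping the reference. A toy user-space model of that capture pattern (invented types; the kernel uses get_rpccred()/put_rpccred() and rpcauth_stringify_acceptor()):

#include <stdio.h>

struct cred { int refcount; const char *acceptor; };
struct setclientid_ctx { struct cred *sc_cred; };
struct task {
    int status;
    struct cred *rq_cred;                 /* request credential */
    void (*done)(struct task *, void *);
    void *calldata;
};

static void setclientid_done(struct task *t, void *calldata)
{
    struct setclientid_ctx *sc = calldata;

    if (t->status == 0) {
        t->rq_cred->refcount++;           /* get_rpccred(): pin it */
        sc->sc_cred = t->rq_cred;
    }
}

static void run_task(struct task *t)
{
    t->status = 0;                        /* pretend the RPC succeeded */
    t->done(t, t->calldata);
}

int main(void)
{
    struct cred server_cred = { 1, "nfs@server.example" };
    struct setclientid_ctx sc = { 0 };
    struct task t = { 0, &server_cred, setclientid_done, &sc };

    run_task(&t);
    if (sc.sc_cred) {
        printf("acceptor: %s\n", sc.sc_cred->acceptor);
        sc.sc_cred->refcount--;           /* put_rpccred() */
    }
    return 0;
}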
@@ -4975,6 +5096,9 @@ struct nfs4_delegreturndata {
4975 unsigned long timestamp; 5096 unsigned long timestamp;
4976 struct nfs_fattr fattr; 5097 struct nfs_fattr fattr;
4977 int rpc_status; 5098 int rpc_status;
5099 struct inode *inode;
5100 bool roc;
5101 u32 roc_barrier;
4978}; 5102};
4979 5103
4980static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) 5104static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
@@ -4988,7 +5112,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4988 switch (task->tk_status) { 5112 switch (task->tk_status) {
4989 case 0: 5113 case 0:
4990 renew_lease(data->res.server, data->timestamp); 5114 renew_lease(data->res.server, data->timestamp);
4991 break;
4992 case -NFS4ERR_ADMIN_REVOKED: 5115 case -NFS4ERR_ADMIN_REVOKED:
4993 case -NFS4ERR_DELEG_REVOKED: 5116 case -NFS4ERR_DELEG_REVOKED:
4994 case -NFS4ERR_BAD_STATEID: 5117 case -NFS4ERR_BAD_STATEID:
@@ -4996,10 +5119,12 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4996 case -NFS4ERR_STALE_STATEID: 5119 case -NFS4ERR_STALE_STATEID:
4997 case -NFS4ERR_EXPIRED: 5120 case -NFS4ERR_EXPIRED:
4998 task->tk_status = 0; 5121 task->tk_status = 0;
5122 if (data->roc)
5123 pnfs_roc_set_barrier(data->inode, data->roc_barrier);
4999 break; 5124 break;
5000 default: 5125 default:
5001 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5126 if (nfs4_async_handle_error(task, data->res.server,
5002 -EAGAIN) { 5127 NULL, NULL) == -EAGAIN) {
5003 rpc_restart_call_prepare(task); 5128 rpc_restart_call_prepare(task);
5004 return; 5129 return;
5005 } 5130 }
@@ -5009,6 +5134,10 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5009 5134
5010static void nfs4_delegreturn_release(void *calldata) 5135static void nfs4_delegreturn_release(void *calldata)
5011{ 5136{
5137 struct nfs4_delegreturndata *data = calldata;
5138
5139 if (data->roc)
5140 pnfs_roc_release(data->inode);
5012 kfree(calldata); 5141 kfree(calldata);
5013} 5142}
5014 5143
@@ -5018,6 +5147,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
5018 5147
5019 d_data = (struct nfs4_delegreturndata *)data; 5148 d_data = (struct nfs4_delegreturndata *)data;
5020 5149
5150 if (d_data->roc &&
5151 pnfs_roc_drain(d_data->inode, &d_data->roc_barrier, task))
5152 return;
5153
5021 nfs4_setup_sequence(d_data->res.server, 5154 nfs4_setup_sequence(d_data->res.server,
5022 &d_data->args.seq_args, 5155 &d_data->args.seq_args,
5023 &d_data->res.seq_res, 5156 &d_data->res.seq_res,
@@ -5061,6 +5194,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5061 nfs_fattr_init(data->res.fattr); 5194 nfs_fattr_init(data->res.fattr);
5062 data->timestamp = jiffies; 5195 data->timestamp = jiffies;
5063 data->rpc_status = 0; 5196 data->rpc_status = 0;
5197 data->inode = inode;
5198 data->roc = list_empty(&NFS_I(inode)->open_files) ?
5199 pnfs_roc(inode) : false;
5064 5200
5065 task_setup_data.callback_data = data; 5201 task_setup_data.callback_data = data;
5066 msg.rpc_argp = &data->args; 5202 msg.rpc_argp = &data->args;
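Taken together, the delegreturn hunks wire pNFS return-on-close (ROC) into DELEGRETURN for the case where the delegation outlives the last open. An outline of the added control flow (the pnfs_roc_* helpers are assumed from elsewhere in the tree):

/*
 * setup:   data->roc = no remaining opens ? pnfs_roc(inode) : false;
 * prepare: if (data->roc && pnfs_roc_drain(inode, &barrier, task))
 *                  return;            // wait out in-flight layout I/O
 * done:    on success, or on ADMIN_REVOKED / DELEG_REVOKED / BAD_STATEID /
 *          OLD_STATEID / STALE_STATEID / EXPIRED (all treated as done):
 *                  pnfs_roc_set_barrier(inode, barrier);
 * release: if (data->roc) pnfs_roc_release(inode);  // drop the layout
 */

Note the removed break after renew_lease(): a successful return now falls through into the stateid-error cases so the barrier is set on both paths.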
@@ -5252,7 +5388,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5252 case -NFS4ERR_EXPIRED: 5388 case -NFS4ERR_EXPIRED:
5253 break; 5389 break;
5254 default: 5390 default:
5255 if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) 5391 if (nfs4_async_handle_error(task, calldata->server,
5392 NULL, NULL) == -EAGAIN)
5256 rpc_restart_call_prepare(task); 5393 rpc_restart_call_prepare(task);
5257 } 5394 }
5258 nfs_release_seqid(calldata->arg.seqid); 5395 nfs_release_seqid(calldata->arg.seqid);
@@ -5834,8 +5971,10 @@ struct nfs_release_lockowner_data {
5834static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata) 5971static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata)
5835{ 5972{
5836 struct nfs_release_lockowner_data *data = calldata; 5973 struct nfs_release_lockowner_data *data = calldata;
5837 nfs40_setup_sequence(data->server, 5974 struct nfs_server *server = data->server;
5838 &data->args.seq_args, &data->res.seq_res, task); 5975 nfs40_setup_sequence(server, &data->args.seq_args,
5976 &data->res.seq_res, task);
5977 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
5839 data->timestamp = jiffies; 5978 data->timestamp = jiffies;
5840} 5979}
5841 5980
@@ -5852,9 +5991,12 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata)
5852 break; 5991 break;
5853 case -NFS4ERR_STALE_CLIENTID: 5992 case -NFS4ERR_STALE_CLIENTID:
5854 case -NFS4ERR_EXPIRED: 5993 case -NFS4ERR_EXPIRED:
5994 nfs4_schedule_lease_recovery(server->nfs_client);
5995 break;
5855 case -NFS4ERR_LEASE_MOVED: 5996 case -NFS4ERR_LEASE_MOVED:
5856 case -NFS4ERR_DELAY: 5997 case -NFS4ERR_DELAY:
5857 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) 5998 if (nfs4_async_handle_error(task, server,
5999 NULL, NULL) == -EAGAIN)
5858 rpc_restart_call_prepare(task); 6000 rpc_restart_call_prepare(task);
5859 } 6001 }
5860} 6002}
@@ -5872,7 +6014,8 @@ static const struct rpc_call_ops nfs4_release_lockowner_ops = {
5872 .rpc_release = nfs4_release_lockowner_release, 6014 .rpc_release = nfs4_release_lockowner_release,
5873}; 6015};
5874 6016
5875static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) 6017static void
6018nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
5876{ 6019{
5877 struct nfs_release_lockowner_data *data; 6020 struct nfs_release_lockowner_data *data;
5878 struct rpc_message msg = { 6021 struct rpc_message msg = {
@@ -5880,11 +6023,11 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5880 }; 6023 };
5881 6024
5882 if (server->nfs_client->cl_mvops->minor_version != 0) 6025 if (server->nfs_client->cl_mvops->minor_version != 0)
5883 return -EINVAL; 6026 return;
5884 6027
5885 data = kmalloc(sizeof(*data), GFP_NOFS); 6028 data = kmalloc(sizeof(*data), GFP_NOFS);
5886 if (!data) 6029 if (!data)
5887 return -ENOMEM; 6030 return;
5888 data->lsp = lsp; 6031 data->lsp = lsp;
5889 data->server = server; 6032 data->server = server;
5890 data->args.lock_owner.clientid = server->nfs_client->cl_clientid; 6033 data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
@@ -5895,7 +6038,6 @@ static int nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_st
5895 msg.rpc_resp = &data->res; 6038 msg.rpc_resp = &data->res;
5896 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); 6039 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
5897 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data); 6040 rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, data);
5898 return 0;
5899} 6041}
5900 6042
5901#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" 6043#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
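RELEASE_LOCKOWNER becomes fire-and-forget: no caller can usefully act on a failure, so the return type drops to void and an allocation failure is simply swallowed. Stamping the clientid again in rpc_call_prepare means a lease recovery that changed cl_clientid between queuing and transmission is picked up, and the new STALE_CLIENTID/EXPIRED handling schedules lease recovery itself. A compilable sketch of that shape (invented helper; the kernel queues an async RPC whose release callback frees the request):

#include <stdlib.h>

struct lockowner_req { unsigned long long clientid; };

static void prepare(struct lockowner_req *req,
                    const unsigned long long *live_clientid)
{
    req->clientid = *live_clientid;   /* re-read after any lease recovery */
}

static void release_lockowner(const unsigned long long *live_clientid)
{
    struct lockowner_req *req = malloc(sizeof(*req));

    if (req == NULL)
        return;                       /* nothing useful to report */
    prepare(req, live_clientid);
    /* ...queue the async RPC; its release callback frees req... */
    free(req);                        /* stand-in for that callback */
}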
@@ -7229,7 +7371,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
7229 int ret = 0; 7371 int ret = 0;
7230 7372
7231 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) 7373 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
7232 return 0; 7374 return -EAGAIN;
7233 task = _nfs41_proc_sequence(clp, cred, false); 7375 task = _nfs41_proc_sequence(clp, cred, false);
7234 if (IS_ERR(task)) 7376 if (IS_ERR(task))
7235 ret = PTR_ERR(task); 7377 ret = PTR_ERR(task);
@@ -7459,14 +7601,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
7459 } else { 7601 } else {
7460 LIST_HEAD(head); 7602 LIST_HEAD(head);
7461 7603
7604 /*
7605 * Mark the bad layout state as invalid, then retry
7606 * with the current stateid.
7607 */
7462 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); 7608 pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
7463 spin_unlock(&inode->i_lock); 7609 spin_unlock(&inode->i_lock);
7464 /* Mark the bad layout state as invalid, then
7465 * retry using the open stateid. */
7466 pnfs_free_lseg_list(&head); 7610 pnfs_free_lseg_list(&head);
7611
7612 task->tk_status = 0;
7613 rpc_restart_call_prepare(task);
7467 } 7614 }
7468 } 7615 }
7469 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) 7616 if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
7470 rpc_restart_call_prepare(task); 7617 rpc_restart_call_prepare(task);
7471out: 7618out:
7472 dprintk("<-- %s\n", __func__); 7619 dprintk("<-- %s\n", __func__);
@@ -7626,7 +7773,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
7626 case 0: 7773 case 0:
7627 break; 7774 break;
7628 case -NFS4ERR_DELAY: 7775 case -NFS4ERR_DELAY:
7629 if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) 7776 if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN)
7630 break; 7777 break;
7631 rpc_restart_call_prepare(task); 7778 rpc_restart_call_prepare(task);
7632 return; 7779 return;
@@ -7685,54 +7832,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
7685 return status; 7832 return status;
7686} 7833}
7687 7834
7688/*
7689 * Retrieve the list of Data Server devices from the MDS.
7690 */
7691static int _nfs4_getdevicelist(struct nfs_server *server,
7692 const struct nfs_fh *fh,
7693 struct pnfs_devicelist *devlist)
7694{
7695 struct nfs4_getdevicelist_args args = {
7696 .fh = fh,
7697 .layoutclass = server->pnfs_curr_ld->id,
7698 };
7699 struct nfs4_getdevicelist_res res = {
7700 .devlist = devlist,
7701 };
7702 struct rpc_message msg = {
7703 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST],
7704 .rpc_argp = &args,
7705 .rpc_resp = &res,
7706 };
7707 int status;
7708
7709 dprintk("--> %s\n", __func__);
7710 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args,
7711 &res.seq_res, 0);
7712 dprintk("<-- %s status=%d\n", __func__, status);
7713 return status;
7714}
7715
7716int nfs4_proc_getdevicelist(struct nfs_server *server,
7717 const struct nfs_fh *fh,
7718 struct pnfs_devicelist *devlist)
7719{
7720 struct nfs4_exception exception = { };
7721 int err;
7722
7723 do {
7724 err = nfs4_handle_exception(server,
7725 _nfs4_getdevicelist(server, fh, devlist),
7726 &exception);
7727 } while (exception.retry);
7728
7729 dprintk("%s: err=%d, num_devs=%u\n", __func__,
7730 err, devlist->num_devs);
7731
7732 return err;
7733}
7734EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist);
7735
7736static int 7835static int
7737_nfs4_proc_getdeviceinfo(struct nfs_server *server, 7836_nfs4_proc_getdeviceinfo(struct nfs_server *server,
7738 struct pnfs_device *pdev, 7837 struct pnfs_device *pdev,
@@ -7805,7 +7904,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
7805 case 0: 7904 case 0:
7806 break; 7905 break;
7807 default: 7906 default:
7808 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7907 if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) {
7809 rpc_restart_call_prepare(task); 7908 rpc_restart_call_prepare(task);
7810 return; 7909 return;
7811 } 7910 }
@@ -8101,7 +8200,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata)
8101 8200
8102 switch (task->tk_status) { 8201 switch (task->tk_status) {
8103 case -NFS4ERR_DELAY: 8202 case -NFS4ERR_DELAY:
8104 if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) 8203 if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN)
8105 rpc_restart_call_prepare(task); 8204 rpc_restart_call_prepare(task);
8106 } 8205 }
8107} 8206}
@@ -8182,7 +8281,8 @@ static int nfs41_free_stateid(struct nfs_server *server,
8182 return ret; 8281 return ret;
8183} 8282}
8184 8283
8185static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp) 8284static void
8285nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
8186{ 8286{
8187 struct rpc_task *task; 8287 struct rpc_task *task;
8188 struct rpc_cred *cred = lsp->ls_state->owner->so_cred; 8288 struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
@@ -8190,9 +8290,8 @@ static int nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_sta
8190 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false); 8290 task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
8191 nfs4_free_lock_state(server, lsp); 8291 nfs4_free_lock_state(server, lsp);
8192 if (IS_ERR(task)) 8292 if (IS_ERR(task))
8193 return PTR_ERR(task); 8293 return;
8194 rpc_put_task(task); 8294 rpc_put_task(task);
8195 return 0;
8196} 8295}
8197 8296
8198static bool nfs41_match_stateid(const nfs4_stateid *s1, 8297static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8309,7 +8408,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
8309 | NFS_CAP_CHANGE_ATTR 8408 | NFS_CAP_CHANGE_ATTR
8310 | NFS_CAP_POSIX_LOCK 8409 | NFS_CAP_POSIX_LOCK
8311 | NFS_CAP_STATEID_NFSV41 8410 | NFS_CAP_STATEID_NFSV41
8312 | NFS_CAP_ATOMIC_OPEN_V1, 8411 | NFS_CAP_ATOMIC_OPEN_V1
8412 | NFS_CAP_SEEK,
8313 .init_client = nfs41_init_client, 8413 .init_client = nfs41_init_client,
8314 .shutdown_client = nfs41_shutdown_client, 8414 .shutdown_client = nfs41_shutdown_client,
8315 .match_stateid = nfs41_match_stateid, 8415 .match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 1720d32ffa54..e1ba58c3d1ad 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work)
88 } 88 }
89 nfs_expire_all_delegations(clp); 89 nfs_expire_all_delegations(clp);
90 } else { 90 } else {
91 int ret;
92
91 /* Queue an asynchronous RENEW. */ 93 /* Queue an asynchronous RENEW. */
92 ops->sched_state_renewal(clp, cred, renew_flags); 94 ret = ops->sched_state_renewal(clp, cred, renew_flags);
93 put_rpccred(cred); 95 put_rpccred(cred);
94 goto out_exp; 96 switch (ret) {
97 default:
98 goto out_exp;
99 case -EAGAIN:
100 case -ENOMEM:
101 break;
102 }
95 } 103 }
96 } else { 104 } else {
97 dprintk("%s: failed to call renewd. Reason: lease not expired \n", 105 dprintk("%s: failed to call renewd. Reason: lease not expired \n",
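This renewd hunk is the consumer of the sched_state_renewal() return value that the nfs41_proc_async_sequence() change above starts providing. The contract implied by the switch (an assumption drawn from these hunks, not spelled out in the diff): 0 means a RENEW/SEQUENCE was queued and its completion will drive the next renewal, while -EAGAIN (renewal skipped) and -ENOMEM (could not be queued) fall through so the renewal timer is re-armed rather than waiting on an RPC that was never sent.

#include <errno.h>

/* Should renewd re-arm its own timer? (model of the switch above) */
static int renewd_should_rearm(int ret)
{
    return ret == -EAGAIN || ret == -ENOMEM;
}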
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 848f6853c59e..5194933ed419 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -787,21 +787,12 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
787 * that is compatible with current->files 787 * that is compatible with current->files
788 */ 788 */
789static struct nfs4_lock_state * 789static struct nfs4_lock_state *
790__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 790__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
791{ 791{
792 struct nfs4_lock_state *pos; 792 struct nfs4_lock_state *pos;
793 list_for_each_entry(pos, &state->lock_states, ls_locks) { 793 list_for_each_entry(pos, &state->lock_states, ls_locks) {
794 if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) 794 if (pos->ls_owner != fl_owner)
795 continue; 795 continue;
796 switch (pos->ls_owner.lo_type) {
797 case NFS4_POSIX_LOCK_TYPE:
798 if (pos->ls_owner.lo_u.posix_owner != fl_owner)
799 continue;
800 break;
801 case NFS4_FLOCK_LOCK_TYPE:
802 if (pos->ls_owner.lo_u.flock_owner != fl_pid)
803 continue;
804 }
805 atomic_inc(&pos->ls_count); 796 atomic_inc(&pos->ls_count);
806 return pos; 797 return pos;
807 } 798 }
@@ -813,7 +804,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_p
813 * exists, return an uninitialized one. 804 * exists, return an uninitialized one.
814 * 805 *
815 */ 806 */
816static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) 807static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
817{ 808{
818 struct nfs4_lock_state *lsp; 809 struct nfs4_lock_state *lsp;
819 struct nfs_server *server = state->owner->so_server; 810 struct nfs_server *server = state->owner->so_server;
@@ -824,17 +815,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
824 nfs4_init_seqid_counter(&lsp->ls_seqid); 815 nfs4_init_seqid_counter(&lsp->ls_seqid);
825 atomic_set(&lsp->ls_count, 1); 816 atomic_set(&lsp->ls_count, 1);
826 lsp->ls_state = state; 817 lsp->ls_state = state;
827 lsp->ls_owner.lo_type = type; 818 lsp->ls_owner = fl_owner;
828 switch (lsp->ls_owner.lo_type) {
829 case NFS4_FLOCK_LOCK_TYPE:
830 lsp->ls_owner.lo_u.flock_owner = fl_pid;
831 break;
832 case NFS4_POSIX_LOCK_TYPE:
833 lsp->ls_owner.lo_u.posix_owner = fl_owner;
834 break;
835 default:
836 goto out_free;
837 }
838 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); 819 lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS);
839 if (lsp->ls_seqid.owner_id < 0) 820 if (lsp->ls_seqid.owner_id < 0)
840 goto out_free; 821 goto out_free;
@@ -857,13 +838,13 @@ void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp
857 * exists, return an uninitialized one. 838 * exists, return an uninitialized one.
858 * 839 *
859 */ 840 */
860static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) 841static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
861{ 842{
862 struct nfs4_lock_state *lsp, *new = NULL; 843 struct nfs4_lock_state *lsp, *new = NULL;
863 844
864 for(;;) { 845 for(;;) {
865 spin_lock(&state->state_lock); 846 spin_lock(&state->state_lock);
866 lsp = __nfs4_find_lock_state(state, owner, pid, type); 847 lsp = __nfs4_find_lock_state(state, owner);
867 if (lsp != NULL) 848 if (lsp != NULL)
868 break; 849 break;
869 if (new != NULL) { 850 if (new != NULL) {
@@ -874,7 +855,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_
874 break; 855 break;
875 } 856 }
876 spin_unlock(&state->state_lock); 857 spin_unlock(&state->state_lock);
877 new = nfs4_alloc_lock_state(state, owner, pid, type); 858 new = nfs4_alloc_lock_state(state, owner);
878 if (new == NULL) 859 if (new == NULL)
879 return NULL; 860 return NULL;
880 } 861 }
@@ -935,13 +916,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
935 916
936 if (fl->fl_ops != NULL) 917 if (fl->fl_ops != NULL)
937 return 0; 918 return 0;
938 if (fl->fl_flags & FL_POSIX) 919 lsp = nfs4_get_lock_state(state, fl->fl_owner);
939 lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE);
940 else if (fl->fl_flags & FL_FLOCK)
941 lsp = nfs4_get_lock_state(state, NULL, fl->fl_pid,
942 NFS4_FLOCK_LOCK_TYPE);
943 else
944 return -EINVAL;
945 if (lsp == NULL) 920 if (lsp == NULL)
946 return -ENOMEM; 921 return -ENOMEM;
947 fl->fl_u.nfs4_fl.owner = lsp; 922 fl->fl_u.nfs4_fl.owner = lsp;
@@ -955,7 +930,6 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
955{ 930{
956 struct nfs4_lock_state *lsp; 931 struct nfs4_lock_state *lsp;
957 fl_owner_t fl_owner; 932 fl_owner_t fl_owner;
958 pid_t fl_pid;
959 int ret = -ENOENT; 933 int ret = -ENOENT;
960 934
961 935
@@ -966,9 +940,8 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
966 goto out; 940 goto out;
967 941
968 fl_owner = lockowner->l_owner; 942 fl_owner = lockowner->l_owner;
969 fl_pid = lockowner->l_pid;
970 spin_lock(&state->state_lock); 943 spin_lock(&state->state_lock);
971 lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); 944 lsp = __nfs4_find_lock_state(state, fl_owner);
972 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) 945 if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
973 ret = -EIO; 946 ret = -EIO;
974 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { 947 else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
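The nfs4state.c hunks above all follow from flock locks now carrying a real fl_owner (set elsewhere in the VFS): one fl_owner_t key serves both POSIX and flock locks, so the old {POSIX owner, flock pid} union and its NFS4_*_LOCK_TYPE discriminator disappear. A compilable model of the simplified lookup (stand-in types; the kernel walks state->lock_states under state->state_lock and uses atomic_inc for the refcount):

#include <stddef.h>

typedef void *fl_owner_t;

struct lock_state {
    fl_owner_t         owner;     /* one key for POSIX and flock alike */
    int                refcount;
    struct lock_state *next;
};

static struct lock_state *find_lock_state(struct lock_state *head,
                                          fl_owner_t owner)
{
    for (struct lock_state *pos = head; pos != NULL; pos = pos->next) {
        if (pos->owner != owner)
            continue;
        pos->refcount++;          /* atomic_inc(&pos->ls_count) in-kernel */
        return pos;
    }
    return NULL;
}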
@@ -1251,8 +1224,8 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp)
1251 might_sleep(); 1224 might_sleep();
1252 1225
1253 atomic_inc(&clp->cl_count); 1226 atomic_inc(&clp->cl_count);
1254 res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 1227 res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
1255 nfs_wait_bit_killable, TASK_KILLABLE); 1228 nfs_wait_bit_killable, TASK_KILLABLE);
1256 if (res) 1229 if (res)
1257 goto out; 1230 goto out;
1258 if (clp->cl_cons_state < 0) 1231 if (clp->cl_cons_state < 0)
@@ -1732,7 +1705,8 @@ restart:
1732 if (status < 0) { 1705 if (status < 0) {
1733 set_bit(ops->owner_flag_bit, &sp->so_flags); 1706 set_bit(ops->owner_flag_bit, &sp->so_flags);
1734 nfs4_put_state_owner(sp); 1707 nfs4_put_state_owner(sp);
1735 return nfs4_recovery_handle_error(clp, status); 1708 status = nfs4_recovery_handle_error(clp, status);
1709 return (status != 0) ? status : -EAGAIN;
1736 } 1710 }
1737 1711
1738 nfs4_put_state_owner(sp); 1712 nfs4_put_state_owner(sp);
@@ -1741,7 +1715,7 @@ restart:
1741 spin_unlock(&clp->cl_lock); 1715 spin_unlock(&clp->cl_lock);
1742 } 1716 }
1743 rcu_read_unlock(); 1717 rcu_read_unlock();
1744 return status; 1718 return 0;
1745} 1719}
1746 1720
1747static int nfs4_check_lease(struct nfs_client *clp) 1721static int nfs4_check_lease(struct nfs_client *clp)
@@ -1788,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status)
1788 break; 1762 break;
1789 case -NFS4ERR_STALE_CLIENTID: 1763 case -NFS4ERR_STALE_CLIENTID:
1790 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); 1764 clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state);
1791 nfs4_state_clear_reclaim_reboot(clp);
1792 nfs4_state_start_reclaim_reboot(clp); 1765 nfs4_state_start_reclaim_reboot(clp);
1793 break; 1766 break;
1794 case -NFS4ERR_CLID_INUSE: 1767 case -NFS4ERR_CLID_INUSE:
@@ -2372,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2372 status = nfs4_check_lease(clp); 2345 status = nfs4_check_lease(clp);
2373 if (status < 0) 2346 if (status < 0)
2374 goto out_error; 2347 goto out_error;
2348 continue;
2375 } 2349 }
2376 2350
2377 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { 2351 if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) {
@@ -2393,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp)
2393 section = "reclaim reboot"; 2367 section = "reclaim reboot";
2394 status = nfs4_do_reclaim(clp, 2368 status = nfs4_do_reclaim(clp,
2395 clp->cl_mvops->reboot_recovery_ops); 2369 clp->cl_mvops->reboot_recovery_ops);
2396 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 2370 if (status == -EAGAIN)
2397 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
2398 continue;
2399 nfs4_state_end_reclaim_reboot(clp);
2400 if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state))
2401 continue; 2371 continue;
2402 if (status < 0) 2372 if (status < 0)
2403 goto out_error; 2373 goto out_error;
2374 nfs4_state_end_reclaim_reboot(clp);
2404 } 2375 }
2405 2376
2406 /* Now recover expired state... */ 2377 /* Now recover expired state... */
@@ -2408,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
2408 section = "reclaim nograce"; 2379 section = "reclaim nograce";
2409 status = nfs4_do_reclaim(clp, 2380 status = nfs4_do_reclaim(clp,
2410 clp->cl_mvops->nograce_recovery_ops); 2381 clp->cl_mvops->nograce_recovery_ops);
2411 if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || 2382 if (status == -EAGAIN)
2412 test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) ||
2413 test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
2414 continue; 2383 continue;
2415 if (status < 0) 2384 if (status < 0)
2416 goto out_error; 2385 goto out_error;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 0a744f3a86f6..1c32adbe728d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -932,11 +932,11 @@ DEFINE_NFS4_IDMAP_EVENT(nfs4_map_gid_to_group);
932 932
933DECLARE_EVENT_CLASS(nfs4_read_event, 933DECLARE_EVENT_CLASS(nfs4_read_event,
934 TP_PROTO( 934 TP_PROTO(
935 const struct nfs_pgio_data *data, 935 const struct nfs_pgio_header *hdr,
936 int error 936 int error
937 ), 937 ),
938 938
939 TP_ARGS(data, error), 939 TP_ARGS(hdr, error),
940 940
941 TP_STRUCT__entry( 941 TP_STRUCT__entry(
942 __field(dev_t, dev) 942 __field(dev_t, dev)
@@ -948,12 +948,12 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
948 ), 948 ),
949 949
950 TP_fast_assign( 950 TP_fast_assign(
951 const struct inode *inode = data->header->inode; 951 const struct inode *inode = hdr->inode;
952 __entry->dev = inode->i_sb->s_dev; 952 __entry->dev = inode->i_sb->s_dev;
953 __entry->fileid = NFS_FILEID(inode); 953 __entry->fileid = NFS_FILEID(inode);
954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); 954 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
955 __entry->offset = data->args.offset; 955 __entry->offset = hdr->args.offset;
956 __entry->count = data->args.count; 956 __entry->count = hdr->args.count;
957 __entry->error = error; 957 __entry->error = error;
958 ), 958 ),
959 959
@@ -972,10 +972,10 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
972#define DEFINE_NFS4_READ_EVENT(name) \ 972#define DEFINE_NFS4_READ_EVENT(name) \
973 DEFINE_EVENT(nfs4_read_event, name, \ 973 DEFINE_EVENT(nfs4_read_event, name, \
974 TP_PROTO( \ 974 TP_PROTO( \
975 const struct nfs_pgio_data *data, \ 975 const struct nfs_pgio_header *hdr, \
976 int error \ 976 int error \
977 ), \ 977 ), \
978 TP_ARGS(data, error)) 978 TP_ARGS(hdr, error))
979DEFINE_NFS4_READ_EVENT(nfs4_read); 979DEFINE_NFS4_READ_EVENT(nfs4_read);
980#ifdef CONFIG_NFS_V4_1 980#ifdef CONFIG_NFS_V4_1
981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read); 981DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
@@ -983,11 +983,11 @@ DEFINE_NFS4_READ_EVENT(nfs4_pnfs_read);
983 983
984DECLARE_EVENT_CLASS(nfs4_write_event, 984DECLARE_EVENT_CLASS(nfs4_write_event,
985 TP_PROTO( 985 TP_PROTO(
986 const struct nfs_pgio_data *data, 986 const struct nfs_pgio_header *hdr,
987 int error 987 int error
988 ), 988 ),
989 989
990 TP_ARGS(data, error), 990 TP_ARGS(hdr, error),
991 991
992 TP_STRUCT__entry( 992 TP_STRUCT__entry(
993 __field(dev_t, dev) 993 __field(dev_t, dev)
@@ -999,12 +999,12 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
999 ), 999 ),
1000 1000
1001 TP_fast_assign( 1001 TP_fast_assign(
1002 const struct inode *inode = data->header->inode; 1002 const struct inode *inode = hdr->inode;
1003 __entry->dev = inode->i_sb->s_dev; 1003 __entry->dev = inode->i_sb->s_dev;
1004 __entry->fileid = NFS_FILEID(inode); 1004 __entry->fileid = NFS_FILEID(inode);
1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); 1005 __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
1006 __entry->offset = data->args.offset; 1006 __entry->offset = hdr->args.offset;
1007 __entry->count = data->args.count; 1007 __entry->count = hdr->args.count;
1008 __entry->error = error; 1008 __entry->error = error;
1009 ), 1009 ),
1010 1010
@@ -1024,10 +1024,10 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
1024#define DEFINE_NFS4_WRITE_EVENT(name) \ 1024#define DEFINE_NFS4_WRITE_EVENT(name) \
1025 DEFINE_EVENT(nfs4_write_event, name, \ 1025 DEFINE_EVENT(nfs4_write_event, name, \
1026 TP_PROTO( \ 1026 TP_PROTO( \
1027 const struct nfs_pgio_data *data, \ 1027 const struct nfs_pgio_header *hdr, \
1028 int error \ 1028 int error \
1029 ), \ 1029 ), \
1030 TP_ARGS(data, error)) 1030 TP_ARGS(hdr, error))
1031DEFINE_NFS4_WRITE_EVENT(nfs4_write); 1031DEFINE_NFS4_WRITE_EVENT(nfs4_write);
1032#ifdef CONFIG_NFS_V4_1 1032#ifdef CONFIG_NFS_V4_1
1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write); 1033DEFINE_NFS4_WRITE_EVENT(nfs4_pnfs_write);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 939ae606cfa4..206c08a60c7f 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int);
362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) 362 XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5)
363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) 363#define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4)
364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) 364#define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4)
365#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ 365#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \
366 encode_verifier_maxsz) 366 XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \
367#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ 367 1 /* layout type */ + \
368 2 /* nfs_cookie4 gdlr_cookie */ + \ 368 1 /* maxcount */ + \
369 decode_verifier_maxsz \ 369 1 /* bitmap size */ + \
370 /* verifier4 gdlr_verifier */ + \ 370 1 /* notification bitmap length */ + \
371 1 /* gdlr_deviceid_list count */ + \ 371 1 /* notification bitmap, word 0 */)
372 XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \
373 NFS4_DEVICEID4_SIZE) \
374 /* gdlr_deviceid_list */ + \
375 1 /* bool gdlr_eof */)
376#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \
377 XDR_QUADLEN(NFS4_DEVICEID4_SIZE))
378#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 372#define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \
379 1 /* layout type */ + \ 373 1 /* layout type */ + \
380 1 /* opaque devaddr4 length */ + \ 374 1 /* opaque devaddr4 length */ + \
381 /* devaddr4 payload is read into page */ \ 375 /* devaddr4 payload is read into page */ \
382 1 /* notification bitmap length */ + \ 376 1 /* notification bitmap length */ + \
383 1 /* notification bitmap */) 377 1 /* notification bitmap, word 0 */)
384#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ 378#define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \
385 encode_stateid_maxsz) 379 encode_stateid_maxsz)
386#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 380#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
@@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int);
395 2 /* last byte written */ + \ 389 2 /* last byte written */ + \
396 1 /* nt_timechanged (false) */ + \ 390 1 /* nt_timechanged (false) */ + \
397 1 /* layoutupdate4 layout type */ + \ 391 1 /* layoutupdate4 layout type */ + \
 398 1 /* NULL filelayout layoutupdate4 payload */) 392 1 /* layoutupdate4 opaque len */)
393 /* the actual content of layoutupdate4 should
394 be allocated by drivers and spliced in
395 using xdr_write_pages */
399#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) 396#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
400#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ 397#define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \
401 encode_stateid_maxsz + \ 398 encode_stateid_maxsz + \
@@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int);
809#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ 806#define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \
810 decode_sequence_maxsz + \ 807 decode_sequence_maxsz + \
811 decode_reclaim_complete_maxsz) 808 decode_reclaim_complete_maxsz)
812#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \
813 encode_sequence_maxsz + \
814 encode_putfh_maxsz + \
815 encode_getdevicelist_maxsz)
816#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \
817 decode_sequence_maxsz + \
818 decode_putfh_maxsz + \
819 decode_getdevicelist_maxsz)
820#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ 809#define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \
821 encode_sequence_maxsz +\ 810 encode_sequence_maxsz +\
822 encode_getdeviceinfo_maxsz) 811 encode_getdeviceinfo_maxsz)
@@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr,
1927 1916
1928#ifdef CONFIG_NFS_V4_1 1917#ifdef CONFIG_NFS_V4_1
1929static void 1918static void
1930encode_getdevicelist(struct xdr_stream *xdr,
1931 const struct nfs4_getdevicelist_args *args,
1932 struct compound_hdr *hdr)
1933{
1934 __be32 *p;
1935 nfs4_verifier dummy = {
1936 .data = "dummmmmy",
1937 };
1938
1939 encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr);
1940 p = reserve_space(xdr, 16);
1941 *p++ = cpu_to_be32(args->layoutclass);
1942 *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM);
1943 xdr_encode_hyper(p, 0ULL); /* cookie */
1944 encode_nfs4_verifier(xdr, &dummy);
1945}
1946
1947static void
1948encode_getdeviceinfo(struct xdr_stream *xdr, 1919encode_getdeviceinfo(struct xdr_stream *xdr,
1949 const struct nfs4_getdeviceinfo_args *args, 1920 const struct nfs4_getdeviceinfo_args *args,
1950 struct compound_hdr *hdr) 1921 struct compound_hdr *hdr)
@@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr,
1952 __be32 *p; 1923 __be32 *p;
1953 1924
1954 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); 1925 encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr);
1955 p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); 1926 p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4);
1956 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, 1927 p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data,
1957 NFS4_DEVICEID4_SIZE); 1928 NFS4_DEVICEID4_SIZE);
1958 *p++ = cpu_to_be32(args->pdev->layout_type); 1929 *p++ = cpu_to_be32(args->pdev->layout_type);
1959 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ 1930 *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */
1960 *p++ = cpu_to_be32(0); /* bitmap length 0 */ 1931
1932 p = reserve_space(xdr, 4 + 4);
1933 *p++ = cpu_to_be32(1); /* bitmap length */
1934 *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE);
1961} 1935}
1962 1936
1963static void 1937static void
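GETDEVICEINFO previously sent an empty notification bitmap; it now asks the server for device-ID change and delete notifications. A host-order model of the two words this hunk appends (bit values as in the kernel's nfs4.h; the real XDR path emits big-endian words via cpu_to_be32 on an xdr_stream):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define NOTIFY_DEVICEID4_CHANGE (1u << 1)
#define NOTIFY_DEVICEID4_DELETE (1u << 2)

/* Append <bitmap length = 1><word 0> to an XDR buffer. */
static size_t encode_notify_bitmap(uint8_t *buf)
{
    uint32_t words[2] = {
        htonl(1),
        htonl(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE),
    };

    memcpy(buf, words, sizeof(words));
    return sizeof(words);
}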
@@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr,
1990static int 1964static int
1991encode_layoutcommit(struct xdr_stream *xdr, 1965encode_layoutcommit(struct xdr_stream *xdr,
1992 struct inode *inode, 1966 struct inode *inode,
1993 const struct nfs4_layoutcommit_args *args, 1967 struct nfs4_layoutcommit_args *args,
1994 struct compound_hdr *hdr) 1968 struct compound_hdr *hdr)
1995{ 1969{
1996 __be32 *p; 1970 __be32 *p;
@@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr,
2011 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ 1985 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
2012 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ 1986 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
2013 1987
2014 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) 1988 if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) {
2015 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( 1989 NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit(
2016 NFS_I(inode)->layout, xdr, args); 1990 NFS_I(inode)->layout, xdr, args);
2017 else 1991 } else {
2018 encode_uint32(xdr, 0); /* no layout-type payload */ 1992 encode_uint32(xdr, args->layoutupdate_len);
1993 if (args->layoutupdate_pages) {
1994 xdr_write_pages(xdr, args->layoutupdate_pages, 0,
1995 args->layoutupdate_len);
1996 }
1997 }
2019 1998
2020 return 0; 1999 return 0;
2021} 2000}
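encode_layoutcommit() stops hard-coding a zero-length layoutupdate4 body for drivers without an encode_layoutcommit hook: such a driver can now pre-encode an opaque payload into pages and let the generic code splice it in with xdr_write_pages(). A sketch of the driver-side half (the two args fields are from this diff; the helper and stand-in types are invented for illustration):

struct page;                                  /* stand-in */

struct layoutcommit_args_sketch {             /* subset of nfs4_layoutcommit_args */
    struct page **layoutupdate_pages;
    unsigned int  layoutupdate_len;
};

static void publish_layoutupdate(struct layoutcommit_args_sketch *args,
                                 struct page **pages, unsigned int len)
{
    args->layoutupdate_pages = pages;         /* must stay live until encoded */
    args->layoutupdate_len   = len;
}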
@@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req,
2893} 2872}
2894 2873
2895/* 2874/*
2896 * Encode GETDEVICELIST request
2897 */
2898static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req,
2899 struct xdr_stream *xdr,
2900 struct nfs4_getdevicelist_args *args)
2901{
2902 struct compound_hdr hdr = {
2903 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2904 };
2905
2906 encode_compound_hdr(xdr, req, &hdr);
2907 encode_sequence(xdr, &args->seq_args, &hdr);
2908 encode_putfh(xdr, args->fh, &hdr);
2909 encode_getdevicelist(xdr, args, &hdr);
2910 encode_nops(&hdr);
2911}
2912
2913/*
2914 * Encode GETDEVICEINFO request 2875 * Encode GETDEVICEINFO request
2915 */ 2876 */
2916static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, 2877static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req,
@@ -5765,54 +5726,6 @@ out_overflow:
5765} 5726}
5766 5727
5767#if defined(CONFIG_NFS_V4_1) 5728#if defined(CONFIG_NFS_V4_1)
5768/*
5769 * TODO: Need to handle case when EOF != true;
5770 */
5771static int decode_getdevicelist(struct xdr_stream *xdr,
5772 struct pnfs_devicelist *res)
5773{
5774 __be32 *p;
5775 int status, i;
5776 nfs4_verifier verftemp;
5777
5778 status = decode_op_hdr(xdr, OP_GETDEVICELIST);
5779 if (status)
5780 return status;
5781
5782 p = xdr_inline_decode(xdr, 8 + 8 + 4);
5783 if (unlikely(!p))
5784 goto out_overflow;
5785
5786 /* TODO: Skip cookie for now */
5787 p += 2;
5788
5789 /* Read verifier */
5790 p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE);
5791
5792 res->num_devs = be32_to_cpup(p);
5793
5794 dprintk("%s: num_dev %d\n", __func__, res->num_devs);
5795
5796 if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) {
5797 printk(KERN_ERR "NFS: %s too many result dev_num %u\n",
5798 __func__, res->num_devs);
5799 return -EIO;
5800 }
5801
5802 p = xdr_inline_decode(xdr,
5803 res->num_devs * NFS4_DEVICEID4_SIZE + 4);
5804 if (unlikely(!p))
5805 goto out_overflow;
5806 for (i = 0; i < res->num_devs; i++)
5807 p = xdr_decode_opaque_fixed(p, res->dev_id[i].data,
5808 NFS4_DEVICEID4_SIZE);
5809 res->eof = be32_to_cpup(p);
5810 return 0;
5811out_overflow:
5812 print_overflow_msg(__func__, xdr);
5813 return -EIO;
5814}
5815
5816static int decode_getdeviceinfo(struct xdr_stream *xdr, 5729static int decode_getdeviceinfo(struct xdr_stream *xdr,
5817 struct pnfs_device *pdev) 5730 struct pnfs_device *pdev)
5818{ 5731{
@@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr,
5862 p = xdr_inline_decode(xdr, 4 * len); 5775 p = xdr_inline_decode(xdr, 4 * len);
5863 if (unlikely(!p)) 5776 if (unlikely(!p))
5864 goto out_overflow; 5777 goto out_overflow;
5865 for (i = 0; i < len; i++, p++) { 5778
5866 if (be32_to_cpup(p)) { 5779 if (be32_to_cpup(p++) &
5867 dprintk("%s: notifications not supported\n", 5780 ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) {
5781 dprintk("%s: unsupported notification\n",
5782 __func__);
5783 }
5784
5785 for (i = 1; i < len; i++) {
5786 if (be32_to_cpup(p++)) {
5787 dprintk("%s: unsupported notification\n",
5868 __func__); 5788 __func__);
5869 return -EIO; 5789 return -EIO;
5870 } 5790 }
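The decoder is relaxed to match: word 0 of the returned notification bitmap may now carry the two bits the client requested (anything else there only logs a warning), while a set bit in any later bitmap word is still a fatal decode error. A host-order model of that check:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define NOTIFY_DEVICEID4_CHANGE (1u << 1)
#define NOTIFY_DEVICEID4_DELETE (1u << 2)

static int check_notify_bitmap(const uint32_t *words, unsigned int len)
{
    if (len == 0)
        return 0;
    if (words[0] & ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE))
        fprintf(stderr, "unsupported notification in word 0\n");
    for (unsigned int i = 1; i < len; i++)
        if (words[i])
            return -EIO;    /* unexpected notification class */
    return 0;
}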
@@ -7092,33 +7012,7 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp,
7092 if (!status) 7012 if (!status)
7093 status = decode_sequence(xdr, &res->seq_res, rqstp); 7013 status = decode_sequence(xdr, &res->seq_res, rqstp);
7094 if (!status) 7014 if (!status)
7095 status = decode_reclaim_complete(xdr, (void *)NULL); 7015 status = decode_reclaim_complete(xdr, NULL);
7096 return status;
7097}
7098
7099/*
7100 * Decode GETDEVICELIST response
7101 */
7102static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp,
7103 struct xdr_stream *xdr,
7104 struct nfs4_getdevicelist_res *res)
7105{
7106 struct compound_hdr hdr;
7107 int status;
7108
7109 dprintk("encoding getdevicelist!\n");
7110
7111 status = decode_compound_hdr(xdr, &hdr);
7112 if (status != 0)
7113 goto out;
7114 status = decode_sequence(xdr, &res->seq_res, rqstp);
7115 if (status != 0)
7116 goto out;
7117 status = decode_putfh(xdr);
7118 if (status != 0)
7119 goto out;
7120 status = decode_getdevicelist(xdr, res->devlist);
7121out:
7122 return status; 7016 return status;
7123} 7017}
7124 7018
@@ -7427,6 +7321,10 @@ nfs4_stat_to_errno(int stat)
7427 return -stat; 7321 return -stat;
7428} 7322}
7429 7323
7324#ifdef CONFIG_NFS_V4_2
7325#include "nfs42xdr.c"
7326#endif /* CONFIG_NFS_V4_2 */
7327
7430#define PROC(proc, argtype, restype) \ 7328#define PROC(proc, argtype, restype) \
7431[NFSPROC4_CLNT_##proc] = { \ 7329[NFSPROC4_CLNT_##proc] = { \
7432 .p_proc = NFSPROC4_COMPOUND, \ 7330 .p_proc = NFSPROC4_COMPOUND, \
@@ -7490,11 +7388,13 @@ struct rpc_procinfo nfs4_procedures[] = {
7490 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), 7388 PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name),
7491 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), 7389 PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid),
7492 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), 7390 PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid),
7493 PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist),
7494 PROC(BIND_CONN_TO_SESSION, 7391 PROC(BIND_CONN_TO_SESSION,
7495 enc_bind_conn_to_session, dec_bind_conn_to_session), 7392 enc_bind_conn_to_session, dec_bind_conn_to_session),
7496 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), 7393 PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid),
7497#endif /* CONFIG_NFS_V4_1 */ 7394#endif /* CONFIG_NFS_V4_1 */
7395#ifdef CONFIG_NFS_V4_2
7396 PROC(SEEK, enc_seek, dec_seek),
7397#endif /* CONFIG_NFS_V4_2 */
7498}; 7398};
7499 7399
7500const struct rpc_version nfs_version4 = { 7400const struct rpc_version nfs_version4 = {
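The NFS_CAP_SEEK capability above and this PROC(SEEK) entry are the client plumbing for the NFSv4.2 SEEK operation, which is what backs lseek(2)'s SEEK_HOLE/SEEK_DATA on an NFS mount against a v4.2 server. A quick user-space check (plain glibc, nothing NFS-specific; run it against a sparse file on such a mount):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    int fd = open(argc > 1 ? argv[1] : "sparse.bin", O_RDONLY);
    off_t data, hole;

    if (fd < 0)
        return 1;
    data = lseek(fd, 0, SEEK_DATA);   /* first data at or after offset 0 */
    hole = lseek(fd, 0, SEEK_HOLE);   /* first hole at or after offset 0 */
    printf("data@%lld hole@%lld\n", (long long)data, (long long)hole);
    close(fd);
    return 0;
}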
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 611320753db2..c6e4bda63000 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d)
60 kfree(de); 60 kfree(de);
61} 61}
62 62
63static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss,
64 const struct nfs4_deviceid *d_id)
65{
66 struct nfs4_deviceid_node *d;
67 struct objio_dev_ent *de;
68
69 d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id);
70 if (!d)
71 return NULL;
72
73 de = container_of(d, struct objio_dev_ent, id_node);
74 return de;
75}
76
77static struct objio_dev_ent *
78_dev_list_add(const struct nfs_server *nfss,
79 const struct nfs4_deviceid *d_id, struct osd_dev *od,
80 gfp_t gfp_flags)
81{
82 struct nfs4_deviceid_node *d;
83 struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags);
84 struct objio_dev_ent *n;
85
86 if (!de) {
87 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
88 return NULL;
89 }
90
91 dprintk("%s: Adding od=%p\n", __func__, od);
92 nfs4_init_deviceid_node(&de->id_node,
93 nfss->pnfs_curr_ld,
94 nfss->nfs_client,
95 d_id);
96 de->od.od = od;
97
98 d = nfs4_insert_deviceid_node(&de->id_node);
99 n = container_of(d, struct objio_dev_ent, id_node);
100 if (n != de) {
101 dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od);
102 objio_free_deviceid_node(&de->id_node);
103 de = n;
104 }
105
106 return de;
107}
108
109struct objio_segment { 63struct objio_segment {
110 struct pnfs_layout_segment lseg; 64 struct pnfs_layout_segment lseg;
111 65
@@ -130,29 +84,24 @@ struct objio_state {
130 84
131/* Send and wait for a get_device_info of devices in the layout, 85/* Send and wait for a get_device_info of devices in the layout,
132 then look them up with the osd_initiator library */ 86 then look them up with the osd_initiator library */
133static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, 87struct nfs4_deviceid_node *
134 struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, 88objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
135 gfp_t gfp_flags) 89 gfp_t gfp_flags)
136{ 90{
137 struct pnfs_osd_deviceaddr *deviceaddr; 91 struct pnfs_osd_deviceaddr *deviceaddr;
138 struct objio_dev_ent *ode; 92 struct objio_dev_ent *ode = NULL;
139 struct osd_dev *od; 93 struct osd_dev *od;
140 struct osd_dev_info odi; 94 struct osd_dev_info odi;
141 bool retry_flag = true; 95 bool retry_flag = true;
96 __be32 *p;
142 int err; 97 int err;
143 98
144 ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); 99 deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags);
145 if (ode) { 100 if (!deviceaddr)
146 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ 101 return NULL;
147 return 0;
148 }
149 102
150 err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); 103 p = page_address(pdev->pages[0]);
151 if (unlikely(err)) { 104 pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p);
152 dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n",
153 __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err);
154 return err;
155 }
156 105
157 odi.systemid_len = deviceaddr->oda_systemid.len; 106 odi.systemid_len = deviceaddr->oda_systemid.len;
158 if (odi.systemid_len > sizeof(odi.systemid)) { 107 if (odi.systemid_len > sizeof(odi.systemid)) {
@@ -188,14 +137,24 @@ retry_lookup:
188 goto out; 137 goto out;
189 } 138 }
190 139
191 ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od,
192 gfp_flags);
193 objio_seg->oc.ods[c] = &ode->od; /* must use container_of */
194 dprintk("Adding new dev_id(%llx:%llx)\n", 140 dprintk("Adding new dev_id(%llx:%llx)\n",
195 _DEVID_LO(d_id), _DEVID_HI(d_id)); 141 _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id));
142
143 ode = kzalloc(sizeof(*ode), gfp_flags);
144 if (!ode) {
145 dprintk("%s: -ENOMEM od=%p\n", __func__, od);
146 goto out;
147 }
148
149 nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id);
150 kfree(deviceaddr);
151
152 ode->od.od = od;
153 return &ode->id_node;
154
196out: 155out:
197 objlayout_put_deviceinfo(deviceaddr); 156 kfree(deviceaddr);
198 return err; 157 return NULL;
199} 158}
200 159
201static void copy_single_comp(struct ore_components *oc, unsigned c, 160static void copy_single_comp(struct ore_components *oc, unsigned c,
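The objlayout driver's private device cache (_dev_list_find/_dev_list_add) is gone: resolution moves to the generic pNFS deviceid cache, with the driver contributing only objio_alloc_deviceid_node() plus a max_deviceinfo_size of PAGE_SIZE so the core can fetch GETDEVICEINFO replies into preallocated pages. The resulting flow, as an outline (generic helper names taken from this diff):

/*
 * nfs4_find_get_deviceid(server, dev_id, cred, gfp)
 *   -> cache hit:  return the existing nfs4_deviceid_node
 *   -> cache miss: GETDEVICEINFO into pdev->pages (bounded by
 *                  ld->max_deviceinfo_size), then
 *                  ld->alloc_deviceid_node(server, pdev, gfp)
 *                    -> objio_alloc_deviceid_node(): decode the OSD
 *                       address, look up/open the osd_dev, wrap it in
 *                       an objio_dev_ent's id_node
 *                  and insert the node into the global cache
 */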
@@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
254 struct xdr_stream *xdr, 213 struct xdr_stream *xdr,
255 gfp_t gfp_flags) 214 gfp_t gfp_flags)
256{ 215{
216 struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode);
257 struct objio_segment *objio_seg; 217 struct objio_segment *objio_seg;
258 struct pnfs_osd_xdr_decode_layout_iter iter; 218 struct pnfs_osd_xdr_decode_layout_iter iter;
259 struct pnfs_osd_layout layout; 219 struct pnfs_osd_layout layout;
@@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp,
283 objio_seg->oc.first_dev = layout.olo_comps_index; 243 objio_seg->oc.first_dev = layout.olo_comps_index;
284 cur_comp = 0; 244 cur_comp = 0;
285 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { 245 while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) {
246 struct nfs4_deviceid_node *d;
247 struct objio_dev_ent *ode;
248
286 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); 249 copy_single_comp(&objio_seg->oc, cur_comp, &src_comp);
287 err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, 250
288 &src_comp.oc_object_id.oid_device_id, 251 d = nfs4_find_get_deviceid(server,
289 gfp_flags); 252 &src_comp.oc_object_id.oid_device_id,
290 if (err) 253 pnfslay->plh_lc_cred, gfp_flags);
254 if (!d) {
255 err = -ENXIO;
291 goto err; 256 goto err;
292 ++cur_comp; 257 }
258
259 ode = container_of(d, struct objio_dev_ent, id_node);
260 objio_seg->oc.ods[cur_comp++] = &ode->od;
293 } 261 }
294 /* pnfs_osd_xdr_decode_layout_comp returns false on error */ 262 /* pnfs_osd_xdr_decode_layout_comp returns false on error */
295 if (unlikely(err)) 263 if (unlikely(err))
@@ -439,22 +407,21 @@ static void _read_done(struct ore_io_state *ios, void *private)
439 objlayout_read_done(&objios->oir, status, objios->sync); 407 objlayout_read_done(&objios->oir, status, objios->sync);
440} 408}
441 409
442int objio_read_pagelist(struct nfs_pgio_data *rdata) 410int objio_read_pagelist(struct nfs_pgio_header *hdr)
443{ 411{
444 struct nfs_pgio_header *hdr = rdata->header;
445 struct objio_state *objios; 412 struct objio_state *objios;
446 int ret; 413 int ret;
447 414
448 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, 415 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true,
449 hdr->lseg, rdata->args.pages, rdata->args.pgbase, 416 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
450 rdata->args.offset, rdata->args.count, rdata, 417 hdr->args.offset, hdr->args.count, hdr,
451 GFP_KERNEL, &objios); 418 GFP_KERNEL, &objios);
452 if (unlikely(ret)) 419 if (unlikely(ret))
453 return ret; 420 return ret;
454 421
455 objios->ios->done = _read_done; 422 objios->ios->done = _read_done;
456 dprintk("%s: offset=0x%llx length=0x%x\n", __func__, 423 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
457 rdata->args.offset, rdata->args.count); 424 hdr->args.offset, hdr->args.count);
458 ret = ore_read(objios->ios); 425 ret = ore_read(objios->ios);
459 if (unlikely(ret)) 426 if (unlikely(ret))
460 objio_free_result(&objios->oir); 427 objio_free_result(&objios->oir);
@@ -487,11 +454,11 @@ static void _write_done(struct ore_io_state *ios, void *private)
487static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) 454static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate)
488{ 455{
489 struct objio_state *objios = priv; 456 struct objio_state *objios = priv;
490 struct nfs_pgio_data *wdata = objios->oir.rpcdata; 457 struct nfs_pgio_header *hdr = objios->oir.rpcdata;
491 struct address_space *mapping = wdata->header->inode->i_mapping; 458 struct address_space *mapping = hdr->inode->i_mapping;
492 pgoff_t index = offset / PAGE_SIZE; 459 pgoff_t index = offset / PAGE_SIZE;
493 struct page *page; 460 struct page *page;
494 loff_t i_size = i_size_read(wdata->header->inode); 461 loff_t i_size = i_size_read(hdr->inode);
495 462
496 if (offset >= i_size) { 463 if (offset >= i_size) {
497 *uptodate = true; 464 *uptodate = true;
@@ -531,15 +498,14 @@ static const struct _ore_r4w_op _r4w_op = {
531 .put_page = &__r4w_put_page, 498 .put_page = &__r4w_put_page,
532}; 499};
533 500
534int objio_write_pagelist(struct nfs_pgio_data *wdata, int how) 501int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
535{ 502{
536 struct nfs_pgio_header *hdr = wdata->header;
537 struct objio_state *objios; 503 struct objio_state *objios;
538 int ret; 504 int ret;
539 505
540 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, 506 ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false,
541 hdr->lseg, wdata->args.pages, wdata->args.pgbase, 507 hdr->lseg, hdr->args.pages, hdr->args.pgbase,
542 wdata->args.offset, wdata->args.count, wdata, GFP_NOFS, 508 hdr->args.offset, hdr->args.count, hdr, GFP_NOFS,
543 &objios); 509 &objios);
544 if (unlikely(ret)) 510 if (unlikely(ret))
545 return ret; 511 return ret;
@@ -551,7 +517,7 @@ int objio_write_pagelist(struct nfs_pgio_data *wdata, int how)
551 objios->ios->done = _write_done; 517 objios->ios->done = _write_done;
552 518
553 dprintk("%s: offset=0x%llx length=0x%x\n", __func__, 519 dprintk("%s: offset=0x%llx length=0x%x\n", __func__,
554 wdata->args.offset, wdata->args.count); 520 hdr->args.offset, hdr->args.count);
555 ret = ore_write(objios->ios); 521 ret = ore_write(objios->ios);
556 if (unlikely(ret)) { 522 if (unlikely(ret)) {
557 objio_free_result(&objios->oir); 523 objio_free_result(&objios->oir);
@@ -655,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = {
655 .flags = PNFS_LAYOUTRET_ON_SETATTR | 621 .flags = PNFS_LAYOUTRET_ON_SETATTR |
656 PNFS_LAYOUTRET_ON_ERROR, 622 PNFS_LAYOUTRET_ON_ERROR,
657 623
624 .max_deviceinfo_size = PAGE_SIZE,
658 .owner = THIS_MODULE, 625 .owner = THIS_MODULE,
659 .alloc_layout_hdr = objlayout_alloc_layout_hdr, 626 .alloc_layout_hdr = objlayout_alloc_layout_hdr,
660 .free_layout_hdr = objlayout_free_layout_hdr, 627 .free_layout_hdr = objlayout_free_layout_hdr,
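
Note on the objio_osd.c hunks above: the driver-private device cache (_dev_list_add() and its lookup list) is replaced by the generic nfs4_deviceid_node cache. The driver embeds the generic node in its own entry and recovers the container with container_of(). A minimal sketch of the embed-and-recover pattern, using the field names visible in the hunks (illustrative, not the verbatim kernel definition):

	struct objio_dev_ent {
		struct nfs4_deviceid_node id_node;	/* linkage into the generic cache */
		struct ore_dev od;			/* objlayout-private payload */
	};

	/* nfs4_find_get_deviceid() returns the embedded node; the enclosing
	 * entry is recovered without any driver-side lookup table: */
	ode = container_of(d, struct objio_dev_ent, id_node);
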
diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 765d3f54e986..c89357c7a914 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -229,36 +229,36 @@ objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index,
229static void _rpc_read_complete(struct work_struct *work) 229static void _rpc_read_complete(struct work_struct *work)
230{ 230{
231 struct rpc_task *task; 231 struct rpc_task *task;
232 struct nfs_pgio_data *rdata; 232 struct nfs_pgio_header *hdr;
233 233
234 dprintk("%s enter\n", __func__); 234 dprintk("%s enter\n", __func__);
235 task = container_of(work, struct rpc_task, u.tk_work); 235 task = container_of(work, struct rpc_task, u.tk_work);
236 rdata = container_of(task, struct nfs_pgio_data, task); 236 hdr = container_of(task, struct nfs_pgio_header, task);
237 237
238 pnfs_ld_read_done(rdata); 238 pnfs_ld_read_done(hdr);
239} 239}
240 240
241void 241void
242objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) 242objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
243{ 243{
244 struct nfs_pgio_data *rdata = oir->rpcdata; 244 struct nfs_pgio_header *hdr = oir->rpcdata;
245 245
246 oir->status = rdata->task.tk_status = status; 246 oir->status = hdr->task.tk_status = status;
247 if (status >= 0) 247 if (status >= 0)
248 rdata->res.count = status; 248 hdr->res.count = status;
249 else 249 else
250 rdata->header->pnfs_error = status; 250 hdr->pnfs_error = status;
251 objlayout_iodone(oir); 251 objlayout_iodone(oir);
252 /* must not use oir after this point */ 252 /* must not use oir after this point */
253 253
254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, 254 dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__,
255 status, rdata->res.eof, sync); 255 status, hdr->res.eof, sync);
256 256
257 if (sync) 257 if (sync)
258 pnfs_ld_read_done(rdata); 258 pnfs_ld_read_done(hdr);
259 else { 259 else {
260 INIT_WORK(&rdata->task.u.tk_work, _rpc_read_complete); 260 INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete);
261 schedule_work(&rdata->task.u.tk_work); 261 schedule_work(&hdr->task.u.tk_work);
262 } 262 }
263} 263}
264 264
@@ -266,12 +266,11 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
266 * Perform sync or async reads. 266 * Perform sync or async reads.
267 */ 267 */
268enum pnfs_try_status 268enum pnfs_try_status
269objlayout_read_pagelist(struct nfs_pgio_data *rdata) 269objlayout_read_pagelist(struct nfs_pgio_header *hdr)
270{ 270{
271 struct nfs_pgio_header *hdr = rdata->header;
272 struct inode *inode = hdr->inode; 271 struct inode *inode = hdr->inode;
273 loff_t offset = rdata->args.offset; 272 loff_t offset = hdr->args.offset;
274 size_t count = rdata->args.count; 273 size_t count = hdr->args.count;
275 int err; 274 int err;
276 loff_t eof; 275 loff_t eof;
277 276
@@ -279,23 +278,23 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
279 if (unlikely(offset + count > eof)) { 278 if (unlikely(offset + count > eof)) {
280 if (offset >= eof) { 279 if (offset >= eof) {
281 err = 0; 280 err = 0;
282 rdata->res.count = 0; 281 hdr->res.count = 0;
283 rdata->res.eof = 1; 282 hdr->res.eof = 1;
284 /*FIXME: do we need to call pnfs_ld_read_done() */ 283 /*FIXME: do we need to call pnfs_ld_read_done() */
285 goto out; 284 goto out;
286 } 285 }
287 count = eof - offset; 286 count = eof - offset;
288 } 287 }
289 288
290 rdata->res.eof = (offset + count) >= eof; 289 hdr->res.eof = (offset + count) >= eof;
291 _fix_verify_io_params(hdr->lseg, &rdata->args.pages, 290 _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
292 &rdata->args.pgbase, 291 &hdr->args.pgbase,
293 rdata->args.offset, rdata->args.count); 292 hdr->args.offset, hdr->args.count);
294 293
295 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n", 294 dprintk("%s: inode(%lx) offset 0x%llx count 0x%Zx eof=%d\n",
296 __func__, inode->i_ino, offset, count, rdata->res.eof); 295 __func__, inode->i_ino, offset, count, hdr->res.eof);
297 296
298 err = objio_read_pagelist(rdata); 297 err = objio_read_pagelist(hdr);
299 out: 298 out:
300 if (unlikely(err)) { 299 if (unlikely(err)) {
301 hdr->pnfs_error = err; 300 hdr->pnfs_error = err;
@@ -312,38 +311,38 @@ objlayout_read_pagelist(struct nfs_pgio_data *rdata)
312static void _rpc_write_complete(struct work_struct *work) 311static void _rpc_write_complete(struct work_struct *work)
313{ 312{
314 struct rpc_task *task; 313 struct rpc_task *task;
315 struct nfs_pgio_data *wdata; 314 struct nfs_pgio_header *hdr;
316 315
317 dprintk("%s enter\n", __func__); 316 dprintk("%s enter\n", __func__);
318 task = container_of(work, struct rpc_task, u.tk_work); 317 task = container_of(work, struct rpc_task, u.tk_work);
319 wdata = container_of(task, struct nfs_pgio_data, task); 318 hdr = container_of(task, struct nfs_pgio_header, task);
320 319
321 pnfs_ld_write_done(wdata); 320 pnfs_ld_write_done(hdr);
322} 321}
323 322
324void 323void
325objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) 324objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
326{ 325{
327 struct nfs_pgio_data *wdata = oir->rpcdata; 326 struct nfs_pgio_header *hdr = oir->rpcdata;
328 327
329 oir->status = wdata->task.tk_status = status; 328 oir->status = hdr->task.tk_status = status;
330 if (status >= 0) { 329 if (status >= 0) {
331 wdata->res.count = status; 330 hdr->res.count = status;
332 wdata->verf.committed = oir->committed; 331 hdr->verf.committed = oir->committed;
333 } else { 332 } else {
334 wdata->header->pnfs_error = status; 333 hdr->pnfs_error = status;
335 } 334 }
336 objlayout_iodone(oir); 335 objlayout_iodone(oir);
337 /* must not use oir after this point */ 336 /* must not use oir after this point */
338 337
339 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, 338 dprintk("%s: Return status %zd committed %d sync=%d\n", __func__,
340 status, wdata->verf.committed, sync); 339 status, hdr->verf.committed, sync);
341 340
342 if (sync) 341 if (sync)
343 pnfs_ld_write_done(wdata); 342 pnfs_ld_write_done(hdr);
344 else { 343 else {
345 INIT_WORK(&wdata->task.u.tk_work, _rpc_write_complete); 344 INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete);
346 schedule_work(&wdata->task.u.tk_work); 345 schedule_work(&hdr->task.u.tk_work);
347 } 346 }
348} 347}
349 348
@@ -351,17 +350,15 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
351 * Perform sync or async writes. 350 * Perform sync or async writes.
352 */ 351 */
353enum pnfs_try_status 352enum pnfs_try_status
354objlayout_write_pagelist(struct nfs_pgio_data *wdata, 353objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how)
355 int how)
356{ 354{
357 struct nfs_pgio_header *hdr = wdata->header;
358 int err; 355 int err;
359 356
360 _fix_verify_io_params(hdr->lseg, &wdata->args.pages, 357 _fix_verify_io_params(hdr->lseg, &hdr->args.pages,
361 &wdata->args.pgbase, 358 &hdr->args.pgbase,
362 wdata->args.offset, wdata->args.count); 359 hdr->args.offset, hdr->args.count);
363 360
364 err = objio_write_pagelist(wdata, how); 361 err = objio_write_pagelist(hdr, how);
365 if (unlikely(err)) { 362 if (unlikely(err)) {
366 hdr->pnfs_error = err; 363 hdr->pnfs_error = err;
367 dprintk("%s: Returned Error %d\n", __func__, err); 364 dprintk("%s: Returned Error %d\n", __func__, err);
@@ -577,76 +574,6 @@ loop_done:
577 dprintk("%s: Return\n", __func__); 574 dprintk("%s: Return\n", __func__);
578} 575}
579 576
580
581/*
582 * Get Device Info API for io engines
583 */
584struct objlayout_deviceinfo {
585 struct page *page;
586 struct pnfs_osd_deviceaddr da; /* This must be last */
587};
588
589/* Initialize and call nfs_getdeviceinfo, then decode and return a
590 * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo()
591 * should be called.
592 */
593int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
594 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
595 gfp_t gfp_flags)
596{
597 struct objlayout_deviceinfo *odi;
598 struct pnfs_device pd;
599 struct page *page, **pages;
600 u32 *p;
601 int err;
602
603 page = alloc_page(gfp_flags);
604 if (!page)
605 return -ENOMEM;
606
607 pages = &page;
608 pd.pages = pages;
609
610 memcpy(&pd.dev_id, d_id, sizeof(*d_id));
611 pd.layout_type = LAYOUT_OSD2_OBJECTS;
612 pd.pages = &page;
613 pd.pgbase = 0;
614 pd.pglen = PAGE_SIZE;
615 pd.mincount = 0;
616 pd.maxcount = PAGE_SIZE;
617
618 err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd,
619 pnfslay->plh_lc_cred);
620 dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err);
621 if (err)
622 goto err_out;
623
624 p = page_address(page);
625 odi = kzalloc(sizeof(*odi), gfp_flags);
626 if (!odi) {
627 err = -ENOMEM;
628 goto err_out;
629 }
630 pnfs_osd_xdr_decode_deviceaddr(&odi->da, p);
631 odi->page = page;
632 *deviceaddr = &odi->da;
633 return 0;
634
635err_out:
636 __free_page(page);
637 return err;
638}
639
640void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr)
641{
642 struct objlayout_deviceinfo *odi = container_of(deviceaddr,
643 struct objlayout_deviceinfo,
644 da);
645
646 __free_page(odi->page);
647 kfree(odi);
648}
649
650enum { 577enum {
651 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, 578 OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64,
652 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, 579 OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1,
diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h
index 01e041029a6c..3a0828d57339 100644
--- a/fs/nfs/objlayout/objlayout.h
+++ b/fs/nfs/objlayout/objlayout.h
@@ -119,8 +119,8 @@ extern void objio_free_lseg(struct pnfs_layout_segment *lseg);
119 */ 119 */
120extern void objio_free_result(struct objlayout_io_res *oir); 120extern void objio_free_result(struct objlayout_io_res *oir);
121 121
122extern int objio_read_pagelist(struct nfs_pgio_data *rdata); 122extern int objio_read_pagelist(struct nfs_pgio_header *rdata);
123extern int objio_write_pagelist(struct nfs_pgio_data *wdata, int how); 123extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how);
124 124
125/* 125/*
126 * callback API 126 * callback API
@@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir,
149extern void objlayout_write_done(struct objlayout_io_res *oir, 149extern void objlayout_write_done(struct objlayout_io_res *oir,
150 ssize_t status, bool sync); 150 ssize_t status, bool sync);
151 151
152extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay,
153 struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr,
154 gfp_t gfp_flags);
155extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr);
156
157/* 152/*
158 * exported generic objects function vectors 153 * exported generic objects function vectors
159 */ 154 */
@@ -168,10 +163,10 @@ extern struct pnfs_layout_segment *objlayout_alloc_lseg(
168extern void objlayout_free_lseg(struct pnfs_layout_segment *); 163extern void objlayout_free_lseg(struct pnfs_layout_segment *);
169 164
170extern enum pnfs_try_status objlayout_read_pagelist( 165extern enum pnfs_try_status objlayout_read_pagelist(
171 struct nfs_pgio_data *); 166 struct nfs_pgio_header *);
172 167
173extern enum pnfs_try_status objlayout_write_pagelist( 168extern enum pnfs_try_status objlayout_write_pagelist(
174 struct nfs_pgio_data *, 169 struct nfs_pgio_header *,
175 int how); 170 int how);
176 171
177extern void objlayout_encode_layoutcommit( 172extern void objlayout_encode_layoutcommit(
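
The objlayout prototype changes above are part of the tree-wide merge of struct nfs_pgio_data into struct nfs_pgio_header: with one object per I/O, the data->header indirection disappears everywhere. An illustrative before/after of a typical access:

	/* before the merge: two objects per I/O, one extra hop */
	struct inode *inode = rdata->header->inode;

	/* after the merge: a single nfs_pgio_header carries args, res and task */
	struct inode *inode = hdr->inode;
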
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 17fab89f6358..ed0db61f8543 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -115,8 +115,8 @@ __nfs_iocounter_wait(struct nfs_io_counter *c)
115 set_bit(NFS_IO_INPROGRESS, &c->flags); 115 set_bit(NFS_IO_INPROGRESS, &c->flags);
116 if (atomic_read(&c->io_count) == 0) 116 if (atomic_read(&c->io_count) == 0)
117 break; 117 break;
118 ret = nfs_wait_bit_killable(&c->flags); 118 ret = nfs_wait_bit_killable(&q.key);
119 } while (atomic_read(&c->io_count) != 0); 119 } while (atomic_read(&c->io_count) != 0 && !ret);
120 finish_wait(wq, &q.wait); 120 finish_wait(wq, &q.wait);
121 return ret; 121 return ret;
122} 122}
@@ -136,28 +136,52 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
136 return __nfs_iocounter_wait(c); 136 return __nfs_iocounter_wait(c);
137} 137}
138 138
139static int nfs_wait_bit_uninterruptible(void *word)
140{
141 io_schedule();
142 return 0;
143}
144
145/* 139/*
146 * nfs_page_group_lock - lock the head of the page group 140 * nfs_page_group_lock - lock the head of the page group
147 * @req - request in group that is to be locked 141 * @req - request in group that is to be locked
142 * @nonblock - if true don't block waiting for lock
148 * 143 *
149 * this lock must be held if modifying the page group list 144 * this lock must be held if modifying the page group list
145 *
 146	 * return 0 on success, < 0 on error: -EAGAIN if nonblocking or the
147 * result from wait_on_bit_lock
148 *
 149	 * NOTE: a call with nonblock=false always returns with the lock bit
 150	 * set (see fs/buffer.c and other wait_on_bit_lock callers that use
 151	 * TASK_UNINTERRUPTIBLE), so the result need not be checked.
152 */
153int
154nfs_page_group_lock(struct nfs_page *req, bool nonblock)
155{
156 struct nfs_page *head = req->wb_head;
157
158 WARN_ON_ONCE(head != head->wb_head);
159
160 if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
161 return 0;
162
163 if (!nonblock)
164 return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
165 TASK_UNINTERRUPTIBLE);
166
167 return -EAGAIN;
168}
169
170/*
171 * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it
172 * @req - a request in the group
173 *
174 * This is a blocking call to wait for the group lock to be cleared.
150 */ 175 */
151void 176void
152nfs_page_group_lock(struct nfs_page *req) 177nfs_page_group_lock_wait(struct nfs_page *req)
153{ 178{
154 struct nfs_page *head = req->wb_head; 179 struct nfs_page *head = req->wb_head;
155 180
156 WARN_ON_ONCE(head != head->wb_head); 181 WARN_ON_ONCE(head != head->wb_head);
157 182
158 wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, 183 wait_on_bit(&head->wb_flags, PG_HEADLOCK,
159 nfs_wait_bit_uninterruptible, 184 TASK_UNINTERRUPTIBLE);
160 TASK_UNINTERRUPTIBLE);
161} 185}
162 186
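
nfs_page_group_lock() above grows a nonblocking mode: the common case is a lock-free test_and_set_bit(), and only blocking callers fall through to wait_on_bit_lock(). A sketch of the expected nonblocking usage (the surrounding caller code is illustrative):

	ret = nfs_page_group_lock(req, true);	/* nonblock */
	if (ret == -EAGAIN)
		return ret;	/* lock is contended; retry later instead of sleeping */
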
163/* 187/*
@@ -218,7 +242,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
218{ 242{
219 bool ret; 243 bool ret;
220 244
221 nfs_page_group_lock(req); 245 nfs_page_group_lock(req, false);
222 ret = nfs_page_group_sync_on_bit_locked(req, bit); 246 ret = nfs_page_group_sync_on_bit_locked(req, bit);
223 nfs_page_group_unlock(req); 247 nfs_page_group_unlock(req);
224 248
@@ -435,9 +459,8 @@ void nfs_release_request(struct nfs_page *req)
435int 459int
436nfs_wait_on_request(struct nfs_page *req) 460nfs_wait_on_request(struct nfs_page *req)
437{ 461{
438 return wait_on_bit(&req->wb_flags, PG_BUSY, 462 return wait_on_bit_io(&req->wb_flags, PG_BUSY,
439 nfs_wait_bit_uninterruptible, 463 TASK_UNINTERRUPTIBLE);
440 TASK_UNINTERRUPTIBLE);
441} 464}
442 465
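
This hunk tracks the wait_on_bit() API change: older kernels took a caller-supplied action callback (nfs_wait_bit_uninterruptible above simply called io_schedule()), while the new API folds I/O waiting into wait_on_bit_io(). Side by side:

	/* old API: scheduling behaviour supplied by the caller */
	wait_on_bit(&req->wb_flags, PG_BUSY,
		    nfs_wait_bit_uninterruptible, TASK_UNINTERRUPTIBLE);

	/* new API: io_schedule()-based waiting implied by the helper */
	wait_on_bit_io(&req->wb_flags, PG_BUSY, TASK_UNINTERRUPTIBLE);
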
443/* 466/*
@@ -458,127 +481,85 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
458 return 0; 481 return 0;
459 } 482 }
460 483
484 /*
485 * Limit the request size so that we can still allocate a page array
486 * for it without upsetting the slab allocator.
487 */
488 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
 489	    sizeof(struct page *) > PAGE_SIZE)
490 return 0;
491
461 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 492 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
462} 493}
463EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 494EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
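
The new size check in nfs_generic_pg_test() keeps the page-pointer array that nfs_pgarray_set() kmallocs within a single page. Worked numbers, assuming a 4 KiB PAGE_SIZE and 8-byte pointers (typical of x86-64; not part of the patch):

	/* PAGE_SIZE / sizeof(struct page *) = 4096 / 8 = 512 entries, so
	 * (desc->pg_count + req->wb_bytes) >> PAGE_SHIFT may not exceed 512,
	 * capping a coalesced request at 512 * 4096 bytes = 2 MiB.        */
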
464 495
465static inline struct nfs_rw_header *NFS_RW_HEADER(struct nfs_pgio_header *hdr) 496struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops)
466{ 497{
467 return container_of(hdr, struct nfs_rw_header, header); 498 struct nfs_pgio_header *hdr = ops->rw_alloc_header();
468}
469
470/**
471 * nfs_rw_header_alloc - Allocate a header for a read or write
472 * @ops: Read or write function vector
473 */
474struct nfs_rw_header *nfs_rw_header_alloc(const struct nfs_rw_ops *ops)
475{
476 struct nfs_rw_header *header = ops->rw_alloc_header();
477
478 if (header) {
479 struct nfs_pgio_header *hdr = &header->header;
480 499
500 if (hdr) {
481 INIT_LIST_HEAD(&hdr->pages); 501 INIT_LIST_HEAD(&hdr->pages);
482 spin_lock_init(&hdr->lock); 502 spin_lock_init(&hdr->lock);
483 atomic_set(&hdr->refcnt, 0);
484 hdr->rw_ops = ops; 503 hdr->rw_ops = ops;
485 } 504 }
486 return header; 505 return hdr;
487} 506}
488EXPORT_SYMBOL_GPL(nfs_rw_header_alloc); 507EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc);
489 508
490/* 509/*
491 * nfs_rw_header_free - Free a read or write header 510 * nfs_pgio_header_free - Free a read or write header
492 * @hdr: The header to free 511 * @hdr: The header to free
493 */ 512 */
494void nfs_rw_header_free(struct nfs_pgio_header *hdr) 513void nfs_pgio_header_free(struct nfs_pgio_header *hdr)
495{
496 hdr->rw_ops->rw_free_header(NFS_RW_HEADER(hdr));
497}
498EXPORT_SYMBOL_GPL(nfs_rw_header_free);
499
500/**
501 * nfs_pgio_data_alloc - Allocate pageio data
502 * @hdr: The header making a request
503 * @pagecount: Number of pages to create
504 */
505static struct nfs_pgio_data *nfs_pgio_data_alloc(struct nfs_pgio_header *hdr,
506 unsigned int pagecount)
507{ 514{
508 struct nfs_pgio_data *data, *prealloc; 515 hdr->rw_ops->rw_free_header(hdr);
509
510 prealloc = &NFS_RW_HEADER(hdr)->rpc_data;
511 if (prealloc->header == NULL)
512 data = prealloc;
513 else
514 data = kzalloc(sizeof(*data), GFP_KERNEL);
515 if (!data)
516 goto out;
517
518 if (nfs_pgarray_set(&data->pages, pagecount)) {
519 data->header = hdr;
520 atomic_inc(&hdr->refcnt);
521 } else {
522 if (data != prealloc)
523 kfree(data);
524 data = NULL;
525 }
526out:
527 return data;
528} 516}
517EXPORT_SYMBOL_GPL(nfs_pgio_header_free);
529 518
530/** 519/**
531 * nfs_pgio_data_release - Properly free pageio data 520 * nfs_pgio_data_destroy - make @hdr suitable for reuse
532 * @data: The data to release 521 *
522 * Frees memory and releases refs from nfs_generic_pgio, so that it may
523 * be called again.
524 *
525 * @hdr: A header that has had nfs_generic_pgio called
533 */ 526 */
534void nfs_pgio_data_release(struct nfs_pgio_data *data) 527void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr)
535{ 528{
536 struct nfs_pgio_header *hdr = data->header; 529 if (hdr->args.context)
537 struct nfs_rw_header *pageio_header = NFS_RW_HEADER(hdr); 530 put_nfs_open_context(hdr->args.context);
538 531 if (hdr->page_array.pagevec != hdr->page_array.page_array)
539 put_nfs_open_context(data->args.context); 532 kfree(hdr->page_array.pagevec);
540 if (data->pages.pagevec != data->pages.page_array)
541 kfree(data->pages.pagevec);
542 if (data == &pageio_header->rpc_data) {
543 data->header = NULL;
544 data = NULL;
545 }
546 if (atomic_dec_and_test(&hdr->refcnt))
547 hdr->completion_ops->completion(hdr);
548 /* Note: we only free the rpc_task after callbacks are done.
549 * See the comment in rpc_free_task() for why
550 */
551 kfree(data);
552} 533}
553EXPORT_SYMBOL_GPL(nfs_pgio_data_release); 534EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy);
554 535
555/** 536/**
556 * nfs_pgio_rpcsetup - Set up arguments for a pageio call 537 * nfs_pgio_rpcsetup - Set up arguments for a pageio call
557 * @data: The pageio data 538 * @hdr: The pageio hdr
558 * @count: Number of bytes to read 539 * @count: Number of bytes to read
559 * @offset: Initial offset 540 * @offset: Initial offset
560 * @how: How to commit data (writes only) 541 * @how: How to commit data (writes only)
561 * @cinfo: Commit information for the call (writes only) 542 * @cinfo: Commit information for the call (writes only)
562 */ 543 */
563static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data, 544static void nfs_pgio_rpcsetup(struct nfs_pgio_header *hdr,
564 unsigned int count, unsigned int offset, 545 unsigned int count, unsigned int offset,
565 int how, struct nfs_commit_info *cinfo) 546 int how, struct nfs_commit_info *cinfo)
566{ 547{
567 struct nfs_page *req = data->header->req; 548 struct nfs_page *req = hdr->req;
568 549
569 /* Set up the RPC argument and reply structs 550 /* Set up the RPC argument and reply structs
570 * NB: take care not to mess about with data->commit et al. */ 551 * NB: take care not to mess about with hdr->commit et al. */
571 552
572 data->args.fh = NFS_FH(data->header->inode); 553 hdr->args.fh = NFS_FH(hdr->inode);
573 data->args.offset = req_offset(req) + offset; 554 hdr->args.offset = req_offset(req) + offset;
574 /* pnfs_set_layoutcommit needs this */ 555 /* pnfs_set_layoutcommit needs this */
575 data->mds_offset = data->args.offset; 556 hdr->mds_offset = hdr->args.offset;
576 data->args.pgbase = req->wb_pgbase + offset; 557 hdr->args.pgbase = req->wb_pgbase + offset;
577 data->args.pages = data->pages.pagevec; 558 hdr->args.pages = hdr->page_array.pagevec;
578 data->args.count = count; 559 hdr->args.count = count;
579 data->args.context = get_nfs_open_context(req->wb_context); 560 hdr->args.context = get_nfs_open_context(req->wb_context);
580 data->args.lock_context = req->wb_lock_context; 561 hdr->args.lock_context = req->wb_lock_context;
581 data->args.stable = NFS_UNSTABLE; 562 hdr->args.stable = NFS_UNSTABLE;
582 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) { 563 switch (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
583 case 0: 564 case 0:
584 break; 565 break;
@@ -586,59 +567,59 @@ static void nfs_pgio_rpcsetup(struct nfs_pgio_data *data,
586 if (nfs_reqs_to_commit(cinfo)) 567 if (nfs_reqs_to_commit(cinfo))
587 break; 568 break;
588 default: 569 default:
589 data->args.stable = NFS_FILE_SYNC; 570 hdr->args.stable = NFS_FILE_SYNC;
590 } 571 }
591 572
592 data->res.fattr = &data->fattr; 573 hdr->res.fattr = &hdr->fattr;
593 data->res.count = count; 574 hdr->res.count = count;
594 data->res.eof = 0; 575 hdr->res.eof = 0;
595 data->res.verf = &data->verf; 576 hdr->res.verf = &hdr->verf;
596 nfs_fattr_init(&data->fattr); 577 nfs_fattr_init(&hdr->fattr);
597} 578}
598 579
599/** 580/**
600 * nfs_pgio_prepare - Prepare pageio data to go over the wire 581 * nfs_pgio_prepare - Prepare pageio hdr to go over the wire
601 * @task: The current task 582 * @task: The current task
602 * @calldata: pageio data to prepare 583 * @calldata: pageio header to prepare
603 */ 584 */
604static void nfs_pgio_prepare(struct rpc_task *task, void *calldata) 585static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
605{ 586{
606 struct nfs_pgio_data *data = calldata; 587 struct nfs_pgio_header *hdr = calldata;
607 int err; 588 int err;
608 err = NFS_PROTO(data->header->inode)->pgio_rpc_prepare(task, data); 589 err = NFS_PROTO(hdr->inode)->pgio_rpc_prepare(task, hdr);
609 if (err) 590 if (err)
610 rpc_exit(task, err); 591 rpc_exit(task, err);
611} 592}
612 593
613int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_data *data, 594int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
614 const struct rpc_call_ops *call_ops, int how, int flags) 595 const struct rpc_call_ops *call_ops, int how, int flags)
615{ 596{
616 struct rpc_task *task; 597 struct rpc_task *task;
617 struct rpc_message msg = { 598 struct rpc_message msg = {
618 .rpc_argp = &data->args, 599 .rpc_argp = &hdr->args,
619 .rpc_resp = &data->res, 600 .rpc_resp = &hdr->res,
620 .rpc_cred = data->header->cred, 601 .rpc_cred = hdr->cred,
621 }; 602 };
622 struct rpc_task_setup task_setup_data = { 603 struct rpc_task_setup task_setup_data = {
623 .rpc_client = clnt, 604 .rpc_client = clnt,
624 .task = &data->task, 605 .task = &hdr->task,
625 .rpc_message = &msg, 606 .rpc_message = &msg,
626 .callback_ops = call_ops, 607 .callback_ops = call_ops,
627 .callback_data = data, 608 .callback_data = hdr,
628 .workqueue = nfsiod_workqueue, 609 .workqueue = nfsiod_workqueue,
629 .flags = RPC_TASK_ASYNC | flags, 610 .flags = RPC_TASK_ASYNC | flags,
630 }; 611 };
631 int ret = 0; 612 int ret = 0;
632 613
633 data->header->rw_ops->rw_initiate(data, &msg, &task_setup_data, how); 614 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
634 615
635 dprintk("NFS: %5u initiated pgio call " 616 dprintk("NFS: %5u initiated pgio call "
636 "(req %s/%llu, %u bytes @ offset %llu)\n", 617 "(req %s/%llu, %u bytes @ offset %llu)\n",
637 data->task.tk_pid, 618 hdr->task.tk_pid,
638 data->header->inode->i_sb->s_id, 619 hdr->inode->i_sb->s_id,
639 (unsigned long long)NFS_FILEID(data->header->inode), 620 (unsigned long long)NFS_FILEID(hdr->inode),
640 data->args.count, 621 hdr->args.count,
641 (unsigned long long)data->args.offset); 622 (unsigned long long)hdr->args.offset);
642 623
643 task = rpc_run_task(&task_setup_data); 624 task = rpc_run_task(&task_setup_data);
644 if (IS_ERR(task)) { 625 if (IS_ERR(task)) {
@@ -665,22 +646,23 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
665 struct nfs_pgio_header *hdr) 646 struct nfs_pgio_header *hdr)
666{ 647{
667 set_bit(NFS_IOHDR_REDO, &hdr->flags); 648 set_bit(NFS_IOHDR_REDO, &hdr->flags);
668 nfs_pgio_data_release(hdr->data); 649 nfs_pgio_data_destroy(hdr);
669 hdr->data = NULL; 650 hdr->completion_ops->completion(hdr);
670 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 651 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
671 return -ENOMEM; 652 return -ENOMEM;
672} 653}
673 654
674/** 655/**
675 * nfs_pgio_release - Release pageio data 656 * nfs_pgio_release - Release pageio data
676 * @calldata: The pageio data to release 657 * @calldata: The pageio header to release
677 */ 658 */
678static void nfs_pgio_release(void *calldata) 659static void nfs_pgio_release(void *calldata)
679{ 660{
680 struct nfs_pgio_data *data = calldata; 661 struct nfs_pgio_header *hdr = calldata;
681 if (data->header->rw_ops->rw_release) 662 if (hdr->rw_ops->rw_release)
682 data->header->rw_ops->rw_release(data); 663 hdr->rw_ops->rw_release(hdr);
683 nfs_pgio_data_release(data); 664 nfs_pgio_data_destroy(hdr);
665 hdr->completion_ops->completion(hdr);
684} 666}
685 667
686/** 668/**
@@ -721,22 +703,22 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init);
721/** 703/**
722 * nfs_pgio_result - Basic pageio error handling 704 * nfs_pgio_result - Basic pageio error handling
723 * @task: The task that ran 705 * @task: The task that ran
724 * @calldata: Pageio data to check 706 * @calldata: Pageio header to check
725 */ 707 */
726static void nfs_pgio_result(struct rpc_task *task, void *calldata) 708static void nfs_pgio_result(struct rpc_task *task, void *calldata)
727{ 709{
728 struct nfs_pgio_data *data = calldata; 710 struct nfs_pgio_header *hdr = calldata;
729 struct inode *inode = data->header->inode; 711 struct inode *inode = hdr->inode;
730 712
731 dprintk("NFS: %s: %5u, (status %d)\n", __func__, 713 dprintk("NFS: %s: %5u, (status %d)\n", __func__,
732 task->tk_pid, task->tk_status); 714 task->tk_pid, task->tk_status);
733 715
734 if (data->header->rw_ops->rw_done(task, data, inode) != 0) 716 if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
735 return; 717 return;
736 if (task->tk_status < 0) 718 if (task->tk_status < 0)
737 nfs_set_pgio_error(data->header, task->tk_status, data->args.offset); 719 nfs_set_pgio_error(hdr, task->tk_status, hdr->args.offset);
738 else 720 else
739 data->header->rw_ops->rw_result(task, data); 721 hdr->rw_ops->rw_result(task, hdr);
740} 722}
741 723
742/* 724/*
@@ -751,32 +733,41 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
751 struct nfs_pgio_header *hdr) 733 struct nfs_pgio_header *hdr)
752{ 734{
753 struct nfs_page *req; 735 struct nfs_page *req;
754 struct page **pages; 736 struct page **pages,
755 struct nfs_pgio_data *data; 737 *last_page;
756 struct list_head *head = &desc->pg_list; 738 struct list_head *head = &desc->pg_list;
757 struct nfs_commit_info cinfo; 739 struct nfs_commit_info cinfo;
740 unsigned int pagecount, pageused;
758 741
759 data = nfs_pgio_data_alloc(hdr, nfs_page_array_len(desc->pg_base, 742 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
760 desc->pg_count)); 743 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
761 if (!data)
762 return nfs_pgio_error(desc, hdr); 744 return nfs_pgio_error(desc, hdr);
763 745
764 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); 746 nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
765 pages = data->pages.pagevec; 747 pages = hdr->page_array.pagevec;
748 last_page = NULL;
749 pageused = 0;
766 while (!list_empty(head)) { 750 while (!list_empty(head)) {
767 req = nfs_list_entry(head->next); 751 req = nfs_list_entry(head->next);
768 nfs_list_remove_request(req); 752 nfs_list_remove_request(req);
769 nfs_list_add_request(req, &hdr->pages); 753 nfs_list_add_request(req, &hdr->pages);
770 *pages++ = req->wb_page; 754
755 if (!last_page || last_page != req->wb_page) {
756 pageused++;
757 if (pageused > pagecount)
758 break;
759 *pages++ = last_page = req->wb_page;
760 }
771 } 761 }
762 if (WARN_ON_ONCE(pageused != pagecount))
763 return nfs_pgio_error(desc, hdr);
772 764
773 if ((desc->pg_ioflags & FLUSH_COND_STABLE) && 765 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
774 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) 766 (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
775 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 767 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
776 768
777 /* Set up the argument struct */ 769 /* Set up the argument struct */
778 nfs_pgio_rpcsetup(data, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 770 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
779 hdr->data = data;
780 desc->pg_rpc_callops = &nfs_pgio_common_ops; 771 desc->pg_rpc_callops = &nfs_pgio_common_ops;
781 return 0; 772 return 0;
782} 773}
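
In the rewritten nfs_generic_pgio() above, subrequests produced by page-group splitting all point at the same wb_page, so consecutive entries on the list can alias one page. The fill loop therefore stores each distinct page only once and counts it in pageused; if that count ever disagrees with pagecount, the WARN_ON_ONCE() fires and the I/O is failed. The invariant, stated as a sketch:

	/* after the loop:
	 *   pages[0..pageused-1] holds each distinct wb_page exactly once;
	 *   pageused == pagecount, otherwise the array was mis-sized.      */
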
@@ -784,25 +775,20 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
784 775
785static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 776static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
786{ 777{
787 struct nfs_rw_header *rw_hdr;
788 struct nfs_pgio_header *hdr; 778 struct nfs_pgio_header *hdr;
789 int ret; 779 int ret;
790 780
791 rw_hdr = nfs_rw_header_alloc(desc->pg_rw_ops); 781 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
792 if (!rw_hdr) { 782 if (!hdr) {
793 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 783 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
794 return -ENOMEM; 784 return -ENOMEM;
795 } 785 }
796 hdr = &rw_hdr->header; 786 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
797 nfs_pgheader_init(desc, hdr, nfs_rw_header_free);
798 atomic_inc(&hdr->refcnt);
799 ret = nfs_generic_pgio(desc, hdr); 787 ret = nfs_generic_pgio(desc, hdr);
800 if (ret == 0) 788 if (ret == 0)
801 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 789 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
802 hdr->data, desc->pg_rpc_callops, 790 hdr, desc->pg_rpc_callops,
803 desc->pg_ioflags, 0); 791 desc->pg_ioflags, 0);
804 if (atomic_dec_and_test(&hdr->refcnt))
805 hdr->completion_ops->completion(hdr);
806 return ret; 792 return ret;
807} 793}
808 794
@@ -845,6 +831,14 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
845 return false; 831 return false;
846 if (req_offset(req) != req_offset(prev) + prev->wb_bytes) 832 if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
847 return false; 833 return false;
834 if (req->wb_page == prev->wb_page) {
835 if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
836 return false;
837 } else {
838 if (req->wb_pgbase != 0 ||
839 prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
840 return false;
841 }
848 } 842 }
849 size = pgio->pg_ops->pg_test(pgio, prev, req); 843 size = pgio->pg_ops->pg_test(pgio, prev, req);
850 WARN_ON_ONCE(size > req->wb_bytes); 844 WARN_ON_ONCE(size > req->wb_bytes);
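
The coalescing rule added above: two subrequests on the same page must be byte-contiguous within that page, while across a page boundary the previous request must end exactly at PAGE_CACHE_SIZE and the next must start at pgbase 0. A minimal standalone sketch of the predicate (names mirror the kernel's; this is an illustration, not the kernel function):

	static bool reqs_contiguous(const struct nfs_page *prev,
				    const struct nfs_page *req)
	{
		if (req->wb_page == prev->wb_page)
			return req->wb_pgbase == prev->wb_pgbase + prev->wb_bytes;
		return req->wb_pgbase == 0 &&
		       prev->wb_pgbase + prev->wb_bytes == PAGE_CACHE_SIZE;
	}
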
@@ -916,7 +910,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
916 unsigned int bytes_left = 0; 910 unsigned int bytes_left = 0;
917 unsigned int offset, pgbase; 911 unsigned int offset, pgbase;
918 912
919 nfs_page_group_lock(req); 913 nfs_page_group_lock(req, false);
920 914
921 subreq = req; 915 subreq = req;
922 bytes_left = subreq->wb_bytes; 916 bytes_left = subreq->wb_bytes;
@@ -938,7 +932,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
938 if (desc->pg_recoalesce) 932 if (desc->pg_recoalesce)
939 return 0; 933 return 0;
940 /* retry add_request for this subreq */ 934 /* retry add_request for this subreq */
941 nfs_page_group_lock(req); 935 nfs_page_group_lock(req, false);
942 continue; 936 continue;
943 } 937 }
944 938
@@ -1013,7 +1007,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1013 } while (ret); 1007 } while (ret);
1014 return ret; 1008 return ret;
1015} 1009}
1016EXPORT_SYMBOL_GPL(nfs_pageio_add_request); 1010
1011/*
1012 * nfs_pageio_resend - Transfer requests to new descriptor and resend
1013 * @hdr - the pgio header to move request from
1014 * @desc - the pageio descriptor to add requests to
1015 *
1016 * Try to move each request (nfs_page) from @hdr to @desc then attempt
1017 * to send them.
1018 *
1019 * Returns 0 on success and < 0 on error.
1020 */
1021int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
1022 struct nfs_pgio_header *hdr)
1023{
1024 LIST_HEAD(failed);
1025
1026 desc->pg_dreq = hdr->dreq;
1027 while (!list_empty(&hdr->pages)) {
1028 struct nfs_page *req = nfs_list_entry(hdr->pages.next);
1029
1030 nfs_list_remove_request(req);
1031 if (!nfs_pageio_add_request(desc, req))
1032 nfs_list_add_request(req, &failed);
1033 }
1034 nfs_pageio_complete(desc);
1035 if (!list_empty(&failed)) {
1036 list_move(&failed, &hdr->pages);
1037 return -EIO;
1038 }
1039 return 0;
1040}
1041EXPORT_SYMBOL_GPL(nfs_pageio_resend);
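
nfs_pageio_resend() factors out the retry loop that pnfs_read_done_resend_to_mds() and pnfs_write_done_resend_to_mds() used to open-code. A caller now just aims a fresh descriptor at the MDS and hands it the header, as the pnfs.c hunks further down do:

	nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
			      hdr->completion_ops);
	return nfs_pageio_resend(&pgio, hdr);	/* -EIO if any request failed to queue */
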
1017 1042
1018/** 1043/**
1019 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 1044 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
@@ -1029,7 +1054,6 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1029 break; 1054 break;
1030 } 1055 }
1031} 1056}
1032EXPORT_SYMBOL_GPL(nfs_pageio_complete);
1033 1057
1034/** 1058/**
1035 * nfs_pageio_cond_complete - Conditional I/O completion 1059 * nfs_pageio_cond_complete - Conditional I/O completion
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 6fdcd233d6f7..0a5dda4d85c2 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -361,6 +361,44 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
361} 361}
362EXPORT_SYMBOL_GPL(pnfs_put_lseg); 362EXPORT_SYMBOL_GPL(pnfs_put_lseg);
363 363
364static void pnfs_free_lseg_async_work(struct work_struct *work)
365{
366 struct pnfs_layout_segment *lseg;
367 struct pnfs_layout_hdr *lo;
368
369 lseg = container_of(work, struct pnfs_layout_segment, pls_work);
370 lo = lseg->pls_layout;
371
372 pnfs_free_lseg(lseg);
373 pnfs_put_layout_hdr(lo);
374}
375
376static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg)
377{
378 INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
379 schedule_work(&lseg->pls_work);
380}
381
382void
383pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg)
384{
385 if (!lseg)
386 return;
387
388 assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock);
389
390 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
391 atomic_read(&lseg->pls_refcount),
392 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
393 if (atomic_dec_and_test(&lseg->pls_refcount)) {
394 struct pnfs_layout_hdr *lo = lseg->pls_layout;
395 pnfs_get_layout_hdr(lo);
396 pnfs_layout_remove_lseg(lo, lseg);
397 pnfs_free_lseg_async(lseg);
398 }
399}
400EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked);
401
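
pnfs_put_lseg_locked() above may be called with the inode spinlock held, so when the last reference drops it cannot tear the segment down inline (pnfs_put_layout_hdr() can sleep). The free is deferred to process context through a work item; the pattern used in the hunk:

	/* last ref dropped under inode->i_lock: defer the teardown */
	pnfs_get_layout_hdr(lo);		/* pin the header across the async free */
	pnfs_layout_remove_lseg(lo, lseg);
	INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work);
	schedule_work(&lseg->pls_work);		/* worker frees lseg, then puts lo */
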
364static u64 402static u64
365end_offset(u64 start, u64 len) 403end_offset(u64 start, u64 len)
366{ 404{
@@ -577,6 +615,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
577 dprintk("%s freeing layout for inode %lu\n", __func__, 615 dprintk("%s freeing layout for inode %lu\n", __func__,
578 lo->plh_inode->i_ino); 616 lo->plh_inode->i_ino);
579 inode = lo->plh_inode; 617 inode = lo->plh_inode;
618
619 pnfs_layoutcommit_inode(inode, false);
620
580 spin_lock(&inode->i_lock); 621 spin_lock(&inode->i_lock);
581 list_del_init(&lo->plh_bulk_destroy); 622 list_del_init(&lo->plh_bulk_destroy);
582 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ 623 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
@@ -665,17 +706,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
665 return (s32)(s1 - s2) > 0; 706 return (s32)(s1 - s2) > 0;
666} 707}
667 708
668static void
669pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo,
670 const nfs4_stateid *new,
671 struct list_head *free_me_list)
672{
673 if (nfs4_stateid_match_other(&lo->plh_stateid, new))
674 return;
675 /* Layout is new! Kill existing layout segments */
676 pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL);
677}
678
679/* update lo->plh_stateid with new if is more recent */ 709/* update lo->plh_stateid with new if is more recent */
680void 710void
681pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, 711pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
@@ -732,7 +762,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
732 status = -EAGAIN; 762 status = -EAGAIN;
733 } else if (!nfs4_valid_open_stateid(open_state)) { 763 } else if (!nfs4_valid_open_stateid(open_state)) {
734 status = -EBADF; 764 status = -EBADF;
735 } else if (list_empty(&lo->plh_segs)) { 765 } else if (list_empty(&lo->plh_segs) ||
766 test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
736 int seq; 767 int seq;
737 768
738 do { 769 do {
@@ -847,6 +878,16 @@ _pnfs_return_layout(struct inode *ino)
847 empty = list_empty(&lo->plh_segs); 878 empty = list_empty(&lo->plh_segs);
848 pnfs_clear_layoutcommit(ino, &tmp_list); 879 pnfs_clear_layoutcommit(ino, &tmp_list);
849 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); 880 pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
881
882 if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
883 struct pnfs_layout_range range = {
884 .iomode = IOMODE_ANY,
885 .offset = 0,
886 .length = NFS4_MAX_UINT64,
887 };
888 NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);
889 }
890
850 /* Don't send a LAYOUTRETURN if list was initially empty */ 891 /* Don't send a LAYOUTRETURN if list was initially empty */
851 if (empty) { 892 if (empty) {
852 spin_unlock(&ino->i_lock); 893 spin_unlock(&ino->i_lock);
@@ -854,6 +895,8 @@ _pnfs_return_layout(struct inode *ino)
854 dprintk("NFS: %s no layout segments to return\n", __func__); 895 dprintk("NFS: %s no layout segments to return\n", __func__);
855 goto out; 896 goto out;
856 } 897 }
898
899 set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
857 lo->plh_block_lgets++; 900 lo->plh_block_lgets++;
858 spin_unlock(&ino->i_lock); 901 spin_unlock(&ino->i_lock);
859 pnfs_free_lseg_list(&tmp_list); 902 pnfs_free_lseg_list(&tmp_list);
@@ -1341,25 +1384,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1341 goto out; 1384 goto out;
1342 } 1385 }
1343 1386
1387 init_lseg(lo, lseg);
1388 lseg->pls_range = res->range;
1389
1344 spin_lock(&ino->i_lock); 1390 spin_lock(&ino->i_lock);
1345 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { 1391 if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1346 dprintk("%s forget reply due to recall\n", __func__); 1392 dprintk("%s forget reply due to recall\n", __func__);
1347 goto out_forget_reply; 1393 goto out_forget_reply;
1348 } 1394 }
1349 1395
1350 if (pnfs_layoutgets_blocked(lo, 1) || 1396 if (pnfs_layoutgets_blocked(lo, 1)) {
1351 pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1352 dprintk("%s forget reply due to state\n", __func__); 1397 dprintk("%s forget reply due to state\n", __func__);
1353 goto out_forget_reply; 1398 goto out_forget_reply;
1354 } 1399 }
1355 1400
1356 /* Check that the new stateid matches the old stateid */ 1401 if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
1357 pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); 1402 /* existing state ID, make sure the sequence number matches. */
1358 /* Done processing layoutget. Set the layout stateid */ 1403 if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
1359 pnfs_set_layout_stateid(lo, &res->stateid, false); 1404 dprintk("%s forget reply due to sequence\n", __func__);
1405 goto out_forget_reply;
1406 }
1407 pnfs_set_layout_stateid(lo, &res->stateid, false);
1408 } else {
1409 /*
1410 * We got an entirely new state ID. Mark all segments for the
1411 * inode invalid, and don't bother validating the stateid
1412 * sequence number.
1413 */
1414 pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
1415
1416 nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
1417 lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
1418 }
1419
1420 clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
1360 1421
1361 init_lseg(lo, lseg);
1362 lseg->pls_range = res->range;
1363 pnfs_get_lseg(lseg); 1422 pnfs_get_lseg(lseg);
1364 pnfs_layout_insert_lseg(lo, lseg); 1423 pnfs_layout_insert_lseg(lo, lseg);
1365 1424
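
The pnfs_layout_process() rework above keys stateid handling off the "other" field: a match means the reply belongs to the layout state already held, so only the sequence number is validated; a mismatch means the server handed out an entirely new layout stateid, so all cached segments are invalidated and the new stateid is adopted along with its barrier. Condensed decision sketch of the hunk:

	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
		if (pnfs_layout_stateid_blocked(lo, &res->stateid))
			goto out_forget_reply;			/* stale sequence */
		pnfs_set_layout_stateid(lo, &res->stateid, false);
	} else {
		pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL);
		nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
		lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
	}
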
@@ -1470,41 +1529,19 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1470} 1529}
1471EXPORT_SYMBOL_GPL(pnfs_generic_pg_test); 1530EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
1472 1531
1473int pnfs_write_done_resend_to_mds(struct inode *inode, 1532int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
1474 struct list_head *head,
1475 const struct nfs_pgio_completion_ops *compl_ops,
1476 struct nfs_direct_req *dreq)
1477{ 1533{
1478 struct nfs_pageio_descriptor pgio; 1534 struct nfs_pageio_descriptor pgio;
1479 LIST_HEAD(failed);
1480 1535
1481 /* Resend all requests through the MDS */ 1536 /* Resend all requests through the MDS */
1482 nfs_pageio_init_write(&pgio, inode, FLUSH_STABLE, true, compl_ops); 1537 nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
1483 pgio.pg_dreq = dreq; 1538 hdr->completion_ops);
1484 while (!list_empty(head)) { 1539 return nfs_pageio_resend(&pgio, hdr);
1485 struct nfs_page *req = nfs_list_entry(head->next);
1486
1487 nfs_list_remove_request(req);
1488 if (!nfs_pageio_add_request(&pgio, req))
1489 nfs_list_add_request(req, &failed);
1490 }
1491 nfs_pageio_complete(&pgio);
1492
1493 if (!list_empty(&failed)) {
1494 /* For some reason our attempt to resend pages. Mark the
1495 * overall send request as having failed, and let
1496 * nfs_writeback_release_full deal with the error.
1497 */
1498 list_move(&failed, head);
1499 return -EIO;
1500 }
1501 return 0;
1502} 1540}
1503EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); 1541EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
1504 1542
1505static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data) 1543static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
1506{ 1544{
1507 struct nfs_pgio_header *hdr = data->header;
1508 1545
1509 dprintk("pnfs write error = %d\n", hdr->pnfs_error); 1546 dprintk("pnfs write error = %d\n", hdr->pnfs_error);
1510 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1547 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
@@ -1512,50 +1549,42 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_data *data)
1512 pnfs_return_layout(hdr->inode); 1549 pnfs_return_layout(hdr->inode);
1513 } 1550 }
1514 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1551 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1515 data->task.tk_status = pnfs_write_done_resend_to_mds(hdr->inode, 1552 hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
1516 &hdr->pages,
1517 hdr->completion_ops,
1518 hdr->dreq);
1519} 1553}
1520 1554
1521/* 1555/*
1522 * Called by non rpc-based layout drivers 1556 * Called by non rpc-based layout drivers
1523 */ 1557 */
1524void pnfs_ld_write_done(struct nfs_pgio_data *data) 1558void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
1525{ 1559{
1526 struct nfs_pgio_header *hdr = data->header; 1560 trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
1527
1528 trace_nfs4_pnfs_write(data, hdr->pnfs_error);
1529 if (!hdr->pnfs_error) { 1561 if (!hdr->pnfs_error) {
1530 pnfs_set_layoutcommit(data); 1562 pnfs_set_layoutcommit(hdr);
1531 hdr->mds_ops->rpc_call_done(&data->task, data); 1563 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1532 } else 1564 } else
1533 pnfs_ld_handle_write_error(data); 1565 pnfs_ld_handle_write_error(hdr);
1534 hdr->mds_ops->rpc_release(data); 1566 hdr->mds_ops->rpc_release(hdr);
1535} 1567}
1536EXPORT_SYMBOL_GPL(pnfs_ld_write_done); 1568EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
1537 1569
1538static void 1570static void
1539pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1540 struct nfs_pgio_data *data) 1572 struct nfs_pgio_header *hdr)
1541{ 1573{
1542 struct nfs_pgio_header *hdr = data->header;
1543
1544 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1545 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1575 list_splice_tail_init(&hdr->pages, &desc->pg_list);
1546 nfs_pageio_reset_write_mds(desc); 1576 nfs_pageio_reset_write_mds(desc);
1547 desc->pg_recoalesce = 1; 1577 desc->pg_recoalesce = 1;
1548 } 1578 }
1549 nfs_pgio_data_release(data); 1579 nfs_pgio_data_destroy(hdr);
1550} 1580}
1551 1581
1552static enum pnfs_try_status 1582static enum pnfs_try_status
1553pnfs_try_to_write_data(struct nfs_pgio_data *wdata, 1583pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
1554 const struct rpc_call_ops *call_ops, 1584 const struct rpc_call_ops *call_ops,
1555 struct pnfs_layout_segment *lseg, 1585 struct pnfs_layout_segment *lseg,
1556 int how) 1586 int how)
1557{ 1587{
1558 struct nfs_pgio_header *hdr = wdata->header;
1559 struct inode *inode = hdr->inode; 1588 struct inode *inode = hdr->inode;
1560 enum pnfs_try_status trypnfs; 1589 enum pnfs_try_status trypnfs;
1561 struct nfs_server *nfss = NFS_SERVER(inode); 1590 struct nfs_server *nfss = NFS_SERVER(inode);
@@ -1563,8 +1592,8 @@ pnfs_try_to_write_data(struct nfs_pgio_data *wdata,
1563 hdr->mds_ops = call_ops; 1592 hdr->mds_ops = call_ops;
1564 1593
1565 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__, 1594 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
1566 inode->i_ino, wdata->args.count, wdata->args.offset, how); 1595 inode->i_ino, hdr->args.count, hdr->args.offset, how);
1567 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how); 1596 trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
1568 if (trypnfs != PNFS_NOT_ATTEMPTED) 1597 if (trypnfs != PNFS_NOT_ATTEMPTED)
1569 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE); 1598 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
1570 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs); 1599 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1575,139 +1604,105 @@ static void
1575pnfs_do_write(struct nfs_pageio_descriptor *desc, 1604pnfs_do_write(struct nfs_pageio_descriptor *desc,
1576 struct nfs_pgio_header *hdr, int how) 1605 struct nfs_pgio_header *hdr, int how)
1577{ 1606{
1578 struct nfs_pgio_data *data = hdr->data;
1579 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 1607 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1580 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1608 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1581 enum pnfs_try_status trypnfs; 1609 enum pnfs_try_status trypnfs;
1582 1610
1583 desc->pg_lseg = NULL; 1611 desc->pg_lseg = NULL;
1584 trypnfs = pnfs_try_to_write_data(data, call_ops, lseg, how); 1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1585 if (trypnfs == PNFS_NOT_ATTEMPTED) 1613 if (trypnfs == PNFS_NOT_ATTEMPTED)
1586 pnfs_write_through_mds(desc, data); 1614 pnfs_write_through_mds(desc, hdr);
1587 pnfs_put_lseg(lseg); 1615 pnfs_put_lseg(lseg);
1588} 1616}
1589 1617
1590static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
1591{ 1619{
1592 pnfs_put_lseg(hdr->lseg); 1620 pnfs_put_lseg(hdr->lseg);
1593 nfs_rw_header_free(hdr); 1621 nfs_pgio_header_free(hdr);
1594} 1622}
1595EXPORT_SYMBOL_GPL(pnfs_writehdr_free); 1623EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1596 1624
1597int 1625int
1598pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1599{ 1627{
1600 struct nfs_rw_header *whdr;
1601 struct nfs_pgio_header *hdr; 1628 struct nfs_pgio_header *hdr;
1602 int ret; 1629 int ret;
1603 1630
1604 whdr = nfs_rw_header_alloc(desc->pg_rw_ops); 1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1605 if (!whdr) { 1632 if (!hdr) {
1606 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1633 desc->pg_completion_ops->error_cleanup(&desc->pg_list);
1607 pnfs_put_lseg(desc->pg_lseg); 1634 pnfs_put_lseg(desc->pg_lseg);
1608 desc->pg_lseg = NULL; 1635 desc->pg_lseg = NULL;
1609 return -ENOMEM; 1636 return -ENOMEM;
1610 } 1637 }
1611 hdr = &whdr->header;
1612 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1613 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1614 atomic_inc(&hdr->refcnt);
1615 ret = nfs_generic_pgio(desc, hdr); 1640 ret = nfs_generic_pgio(desc, hdr);
1616 if (ret != 0) { 1641 if (ret != 0) {
1617 pnfs_put_lseg(desc->pg_lseg); 1642 pnfs_put_lseg(desc->pg_lseg);
1618 desc->pg_lseg = NULL; 1643 desc->pg_lseg = NULL;
1619 } else 1644 } else
1620 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1645 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1621 if (atomic_dec_and_test(&hdr->refcnt))
1622 hdr->completion_ops->completion(hdr);
1623 return ret; 1646 return ret;
1624} 1647}
1625EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
1626 1649
1627int pnfs_read_done_resend_to_mds(struct inode *inode, 1650int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
1628 struct list_head *head,
1629 const struct nfs_pgio_completion_ops *compl_ops,
1630 struct nfs_direct_req *dreq)
1631{ 1651{
1632 struct nfs_pageio_descriptor pgio; 1652 struct nfs_pageio_descriptor pgio;
1633 LIST_HEAD(failed);
1634 1653
1635 /* Resend all requests through the MDS */ 1654 /* Resend all requests through the MDS */
1636 nfs_pageio_init_read(&pgio, inode, true, compl_ops); 1655 nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
1637 pgio.pg_dreq = dreq; 1656 return nfs_pageio_resend(&pgio, hdr);
1638 while (!list_empty(head)) {
1639 struct nfs_page *req = nfs_list_entry(head->next);
1640
1641 nfs_list_remove_request(req);
1642 if (!nfs_pageio_add_request(&pgio, req))
1643 nfs_list_add_request(req, &failed);
1644 }
1645 nfs_pageio_complete(&pgio);
1646
1647 if (!list_empty(&failed)) {
1648 list_move(&failed, head);
1649 return -EIO;
1650 }
1651 return 0;
1652} 1657}
1653EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds); 1658EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
1654 1659
1655static void pnfs_ld_handle_read_error(struct nfs_pgio_data *data) 1660static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
1656{ 1661{
1657 struct nfs_pgio_header *hdr = data->header;
1658
1659 dprintk("pnfs read error = %d\n", hdr->pnfs_error); 1662 dprintk("pnfs read error = %d\n", hdr->pnfs_error);
1660 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags & 1663 if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
1661 PNFS_LAYOUTRET_ON_ERROR) { 1664 PNFS_LAYOUTRET_ON_ERROR) {
1662 pnfs_return_layout(hdr->inode); 1665 pnfs_return_layout(hdr->inode);
1663 } 1666 }
1664 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) 1667 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
1665 data->task.tk_status = pnfs_read_done_resend_to_mds(hdr->inode, 1668 hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
1666 &hdr->pages,
1667 hdr->completion_ops,
1668 hdr->dreq);
1669} 1669}
1670 1670
1671/* 1671/*
1672 * Called by non rpc-based layout drivers 1672 * Called by non rpc-based layout drivers
1673 */ 1673 */
1674void pnfs_ld_read_done(struct nfs_pgio_data *data) 1674void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
1675{ 1675{
1676 struct nfs_pgio_header *hdr = data->header; 1676 trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
1677
1678 trace_nfs4_pnfs_read(data, hdr->pnfs_error);
1679 if (likely(!hdr->pnfs_error)) { 1677 if (likely(!hdr->pnfs_error)) {
1680 __nfs4_read_done_cb(data); 1678 __nfs4_read_done_cb(hdr);
1681 hdr->mds_ops->rpc_call_done(&data->task, data); 1679 hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
1682 } else 1680 } else
1683 pnfs_ld_handle_read_error(data); 1681 pnfs_ld_handle_read_error(hdr);
1684 hdr->mds_ops->rpc_release(data); 1682 hdr->mds_ops->rpc_release(hdr);
1685} 1683}
1686EXPORT_SYMBOL_GPL(pnfs_ld_read_done); 1684EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
 
 static void
 pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
-		struct nfs_pgio_data *data)
+		struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
 	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
 		list_splice_tail_init(&hdr->pages, &desc->pg_list);
 		nfs_pageio_reset_read_mds(desc);
 		desc->pg_recoalesce = 1;
 	}
-	nfs_pgio_data_release(data);
+	nfs_pgio_data_destroy(hdr);
 }
 
 /*
  * Call the appropriate parallel I/O subsystem read function.
  */
 static enum pnfs_try_status
-pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
+pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
 		       const struct rpc_call_ops *call_ops,
 		       struct pnfs_layout_segment *lseg)
 {
-	struct nfs_pgio_header *hdr = rdata->header;
 	struct inode *inode = hdr->inode;
 	struct nfs_server *nfss = NFS_SERVER(inode);
 	enum pnfs_try_status trypnfs;
@@ -1715,9 +1710,9 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
 	hdr->mds_ops = call_ops;
 
 	dprintk("%s: Reading ino:%lu %u@%llu\n",
-		__func__, inode->i_ino, rdata->args.count, rdata->args.offset);
+		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
 
-	trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
+	trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
 	if (trypnfs != PNFS_NOT_ATTEMPTED)
 		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
@@ -1727,52 +1722,46 @@ pnfs_try_to_read_data(struct nfs_pgio_data *rdata,
 static void
 pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_data *data = hdr->data;
 	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
 	struct pnfs_layout_segment *lseg = desc->pg_lseg;
 	enum pnfs_try_status trypnfs;
 
 	desc->pg_lseg = NULL;
-	trypnfs = pnfs_try_to_read_data(data, call_ops, lseg);
+	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
 	if (trypnfs == PNFS_NOT_ATTEMPTED)
-		pnfs_read_through_mds(desc, data);
+		pnfs_read_through_mds(desc, hdr);
 	pnfs_put_lseg(lseg);
 }
 
 static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 {
 	pnfs_put_lseg(hdr->lseg);
-	nfs_rw_header_free(hdr);
+	nfs_pgio_header_free(hdr);
 }
 EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
 
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_rw_header *rhdr;
 	struct nfs_pgio_header *hdr;
 	int ret;
 
-	rhdr = nfs_rw_header_alloc(desc->pg_rw_ops);
-	if (!rhdr) {
+	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
+	if (!hdr) {
 		desc->pg_completion_ops->error_cleanup(&desc->pg_list);
 		ret = -ENOMEM;
 		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 		return ret;
 	}
-	hdr = &rhdr->header;
 	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
 	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
-	atomic_inc(&hdr->refcnt);
 	ret = nfs_generic_pgio(desc, hdr);
 	if (ret != 0) {
 		pnfs_put_lseg(desc->pg_lseg);
 		desc->pg_lseg = NULL;
 	} else
 		pnfs_do_read(desc, hdr);
-	if (atomic_dec_and_test(&hdr->refcnt))
-		hdr->completion_ops->completion(hdr);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
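The readpages path now allocates the header directly and no longer juggles a local refcount; completion runs from the generic I/O code instead. A sketch of the simplified allocate/use/free shape, assuming (as above) that the header is the slab object itself; stand-in types only:

    #include <stdlib.h>

    struct pgio_header { int lseg_held; /* ... I/O state ... */ };

    static struct pgio_header *pgio_header_alloc(void)
    {
            /* one allocation: no wrapper struct, no &wrapper->header step */
            return calloc(1, sizeof(struct pgio_header));
    }

    static void pgio_header_free(struct pgio_header *hdr)
    {
            free(hdr);
    }

    int main(void)
    {
            struct pgio_header *hdr = pgio_header_alloc();
            if (!hdr)
                    return 1;       /* the -ENOMEM branch above */
            pgio_header_free(hdr);
            return 0;
    }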
@@ -1820,12 +1809,11 @@ void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
 EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
 
 void
-pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
+pnfs_set_layoutcommit(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = wdata->header;
 	struct inode *inode = hdr->inode;
 	struct nfs_inode *nfsi = NFS_I(inode);
-	loff_t end_pos = wdata->mds_offset + wdata->res.count;
+	loff_t end_pos = hdr->mds_offset + hdr->res.count;
 	bool mark_as_dirty = false;
 
 	spin_lock(&inode->i_lock);
@@ -1851,6 +1839,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_data *wdata)
 }
 EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
 
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data)
+{
+	struct inode *inode = data->inode;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	bool mark_as_dirty = false;
+
+	spin_lock(&inode->i_lock);
+	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
+		mark_as_dirty = true;
+		dprintk("%s: Set layoutcommit for inode %lu ",
+			__func__, inode->i_ino);
+	}
+	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) {
+		/* references matched in nfs4_layoutcommit_release */
+		pnfs_get_lseg(data->lseg);
+	}
+	if (data->lwb > nfsi->layout->plh_lwb)
+		nfsi->layout->plh_lwb = data->lwb;
+	spin_unlock(&inode->i_lock);
+	dprintk("%s: lseg %p end_pos %llu\n",
+		__func__, data->lseg, nfsi->layout->plh_lwb);
+
+	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
+	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
+	if (mark_as_dirty)
+		mark_inode_dirty_sync(inode);
+}
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit);
+
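The new pnfs_commit_set_layoutcommit() keeps the layout's last-write-byte growing monotonically under the inode lock and dirties the inode only for the first writer. The same bookkeeping in runnable user-space form (a pthread mutex standing in for i_lock; names invented):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t ilock = PTHREAD_MUTEX_INITIALIZER;
    static long long plh_lwb;        /* last write byte seen so far */
    static bool layoutcommit_set;    /* NFS_INO_LAYOUTCOMMIT stand-in */

    static void commit_set_layoutcommit(long long lwb)
    {
            bool mark_dirty = false;

            pthread_mutex_lock(&ilock);
            if (!layoutcommit_set) {
                    layoutcommit_set = true;
                    mark_dirty = true;   /* only the first caller marks dirty */
            }
            if (lwb > plh_lwb)
                    plh_lwb = lwb;       /* lwb only ever grows */
            pthread_mutex_unlock(&ilock);

            if (mark_dirty)
                    printf("inode dirtied, lwb now %lld\n", plh_lwb);
    }

    int main(void)
    {
            commit_set_layoutcommit(8192);
            commit_set_layoutcommit(4096);  /* smaller value cannot regress lwb */
            return 0;
    }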
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 {
 	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
@@ -1871,6 +1888,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
 int
 pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
 	struct nfs4_layoutcommit_data *data;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	loff_t end_pos;
@@ -1885,7 +1903,7 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
 		if (!sync)
 			goto out;
-		status = wait_on_bit_lock(&nfsi->flags,
+		status = wait_on_bit_lock_action(&nfsi->flags,
 				NFS_INO_LAYOUTCOMMITTING,
 				nfs_wait_bit_killable,
 				TASK_KILLABLE);
@@ -1921,6 +1939,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 	data->args.lastbytewritten = end_pos - 1;
 	data->res.server = NFS_SERVER(inode);
 
+	if (ld->prepare_layoutcommit) {
+		status = ld->prepare_layoutcommit(&data->args);
+		if (status) {
+			spin_lock(&inode->i_lock);
+			if (end_pos < nfsi->layout->plh_lwb)
+				nfsi->layout->plh_lwb = end_pos;
+			spin_unlock(&inode->i_lock);
+			put_rpccred(data->cred);
+			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
+			goto clear_layoutcommitting;
+		}
+	}
+
+
 	status = nfs4_proc_layoutcommit(data, sync);
 out:
 	if (status)
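prepare_layoutcommit is an optional hook: the generic code calls it only when the driver provides one, and a failure unwinds the pending commit instead of issuing the RPC. The optional-ops-table pattern in runnable form (toy types, not the kernel structures):

    #include <stdio.h>

    struct layout_ops {
            const char *name;
            int (*prepare_layoutcommit)(long long lastbytewritten); /* may be NULL */
    };

    static int do_layoutcommit(const struct layout_ops *ld, long long lwb)
    {
            if (ld->prepare_layoutcommit) {          /* hook is optional */
                    int err = ld->prepare_layoutcommit(lwb);
                    if (err)
                            return err;              /* unwind, skip the RPC */
            }
            printf("%s: sending LAYOUTCOMMIT up to %lld\n", ld->name, lwb);
            return 0;
    }

    static int block_prepare(long long lwb) { return lwb ? 0 : -1; }

    int main(void)
    {
            struct layout_ops files = { "files", NULL };
            struct layout_ops block = { "block", block_prepare };
            do_layoutcommit(&files, 4095);
            return do_layoutcommit(&block, 4095);
    }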
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 4fb309a2b4c4..9ae5b765b073 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -32,6 +32,7 @@
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
+#include <linux/workqueue.h>
 
 enum {
 	NFS_LSEG_VALID = 0,	/* cleared when lseg is recalled/returned */
@@ -46,6 +47,7 @@ struct pnfs_layout_segment {
 	atomic_t pls_refcount;
 	unsigned long pls_flags;
 	struct pnfs_layout_hdr *pls_layout;
+	struct work_struct pls_work;
 };
 
 enum pnfs_try_status {
@@ -63,12 +65,15 @@ enum {
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
 	NFS_LAYOUT_ROC,			/* some lseg had roc bit set */
 	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
+	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 };
 
 enum layoutdriver_policy_flags {
-	/* Should the pNFS client commit and return the layout upon a setattr */
+	/* Should the pNFS client commit and return the layout upon truncate to
+	 * a smaller size */
 	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
 	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
+	PNFS_READ_WHOLE_PAGE		= 1 << 2,
 };
 
 struct nfs4_deviceid_node;
@@ -80,6 +85,7 @@ struct pnfs_layoutdriver_type {
 	const char *name;
 	struct module *owner;
 	unsigned flags;
+	unsigned max_deviceinfo_size;
 
 	int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *);
 	int (*clear_layoutdriver) (struct nfs_server *);
@@ -90,6 +96,9 @@ struct pnfs_layoutdriver_type {
 	struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags);
 	void (*free_lseg) (struct pnfs_layout_segment *lseg);
 
+	void (*return_range) (struct pnfs_layout_hdr *lo,
+			      struct pnfs_layout_range *range);
+
 	/* test for nfs page cache coalescing */
 	const struct nfs_pageio_ops *pg_read_ops;
 	const struct nfs_pageio_ops *pg_write_ops;
@@ -104,6 +113,8 @@ struct pnfs_layoutdriver_type {
 				int max);
 	void (*recover_commit_reqs) (struct list_head *list,
 				     struct nfs_commit_info *cinfo);
+	struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+						struct page *page);
 	int (*commit_pagelist)(struct inode *inode,
 			       struct list_head *mds_pages,
 			       int how,
@@ -113,18 +124,21 @@ struct pnfs_layoutdriver_type {
 	 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
 	 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
 	 */
-	enum pnfs_try_status (*read_pagelist) (struct nfs_pgio_data *nfs_data);
-	enum pnfs_try_status (*write_pagelist) (struct nfs_pgio_data *nfs_data, int how);
+	enum pnfs_try_status (*read_pagelist)(struct nfs_pgio_header *);
+	enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int);
 
 	void (*free_deviceid_node) (struct nfs4_deviceid_node *);
+	struct nfs4_deviceid_node * (*alloc_deviceid_node)
+			(struct nfs_server *server, struct pnfs_device *pdev,
+			 gfp_t gfp_flags);
 
 	void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutreturn_args *args);
 
 	void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data);
-
-	void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid,
+	int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args);
+	void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo,
 				     struct xdr_stream *xdr,
 				     const struct nfs4_layoutcommit_args *args);
 };
@@ -167,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
 extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
 
 /* nfs4proc.c */
-extern int nfs4_proc_getdevicelist(struct nfs_server *server,
-				   const struct nfs_fh *fh,
-				   struct pnfs_devicelist *devlist);
 extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
 				   struct pnfs_device *dev,
 				   struct rpc_cred *cred);
@@ -179,6 +190,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
 /* pnfs.c */
 void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
+void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
 
 void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
 void unset_pnfs_layoutdriver(struct nfs_server *);
@@ -213,13 +225,14 @@ bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
 bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task);
-void pnfs_set_layoutcommit(struct nfs_pgio_data *wdata);
+void pnfs_set_layoutcommit(struct nfs_pgio_header *);
+void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data);
 void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
 int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
 int _pnfs_return_layout(struct inode *);
 int pnfs_commit_and_return_layout(struct inode *);
-void pnfs_ld_write_done(struct nfs_pgio_data *);
-void pnfs_ld_read_done(struct nfs_pgio_data *);
+void pnfs_ld_write_done(struct nfs_pgio_header *);
+void pnfs_ld_read_done(struct nfs_pgio_header *);
 struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       struct nfs_open_context *ctx,
 					       loff_t pos,
@@ -228,12 +241,8 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
 					       gfp_t gfp_flags);
 
 void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
-int pnfs_read_done_resend_to_mds(struct inode *inode, struct list_head *head,
-			const struct nfs_pgio_completion_ops *compl_ops,
-			struct nfs_direct_req *dreq);
-int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head,
-			const struct nfs_pgio_completion_ops *compl_ops,
-			struct nfs_direct_req *dreq);
+int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
+int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
 struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
 
 /* nfs4_deviceid_flags */
@@ -254,18 +263,25 @@ struct nfs4_deviceid_node {
 	atomic_t ref;
 };
 
-struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
+struct nfs4_deviceid_node *
+nfs4_find_get_deviceid(struct nfs_server *server,
+		       const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		       gfp_t gfp_mask);
 void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *);
-void nfs4_init_deviceid_node(struct nfs4_deviceid_node *,
-			     const struct pnfs_layoutdriver_type *,
-			     const struct nfs_client *,
+void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *,
			     const struct nfs4_deviceid *);
-struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *);
 bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *);
 void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
 bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
 void nfs4_deviceid_purge_client(const struct nfs_client *);
 
+static inline struct nfs4_deviceid_node *
+nfs4_get_deviceid(struct nfs4_deviceid_node *d)
+{
+	atomic_inc(&d->ref);
+	return d;
+}
+
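nfs4_get_deviceid() is a plain take-a-reference helper that returns its argument so it composes inside expressions. An equivalent user-space form with C11 atomics (names invented):

    #include <stdatomic.h>
    #include <stdio.h>

    struct deviceid_node { atomic_int ref; };

    static inline struct deviceid_node *get_deviceid(struct deviceid_node *d)
    {
            atomic_fetch_add(&d->ref, 1);  /* caller now holds a reference */
            return d;                      /* ...and can use it inline */
    }

    int main(void)
    {
            struct deviceid_node n = { 1 };
            printf("refs after get: %d\n",
                   atomic_load(&get_deviceid(&n)->ref));
            return 0;
    }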
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -345,6 +361,17 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
 	NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+
+	if (ld == NULL || ld->search_commit_reqs == NULL)
+		return NULL;
+	return ld->search_commit_reqs(cinfo, page);
+}
+
 /* Should the pNFS client commit and return the layout upon a setattr */
 static inline bool
 pnfs_ld_layoutret_on_setattr(struct inode *inode)
@@ -356,6 +383,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 }
 
 static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	if (!pnfs_enabled_sb(NFS_SERVER(inode)))
+		return false;
+	return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE;
+}
+
+static inline bool
 pnfs_layoutcommit_outstanding(struct inode *inode)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
@@ -427,6 +462,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode)
 }
 
 static inline bool
+pnfs_ld_read_whole_page(struct inode *inode)
+{
+	return false;
+}
+
+static inline bool
 pnfs_roc(struct inode *ino)
 {
 	return false;
@@ -496,6 +537,13 @@ pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
 {
 }
 
+static inline struct nfs_page *
+pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
+			struct page *page)
+{
+	return NULL;
+}
+
 static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
 {
 	return 0;
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 6da209bd9408..aa2ec0015183 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -29,6 +29,9 @@
  */
 
 #include <linux/export.h>
+#include <linux/nfs_fs.h>
+#include "nfs4session.h"
+#include "internal.h"
 #include "pnfs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS
@@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
 	return NULL;
 }
 
+static struct nfs4_deviceid_node *
+nfs4_get_device_info(struct nfs_server *server,
+		     const struct nfs4_deviceid *dev_id,
+		     struct rpc_cred *cred, gfp_t gfp_flags)
+{
+	struct nfs4_deviceid_node *d = NULL;
+	struct pnfs_device *pdev = NULL;
+	struct page **pages = NULL;
+	u32 max_resp_sz;
+	int max_pages;
+	int rc, i;
+
+	/*
+	 * Use the session max response size as the basis for setting
+	 * GETDEVICEINFO's maxcount
+	 */
+	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
+	if (server->pnfs_curr_ld->max_deviceinfo_size &&
+	    server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz)
+		max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size;
+	max_pages = nfs_page_array_len(0, max_resp_sz);
+	dprintk("%s: server %p max_resp_sz %u max_pages %d\n",
+		__func__, server, max_resp_sz, max_pages);
+
+	pdev = kzalloc(sizeof(*pdev), gfp_flags);
+	if (!pdev)
+		return NULL;
+
+	pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags);
+	if (!pages)
+		goto out_free_pdev;
+
+	for (i = 0; i < max_pages; i++) {
+		pages[i] = alloc_page(gfp_flags);
+		if (!pages[i])
+			goto out_free_pages;
+	}
+
+	memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
+	pdev->layout_type = server->pnfs_curr_ld->id;
+	pdev->pages = pages;
+	pdev->pgbase = 0;
+	pdev->pglen = max_resp_sz;
+	pdev->mincount = 0;
+	pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead;
+
+	rc = nfs4_proc_getdeviceinfo(server, pdev, cred);
+	dprintk("%s getdevice info returns %d\n", __func__, rc);
+	if (rc)
+		goto out_free_pages;
+
+	/*
+	 * Found new device, need to decode it and then add it to the
+	 * list of known devices for this mountpoint.
+	 */
+	d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev,
+						      gfp_flags);
+
+out_free_pages:
+	for (i = 0; i < max_pages; i++)
+		__free_page(pages[i]);
+	kfree(pages);
+out_free_pdev:
+	kfree(pdev);
+	dprintk("<-- %s d %p\n", __func__, d);
+	return d;
+}
+
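The reply buffer for GETDEVICEINFO is sized from the session's negotiated maximum response, optionally capped by the layout driver, then rounded up to whole pages. The arithmetic isolated as a runnable sketch (page size and values assumed for the example):

    #include <stdio.h>

    #define PAGE_SZ 4096u   /* assumed page size for the example */

    static unsigned int getdevinfo_pages(unsigned int session_max_resp,
                                         unsigned int driver_cap)
    {
            unsigned int sz = session_max_resp;

            if (driver_cap && driver_cap < sz)
                    sz = driver_cap;     /* driver knows its replies are small */
            return (sz + PAGE_SZ - 1) / PAGE_SZ;  /* round up to whole pages */
    }

    int main(void)
    {
            printf("%u pages\n", getdevinfo_pages(1048576, 0));
            printf("%u pages\n", getdevinfo_pages(1048576, 4096));
            return 0;
    }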
 /*
  * Lookup a deviceid in cache and get a reference count on it if found
  *
@@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld,
  * @id deviceid to look up
  */
 static struct nfs4_deviceid_node *
-_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
-		   const struct nfs_client *clp, const struct nfs4_deviceid *id,
-		   long hash)
+__nfs4_find_get_deviceid(struct nfs_server *server,
+			 const struct nfs4_deviceid *id, long hash)
 {
 	struct nfs4_deviceid_node *d;
 
 	rcu_read_lock();
-	d = _lookup_deviceid(ld, clp, id, hash);
+	d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id,
+			     hash);
 	if (d != NULL)
 		atomic_inc(&d->ref);
 	rcu_read_unlock();
@@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
 }
 
 struct nfs4_deviceid_node *
-nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld,
-		       const struct nfs_client *clp, const struct nfs4_deviceid *id)
+nfs4_find_get_deviceid(struct nfs_server *server,
+		       const struct nfs4_deviceid *id, struct rpc_cred *cred,
+		       gfp_t gfp_mask)
 {
-	return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id));
+	long hash = nfs4_deviceid_hash(id);
+	struct nfs4_deviceid_node *d, *new;
+
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d)
+		return d;
+
+	new = nfs4_get_device_info(server, id, cred, gfp_mask);
+	if (!new)
+		return new;
+
+	spin_lock(&nfs4_deviceid_lock);
+	d = __nfs4_find_get_deviceid(server, id, hash);
+	if (d) {
+		spin_unlock(&nfs4_deviceid_lock);
+		server->pnfs_curr_ld->free_deviceid_node(new);
+		return d;
+	}
+	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
+	atomic_inc(&new->ref);
+	spin_unlock(&nfs4_deviceid_lock);
+
+	return new;
 }
 EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid);
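nfs4_find_get_deviceid() above is the classic lookup/allocate/recheck pattern: probe the cache, do the slow GETDEVICEINFO outside any lock, then recheck under the lock so a racing inserter wins and the loser frees its copy. A runnable mutex-based mirror (the kernel's lockless first probe uses RCU, approximated here by a locked lookup; names invented):

    #include <pthread.h>
    #include <stdlib.h>

    static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
    static int *cached;     /* stands in for one hash chain */

    static int *lookup(void)
    {
            pthread_mutex_lock(&cache_lock);
            int *d = cached;
            pthread_mutex_unlock(&cache_lock);
            return d;
    }

    static int *find_or_create(void)
    {
            int *d = lookup();
            if (d)
                    return d;                    /* fast path: cache hit */

            int *fresh = malloc(sizeof(*fresh)); /* slow path, no lock held */
            if (!fresh)
                    return NULL;

            pthread_mutex_lock(&cache_lock);
            if (cached) {                        /* recheck: lost the race? */
                    d = cached;
                    pthread_mutex_unlock(&cache_lock);
                    free(fresh);                 /* loser discards its copy */
                    return d;
            }
            cached = fresh;                      /* winner publishes */
            pthread_mutex_unlock(&cache_lock);
            return fresh;
    }

    int main(void)
    {
            return find_or_create() == find_or_create() ? 0 : 1;
    }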
 
@@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld,
 EXPORT_SYMBOL_GPL(nfs4_delete_deviceid);
 
 void
-nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
-			const struct pnfs_layoutdriver_type *ld,
-			const struct nfs_client *nfs_client,
+nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server,
 			const struct nfs4_deviceid *id)
 {
 	INIT_HLIST_NODE(&d->node);
 	INIT_HLIST_NODE(&d->tmpnode);
-	d->ld = ld;
-	d->nfs_client = nfs_client;
+	d->ld = server->pnfs_curr_ld;
+	d->nfs_client = server->nfs_client;
 	d->flags = 0;
 	d->deviceid = *id;
 	atomic_set(&d->ref, 1);
@@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d,
 EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node);
 
-/*
- * Uniquely initialize and insert a deviceid node into cache
- *
- * @new new deviceid node
- *      Note that the caller must set up the following members:
- *        new->ld
- *        new->nfs_client
- *        new->deviceid
- *
- * @ret the inserted node, if none found, otherwise, the found entry.
- */
-struct nfs4_deviceid_node *
-nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new)
-{
-	struct nfs4_deviceid_node *d;
-	long hash;
-
-	spin_lock(&nfs4_deviceid_lock);
-	hash = nfs4_deviceid_hash(&new->deviceid);
-	d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash);
-	if (d) {
-		spin_unlock(&nfs4_deviceid_lock);
-		return d;
-	}
-
-	hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]);
-	spin_unlock(&nfs4_deviceid_lock);
-	atomic_inc(&new->ref);
-
-	return new;
-}
-EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node);
-
 /*
  * Dereference a deviceid node and delete it when its reference count drops
  * to zero.
  *
@@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp)
 	}
 	rcu_read_unlock();
 }
-
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index c171ce1a8a30..b09cc23d6f43 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -578,46 +578,49 @@ nfs_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
 	return 0;
 }
 
-static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 
 	nfs_invalidate_atime(inode);
 	if (task->tk_status >= 0) {
-		nfs_refresh_inode(inode, data->res.fattr);
+		nfs_refresh_inode(inode, hdr->res.fattr);
 		/* Emulate the eof flag, which isn't normally needed in NFSv2
 		 * as it is guaranteed to always return the file attributes
 		 */
-		if (data->args.offset + data->res.count >= data->res.fattr->size)
-			data->res.eof = 1;
+		if (hdr->args.offset + hdr->res.count >= hdr->res.fattr->size)
+			hdr->res.eof = 1;
 	}
 	return 0;
 }
 
-static void nfs_proc_read_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_read_setup(struct nfs_pgio_header *hdr,
				struct rpc_message *msg)
 {
 	msg->rpc_proc = &nfs_procedures[NFSPROC_READ];
 }
 
-static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_proc_pgio_rpc_prepare(struct rpc_task *task,
				     struct nfs_pgio_header *hdr)
 {
 	rpc_call_start(task);
 	return 0;
 }
 
-static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_data *data)
+static int nfs_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 
 	if (task->tk_status >= 0)
-		nfs_post_op_update_inode_force_wcc(inode, data->res.fattr);
+		nfs_post_op_update_inode_force_wcc(inode, hdr->res.fattr);
 	return 0;
 }
 
-static void nfs_proc_write_setup(struct nfs_pgio_data *data, struct rpc_message *msg)
+static void nfs_proc_write_setup(struct nfs_pgio_header *hdr,
				 struct rpc_message *msg)
 {
 	/* Note: NFSv2 ignores @stable and always uses NFS_FILE_SYNC */
-	data->args.stable = NFS_FILE_SYNC;
+	hdr->args.stable = NFS_FILE_SYNC;
 	msg->rpc_proc = &nfs_procedures[NFSPROC_WRITE];
 }
 
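NFSv2 READ replies carry no eof flag, so the client infers it from the attributes that v2 is guaranteed to return, exactly as the hunk above does. The predicate on its own, runnable:

    #include <stdbool.h>
    #include <stdio.h>

    static bool v2_read_hits_eof(unsigned long long offset,
                                 unsigned long long count,
                                 unsigned long long file_size)
    {
            /* offset + count reaching the file size means nothing is left */
            return offset + count >= file_size;
    }

    int main(void)
    {
            printf("%d\n", v2_read_hits_eof(0, 4096, 4096));  /* 1: at eof */
            printf("%d\n", v2_read_hits_eof(0, 4096, 8192));  /* 0: more data */
            return 0;
    }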
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index e818a475ca64..beff2769c5c5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -33,12 +33,12 @@ static const struct nfs_rw_ops nfs_rw_read_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
 
-static struct nfs_rw_header *nfs_readhdr_alloc(void)
+static struct nfs_pgio_header *nfs_readhdr_alloc(void)
 {
 	return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL);
 }
 
-static void nfs_readhdr_free(struct nfs_rw_header *rhdr)
+static void nfs_readhdr_free(struct nfs_pgio_header *rhdr)
 {
 	kmem_cache_free(nfs_rdata_cachep, rhdr);
 }
@@ -115,12 +115,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 
 		unlock_page(req->wb_page);
 	}
-
-	dprintk("NFS: read done (%s/%Lu %d@%Ld)\n",
-		req->wb_context->dentry->d_inode->i_sb->s_id,
-		(unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode),
-		req->wb_bytes,
-		(long long)req_offset(req));
 	nfs_release_request(req);
 }
 
@@ -172,14 +166,15 @@ out:
 	hdr->release(hdr);
 }
 
-static void nfs_initiate_read(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_read(struct nfs_pgio_header *hdr,
			      struct rpc_message *msg,
 			      struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 	int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
 
 	task_setup_data->flags |= swap_flags;
-	NFS_PROTO(inode)->read_setup(data, msg);
+	NFS_PROTO(inode)->read_setup(hdr, msg);
 }
 
 static void
@@ -203,14 +198,15 @@ static const struct nfs_pgio_completion_ops nfs_async_read_completion_ops = {
  * This is the callback from RPC telling us whether a reply was
  * received or some error occurred (timeout or socket shutdown).
  */
-static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_readpage_done(struct rpc_task *task,
			     struct nfs_pgio_header *hdr,
 			     struct inode *inode)
 {
-	int status = NFS_PROTO(inode)->read_done(task, data);
+	int status = NFS_PROTO(inode)->read_done(task, hdr);
 	if (status != 0)
 		return status;
 
-	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, data->res.count);
+	nfs_add_stats(inode, NFSIOS_SERVERREADBYTES, hdr->res.count);
 
 	if (task->tk_status == -ESTALE) {
 		set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
@@ -219,34 +215,34 @@ static int nfs_readpage_done(struct rpc_task *task, struct nfs_pgio_data *data,
 	return 0;
 }
 
-static void nfs_readpage_retry(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_retry(struct rpc_task *task,
			       struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_args *argp = &data->args;
-	struct nfs_pgio_res *resp = &data->res;
+	struct nfs_pgio_args *argp = &hdr->args;
+	struct nfs_pgio_res *resp = &hdr->res;
 
 	/* This is a short read! */
-	nfs_inc_stats(data->header->inode, NFSIOS_SHORTREAD);
+	nfs_inc_stats(hdr->inode, NFSIOS_SHORTREAD);
 	/* Has the server at least made some progress? */
 	if (resp->count == 0) {
-		nfs_set_pgio_error(data->header, -EIO, argp->offset);
+		nfs_set_pgio_error(hdr, -EIO, argp->offset);
 		return;
 	}
-	/* Yes, so retry the read at the end of the data */
-	data->mds_offset += resp->count;
+	/* Yes, so retry the read at the end of the data received so far */
+	hdr->mds_offset += resp->count;
 	argp->offset += resp->count;
 	argp->pgbase += resp->count;
 	argp->count -= resp->count;
 	rpc_restart_call_prepare(task);
 }
 
-static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *data)
+static void nfs_readpage_result(struct rpc_task *task,
				struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-
-	if (data->res.eof) {
+	if (hdr->res.eof) {
 		loff_t bound;
 
-		bound = data->args.offset + data->res.count;
+		bound = hdr->args.offset + hdr->res.count;
 		spin_lock(&hdr->lock);
 		if (bound < hdr->io_start + hdr->good_bytes) {
 			set_bit(NFS_IOHDR_EOF, &hdr->flags);
@@ -254,8 +250,8 @@ static void nfs_readpage_result(struct rpc_task *task, struct nfs_pgio_data *dat
 			hdr->good_bytes = bound - hdr->io_start;
 		}
 		spin_unlock(&hdr->lock);
-	} else if (data->res.count != data->args.count)
-		nfs_readpage_retry(task, data);
+	} else if (hdr->res.count != hdr->args.count)
+		nfs_readpage_retry(task, hdr);
 }
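A short read that made progress is retried from where the reply stopped: offset, page base and remaining count all slide by the bytes received, while zero progress becomes -EIO instead of looping. The window arithmetic as a runnable sketch (toy types):

    #include <stdio.h>

    struct rw_args {
            unsigned long long offset;   /* where the next RPC reads */
            unsigned int pgbase;         /* offset into the page array */
            unsigned int count;          /* bytes still wanted */
    };

    static int short_read_advance(struct rw_args *a, unsigned int got)
    {
            if (got == 0)
                    return -1;           /* no progress: fail with EIO */
            a->offset += got;
            a->pgbase += got;
            a->count  -= got;
            return 0;                    /* caller restarts the RPC */
    }

    int main(void)
    {
            struct rw_args a = { 0, 0, 16384 };

            while (a.count) {
                    if (short_read_advance(&a, 4096))
                            break;
                    if (a.count)
                            printf("retry at %llu, %u left\n", a.offset, a.count);
            }
            return 0;
    }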
 
 /*
@@ -404,7 +400,7 @@ out:
 int __init nfs_init_readpagecache(void)
 {
 	nfs_rdata_cachep = kmem_cache_create("nfs_read_data",
-					     sizeof(struct nfs_rw_header),
+					     sizeof(struct nfs_pgio_header),
 					     0, SLAB_HWCACHE_ALIGN,
 					     NULL);
 	if (nfs_rdata_cachep == NULL)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 084af1060d79..31a11b0e885d 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1027,8 +1027,7 @@ static bool nfs_auth_info_add(struct nfs_auth_info *auth_info,
 			      rpc_authflavor_t flavor)
 {
 	unsigned int i;
-	unsigned int max_flavor_len = (sizeof(auth_info->flavors) /
-				       sizeof(auth_info->flavors[0]));
+	unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors);
 
 	/* make sure this flavor isn't already in the list */
 	for (i = 0; i < auth_info->flavor_len; i++) {
@@ -2066,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options,
 		return NFS_TEXT_DATA;
 	}
 
-#if !IS_ENABLED(CONFIG_NFS_V3)
-	if (args->version == 3)
-		goto out_v3_not_compiled;
-#endif /* !CONFIG_NFS_V3 */
-
 	return 0;
 
 out_no_data:
@@ -2086,12 +2080,6 @@ out_no_sec:
 	dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n");
 	return -EINVAL;
 
-#if !IS_ENABLED(CONFIG_NFS_V3)
-out_v3_not_compiled:
-	dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n");
-	return -EPROTONOSUPPORT;
-#endif /* !CONFIG_NFS_V3 */
-
 out_nomem:
 	dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n");
 	return -ENOMEM;
@@ -2180,7 +2168,7 @@ out_no_address:
 	return -EINVAL;
 }
 
-#define NFS_MOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
+#define NFS_REMOUNT_CMP_FLAGMASK ~(NFS_MOUNT_INTR \
 		| NFS_MOUNT_SECURE \
 		| NFS_MOUNT_TCP \
 		| NFS_MOUNT_VER3 \
@@ -2188,15 +2176,16 @@ out_no_address:
 		| NFS_MOUNT_NONLM \
 		| NFS_MOUNT_BROKEN_SUID \
 		| NFS_MOUNT_STRICTLOCK \
-		| NFS_MOUNT_UNSHARED \
-		| NFS_MOUNT_NORESVPORT \
 		| NFS_MOUNT_LEGACY_INTERFACE)
 
+#define NFS_MOUNT_CMP_FLAGMASK (NFS_REMOUNT_CMP_FLAGMASK & \
+		~(NFS_MOUNT_UNSHARED | NFS_MOUNT_NORESVPORT))
+
 static int
 nfs_compare_remount_data(struct nfs_server *nfss,
 			 struct nfs_parsed_mount_data *data)
 {
-	if ((data->flags ^ nfss->flags) & NFS_MOUNT_CMP_FLAGMASK ||
+	if ((data->flags ^ nfss->flags) & NFS_REMOUNT_CMP_FLAGMASK ||
 	    data->rsize != nfss->rsize ||
 	    data->wsize != nfss->wsize ||
 	    data->version != nfss->nfs_client->rpc_ops->version ||
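The split mask means a remount compares more flags than superblock matching does: sharing may ignore nosharecache/noresvport, while a remount may not silently change them. A toy demonstration with invented flag bits:

    #include <stdio.h>

    #define F_INTR       0x1u  /* ignored by both comparisons */
    #define F_UNSHARED   0x2u  /* ignored only when matching superblocks */
    #define F_NORESVPORT 0x4u

    #define REMOUNT_IGNORED (F_INTR)
    #define MOUNT_IGNORED   (REMOUNT_IGNORED | F_UNSHARED | F_NORESVPORT)

    static int differ(unsigned int a, unsigned int b, unsigned int ignored)
    {
            return ((a ^ b) & ~ignored) != 0;
    }

    int main(void)
    {
            unsigned int current_flags = F_UNSHARED, wanted = 0;

            /* changing only the sharing mode: a remount notices it... */
            printf("remount mismatch: %d\n",
                   differ(current_flags, wanted, REMOUNT_IGNORED));  /* 1 */
            /* ...but superblock matching does not */
            printf("sb-match mismatch: %d\n",
                   differ(current_flags, wanted, MOUNT_IGNORED));    /* 0 */
            return 0;
    }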
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 5e2f10304548..12493846a2d3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -47,6 +47,11 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops;
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops;
 static const struct nfs_rw_ops nfs_rw_write_ops;
 static void nfs_clear_request_commit(struct nfs_page *req);
+static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
+				      struct inode *inode);
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page);
 
 static struct kmem_cache *nfs_wdata_cachep;
 static mempool_t *nfs_wdata_mempool;
@@ -71,18 +76,18 @@ void nfs_commit_free(struct nfs_commit_data *p)
 }
 EXPORT_SYMBOL_GPL(nfs_commit_free);
 
-static struct nfs_rw_header *nfs_writehdr_alloc(void)
+static struct nfs_pgio_header *nfs_writehdr_alloc(void)
 {
-	struct nfs_rw_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
+	struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO);
 
 	if (p)
 		memset(p, 0, sizeof(*p));
 	return p;
 }
 
-static void nfs_writehdr_free(struct nfs_rw_header *whdr)
+static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 {
-	mempool_free(whdr, nfs_wdata_mempool);
+	mempool_free(hdr, nfs_wdata_mempool);
 }
 
 static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error)
@@ -106,21 +111,12 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page)
 
 	if (PagePrivate(page))
 		req = (struct nfs_page *)page_private(page);
-	else if (unlikely(PageSwapCache(page))) {
-		struct nfs_page *freq, *t;
-
-		/* Linearly search the commit list for the correct req */
-		list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) {
-			if (freq->wb_page == page) {
-				req = freq->wb_head;
-				break;
-			}
-		}
-	}
+	else if (unlikely(PageSwapCache(page)))
+		req = nfs_page_search_commits_for_head_request_locked(nfsi,
+			page);
 
 	if (req) {
 		WARN_ON_ONCE(req->wb_head != req);
-
 		kref_get(&req->wb_kref);
 	}
 
@@ -216,7 +212,7 @@ static bool nfs_page_group_covers_page(struct nfs_page *req)
 	unsigned int pos = 0;
 	unsigned int len = nfs_page_length(req->wb_page);
 
-	nfs_page_group_lock(req);
+	nfs_page_group_lock(req, false);
 
 	do {
 		tmp = nfs_page_group_search_locked(req->wb_head, pos);
@@ -246,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req)
 
 static int wb_priority(struct writeback_control *wbc)
 {
+	int ret = 0;
 	if (wbc->for_reclaim)
 		return FLUSH_HIGHPRI | FLUSH_STABLE;
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		ret = FLUSH_COND_STABLE;
 	if (wbc->for_kupdate || wbc->for_background)
-		return FLUSH_LOWPRI | FLUSH_COND_STABLE;
-	return FLUSH_COND_STABLE;
+		ret |= FLUSH_LOWPRI;
+	return ret;
 }
 
 /*
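wb_priority() now composes its answer: WB_SYNC_ALL contributes FLUSH_COND_STABLE and kupdate/background writeback ORs in FLUSH_LOWPRI, where the old code returned on the first match. The combination logic with invented flag values, runnable:

    #include <stdio.h>

    #define FLUSH_HIGHPRI     0x1
    #define FLUSH_STABLE      0x2
    #define FLUSH_COND_STABLE 0x4
    #define FLUSH_LOWPRI      0x8

    static int wb_priority_demo(int for_reclaim, int sync_all, int background)
    {
            int ret = 0;

            if (for_reclaim)
                    return FLUSH_HIGHPRI | FLUSH_STABLE; /* reclaim still wins */
            if (sync_all)
                    ret = FLUSH_COND_STABLE;
            if (background)
                    ret |= FLUSH_LOWPRI;                 /* flags now combine */
            return ret;
    }

    int main(void)
    {
            /* COND_STABLE | LOWPRI: impossible with the old early returns */
            printf("0x%x\n", wb_priority_demo(0, 1, 1));
            return 0;
    }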
@@ -379,8 +378,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
 		subreq->wb_head = subreq;
 		subreq->wb_this_page = subreq;
 
-		nfs_clear_request_commit(subreq);
-
 		/* subreq is now totally disconnected from page group or any
 		 * write / commit lists. last chance to wake any waiters */
 		nfs_unlock_request(subreq);
@@ -455,8 +452,23 @@ try_again:
 		return NULL;
 	}
 
+	/* holding inode lock, so always make a non-blocking call to try the
+	 * page group lock */
+	ret = nfs_page_group_lock(head, true);
+	if (ret < 0) {
+		spin_unlock(&inode->i_lock);
+
+		if (!nonblock && ret == -EAGAIN) {
+			nfs_page_group_lock_wait(head);
+			nfs_release_request(head);
+			goto try_again;
+		}
+
+		nfs_release_request(head);
+		return ERR_PTR(ret);
+	}
+
 	/* lock each request in the page group */
-	nfs_page_group_lock(head);
 	subreq = head;
 	do {
 		/*
@@ -488,7 +500,7 @@ try_again:
 	 * Commit list removal accounting is done after locks are dropped */
 	subreq = head;
 	do {
-		nfs_list_remove_request(subreq);
+		nfs_clear_request_commit(subreq);
 		subreq = subreq->wb_this_page;
 	} while (subreq != head);
 
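While the inode lock is held, the page-group lock may only be tried; on contention everything is dropped, the thread waits for the group lock to clear, and the whole sequence restarts. A runnable mirror of that ordering discipline using trylock (pthread mutexes standing in for the kernel locks):

    #include <pthread.h>

    static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

    static void lock_both(void)
    {
            for (;;) {
                    pthread_mutex_lock(&inode_lock);
                    if (pthread_mutex_trylock(&group_lock) == 0)
                            return;          /* both held, in the right order */
                    pthread_mutex_unlock(&inode_lock);
                    /* blocking wait is safe now that inode_lock is dropped;
                     * this mirrors nfs_page_group_lock_wait() */
                    pthread_mutex_lock(&group_lock);
                    pthread_mutex_unlock(&group_lock);
                    /* ...and retry from the top, like "goto try_again" */
            }
    }

    int main(void)
    {
            lock_both();
            pthread_mutex_unlock(&group_lock);
            pthread_mutex_unlock(&inode_lock);
            return 0;
    }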
@@ -518,15 +530,11 @@ try_again:
 
 	nfs_page_group_unlock(head);
 
-	/* drop lock to clear_request_commit the head req and clean up
-	 * requests on destroy list */
+	/* drop lock to clean up requests on the destroy list */
 	spin_unlock(&inode->i_lock);
 
 	nfs_destroy_unlinked_subrequests(destroy_list, head);
 
-	/* clean up commit list state */
-	nfs_clear_request_commit(head);
-
 	/* still holds ref on head from nfs_page_find_head_request_locked
 	 * and still has lock on head from lock loop */
 	return head;
@@ -623,7 +631,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	int err;
 
 	/* Stop dirtying of new pages while we sync */
-	err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
+	err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
 			nfs_wait_bit_killable, TASK_KILLABLE);
 	if (err)
 		goto out_err;
@@ -697,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	if (likely(!PageSwapCache(head->wb_page))) {
 		set_page_private(head->wb_page, 0);
 		ClearPagePrivate(head->wb_page);
+		smp_mb__after_atomic();
+		wake_up_page(head->wb_page, PG_private);
 		clear_bit(PG_MAPPED, &head->wb_flags);
 	}
 	nfsi->npages--;
@@ -705,6 +715,8 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 
 	if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
 		nfs_release_request(req);
+	else
+		WARN_ON_ONCE(1);
 }
 
 static void
@@ -713,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req)
 	__set_page_dirty_nobuffers(req->wb_page);
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
+/*
+ * nfs_page_search_commits_for_head_request_locked
+ *
+ * Search through commit lists on @inode for the head request for @page.
+ * Must be called while holding the inode (which is cinfo) lock.
+ *
+ * Returns the head request if found, or NULL if not found.
+ */
+static struct nfs_page *
+nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi,
+						struct page *page)
+{
+	struct nfs_page *freq, *t;
+	struct nfs_commit_info cinfo;
+	struct inode *inode = &nfsi->vfs_inode;
+
+	nfs_init_cinfo_from_inode(&cinfo, inode);
+
+	/* search through pnfs commit lists */
+	freq = pnfs_search_commit_reqs(inode, &cinfo, page);
+	if (freq)
+		return freq->wb_head;
+
+	/* Linearly search the commit list for the correct request */
+	list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) {
+		if (freq->wb_page == page)
+			return freq->wb_head;
+	}
+
+	return NULL;
+}
+
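The search helper tries the layout driver's commit lists first and only then falls back to a linear scan of the MDS commit list, mirroring where a request may be parked. Skeleton of that two-stage lookup (toy types, not kernel API):

    #include <stddef.h>
    #include <stdio.h>

    struct req { const void *page; struct req *next; };

    /* stands in for the optional pnfs search; here it never matches */
    static struct req *driver_search(const void *page) { return NULL; }

    static struct req *search_commits(struct req *mds_list, const void *page)
    {
            struct req *r = driver_search(page);   /* pnfs lists first */
            if (r)
                    return r;
            for (r = mds_list; r; r = r->next)     /* then the MDS list */
                    if (r->page == page)
                            return r;
            return NULL;
    }

    int main(void)
    {
            int page;
            struct req a = { &page, NULL };

            printf("%s\n", search_commits(&a, &page) ? "found" : "missing");
            return 0;
    }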
 /**
  * nfs_request_add_commit_list - add request to a commit list
  * @req: pointer to a struct nfs_page
@@ -808,6 +851,7 @@ nfs_clear_page_commit(struct page *page)
 	dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
 }
 
+/* Called holding inode (/cinfo) lock */
 static void
 nfs_clear_request_commit(struct nfs_page *req)
 {
@@ -817,53 +861,19 @@ nfs_clear_request_commit(struct nfs_page *req)
 
 		nfs_init_cinfo_from_inode(&cinfo, inode);
 		if (!pnfs_clear_request_commit(req, &cinfo)) {
-			spin_lock(cinfo.lock);
 			nfs_request_remove_commit_list(req, &cinfo);
-			spin_unlock(cinfo.lock);
 		}
 		nfs_clear_page_commit(req->wb_page);
 	}
 }
 
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
+int nfs_write_need_commit(struct nfs_pgio_header *hdr)
 {
-	if (data->verf.committed == NFS_DATA_SYNC)
-		return data->header->lseg == NULL;
-	return data->verf.committed != NFS_FILE_SYNC;
+	if (hdr->verf.committed == NFS_DATA_SYNC)
+		return hdr->lseg == NULL;
+	return hdr->verf.committed != NFS_FILE_SYNC;
 }
 
-#else
-static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo,
-				      struct inode *inode)
-{
-}
-
-void nfs_init_cinfo(struct nfs_commit_info *cinfo,
-		    struct inode *inode,
-		    struct nfs_direct_req *dreq)
-{
-}
-
-void
-nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
-			struct nfs_commit_info *cinfo)
-{
-}
-
-static void
-nfs_clear_request_commit(struct nfs_page *req)
-{
-}
-
-static inline
-int nfs_write_need_commit(struct nfs_pgio_data *data)
-{
-	return 0;
-}
-
-#endif
-
 static void nfs_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_commit_info cinfo;
@@ -883,11 +893,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
 			nfs_context_set_write_error(req->wb_context, hdr->error);
 			goto remove_req;
 		}
-		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) {
-			nfs_mark_request_dirty(req);
-			goto next;
-		}
-		if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) {
+		if (nfs_write_need_commit(hdr)) {
 			memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
 			goto next;
@@ -903,7 +909,6 @@ out:
 	hdr->release(hdr);
 }
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
 unsigned long
 nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
 {
@@ -960,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst,
 	return ret;
 }
 
-#else
-unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo)
-{
-	return 0;
-}
-
-int nfs_scan_commit(struct inode *inode, struct list_head *dst,
-		    struct nfs_commit_info *cinfo)
-{
-	return 0;
-}
-#endif
-
 /*
  * Search for an existing write request, and attempt to update
  * it to reflect a new dirty region on a given page.
@@ -1038,9 +1030,9 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 	else
 		req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
-	spin_unlock(&inode->i_lock);
 	if (req)
 		nfs_clear_request_commit(req);
+	spin_unlock(&inode->i_lock);
 	return req;
 out_flushme:
 	spin_unlock(&inode->i_lock);
@@ -1241,17 +1233,18 @@ static int flush_task_priority(int how)
 	return RPC_PRIORITY_NORMAL;
 }
 
-static void nfs_initiate_write(struct nfs_pgio_data *data, struct rpc_message *msg,
+static void nfs_initiate_write(struct nfs_pgio_header *hdr,
			       struct rpc_message *msg,
 			       struct rpc_task_setup *task_setup_data, int how)
 {
-	struct inode *inode = data->header->inode;
+	struct inode *inode = hdr->inode;
 	int priority = flush_task_priority(how);
 
 	task_setup_data->priority = priority;
-	NFS_PROTO(inode)->write_setup(data, msg);
+	NFS_PROTO(inode)->write_setup(hdr, msg);
 
 	nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
-				 &task_setup_data->rpc_client, msg, data);
+				 &task_setup_data->rpc_client, msg, hdr);
 }
 
 /* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -1313,21 +1306,9 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata)
 	NFS_PROTO(data->inode)->commit_rpc_prepare(task, data);
 }
 
-static void nfs_writeback_release_common(struct nfs_pgio_data *data)
+static void nfs_writeback_release_common(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_header *hdr = data->header;
-	int status = data->task.tk_status;
-
-	if ((status >= 0) && nfs_write_need_commit(data)) {
-		spin_lock(&hdr->lock);
-		if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags))
-			; /* Do nothing */
-		else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags))
-			memcpy(&hdr->verf, &data->verf, sizeof(hdr->verf));
-		else if (memcmp(&hdr->verf, &data->verf, sizeof(hdr->verf)))
-			set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags);
-		spin_unlock(&hdr->lock);
-	}
+	/* do nothing! */
 }
 
 /*
@@ -1358,7 +1339,8 @@ static int nfs_should_remove_suid(const struct inode *inode)
 /*
  * This function is called when the WRITE call is complete.
  */
-static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
+static int nfs_writeback_done(struct rpc_task *task,
			      struct nfs_pgio_header *hdr,
 			      struct inode *inode)
 {
 	int status;
@@ -1370,13 +1352,13 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 	 * another writer had changed the file, but some applications
 	 * depend on tighter cache coherency when writing.
 	 */
-	status = NFS_PROTO(inode)->write_done(task, data);
+	status = NFS_PROTO(inode)->write_done(task, hdr);
 	if (status != 0)
 		return status;
-	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, data->res.count);
+	nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count);
 
-#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
-	if (data->res.verf->committed < data->args.stable && task->tk_status >= 0) {
+	if (hdr->res.verf->committed < hdr->args.stable &&
	    task->tk_status >= 0) {
 		/* We tried a write call, but the server did not
 		 * commit data to stable storage even though we
 		 * requested it.
@@ -1392,11 +1374,10 @@ static int nfs_writeback_done(struct rpc_task *task, struct nfs_pgio_data *data,
 			dprintk("NFS: faulty NFS server %s:"
 				" (committed = %d) != (stable = %d)\n",
 				NFS_SERVER(inode)->nfs_client->cl_hostname,
-				data->res.verf->committed, data->args.stable);
+				hdr->res.verf->committed, hdr->args.stable);
 			complain = jiffies + 300 * HZ;
 		}
 	}
-#endif
 
 	/* Deal with the suid/sgid bit corner case */
 	if (nfs_should_remove_suid(inode))
@@ -1407,16 +1388,17 @@
 /*
  * This function is called when the WRITE call is complete.
1409 */ 1390 */
1410static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *data) 1391static void nfs_writeback_result(struct rpc_task *task,
1392 struct nfs_pgio_header *hdr)
1411{ 1393{
1412 struct nfs_pgio_args *argp = &data->args; 1394 struct nfs_pgio_args *argp = &hdr->args;
1413 struct nfs_pgio_res *resp = &data->res; 1395 struct nfs_pgio_res *resp = &hdr->res;
1414 1396
1415 if (resp->count < argp->count) { 1397 if (resp->count < argp->count) {
1416 static unsigned long complain; 1398 static unsigned long complain;
1417 1399
1418 /* This is a short write! */ 1400 /* This is a short write! */
1419 nfs_inc_stats(data->header->inode, NFSIOS_SHORTWRITE); 1401 nfs_inc_stats(hdr->inode, NFSIOS_SHORTWRITE);
1420 1402
1421 /* Has the server at least made some progress? */ 1403 /* Has the server at least made some progress? */
1422 if (resp->count == 0) { 1404 if (resp->count == 0) {
@@ -1426,14 +1408,14 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
1426 argp->count); 1408 argp->count);
1427 complain = jiffies + 300 * HZ; 1409 complain = jiffies + 300 * HZ;
1428 } 1410 }
1429 nfs_set_pgio_error(data->header, -EIO, argp->offset); 1411 nfs_set_pgio_error(hdr, -EIO, argp->offset);
1430 task->tk_status = -EIO; 1412 task->tk_status = -EIO;
1431 return; 1413 return;
1432 } 1414 }
1433 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1415 /* Was this an NFSv2 write or an NFSv3 stable write? */
1434 if (resp->verf->committed != NFS_UNSTABLE) { 1416 if (resp->verf->committed != NFS_UNSTABLE) {
1435 /* Resend from where the server left off */ 1417 /* Resend from where the server left off */
1436 data->mds_offset += resp->count; 1418 hdr->mds_offset += resp->count;
1437 argp->offset += resp->count; 1419 argp->offset += resp->count;
1438 argp->pgbase += resp->count; 1420 argp->pgbase += resp->count;
1439 argp->count -= resp->count; 1421 argp->count -= resp->count;
@@ -1448,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task, struct nfs_pgio_data *da
1448} 1430}
1449 1431
1450 1432
1451#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4)
1452static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1433static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1453{ 1434{
1454 int ret; 1435 int ret;
@@ -1517,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1517} 1498}
1518EXPORT_SYMBOL_GPL(nfs_initiate_commit); 1499EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1519 1500
1501static loff_t nfs_get_lwb(struct list_head *head)
1502{
1503 loff_t lwb = 0;
1504 struct nfs_page *req;
1505
1506 list_for_each_entry(req, head, wb_list)
1507 if (lwb < (req_offset(req) + req->wb_bytes))
1508 lwb = req_offset(req) + req->wb_bytes;
1509
1510 return lwb;
1511}
1512
1520/* 1513/*
1521 * Set up the argument/result storage required for the RPC call. 1514 * Set up the argument/result storage required for the RPC call.
1522 */ 1515 */
@@ -1536,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data,
1536 data->inode = inode; 1529 data->inode = inode;
1537 data->cred = first->wb_context->cred; 1530 data->cred = first->wb_context->cred;
1538 data->lseg = lseg; /* reference transferred */ 1531 data->lseg = lseg; /* reference transferred */
1532 /* only set lwb for pnfs commit */
1533 if (lseg)
1534 data->lwb = nfs_get_lwb(&data->pages);
1539 data->mds_ops = &nfs_commit_ops; 1535 data->mds_ops = &nfs_commit_ops;
1540 data->completion_ops = cinfo->completion_ops; 1536 data->completion_ops = cinfo->completion_ops;
1541 data->dreq = cinfo->dreq; 1537 data->dreq = cinfo->dreq;
@@ -1615,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1615 struct nfs_page *req; 1611 struct nfs_page *req;
1616 int status = data->task.tk_status; 1612 int status = data->task.tk_status;
1617 struct nfs_commit_info cinfo; 1613 struct nfs_commit_info cinfo;
1614 struct nfs_server *nfss;
1618 1615
1619 while (!list_empty(&data->pages)) { 1616 while (!list_empty(&data->pages)) {
1620 req = nfs_list_entry(data->pages.next); 1617 req = nfs_list_entry(data->pages.next);
@@ -1648,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
1648 next: 1645 next:
1649 nfs_unlock_and_release_request(req); 1646 nfs_unlock_and_release_request(req);
1650 } 1647 }
1648 nfss = NFS_SERVER(data->inode);
1649 if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH)
1650 clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
1651
1651 nfs_init_cinfo(&cinfo, data->inode, data->dreq); 1652 nfs_init_cinfo(&cinfo, data->inode, data->dreq);
1652 if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) 1653 if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
1653 nfs_commit_clear_lock(NFS_I(data->inode)); 1654 nfs_commit_clear_lock(NFS_I(data->inode));
@@ -1703,7 +1704,7 @@ int nfs_commit_inode(struct inode *inode, int how)
1703 return error; 1704 return error;
1704 if (!may_wait) 1705 if (!may_wait)
1705 goto out_mark_dirty; 1706 goto out_mark_dirty;
1706 error = wait_on_bit(&NFS_I(inode)->flags, 1707 error = wait_on_bit_action(&NFS_I(inode)->flags,
1707 NFS_INO_COMMIT, 1708 NFS_INO_COMMIT,
1708 nfs_wait_bit_killable, 1709 nfs_wait_bit_killable,
1709 TASK_KILLABLE); 1710 TASK_KILLABLE);
@@ -1757,12 +1758,6 @@ out_mark_dirty:
1757 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1758 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1758 return ret; 1759 return ret;
1759} 1760}
1760#else
1761static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
1762{
1763 return 0;
1764}
1765#endif
1766 1761
1767int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1762int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1768{ 1763{
@@ -1884,7 +1879,7 @@ int nfs_migrate_page(struct address_space *mapping, struct page *newpage,
1884int __init nfs_init_writepagecache(void) 1879int __init nfs_init_writepagecache(void)
1885{ 1880{
1886 nfs_wdata_cachep = kmem_cache_create("nfs_write_data", 1881 nfs_wdata_cachep = kmem_cache_create("nfs_write_data",
1887 sizeof(struct nfs_rw_header), 1882 sizeof(struct nfs_pgio_header),
1888 0, SLAB_HWCACHE_ALIGN, 1883 0, SLAB_HWCACHE_ALIGN,
1889 NULL); 1884 NULL);
1890 if (nfs_wdata_cachep == NULL) 1885 if (nfs_wdata_cachep == NULL)
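
The write.c changes above have two effects worth noting: struct nfs_pgio_data is folded into struct nfs_pgio_header (hence the write cache is now sized by nfs_pgio_header), and nfs_init_commit() gains a pNFS-only "last write byte" (lwb) computed by the new nfs_get_lwb() — simply the maximum end offset over every request on the commit list. A minimal plain-C sketch of that scan, with a simplified request type standing in for struct nfs_page:

#include <stddef.h>

/* stand-in for struct nfs_page: offset and length of one write request */
struct req {
	long long offset;
	unsigned int bytes;
};

/* return the furthest byte covered by any request; 0 for an empty list */
static long long get_lwb(const struct req *reqs, size_t n)
{
	long long lwb = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		long long end = reqs[i].offset + reqs[i].bytes;
		if (end > lwb)
			lwb = end;
	}
	return lwb;
}
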
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
index f689ed82af3a..d153ca3ea577 100644
--- a/fs/nfs_common/Makefile
+++ b/fs/nfs_common/Makefile
@@ -3,5 +3,6 @@
3# 3#
4 4
5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o 5obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
6
7nfs_acl-objs := nfsacl.o 6nfs_acl-objs := nfsacl.o
7
8obj-$(CONFIG_GRACE_PERIOD) += grace.o
diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c
index 6d1ee7204c88..ae6e58ea4de5 100644
--- a/fs/lockd/grace.c
+++ b/fs/nfs_common/grace.c
@@ -1,17 +1,20 @@
1/* 1/*
2 * Common code for control of lockd and nfsv4 grace periods. 2 * Common code for control of lockd and nfsv4 grace periods.
3 *
4 * Transplanted from lockd code
3 */ 5 */
4 6
5#include <linux/module.h> 7#include <linux/module.h>
6#include <linux/lockd/bind.h>
7#include <net/net_namespace.h> 8#include <net/net_namespace.h>
9#include <net/netns/generic.h>
10#include <linux/fs.h>
8 11
9#include "netns.h" 12static int grace_net_id;
10
11static DEFINE_SPINLOCK(grace_lock); 13static DEFINE_SPINLOCK(grace_lock);
12 14
13/** 15/**
14 * locks_start_grace 16 * locks_start_grace
17 * @net: net namespace that this lock manager belongs to
15 * @lm: who this grace period is for 18 * @lm: who this grace period is for
16 * 19 *
17 * A grace period is a period during which locks should not be given 20 * A grace period is a period during which locks should not be given
@@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock);
21 * 24 *
22 * This function is called to start a grace period. 25 * This function is called to start a grace period.
23 */ 26 */
24void locks_start_grace(struct net *net, struct lock_manager *lm) 27void
28locks_start_grace(struct net *net, struct lock_manager *lm)
25{ 29{
26 struct lockd_net *ln = net_generic(net, lockd_net_id); 30 struct list_head *grace_list = net_generic(net, grace_net_id);
27 31
28 spin_lock(&grace_lock); 32 spin_lock(&grace_lock);
29 list_add(&lm->list, &ln->grace_list); 33 list_add(&lm->list, grace_list);
30 spin_unlock(&grace_lock); 34 spin_unlock(&grace_lock);
31} 35}
32EXPORT_SYMBOL_GPL(locks_start_grace); 36EXPORT_SYMBOL_GPL(locks_start_grace);
33 37
34/** 38/**
35 * locks_end_grace 39 * locks_end_grace
40 * @net: net namespace that this lock manager belongs to
36 * @lm: who this grace period is for 41 * @lm: who this grace period is for
37 * 42 *
38 * Call this function to state that the given lock manager is ready to 43 * Call this function to state that the given lock manager is ready to
@@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace);
41 * Note that callers count on it being safe to call this more than once, 46 * Note that callers count on it being safe to call this more than once,
42 * and the second call should be a no-op. 47 * and the second call should be a no-op.
43 */ 48 */
44void locks_end_grace(struct lock_manager *lm) 49void
50locks_end_grace(struct lock_manager *lm)
45{ 51{
46 spin_lock(&grace_lock); 52 spin_lock(&grace_lock);
47 list_del_init(&lm->list); 53 list_del_init(&lm->list);
@@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace);
56 * to answer ordinary lock requests, and when they should accept only 62 * to answer ordinary lock requests, and when they should accept only
57 * lock reclaims. 63 * lock reclaims.
58 */ 64 */
59int locks_in_grace(struct net *net) 65int
66locks_in_grace(struct net *net)
60{ 67{
61 struct lockd_net *ln = net_generic(net, lockd_net_id); 68 struct list_head *grace_list = net_generic(net, grace_net_id);
62 69
63 return !list_empty(&ln->grace_list); 70 return !list_empty(grace_list);
64} 71}
65EXPORT_SYMBOL_GPL(locks_in_grace); 72EXPORT_SYMBOL_GPL(locks_in_grace);
73
74static int __net_init
75grace_init_net(struct net *net)
76{
77 struct list_head *grace_list = net_generic(net, grace_net_id);
78
79 INIT_LIST_HEAD(grace_list);
80 return 0;
81}
82
83static void __net_exit
84grace_exit_net(struct net *net)
85{
86 struct list_head *grace_list = net_generic(net, grace_net_id);
87
88 BUG_ON(!list_empty(grace_list));
89}
90
91static struct pernet_operations grace_net_ops = {
92 .init = grace_init_net,
93 .exit = grace_exit_net,
94 .id = &grace_net_id,
95 .size = sizeof(struct list_head),
96};
97
98static int __init
99init_grace(void)
100{
101 return register_pernet_subsys(&grace_net_ops);
102}
103
104static void __exit
105exit_grace(void)
106{
107 unregister_pernet_subsys(&grace_net_ops);
108}
109
110MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>");
111MODULE_LICENSE("GPL");
112module_init(init_grace)
113module_exit(exit_grace)
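
With the move from fs/lockd to fs/nfs_common, the grace-period list no longer lives in lockd's private lockd_net: register_pernet_subsys() with .size = sizeof(struct list_head) makes the core allocate and track one list head per network namespace, retrieved via net_generic(). A hedged usage sketch — the demo_lm lock manager is hypothetical; only locks_start_grace() and locks_end_grace() come from the code above:

#include <linux/fs.h>
#include <net/net_namespace.h>

/* hypothetical lock manager; struct lock_manager carries the list linkage */
static struct lock_manager demo_lm;

static void demo_enter_grace(struct net *net)
{
	/* adds demo_lm to this namespace's grace list */
	locks_start_grace(net, &demo_lm);
}

static void demo_leave_grace(void)
{
	/* idempotent by design: a second call is a no-op */
	locks_end_grace(&demo_lm);
}
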
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index ed628f71274c..538f142935ea 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -30,9 +30,6 @@
30 30
31MODULE_LICENSE("GPL"); 31MODULE_LICENSE("GPL");
32 32
33EXPORT_SYMBOL_GPL(nfsacl_encode);
34EXPORT_SYMBOL_GPL(nfsacl_decode);
35
36struct nfsacl_encode_desc { 33struct nfsacl_encode_desc {
37 struct xdr_array2_desc desc; 34 struct xdr_array2_desc desc;
38 unsigned int count; 35 unsigned int count;
@@ -136,6 +133,7 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
136 nfsacl_desc.desc.array_len; 133 nfsacl_desc.desc.array_len;
137 return err; 134 return err;
138} 135}
136EXPORT_SYMBOL_GPL(nfsacl_encode);
139 137
140struct nfsacl_decode_desc { 138struct nfsacl_decode_desc {
141 struct xdr_array2_desc desc; 139 struct xdr_array2_desc desc;
@@ -295,3 +293,4 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
295 return 8 + nfsacl_desc.desc.elem_size * 293 return 8 + nfsacl_desc.desc.elem_size *
296 nfsacl_desc.desc.array_len; 294 nfsacl_desc.desc.array_len;
297} 295}
296EXPORT_SYMBOL_GPL(nfsacl_decode);
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index f994e750e0d1..73395156bdb4 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -71,6 +71,7 @@ config NFSD_V4
71 select FS_POSIX_ACL 71 select FS_POSIX_ACL
72 select SUNRPC_GSS 72 select SUNRPC_GSS
73 select CRYPTO 73 select CRYPTO
74 select GRACE_PERIOD
74 help 75 help
75 This option enables support in your system's NFS server for 76 This option enables support in your system's NFS server for
76 version 4 of the NFS protocol (RFC 3530). 77 version 4 of the NFS protocol (RFC 3530).
@@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL
94 If you do not wish to enable fine-grained security labels via SELinux or 95 If you do not wish to enable fine-grained security labels via SELinux or
95 Smack policies on NFSv4 files, say N. 96 Smack policies on NFSv4 files, say N.
96 97
97 WARNING: there is still a chance of backwards-incompatible protocol changes.
98 For now we recommend "Y" only for developers and testers.
99
100config NFSD_FAULT_INJECTION 98config NFSD_FAULT_INJECTION
101 bool "NFS server manual fault injection" 99 bool "NFS server manual fault injection"
102 depends on NFSD_V4 && DEBUG_KERNEL 100 depends on NFSD_V4 && DEBUG_KERNEL
diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
index a986ceb6fd0d..4cd7c69a6cb9 100644
--- a/fs/nfsd/acl.h
+++ b/fs/nfsd/acl.h
@@ -47,7 +47,7 @@ struct svc_rqst;
47#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ 47#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \
48 / sizeof(struct nfs4_ace)) 48 / sizeof(struct nfs4_ace))
49 49
50struct nfs4_acl *nfs4_acl_new(int); 50int nfs4_acl_bytes(int entries);
51int nfs4_acl_get_whotype(char *, u32); 51int nfs4_acl_get_whotype(char *, u32);
52__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); 52__be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
53 53
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c
index 72f44823adbb..9d46a0bdd9f9 100644
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -28,7 +28,7 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp)
28 validate_process_creds(); 28 validate_process_creds();
29 29
30 /* discard any old override before preparing the new set */ 30 /* discard any old override before preparing the new set */
31 revert_creds(get_cred(current->real_cred)); 31 revert_creds(get_cred(current_real_cred()));
32 new = prepare_creds(); 32 new = prepare_creds();
33 if (!new) 33 if (!new)
34 return -ENOMEM; 34 return -ENOMEM;
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index b582f9ab6b2a..dd96a3830004 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -18,7 +18,6 @@
18 * is much larger than a sockaddr_in6. 18 * is much larger than a sockaddr_in6.
19 */ 19 */
20struct svc_cacherep { 20struct svc_cacherep {
21 struct hlist_node c_hash;
22 struct list_head c_lru; 21 struct list_head c_lru;
23 22
24 unsigned char c_state, /* unused, inprog, done */ 23 unsigned char c_state, /* unused, inprog, done */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 13b85f94d9e2..30a739d896ff 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -698,8 +698,8 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
698 698
699 kref_get(&item->ex_client->ref); 699 kref_get(&item->ex_client->ref);
700 new->ex_client = item->ex_client; 700 new->ex_client = item->ex_client;
701 new->ex_path.dentry = dget(item->ex_path.dentry); 701 new->ex_path = item->ex_path;
702 new->ex_path.mnt = mntget(item->ex_path.mnt); 702 path_get(&item->ex_path);
703 new->ex_fslocs.locations = NULL; 703 new->ex_fslocs.locations = NULL;
704 new->ex_fslocs.locations_count = 0; 704 new->ex_fslocs.locations_count = 0;
705 new->ex_fslocs.migrated = 0; 705 new->ex_fslocs.migrated = 0;
@@ -1145,6 +1145,7 @@ static struct flags {
1145 { NFSEXP_ALLSQUASH, {"all_squash", ""}}, 1145 { NFSEXP_ALLSQUASH, {"all_squash", ""}},
1146 { NFSEXP_ASYNC, {"async", "sync"}}, 1146 { NFSEXP_ASYNC, {"async", "sync"}},
1147 { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, 1147 { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}},
1148 { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}},
1148 { NFSEXP_NOHIDE, {"nohide", ""}}, 1149 { NFSEXP_NOHIDE, {"nohide", ""}},
1149 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, 1150 { NFSEXP_CROSSMOUNT, {"crossmnt", ""}},
1150 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, 1151 { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}},
@@ -1253,7 +1254,7 @@ static int e_show(struct seq_file *m, void *p)
1253 return 0; 1254 return 0;
1254 } 1255 }
1255 1256
1256 cache_get(&exp->h); 1257 exp_get(exp);
1257 if (cache_check(cd, &exp->h, NULL)) 1258 if (cache_check(cd, &exp->h, NULL))
1258 return 0; 1259 return 0;
1259 exp_put(exp); 1260 exp_put(exp);
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index cfeea85c5bed..04dc8c167b0c 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -101,9 +101,10 @@ static inline void exp_put(struct svc_export *exp)
101 cache_put(&exp->h, exp->cd); 101 cache_put(&exp->h, exp->cd);
102} 102}
103 103
104static inline void exp_get(struct svc_export *exp) 104static inline struct svc_export *exp_get(struct svc_export *exp)
105{ 105{
106 cache_get(&exp->h); 106 cache_get(&exp->h);
107 return exp;
107} 108}
108struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *); 109struct svc_export * rqst_exp_find(struct svc_rqst *, int, u32 *);
109 110
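
Having exp_get() return its argument lets callers take a reference inline — as a bare statement (the fh_dup2() and e_show() hunks above) or folded into an assignment. Illustrative only; the fhp target is an assumption:

/* before: two statements */
exp_get(exp);
fhp->fh_export = exp;

/* after: reference taken in the assignment itself */
fhp->fh_export = exp_get(exp);
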
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c
index 2ed05c3cd43d..c16bf5af6831 100644
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -17,81 +17,13 @@
17 17
18struct nfsd_fault_inject_op { 18struct nfsd_fault_inject_op {
19 char *file; 19 char *file;
20 u64 (*forget)(struct nfs4_client *, u64); 20 u64 (*get)(void);
21 u64 (*print)(struct nfs4_client *, u64); 21 u64 (*set_val)(u64);
22 u64 (*set_clnt)(struct sockaddr_storage *, size_t);
22}; 23};
23 24
24static struct nfsd_fault_inject_op inject_ops[] = {
25 {
26 .file = "forget_clients",
27 .forget = nfsd_forget_client,
28 .print = nfsd_print_client,
29 },
30 {
31 .file = "forget_locks",
32 .forget = nfsd_forget_client_locks,
33 .print = nfsd_print_client_locks,
34 },
35 {
36 .file = "forget_openowners",
37 .forget = nfsd_forget_client_openowners,
38 .print = nfsd_print_client_openowners,
39 },
40 {
41 .file = "forget_delegations",
42 .forget = nfsd_forget_client_delegations,
43 .print = nfsd_print_client_delegations,
44 },
45 {
46 .file = "recall_delegations",
47 .forget = nfsd_recall_client_delegations,
48 .print = nfsd_print_client_delegations,
49 },
50};
51
52static long int NUM_INJECT_OPS = sizeof(inject_ops) / sizeof(struct nfsd_fault_inject_op);
53static struct dentry *debug_dir; 25static struct dentry *debug_dir;
54 26
55static void nfsd_inject_set(struct nfsd_fault_inject_op *op, u64 val)
56{
57 u64 count = 0;
58
59 if (val == 0)
60 printk(KERN_INFO "NFSD Fault Injection: %s (all)", op->file);
61 else
62 printk(KERN_INFO "NFSD Fault Injection: %s (n = %llu)", op->file, val);
63
64 nfs4_lock_state();
65 count = nfsd_for_n_state(val, op->forget);
66 nfs4_unlock_state();
67 printk(KERN_INFO "NFSD: %s: found %llu", op->file, count);
68}
69
70static void nfsd_inject_set_client(struct nfsd_fault_inject_op *op,
71 struct sockaddr_storage *addr,
72 size_t addr_size)
73{
74 char buf[INET6_ADDRSTRLEN];
75 struct nfs4_client *clp;
76 u64 count;
77
78 nfs4_lock_state();
79 clp = nfsd_find_client(addr, addr_size);
80 if (clp) {
81 count = op->forget(clp, 0);
82 rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
83 printk(KERN_INFO "NFSD [%s]: Client %s had %llu state object(s)\n", op->file, buf, count);
84 }
85 nfs4_unlock_state();
86}
87
88static void nfsd_inject_get(struct nfsd_fault_inject_op *op, u64 *val)
89{
90 nfs4_lock_state();
91 *val = nfsd_for_n_state(0, op->print);
92 nfs4_unlock_state();
93}
94
95static ssize_t fault_inject_read(struct file *file, char __user *buf, 27static ssize_t fault_inject_read(struct file *file, char __user *buf,
96 size_t len, loff_t *ppos) 28 size_t len, loff_t *ppos)
97{ 29{
@@ -99,9 +31,10 @@ static ssize_t fault_inject_read(struct file *file, char __user *buf,
99 char read_buf[25]; 31 char read_buf[25];
100 size_t size; 32 size_t size;
101 loff_t pos = *ppos; 33 loff_t pos = *ppos;
34 struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
102 35
103 if (!pos) 36 if (!pos)
104 nfsd_inject_get(file_inode(file)->i_private, &val); 37 val = op->get();
105 size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val); 38 size = scnprintf(read_buf, sizeof(read_buf), "%llu\n", val);
106 39
107 return simple_read_from_buffer(buf, len, ppos, read_buf, size); 40 return simple_read_from_buffer(buf, len, ppos, read_buf, size);
@@ -114,18 +47,36 @@ static ssize_t fault_inject_write(struct file *file, const char __user *buf,
114 size_t size = min(sizeof(write_buf) - 1, len); 47 size_t size = min(sizeof(write_buf) - 1, len);
115 struct net *net = current->nsproxy->net_ns; 48 struct net *net = current->nsproxy->net_ns;
116 struct sockaddr_storage sa; 49 struct sockaddr_storage sa;
50 struct nfsd_fault_inject_op *op = file_inode(file)->i_private;
117 u64 val; 51 u64 val;
52 char *nl;
118 53
119 if (copy_from_user(write_buf, buf, size)) 54 if (copy_from_user(write_buf, buf, size))
120 return -EFAULT; 55 return -EFAULT;
121 write_buf[size] = '\0'; 56 write_buf[size] = '\0';
122 57
58 /* Deal with any embedded newlines in the string */
59 nl = strchr(write_buf, '\n');
60 if (nl) {
61 size = nl - write_buf;
62 *nl = '\0';
63 }
64
123 size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa)); 65 size = rpc_pton(net, write_buf, size, (struct sockaddr *)&sa, sizeof(sa));
124 if (size > 0) 66 if (size > 0) {
125 nfsd_inject_set_client(file_inode(file)->i_private, &sa, size); 67 val = op->set_clnt(&sa, size);
126 else { 68 if (val)
69 pr_info("NFSD [%s]: Client %s had %llu state object(s)\n",
70 op->file, write_buf, val);
71 } else {
127 val = simple_strtoll(write_buf, NULL, 0); 72 val = simple_strtoll(write_buf, NULL, 0);
128 nfsd_inject_set(file_inode(file)->i_private, val); 73 if (val == 0)
74 pr_info("NFSD Fault Injection: %s (all)", op->file);
75 else
76 pr_info("NFSD Fault Injection: %s (n = %llu)",
77 op->file, val);
78 val = op->set_val(val);
79 pr_info("NFSD: %s: found %llu", op->file, val);
129 } 80 }
130 return len; /* on success, claim we got the whole input */ 81 return len; /* on success, claim we got the whole input */
131} 82}
@@ -141,6 +92,41 @@ void nfsd_fault_inject_cleanup(void)
141 debugfs_remove_recursive(debug_dir); 92 debugfs_remove_recursive(debug_dir);
142} 93}
143 94
95static struct nfsd_fault_inject_op inject_ops[] = {
96 {
97 .file = "forget_clients",
98 .get = nfsd_inject_print_clients,
99 .set_val = nfsd_inject_forget_clients,
100 .set_clnt = nfsd_inject_forget_client,
101 },
102 {
103 .file = "forget_locks",
104 .get = nfsd_inject_print_locks,
105 .set_val = nfsd_inject_forget_locks,
106 .set_clnt = nfsd_inject_forget_client_locks,
107 },
108 {
109 .file = "forget_openowners",
110 .get = nfsd_inject_print_openowners,
111 .set_val = nfsd_inject_forget_openowners,
112 .set_clnt = nfsd_inject_forget_client_openowners,
113 },
114 {
115 .file = "forget_delegations",
116 .get = nfsd_inject_print_delegations,
117 .set_val = nfsd_inject_forget_delegations,
118 .set_clnt = nfsd_inject_forget_client_delegations,
119 },
120 {
121 .file = "recall_delegations",
122 .get = nfsd_inject_print_delegations,
123 .set_val = nfsd_inject_recall_delegations,
124 .set_clnt = nfsd_inject_recall_client_delegations,
125 },
126};
127
128#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op))
129
144int nfsd_fault_inject_init(void) 130int nfsd_fault_inject_init(void)
145{ 131{
146 unsigned int i; 132 unsigned int i;
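
The rework above replaces the old forget/print callback pair with three operations (get, set_val, set_clnt) and moves the locking and logging out of the shared read/write handlers into the nfsd_inject_* helpers. Each inject_ops[] entry becomes one debugfs file with the table entry stored in i_private, which is how fault_inject_read()/fault_inject_write() recover their op. A hedged fragment of the registration loop in nfsd_fault_inject_init() — the fops name is an assumption, not shown in this hunk:

#include <linux/debugfs.h>

static const struct file_operations fault_inject_fops = {
	.owner = THIS_MODULE,
	.read  = fault_inject_read,
	.write = fault_inject_write,
};

/* inside nfsd_fault_inject_init(): one debugfs file per table entry */
for (i = 0; i < NUM_INJECT_OPS; i++) {
	struct nfsd_fault_inject_op *op = &inject_ops[i];

	if (!debugfs_create_file(op->file, S_IRUSR | S_IWUSR,
				 debug_dir, op, &fault_inject_fops))
		return -ENOMEM;	/* caller tears down debug_dir */
}
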
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d32b3aa6600d..ea6749a32760 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -29,14 +29,19 @@
29#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS) 29#define CLIENT_HASH_SIZE (1 << CLIENT_HASH_BITS)
30#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1) 30#define CLIENT_HASH_MASK (CLIENT_HASH_SIZE - 1)
31 31
32#define LOCKOWNER_INO_HASH_BITS 8
33#define LOCKOWNER_INO_HASH_SIZE (1 << LOCKOWNER_INO_HASH_BITS)
34
35#define SESSION_HASH_SIZE 512 32#define SESSION_HASH_SIZE 512
36 33
37struct cld_net; 34struct cld_net;
38struct nfsd4_client_tracking_ops; 35struct nfsd4_client_tracking_ops;
39 36
37/*
38 * Represents an nfsd "container". With respect to nfsv4 state tracking, the
39 * fields of interest are the *_id_hashtbls and the *_name_tree. These track
40 * the nfs4_client objects by either short or long form clientid.
41 *
42 * Each nfsd_net runs a nfs4_laundromat workqueue job when necessary to clean
43 * up expired clients and delegations within the container.
44 */
40struct nfsd_net { 45struct nfsd_net {
41 struct cld_net *cld_net; 46 struct cld_net *cld_net;
42 47
@@ -66,8 +71,6 @@ struct nfsd_net {
66 struct rb_root conf_name_tree; 71 struct rb_root conf_name_tree;
67 struct list_head *unconf_id_hashtbl; 72 struct list_head *unconf_id_hashtbl;
68 struct rb_root unconf_name_tree; 73 struct rb_root unconf_name_tree;
69 struct list_head *ownerstr_hashtbl;
70 struct list_head *lockowner_ino_hashtbl;
71 struct list_head *sessionid_hashtbl; 74 struct list_head *sessionid_hashtbl;
72 /* 75 /*
73 * client_lru holds client queue ordered by nfs4_client.cl_time 76 * client_lru holds client queue ordered by nfs4_client.cl_time
@@ -97,10 +100,16 @@ struct nfsd_net {
97 bool nfsd_net_up; 100 bool nfsd_net_up;
98 bool lockd_up; 101 bool lockd_up;
99 102
103 /* Time of server startup */
104 struct timeval nfssvc_boot;
105
100 /* 106 /*
101 * Time of server startup 107 * Max number of connections this nfsd container will allow. Defaults
108 * to '0', which means it bases this on the number of threads.
102 */ 109 */
103 struct timeval nfssvc_boot; 110 unsigned int max_connections;
111
112 u32 clientid_counter;
104 113
105 struct svc_serv *nfsd_serv; 114 struct svc_serv *nfsd_serv;
106}; 115};
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
index 12b023a7ab7d..ac54ea60b3f6 100644
--- a/fs/nfsd/nfs2acl.c
+++ b/fs/nfsd/nfs2acl.c
@@ -54,14 +54,14 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp,
54 54
55 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { 55 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
56 acl = get_acl(inode, ACL_TYPE_ACCESS); 56 acl = get_acl(inode, ACL_TYPE_ACCESS);
57 if (IS_ERR(acl)) {
58 nfserr = nfserrno(PTR_ERR(acl));
59 goto fail;
60 }
61 if (acl == NULL) { 57 if (acl == NULL) {
62 /* Solaris returns the inode's minimum ACL. */ 58 /* Solaris returns the inode's minimum ACL. */
63 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 59 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
64 } 60 }
61 if (IS_ERR(acl)) {
62 nfserr = nfserrno(PTR_ERR(acl));
63 goto fail;
64 }
65 resp->acl_access = acl; 65 resp->acl_access = acl;
66 } 66 }
67 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { 67 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
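
The reorder above (mirrored in nfs3acl.c below) matters because the two calls signal failure differently: get_acl() returns NULL to mean "no ACL stored", while the posix_acl_from_mode() fallback reports failure only as an ERR_PTR. The old code tested IS_ERR() before the fallback ran, so an allocation failure in posix_acl_from_mode() escaped unchecked. The conventions, annotated:

acl = get_acl(inode, ACL_TYPE_ACCESS);
	/* NULL        -> no ACL on disk   */
	/* ERR_PTR(-E) -> real failure     */
if (acl == NULL)
	acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
	/* never NULL; failure comes back as ERR_PTR(-ENOMEM) */
if (IS_ERR(acl)) {	/* now catches failure from either call */
	nfserr = nfserrno(PTR_ERR(acl));
	goto fail;
}
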
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
index 2a514e21dc74..34cbbab6abd7 100644
--- a/fs/nfsd/nfs3acl.c
+++ b/fs/nfsd/nfs3acl.c
@@ -47,14 +47,14 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp,
47 47
48 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) { 48 if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
49 acl = get_acl(inode, ACL_TYPE_ACCESS); 49 acl = get_acl(inode, ACL_TYPE_ACCESS);
50 if (IS_ERR(acl)) {
51 nfserr = nfserrno(PTR_ERR(acl));
52 goto fail;
53 }
54 if (acl == NULL) { 50 if (acl == NULL) {
55 /* Solaris returns the inode's minimum ACL. */ 51 /* Solaris returns the inode's minimum ACL. */
56 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 52 acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
57 } 53 }
54 if (IS_ERR(acl)) {
55 nfserr = nfserrno(PTR_ERR(acl));
56 goto fail;
57 }
58 resp->acl_access = acl; 58 resp->acl_access = acl;
59 } 59 }
60 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) { 60 if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 401289913130..12f2aab4f614 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -157,11 +157,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp,
157 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) 157 * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
158 * + 1 (xdr opaque byte count) = 26 158 * + 1 (xdr opaque byte count) = 26
159 */ 159 */
160 160 resp->count = min(argp->count, max_blocksize);
161 resp->count = argp->count;
162 if (max_blocksize < resp->count)
163 resp->count = max_blocksize;
164
165 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); 161 svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
166 162
167 fh_copy(&resp->fh, &argp->fh); 163 fh_copy(&resp->fh, &argp->fh);
@@ -227,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp,
227 newfhp = fh_init(&resp->fh, NFS3_FHSIZE); 223 newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
228 attr = &argp->attrs; 224 attr = &argp->attrs;
229 225
230 /* Get the directory inode */
231 nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE);
232 if (nfserr)
233 RETURN_STATUS(nfserr);
234
235 /* Unfudge the mode bits */ 226 /* Unfudge the mode bits */
236 attr->ia_mode &= ~S_IFMT; 227 attr->ia_mode &= ~S_IFMT;
237 if (!(attr->ia_valid & ATTR_MODE)) { 228 if (!(attr->ia_valid & ATTR_MODE)) {
@@ -286,8 +277,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp, struct nfsd3_symlinkargs *argp,
286 fh_copy(&resp->dirfh, &argp->ffh); 277 fh_copy(&resp->dirfh, &argp->ffh);
287 fh_init(&resp->fh, NFS3_FHSIZE); 278 fh_init(&resp->fh, NFS3_FHSIZE);
288 nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen, 279 nfserr = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, argp->flen,
289 argp->tname, argp->tlen, 280 argp->tname, &resp->fh);
290 &resp->fh, &argp->attrs);
291 RETURN_STATUS(nfserr); 281 RETURN_STATUS(nfserr);
292} 282}
293 283
@@ -476,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp,
476 resp->buflen = resp->count; 466 resp->buflen = resp->count;
477 resp->rqstp = rqstp; 467 resp->rqstp = rqstp;
478 offset = argp->cookie; 468 offset = argp->cookie;
469
470 nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP);
471 if (nfserr)
472 RETURN_STATUS(nfserr);
473
474 if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS)
475 RETURN_STATUS(nfserr_notsupp);
476
479 nfserr = nfsd_readdir(rqstp, &resp->fh, 477 nfserr = nfsd_readdir(rqstp, &resp->fh,
480 &offset, 478 &offset,
481 &resp->common, 479 &resp->common,
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index e6c01e80325e..39c5eb3ad33a 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -120,10 +120,7 @@ decode_sattr3(__be32 *p, struct iattr *iap)
120 120
121 iap->ia_valid |= ATTR_SIZE; 121 iap->ia_valid |= ATTR_SIZE;
122 p = xdr_decode_hyper(p, &newsize); 122 p = xdr_decode_hyper(p, &newsize);
123 if (newsize <= NFS_OFFSET_MAX) 123 iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
124 iap->ia_size = newsize;
125 else
126 iap->ia_size = NFS_OFFSET_MAX;
127 } 124 }
128 if ((tmp = ntohl(*p++)) == 1) { /* set to server time */ 125 if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
129 iap->ia_valid |= ATTR_ATIME; 126 iap->ia_valid |= ATTR_ATIME;
@@ -338,10 +335,8 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
338 return 0; 335 return 0;
339 p = xdr_decode_hyper(p, &args->offset); 336 p = xdr_decode_hyper(p, &args->offset);
340 337
341 len = args->count = ntohl(*p++); 338 args->count = ntohl(*p++);
342 339 len = min(args->count, max_blocksize);
343 if (len > max_blocksize)
344 len = max_blocksize;
345 340
346 /* set up the kvec */ 341 /* set up the kvec */
347 v=0; 342 v=0;
@@ -349,7 +344,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
349 struct page *p = *(rqstp->rq_next_page++); 344 struct page *p = *(rqstp->rq_next_page++);
350 345
351 rqstp->rq_vec[v].iov_base = page_address(p); 346 rqstp->rq_vec[v].iov_base = page_address(p);
352 rqstp->rq_vec[v].iov_len = len < PAGE_SIZE? len : PAGE_SIZE; 347 rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
353 len -= rqstp->rq_vec[v].iov_len; 348 len -= rqstp->rq_vec[v].iov_len;
354 v++; 349 v++;
355 } 350 }
@@ -484,9 +479,7 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
484 } 479 }
485 /* now copy next page if there is one */ 480 /* now copy next page if there is one */
486 if (len && !avail && rqstp->rq_arg.page_len) { 481 if (len && !avail && rqstp->rq_arg.page_len) {
487 avail = rqstp->rq_arg.page_len; 482 avail = min_t(unsigned int, rqstp->rq_arg.page_len, PAGE_SIZE);
488 if (avail > PAGE_SIZE)
489 avail = PAGE_SIZE;
490 old = page_address(rqstp->rq_arg.pages[0]); 483 old = page_address(rqstp->rq_arg.pages[0]);
491 } 484 }
492 while (len && avail && *old) { 485 while (len && avail && *old) {
@@ -571,10 +564,7 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
571 args->verf = p; p += 2; 564 args->verf = p; p += 2;
572 args->dircount = ~0; 565 args->dircount = ~0;
573 args->count = ntohl(*p++); 566 args->count = ntohl(*p++);
574 567 args->count = min_t(u32, args->count, PAGE_SIZE);
575 if (args->count > PAGE_SIZE)
576 args->count = PAGE_SIZE;
577
578 args->buffer = page_address(*(rqstp->rq_next_page++)); 568 args->buffer = page_address(*(rqstp->rq_next_page++));
579 569
580 return xdr_argsize_check(rqstp, p); 570 return xdr_argsize_check(rqstp, p);
@@ -595,10 +585,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p,
595 args->dircount = ntohl(*p++); 585 args->dircount = ntohl(*p++);
596 args->count = ntohl(*p++); 586 args->count = ntohl(*p++);
597 587
598 len = (args->count > max_blocksize) ? max_blocksize : 588 len = args->count = min(args->count, max_blocksize);
599 args->count;
600 args->count = len;
601
602 while (len > 0) { 589 while (len > 0) {
603 struct page *p = *(rqstp->rq_next_page++); 590 struct page *p = *(rqstp->rq_next_page++);
604 if (!args->buffer) 591 if (!args->buffer)
@@ -913,8 +900,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
913 */ 900 */
914 901
915 /* truncate filename if too long */ 902 /* truncate filename if too long */
916 if (namlen > NFS3_MAXNAMLEN) 903 namlen = min(namlen, NFS3_MAXNAMLEN);
917 namlen = NFS3_MAXNAMLEN;
918 904
919 slen = XDR_QUADLEN(namlen); 905 slen = XDR_QUADLEN(namlen);
920 elen = slen + NFS3_ENTRY_BAGGAGE 906 elen = slen + NFS3_ENTRY_BAGGAGE
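
Every conversion in this file is the same clamp: bound a client-supplied count by a server limit. The kernel's min() insists both operands share a type, so the mixed-type sites (len against PAGE_SIZE, newsize against NFS_OFFSET_MAX) use min_t() with an explicit type. A standalone illustration of the semantics — simplified macros; the kernel versions also guard against double evaluation:

#include <stdio.h>

#define min(a, b)          ((a) < (b) ? (a) : (b))
#define min_t(type, a, b)  min((type)(a), (type)(b))

int main(void)
{
	unsigned int count = 70000;	/* client-supplied */
	unsigned long limit = 4096;	/* e.g. PAGE_SIZE  */

	/* cast both sides to one type, then compare */
	printf("%u\n", min_t(unsigned int, count, limit));	/* prints 4096 */
	return 0;
}
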
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index d714156a19fd..59fd76651781 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -146,35 +146,43 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
146 int size = 0; 146 int size = 0;
147 147
148 pacl = get_acl(inode, ACL_TYPE_ACCESS); 148 pacl = get_acl(inode, ACL_TYPE_ACCESS);
149 if (!pacl) { 149 if (!pacl)
150 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL); 150 pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
151 if (IS_ERR(pacl)) 151
152 return PTR_ERR(pacl); 152 if (IS_ERR(pacl))
153 } 153 return PTR_ERR(pacl);
154
154 /* allocate for worst case: one (deny, allow) pair each: */ 155 /* allocate for worst case: one (deny, allow) pair each: */
155 size += 2 * pacl->a_count; 156 size += 2 * pacl->a_count;
156 157
157 if (S_ISDIR(inode->i_mode)) { 158 if (S_ISDIR(inode->i_mode)) {
158 flags = NFS4_ACL_DIR; 159 flags = NFS4_ACL_DIR;
159 dpacl = get_acl(inode, ACL_TYPE_DEFAULT); 160 dpacl = get_acl(inode, ACL_TYPE_DEFAULT);
161 if (IS_ERR(dpacl)) {
162 error = PTR_ERR(dpacl);
163 goto rel_pacl;
164 }
165
160 if (dpacl) 166 if (dpacl)
161 size += 2 * dpacl->a_count; 167 size += 2 * dpacl->a_count;
162 } 168 }
163 169
164 *acl = nfs4_acl_new(size); 170 *acl = kmalloc(nfs4_acl_bytes(size), GFP_KERNEL);
165 if (*acl == NULL) { 171 if (*acl == NULL) {
166 error = -ENOMEM; 172 error = -ENOMEM;
167 goto out; 173 goto out;
168 } 174 }
175 (*acl)->naces = 0;
169 176
170 _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT); 177 _posix_to_nfsv4_one(pacl, *acl, flags & ~NFS4_ACL_TYPE_DEFAULT);
171 178
172 if (dpacl) 179 if (dpacl)
173 _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT); 180 _posix_to_nfsv4_one(dpacl, *acl, flags | NFS4_ACL_TYPE_DEFAULT);
174 181
175 out: 182out:
176 posix_acl_release(pacl);
177 posix_acl_release(dpacl); 183 posix_acl_release(dpacl);
184rel_pacl:
185 posix_acl_release(pacl);
178 return error; 186 return error;
179} 187}
180 188
@@ -872,16 +880,13 @@ ace2type(struct nfs4_ace *ace)
872 return -1; 880 return -1;
873} 881}
874 882
875struct nfs4_acl * 883/*
876nfs4_acl_new(int n) 884 * return the size of the struct nfs4_acl required to represent an acl
885 * with @entries entries.
886 */
887int nfs4_acl_bytes(int entries)
877{ 888{
878 struct nfs4_acl *acl; 889 return sizeof(struct nfs4_acl) + entries * sizeof(struct nfs4_ace);
879
880 acl = kmalloc(sizeof(*acl) + n*sizeof(struct nfs4_ace), GFP_KERNEL);
881 if (acl == NULL)
882 return NULL;
883 acl->naces = 0;
884 return acl;
885} 890}
886 891
887static struct { 892static struct {
@@ -935,5 +940,5 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who)
935 return 0; 940 return 0;
936 } 941 }
937 WARN_ON_ONCE(1); 942 WARN_ON_ONCE(1);
938 return -1; 943 return nfserr_serverfault;
939} 944}
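
Replacing the allocating nfs4_acl_new() with the pure sizing helper nfs4_acl_bytes() separates "how big" from "allocate", presumably so other callers can embed the ACL in a larger allocation; the cost is that each caller now initialises naces itself. The worst case is unchanged — every POSIX ACE may expand to one (deny, allow) NFSv4 pair. Caller-side pattern, condensed from the nfsd4_get_nfs4_acl() hunk above:

int entries = 2 * pacl->a_count;	/* worst case: deny+allow per ACE */

if (dpacl)
	entries += 2 * dpacl->a_count;	/* default ACL doubles up too */

acl = kmalloc(nfs4_acl_bytes(entries), GFP_KERNEL);
if (acl == NULL)
	return -ENOMEM;
acl->naces = 0;		/* nfs4_acl_new() used to zero this */
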
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 2c73cae9899d..ed2b1151b171 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
49 49
50/* Index of predefined Linux callback client operations */ 50/* Index of predefined Linux callback client operations */
51 51
52enum {
53 NFSPROC4_CLNT_CB_NULL = 0,
54 NFSPROC4_CLNT_CB_RECALL,
55 NFSPROC4_CLNT_CB_SEQUENCE,
56};
57
58struct nfs4_cb_compound_hdr { 52struct nfs4_cb_compound_hdr {
59 /* args */ 53 /* args */
60 u32 ident; /* minorversion 0 only */ 54 u32 ident; /* minorversion 0 only */
@@ -337,7 +331,7 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
337 p = xdr_reserve_space(xdr, 4); 331 p = xdr_reserve_space(xdr, 4);
338 *p++ = xdr_zero; /* truncate */ 332 *p++ = xdr_zero; /* truncate */
339 333
340 encode_nfs_fh4(xdr, &dp->dl_fh); 334 encode_nfs_fh4(xdr, &dp->dl_stid.sc_file->fi_fhandle);
341 335
342 hdr->nops++; 336 hdr->nops++;
343} 337}
@@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
494static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, 488static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
495 const struct nfsd4_callback *cb) 489 const struct nfsd4_callback *cb)
496{ 490{
497 const struct nfs4_delegation *args = cb->cb_op; 491 const struct nfs4_delegation *dp = cb_to_delegation(cb);
498 struct nfs4_cb_compound_hdr hdr = { 492 struct nfs4_cb_compound_hdr hdr = {
499 .ident = cb->cb_clp->cl_cb_ident, 493 .ident = cb->cb_clp->cl_cb_ident,
500 .minorversion = cb->cb_minorversion, 494 .minorversion = cb->cb_minorversion,
@@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
502 496
503 encode_cb_compound4args(xdr, &hdr); 497 encode_cb_compound4args(xdr, &hdr);
504 encode_cb_sequence4args(xdr, cb, &hdr); 498 encode_cb_sequence4args(xdr, cb, &hdr);
505 encode_cb_recall4args(xdr, args, &hdr); 499 encode_cb_recall4args(xdr, dp, &hdr);
506 encode_cb_nops(&hdr); 500 encode_cb_nops(&hdr);
507} 501}
508 502
@@ -678,7 +672,7 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
678 (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5)) 672 (clp->cl_cred.cr_flavor >= RPC_AUTH_GSS_KRB5))
679 return -EINVAL; 673 return -EINVAL;
680 args.client_name = clp->cl_cred.cr_principal; 674 args.client_name = clp->cl_cred.cr_principal;
681 args.prognumber = conn->cb_prog, 675 args.prognumber = conn->cb_prog;
682 args.protocol = XPRT_TRANSPORT_TCP; 676 args.protocol = XPRT_TRANSPORT_TCP;
683 args.authflavor = clp->cl_cred.cr_flavor; 677 args.authflavor = clp->cl_cred.cr_flavor;
684 clp->cl_cb_ident = conn->cb_ident; 678 clp->cl_cb_ident = conn->cb_ident;
@@ -689,7 +683,8 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
689 clp->cl_cb_session = ses; 683 clp->cl_cb_session = ses;
690 args.bc_xprt = conn->cb_xprt; 684 args.bc_xprt = conn->cb_xprt;
691 args.prognumber = clp->cl_cb_session->se_cb_prog; 685 args.prognumber = clp->cl_cb_session->se_cb_prog;
692 args.protocol = XPRT_TRANSPORT_BC_TCP; 686 args.protocol = conn->cb_xprt->xpt_class->xcl_ident |
687 XPRT_TRANSPORT_BC;
693 args.authflavor = ses->se_cb_sec.flavor; 688 args.authflavor = ses->se_cb_sec.flavor;
694 } 689 }
695 /* Create RPC client */ 690 /* Create RPC client */
@@ -745,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
745 740
746static struct workqueue_struct *callback_wq; 741static struct workqueue_struct *callback_wq;
747 742
748static void run_nfsd4_cb(struct nfsd4_callback *cb)
749{
750 queue_work(callback_wq, &cb->cb_work);
751}
752
753static void do_probe_callback(struct nfs4_client *clp)
754{
755 struct nfsd4_callback *cb = &clp->cl_cb_null;
756
757 cb->cb_op = NULL;
758 cb->cb_clp = clp;
759
760 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL];
761 cb->cb_msg.rpc_argp = NULL;
762 cb->cb_msg.rpc_resp = NULL;
763
764 cb->cb_ops = &nfsd4_cb_probe_ops;
765
766 run_nfsd4_cb(cb);
767}
768
769/* 743/*
770 * Poke the callback thread to process any updates to the callback 744 * Poke the callback thread to process any updates to the callback
771 * parameters, and send a null probe. 745 * parameters, and send a null probe.
@@ -774,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp)
774{ 748{
775 clp->cl_cb_state = NFSD4_CB_UNKNOWN; 749 clp->cl_cb_state = NFSD4_CB_UNKNOWN;
776 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); 750 set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
777 do_probe_callback(clp); 751 nfsd4_run_cb(&clp->cl_cb_null);
778} 752}
779 753
780void nfsd4_probe_callback_sync(struct nfs4_client *clp) 754void nfsd4_probe_callback_sync(struct nfs4_client *clp)
@@ -846,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
846 rpc_wake_up_next(&clp->cl_cb_waitq); 820 rpc_wake_up_next(&clp->cl_cb_waitq);
847 dprintk("%s: freed slot, new seqid=%d\n", __func__, 821 dprintk("%s: freed slot, new seqid=%d\n", __func__,
848 clp->cl_cb_session->se_cb_seq_nr); 822 clp->cl_cb_session->se_cb_seq_nr);
849
850 /* We're done looking into the sequence information */
851 task->tk_msg.rpc_resp = NULL;
852 } 823 }
853}
854
855 824
856static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) 825 if (clp->cl_cb_client != task->tk_client) {
857{
858 struct nfsd4_callback *cb = calldata;
859 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
860 struct nfs4_client *clp = cb->cb_clp;
861 struct rpc_clnt *current_rpc_client = clp->cl_cb_client;
862
863 nfsd4_cb_done(task, calldata);
864
865 if (current_rpc_client != task->tk_client) {
866 /* We're shutting down or changing cl_cb_client; leave 826 /* We're shutting down or changing cl_cb_client; leave
867 * it to nfsd4_process_cb_update to restart the call if 827 * it to nfsd4_process_cb_update to restart the call if
868 * necessary. */ 828 * necessary. */
@@ -871,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata)
871 831
872 if (cb->cb_done) 832 if (cb->cb_done)
873 return; 833 return;
874 switch (task->tk_status) { 834
835 switch (cb->cb_ops->done(cb, task)) {
875 case 0: 836 case 0:
876 cb->cb_done = true; 837 task->tk_status = 0;
838 rpc_restart_call_prepare(task);
877 return; 839 return;
878 case -EBADHANDLE: 840 case 1:
879 case -NFS4ERR_BAD_STATEID:
880 /* Race: client probably got cb_recall
881 * before open reply granting delegation */
882 break; 841 break;
883 default: 842 case -1:
884 /* Network partition? */ 843 /* Network partition? */
885 nfsd4_mark_cb_down(clp, task->tk_status); 844 nfsd4_mark_cb_down(clp, task->tk_status);
845 break;
846 default:
847 BUG();
886 } 848 }
887 if (dp->dl_retries--) {
888 rpc_delay(task, 2*HZ);
889 task->tk_status = 0;
890 rpc_restart_call_prepare(task);
891 return;
892 }
893 nfsd4_mark_cb_down(clp, task->tk_status);
894 cb->cb_done = true; 849 cb->cb_done = true;
895} 850}
896 851
897static void nfsd4_cb_recall_release(void *calldata) 852static void nfsd4_cb_release(void *calldata)
898{ 853{
899 struct nfsd4_callback *cb = calldata; 854 struct nfsd4_callback *cb = calldata;
900 struct nfs4_client *clp = cb->cb_clp; 855 struct nfs4_client *clp = cb->cb_clp;
901 struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall);
902 856
903 if (cb->cb_done) { 857 if (cb->cb_done) {
904 spin_lock(&clp->cl_lock); 858 spin_lock(&clp->cl_lock);
905 list_del(&cb->cb_per_client); 859 list_del(&cb->cb_per_client);
906 spin_unlock(&clp->cl_lock); 860 spin_unlock(&clp->cl_lock);
907 nfs4_put_delegation(dp); 861
862 cb->cb_ops->release(cb);
908 } 863 }
909} 864}
910 865
911static const struct rpc_call_ops nfsd4_cb_recall_ops = { 866static const struct rpc_call_ops nfsd4_cb_ops = {
912 .rpc_call_prepare = nfsd4_cb_prepare, 867 .rpc_call_prepare = nfsd4_cb_prepare,
913 .rpc_call_done = nfsd4_cb_recall_done, 868 .rpc_call_done = nfsd4_cb_done,
914 .rpc_release = nfsd4_cb_recall_release, 869 .rpc_release = nfsd4_cb_release,
915}; 870};
916 871
917int nfsd4_create_callback_queue(void) 872int nfsd4_create_callback_queue(void)
@@ -933,19 +888,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
933 set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags); 888 set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
934 /* 889 /*
935 * Note this won't actually result in a null callback; 890 * Note this won't actually result in a null callback;
936 * instead, nfsd4_do_callback_rpc() will detect the killed 891 * instead, nfsd4_run_cb_null() will detect the killed
937 * client, destroy the rpc client, and stop: 892 * client, destroy the rpc client, and stop:
938 */ 893 */
939 do_probe_callback(clp); 894 nfsd4_run_cb(&clp->cl_cb_null);
940 flush_workqueue(callback_wq); 895 flush_workqueue(callback_wq);
941} 896}
942 897
943static void nfsd4_release_cb(struct nfsd4_callback *cb)
944{
945 if (cb->cb_ops->rpc_release)
946 cb->cb_ops->rpc_release(cb);
947}
948
949/* requires cl_lock: */ 898/* requires cl_lock: */
950static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) 899static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
951{ 900{
@@ -1008,50 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
1008 } 957 }
1009 /* Yay, the callback channel's back! Restart any callbacks: */ 958 /* Yay, the callback channel's back! Restart any callbacks: */
1010 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) 959 list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client)
1011 run_nfsd4_cb(cb); 960 queue_work(callback_wq, &cb->cb_work);
1012} 961}
1013 962
1014static void nfsd4_do_callback_rpc(struct work_struct *w) 963static void
964nfsd4_run_cb_work(struct work_struct *work)
1015{ 965{
1016 struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, cb_work); 966 struct nfsd4_callback *cb =
967 container_of(work, struct nfsd4_callback, cb_work);
1017 struct nfs4_client *clp = cb->cb_clp; 968 struct nfs4_client *clp = cb->cb_clp;
1018 struct rpc_clnt *clnt; 969 struct rpc_clnt *clnt;
1019 970
971 if (cb->cb_ops && cb->cb_ops->prepare)
972 cb->cb_ops->prepare(cb);
973
1020 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) 974 if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
1021 nfsd4_process_cb_update(cb); 975 nfsd4_process_cb_update(cb);
1022 976
1023 clnt = clp->cl_cb_client; 977 clnt = clp->cl_cb_client;
1024 if (!clnt) { 978 if (!clnt) {
1025 /* Callback channel broken, or client killed; give up: */ 979 /* Callback channel broken, or client killed; give up: */
1026 nfsd4_release_cb(cb); 980 if (cb->cb_ops && cb->cb_ops->release)
981 cb->cb_ops->release(cb);
1027 return; 982 return;
1028 } 983 }
1029 cb->cb_msg.rpc_cred = clp->cl_cb_cred; 984 cb->cb_msg.rpc_cred = clp->cl_cb_cred;
1030 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, 985 rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
1031 cb->cb_ops, cb); 986 cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
1032}
1033
1034void nfsd4_init_callback(struct nfsd4_callback *cb)
1035{
1036 INIT_WORK(&cb->cb_work, nfsd4_do_callback_rpc);
1037} 987}
1038 988
1039void nfsd4_cb_recall(struct nfs4_delegation *dp) 989void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
990 struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
1040{ 991{
1041 struct nfsd4_callback *cb = &dp->dl_recall;
1042 struct nfs4_client *clp = dp->dl_stid.sc_client;
1043
1044 dp->dl_retries = 1;
1045 cb->cb_op = dp;
1046 cb->cb_clp = clp; 992 cb->cb_clp = clp;
1047 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; 993 cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
1048 cb->cb_msg.rpc_argp = cb; 994 cb->cb_msg.rpc_argp = cb;
1049 cb->cb_msg.rpc_resp = cb; 995 cb->cb_msg.rpc_resp = cb;
1050 996 cb->cb_ops = ops;
1051 cb->cb_ops = &nfsd4_cb_recall_ops; 997 INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
1052
1053 INIT_LIST_HEAD(&cb->cb_per_client); 998 INIT_LIST_HEAD(&cb->cb_per_client);
1054 cb->cb_done = true; 999 cb->cb_done = true;
1000}
1055 1001
1056 run_nfsd4_cb(&dp->dl_recall); 1002void nfsd4_run_cb(struct nfsd4_callback *cb)
1003{
1004 queue_work(callback_wq, &cb->cb_work);
1057} 1005}
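
The net effect of this file's changes: the recall-specific rpc_call_ops collapse into one generic nfsd4_cb_ops table, which defers the per-callback decisions to a nfsd4_callback_ops vtable, and issuers now pair nfsd4_init_cb() with nfsd4_run_cb(). A hedged sketch of a recall issuer against the new API — the ops bodies are placeholders, and NFSPROC4_CLNT_CB_RECALL is assumed to have moved to a shared header; only the two entry points and the done() return convention appear in the hunks above:

static void demo_prepare(struct nfsd4_callback *cb)
{
	/* runs from the workqueue before the rpc is (re)sent */
}

static int demo_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
	/* 0: restart the call; 1: finished; -1: mark the channel down */
	return 1;
}

static void demo_release(struct nfsd4_callback *cb)
{
	/* drop whatever reference pins the containing object */
}

static struct nfsd4_callback_ops demo_recall_ops = {
	.prepare = demo_prepare,
	.done    = demo_done,
	.release = demo_release,
};

/* at delegation setup time: */
nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
	      &demo_recall_ops, NFSPROC4_CLNT_CB_RECALL);
/* when a conflicting open forces a recall: */
nfsd4_run_cb(&dp->dl_recall);
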
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index a0ab0a847d69..e1b3d3d472da 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
215 memset(&ent, 0, sizeof(ent)); 215 memset(&ent, 0, sizeof(ent));
216 216
217 /* Authentication name */ 217 /* Authentication name */
218 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 218 len = qword_get(&buf, buf1, PAGE_SIZE);
219 if (len <= 0 || len >= IDMAP_NAMESZ)
219 goto out; 220 goto out;
220 memcpy(ent.authname, buf1, sizeof(ent.authname)); 221 memcpy(ent.authname, buf1, sizeof(ent.authname));
221 222
@@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
245 /* Name */ 246 /* Name */
246 error = -EINVAL; 247 error = -EINVAL;
247 len = qword_get(&buf, buf1, PAGE_SIZE); 248 len = qword_get(&buf, buf1, PAGE_SIZE);
248 if (len < 0) 249 if (len < 0 || len >= IDMAP_NAMESZ)
249 goto out; 250 goto out;
250 if (len == 0) 251 if (len == 0)
251 set_bit(CACHE_NEGATIVE, &ent.h.flags); 252 set_bit(CACHE_NEGATIVE, &ent.h.flags);
252 else if (len >= IDMAP_NAMESZ)
253 goto out;
254 else 253 else
255 memcpy(ent.name, buf1, sizeof(ent.name)); 254 memcpy(ent.name, buf1, sizeof(ent.name));
256 error = -ENOMEM; 255 error = -ENOMEM;
@@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen)
259 goto out; 258 goto out;
260 259
261 cache_put(&res->h, cd); 260 cache_put(&res->h, cd);
262
263 error = 0; 261 error = 0;
264out: 262out:
265 kfree(buf1); 263 kfree(buf1);
266
267 return error; 264 return error;
268} 265}
269 266
270
271static struct ent * 267static struct ent *
272idtoname_lookup(struct cache_detail *cd, struct ent *item) 268idtoname_lookup(struct cache_detail *cd, struct ent *item)
273{ 269{
@@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
368{ 364{
369 struct ent ent, *res; 365 struct ent ent, *res;
370 char *buf1; 366 char *buf1;
371 int error = -EINVAL; 367 int len, error = -EINVAL;
372 368
373 if (buf[buflen - 1] != '\n') 369 if (buf[buflen - 1] != '\n')
374 return (-EINVAL); 370 return (-EINVAL);
@@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
381 memset(&ent, 0, sizeof(ent)); 377 memset(&ent, 0, sizeof(ent));
382 378
383 /* Authentication name */ 379 /* Authentication name */
384 if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) 380 len = qword_get(&buf, buf1, PAGE_SIZE);
381 if (len <= 0 || len >= IDMAP_NAMESZ)
385 goto out; 382 goto out;
386 memcpy(ent.authname, buf1, sizeof(ent.authname)); 383 memcpy(ent.authname, buf1, sizeof(ent.authname));
387 384
@@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
392 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; 389 IDMAP_TYPE_USER : IDMAP_TYPE_GROUP;
393 390
394 /* Name */ 391 /* Name */
395 error = qword_get(&buf, buf1, PAGE_SIZE); 392 len = qword_get(&buf, buf1, PAGE_SIZE);
396 if (error <= 0 || error >= IDMAP_NAMESZ) 393 if (len <= 0 || len >= IDMAP_NAMESZ)
397 goto out; 394 goto out;
398 memcpy(ent.name, buf1, sizeof(ent.name)); 395 memcpy(ent.name, buf1, sizeof(ent.name));
399 396
@@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
421 error = 0; 418 error = 0;
422out: 419out:
423 kfree(buf1); 420 kfree(buf1);
424
425 return (error); 421 return (error);
426} 422}
427 423
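
All four parser changes enforce one invariant: qword_get() returns the token length (or <= 0 on error/empty input), and names are copied into fixed IDMAP_NAMESZ buffers, so the length must be rejected before the memcpy(), not after. The authentication-name shape (the Name field in idtoname additionally allows len == 0 to mark a negative cache entry):

len = qword_get(&buf, buf1, PAGE_SIZE);
if (len <= 0 || len >= IDMAP_NAMESZ)	/* empty, error, or would not fit */
	goto out;
memcpy(ent.authname, buf1, sizeof(ent.authname));
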
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8f029db5d271..cdeb3cfd6f32 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -177,7 +177,7 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
177 fh_put(dst); 177 fh_put(dst);
178 dget(src->fh_dentry); 178 dget(src->fh_dentry);
179 if (src->fh_export) 179 if (src->fh_export)
180 cache_get(&src->fh_export->h); 180 exp_get(src->fh_export);
181 *dst = *src; 181 *dst = *src;
182} 182}
183 183
@@ -385,8 +385,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
385 if (nfsd4_has_session(cstate)) 385 if (nfsd4_has_session(cstate))
386 copy_clientid(&open->op_clientid, cstate->session); 386 copy_clientid(&open->op_clientid, cstate->session);
387 387
388 nfs4_lock_state();
389
390 /* check seqid for replay. set nfs4_owner */ 388 /* check seqid for replay. set nfs4_owner */
391 resp = rqstp->rq_resp; 389 resp = rqstp->rq_resp;
392 status = nfsd4_process_open1(&resp->cstate, open, nn); 390 status = nfsd4_process_open1(&resp->cstate, open, nn);
@@ -431,8 +429,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
431 break; 429 break;
432 case NFS4_OPEN_CLAIM_PREVIOUS: 430 case NFS4_OPEN_CLAIM_PREVIOUS:
433 status = nfs4_check_open_reclaim(&open->op_clientid, 431 status = nfs4_check_open_reclaim(&open->op_clientid,
434 cstate->minorversion, 432 cstate, nn);
435 nn);
436 if (status) 433 if (status)
437 goto out; 434 goto out;
438 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED; 435 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
@@ -461,19 +458,17 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
461 * set, (2) sets open->op_stateid, (3) sets open->op_delegation. 458 * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
462 */ 459 */
463 status = nfsd4_process_open2(rqstp, resfh, open); 460 status = nfsd4_process_open2(rqstp, resfh, open);
464 WARN_ON(status && open->op_created); 461 WARN(status && open->op_created,
462 "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
463 be32_to_cpu(status));
465out: 464out:
466 if (resfh && resfh != &cstate->current_fh) { 465 if (resfh && resfh != &cstate->current_fh) {
467 fh_dup2(&cstate->current_fh, resfh); 466 fh_dup2(&cstate->current_fh, resfh);
468 fh_put(resfh); 467 fh_put(resfh);
469 kfree(resfh); 468 kfree(resfh);
470 } 469 }
471 nfsd4_cleanup_open_state(open, status); 470 nfsd4_cleanup_open_state(cstate, open, status);
472 if (open->op_openowner && !nfsd4_has_session(cstate))
473 cstate->replay_owner = &open->op_openowner->oo_owner;
474 nfsd4_bump_seqid(cstate, status); 471 nfsd4_bump_seqid(cstate, status);
475 if (!cstate->replay_owner)
476 nfs4_unlock_state();
477 return status; 472 return status;
478} 473}
479 474
@@ -581,8 +576,12 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
581 __be32 verf[2]; 576 __be32 verf[2];
582 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 577 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
583 578
584 verf[0] = (__be32)nn->nfssvc_boot.tv_sec; 579 /*
585 verf[1] = (__be32)nn->nfssvc_boot.tv_usec; 580 * This is opaque to client, so no need to byte-swap. Use
581 * __force to keep sparse happy
582 */
583 verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
584 verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec;
586 memcpy(verifier->data, verf, sizeof(verifier->data)); 585 memcpy(verifier->data, verf, sizeof(verifier->data));
587} 586}
588 587
@@ -619,8 +618,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
619 case NF4LNK: 618 case NF4LNK:
620 status = nfsd_symlink(rqstp, &cstate->current_fh, 619 status = nfsd_symlink(rqstp, &cstate->current_fh,
621 create->cr_name, create->cr_namelen, 620 create->cr_name, create->cr_namelen,
622 create->cr_linkname, create->cr_linklen, 621 create->cr_data, &resfh);
623 &resfh, &create->cr_iattr);
624 break; 622 break;
625 623
626 case NF4BLK: 624 case NF4BLK:
@@ -909,8 +907,8 @@ nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstat
909 default: 907 default:
910 return nfserr_inval; 908 return nfserr_inval;
911 } 909 }
912 exp_get(cstate->current_fh.fh_export); 910
913 sin->sin_exp = cstate->current_fh.fh_export; 911 sin->sin_exp = exp_get(cstate->current_fh.fh_export);
914 fh_put(&cstate->current_fh); 912 fh_put(&cstate->current_fh);
915 return nfs_ok; 913 return nfs_ok;
916} 914}
@@ -1015,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1015 return status; 1013 return status;
1016} 1014}
1017 1015
1016static __be32
1017nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1018 struct nfsd4_seek *seek)
1019{
1020 int whence;
1021 __be32 status;
1022 struct file *file;
1023
1024 status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate,
1025 &seek->seek_stateid,
1026 RD_STATE, &file);
1027 if (status) {
1028 dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
1029 return status;
1030 }
1031
1032 switch (seek->seek_whence) {
1033 case NFS4_CONTENT_DATA:
1034 whence = SEEK_DATA;
1035 break;
1036 case NFS4_CONTENT_HOLE:
1037 whence = SEEK_HOLE;
1038 break;
1039 default:
1040 status = nfserr_union_notsupp;
1041 goto out;
1042 }
1043
1044 /*
1045 * Note: This call does change file->f_pos, but nothing in NFSD
	1046 * should ever use file->f_pos.
1047 */
1048 seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence);
1049 if (seek->seek_pos < 0)
1050 status = nfserrno(seek->seek_pos);
1051 else if (seek->seek_pos >= i_size_read(file_inode(file)))
1052 seek->seek_eof = true;
1053
1054out:
1055 fput(file);
1056 return status;
1057}
1058
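The new nfsd4_seek() translates NFS4_CONTENT_DATA and NFS4_CONTENT_HOLE into the local SEEK_DATA and SEEK_HOLE whence values and reports seek_eof when the result lands at or beyond i_size. A small user-space sketch of the same vfs_llseek() semantics; the filename handling is illustrative:

	#define _GNU_SOURCE	/* SEEK_DATA/SEEK_HOLE */
	#include <stdio.h>
	#include <unistd.h>
	#include <fcntl.h>

	int main(int argc, char **argv)
	{
		off_t pos;
		int fd;

		if (argc < 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		/* Find the first data region at or after offset 0. */
		pos = lseek(fd, 0, SEEK_DATA);
		if (pos < 0)
			perror("lseek");	/* e.g. ENXIO when no data follows */
		else
			printf("first data at %lld\n", (long long)pos);
		close(fd);
		return 0;
	}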
1018/* This routine never returns NFS_OK! If there are no other errors, it 1059/* This routine never returns NFS_OK! If there are no other errors, it
1019 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the 1060 * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the
1020 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME 1061 * attributes matched. VERIFY is implemented by mapping NFSERR_SAME
@@ -1289,7 +1330,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
1289 * Don't use the deferral mechanism for NFSv4; compounds make it 1330 * Don't use the deferral mechanism for NFSv4; compounds make it
1290 * too hard to avoid non-idempotency problems. 1331 * too hard to avoid non-idempotency problems.
1291 */ 1332 */
1292 rqstp->rq_usedeferral = 0; 1333 rqstp->rq_usedeferral = false;
1293 1334
1294 /* 1335 /*
1295 * According to RFC3010, this takes precedence over all other errors. 1336 * According to RFC3010, this takes precedence over all other errors.
@@ -1391,10 +1432,7 @@ encode_op:
1391 args->ops, args->opcnt, resp->opcnt, op->opnum, 1432 args->ops, args->opcnt, resp->opcnt, op->opnum,
1392 be32_to_cpu(status)); 1433 be32_to_cpu(status));
1393 1434
1394 if (cstate->replay_owner) { 1435 nfsd4_cstate_clear_replay(cstate);
1395 nfs4_unlock_state();
1396 cstate->replay_owner = NULL;
1397 }
1398 /* XXX Ugh, we need to get rid of this kind of special case: */ 1436 /* XXX Ugh, we need to get rid of this kind of special case: */
1399 if (op->opnum == OP_READ && op->u.read.rd_filp) 1437 if (op->opnum == OP_READ && op->u.read.rd_filp)
1400 fput(op->u.read.rd_filp); 1438 fput(op->u.read.rd_filp);
@@ -1408,7 +1446,7 @@ encode_op:
1408 BUG_ON(cstate->replay_owner); 1446 BUG_ON(cstate->replay_owner);
1409out: 1447out:
1410 /* Reset deferral mechanism for RPC deferrals */ 1448 /* Reset deferral mechanism for RPC deferrals */
1411 rqstp->rq_usedeferral = 1; 1449 rqstp->rq_usedeferral = true;
1412 dprintk("nfsv4 compound returned %d\n", ntohl(status)); 1450 dprintk("nfsv4 compound returned %d\n", ntohl(status));
1413 return status; 1451 return status;
1414} 1452}
@@ -1520,21 +1558,17 @@ static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1520 u32 maxcount = 0, rlen = 0; 1558 u32 maxcount = 0, rlen = 0;
1521 1559
1522 maxcount = svc_max_payload(rqstp); 1560 maxcount = svc_max_payload(rqstp);
1523 rlen = op->u.read.rd_length; 1561 rlen = min(op->u.read.rd_length, maxcount);
1524
1525 if (rlen > maxcount)
1526 rlen = maxcount;
1527 1562
1528 return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32); 1563 return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
1529} 1564}
1530 1565
1531static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op) 1566static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1532{ 1567{
1533 u32 maxcount = svc_max_payload(rqstp); 1568 u32 maxcount = 0, rlen = 0;
1534 u32 rlen = op->u.readdir.rd_maxcount;
1535 1569
1536 if (rlen > maxcount) 1570 maxcount = svc_max_payload(rqstp);
1537 rlen = maxcount; 1571 rlen = min(op->u.readdir.rd_maxcount, maxcount);
1538 1572
1539 return (op_encode_hdr_size + op_encode_verifier_maxsz + 1573 return (op_encode_hdr_size + op_encode_verifier_maxsz +
1540 XDR_QUADLEN(rlen)) * sizeof(__be32); 1574 XDR_QUADLEN(rlen)) * sizeof(__be32);
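Both estimators now clamp the client-requested length with min() before rounding up to XDR words. The arithmetic, restated as a hedged stand-alone sketch in which hdr_words stands in for op_encode_hdr_size:

	/* XDR_QUADLEN rounds a byte count up to 32-bit XDR words; the reply
	 * estimate is (header + 2 words + payload words) scaled back to bytes. */
	#define XDR_QUADLEN(l)	(((l) + 3) >> 2)

	static unsigned int read_rsize_est(unsigned int hdr_words,
					   unsigned int rlen, unsigned int maxcount)
	{
		if (rlen > maxcount)	/* same clamp as min(rlen, maxcount) */
			rlen = maxcount;
		return (hdr_words + 2 + XDR_QUADLEN(rlen)) * 4;
	}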
@@ -1890,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
1890 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 1924 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1891 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 1925 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1892 }, 1926 },
1927
1928 /* NFSv4.2 operations */
1929 [OP_SEEK] = {
1930 .op_func = (nfsd4op_func)nfsd4_seek,
1931 .op_name = "OP_SEEK",
1932 },
1893}; 1933};
1894 1934
1895int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) 1935int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 9c271f42604a..a25490ae6c62 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops {
58 void (*create)(struct nfs4_client *); 58 void (*create)(struct nfs4_client *);
59 void (*remove)(struct nfs4_client *); 59 void (*remove)(struct nfs4_client *);
60 int (*check)(struct nfs4_client *); 60 int (*check)(struct nfs4_client *);
61 void (*grace_done)(struct nfsd_net *, time_t); 61 void (*grace_done)(struct nfsd_net *);
62}; 62};
63 63
64/* Globals */ 64/* Globals */
@@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
188 188
189 status = mnt_want_write_file(nn->rec_file); 189 status = mnt_want_write_file(nn->rec_file);
190 if (status) 190 if (status)
191 return; 191 goto out_creds;
192 192
193 dir = nn->rec_file->f_path.dentry; 193 dir = nn->rec_file->f_path.dentry;
194 /* lock the parent */ 194 /* lock the parent */
@@ -228,6 +228,7 @@ out_unlock:
228 user_recovery_dirname); 228 user_recovery_dirname);
229 } 229 }
230 mnt_drop_write_file(nn->rec_file); 230 mnt_drop_write_file(nn->rec_file);
231out_creds:
231 nfs4_reset_creds(original_cred); 232 nfs4_reset_creds(original_cred);
232} 233}
233 234
@@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
392} 393}
393 394
394static void 395static void
395nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) 396nfsd4_recdir_purge_old(struct nfsd_net *nn)
396{ 397{
397 int status; 398 int status;
398 399
@@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net)
479 return status; 480 return status;
480} 481}
481 482
483static void
484nfsd4_shutdown_recdir(struct net *net)
485{
486 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
487
488 if (!nn->rec_file)
489 return;
490 fput(nn->rec_file);
491 nn->rec_file = NULL;
492}
482 493
483static int 494static int
484nfs4_legacy_state_init(struct net *net) 495nfs4_legacy_state_init(struct net *net)
@@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net)
512 int status; 523 int status;
513 524
514 status = nfsd4_init_recdir(net); 525 status = nfsd4_init_recdir(net);
515 if (!status)
516 status = nfsd4_recdir_load(net);
517 if (status) 526 if (status)
518 printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); 527 return status;
528
529 status = nfsd4_recdir_load(net);
530 if (status)
531 nfsd4_shutdown_recdir(net);
532
519 return status; 533 return status;
520} 534}
521 535
@@ -546,21 +560,12 @@ err:
546} 560}
547 561
548static void 562static void
549nfsd4_shutdown_recdir(struct nfsd_net *nn)
550{
551 if (!nn->rec_file)
552 return;
553 fput(nn->rec_file);
554 nn->rec_file = NULL;
555}
556
557static void
558nfsd4_legacy_tracking_exit(struct net *net) 563nfsd4_legacy_tracking_exit(struct net *net)
559{ 564{
560 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 565 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
561 566
562 nfs4_release_reclaim(nn); 567 nfs4_release_reclaim(nn);
563 nfsd4_shutdown_recdir(nn); 568 nfsd4_shutdown_recdir(net);
564 nfs4_legacy_state_shutdown(net); 569 nfs4_legacy_state_shutdown(net);
565} 570}
566 571
@@ -670,7 +675,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg)
670 } 675 }
671 676
672 schedule(); 677 schedule();
673 set_current_state(TASK_RUNNING);
674 678
675 if (msg.errno < 0) 679 if (msg.errno < 0)
676 ret = msg.errno; 680 ret = msg.errno;
@@ -1016,7 +1020,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
1016} 1020}
1017 1021
1018static void 1022static void
1019nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) 1023nfsd4_cld_grace_done(struct nfsd_net *nn)
1020{ 1024{
1021 int ret; 1025 int ret;
1022 struct cld_upcall *cup; 1026 struct cld_upcall *cup;
@@ -1029,7 +1033,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time)
1029 } 1033 }
1030 1034
1031 cup->cu_msg.cm_cmd = Cld_GraceDone; 1035 cup->cu_msg.cm_cmd = Cld_GraceDone;
1032 cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; 1036 cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time;
1033 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); 1037 ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg);
1034 if (!ret) 1038 if (!ret)
1035 ret = cup->cu_msg.cm_status; 1039 ret = cup->cu_msg.cm_status;
@@ -1062,6 +1066,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable,
1062 1066
1063#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" 1067#define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR="
1064#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" 1068#define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR="
1069#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION="
1070#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START="
1065 1071
1066static char * 1072static char *
1067nfsd4_cltrack_legacy_topdir(void) 1073nfsd4_cltrack_legacy_topdir(void)
@@ -1126,10 +1132,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name)
1126 return result; 1132 return result;
1127} 1133}
1128 1134
1135static char *
1136nfsd4_cltrack_client_has_session(struct nfs4_client *clp)
1137{
1138 int copied;
1139 size_t len;
1140 char *result;
1141
1142 /* prefix + Y/N character + terminating NULL */
1143 len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1;
1144
1145 result = kmalloc(len, GFP_KERNEL);
1146 if (!result)
1147 return result;
1148
1149 copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c",
1150 clp->cl_minorversion ? 'Y' : 'N');
1151 if (copied >= len) {
1152 /* just return nothing if output was truncated */
1153 kfree(result);
1154 return NULL;
1155 }
1156
1157 return result;
1158}
1159
1160static char *
1161nfsd4_cltrack_grace_start(time_t grace_start)
1162{
1163 int copied;
1164 size_t len;
1165 char *result;
1166
1167 /* prefix + max width of int64_t string + terminating NULL */
1168 len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1;
1169
1170 result = kmalloc(len, GFP_KERNEL);
1171 if (!result)
1172 return result;
1173
1174 copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld",
1175 grace_start);
1176 if (copied >= len) {
1177 /* just return nothing if output was truncated */
1178 kfree(result);
1179 return NULL;
1180 }
1181
1182 return result;
1183}
1184
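The two new environment strings let the user-space nfsdcltrack helper distinguish v4.0 from v4.1+ clients and learn when the current grace period began. A hypothetical helper-side counterpart; the variable names are the ones exported above, the parsing below is purely illustrative:

	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		const char *grace = getenv("NFSDCLTRACK_GRACE_START");
		const char *sess  = getenv("NFSDCLTRACK_CLIENT_HAS_SESSION");

		long long start = grace ? atoll(grace) : 0;
		int has_session = sess && sess[0] == 'Y';

		printf("grace start %lld, minorversion>=1: %d\n",
		       start, has_session);
		return 0;
	}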
1129static int 1185static int
1130nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) 1186nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
1131{ 1187{
1132 char *envp[2]; 1188 char *envp[3];
1133 char *argv[4]; 1189 char *argv[4];
1134 int ret; 1190 int ret;
1135 1191
@@ -1140,10 +1196,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy)
1140 1196
1141 dprintk("%s: cmd: %s\n", __func__, cmd); 1197 dprintk("%s: cmd: %s\n", __func__, cmd);
1142 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); 1198 dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)");
1143 dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); 1199 dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)");
1200 dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)");
1144 1201
1145 envp[0] = legacy; 1202 envp[0] = env0;
1146 envp[1] = NULL; 1203 envp[1] = env1;
1204 envp[2] = NULL;
1147 1205
1148 argv[0] = (char *)cltrack_prog; 1206 argv[0] = (char *)cltrack_prog;
1149 argv[1] = cmd; 1207 argv[1] = cmd;
@@ -1187,28 +1245,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen)
1187} 1245}
1188 1246
1189static int 1247static int
1190nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) 1248nfsd4_umh_cltrack_init(struct net *net)
1191{ 1249{
1250 int ret;
1251 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1252 char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1253
	1192 /* XXX: The usermode helper is not working in containers yet. */ 1254 /* XXX: The usermode helper is not working in containers yet. */
1193 if (net != &init_net) { 1255 if (net != &init_net) {
1194 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " 1256 WARN(1, KERN_ERR "NFSD: attempt to initialize umh client "
1195 "tracking in a container!\n"); 1257 "tracking in a container!\n");
1196 return -EINVAL; 1258 return -EINVAL;
1197 } 1259 }
1198 return nfsd4_umh_cltrack_upcall("init", NULL, NULL); 1260
1261 ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
1262 kfree(grace_start);
1263 return ret;
1264}
1265
1266static void
1267nfsd4_cltrack_upcall_lock(struct nfs4_client *clp)
1268{
1269 wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK,
1270 TASK_UNINTERRUPTIBLE);
1271}
1272
1273static void
1274nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp)
1275{
1276 smp_mb__before_atomic();
1277 clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags);
1278 smp_mb__after_atomic();
1279 wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK);
1199} 1280}
1200 1281
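The upcall lock is an open-coded bit lock: the cl_flags word doubles as the waitqueue key, so a sleeper in wait_on_bit_lock() is woken by wake_up_bit() once the bit is cleared. A generic kernel-context sketch of the same idiom, with struct tracked and FLAG_UPCALL as illustrative names:

	#include <linux/wait.h>
	#include <linux/bitops.h>
	#include <linux/sched.h>

	#define FLAG_UPCALL	0

	struct tracked {
		unsigned long flags;
	};

	static void tracked_lock(struct tracked *t)
	{
		wait_on_bit_lock(&t->flags, FLAG_UPCALL, TASK_UNINTERRUPTIBLE);
	}

	static void tracked_unlock(struct tracked *t)
	{
		smp_mb__before_atomic();
		clear_bit(FLAG_UPCALL, &t->flags);
		smp_mb__after_atomic();	/* clear must be visible before waking */
		wake_up_bit(&t->flags, FLAG_UPCALL);
	}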
1201static void 1282static void
1202nfsd4_umh_cltrack_create(struct nfs4_client *clp) 1283nfsd4_umh_cltrack_create(struct nfs4_client *clp)
1203{ 1284{
1204 char *hexid; 1285 char *hexid, *has_session, *grace_start;
1286 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1287
1288 /*
1289 * With v4.0 clients, there's little difference in outcome between a
1290 * create and check operation, and we can end up calling into this
1291 * function multiple times per client (once for each openowner). So,
1292 * for v4.0 clients skip upcalling once the client has been recorded
1293 * on stable storage.
1294 *
1295 * For v4.1+ clients, the outcome of the two operations is different,
1296 * so we must ensure that we upcall for the create operation. v4.1+
1297 * clients call this on RECLAIM_COMPLETE though, so we should only end
1298 * up doing a single create upcall per client.
1299 */
1300 if (clp->cl_minorversion == 0 &&
1301 test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1302 return;
1205 1303
1206 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1304 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1207 if (!hexid) { 1305 if (!hexid) {
1208 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1306 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1209 return; 1307 return;
1210 } 1308 }
1211 nfsd4_umh_cltrack_upcall("create", hexid, NULL); 1309
1310 has_session = nfsd4_cltrack_client_has_session(clp);
1311 grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
1312
1313 nfsd4_cltrack_upcall_lock(clp);
1314 if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start))
1315 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1316 nfsd4_cltrack_upcall_unlock(clp);
1317
1318 kfree(has_session);
1319 kfree(grace_start);
1212 kfree(hexid); 1320 kfree(hexid);
1213} 1321}
1214 1322
@@ -1217,12 +1325,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp)
1217{ 1325{
1218 char *hexid; 1326 char *hexid;
1219 1327
1328 if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1329 return;
1330
1220 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1331 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1221 if (!hexid) { 1332 if (!hexid) {
1222 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1333 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1223 return; 1334 return;
1224 } 1335 }
1225 nfsd4_umh_cltrack_upcall("remove", hexid, NULL); 1336
1337 nfsd4_cltrack_upcall_lock(clp);
1338 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) &&
1339 nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0)
1340 clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1341 nfsd4_cltrack_upcall_unlock(clp);
1342
1226 kfree(hexid); 1343 kfree(hexid);
1227} 1344}
1228 1345
@@ -1230,30 +1347,45 @@ static int
1230nfsd4_umh_cltrack_check(struct nfs4_client *clp) 1347nfsd4_umh_cltrack_check(struct nfs4_client *clp)
1231{ 1348{
1232 int ret; 1349 int ret;
1233 char *hexid, *legacy; 1350 char *hexid, *has_session, *legacy;
1351
1352 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
1353 return 0;
1234 1354
1235 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); 1355 hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len);
1236 if (!hexid) { 1356 if (!hexid) {
1237 dprintk("%s: can't allocate memory for upcall!\n", __func__); 1357 dprintk("%s: can't allocate memory for upcall!\n", __func__);
1238 return -ENOMEM; 1358 return -ENOMEM;
1239 } 1359 }
1360
1361 has_session = nfsd4_cltrack_client_has_session(clp);
1240 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); 1362 legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name);
1241 ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); 1363
1364 nfsd4_cltrack_upcall_lock(clp);
1365 if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) {
1366 ret = 0;
1367 } else {
1368 ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy);
1369 if (ret == 0)
1370 set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
1371 }
1372 nfsd4_cltrack_upcall_unlock(clp);
1373 kfree(has_session);
1242 kfree(legacy); 1374 kfree(legacy);
1243 kfree(hexid); 1375 kfree(hexid);
1376
1244 return ret; 1377 return ret;
1245} 1378}
1246 1379
1247static void 1380static void
1248nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, 1381nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
1249 time_t boot_time)
1250{ 1382{
1251 char *legacy; 1383 char *legacy;
1252 char timestr[22]; /* FIXME: better way to determine max size? */ 1384 char timestr[22]; /* FIXME: better way to determine max size? */
1253 1385
1254 sprintf(timestr, "%ld", boot_time); 1386 sprintf(timestr, "%ld", nn->boot_time);
1255 legacy = nfsd4_cltrack_legacy_topdir(); 1387 legacy = nfsd4_cltrack_legacy_topdir();
1256 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); 1388 nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL);
1257 kfree(legacy); 1389 kfree(legacy);
1258} 1390}
1259 1391
@@ -1356,10 +1488,10 @@ nfsd4_client_record_check(struct nfs4_client *clp)
1356} 1488}
1357 1489
1358void 1490void
1359nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) 1491nfsd4_record_grace_done(struct nfsd_net *nn)
1360{ 1492{
1361 if (nn->client_tracking_ops) 1493 if (nn->client_tracking_ops)
1362 nn->client_tracking_ops->grace_done(nn, boot_time); 1494 nn->client_tracking_ops->grace_done(nn);
1363} 1495}
1364 1496
1365static int 1497static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2204e1fe5725..e9c3afe4b5d3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -70,13 +70,11 @@ static u64 current_sessionid = 1;
70#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t))) 70#define CURRENT_STATEID(stateid) (!memcmp((stateid), &currentstateid, sizeof(stateid_t)))
71 71
72/* forward declarations */ 72/* forward declarations */
73static int check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner); 73static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner);
74static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
74 75
75/* Locking: */ 76/* Locking: */
76 77
77/* Currently used for almost all code touching nfsv4 state: */
78static DEFINE_MUTEX(client_mutex);
79
80/* 78/*
81 * Currently used for the del_recall_lru and file hash table. In an 79 * Currently used for the del_recall_lru and file hash table. In an
82 * effort to decrease the scope of the client_mutex, this spinlock may 80 * effort to decrease the scope of the client_mutex, this spinlock may
@@ -84,31 +82,27 @@ static DEFINE_MUTEX(client_mutex);
84 */ 82 */
85static DEFINE_SPINLOCK(state_lock); 83static DEFINE_SPINLOCK(state_lock);
86 84
85/*
86 * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for
87 * the refcount on the open stateid to drop.
88 */
89static DECLARE_WAIT_QUEUE_HEAD(close_wq);
90
87static struct kmem_cache *openowner_slab; 91static struct kmem_cache *openowner_slab;
88static struct kmem_cache *lockowner_slab; 92static struct kmem_cache *lockowner_slab;
89static struct kmem_cache *file_slab; 93static struct kmem_cache *file_slab;
90static struct kmem_cache *stateid_slab; 94static struct kmem_cache *stateid_slab;
91static struct kmem_cache *deleg_slab; 95static struct kmem_cache *deleg_slab;
92 96
93void
94nfs4_lock_state(void)
95{
96 mutex_lock(&client_mutex);
97}
98
99static void free_session(struct nfsd4_session *); 97static void free_session(struct nfsd4_session *);
100 98
99static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
100
101static bool is_session_dead(struct nfsd4_session *ses) 101static bool is_session_dead(struct nfsd4_session *ses)
102{ 102{
103 return ses->se_flags & NFS4_SESSION_DEAD; 103 return ses->se_flags & NFS4_SESSION_DEAD;
104} 104}
105 105
106void nfsd4_put_session(struct nfsd4_session *ses)
107{
108 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
109 free_session(ses);
110}
111
112static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me) 106static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_by_me)
113{ 107{
114 if (atomic_read(&ses->se_ref) > ref_held_by_me) 108 if (atomic_read(&ses->se_ref) > ref_held_by_me)
@@ -117,46 +111,17 @@ static __be32 mark_session_dead_locked(struct nfsd4_session *ses, int ref_held_b
117 return nfs_ok; 111 return nfs_ok;
118} 112}
119 113
120static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
121{
122 if (is_session_dead(ses))
123 return nfserr_badsession;
124 atomic_inc(&ses->se_ref);
125 return nfs_ok;
126}
127
128void
129nfs4_unlock_state(void)
130{
131 mutex_unlock(&client_mutex);
132}
133
134static bool is_client_expired(struct nfs4_client *clp) 114static bool is_client_expired(struct nfs4_client *clp)
135{ 115{
136 return clp->cl_time == 0; 116 return clp->cl_time == 0;
137} 117}
138 118
139static __be32 mark_client_expired_locked(struct nfs4_client *clp) 119static __be32 get_client_locked(struct nfs4_client *clp)
140{
141 if (atomic_read(&clp->cl_refcount))
142 return nfserr_jukebox;
143 clp->cl_time = 0;
144 return nfs_ok;
145}
146
147static __be32 mark_client_expired(struct nfs4_client *clp)
148{ 120{
149 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 121 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
150 __be32 ret;
151 122
152 spin_lock(&nn->client_lock); 123 lockdep_assert_held(&nn->client_lock);
153 ret = mark_client_expired_locked(clp);
154 spin_unlock(&nn->client_lock);
155 return ret;
156}
157 124
158static __be32 get_client_locked(struct nfs4_client *clp)
159{
160 if (is_client_expired(clp)) 125 if (is_client_expired(clp))
161 return nfserr_expired; 126 return nfserr_expired;
162 atomic_inc(&clp->cl_refcount); 127 atomic_inc(&clp->cl_refcount);
@@ -197,13 +162,17 @@ renew_client(struct nfs4_client *clp)
197 162
198static void put_client_renew_locked(struct nfs4_client *clp) 163static void put_client_renew_locked(struct nfs4_client *clp)
199{ 164{
165 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
166
167 lockdep_assert_held(&nn->client_lock);
168
200 if (!atomic_dec_and_test(&clp->cl_refcount)) 169 if (!atomic_dec_and_test(&clp->cl_refcount))
201 return; 170 return;
202 if (!is_client_expired(clp)) 171 if (!is_client_expired(clp))
203 renew_client_locked(clp); 172 renew_client_locked(clp);
204} 173}
205 174
206void put_client_renew(struct nfs4_client *clp) 175static void put_client_renew(struct nfs4_client *clp)
207{ 176{
208 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 177 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
209 178
@@ -214,6 +183,84 @@ void put_client_renew(struct nfs4_client *clp)
214 spin_unlock(&nn->client_lock); 183 spin_unlock(&nn->client_lock);
215} 184}
216 185
186static __be32 nfsd4_get_session_locked(struct nfsd4_session *ses)
187{
188 __be32 status;
189
190 if (is_session_dead(ses))
191 return nfserr_badsession;
192 status = get_client_locked(ses->se_client);
193 if (status)
194 return status;
195 atomic_inc(&ses->se_ref);
196 return nfs_ok;
197}
198
199static void nfsd4_put_session_locked(struct nfsd4_session *ses)
200{
201 struct nfs4_client *clp = ses->se_client;
202 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
203
204 lockdep_assert_held(&nn->client_lock);
205
206 if (atomic_dec_and_test(&ses->se_ref) && is_session_dead(ses))
207 free_session(ses);
208 put_client_renew_locked(clp);
209}
210
211static void nfsd4_put_session(struct nfsd4_session *ses)
212{
213 struct nfs4_client *clp = ses->se_client;
214 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
215
216 spin_lock(&nn->client_lock);
217 nfsd4_put_session_locked(ses);
218 spin_unlock(&nn->client_lock);
219}
220
221static inline struct nfs4_stateowner *
222nfs4_get_stateowner(struct nfs4_stateowner *sop)
223{
224 atomic_inc(&sop->so_count);
225 return sop;
226}
227
228static int
229same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner)
230{
231 return (sop->so_owner.len == owner->len) &&
232 0 == memcmp(sop->so_owner.data, owner->data, owner->len);
233}
234
235static struct nfs4_openowner *
236find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open,
237 struct nfs4_client *clp)
238{
239 struct nfs4_stateowner *so;
240
241 lockdep_assert_held(&clp->cl_lock);
242
243 list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[hashval],
244 so_strhash) {
245 if (!so->so_is_open_owner)
246 continue;
247 if (same_owner_str(so, &open->op_owner))
248 return openowner(nfs4_get_stateowner(so));
249 }
250 return NULL;
251}
252
253static struct nfs4_openowner *
254find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
255 struct nfs4_client *clp)
256{
257 struct nfs4_openowner *oo;
258
259 spin_lock(&clp->cl_lock);
260 oo = find_openstateowner_str_locked(hashval, open, clp);
261 spin_unlock(&clp->cl_lock);
262 return oo;
263}
217 264
218static inline u32 265static inline u32
219opaque_hashval(const void *ptr, int nbytes) 266opaque_hashval(const void *ptr, int nbytes)
@@ -236,10 +283,11 @@ static void nfsd4_free_file(struct nfs4_file *f)
236static inline void 283static inline void
237put_nfs4_file(struct nfs4_file *fi) 284put_nfs4_file(struct nfs4_file *fi)
238{ 285{
286 might_lock(&state_lock);
287
239 if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { 288 if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) {
240 hlist_del(&fi->fi_hash); 289 hlist_del(&fi->fi_hash);
241 spin_unlock(&state_lock); 290 spin_unlock(&state_lock);
242 iput(fi->fi_inode);
243 nfsd4_free_file(fi); 291 nfsd4_free_file(fi);
244 } 292 }
245} 293}
@@ -250,7 +298,80 @@ get_nfs4_file(struct nfs4_file *fi)
250 atomic_inc(&fi->fi_ref); 298 atomic_inc(&fi->fi_ref);
251} 299}
252 300
253static int num_delegations; 301static struct file *
302__nfs4_get_fd(struct nfs4_file *f, int oflag)
303{
304 if (f->fi_fds[oflag])
305 return get_file(f->fi_fds[oflag]);
306 return NULL;
307}
308
309static struct file *
310find_writeable_file_locked(struct nfs4_file *f)
311{
312 struct file *ret;
313
314 lockdep_assert_held(&f->fi_lock);
315
316 ret = __nfs4_get_fd(f, O_WRONLY);
317 if (!ret)
318 ret = __nfs4_get_fd(f, O_RDWR);
319 return ret;
320}
321
322static struct file *
323find_writeable_file(struct nfs4_file *f)
324{
325 struct file *ret;
326
327 spin_lock(&f->fi_lock);
328 ret = find_writeable_file_locked(f);
329 spin_unlock(&f->fi_lock);
330
331 return ret;
332}
333
334static struct file *find_readable_file_locked(struct nfs4_file *f)
335{
336 struct file *ret;
337
338 lockdep_assert_held(&f->fi_lock);
339
340 ret = __nfs4_get_fd(f, O_RDONLY);
341 if (!ret)
342 ret = __nfs4_get_fd(f, O_RDWR);
343 return ret;
344}
345
346static struct file *
347find_readable_file(struct nfs4_file *f)
348{
349 struct file *ret;
350
351 spin_lock(&f->fi_lock);
352 ret = find_readable_file_locked(f);
353 spin_unlock(&f->fi_lock);
354
355 return ret;
356}
357
358static struct file *
359find_any_file(struct nfs4_file *f)
360{
361 struct file *ret;
362
363 spin_lock(&f->fi_lock);
364 ret = __nfs4_get_fd(f, O_RDWR);
365 if (!ret) {
366 ret = __nfs4_get_fd(f, O_WRONLY);
367 if (!ret)
368 ret = __nfs4_get_fd(f, O_RDONLY);
369 }
370 spin_unlock(&f->fi_lock);
371 return ret;
372}
373
374static atomic_long_t num_delegations;
254unsigned long max_delegations; 375unsigned long max_delegations;
255 376
256/* 377/*
@@ -262,12 +383,11 @@ unsigned long max_delegations;
262#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) 383#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS)
263#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) 384#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1)
264 385
265static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername) 386static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
266{ 387{
267 unsigned int ret; 388 unsigned int ret;
268 389
269 ret = opaque_hashval(ownername->data, ownername->len); 390 ret = opaque_hashval(ownername->data, ownername->len);
270 ret += clientid;
271 return ret & OWNER_HASH_MASK; 391 return ret & OWNER_HASH_MASK;
272} 392}
273 393
@@ -275,75 +395,124 @@ static unsigned int ownerstr_hashval(u32 clientid, struct xdr_netobj *ownername)
275#define FILE_HASH_BITS 8 395#define FILE_HASH_BITS 8
276#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 396#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
277 397
278static unsigned int file_hashval(struct inode *ino) 398static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
399{
400 return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
401}
402
403static unsigned int file_hashval(struct knfsd_fh *fh)
404{
405 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
406}
407
408static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
279{ 409{
280 /* XXX: why are we hashing on inode pointer, anyway? */ 410 return fh1->fh_size == fh2->fh_size &&
281 return hash_ptr(ino, FILE_HASH_BITS); 411 !memcmp(fh1->fh_base.fh_pad,
412 fh2->fh_base.fh_pad,
413 fh1->fh_size);
282} 414}
283 415
284static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 416static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
285 417
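Hashing now keys on the filehandle contents via jhash2() rather than the inode pointer, which also makes nfsd_fh_match() the natural lookup comparator. A reduced sketch of the bucket computation; struct fake_fh is an illustrative stand-in for knfsd_fh:

	#include <linux/jhash.h>

	struct fake_fh {
		u32 size;	/* bytes used in data[] */
		u32 data[16];
	};

	static unsigned int fh_bucket(const struct fake_fh *fh,
				      unsigned int nbuckets)
	{
		/* round bytes up to 32-bit words, XDR_QUADLEN-style */
		unsigned int words = (fh->size + 3) >> 2;

		return jhash2(fh->data, words, 0) & (nbuckets - 1);
	}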
286static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) 418static void
419__nfs4_file_get_access(struct nfs4_file *fp, u32 access)
287{ 420{
288 WARN_ON_ONCE(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); 421 lockdep_assert_held(&fp->fi_lock);
289 atomic_inc(&fp->fi_access[oflag]); 422
423 if (access & NFS4_SHARE_ACCESS_WRITE)
424 atomic_inc(&fp->fi_access[O_WRONLY]);
425 if (access & NFS4_SHARE_ACCESS_READ)
426 atomic_inc(&fp->fi_access[O_RDONLY]);
290} 427}
291 428
292static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) 429static __be32
430nfs4_file_get_access(struct nfs4_file *fp, u32 access)
293{ 431{
294 if (oflag == O_RDWR) { 432 lockdep_assert_held(&fp->fi_lock);
295 __nfs4_file_get_access(fp, O_RDONLY); 433
296 __nfs4_file_get_access(fp, O_WRONLY); 434 /* Does this access mode make sense? */
297 } else 435 if (access & ~NFS4_SHARE_ACCESS_BOTH)
298 __nfs4_file_get_access(fp, oflag); 436 return nfserr_inval;
437
438 /* Does it conflict with a deny mode already set? */
439 if ((access & fp->fi_share_deny) != 0)
440 return nfserr_share_denied;
441
442 __nfs4_file_get_access(fp, access);
443 return nfs_ok;
299} 444}
300 445
301static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) 446static __be32 nfs4_file_check_deny(struct nfs4_file *fp, u32 deny)
302{ 447{
303 if (fp->fi_fds[oflag]) { 448 /* Common case is that there is no deny mode. */
304 fput(fp->fi_fds[oflag]); 449 if (deny) {
305 fp->fi_fds[oflag] = NULL; 450 /* Does this deny mode make sense? */
451 if (deny & ~NFS4_SHARE_DENY_BOTH)
452 return nfserr_inval;
453
454 if ((deny & NFS4_SHARE_DENY_READ) &&
455 atomic_read(&fp->fi_access[O_RDONLY]))
456 return nfserr_share_denied;
457
458 if ((deny & NFS4_SHARE_DENY_WRITE) &&
459 atomic_read(&fp->fi_access[O_WRONLY]))
460 return nfserr_share_denied;
306 } 461 }
462 return nfs_ok;
307} 463}
308 464
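nfs4_file_get_access() and nfs4_file_check_deny() form the two halves of share-reservation enforcement: a requested access must not intersect the accumulated deny mode, and a requested deny must not hit an already granted access. A simplified single-counter sketch of that pairing (the kernel tracks per-O_RDONLY/O_WRONLY counters instead):

	#define SHARE_READ	1	/* mirrors NFS4_SHARE_* values */
	#define SHARE_WRITE	2
	#define SHARE_BOTH	3

	struct file_state {
		unsigned int granted_access;	/* union of granted access modes */
		unsigned int deny_mode;		/* union of granted deny modes */
	};

	static int may_open(struct file_state *fs, unsigned int access,
			    unsigned int deny)
	{
		if ((access & ~SHARE_BOTH) || (deny & ~SHARE_BOTH))
			return -1;		/* nonsense bits */
		if (access & fs->deny_mode)
			return -1;		/* denied by an earlier open */
		if (deny & fs->granted_access)
			return -1;		/* would deny an earlier open */
		fs->granted_access |= access;
		fs->deny_mode |= deny;
		return 0;
	}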
309static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) 465static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag)
310{ 466{
311 if (atomic_dec_and_test(&fp->fi_access[oflag])) { 467 might_lock(&fp->fi_lock);
312 nfs4_file_put_fd(fp, oflag); 468
469 if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) {
470 struct file *f1 = NULL;
471 struct file *f2 = NULL;
472
473 swap(f1, fp->fi_fds[oflag]);
313 if (atomic_read(&fp->fi_access[1 - oflag]) == 0) 474 if (atomic_read(&fp->fi_access[1 - oflag]) == 0)
314 nfs4_file_put_fd(fp, O_RDWR); 475 swap(f2, fp->fi_fds[O_RDWR]);
476 spin_unlock(&fp->fi_lock);
477 if (f1)
478 fput(f1);
479 if (f2)
480 fput(f2);
315 } 481 }
316} 482}
317 483
318static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) 484static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
319{ 485{
320 if (oflag == O_RDWR) { 486 WARN_ON_ONCE(access & ~NFS4_SHARE_ACCESS_BOTH);
321 __nfs4_file_put_access(fp, O_RDONLY); 487
488 if (access & NFS4_SHARE_ACCESS_WRITE)
322 __nfs4_file_put_access(fp, O_WRONLY); 489 __nfs4_file_put_access(fp, O_WRONLY);
323 } else 490 if (access & NFS4_SHARE_ACCESS_READ)
324 __nfs4_file_put_access(fp, oflag); 491 __nfs4_file_put_access(fp, O_RDONLY);
325} 492}
326 493
327static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct 494static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
328kmem_cache *slab) 495 struct kmem_cache *slab)
329{ 496{
330 struct idr *stateids = &cl->cl_stateids;
331 struct nfs4_stid *stid; 497 struct nfs4_stid *stid;
332 int new_id; 498 int new_id;
333 499
334 stid = kmem_cache_alloc(slab, GFP_KERNEL); 500 stid = kmem_cache_zalloc(slab, GFP_KERNEL);
335 if (!stid) 501 if (!stid)
336 return NULL; 502 return NULL;
337 503
338 new_id = idr_alloc_cyclic(stateids, stid, 0, 0, GFP_KERNEL); 504 idr_preload(GFP_KERNEL);
505 spin_lock(&cl->cl_lock);
506 new_id = idr_alloc_cyclic(&cl->cl_stateids, stid, 0, 0, GFP_NOWAIT);
507 spin_unlock(&cl->cl_lock);
508 idr_preload_end();
339 if (new_id < 0) 509 if (new_id < 0)
340 goto out_free; 510 goto out_free;
341 stid->sc_client = cl; 511 stid->sc_client = cl;
342 stid->sc_type = 0;
343 stid->sc_stateid.si_opaque.so_id = new_id; 512 stid->sc_stateid.si_opaque.so_id = new_id;
344 stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; 513 stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
345 /* Will be incremented before return to client: */ 514 /* Will be incremented before return to client: */
346 stid->sc_stateid.si_generation = 0; 515 atomic_set(&stid->sc_count, 1);
347 516
348 /* 517 /*
349 * It shouldn't be a problem to reuse an opaque stateid value. 518 * It shouldn't be a problem to reuse an opaque stateid value.
@@ -360,9 +529,24 @@ out_free:
360 return NULL; 529 return NULL;
361} 530}
362 531
363static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp) 532static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
364{ 533{
365 return openlockstateid(nfs4_alloc_stid(clp, stateid_slab)); 534 struct nfs4_stid *stid;
535 struct nfs4_ol_stateid *stp;
536
537 stid = nfs4_alloc_stid(clp, stateid_slab);
538 if (!stid)
539 return NULL;
540
541 stp = openlockstateid(stid);
542 stp->st_stid.sc_free = nfs4_free_ol_stateid;
543 return stp;
544}
545
546static void nfs4_free_deleg(struct nfs4_stid *stid)
547{
548 kmem_cache_free(deleg_slab, stid);
549 atomic_long_dec(&num_delegations);
366} 550}
367 551
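Stateid allocation now publishes into the per-client IDR while holding cl_lock, using idr_preload() so the GFP_NOWAIT insert rarely fails under the spinlock. The idiom in isolation, with publish_obj() as an illustrative name:

	#include <linux/idr.h>
	#include <linux/spinlock.h>

	static int publish_obj(struct idr *ids, spinlock_t *lock, void *obj)
	{
		int id;

		idr_preload(GFP_KERNEL);	/* may sleep; fills per-cpu cache */
		spin_lock(lock);
		id = idr_alloc_cyclic(ids, obj, 0, 0, GFP_NOWAIT);
		spin_unlock(lock);
		idr_preload_end();

		return id;	/* negative errno-style value on failure */
	}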
368/* 552/*
@@ -379,10 +563,11 @@ static struct nfs4_ol_stateid * nfs4_alloc_stateid(struct nfs4_client *clp)
	379 * Each filter is 256 bits. We hash the filehandle to 32 bits and use the 563
380 * low 3 bytes as hash-table indices. 564 * low 3 bytes as hash-table indices.
381 * 565 *
382 * 'state_lock', which is always held when block_delegations() is called, 566 * 'blocked_delegations_lock', which is always taken in block_delegations(),
383 * is used to manage concurrent access. Testing does not need the lock 567 * is used to manage concurrent access. Testing does not need the lock
384 * except when swapping the two filters. 568 * except when swapping the two filters.
385 */ 569 */
570static DEFINE_SPINLOCK(blocked_delegations_lock);
386static struct bloom_pair { 571static struct bloom_pair {
387 int entries, old_entries; 572 int entries, old_entries;
388 time_t swap_time; 573 time_t swap_time;
@@ -398,7 +583,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
398 if (bd->entries == 0) 583 if (bd->entries == 0)
399 return 0; 584 return 0;
400 if (seconds_since_boot() - bd->swap_time > 30) { 585 if (seconds_since_boot() - bd->swap_time > 30) {
401 spin_lock(&state_lock); 586 spin_lock(&blocked_delegations_lock);
402 if (seconds_since_boot() - bd->swap_time > 30) { 587 if (seconds_since_boot() - bd->swap_time > 30) {
403 bd->entries -= bd->old_entries; 588 bd->entries -= bd->old_entries;
404 bd->old_entries = bd->entries; 589 bd->old_entries = bd->entries;
@@ -407,7 +592,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
407 bd->new = 1-bd->new; 592 bd->new = 1-bd->new;
408 bd->swap_time = seconds_since_boot(); 593 bd->swap_time = seconds_since_boot();
409 } 594 }
410 spin_unlock(&state_lock); 595 spin_unlock(&blocked_delegations_lock);
411 } 596 }
412 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); 597 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
413 if (test_bit(hash&255, bd->set[0]) && 598 if (test_bit(hash&255, bd->set[0]) &&
@@ -430,76 +615,83 @@ static void block_delegations(struct knfsd_fh *fh)
430 615
431 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0); 616 hash = arch_fast_hash(&fh->fh_base, fh->fh_size, 0);
432 617
618 spin_lock(&blocked_delegations_lock);
433 __set_bit(hash&255, bd->set[bd->new]); 619 __set_bit(hash&255, bd->set[bd->new]);
434 __set_bit((hash>>8)&255, bd->set[bd->new]); 620 __set_bit((hash>>8)&255, bd->set[bd->new]);
435 __set_bit((hash>>16)&255, bd->set[bd->new]); 621 __set_bit((hash>>16)&255, bd->set[bd->new]);
436 if (bd->entries == 0) 622 if (bd->entries == 0)
437 bd->swap_time = seconds_since_boot(); 623 bd->swap_time = seconds_since_boot();
438 bd->entries += 1; 624 bd->entries += 1;
625 spin_unlock(&blocked_delegations_lock);
439} 626}
440 627
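The recently-recalled filter is a pair of 256-bit bloom sets: inserts go to the "new" set, lookups consult both, and roughly every 30 seconds the roles swap and the stale set is cleared, now serialized by the dedicated blocked_delegations_lock. A toy user-space model of the aging scheme, with sizes and locking stripped down for illustration:

	#include <string.h>
	#include <time.h>

	static unsigned char set[2][256 / 8];
	static int new_idx;
	static time_t swap_time;

	static void bit_set(unsigned char *f, unsigned int h)
	{
		h &= 255;
		f[h / 8] |= 1u << (h % 8);
	}

	static int bit_test(const unsigned char *f, unsigned int h)
	{
		h &= 255;
		return f[h / 8] & (1u << (h % 8));
	}

	static void maybe_swap(void)
	{
		if (time(NULL) - swap_time > 30) {
			new_idx = 1 - new_idx;	/* age out the old generation */
			memset(set[new_idx], 0, sizeof(set[new_idx]));
			swap_time = time(NULL);
		}
	}

	static void block_fh(unsigned int hash)
	{
		maybe_swap();
		bit_set(set[new_idx], hash);
		bit_set(set[new_idx], hash >> 8);
		bit_set(set[new_idx], hash >> 16);
	}

	static int fh_blocked(unsigned int hash)
	{
		int i;

		for (i = 0; i < 2; i++)	/* all three bits in either set */
			if (bit_test(set[i], hash) &&
			    bit_test(set[i], hash >> 8) &&
			    bit_test(set[i], hash >> 16))
				return 1;
		return 0;
	}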
441static struct nfs4_delegation * 628static struct nfs4_delegation *
442alloc_init_deleg(struct nfs4_client *clp, struct nfs4_ol_stateid *stp, struct svc_fh *current_fh) 629alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh)
443{ 630{
444 struct nfs4_delegation *dp; 631 struct nfs4_delegation *dp;
632 long n;
445 633
446 dprintk("NFSD alloc_init_deleg\n"); 634 dprintk("NFSD alloc_init_deleg\n");
447 if (num_delegations > max_delegations) 635 n = atomic_long_inc_return(&num_delegations);
448 return NULL; 636 if (n < 0 || n > max_delegations)
637 goto out_dec;
449 if (delegation_blocked(&current_fh->fh_handle)) 638 if (delegation_blocked(&current_fh->fh_handle))
450 return NULL; 639 goto out_dec;
451 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); 640 dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab));
452 if (dp == NULL) 641 if (dp == NULL)
453 return dp; 642 goto out_dec;
643
644 dp->dl_stid.sc_free = nfs4_free_deleg;
454 /* 645 /*
455 * delegation seqid's are never incremented. The 4.1 special 646 * delegation seqid's are never incremented. The 4.1 special
456 * meaning of seqid 0 isn't meaningful, really, but let's avoid 647 * meaning of seqid 0 isn't meaningful, really, but let's avoid
457 * 0 anyway just for consistency and use 1: 648 * 0 anyway just for consistency and use 1:
458 */ 649 */
459 dp->dl_stid.sc_stateid.si_generation = 1; 650 dp->dl_stid.sc_stateid.si_generation = 1;
460 num_delegations++;
461 INIT_LIST_HEAD(&dp->dl_perfile); 651 INIT_LIST_HEAD(&dp->dl_perfile);
462 INIT_LIST_HEAD(&dp->dl_perclnt); 652 INIT_LIST_HEAD(&dp->dl_perclnt);
463 INIT_LIST_HEAD(&dp->dl_recall_lru); 653 INIT_LIST_HEAD(&dp->dl_recall_lru);
464 dp->dl_file = NULL;
465 dp->dl_type = NFS4_OPEN_DELEGATE_READ; 654 dp->dl_type = NFS4_OPEN_DELEGATE_READ;
466 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 655 dp->dl_retries = 1;
467 dp->dl_time = 0; 656 nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
468 atomic_set(&dp->dl_count, 1); 657 &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
469 nfsd4_init_callback(&dp->dl_recall);
470 return dp; 658 return dp;
659out_dec:
660 atomic_long_dec(&num_delegations);
661 return NULL;
471} 662}
472 663
473static void remove_stid(struct nfs4_stid *s) 664void
665nfs4_put_stid(struct nfs4_stid *s)
474{ 666{
475 struct idr *stateids = &s->sc_client->cl_stateids; 667 struct nfs4_file *fp = s->sc_file;
668 struct nfs4_client *clp = s->sc_client;
476 669
477 idr_remove(stateids, s->sc_stateid.si_opaque.so_id); 670 might_lock(&clp->cl_lock);
478}
479 671
480static void nfs4_free_stid(struct kmem_cache *slab, struct nfs4_stid *s) 672 if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) {
481{ 673 wake_up_all(&close_wq);
482 kmem_cache_free(slab, s); 674 return;
483}
484
485void
486nfs4_put_delegation(struct nfs4_delegation *dp)
487{
488 if (atomic_dec_and_test(&dp->dl_count)) {
489 nfs4_free_stid(deleg_slab, &dp->dl_stid);
490 num_delegations--;
491 } 675 }
676 idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
677 spin_unlock(&clp->cl_lock);
678 s->sc_free(s);
679 if (fp)
680 put_nfs4_file(fp);
492} 681}
493 682
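nfs4_put_stid() consolidates stateid teardown around atomic_dec_and_lock(): the common put stays lock-free, and only the final reference takes cl_lock to unpublish the ID before freeing. The put-side idiom in a generic sketch; struct obj is illustrative:

	#include <linux/spinlock.h>
	#include <linux/atomic.h>
	#include <linux/idr.h>

	struct obj {
		atomic_t count;
		int id;
		void (*free)(struct obj *);
	};

	static void obj_put(struct obj *o, spinlock_t *lock, struct idr *table)
	{
		/* only the 1->0 transition takes the lock */
		if (!atomic_dec_and_lock(&o->count, lock))
			return;			/* somebody still holds a ref */
		idr_remove(table, o->id);	/* unpublish while locked */
		spin_unlock(lock);
		o->free(o);			/* free after dropping the lock */
	}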
494static void nfs4_put_deleg_lease(struct nfs4_file *fp) 683static void nfs4_put_deleg_lease(struct nfs4_file *fp)
495{ 684{
496 if (!fp->fi_lease) 685 struct file *filp = NULL;
497 return; 686
498 if (atomic_dec_and_test(&fp->fi_delegees)) { 687 spin_lock(&fp->fi_lock);
499 vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); 688 if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
500 fp->fi_lease = NULL; 689 swap(filp, fp->fi_deleg_file);
501 fput(fp->fi_deleg_file); 690 spin_unlock(&fp->fi_lock);
502 fp->fi_deleg_file = NULL; 691
692 if (filp) {
693 vfs_setlease(filp, F_UNLCK, NULL, NULL);
694 fput(filp);
503 } 695 }
504} 696}
505 697
@@ -512,54 +704,55 @@ static void
512hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) 704hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
513{ 705{
514 lockdep_assert_held(&state_lock); 706 lockdep_assert_held(&state_lock);
707 lockdep_assert_held(&fp->fi_lock);
515 708
709 atomic_inc(&dp->dl_stid.sc_count);
516 dp->dl_stid.sc_type = NFS4_DELEG_STID; 710 dp->dl_stid.sc_type = NFS4_DELEG_STID;
517 list_add(&dp->dl_perfile, &fp->fi_delegations); 711 list_add(&dp->dl_perfile, &fp->fi_delegations);
518 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); 712 list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
519} 713}
520 714
521/* Called under the state lock. */
522static void 715static void
523unhash_delegation(struct nfs4_delegation *dp) 716unhash_delegation_locked(struct nfs4_delegation *dp)
524{ 717{
525 spin_lock(&state_lock); 718 struct nfs4_file *fp = dp->dl_stid.sc_file;
526 list_del_init(&dp->dl_perclnt);
527 list_del_init(&dp->dl_perfile);
528 list_del_init(&dp->dl_recall_lru);
529 spin_unlock(&state_lock);
530 if (dp->dl_file) {
531 nfs4_put_deleg_lease(dp->dl_file);
532 put_nfs4_file(dp->dl_file);
533 dp->dl_file = NULL;
534 }
535}
536
537 719
720 lockdep_assert_held(&state_lock);
538 721
539static void destroy_revoked_delegation(struct nfs4_delegation *dp) 722 dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
540{ 723 /* Ensure that deleg break won't try to requeue it */
724 ++dp->dl_time;
725 spin_lock(&fp->fi_lock);
726 list_del_init(&dp->dl_perclnt);
541 list_del_init(&dp->dl_recall_lru); 727 list_del_init(&dp->dl_recall_lru);
542 remove_stid(&dp->dl_stid); 728 list_del_init(&dp->dl_perfile);
543 nfs4_put_delegation(dp); 729 spin_unlock(&fp->fi_lock);
544} 730}
545 731
546static void destroy_delegation(struct nfs4_delegation *dp) 732static void destroy_delegation(struct nfs4_delegation *dp)
547{ 733{
548 unhash_delegation(dp); 734 spin_lock(&state_lock);
549 remove_stid(&dp->dl_stid); 735 unhash_delegation_locked(dp);
550 nfs4_put_delegation(dp); 736 spin_unlock(&state_lock);
737 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
738 nfs4_put_stid(&dp->dl_stid);
551} 739}
552 740
553static void revoke_delegation(struct nfs4_delegation *dp) 741static void revoke_delegation(struct nfs4_delegation *dp)
554{ 742{
555 struct nfs4_client *clp = dp->dl_stid.sc_client; 743 struct nfs4_client *clp = dp->dl_stid.sc_client;
556 744
745 WARN_ON(!list_empty(&dp->dl_recall_lru));
746
747 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
748
557 if (clp->cl_minorversion == 0) 749 if (clp->cl_minorversion == 0)
558 destroy_delegation(dp); 750 nfs4_put_stid(&dp->dl_stid);
559 else { 751 else {
560 unhash_delegation(dp);
561 dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; 752 dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
753 spin_lock(&clp->cl_lock);
562 list_add(&dp->dl_recall_lru, &clp->cl_revoked); 754 list_add(&dp->dl_recall_lru, &clp->cl_revoked);
755 spin_unlock(&clp->cl_lock);
563 } 756 }
564} 757}
565 758
@@ -607,57 +800,62 @@ bmap_to_share_mode(unsigned long bmap) {
607 return access; 800 return access;
608} 801}
609 802
610static bool
611test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
612 unsigned int access, deny;
613
614 access = bmap_to_share_mode(stp->st_access_bmap);
615 deny = bmap_to_share_mode(stp->st_deny_bmap);
616 if ((access & open->op_share_deny) || (deny & open->op_share_access))
617 return false;
618 return true;
619}
620
621/* set share access for a given stateid */ 803/* set share access for a given stateid */
622static inline void 804static inline void
623set_access(u32 access, struct nfs4_ol_stateid *stp) 805set_access(u32 access, struct nfs4_ol_stateid *stp)
624{ 806{
625 __set_bit(access, &stp->st_access_bmap); 807 unsigned char mask = 1 << access;
808
809 WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
810 stp->st_access_bmap |= mask;
626} 811}
627 812
628/* clear share access for a given stateid */ 813/* clear share access for a given stateid */
629static inline void 814static inline void
630clear_access(u32 access, struct nfs4_ol_stateid *stp) 815clear_access(u32 access, struct nfs4_ol_stateid *stp)
631{ 816{
632 __clear_bit(access, &stp->st_access_bmap); 817 unsigned char mask = 1 << access;
818
819 WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
820 stp->st_access_bmap &= ~mask;
633} 821}
634 822
635/* test whether a given stateid has access */ 823/* test whether a given stateid has access */
636static inline bool 824static inline bool
637test_access(u32 access, struct nfs4_ol_stateid *stp) 825test_access(u32 access, struct nfs4_ol_stateid *stp)
638{ 826{
639 return test_bit(access, &stp->st_access_bmap); 827 unsigned char mask = 1 << access;
828
829 return (bool)(stp->st_access_bmap & mask);
640} 830}
641 831
642/* set share deny for a given stateid */ 832/* set share deny for a given stateid */
643static inline void 833static inline void
644set_deny(u32 access, struct nfs4_ol_stateid *stp) 834set_deny(u32 deny, struct nfs4_ol_stateid *stp)
645{ 835{
646 __set_bit(access, &stp->st_deny_bmap); 836 unsigned char mask = 1 << deny;
837
838 WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
839 stp->st_deny_bmap |= mask;
647} 840}
648 841
649/* clear share deny for a given stateid */ 842/* clear share deny for a given stateid */
650static inline void 843static inline void
651clear_deny(u32 access, struct nfs4_ol_stateid *stp) 844clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
652{ 845{
653 __clear_bit(access, &stp->st_deny_bmap); 846 unsigned char mask = 1 << deny;
847
848 WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
849 stp->st_deny_bmap &= ~mask;
654} 850}
655 851
656/* test whether a given stateid is denying specific access */ 852/* test whether a given stateid is denying specific access */
657static inline bool 853static inline bool
658test_deny(u32 access, struct nfs4_ol_stateid *stp) 854test_deny(u32 deny, struct nfs4_ol_stateid *stp)
659{ 855{
660 return test_bit(access, &stp->st_deny_bmap); 856 unsigned char mask = 1 << deny;
857
858 return (bool)(stp->st_deny_bmap & mask);
661} 859}
662 860
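The bitmap helpers now build the mask by shifting the share value itself, so bits 1..3 correspond to READ (1), WRITE (2) and BOTH (3), and bmap_to_share_mode() folds those bit positions back together by ORing the indices. A tiny worked example of the round-trip:

	#include <stdio.h>

	/* Fold set bit positions 1..3 back into a share mode. */
	static unsigned int fold(unsigned char bmap)
	{
		unsigned int i, mode = 0;

		for (i = 1; i < 4; i++)
			if (bmap & (1u << i))
				mode |= i;
		return mode;
	}

	int main(void)
	{
		unsigned char bmap = (1u << 1) | (1u << 3);	/* READ and BOTH */

		printf("deny mode %u\n", fold(bmap));		/* prints 3 */
		return 0;
	}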
663static int nfs4_access_to_omode(u32 access) 861static int nfs4_access_to_omode(u32 access)
@@ -674,138 +872,283 @@ static int nfs4_access_to_omode(u32 access)
674 return O_RDONLY; 872 return O_RDONLY;
675} 873}
676 874
875/*
876 * A stateid that had a deny mode associated with it is being released
877 * or downgraded. Recalculate the deny mode on the file.
878 */
879static void
880recalculate_deny_mode(struct nfs4_file *fp)
881{
882 struct nfs4_ol_stateid *stp;
883
884 spin_lock(&fp->fi_lock);
885 fp->fi_share_deny = 0;
886 list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
887 fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
888 spin_unlock(&fp->fi_lock);
889}
890
891static void
892reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
893{
894 int i;
895 bool change = false;
896
897 for (i = 1; i < 4; i++) {
898 if ((i & deny) != i) {
899 change = true;
900 clear_deny(i, stp);
901 }
902 }
903
904 /* Recalculate per-file deny mode if there was a change */
905 if (change)
906 recalculate_deny_mode(stp->st_stid.sc_file);
907}
908
677/* release all access and file references for a given stateid */ 909/* release all access and file references for a given stateid */
678static void 910static void
679release_all_access(struct nfs4_ol_stateid *stp) 911release_all_access(struct nfs4_ol_stateid *stp)
680{ 912{
681 int i; 913 int i;
914 struct nfs4_file *fp = stp->st_stid.sc_file;
915
916 if (fp && stp->st_deny_bmap != 0)
917 recalculate_deny_mode(fp);
682 918
683 for (i = 1; i < 4; i++) { 919 for (i = 1; i < 4; i++) {
684 if (test_access(i, stp)) 920 if (test_access(i, stp))
685 nfs4_file_put_access(stp->st_file, 921 nfs4_file_put_access(stp->st_stid.sc_file, i);
686 nfs4_access_to_omode(i));
687 clear_access(i, stp); 922 clear_access(i, stp);
688 } 923 }
689} 924}
690 925
691static void unhash_generic_stateid(struct nfs4_ol_stateid *stp) 926static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
692{ 927{
928 struct nfs4_client *clp = sop->so_client;
929
930 might_lock(&clp->cl_lock);
931
932 if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
933 return;
934 sop->so_ops->so_unhash(sop);
935 spin_unlock(&clp->cl_lock);
936 kfree(sop->so_owner.data);
937 sop->so_ops->so_free(sop);
938}
939
940static void unhash_ol_stateid(struct nfs4_ol_stateid *stp)
941{
942 struct nfs4_file *fp = stp->st_stid.sc_file;
943
944 lockdep_assert_held(&stp->st_stateowner->so_client->cl_lock);
945
946 spin_lock(&fp->fi_lock);
693 list_del(&stp->st_perfile); 947 list_del(&stp->st_perfile);
948 spin_unlock(&fp->fi_lock);
694 list_del(&stp->st_perstateowner); 949 list_del(&stp->st_perstateowner);
695} 950}
696 951
697static void close_generic_stateid(struct nfs4_ol_stateid *stp) 952static void nfs4_free_ol_stateid(struct nfs4_stid *stid)
698{ 953{
954 struct nfs4_ol_stateid *stp = openlockstateid(stid);
955
699 release_all_access(stp); 956 release_all_access(stp);
700 put_nfs4_file(stp->st_file); 957 if (stp->st_stateowner)
701 stp->st_file = NULL; 958 nfs4_put_stateowner(stp->st_stateowner);
959 kmem_cache_free(stateid_slab, stid);
702} 960}
703 961
704static void free_generic_stateid(struct nfs4_ol_stateid *stp) 962static void nfs4_free_lock_stateid(struct nfs4_stid *stid)
705{ 963{
706 remove_stid(&stp->st_stid); 964 struct nfs4_ol_stateid *stp = openlockstateid(stid);
707 nfs4_free_stid(stateid_slab, &stp->st_stid); 965 struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
966 struct file *file;
967
968 file = find_any_file(stp->st_stid.sc_file);
969 if (file)
970 filp_close(file, (fl_owner_t)lo);
971 nfs4_free_ol_stateid(stid);
708} 972}
709 973
710static void release_lock_stateid(struct nfs4_ol_stateid *stp) 974/*
975 * Put the persistent reference to an already unhashed generic stateid, while
976 * holding the cl_lock. If it's the last reference, then put it onto the
977 * reaplist for later destruction.
978 */
979static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
980 struct list_head *reaplist)
711{ 981{
712 struct file *file; 982 struct nfs4_stid *s = &stp->st_stid;
983 struct nfs4_client *clp = s->sc_client;
984
985 lockdep_assert_held(&clp->cl_lock);
986
987 WARN_ON_ONCE(!list_empty(&stp->st_locks));
988
989 if (!atomic_dec_and_test(&s->sc_count)) {
990 wake_up_all(&close_wq);
991 return;
992 }
993
994 idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
995 list_add(&stp->st_locks, reaplist);
996}
713 997
714 unhash_generic_stateid(stp); 998static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
999{
1000 struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
1001
1002 lockdep_assert_held(&oo->oo_owner.so_client->cl_lock);
1003
1004 list_del_init(&stp->st_locks);
1005 unhash_ol_stateid(stp);
715 unhash_stid(&stp->st_stid); 1006 unhash_stid(&stp->st_stid);
716 file = find_any_file(stp->st_file);
717 if (file)
718 locks_remove_posix(file, (fl_owner_t)lockowner(stp->st_stateowner));
719 close_generic_stateid(stp);
720 free_generic_stateid(stp);
721} 1007}
722 1008
723static void unhash_lockowner(struct nfs4_lockowner *lo) 1009static void release_lock_stateid(struct nfs4_ol_stateid *stp)
724{ 1010{
725 struct nfs4_ol_stateid *stp; 1011 struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner);
726 1012
727 list_del(&lo->lo_owner.so_strhash); 1013 spin_lock(&oo->oo_owner.so_client->cl_lock);
728 list_del(&lo->lo_perstateid); 1014 unhash_lock_stateid(stp);
729 list_del(&lo->lo_owner_ino_hash); 1015 spin_unlock(&oo->oo_owner.so_client->cl_lock);
730 while (!list_empty(&lo->lo_owner.so_stateids)) { 1016 nfs4_put_stid(&stp->st_stid);
731 stp = list_first_entry(&lo->lo_owner.so_stateids,
732 struct nfs4_ol_stateid, st_perstateowner);
733 release_lock_stateid(stp);
734 }
735} 1017}
736 1018
737static void nfs4_free_lockowner(struct nfs4_lockowner *lo) 1019static void unhash_lockowner_locked(struct nfs4_lockowner *lo)
738{ 1020{
739 kfree(lo->lo_owner.so_owner.data); 1021 struct nfs4_client *clp = lo->lo_owner.so_client;
740 kmem_cache_free(lockowner_slab, lo); 1022
1023 lockdep_assert_held(&clp->cl_lock);
1024
1025 list_del_init(&lo->lo_owner.so_strhash);
1026}
1027
1028/*
1029 * Free a list of generic stateids that were collected earlier after being
1030 * fully unhashed.
1031 */
1032static void
1033free_ol_stateid_reaplist(struct list_head *reaplist)
1034{
1035 struct nfs4_ol_stateid *stp;
1036 struct nfs4_file *fp;
1037
1038 might_sleep();
1039
1040 while (!list_empty(reaplist)) {
1041 stp = list_first_entry(reaplist, struct nfs4_ol_stateid,
1042 st_locks);
1043 list_del(&stp->st_locks);
1044 fp = stp->st_stid.sc_file;
1045 stp->st_stid.sc_free(&stp->st_stid);
1046 if (fp)
1047 put_nfs4_file(fp);
1048 }
741} 1049}
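put_ol_stateid_locked() and free_ol_stateid_reaplist() together implement the collect-then-reap idiom: objects are unlinked while a non-sleeping lock is held, and the potentially sleeping teardown runs only after the lock is dropped (hence the might_sleep()). A toy sketch of the idiom with invented types:

#include <pthread.h>
#include <stdlib.h>

struct node { struct node *next; };

/* Unlink everything while the lock is held... */
static struct node *unhash_all(struct node **head, pthread_mutex_t *lock)
{
	struct node *reaplist;

	pthread_mutex_lock(lock);
	reaplist = *head;	/* steal the whole list */
	*head = NULL;
	pthread_mutex_unlock(lock);
	return reaplist;
}

/* ...then do the sleeping/freeing work with no lock held. */
static void reap(struct node *reaplist)
{
	while (reaplist) {
		struct node *n = reaplist;

		reaplist = n->next;
		free(n);
	}
}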
742 1050
743static void release_lockowner(struct nfs4_lockowner *lo) 1051static void release_lockowner(struct nfs4_lockowner *lo)
744{ 1052{
745 unhash_lockowner(lo); 1053 struct nfs4_client *clp = lo->lo_owner.so_client;
746 nfs4_free_lockowner(lo); 1054 struct nfs4_ol_stateid *stp;
1055 struct list_head reaplist;
1056
1057 INIT_LIST_HEAD(&reaplist);
1058
1059 spin_lock(&clp->cl_lock);
1060 unhash_lockowner_locked(lo);
1061 while (!list_empty(&lo->lo_owner.so_stateids)) {
1062 stp = list_first_entry(&lo->lo_owner.so_stateids,
1063 struct nfs4_ol_stateid, st_perstateowner);
1064 unhash_lock_stateid(stp);
1065 put_ol_stateid_locked(stp, &reaplist);
1066 }
1067 spin_unlock(&clp->cl_lock);
1068 free_ol_stateid_reaplist(&reaplist);
1069 nfs4_put_stateowner(&lo->lo_owner);
747} 1070}
748 1071
749static void 1072static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
750release_stateid_lockowners(struct nfs4_ol_stateid *open_stp) 1073 struct list_head *reaplist)
751{ 1074{
752 struct nfs4_lockowner *lo; 1075 struct nfs4_ol_stateid *stp;
753 1076
754 while (!list_empty(&open_stp->st_lockowners)) { 1077 while (!list_empty(&open_stp->st_locks)) {
755 lo = list_entry(open_stp->st_lockowners.next, 1078 stp = list_entry(open_stp->st_locks.next,
756 struct nfs4_lockowner, lo_perstateid); 1079 struct nfs4_ol_stateid, st_locks);
757 release_lockowner(lo); 1080 unhash_lock_stateid(stp);
1081 put_ol_stateid_locked(stp, reaplist);
758 } 1082 }
759} 1083}
760 1084
761static void unhash_open_stateid(struct nfs4_ol_stateid *stp) 1085static void unhash_open_stateid(struct nfs4_ol_stateid *stp,
1086 struct list_head *reaplist)
762{ 1087{
763 unhash_generic_stateid(stp); 1088 lockdep_assert_held(&stp->st_stid.sc_client->cl_lock);
764 release_stateid_lockowners(stp); 1089
765 close_generic_stateid(stp); 1090 unhash_ol_stateid(stp);
1091 release_open_stateid_locks(stp, reaplist);
766} 1092}
767 1093
768static void release_open_stateid(struct nfs4_ol_stateid *stp) 1094static void release_open_stateid(struct nfs4_ol_stateid *stp)
769{ 1095{
770 unhash_open_stateid(stp); 1096 LIST_HEAD(reaplist);
771 free_generic_stateid(stp); 1097
1098 spin_lock(&stp->st_stid.sc_client->cl_lock);
1099 unhash_open_stateid(stp, &reaplist);
1100 put_ol_stateid_locked(stp, &reaplist);
1101 spin_unlock(&stp->st_stid.sc_client->cl_lock);
1102 free_ol_stateid_reaplist(&reaplist);
772} 1103}
773 1104
774static void unhash_openowner(struct nfs4_openowner *oo) 1105static void unhash_openowner_locked(struct nfs4_openowner *oo)
775{ 1106{
776 struct nfs4_ol_stateid *stp; 1107 struct nfs4_client *clp = oo->oo_owner.so_client;
777 1108
778 list_del(&oo->oo_owner.so_strhash); 1109 lockdep_assert_held(&clp->cl_lock);
779 list_del(&oo->oo_perclient); 1110
780 while (!list_empty(&oo->oo_owner.so_stateids)) { 1111 list_del_init(&oo->oo_owner.so_strhash);
781 stp = list_first_entry(&oo->oo_owner.so_stateids, 1112 list_del_init(&oo->oo_perclient);
782 struct nfs4_ol_stateid, st_perstateowner);
783 release_open_stateid(stp);
784 }
785} 1113}
786 1114
787static void release_last_closed_stateid(struct nfs4_openowner *oo) 1115static void release_last_closed_stateid(struct nfs4_openowner *oo)
788{ 1116{
789 struct nfs4_ol_stateid *s = oo->oo_last_closed_stid; 1117 struct nfsd_net *nn = net_generic(oo->oo_owner.so_client->net,
1118 nfsd_net_id);
1119 struct nfs4_ol_stateid *s;
790 1120
1121 spin_lock(&nn->client_lock);
1122 s = oo->oo_last_closed_stid;
791 if (s) { 1123 if (s) {
792 free_generic_stateid(s); 1124 list_del_init(&oo->oo_close_lru);
793 oo->oo_last_closed_stid = NULL; 1125 oo->oo_last_closed_stid = NULL;
794 } 1126 }
795} 1127 spin_unlock(&nn->client_lock);
796 1128 if (s)
797static void nfs4_free_openowner(struct nfs4_openowner *oo) 1129 nfs4_put_stid(&s->st_stid);
798{
799 kfree(oo->oo_owner.so_owner.data);
800 kmem_cache_free(openowner_slab, oo);
801} 1130}
802 1131
803static void release_openowner(struct nfs4_openowner *oo) 1132static void release_openowner(struct nfs4_openowner *oo)
804{ 1133{
805 unhash_openowner(oo); 1134 struct nfs4_ol_stateid *stp;
806 list_del(&oo->oo_close_lru); 1135 struct nfs4_client *clp = oo->oo_owner.so_client;
1136 struct list_head reaplist;
1137
1138 INIT_LIST_HEAD(&reaplist);
1139
1140 spin_lock(&clp->cl_lock);
1141 unhash_openowner_locked(oo);
1142 while (!list_empty(&oo->oo_owner.so_stateids)) {
1143 stp = list_first_entry(&oo->oo_owner.so_stateids,
1144 struct nfs4_ol_stateid, st_perstateowner);
1145 unhash_open_stateid(stp, &reaplist);
1146 put_ol_stateid_locked(stp, &reaplist);
1147 }
1148 spin_unlock(&clp->cl_lock);
1149 free_ol_stateid_reaplist(&reaplist);
807 release_last_closed_stateid(oo); 1150 release_last_closed_stateid(oo);
808 nfs4_free_openowner(oo); 1151 nfs4_put_stateowner(&oo->oo_owner);
809} 1152}
810 1153
811static inline int 1154static inline int
@@ -842,7 +1185,7 @@ void nfsd4_bump_seqid(struct nfsd4_compound_state *cstate, __be32 nfserr)
842 return; 1185 return;
843 1186
844 if (!seqid_mutating_err(ntohl(nfserr))) { 1187 if (!seqid_mutating_err(ntohl(nfserr))) {
845 cstate->replay_owner = NULL; 1188 nfsd4_cstate_clear_replay(cstate);
846 return; 1189 return;
847 } 1190 }
848 if (!so) 1191 if (!so)
@@ -1030,10 +1373,8 @@ static void nfsd4_init_conn(struct svc_rqst *rqstp, struct nfsd4_conn *conn, str
1030 if (ret) 1373 if (ret)
1031 /* oops; xprt is already down: */ 1374 /* oops; xprt is already down: */
1032 nfsd4_conn_lost(&conn->cn_xpt_user); 1375 nfsd4_conn_lost(&conn->cn_xpt_user);
1033 if (conn->cn_flags & NFS4_CDFC4_BACK) { 1376 /* We may have gained or lost a callback channel: */
1034 /* callback channel may be back up */ 1377 nfsd4_probe_callback_sync(ses->se_client);
1035 nfsd4_probe_callback(ses->se_client);
1036 }
1037} 1378}
1038 1379
1039static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses) 1380static struct nfsd4_conn *alloc_conn_from_crses(struct svc_rqst *rqstp, struct nfsd4_create_session *cses)
@@ -1073,9 +1414,6 @@ static void __free_session(struct nfsd4_session *ses)
1073 1414
1074static void free_session(struct nfsd4_session *ses) 1415static void free_session(struct nfsd4_session *ses)
1075{ 1416{
1076 struct nfsd_net *nn = net_generic(ses->se_client->net, nfsd_net_id);
1077
1078 lockdep_assert_held(&nn->client_lock);
1079 nfsd4_del_conns(ses); 1417 nfsd4_del_conns(ses);
1080 nfsd4_put_drc_mem(&ses->se_fchannel); 1418 nfsd4_put_drc_mem(&ses->se_fchannel);
1081 __free_session(ses); 1419 __free_session(ses);
@@ -1097,12 +1435,10 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
1097 new->se_cb_sec = cses->cb_sec; 1435 new->se_cb_sec = cses->cb_sec;
1098 atomic_set(&new->se_ref, 0); 1436 atomic_set(&new->se_ref, 0);
1099 idx = hash_sessionid(&new->se_sessionid); 1437 idx = hash_sessionid(&new->se_sessionid);
1100 spin_lock(&nn->client_lock);
1101 list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]); 1438 list_add(&new->se_hash, &nn->sessionid_hashtbl[idx]);
1102 spin_lock(&clp->cl_lock); 1439 spin_lock(&clp->cl_lock);
1103 list_add(&new->se_perclnt, &clp->cl_sessions); 1440 list_add(&new->se_perclnt, &clp->cl_sessions);
1104 spin_unlock(&clp->cl_lock); 1441 spin_unlock(&clp->cl_lock);
1105 spin_unlock(&nn->client_lock);
1106 1442
1107 if (cses->flags & SESSION4_BACK_CHAN) { 1443 if (cses->flags & SESSION4_BACK_CHAN) {
1108 struct sockaddr *sa = svc_addr(rqstp); 1444 struct sockaddr *sa = svc_addr(rqstp);
@@ -1120,12 +1456,14 @@ static void init_session(struct svc_rqst *rqstp, struct nfsd4_session *new, stru
1120 1456
1121/* caller must hold client_lock */ 1457/* caller must hold client_lock */
1122static struct nfsd4_session * 1458static struct nfsd4_session *
1123find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net) 1459__find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
1124{ 1460{
1125 struct nfsd4_session *elem; 1461 struct nfsd4_session *elem;
1126 int idx; 1462 int idx;
1127 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 1463 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1128 1464
1465 lockdep_assert_held(&nn->client_lock);
1466
1129 dump_sessionid(__func__, sessionid); 1467 dump_sessionid(__func__, sessionid);
1130 idx = hash_sessionid(sessionid); 1468 idx = hash_sessionid(sessionid);
1131 /* Search in the appropriate list */ 1469 /* Search in the appropriate list */
@@ -1140,10 +1478,33 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net)
1140 return NULL; 1478 return NULL;
1141} 1479}
1142 1480
1481static struct nfsd4_session *
1482find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid, struct net *net,
1483 __be32 *ret)
1484{
1485 struct nfsd4_session *session;
1486 __be32 status = nfserr_badsession;
1487
1488 session = __find_in_sessionid_hashtbl(sessionid, net);
1489 if (!session)
1490 goto out;
1491 status = nfsd4_get_session_locked(session);
1492 if (status)
1493 session = NULL;
1494out:
1495 *ret = status;
1496 return session;
1497}
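The new wrapper folds lookup and reference bump into one call, reporting failure through an out-parameter so callers can distinguish "no such session" from "session is being torn down". A self-contained sketch of the same shape, with invented names and a one-slot table standing in for the hash:

#include <stddef.h>

enum { OK = 0, ERR_BADSESSION = 1, ERR_DEAD = 2 };	/* illustrative */

struct session { int dead; };

static struct session slot;		/* one-slot "hash table" */

static struct session *lookup(int id)
{
	return id == 0 ? &slot : NULL;
}

static int try_get(struct session *s)
{
	return s->dead ? ERR_DEAD : OK;	/* would bump se_ref here */
}

/* Return a usable session or NULL, with the reason in *ret. */
static struct session *find_and_get(int id, int *ret)
{
	struct session *s = lookup(id);
	int status = ERR_BADSESSION;

	if (s) {
		status = try_get(s);
		if (status != OK)
			s = NULL;
	}
	*ret = status;
	return s;
}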
1498
1143/* caller must hold client_lock */ 1499/* caller must hold client_lock */
1144static void 1500static void
1145unhash_session(struct nfsd4_session *ses) 1501unhash_session(struct nfsd4_session *ses)
1146{ 1502{
1503 struct nfs4_client *clp = ses->se_client;
1504 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1505
1506 lockdep_assert_held(&nn->client_lock);
1507
1147 list_del(&ses->se_hash); 1508 list_del(&ses->se_hash);
1148 spin_lock(&ses->se_client->cl_lock); 1509 spin_lock(&ses->se_client->cl_lock);
1149 list_del(&ses->se_perclnt); 1510 list_del(&ses->se_perclnt);
@@ -1169,15 +1530,20 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1169static struct nfs4_client *alloc_client(struct xdr_netobj name) 1530static struct nfs4_client *alloc_client(struct xdr_netobj name)
1170{ 1531{
1171 struct nfs4_client *clp; 1532 struct nfs4_client *clp;
1533 int i;
1172 1534
1173 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL); 1535 clp = kzalloc(sizeof(struct nfs4_client), GFP_KERNEL);
1174 if (clp == NULL) 1536 if (clp == NULL)
1175 return NULL; 1537 return NULL;
1176 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL); 1538 clp->cl_name.data = kmemdup(name.data, name.len, GFP_KERNEL);
1177 if (clp->cl_name.data == NULL) { 1539 if (clp->cl_name.data == NULL)
1178 kfree(clp); 1540 goto err_no_name;
1179 return NULL; 1541 clp->cl_ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
1180 } 1542 OWNER_HASH_SIZE, GFP_KERNEL);
1543 if (!clp->cl_ownerstr_hashtbl)
1544 goto err_no_hashtbl;
1545 for (i = 0; i < OWNER_HASH_SIZE; i++)
1546 INIT_LIST_HEAD(&clp->cl_ownerstr_hashtbl[i]);
1181 clp->cl_name.len = name.len; 1547 clp->cl_name.len = name.len;
1182 INIT_LIST_HEAD(&clp->cl_sessions); 1548 INIT_LIST_HEAD(&clp->cl_sessions);
1183 idr_init(&clp->cl_stateids); 1549 idr_init(&clp->cl_stateids);
@@ -1192,14 +1558,16 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1192 spin_lock_init(&clp->cl_lock); 1558 spin_lock_init(&clp->cl_lock);
1193 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1559 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1194 return clp; 1560 return clp;
1561err_no_hashtbl:
1562 kfree(clp->cl_name.data);
1563err_no_name:
1564 kfree(clp);
1565 return NULL;
1195} 1566}
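alloc_client() now unwinds with the kernel's usual goto ladder: each failure jumps to a label that frees exactly what was allocated so far, in reverse order. A compilable user-space sketch of the pattern (names are illustrative):

#include <stdlib.h>
#include <string.h>

struct client {
	char *name;
	void *tbl;
};

static struct client *alloc_client_sketch(const char *name)
{
	struct client *clp = calloc(1, sizeof(*clp));

	if (!clp)
		return NULL;
	clp->name = strdup(name);
	if (!clp->name)
		goto err_no_name;
	clp->tbl = calloc(64, sizeof(void *));
	if (!clp->tbl)
		goto err_no_tbl;
	return clp;
err_no_tbl:
	free(clp->name);
err_no_name:
	free(clp);
	return NULL;
}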
1196 1567
1197static void 1568static void
1198free_client(struct nfs4_client *clp) 1569free_client(struct nfs4_client *clp)
1199{ 1570{
1200 struct nfsd_net __maybe_unused *nn = net_generic(clp->net, nfsd_net_id);
1201
1202 lockdep_assert_held(&nn->client_lock);
1203 while (!list_empty(&clp->cl_sessions)) { 1571 while (!list_empty(&clp->cl_sessions)) {
1204 struct nfsd4_session *ses; 1572 struct nfsd4_session *ses;
1205 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session, 1573 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
@@ -1210,18 +1578,32 @@ free_client(struct nfs4_client *clp)
1210 } 1578 }
1211 rpc_destroy_wait_queue(&clp->cl_cb_waitq); 1579 rpc_destroy_wait_queue(&clp->cl_cb_waitq);
1212 free_svc_cred(&clp->cl_cred); 1580 free_svc_cred(&clp->cl_cred);
1581 kfree(clp->cl_ownerstr_hashtbl);
1213 kfree(clp->cl_name.data); 1582 kfree(clp->cl_name.data);
1214 idr_destroy(&clp->cl_stateids); 1583 idr_destroy(&clp->cl_stateids);
1215 kfree(clp); 1584 kfree(clp);
1216} 1585}
1217 1586
1218/* must be called under the client_lock */ 1587/* must be called under the client_lock */
1219static inline void 1588static void
1220unhash_client_locked(struct nfs4_client *clp) 1589unhash_client_locked(struct nfs4_client *clp)
1221{ 1590{
1591 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1222 struct nfsd4_session *ses; 1592 struct nfsd4_session *ses;
1223 1593
1224 list_del(&clp->cl_lru); 1594 lockdep_assert_held(&nn->client_lock);
1595
1596 /* Mark the client as expired! */
1597 clp->cl_time = 0;
1598 /* Make it invisible */
1599 if (!list_empty(&clp->cl_idhash)) {
1600 list_del_init(&clp->cl_idhash);
1601 if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
1602 rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
1603 else
1604 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1605 }
1606 list_del_init(&clp->cl_lru);
1225 spin_lock(&clp->cl_lock); 1607 spin_lock(&clp->cl_lock);
1226 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt) 1608 list_for_each_entry(ses, &clp->cl_sessions, se_perclnt)
1227 list_del_init(&ses->se_hash); 1609 list_del_init(&ses->se_hash);
@@ -1229,53 +1611,72 @@ unhash_client_locked(struct nfs4_client *clp)
1229} 1611}
1230 1612
1231static void 1613static void
1232destroy_client(struct nfs4_client *clp) 1614unhash_client(struct nfs4_client *clp)
1615{
1616 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1617
1618 spin_lock(&nn->client_lock);
1619 unhash_client_locked(clp);
1620 spin_unlock(&nn->client_lock);
1621}
1622
1623static __be32 mark_client_expired_locked(struct nfs4_client *clp)
1624{
1625 if (atomic_read(&clp->cl_refcount))
1626 return nfserr_jukebox;
1627 unhash_client_locked(clp);
1628 return nfs_ok;
1629}
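mark_client_expired_locked() refuses to expire a client that anything still references, returning nfserr_jukebox so the caller asks the client to retry later. A hedged sketch of that guard (the names and error value are illustrative):

#include <stdatomic.h>

enum { SK_OK = 0, SK_JUKEBOX = 10008 };		/* illustrative values */

struct client_sk { atomic_int refcount; };

static int mark_expired(struct client_sk *clp)
{
	if (atomic_load(&clp->refcount))
		return SK_JUKEBOX;	/* still referenced: retry later */
	/* unhash_client_locked() would run here, under client_lock */
	return SK_OK;
}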
1630
1631static void
1632__destroy_client(struct nfs4_client *clp)
1233{ 1633{
1234 struct nfs4_openowner *oo; 1634 struct nfs4_openowner *oo;
1235 struct nfs4_delegation *dp; 1635 struct nfs4_delegation *dp;
1236 struct list_head reaplist; 1636 struct list_head reaplist;
1237 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1238 1637
1239 INIT_LIST_HEAD(&reaplist); 1638 INIT_LIST_HEAD(&reaplist);
1240 spin_lock(&state_lock); 1639 spin_lock(&state_lock);
1241 while (!list_empty(&clp->cl_delegations)) { 1640 while (!list_empty(&clp->cl_delegations)) {
1242 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); 1641 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
1243 list_del_init(&dp->dl_perclnt); 1642 unhash_delegation_locked(dp);
1244 list_move(&dp->dl_recall_lru, &reaplist); 1643 list_add(&dp->dl_recall_lru, &reaplist);
1245 } 1644 }
1246 spin_unlock(&state_lock); 1645 spin_unlock(&state_lock);
1247 while (!list_empty(&reaplist)) { 1646 while (!list_empty(&reaplist)) {
1248 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1647 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1249 destroy_delegation(dp); 1648 list_del_init(&dp->dl_recall_lru);
1649 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
1650 nfs4_put_stid(&dp->dl_stid);
1250 } 1651 }
1251 list_splice_init(&clp->cl_revoked, &reaplist); 1652 while (!list_empty(&clp->cl_revoked)) {
1252 while (!list_empty(&reaplist)) {
1253 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); 1653 dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru);
1254 destroy_revoked_delegation(dp); 1654 list_del_init(&dp->dl_recall_lru);
1655 nfs4_put_stid(&dp->dl_stid);
1255 } 1656 }
1256 while (!list_empty(&clp->cl_openowners)) { 1657 while (!list_empty(&clp->cl_openowners)) {
1257 oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); 1658 oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient);
1659 nfs4_get_stateowner(&oo->oo_owner);
1258 release_openowner(oo); 1660 release_openowner(oo);
1259 } 1661 }
1260 nfsd4_shutdown_callback(clp); 1662 nfsd4_shutdown_callback(clp);
1261 if (clp->cl_cb_conn.cb_xprt) 1663 if (clp->cl_cb_conn.cb_xprt)
1262 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1664 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
1263 list_del(&clp->cl_idhash);
1264 if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
1265 rb_erase(&clp->cl_namenode, &nn->conf_name_tree);
1266 else
1267 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1268 spin_lock(&nn->client_lock);
1269 unhash_client_locked(clp);
1270 WARN_ON_ONCE(atomic_read(&clp->cl_refcount));
1271 free_client(clp); 1665 free_client(clp);
1272 spin_unlock(&nn->client_lock); 1666}
1667
1668static void
1669destroy_client(struct nfs4_client *clp)
1670{
1671 unhash_client(clp);
1672 __destroy_client(clp);
1273} 1673}
1274 1674
1275static void expire_client(struct nfs4_client *clp) 1675static void expire_client(struct nfs4_client *clp)
1276{ 1676{
1677 unhash_client(clp);
1277 nfsd4_client_record_remove(clp); 1678 nfsd4_client_record_remove(clp);
1278 destroy_client(clp); 1679 __destroy_client(clp);
1279} 1680}
1280 1681
1281static void copy_verf(struct nfs4_client *target, nfs4_verifier *source) 1682static void copy_verf(struct nfs4_client *target, nfs4_verifier *source)
@@ -1408,25 +1809,28 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
1408 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); 1809 return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
1409} 1810}
1410 1811
1411static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn) 1812static void gen_confirm(struct nfs4_client *clp, struct nfsd_net *nn)
1412{ 1813{
1413 static u32 current_clientid = 1; 1814 __be32 verf[2];
1414 1815
1415 clp->cl_clientid.cl_boot = nn->boot_time; 1816 /*
1416 clp->cl_clientid.cl_id = current_clientid++; 1817 * This is opaque to client, so no need to byte-swap. Use
1818 * __force to keep sparse happy
1819 */
1820 verf[0] = (__force __be32)get_seconds();
1821 verf[1] = (__force __be32)nn->clientid_counter;
1822 memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
1417} 1823}
1418 1824
1419static void gen_confirm(struct nfs4_client *clp) 1825static void gen_clid(struct nfs4_client *clp, struct nfsd_net *nn)
1420{ 1826{
1421 __be32 verf[2]; 1827 clp->cl_clientid.cl_boot = nn->boot_time;
1422 static u32 i; 1828 clp->cl_clientid.cl_id = nn->clientid_counter++;
1423 1829 gen_confirm(clp, nn);
1424 verf[0] = (__be32)get_seconds();
1425 verf[1] = (__be32)i++;
1426 memcpy(clp->cl_confirm.data, verf, sizeof(clp->cl_confirm.data));
1427} 1830}
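The confirm verifier is now built from the current time plus a per-net counter and copied into an 8-byte opaque blob; byte order is irrelevant because the client never interprets it. A stand-alone sketch, assuming time() and a static counter in place of nn->clientid_counter:

#include <stdint.h>
#include <string.h>
#include <time.h>

/* Pack "now" and a counter into an 8-byte opaque verifier. */
static void gen_confirm_sketch(unsigned char confirm[8])
{
	static uint32_t counter;	/* stands in for nn->clientid_counter */
	uint32_t verf[2];

	verf[0] = (uint32_t)time(NULL);
	verf[1] = counter++;
	memcpy(confirm, verf, 8);
}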
1428 1831
1429static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t) 1832static struct nfs4_stid *
1833find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
1430{ 1834{
1431 struct nfs4_stid *ret; 1835 struct nfs4_stid *ret;
1432 1836
@@ -1436,16 +1840,21 @@ static struct nfs4_stid *find_stateid(struct nfs4_client *cl, stateid_t *t)
1436 return ret; 1840 return ret;
1437} 1841}
1438 1842
1439static struct nfs4_stid *find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) 1843static struct nfs4_stid *
1844find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
1440{ 1845{
1441 struct nfs4_stid *s; 1846 struct nfs4_stid *s;
1442 1847
1443 s = find_stateid(cl, t); 1848 spin_lock(&cl->cl_lock);
1444 if (!s) 1849 s = find_stateid_locked(cl, t);
1445 return NULL; 1850 if (s != NULL) {
1446 if (typemask & s->sc_type) 1851 if (typemask & s->sc_type)
1447 return s; 1852 atomic_inc(&s->sc_count);
1448 return NULL; 1853 else
1854 s = NULL;
1855 }
1856 spin_unlock(&cl->cl_lock);
1857 return s;
1449} 1858}
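find_stateid_by_type() now bumps sc_count under cl_lock before returning, so the caller always receives a referenced stateid or NULL, never a bare pointer that could vanish. A simplified sketch of that lookup-and-get shape, with the IDR lookup elided and a pthread mutex standing in for cl_lock:

#include <pthread.h>
#include <stddef.h>

struct stid_sk { int type; int refcount; };

/* Bump the refcount under the lock only when the type matches. */
static struct stid_sk *find_by_type(struct stid_sk *s, int typemask,
				    pthread_mutex_t *lock)
{
	pthread_mutex_lock(lock);
	if (s && (s->type & typemask))
		s->refcount++;		/* atomic_inc(&s->sc_count) in nfsd */
	else
		s = NULL;
	pthread_mutex_unlock(lock);
	return s;
}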
1450 1859
1451static struct nfs4_client *create_client(struct xdr_netobj name, 1860static struct nfs4_client *create_client(struct xdr_netobj name,
@@ -1455,7 +1864,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1455 struct sockaddr *sa = svc_addr(rqstp); 1864 struct sockaddr *sa = svc_addr(rqstp);
1456 int ret; 1865 int ret;
1457 struct net *net = SVC_NET(rqstp); 1866 struct net *net = SVC_NET(rqstp);
1458 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1459 1867
1460 clp = alloc_client(name); 1868 clp = alloc_client(name);
1461 if (clp == NULL) 1869 if (clp == NULL)
@@ -1463,17 +1871,14 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
1463 1871
1464 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); 1872 ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
1465 if (ret) { 1873 if (ret) {
1466 spin_lock(&nn->client_lock);
1467 free_client(clp); 1874 free_client(clp);
1468 spin_unlock(&nn->client_lock);
1469 return NULL; 1875 return NULL;
1470 } 1876 }
1471 nfsd4_init_callback(&clp->cl_cb_null); 1877 nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL);
1472 clp->cl_time = get_seconds(); 1878 clp->cl_time = get_seconds();
1473 clear_bit(0, &clp->cl_cb_slot_busy); 1879 clear_bit(0, &clp->cl_cb_slot_busy);
1474 copy_verf(clp, verf); 1880 copy_verf(clp, verf);
1475 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); 1881 rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
1476 gen_confirm(clp);
1477 clp->cl_cb_session = NULL; 1882 clp->cl_cb_session = NULL;
1478 clp->net = net; 1883 clp->net = net;
1479 return clp; 1884 return clp;
@@ -1525,11 +1930,13 @@ add_to_unconfirmed(struct nfs4_client *clp)
1525 unsigned int idhashval; 1930 unsigned int idhashval;
1526 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 1931 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1527 1932
1933 lockdep_assert_held(&nn->client_lock);
1934
1528 clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); 1935 clear_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
1529 add_clp_to_name_tree(clp, &nn->unconf_name_tree); 1936 add_clp_to_name_tree(clp, &nn->unconf_name_tree);
1530 idhashval = clientid_hashval(clp->cl_clientid.cl_id); 1937 idhashval = clientid_hashval(clp->cl_clientid.cl_id);
1531 list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]); 1938 list_add(&clp->cl_idhash, &nn->unconf_id_hashtbl[idhashval]);
1532 renew_client(clp); 1939 renew_client_locked(clp);
1533} 1940}
1534 1941
1535static void 1942static void
@@ -1538,12 +1945,14 @@ move_to_confirmed(struct nfs4_client *clp)
1538 unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id); 1945 unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
1539 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 1946 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
1540 1947
1948 lockdep_assert_held(&nn->client_lock);
1949
1541 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); 1950 dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
1542 list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]); 1951 list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
1543 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree); 1952 rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
1544 add_clp_to_name_tree(clp, &nn->conf_name_tree); 1953 add_clp_to_name_tree(clp, &nn->conf_name_tree);
1545 set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags); 1954 set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
1546 renew_client(clp); 1955 renew_client_locked(clp);
1547} 1956}
1548 1957
1549static struct nfs4_client * 1958static struct nfs4_client *
@@ -1556,7 +1965,7 @@ find_client_in_id_table(struct list_head *tbl, clientid_t *clid, bool sessions)
1556 if (same_clid(&clp->cl_clientid, clid)) { 1965 if (same_clid(&clp->cl_clientid, clid)) {
1557 if ((bool)clp->cl_minorversion != sessions) 1966 if ((bool)clp->cl_minorversion != sessions)
1558 return NULL; 1967 return NULL;
1559 renew_client(clp); 1968 renew_client_locked(clp);
1560 return clp; 1969 return clp;
1561 } 1970 }
1562 } 1971 }
@@ -1568,6 +1977,7 @@ find_confirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
1568{ 1977{
1569 struct list_head *tbl = nn->conf_id_hashtbl; 1978 struct list_head *tbl = nn->conf_id_hashtbl;
1570 1979
1980 lockdep_assert_held(&nn->client_lock);
1571 return find_client_in_id_table(tbl, clid, sessions); 1981 return find_client_in_id_table(tbl, clid, sessions);
1572} 1982}
1573 1983
@@ -1576,6 +1986,7 @@ find_unconfirmed_client(clientid_t *clid, bool sessions, struct nfsd_net *nn)
1576{ 1986{
1577 struct list_head *tbl = nn->unconf_id_hashtbl; 1987 struct list_head *tbl = nn->unconf_id_hashtbl;
1578 1988
1989 lockdep_assert_held(&nn->client_lock);
1579 return find_client_in_id_table(tbl, clid, sessions); 1990 return find_client_in_id_table(tbl, clid, sessions);
1580} 1991}
1581 1992
@@ -1587,12 +1998,14 @@ static bool clp_used_exchangeid(struct nfs4_client *clp)
1587static struct nfs4_client * 1998static struct nfs4_client *
1588find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) 1999find_confirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
1589{ 2000{
2001 lockdep_assert_held(&nn->client_lock);
1590 return find_clp_in_name_tree(name, &nn->conf_name_tree); 2002 return find_clp_in_name_tree(name, &nn->conf_name_tree);
1591} 2003}
1592 2004
1593static struct nfs4_client * 2005static struct nfs4_client *
1594find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn) 2006find_unconfirmed_client_by_name(struct xdr_netobj *name, struct nfsd_net *nn)
1595{ 2007{
2008 lockdep_assert_held(&nn->client_lock);
1596 return find_clp_in_name_tree(name, &nn->unconf_name_tree); 2009 return find_clp_in_name_tree(name, &nn->unconf_name_tree);
1597} 2010}
1598 2011
@@ -1642,7 +2055,7 @@ out_err:
1642/* 2055/*
1643 * Cache a reply. nfsd4_check_resp_size() has bounded the cache size. 2056 * Cache a reply. nfsd4_check_resp_size() has bounded the cache size.
1644 */ 2057 */
1645void 2058static void
1646nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) 2059nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1647{ 2060{
1648 struct xdr_buf *buf = resp->xdr.buf; 2061 struct xdr_buf *buf = resp->xdr.buf;
@@ -1758,7 +2171,8 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1758 struct nfsd4_compound_state *cstate, 2171 struct nfsd4_compound_state *cstate,
1759 struct nfsd4_exchange_id *exid) 2172 struct nfsd4_exchange_id *exid)
1760{ 2173{
1761 struct nfs4_client *unconf, *conf, *new; 2174 struct nfs4_client *conf, *new;
2175 struct nfs4_client *unconf = NULL;
1762 __be32 status; 2176 __be32 status;
1763 char addr_str[INET6_ADDRSTRLEN]; 2177 char addr_str[INET6_ADDRSTRLEN];
1764 nfs4_verifier verf = exid->verifier; 2178 nfs4_verifier verf = exid->verifier;
@@ -1787,8 +2201,12 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1787 return nfserr_encr_alg_unsupp; 2201 return nfserr_encr_alg_unsupp;
1788 } 2202 }
1789 2203
2204 new = create_client(exid->clname, rqstp, &verf);
2205 if (new == NULL)
2206 return nfserr_jukebox;
2207
1790 /* Cases below refer to rfc 5661 section 18.35.4: */ 2208 /* Cases below refer to rfc 5661 section 18.35.4: */
1791 nfs4_lock_state(); 2209 spin_lock(&nn->client_lock);
1792 conf = find_confirmed_client_by_name(&exid->clname, nn); 2210 conf = find_confirmed_client_by_name(&exid->clname, nn);
1793 if (conf) { 2211 if (conf) {
1794 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred); 2212 bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
@@ -1813,7 +2231,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1813 } 2231 }
1814 /* case 6 */ 2232 /* case 6 */
1815 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R; 2233 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1816 new = conf;
1817 goto out_copy; 2234 goto out_copy;
1818 } 2235 }
1819 if (!creds_match) { /* case 3 */ 2236 if (!creds_match) { /* case 3 */
@@ -1821,15 +2238,14 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1821 status = nfserr_clid_inuse; 2238 status = nfserr_clid_inuse;
1822 goto out; 2239 goto out;
1823 } 2240 }
1824 expire_client(conf);
1825 goto out_new; 2241 goto out_new;
1826 } 2242 }
1827 if (verfs_match) { /* case 2 */ 2243 if (verfs_match) { /* case 2 */
1828 conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R; 2244 conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
1829 new = conf;
1830 goto out_copy; 2245 goto out_copy;
1831 } 2246 }
1832 /* case 5, client reboot */ 2247 /* case 5, client reboot */
2248 conf = NULL;
1833 goto out_new; 2249 goto out_new;
1834 } 2250 }
1835 2251
@@ -1840,33 +2256,38 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
1840 2256
1841 unconf = find_unconfirmed_client_by_name(&exid->clname, nn); 2257 unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
1842 if (unconf) /* case 4, possible retry or client restart */ 2258 if (unconf) /* case 4, possible retry or client restart */
1843 expire_client(unconf); 2259 unhash_client_locked(unconf);
1844 2260
1845 /* case 1 (normal case) */ 2261 /* case 1 (normal case) */
1846out_new: 2262out_new:
1847 new = create_client(exid->clname, rqstp, &verf); 2263 if (conf) {
1848 if (new == NULL) { 2264 status = mark_client_expired_locked(conf);
1849 status = nfserr_jukebox; 2265 if (status)
1850 goto out; 2266 goto out;
1851 } 2267 }
1852 new->cl_minorversion = cstate->minorversion; 2268 new->cl_minorversion = cstate->minorversion;
1853 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); 2269 new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
1854 2270
1855 gen_clid(new, nn); 2271 gen_clid(new, nn);
1856 add_to_unconfirmed(new); 2272 add_to_unconfirmed(new);
2273 swap(new, conf);
1857out_copy: 2274out_copy:
1858 exid->clientid.cl_boot = new->cl_clientid.cl_boot; 2275 exid->clientid.cl_boot = conf->cl_clientid.cl_boot;
1859 exid->clientid.cl_id = new->cl_clientid.cl_id; 2276 exid->clientid.cl_id = conf->cl_clientid.cl_id;
1860 2277
1861 exid->seqid = new->cl_cs_slot.sl_seqid + 1; 2278 exid->seqid = conf->cl_cs_slot.sl_seqid + 1;
1862 nfsd4_set_ex_flags(new, exid); 2279 nfsd4_set_ex_flags(conf, exid);
1863 2280
1864 dprintk("nfsd4_exchange_id seqid %d flags %x\n", 2281 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1865 new->cl_cs_slot.sl_seqid, new->cl_exchange_flags); 2282 conf->cl_cs_slot.sl_seqid, conf->cl_exchange_flags);
1866 status = nfs_ok; 2283 status = nfs_ok;
1867 2284
1868out: 2285out:
1869 nfs4_unlock_state(); 2286 spin_unlock(&nn->client_lock);
2287 if (new)
2288 expire_client(new);
2289 if (unconf)
2290 expire_client(unconf);
1870 return status; 2291 return status;
1871} 2292}
1872 2293
@@ -2010,6 +2431,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2010{ 2431{
2011 struct sockaddr *sa = svc_addr(rqstp); 2432 struct sockaddr *sa = svc_addr(rqstp);
2012 struct nfs4_client *conf, *unconf; 2433 struct nfs4_client *conf, *unconf;
2434 struct nfs4_client *old = NULL;
2013 struct nfsd4_session *new; 2435 struct nfsd4_session *new;
2014 struct nfsd4_conn *conn; 2436 struct nfsd4_conn *conn;
2015 struct nfsd4_clid_slot *cs_slot = NULL; 2437 struct nfsd4_clid_slot *cs_slot = NULL;
@@ -2035,7 +2457,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2035 if (!conn) 2457 if (!conn)
2036 goto out_free_session; 2458 goto out_free_session;
2037 2459
2038 nfs4_lock_state(); 2460 spin_lock(&nn->client_lock);
2039 unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); 2461 unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
2040 conf = find_confirmed_client(&cr_ses->clientid, true, nn); 2462 conf = find_confirmed_client(&cr_ses->clientid, true, nn);
2041 WARN_ON_ONCE(conf && unconf); 2463 WARN_ON_ONCE(conf && unconf);
@@ -2054,7 +2476,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2054 goto out_free_conn; 2476 goto out_free_conn;
2055 } 2477 }
2056 } else if (unconf) { 2478 } else if (unconf) {
2057 struct nfs4_client *old;
2058 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || 2479 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
2059 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { 2480 !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
2060 status = nfserr_clid_inuse; 2481 status = nfserr_clid_inuse;
@@ -2072,10 +2493,11 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2072 } 2493 }
2073 old = find_confirmed_client_by_name(&unconf->cl_name, nn); 2494 old = find_confirmed_client_by_name(&unconf->cl_name, nn);
2074 if (old) { 2495 if (old) {
2075 status = mark_client_expired(old); 2496 status = mark_client_expired_locked(old);
2076 if (status) 2497 if (status) {
2498 old = NULL;
2077 goto out_free_conn; 2499 goto out_free_conn;
2078 expire_client(old); 2500 }
2079 } 2501 }
2080 move_to_confirmed(unconf); 2502 move_to_confirmed(unconf);
2081 conf = unconf; 2503 conf = unconf;
@@ -2091,20 +2513,27 @@ nfsd4_create_session(struct svc_rqst *rqstp,
2091 cr_ses->flags &= ~SESSION4_RDMA; 2513 cr_ses->flags &= ~SESSION4_RDMA;
2092 2514
2093 init_session(rqstp, new, conf, cr_ses); 2515 init_session(rqstp, new, conf, cr_ses);
2094 nfsd4_init_conn(rqstp, conn, new); 2516 nfsd4_get_session_locked(new);
2095 2517
2096 memcpy(cr_ses->sessionid.data, new->se_sessionid.data, 2518 memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
2097 NFS4_MAX_SESSIONID_LEN); 2519 NFS4_MAX_SESSIONID_LEN);
2098 cs_slot->sl_seqid++; 2520 cs_slot->sl_seqid++;
2099 cr_ses->seqid = cs_slot->sl_seqid; 2521 cr_ses->seqid = cs_slot->sl_seqid;
2100 2522
2101 /* cache solo and embedded create sessions under the state lock */ 2523 /* cache solo and embedded create sessions under the client_lock */
2102 nfsd4_cache_create_session(cr_ses, cs_slot, status); 2524 nfsd4_cache_create_session(cr_ses, cs_slot, status);
2103 nfs4_unlock_state(); 2525 spin_unlock(&nn->client_lock);
2526 /* init connection and backchannel */
2527 nfsd4_init_conn(rqstp, conn, new);
2528 nfsd4_put_session(new);
2529 if (old)
2530 expire_client(old);
2104 return status; 2531 return status;
2105out_free_conn: 2532out_free_conn:
2106 nfs4_unlock_state(); 2533 spin_unlock(&nn->client_lock);
2107 free_conn(conn); 2534 free_conn(conn);
2535 if (old)
2536 expire_client(old);
2108out_free_session: 2537out_free_session:
2109 __free_session(new); 2538 __free_session(new);
2110out_release_drc_mem: 2539out_release_drc_mem:
@@ -2152,17 +2581,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
2152 __be32 status; 2581 __be32 status;
2153 struct nfsd4_conn *conn; 2582 struct nfsd4_conn *conn;
2154 struct nfsd4_session *session; 2583 struct nfsd4_session *session;
2155 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2584 struct net *net = SVC_NET(rqstp);
2585 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2156 2586
2157 if (!nfsd4_last_compound_op(rqstp)) 2587 if (!nfsd4_last_compound_op(rqstp))
2158 return nfserr_not_only_op; 2588 return nfserr_not_only_op;
2159 nfs4_lock_state();
2160 spin_lock(&nn->client_lock); 2589 spin_lock(&nn->client_lock);
2161 session = find_in_sessionid_hashtbl(&bcts->sessionid, SVC_NET(rqstp)); 2590 session = find_in_sessionid_hashtbl(&bcts->sessionid, net, &status);
2162 spin_unlock(&nn->client_lock); 2591 spin_unlock(&nn->client_lock);
2163 status = nfserr_badsession;
2164 if (!session) 2592 if (!session)
2165 goto out; 2593 goto out_no_session;
2166 status = nfserr_wrong_cred; 2594 status = nfserr_wrong_cred;
2167 if (!mach_creds_match(session->se_client, rqstp)) 2595 if (!mach_creds_match(session->se_client, rqstp))
2168 goto out; 2596 goto out;
@@ -2176,7 +2604,8 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
2176 nfsd4_init_conn(rqstp, conn, session); 2604 nfsd4_init_conn(rqstp, conn, session);
2177 status = nfs_ok; 2605 status = nfs_ok;
2178out: 2606out:
2179 nfs4_unlock_state(); 2607 nfsd4_put_session(session);
2608out_no_session:
2180 return status; 2609 return status;
2181} 2610}
2182 2611
@@ -2195,9 +2624,9 @@ nfsd4_destroy_session(struct svc_rqst *r,
2195 struct nfsd4_session *ses; 2624 struct nfsd4_session *ses;
2196 __be32 status; 2625 __be32 status;
2197 int ref_held_by_me = 0; 2626 int ref_held_by_me = 0;
2198 struct nfsd_net *nn = net_generic(SVC_NET(r), nfsd_net_id); 2627 struct net *net = SVC_NET(r);
2628 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2199 2629
2200 nfs4_lock_state();
2201 status = nfserr_not_only_op; 2630 status = nfserr_not_only_op;
2202 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) { 2631 if (nfsd4_compound_in_session(cstate->session, &sessionid->sessionid)) {
2203 if (!nfsd4_last_compound_op(r)) 2632 if (!nfsd4_last_compound_op(r))
@@ -2206,14 +2635,12 @@ nfsd4_destroy_session(struct svc_rqst *r,
2206 } 2635 }
2207 dump_sessionid(__func__, &sessionid->sessionid); 2636 dump_sessionid(__func__, &sessionid->sessionid);
2208 spin_lock(&nn->client_lock); 2637 spin_lock(&nn->client_lock);
2209 ses = find_in_sessionid_hashtbl(&sessionid->sessionid, SVC_NET(r)); 2638 ses = find_in_sessionid_hashtbl(&sessionid->sessionid, net, &status);
2210 status = nfserr_badsession;
2211 if (!ses) 2639 if (!ses)
2212 goto out_client_lock; 2640 goto out_client_lock;
2213 status = nfserr_wrong_cred; 2641 status = nfserr_wrong_cred;
2214 if (!mach_creds_match(ses->se_client, r)) 2642 if (!mach_creds_match(ses->se_client, r))
2215 goto out_client_lock; 2643 goto out_put_session;
2216 nfsd4_get_session_locked(ses);
2217 status = mark_session_dead_locked(ses, 1 + ref_held_by_me); 2644 status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
2218 if (status) 2645 if (status)
2219 goto out_put_session; 2646 goto out_put_session;
@@ -2225,11 +2652,10 @@ nfsd4_destroy_session(struct svc_rqst *r,
2225 spin_lock(&nn->client_lock); 2652 spin_lock(&nn->client_lock);
2226 status = nfs_ok; 2653 status = nfs_ok;
2227out_put_session: 2654out_put_session:
2228 nfsd4_put_session(ses); 2655 nfsd4_put_session_locked(ses);
2229out_client_lock: 2656out_client_lock:
2230 spin_unlock(&nn->client_lock); 2657 spin_unlock(&nn->client_lock);
2231out: 2658out:
2232 nfs4_unlock_state();
2233 return status; 2659 return status;
2234} 2660}
2235 2661
@@ -2300,7 +2726,8 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2300 struct nfsd4_conn *conn; 2726 struct nfsd4_conn *conn;
2301 __be32 status; 2727 __be32 status;
2302 int buflen; 2728 int buflen;
2303 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2729 struct net *net = SVC_NET(rqstp);
2730 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
2304 2731
2305 if (resp->opcnt != 1) 2732 if (resp->opcnt != 1)
2306 return nfserr_sequence_pos; 2733 return nfserr_sequence_pos;
@@ -2314,17 +2741,10 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2314 return nfserr_jukebox; 2741 return nfserr_jukebox;
2315 2742
2316 spin_lock(&nn->client_lock); 2743 spin_lock(&nn->client_lock);
2317 status = nfserr_badsession; 2744 session = find_in_sessionid_hashtbl(&seq->sessionid, net, &status);
2318 session = find_in_sessionid_hashtbl(&seq->sessionid, SVC_NET(rqstp));
2319 if (!session) 2745 if (!session)
2320 goto out_no_session; 2746 goto out_no_session;
2321 clp = session->se_client; 2747 clp = session->se_client;
2322 status = get_client_locked(clp);
2323 if (status)
2324 goto out_no_session;
2325 status = nfsd4_get_session_locked(session);
2326 if (status)
2327 goto out_put_client;
2328 2748
2329 status = nfserr_too_many_ops; 2749 status = nfserr_too_many_ops;
2330 if (nfsd4_session_too_many_ops(rqstp, session)) 2750 if (nfsd4_session_too_many_ops(rqstp, session))
@@ -2354,6 +2774,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2354 goto out_put_session; 2774 goto out_put_session;
2355 cstate->slot = slot; 2775 cstate->slot = slot;
2356 cstate->session = session; 2776 cstate->session = session;
2777 cstate->clp = clp;
2357 /* Return the cached reply status and set cstate->status 2778 /* Return the cached reply status and set cstate->status
2358 * for nfsd4_proc_compound processing */ 2779 * for nfsd4_proc_compound processing */
2359 status = nfsd4_replay_cache_entry(resp, seq); 2780 status = nfsd4_replay_cache_entry(resp, seq);
@@ -2388,6 +2809,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
2388 2809
2389 cstate->slot = slot; 2810 cstate->slot = slot;
2390 cstate->session = session; 2811 cstate->session = session;
2812 cstate->clp = clp;
2391 2813
2392out: 2814out:
2393 switch (clp->cl_cb_state) { 2815 switch (clp->cl_cb_state) {
@@ -2408,31 +2830,48 @@ out_no_session:
2408 spin_unlock(&nn->client_lock); 2830 spin_unlock(&nn->client_lock);
2409 return status; 2831 return status;
2410out_put_session: 2832out_put_session:
2411 nfsd4_put_session(session); 2833 nfsd4_put_session_locked(session);
2412out_put_client:
2413 put_client_renew_locked(clp);
2414 goto out_no_session; 2834 goto out_no_session;
2415} 2835}
2416 2836
2837void
2838nfsd4_sequence_done(struct nfsd4_compoundres *resp)
2839{
2840 struct nfsd4_compound_state *cs = &resp->cstate;
2841
2842 if (nfsd4_has_session(cs)) {
2843 if (cs->status != nfserr_replay_cache) {
2844 nfsd4_store_cache_entry(resp);
2845 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
2846 }
2847 /* Drop session reference that was taken in nfsd4_sequence() */
2848 nfsd4_put_session(cs->session);
2849 } else if (cs->clp)
2850 put_client_renew(cs->clp);
2851}
2852
2417__be32 2853__be32
2418nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc) 2854nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
2419{ 2855{
2420 struct nfs4_client *conf, *unconf, *clp; 2856 struct nfs4_client *conf, *unconf;
2857 struct nfs4_client *clp = NULL;
2421 __be32 status = 0; 2858 __be32 status = 0;
2422 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2859 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2423 2860
2424 nfs4_lock_state(); 2861 spin_lock(&nn->client_lock);
2425 unconf = find_unconfirmed_client(&dc->clientid, true, nn); 2862 unconf = find_unconfirmed_client(&dc->clientid, true, nn);
2426 conf = find_confirmed_client(&dc->clientid, true, nn); 2863 conf = find_confirmed_client(&dc->clientid, true, nn);
2427 WARN_ON_ONCE(conf && unconf); 2864 WARN_ON_ONCE(conf && unconf);
2428 2865
2429 if (conf) { 2866 if (conf) {
2430 clp = conf;
2431
2432 if (client_has_state(conf)) { 2867 if (client_has_state(conf)) {
2433 status = nfserr_clientid_busy; 2868 status = nfserr_clientid_busy;
2434 goto out; 2869 goto out;
2435 } 2870 }
2871 status = mark_client_expired_locked(conf);
2872 if (status)
2873 goto out;
2874 clp = conf;
2436 } else if (unconf) 2875 } else if (unconf)
2437 clp = unconf; 2876 clp = unconf;
2438 else { 2877 else {
@@ -2440,12 +2879,15 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2440 goto out; 2879 goto out;
2441 } 2880 }
2442 if (!mach_creds_match(clp, rqstp)) { 2881 if (!mach_creds_match(clp, rqstp)) {
2882 clp = NULL;
2443 status = nfserr_wrong_cred; 2883 status = nfserr_wrong_cred;
2444 goto out; 2884 goto out;
2445 } 2885 }
2446 expire_client(clp); 2886 unhash_client_locked(clp);
2447out: 2887out:
2448 nfs4_unlock_state(); 2888 spin_unlock(&nn->client_lock);
2889 if (clp)
2890 expire_client(clp);
2449 return status; 2891 return status;
2450} 2892}
2451 2893
@@ -2464,7 +2906,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2464 return nfs_ok; 2906 return nfs_ok;
2465 } 2907 }
2466 2908
2467 nfs4_lock_state();
2468 status = nfserr_complete_already; 2909 status = nfserr_complete_already;
2469 if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, 2910 if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
2470 &cstate->session->se_client->cl_flags)) 2911 &cstate->session->se_client->cl_flags))
@@ -2484,7 +2925,6 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
2484 status = nfs_ok; 2925 status = nfs_ok;
2485 nfsd4_client_record_create(cstate->session->se_client); 2926 nfsd4_client_record_create(cstate->session->se_client);
2486out: 2927out:
2487 nfs4_unlock_state();
2488 return status; 2928 return status;
2489} 2929}
2490 2930
@@ -2494,12 +2934,16 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2494{ 2934{
2495 struct xdr_netobj clname = setclid->se_name; 2935 struct xdr_netobj clname = setclid->se_name;
2496 nfs4_verifier clverifier = setclid->se_verf; 2936 nfs4_verifier clverifier = setclid->se_verf;
2497 struct nfs4_client *conf, *unconf, *new; 2937 struct nfs4_client *conf, *new;
2938 struct nfs4_client *unconf = NULL;
2498 __be32 status; 2939 __be32 status;
2499 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 2940 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
2500 2941
2942 new = create_client(clname, rqstp, &clverifier);
2943 if (new == NULL)
2944 return nfserr_jukebox;
2501 /* Cases below refer to rfc 3530 section 14.2.33: */ 2945 /* Cases below refer to rfc 3530 section 14.2.33: */
2502 nfs4_lock_state(); 2946 spin_lock(&nn->client_lock);
2503 conf = find_confirmed_client_by_name(&clname, nn); 2947 conf = find_confirmed_client_by_name(&clname, nn);
2504 if (conf) { 2948 if (conf) {
2505 /* case 0: */ 2949 /* case 0: */
@@ -2517,11 +2961,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2517 } 2961 }
2518 unconf = find_unconfirmed_client_by_name(&clname, nn); 2962 unconf = find_unconfirmed_client_by_name(&clname, nn);
2519 if (unconf) 2963 if (unconf)
2520 expire_client(unconf); 2964 unhash_client_locked(unconf);
2521 status = nfserr_jukebox;
2522 new = create_client(clname, rqstp, &clverifier);
2523 if (new == NULL)
2524 goto out;
2525 if (conf && same_verf(&conf->cl_verifier, &clverifier)) 2965 if (conf && same_verf(&conf->cl_verifier, &clverifier))
2526 /* case 1: probable callback update */ 2966 /* case 1: probable callback update */
2527 copy_clid(new, conf); 2967 copy_clid(new, conf);
@@ -2533,9 +2973,14 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2533 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot; 2973 setclid->se_clientid.cl_boot = new->cl_clientid.cl_boot;
2534 setclid->se_clientid.cl_id = new->cl_clientid.cl_id; 2974 setclid->se_clientid.cl_id = new->cl_clientid.cl_id;
2535 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data)); 2975 memcpy(setclid->se_confirm.data, new->cl_confirm.data, sizeof(setclid->se_confirm.data));
2976 new = NULL;
2536 status = nfs_ok; 2977 status = nfs_ok;
2537out: 2978out:
2538 nfs4_unlock_state(); 2979 spin_unlock(&nn->client_lock);
2980 if (new)
2981 free_client(new);
2982 if (unconf)
2983 expire_client(unconf);
2539 return status; 2984 return status;
2540} 2985}
2541 2986
@@ -2546,6 +2991,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2546 struct nfsd4_setclientid_confirm *setclientid_confirm) 2991 struct nfsd4_setclientid_confirm *setclientid_confirm)
2547{ 2992{
2548 struct nfs4_client *conf, *unconf; 2993 struct nfs4_client *conf, *unconf;
2994 struct nfs4_client *old = NULL;
2549 nfs4_verifier confirm = setclientid_confirm->sc_confirm; 2995 nfs4_verifier confirm = setclientid_confirm->sc_confirm;
2550 clientid_t * clid = &setclientid_confirm->sc_clientid; 2996 clientid_t * clid = &setclientid_confirm->sc_clientid;
2551 __be32 status; 2997 __be32 status;
@@ -2553,8 +2999,8 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2553 2999
2554 if (STALE_CLIENTID(clid, nn)) 3000 if (STALE_CLIENTID(clid, nn))
2555 return nfserr_stale_clientid; 3001 return nfserr_stale_clientid;
2556 nfs4_lock_state();
2557 3002
3003 spin_lock(&nn->client_lock);
2558 conf = find_confirmed_client(clid, false, nn); 3004 conf = find_confirmed_client(clid, false, nn);
2559 unconf = find_unconfirmed_client(clid, false, nn); 3005 unconf = find_unconfirmed_client(clid, false, nn);
2560 /* 3006 /*
@@ -2578,22 +3024,30 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
2578 } 3024 }
2579 status = nfs_ok; 3025 status = nfs_ok;
2580 if (conf) { /* case 1: callback update */ 3026 if (conf) { /* case 1: callback update */
3027 old = unconf;
3028 unhash_client_locked(old);
2581 nfsd4_change_callback(conf, &unconf->cl_cb_conn); 3029 nfsd4_change_callback(conf, &unconf->cl_cb_conn);
2582 nfsd4_probe_callback(conf);
2583 expire_client(unconf);
2584 } else { /* case 3: normal case; new or rebooted client */ 3030 } else { /* case 3: normal case; new or rebooted client */
2585 conf = find_confirmed_client_by_name(&unconf->cl_name, nn); 3031 old = find_confirmed_client_by_name(&unconf->cl_name, nn);
2586 if (conf) { 3032 if (old) {
2587 status = mark_client_expired(conf); 3033 status = mark_client_expired_locked(old);
2588 if (status) 3034 if (status) {
3035 old = NULL;
2589 goto out; 3036 goto out;
2590 expire_client(conf); 3037 }
2591 } 3038 }
2592 move_to_confirmed(unconf); 3039 move_to_confirmed(unconf);
2593 nfsd4_probe_callback(unconf); 3040 conf = unconf;
2594 } 3041 }
3042 get_client_locked(conf);
3043 spin_unlock(&nn->client_lock);
3044 nfsd4_probe_callback(conf);
3045 spin_lock(&nn->client_lock);
3046 put_client_renew_locked(conf);
2595out: 3047out:
2596 nfs4_unlock_state(); 3048 spin_unlock(&nn->client_lock);
3049 if (old)
3050 expire_client(old);
2597 return status; 3051 return status;
2598} 3052}
2599 3053
@@ -2603,21 +3057,23 @@ static struct nfs4_file *nfsd4_alloc_file(void)
2603} 3057}
2604 3058
2605/* OPEN Share state helper functions */ 3059/* OPEN Share state helper functions */
2606static void nfsd4_init_file(struct nfs4_file *fp, struct inode *ino) 3060static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh)
2607{ 3061{
2608 unsigned int hashval = file_hashval(ino); 3062 unsigned int hashval = file_hashval(fh);
3063
3064 lockdep_assert_held(&state_lock);
2609 3065
2610 atomic_set(&fp->fi_ref, 1); 3066 atomic_set(&fp->fi_ref, 1);
3067 spin_lock_init(&fp->fi_lock);
2611 INIT_LIST_HEAD(&fp->fi_stateids); 3068 INIT_LIST_HEAD(&fp->fi_stateids);
2612 INIT_LIST_HEAD(&fp->fi_delegations); 3069 INIT_LIST_HEAD(&fp->fi_delegations);
2613 fp->fi_inode = igrab(ino); 3070 fh_copy_shallow(&fp->fi_fhandle, fh);
3071 fp->fi_deleg_file = NULL;
2614 fp->fi_had_conflict = false; 3072 fp->fi_had_conflict = false;
2615 fp->fi_lease = NULL; 3073 fp->fi_share_deny = 0;
2616 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3074 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2617 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3075 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2618 spin_lock(&state_lock);
2619 hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]); 3076 hlist_add_head(&fp->fi_hash, &file_hashtbl[hashval]);
2620 spin_unlock(&state_lock);
2621} 3077}
2622 3078
2623void 3079void
@@ -2673,6 +3129,27 @@ static void init_nfs4_replay(struct nfs4_replay *rp)
2673 rp->rp_status = nfserr_serverfault; 3129 rp->rp_status = nfserr_serverfault;
2674 rp->rp_buflen = 0; 3130 rp->rp_buflen = 0;
2675 rp->rp_buf = rp->rp_ibuf; 3131 rp->rp_buf = rp->rp_ibuf;
3132 mutex_init(&rp->rp_mutex);
3133}
3134
3135static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate,
3136 struct nfs4_stateowner *so)
3137{
3138 if (!nfsd4_has_session(cstate)) {
3139 mutex_lock(&so->so_replay.rp_mutex);
3140 cstate->replay_owner = nfs4_get_stateowner(so);
3141 }
3142}
3143
3144void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate)
3145{
3146 struct nfs4_stateowner *so = cstate->replay_owner;
3147
3148 if (so != NULL) {
3149 cstate->replay_owner = NULL;
3150 mutex_unlock(&so->so_replay.rp_mutex);
3151 nfs4_put_stateowner(so);
3152 }
2676} 3153}
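nfsd4_cstate_assign_replay() and nfsd4_cstate_clear_replay() bracket the v4.0 replay window with the owner's rp_mutex plus a reference. A toy analogue of the pairing, assuming an already initialized pthread mutex:

#include <pthread.h>
#include <stddef.h>

struct owner_sk { pthread_mutex_t mu; };	/* assume mu is initialized */

struct cstate_sk { struct owner_sk *replay_owner; };

static void assign_replay(struct cstate_sk *cs, struct owner_sk *so)
{
	pthread_mutex_lock(&so->mu);	/* held across the replay window */
	cs->replay_owner = so;
}

static void clear_replay(struct cstate_sk *cs)
{
	struct owner_sk *so = cs->replay_owner;

	if (so) {
		cs->replay_owner = NULL;
		pthread_mutex_unlock(&so->mu);
		/* nfsd also drops the stateowner reference here */
	}
}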
2677 3154
2678static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp) 3155static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj *owner, struct nfs4_client *clp)
@@ -2693,111 +3170,171 @@ static inline void *alloc_stateowner(struct kmem_cache *slab, struct xdr_netobj
2693 INIT_LIST_HEAD(&sop->so_stateids); 3170 INIT_LIST_HEAD(&sop->so_stateids);
2694 sop->so_client = clp; 3171 sop->so_client = clp;
2695 init_nfs4_replay(&sop->so_replay); 3172 init_nfs4_replay(&sop->so_replay);
3173 atomic_set(&sop->so_count, 1);
2696 return sop; 3174 return sop;
2697} 3175}
2698 3176
2699static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval) 3177static void hash_openowner(struct nfs4_openowner *oo, struct nfs4_client *clp, unsigned int strhashval)
2700{ 3178{
2701 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 3179 lockdep_assert_held(&clp->cl_lock);
2702 3180
2703 list_add(&oo->oo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]); 3181 list_add(&oo->oo_owner.so_strhash,
3182 &clp->cl_ownerstr_hashtbl[strhashval]);
2704 list_add(&oo->oo_perclient, &clp->cl_openowners); 3183 list_add(&oo->oo_perclient, &clp->cl_openowners);
2705} 3184}
2706 3185
3186static void nfs4_unhash_openowner(struct nfs4_stateowner *so)
3187{
3188 unhash_openowner_locked(openowner(so));
3189}
3190
3191static void nfs4_free_openowner(struct nfs4_stateowner *so)
3192{
3193 struct nfs4_openowner *oo = openowner(so);
3194
3195 kmem_cache_free(openowner_slab, oo);
3196}
3197
3198static const struct nfs4_stateowner_operations openowner_ops = {
3199 .so_unhash = nfs4_unhash_openowner,
3200 .so_free = nfs4_free_openowner,
3201};
3202
2707static struct nfs4_openowner * 3203static struct nfs4_openowner *
2708alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfsd4_open *open) { 3204alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
2709 struct nfs4_openowner *oo; 3205 struct nfsd4_compound_state *cstate)
3206{
3207 struct nfs4_client *clp = cstate->clp;
3208 struct nfs4_openowner *oo, *ret;
2710 3209
2711 oo = alloc_stateowner(openowner_slab, &open->op_owner, clp); 3210 oo = alloc_stateowner(openowner_slab, &open->op_owner, clp);
2712 if (!oo) 3211 if (!oo)
2713 return NULL; 3212 return NULL;
3213 oo->oo_owner.so_ops = &openowner_ops;
2714 oo->oo_owner.so_is_open_owner = 1; 3214 oo->oo_owner.so_is_open_owner = 1;
2715 oo->oo_owner.so_seqid = open->op_seqid; 3215 oo->oo_owner.so_seqid = open->op_seqid;
2716 oo->oo_flags = NFS4_OO_NEW; 3216 oo->oo_flags = 0;
3217 if (nfsd4_has_session(cstate))
3218 oo->oo_flags |= NFS4_OO_CONFIRMED;
2717 oo->oo_time = 0; 3219 oo->oo_time = 0;
2718 oo->oo_last_closed_stid = NULL; 3220 oo->oo_last_closed_stid = NULL;
2719 INIT_LIST_HEAD(&oo->oo_close_lru); 3221 INIT_LIST_HEAD(&oo->oo_close_lru);
2720 hash_openowner(oo, clp, strhashval); 3222 spin_lock(&clp->cl_lock);
3223 ret = find_openstateowner_str_locked(strhashval, open, clp);
3224 if (ret == NULL) {
3225 hash_openowner(oo, clp, strhashval);
3226 ret = oo;
3227 } else
3228 nfs4_free_openowner(&oo->oo_owner);
3229 spin_unlock(&clp->cl_lock);
2721 return oo; 3230 return oo;
2722} 3231}
2723 3232
2724static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 3233static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
2725 struct nfs4_openowner *oo = open->op_openowner; 3234 struct nfs4_openowner *oo = open->op_openowner;
2726 3235
3236 atomic_inc(&stp->st_stid.sc_count);
2727 stp->st_stid.sc_type = NFS4_OPEN_STID; 3237 stp->st_stid.sc_type = NFS4_OPEN_STID;
2728 INIT_LIST_HEAD(&stp->st_lockowners); 3238 INIT_LIST_HEAD(&stp->st_locks);
2729 list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); 3239 stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
2730 list_add(&stp->st_perfile, &fp->fi_stateids);
2731 stp->st_stateowner = &oo->oo_owner;
2732 get_nfs4_file(fp); 3240 get_nfs4_file(fp);
2733 stp->st_file = fp; 3241 stp->st_stid.sc_file = fp;
2734 stp->st_access_bmap = 0; 3242 stp->st_access_bmap = 0;
2735 stp->st_deny_bmap = 0; 3243 stp->st_deny_bmap = 0;
2736 set_access(open->op_share_access, stp);
2737 set_deny(open->op_share_deny, stp);
2738 stp->st_openstp = NULL; 3244 stp->st_openstp = NULL;
3245 spin_lock(&oo->oo_owner.so_client->cl_lock);
3246 list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
3247 spin_lock(&fp->fi_lock);
3248 list_add(&stp->st_perfile, &fp->fi_stateids);
3249 spin_unlock(&fp->fi_lock);
3250 spin_unlock(&oo->oo_owner.so_client->cl_lock);
2739} 3251}
2740 3252
3253/*
3254 * In the 4.0 case we need to keep the owners around a little while to handle
3255 * CLOSE replay. We still do need to release any file access that is held by
3256 * them before returning, however.
3257 */
2741static void 3258static void
2742move_to_close_lru(struct nfs4_openowner *oo, struct net *net) 3259move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
2743{ 3260{
2744 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 3261 struct nfs4_ol_stateid *last;
3262 struct nfs4_openowner *oo = openowner(s->st_stateowner);
3263 struct nfsd_net *nn = net_generic(s->st_stid.sc_client->net,
3264 nfsd_net_id);
2745 3265
2746 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo); 3266 dprintk("NFSD: move_to_close_lru nfs4_openowner %p\n", oo);
2747 3267
3268 /*
3269 * We know that we hold one reference via nfsd4_close, and another
3270 * "persistent" reference for the client. If the refcount is higher
3271 * than 2, then there are still calls in progress that are using this
3272 * stateid. We can't put the sc_file reference until they are finished.
3273 * Wait for the refcount to drop to 2. Since it has been unhashed,
3274 * there should be no danger of the refcount going back up again at
3275 * this point.
3276 */
3277 wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2);
3278
3279 release_all_access(s);
3280 if (s->st_stid.sc_file) {
3281 put_nfs4_file(s->st_stid.sc_file);
3282 s->st_stid.sc_file = NULL;
3283 }
3284
3285 spin_lock(&nn->client_lock);
3286 last = oo->oo_last_closed_stid;
3287 oo->oo_last_closed_stid = s;
2748 list_move_tail(&oo->oo_close_lru, &nn->close_lru); 3288 list_move_tail(&oo->oo_close_lru, &nn->close_lru);
2749 oo->oo_time = get_seconds(); 3289 oo->oo_time = get_seconds();
3290 spin_unlock(&nn->client_lock);
3291 if (last)
3292 nfs4_put_stid(&last->st_stid);
2750} 3293}
2751 3294
2752static int 3295/* search file_hashtbl[] for file */
2753same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner, 3296static struct nfs4_file *
2754 clientid_t *clid) 3297find_file_locked(struct knfsd_fh *fh)
2755{ 3298{
2756 return (sop->so_owner.len == owner->len) && 3299 unsigned int hashval = file_hashval(fh);
2757 0 == memcmp(sop->so_owner.data, owner->data, owner->len) && 3300 struct nfs4_file *fp;
2758 (sop->so_client->cl_clientid.cl_id == clid->cl_id);
2759}
2760 3301
2761static struct nfs4_openowner * 3302 lockdep_assert_held(&state_lock);
2762find_openstateowner_str(unsigned int hashval, struct nfsd4_open *open,
2763 bool sessions, struct nfsd_net *nn)
2764{
2765 struct nfs4_stateowner *so;
2766 struct nfs4_openowner *oo;
2767 struct nfs4_client *clp;
2768 3303
2769 list_for_each_entry(so, &nn->ownerstr_hashtbl[hashval], so_strhash) { 3304 hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
2770 if (!so->so_is_open_owner) 3305 if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
2771 continue; 3306 get_nfs4_file(fp);
2772 if (same_owner_str(so, &open->op_owner, &open->op_clientid)) { 3307 return fp;
2773 oo = openowner(so);
2774 clp = oo->oo_owner.so_client;
2775 if ((bool)clp->cl_minorversion != sessions)
2776 return NULL;
2777 renew_client(oo->oo_owner.so_client);
2778 return oo;
2779 } 3308 }
2780 } 3309 }
2781 return NULL; 3310 return NULL;
2782} 3311}
2783 3312
2784/* search file_hashtbl[] for file */
2785static struct nfs4_file * 3313static struct nfs4_file *
2786find_file(struct inode *ino) 3314find_file(struct knfsd_fh *fh)
2787{ 3315{
2788 unsigned int hashval = file_hashval(ino);
2789 struct nfs4_file *fp; 3316 struct nfs4_file *fp;
2790 3317
2791 spin_lock(&state_lock); 3318 spin_lock(&state_lock);
2792 hlist_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 3319 fp = find_file_locked(fh);
2793 if (fp->fi_inode == ino) { 3320 spin_unlock(&state_lock);
2794 get_nfs4_file(fp); 3321 return fp;
2795 spin_unlock(&state_lock); 3322}
2796 return fp; 3323
2797 } 3324static struct nfs4_file *
3325find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
3326{
3327 struct nfs4_file *fp;
3328
3329 spin_lock(&state_lock);
3330 fp = find_file_locked(fh);
3331 if (fp == NULL) {
3332 nfsd4_init_file(new, fh);
3333 fp = new;
2798 } 3334 }
2799 spin_unlock(&state_lock); 3335 spin_unlock(&state_lock);
2800 return NULL; 3336
3337 return fp;
2801} 3338}
2802 3339
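find_file_locked()/find_or_add_file() above use the preallocate-then-insert-under-lock pattern: the caller allocates a candidate entry outside the spinlock, the lookup and the conditional insert happen atomically under it, and if an existing entry wins the caller disposes of the unused candidate. A rough user-space analogue with hypothetical names (assumes the candidate was calloc'd by the caller):

#include <pthread.h>
#include <string.h>

#define TBL_SIZE 64

struct file_ent {
	struct file_ent *next;
	char key[32];
};

static struct file_ent *tbl[TBL_SIZE];
static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned hashkey(const char *k)
{
	unsigned h = 0;

	while (*k)
		h = h * 31 + (unsigned char)*k++;
	return h % TBL_SIZE;
}

/* caller must hold tbl_lock, mirroring find_file_locked() */
static struct file_ent *find_locked(const char *k, unsigned h)
{
	struct file_ent *f;

	for (f = tbl[h]; f; f = f->next)
		if (!strcmp(f->key, k))
			return f;
	return NULL;
}

/* "new" is preallocated by the caller outside the lock */
static struct file_ent *find_or_add(struct file_ent *new, const char *k)
{
	unsigned h = hashkey(k);
	struct file_ent *f;

	pthread_mutex_lock(&tbl_lock);
	f = find_locked(k, h);
	if (!f) {			/* initialize and hash the new entry */
		strncpy(new->key, k, sizeof(new->key) - 1);
		new->key[sizeof(new->key) - 1] = '\0';
		new->next = tbl[h];
		tbl[h] = new;
		f = new;
	}
	pthread_mutex_unlock(&tbl_lock);
	return f;			/* caller frees "new" if f != new */
}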
2803/* 3340/*
@@ -2807,63 +3344,109 @@ find_file(struct inode *ino)
2807static __be32 3344static __be32
2808nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) 3345nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
2809{ 3346{
2810 struct inode *ino = current_fh->fh_dentry->d_inode;
2811 struct nfs4_file *fp; 3347 struct nfs4_file *fp;
2812 struct nfs4_ol_stateid *stp; 3348 __be32 ret = nfs_ok;
2813 __be32 ret;
2814 3349
2815 fp = find_file(ino); 3350 fp = find_file(&current_fh->fh_handle);
2816 if (!fp) 3351 if (!fp)
2817 return nfs_ok; 3352 return ret;
2818 ret = nfserr_locked; 3353 /* Check for conflicting share reservations */
2819 /* Search for conflicting share reservations */ 3354 spin_lock(&fp->fi_lock);
2820 list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { 3355 if (fp->fi_share_deny & deny_type)
2821 if (test_deny(deny_type, stp) || 3356 ret = nfserr_locked;
2822 test_deny(NFS4_SHARE_DENY_BOTH, stp)) 3357 spin_unlock(&fp->fi_lock);
2823 goto out;
2824 }
2825 ret = nfs_ok;
2826out:
2827 put_nfs4_file(fp); 3358 put_nfs4_file(fp);
2828 return ret; 3359 return ret;
2829} 3360}
2830 3361
2831static void nfsd_break_one_deleg(struct nfs4_delegation *dp) 3362static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
2832{ 3363{
2833 struct nfs4_client *clp = dp->dl_stid.sc_client; 3364 struct nfs4_delegation *dp = cb_to_delegation(cb);
2834 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 3365 struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net,
3366 nfsd_net_id);
2835 3367
2836 lockdep_assert_held(&state_lock); 3368 block_delegations(&dp->dl_stid.sc_file->fi_fhandle);
2837 /* We're assuming the state code never drops its reference 3369
2838 * without first removing the lease. Since we're in this lease 3370 /*
2839 * callback (and since the lease code is serialized by the kernel 3371 * We can't do this in nfsd_break_deleg_cb because it is
2840 * lock) we know the server hasn't removed the lease yet, we know 3372 * already holding inode->i_lock.
2841 * it's safe to take a reference: */ 3373 *
2842 atomic_inc(&dp->dl_count); 3374 * If the dl_time != 0, then we know that it has already been
3375 * queued for a lease break. Don't queue it again.
3376 */
3377 spin_lock(&state_lock);
3378 if (dp->dl_time == 0) {
3379 dp->dl_time = get_seconds();
3380 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru);
3381 }
3382 spin_unlock(&state_lock);
3383}
3384
3385static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
3386 struct rpc_task *task)
3387{
3388 struct nfs4_delegation *dp = cb_to_delegation(cb);
3389
3390 switch (task->tk_status) {
3391 case 0:
3392 return 1;
3393 case -EBADHANDLE:
3394 case -NFS4ERR_BAD_STATEID:
3395 /*
3396 * Race: client probably got cb_recall before open reply
3397 * granting delegation.
3398 */
3399 if (dp->dl_retries--) {
3400 rpc_delay(task, 2 * HZ);
3401 return 0;
3402 }
3403 /*FALLTHRU*/
3404 default:
3405 return -1;
3406 }
3407}
2843 3408
2844 list_add_tail(&dp->dl_recall_lru, &nn->del_recall_lru); 3409static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
3410{
3411 struct nfs4_delegation *dp = cb_to_delegation(cb);
2845 3412
2846 /* Only place dl_time is set; protected by i_lock: */ 3413 nfs4_put_stid(&dp->dl_stid);
2847 dp->dl_time = get_seconds(); 3414}
2848 3415
2849 block_delegations(&dp->dl_fh); 3416static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
3417 .prepare = nfsd4_cb_recall_prepare,
3418 .done = nfsd4_cb_recall_done,
3419 .release = nfsd4_cb_recall_release,
3420};
2850 3421
2851 nfsd4_cb_recall(dp); 3422static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
3423{
3424 /*
3425 * We're assuming the state code never drops its reference
3426 * without first removing the lease. Since we're in this lease
3427 * callback (and since the lease code is serialized by the kernel
3428 * lock) we know the server hasn't removed the lease yet, we know
3429 * it's safe to take a reference.
3430 */
3431 atomic_inc(&dp->dl_stid.sc_count);
3432 nfsd4_run_cb(&dp->dl_recall);
2852} 3433}
2853 3434
2854/* Called from break_lease() with i_lock held. */ 3435/* Called from break_lease() with i_lock held. */
2855static void nfsd_break_deleg_cb(struct file_lock *fl) 3436static bool
3437nfsd_break_deleg_cb(struct file_lock *fl)
2856{ 3438{
3439 bool ret = false;
2857 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; 3440 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2858 struct nfs4_delegation *dp; 3441 struct nfs4_delegation *dp;
2859 3442
2860 if (!fp) { 3443 if (!fp) {
2861 WARN(1, "(%p)->fl_owner NULL\n", fl); 3444 WARN(1, "(%p)->fl_owner NULL\n", fl);
2862 return; 3445 return ret;
2863 } 3446 }
2864 if (fp->fi_had_conflict) { 3447 if (fp->fi_had_conflict) {
2865 WARN(1, "duplicate break on %p\n", fp); 3448 WARN(1, "duplicate break on %p\n", fp);
2866 return; 3449 return ret;
2867 } 3450 }
2868 /* 3451 /*
2869 * We don't want the locks code to time out the lease for us; 3452 * We don't want the locks code to time out the lease for us;
@@ -2872,18 +3455,26 @@ static void nfsd_break_deleg_cb(struct file_lock *fl)
2872 */ 3455 */
2873 fl->fl_break_time = 0; 3456 fl->fl_break_time = 0;
2874 3457
2875 spin_lock(&state_lock); 3458 spin_lock(&fp->fi_lock);
2876 fp->fi_had_conflict = true; 3459 fp->fi_had_conflict = true;
2877 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) 3460 /*
2878 nfsd_break_one_deleg(dp); 3461 * If there are no delegations on the list, then return true
2879 spin_unlock(&state_lock); 3462 * so that the lease code will go ahead and delete it.
3463 */
3464 if (list_empty(&fp->fi_delegations))
3465 ret = true;
3466 else
3467 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
3468 nfsd_break_one_deleg(dp);
3469 spin_unlock(&fp->fi_lock);
3470 return ret;
2880} 3471}
2881 3472
2882static 3473static int
2883int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) 3474nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose)
2884{ 3475{
2885 if (arg & F_UNLCK) 3476 if (arg & F_UNLCK)
2886 return lease_modify(onlist, arg); 3477 return lease_modify(onlist, arg, dispose);
2887 else 3478 else
2888 return -EAGAIN; 3479 return -EAGAIN;
2889} 3480}
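In nfsd4_cb_recall_prepare() above, dl_time doubles as an "already queued" flag: it is only ever set once under state_lock, so a delegation lands on del_recall_lru at most once no matter how many lease breaks fire. The idempotent-enqueue idea in miniature (illustrative sketch, not kernel code):

#include <pthread.h>
#include <time.h>

struct deleg {
	struct deleg *next;
	time_t queued_at;	/* 0 means "not yet on the recall list" */
};

static struct deleg *recall_list;
static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

/* safe to call repeatedly; only the first call actually enqueues */
static void queue_recall(struct deleg *dp)
{
	pthread_mutex_lock(&state_lock);
	if (dp->queued_at == 0) {
		dp->queued_at = time(NULL);
		dp->next = recall_list;
		recall_list = dp;
	}
	pthread_mutex_unlock(&state_lock);
}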
@@ -2904,6 +3495,42 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
2904 return nfserr_bad_seqid; 3495 return nfserr_bad_seqid;
2905} 3496}
2906 3497
3498static __be32 lookup_clientid(clientid_t *clid,
3499 struct nfsd4_compound_state *cstate,
3500 struct nfsd_net *nn)
3501{
3502 struct nfs4_client *found;
3503
3504 if (cstate->clp) {
3505 found = cstate->clp;
3506 if (!same_clid(&found->cl_clientid, clid))
3507 return nfserr_stale_clientid;
3508 return nfs_ok;
3509 }
3510
3511 if (STALE_CLIENTID(clid, nn))
3512 return nfserr_stale_clientid;
3513
3514 /*
3515 * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
3516 * cached already then we know this is for v4.0 and "sessions"
3517 * will be false.
3518 */
3519 WARN_ON_ONCE(cstate->session);
3520 spin_lock(&nn->client_lock);
3521 found = find_confirmed_client(clid, false, nn);
3522 if (!found) {
3523 spin_unlock(&nn->client_lock);
3524 return nfserr_expired;
3525 }
3526 atomic_inc(&found->cl_refcount);
3527 spin_unlock(&nn->client_lock);
3528
3529 /* Cache the nfs4_client in cstate! */
3530 cstate->clp = found;
3531 return nfs_ok;
3532}
3533
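lookup_clientid() memoizes the client in the compound state: v4.1+ compounds already have it from SEQUENCE, and for v4.0 the first lookup pins a reference that later ops in the same compound reuse. Roughly, as a memoized lookup (sketch; the types and the find_confirmed() helper are invented stand-ins):

#include <pthread.h>
#include <stdatomic.h>

struct client { atomic_int ref; int id; };
struct compound { struct client *clp; };

static pthread_mutex_t client_lock = PTHREAD_MUTEX_INITIALIZER;

struct client *find_confirmed(int id);	/* hash lookup; lock held */

static int lookup_client(struct compound *cs, int id)
{
	struct client *c;

	if (cs->clp)			/* already cached this compound */
		return cs->clp->id == id ? 0 : -1;

	pthread_mutex_lock(&client_lock);
	c = find_confirmed(id);
	if (c)
		atomic_fetch_add(&c->ref, 1);	/* pin while cached in cs */
	pthread_mutex_unlock(&client_lock);
	if (!c)
		return -1;
	cs->clp = c;			/* released when the compound ends */
	return 0;
}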
2907__be32 3534__be32
2908nfsd4_process_open1(struct nfsd4_compound_state *cstate, 3535nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2909 struct nfsd4_open *open, struct nfsd_net *nn) 3536 struct nfsd4_open *open, struct nfsd_net *nn)
@@ -2924,19 +3551,19 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2924 if (open->op_file == NULL) 3551 if (open->op_file == NULL)
2925 return nfserr_jukebox; 3552 return nfserr_jukebox;
2926 3553
2927 strhashval = ownerstr_hashval(clientid->cl_id, &open->op_owner); 3554 status = lookup_clientid(clientid, cstate, nn);
2928 oo = find_openstateowner_str(strhashval, open, cstate->minorversion, nn); 3555 if (status)
3556 return status;
3557 clp = cstate->clp;
3558
3559 strhashval = ownerstr_hashval(&open->op_owner);
3560 oo = find_openstateowner_str(strhashval, open, clp);
2929 open->op_openowner = oo; 3561 open->op_openowner = oo;
2930 if (!oo) { 3562 if (!oo) {
2931 clp = find_confirmed_client(clientid, cstate->minorversion,
2932 nn);
2933 if (clp == NULL)
2934 return nfserr_expired;
2935 goto new_owner; 3563 goto new_owner;
2936 } 3564 }
2937 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { 3565 if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
2938 /* Replace unconfirmed owners without checking for replay. */ 3566 /* Replace unconfirmed owners without checking for replay. */
2939 clp = oo->oo_owner.so_client;
2940 release_openowner(oo); 3567 release_openowner(oo);
2941 open->op_openowner = NULL; 3568 open->op_openowner = NULL;
2942 goto new_owner; 3569 goto new_owner;
@@ -2944,15 +3571,14 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2944 status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid); 3571 status = nfsd4_check_seqid(cstate, &oo->oo_owner, open->op_seqid);
2945 if (status) 3572 if (status)
2946 return status; 3573 return status;
2947 clp = oo->oo_owner.so_client;
2948 goto alloc_stateid; 3574 goto alloc_stateid;
2949new_owner: 3575new_owner:
2950 oo = alloc_init_open_stateowner(strhashval, clp, open); 3576 oo = alloc_init_open_stateowner(strhashval, open, cstate);
2951 if (oo == NULL) 3577 if (oo == NULL)
2952 return nfserr_jukebox; 3578 return nfserr_jukebox;
2953 open->op_openowner = oo; 3579 open->op_openowner = oo;
2954alloc_stateid: 3580alloc_stateid:
2955 open->op_stp = nfs4_alloc_stateid(clp); 3581 open->op_stp = nfs4_alloc_open_stateid(clp);
2956 if (!open->op_stp) 3582 if (!open->op_stp)
2957 return nfserr_jukebox; 3583 return nfserr_jukebox;
2958 return nfs_ok; 3584 return nfs_ok;
@@ -2994,14 +3620,18 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
2994{ 3620{
2995 int flags; 3621 int flags;
2996 __be32 status = nfserr_bad_stateid; 3622 __be32 status = nfserr_bad_stateid;
3623 struct nfs4_delegation *deleg;
2997 3624
2998 *dp = find_deleg_stateid(cl, &open->op_delegate_stateid); 3625 deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
2999 if (*dp == NULL) 3626 if (deleg == NULL)
3000 goto out; 3627 goto out;
3001 flags = share_access_to_flags(open->op_share_access); 3628 flags = share_access_to_flags(open->op_share_access);
3002 status = nfs4_check_delegmode(*dp, flags); 3629 status = nfs4_check_delegmode(deleg, flags);
3003 if (status) 3630 if (status) {
3004 *dp = NULL; 3631 nfs4_put_stid(&deleg->dl_stid);
3632 goto out;
3633 }
3634 *dp = deleg;
3005out: 3635out:
3006 if (!nfsd4_is_deleg_cur(open)) 3636 if (!nfsd4_is_deleg_cur(open))
3007 return nfs_ok; 3637 return nfs_ok;
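Note the reworked error path in nfs4_check_deleg(): find_deleg_stateid() now returns a referenced stateid, so a failed delegmode check must drop that reference, and the out-parameter is assigned only on success so callers never see a stale pointer. The same shape, reduced (sketch with hypothetical helpers):

struct obj;
struct obj *obj_find(void);		/* returns a referenced object or NULL */
void obj_put(struct obj *o);		/* drops a reference */
int check_mode(struct obj *o, int want);	/* nonzero on mismatch */

static int check_and_return(struct obj **out, int want)
{
	struct obj *o = obj_find();

	if (!o)
		return -1;
	if (check_mode(o, want)) {
		obj_put(o);		/* failure: we still owned a reference */
		return -1;
	}
	*out = o;			/* success: reference transfers to caller */
	return 0;
}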
@@ -3011,24 +3641,25 @@ out:
3011 return nfs_ok; 3641 return nfs_ok;
3012} 3642}
3013 3643
3014static __be32 3644static struct nfs4_ol_stateid *
3015nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_ol_stateid **stpp) 3645nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
3016{ 3646{
3017 struct nfs4_ol_stateid *local; 3647 struct nfs4_ol_stateid *local, *ret = NULL;
3018 struct nfs4_openowner *oo = open->op_openowner; 3648 struct nfs4_openowner *oo = open->op_openowner;
3019 3649
3650 spin_lock(&fp->fi_lock);
3020 list_for_each_entry(local, &fp->fi_stateids, st_perfile) { 3651 list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
3021 /* ignore lock owners */ 3652 /* ignore lock owners */
3022 if (local->st_stateowner->so_is_open_owner == 0) 3653 if (local->st_stateowner->so_is_open_owner == 0)
3023 continue; 3654 continue;
3024 /* remember if we have seen this open owner */ 3655 if (local->st_stateowner == &oo->oo_owner) {
3025 if (local->st_stateowner == &oo->oo_owner) 3656 ret = local;
3026 *stpp = local; 3657 atomic_inc(&ret->st_stid.sc_count);
3027 /* check for conflicting share reservations */ 3658 break;
3028 if (!test_share(local, open)) 3659 }
3029 return nfserr_share_denied;
3030 } 3660 }
3031 return nfs_ok; 3661 spin_unlock(&fp->fi_lock);
3662 return ret;
3032} 3663}
3033 3664
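nfsd4_find_existing_open() bumps sc_count while fi_lock is still held; taking the reference before the lock is dropped is what keeps a concurrent unhash from freeing the object out from under the returned pointer. Generic shape (user-space sketch, hypothetical names):

#include <pthread.h>
#include <stdatomic.h>

struct stateid { struct stateid *next; atomic_int ref; void *owner; };

static pthread_mutex_t fi_lock = PTHREAD_MUTEX_INITIALIZER;
static struct stateid *per_file_list;

static struct stateid *find_existing(void *owner)
{
	struct stateid *s, *ret = NULL;

	pthread_mutex_lock(&fi_lock);
	for (s = per_file_list; s; s = s->next) {
		if (s->owner == owner) {
			/* take the reference before dropping the lock */
			atomic_fetch_add(&s->ref, 1);
			ret = s;
			break;
		}
	}
	pthread_mutex_unlock(&fi_lock);
	return ret;
}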
3034static inline int nfs4_access_to_access(u32 nfs4_access) 3665static inline int nfs4_access_to_access(u32 nfs4_access)
@@ -3042,24 +3673,6 @@ static inline int nfs4_access_to_access(u32 nfs4_access)
3042 return flags; 3673 return flags;
3043} 3674}
3044 3675
3045static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
3046 struct svc_fh *cur_fh, struct nfsd4_open *open)
3047{
3048 __be32 status;
3049 int oflag = nfs4_access_to_omode(open->op_share_access);
3050 int access = nfs4_access_to_access(open->op_share_access);
3051
3052 if (!fp->fi_fds[oflag]) {
3053 status = nfsd_open(rqstp, cur_fh, S_IFREG, access,
3054 &fp->fi_fds[oflag]);
3055 if (status)
3056 return status;
3057 }
3058 nfs4_file_get_access(fp, oflag);
3059
3060 return nfs_ok;
3061}
3062
3063static inline __be32 3676static inline __be32
3064nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, 3677nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
3065 struct nfsd4_open *open) 3678 struct nfsd4_open *open)
@@ -3075,34 +3688,99 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
3075 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0); 3688 return nfsd_setattr(rqstp, fh, &iattr, 0, (time_t)0);
3076} 3689}
3077 3690
3078static __be32 3691static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
3079nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) 3692 struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
3693 struct nfsd4_open *open)
3080{ 3694{
3081 u32 op_share_access = open->op_share_access; 3695 struct file *filp = NULL;
3082 bool new_access;
3083 __be32 status; 3696 __be32 status;
3697 int oflag = nfs4_access_to_omode(open->op_share_access);
3698 int access = nfs4_access_to_access(open->op_share_access);
3699 unsigned char old_access_bmap, old_deny_bmap;
3084 3700
3085 new_access = !test_access(op_share_access, stp); 3701 spin_lock(&fp->fi_lock);
3086 if (new_access) { 3702
3087 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open); 3703 /*
3088 if (status) 3704 * Are we trying to set a deny mode that would conflict with
3089 return status; 3705 * current access?
3706 */
3707 status = nfs4_file_check_deny(fp, open->op_share_deny);
3708 if (status != nfs_ok) {
3709 spin_unlock(&fp->fi_lock);
3710 goto out;
3090 } 3711 }
3091 status = nfsd4_truncate(rqstp, cur_fh, open); 3712
3092 if (status) { 3713 /* set access to the file */
3093 if (new_access) { 3714 status = nfs4_file_get_access(fp, open->op_share_access);
3094 int oflag = nfs4_access_to_omode(op_share_access); 3715 if (status != nfs_ok) {
3095 nfs4_file_put_access(fp, oflag); 3716 spin_unlock(&fp->fi_lock);
3096 } 3717 goto out;
3097 return status;
3098 } 3718 }
3099 /* remember the open */ 3719
3100 set_access(op_share_access, stp); 3720 /* Set access bits in stateid */
3721 old_access_bmap = stp->st_access_bmap;
3722 set_access(open->op_share_access, stp);
3723
3724 /* Set new deny mask */
3725 old_deny_bmap = stp->st_deny_bmap;
3101 set_deny(open->op_share_deny, stp); 3726 set_deny(open->op_share_deny, stp);
3727 fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
3102 3728
3103 return nfs_ok; 3729 if (!fp->fi_fds[oflag]) {
3730 spin_unlock(&fp->fi_lock);
3731 status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp);
3732 if (status)
3733 goto out_put_access;
3734 spin_lock(&fp->fi_lock);
3735 if (!fp->fi_fds[oflag]) {
3736 fp->fi_fds[oflag] = filp;
3737 filp = NULL;
3738 }
3739 }
3740 spin_unlock(&fp->fi_lock);
3741 if (filp)
3742 fput(filp);
3743
3744 status = nfsd4_truncate(rqstp, cur_fh, open);
3745 if (status)
3746 goto out_put_access;
3747out:
3748 return status;
3749out_put_access:
3750 stp->st_access_bmap = old_access_bmap;
3751 nfs4_file_put_access(fp, open->op_share_access);
3752 reset_union_bmap_deny(bmap_to_share_mode(old_deny_bmap), stp);
3753 goto out;
3104} 3754}
3105 3755
3756static __be32
3757nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
3758{
3759 __be32 status;
3760 unsigned char old_deny_bmap;
3761
3762 if (!test_access(open->op_share_access, stp))
3763 return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
3764
3765 /* test and set deny mode */
3766 spin_lock(&fp->fi_lock);
3767 status = nfs4_file_check_deny(fp, open->op_share_deny);
3768 if (status == nfs_ok) {
3769 old_deny_bmap = stp->st_deny_bmap;
3770 set_deny(open->op_share_deny, stp);
3771 fp->fi_share_deny |=
3772 (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
3773 }
3774 spin_unlock(&fp->fi_lock);
3775
3776 if (status != nfs_ok)
3777 return status;
3778
3779 status = nfsd4_truncate(rqstp, cur_fh, open);
3780 if (status != nfs_ok)
3781 reset_union_bmap_deny(old_deny_bmap, stp);
3782 return status;
3783}
3106 3784
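The rewritten nfs4_get_vfs_file() above shows the standard way to perform a blocking operation that a spinlock cannot cover: drop fi_lock, do the open, retake the lock, and re-check; if another thread installed a file in the meantime, keep theirs and put ours after unlocking. A condensed user-space analogue (assumes POSIX open/close; the cached fd stands in for fi_fds[oflag]):

#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t fi_lock = PTHREAD_MUTEX_INITIALIZER;
static int fi_fd = -1;			/* cached open fd, -1 if none */

static int get_vfs_file(const char *path)
{
	int fd = -1;

	pthread_mutex_lock(&fi_lock);
	if (fi_fd < 0) {
		pthread_mutex_unlock(&fi_lock);	/* can't open under the lock */
		fd = open(path, O_RDONLY);
		if (fd < 0)
			return -1;
		pthread_mutex_lock(&fi_lock);
		if (fi_fd < 0) {		/* race breaker: we won */
			fi_fd = fd;
			fd = -1;		/* ownership transferred */
		}
	}
	pthread_mutex_unlock(&fi_lock);
	if (fd >= 0)
		close(fd);			/* lost the race: drop ours */
	return 0;
}

The same drop-and-recheck move appears again in nfs4_setlease() below, where fi_deleg_file is the race-broken slot.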
3107static void 3785static void
3108nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session) 3786nfs4_set_claim_prev(struct nfsd4_open *open, bool has_session)
@@ -3123,65 +3801,112 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
3123 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; 3801 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
3124} 3802}
3125 3803
3126static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag) 3804static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
3127{ 3805{
3128 struct file_lock *fl; 3806 struct file_lock *fl;
3129 3807
3130 fl = locks_alloc_lock(); 3808 fl = locks_alloc_lock();
3131 if (!fl) 3809 if (!fl)
3132 return NULL; 3810 return NULL;
3133 locks_init_lock(fl);
3134 fl->fl_lmops = &nfsd_lease_mng_ops; 3811 fl->fl_lmops = &nfsd_lease_mng_ops;
3135 fl->fl_flags = FL_DELEG; 3812 fl->fl_flags = FL_DELEG;
3136 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; 3813 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
3137 fl->fl_end = OFFSET_MAX; 3814 fl->fl_end = OFFSET_MAX;
3138 fl->fl_owner = (fl_owner_t)(dp->dl_file); 3815 fl->fl_owner = (fl_owner_t)fp;
3139 fl->fl_pid = current->tgid; 3816 fl->fl_pid = current->tgid;
3140 return fl; 3817 return fl;
3141} 3818}
3142 3819
3143static int nfs4_setlease(struct nfs4_delegation *dp) 3820static int nfs4_setlease(struct nfs4_delegation *dp)
3144{ 3821{
3145 struct nfs4_file *fp = dp->dl_file; 3822 struct nfs4_file *fp = dp->dl_stid.sc_file;
3146 struct file_lock *fl; 3823 struct file_lock *fl, *ret;
3147 int status; 3824 struct file *filp;
3825 int status = 0;
3148 3826
3149 fl = nfs4_alloc_init_lease(dp, NFS4_OPEN_DELEGATE_READ); 3827 fl = nfs4_alloc_init_lease(fp, NFS4_OPEN_DELEGATE_READ);
3150 if (!fl) 3828 if (!fl)
3151 return -ENOMEM; 3829 return -ENOMEM;
3152 fl->fl_file = find_readable_file(fp); 3830 filp = find_readable_file(fp);
3153 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl); 3831 if (!filp) {
3832 /* We should always have a readable file here */
3833 WARN_ON_ONCE(1);
3834 return -EBADF;
3835 }
3836 fl->fl_file = filp;
3837 ret = fl;
3838 status = vfs_setlease(filp, fl->fl_type, &fl, NULL);
3839 if (fl)
3840 locks_free_lock(fl);
3154 if (status) 3841 if (status)
3155 goto out_free; 3842 goto out_fput;
3156 fp->fi_lease = fl;
3157 fp->fi_deleg_file = get_file(fl->fl_file);
3158 atomic_set(&fp->fi_delegees, 1);
3159 spin_lock(&state_lock); 3843 spin_lock(&state_lock);
3844 spin_lock(&fp->fi_lock);
3845 /* Did the lease get broken before we took the lock? */
3846 status = -EAGAIN;
3847 if (fp->fi_had_conflict)
3848 goto out_unlock;
3849 /* Race breaker */
3850 if (fp->fi_deleg_file) {
3851 status = 0;
3852 atomic_inc(&fp->fi_delegees);
3853 hash_delegation_locked(dp, fp);
3854 goto out_unlock;
3855 }
3856 fp->fi_deleg_file = filp;
3857 atomic_set(&fp->fi_delegees, 1);
3160 hash_delegation_locked(dp, fp); 3858 hash_delegation_locked(dp, fp);
3859 spin_unlock(&fp->fi_lock);
3161 spin_unlock(&state_lock); 3860 spin_unlock(&state_lock);
3162 return 0; 3861 return 0;
3163out_free: 3862out_unlock:
3164 locks_free_lock(fl); 3863 spin_unlock(&fp->fi_lock);
3864 spin_unlock(&state_lock);
3865out_fput:
3866 fput(filp);
3165 return status; 3867 return status;
3166} 3868}
3167 3869
3168static int nfs4_set_delegation(struct nfs4_delegation *dp, struct nfs4_file *fp) 3870static struct nfs4_delegation *
3871nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3872 struct nfs4_file *fp)
3169{ 3873{
3874 int status;
3875 struct nfs4_delegation *dp;
3876
3170 if (fp->fi_had_conflict) 3877 if (fp->fi_had_conflict)
3171 return -EAGAIN; 3878 return ERR_PTR(-EAGAIN);
3879
3880 dp = alloc_init_deleg(clp, fh);
3881 if (!dp)
3882 return ERR_PTR(-ENOMEM);
3883
3172 get_nfs4_file(fp); 3884 get_nfs4_file(fp);
3173 dp->dl_file = fp;
3174 if (!fp->fi_lease)
3175 return nfs4_setlease(dp);
3176 spin_lock(&state_lock); 3885 spin_lock(&state_lock);
3886 spin_lock(&fp->fi_lock);
3887 dp->dl_stid.sc_file = fp;
3888 if (!fp->fi_deleg_file) {
3889 spin_unlock(&fp->fi_lock);
3890 spin_unlock(&state_lock);
3891 status = nfs4_setlease(dp);
3892 goto out;
3893 }
3177 atomic_inc(&fp->fi_delegees); 3894 atomic_inc(&fp->fi_delegees);
3178 if (fp->fi_had_conflict) { 3895 if (fp->fi_had_conflict) {
3179 spin_unlock(&state_lock); 3896 status = -EAGAIN;
3180 return -EAGAIN; 3897 goto out_unlock;
3181 } 3898 }
3182 hash_delegation_locked(dp, fp); 3899 hash_delegation_locked(dp, fp);
3900 status = 0;
3901out_unlock:
3902 spin_unlock(&fp->fi_lock);
3183 spin_unlock(&state_lock); 3903 spin_unlock(&state_lock);
3184 return 0; 3904out:
3905 if (status) {
3906 nfs4_put_stid(&dp->dl_stid);
3907 return ERR_PTR(status);
3908 }
3909 return dp;
3185} 3910}
3186 3911
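nfs4_set_delegation() now returns the delegation itself and folds an errno into the pointer on failure, per the kernel's ERR_PTR convention. The idiom reduced to its essentials (sketch; in the kernel these helpers live in <linux/err.h>):

#include <stdint.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
	return (void *)(intptr_t)error;	/* e.g. ERR_PTR(-EAGAIN) */
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)(intptr_t)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

Callers test the result with IS_ERR(), which is exactly what the nfs4_open_delegation() hunk further down does.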
3187static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status) 3912static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
@@ -3212,11 +3937,12 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
3212 * proper support for them. 3937 * proper support for them.
3213 */ 3938 */
3214static void 3939static void
3215nfs4_open_delegation(struct net *net, struct svc_fh *fh, 3940nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
3216 struct nfsd4_open *open, struct nfs4_ol_stateid *stp) 3941 struct nfs4_ol_stateid *stp)
3217{ 3942{
3218 struct nfs4_delegation *dp; 3943 struct nfs4_delegation *dp;
3219 struct nfs4_openowner *oo = container_of(stp->st_stateowner, struct nfs4_openowner, oo_owner); 3944 struct nfs4_openowner *oo = openowner(stp->st_stateowner);
3945 struct nfs4_client *clp = stp->st_stid.sc_client;
3220 int cb_up; 3946 int cb_up;
3221 int status = 0; 3947 int status = 0;
3222 3948
@@ -3235,7 +3961,7 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3235 * Let's not give out any delegations till everyone's 3961 * Let's not give out any delegations till everyone's
3236 * had the chance to reclaim theirs.... 3962 * had the chance to reclaim theirs....
3237 */ 3963 */
3238 if (locks_in_grace(net)) 3964 if (locks_in_grace(clp->net))
3239 goto out_no_deleg; 3965 goto out_no_deleg;
3240 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED)) 3966 if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
3241 goto out_no_deleg; 3967 goto out_no_deleg;
@@ -3254,21 +3980,17 @@ nfs4_open_delegation(struct net *net, struct svc_fh *fh,
3254 default: 3980 default:
3255 goto out_no_deleg; 3981 goto out_no_deleg;
3256 } 3982 }
3257 dp = alloc_init_deleg(oo->oo_owner.so_client, stp, fh); 3983 dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file);
3258 if (dp == NULL) 3984 if (IS_ERR(dp))
3259 goto out_no_deleg; 3985 goto out_no_deleg;
3260 status = nfs4_set_delegation(dp, stp->st_file);
3261 if (status)
3262 goto out_free;
3263 3986
3264 memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); 3987 memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
3265 3988
3266 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n", 3989 dprintk("NFSD: delegation stateid=" STATEID_FMT "\n",
3267 STATEID_VAL(&dp->dl_stid.sc_stateid)); 3990 STATEID_VAL(&dp->dl_stid.sc_stateid));
3268 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; 3991 open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
3992 nfs4_put_stid(&dp->dl_stid);
3269 return; 3993 return;
3270out_free:
3271 destroy_delegation(dp);
3272out_no_deleg: 3994out_no_deleg:
3273 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE; 3995 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE;
3274 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && 3996 if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS &&
@@ -3301,16 +4023,12 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
3301 */ 4023 */
3302} 4024}
3303 4025
3304/*
3305 * called with nfs4_lock_state() held.
3306 */
3307__be32 4026__be32
3308nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 4027nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
3309{ 4028{
3310 struct nfsd4_compoundres *resp = rqstp->rq_resp; 4029 struct nfsd4_compoundres *resp = rqstp->rq_resp;
3311 struct nfs4_client *cl = open->op_openowner->oo_owner.so_client; 4030 struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
3312 struct nfs4_file *fp = NULL; 4031 struct nfs4_file *fp = NULL;
3313 struct inode *ino = current_fh->fh_dentry->d_inode;
3314 struct nfs4_ol_stateid *stp = NULL; 4032 struct nfs4_ol_stateid *stp = NULL;
3315 struct nfs4_delegation *dp = NULL; 4033 struct nfs4_delegation *dp = NULL;
3316 __be32 status; 4034 __be32 status;
@@ -3320,21 +4038,18 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
3320 * and check for delegations in the process of being recalled. 4038 * and check for delegations in the process of being recalled.
3321 * If not found, create the nfs4_file struct 4039 * If not found, create the nfs4_file struct
3322 */ 4040 */
3323 fp = find_file(ino); 4041 fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
3324 if (fp) { 4042 if (fp != open->op_file) {
3325 if ((status = nfs4_check_open(fp, open, &stp)))
3326 goto out;
3327 status = nfs4_check_deleg(cl, open, &dp); 4043 status = nfs4_check_deleg(cl, open, &dp);
3328 if (status) 4044 if (status)
3329 goto out; 4045 goto out;
4046 stp = nfsd4_find_existing_open(fp, open);
3330 } else { 4047 } else {
4048 open->op_file = NULL;
3331 status = nfserr_bad_stateid; 4049 status = nfserr_bad_stateid;
3332 if (nfsd4_is_deleg_cur(open)) 4050 if (nfsd4_is_deleg_cur(open))
3333 goto out; 4051 goto out;
3334 status = nfserr_jukebox; 4052 status = nfserr_jukebox;
3335 fp = open->op_file;
3336 open->op_file = NULL;
3337 nfsd4_init_file(fp, ino);
3338 } 4053 }
3339 4054
3340 /* 4055 /*
@@ -3347,22 +4062,19 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
3347 if (status) 4062 if (status)
3348 goto out; 4063 goto out;
3349 } else { 4064 } else {
3350 status = nfs4_get_vfs_file(rqstp, fp, current_fh, open);
3351 if (status)
3352 goto out;
3353 status = nfsd4_truncate(rqstp, current_fh, open);
3354 if (status)
3355 goto out;
3356 stp = open->op_stp; 4065 stp = open->op_stp;
3357 open->op_stp = NULL; 4066 open->op_stp = NULL;
3358 init_open_stateid(stp, fp, open); 4067 init_open_stateid(stp, fp, open);
4068 status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
4069 if (status) {
4070 release_open_stateid(stp);
4071 goto out;
4072 }
3359 } 4073 }
3360 update_stateid(&stp->st_stid.sc_stateid); 4074 update_stateid(&stp->st_stid.sc_stateid);
3361 memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4075 memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
3362 4076
3363 if (nfsd4_has_session(&resp->cstate)) { 4077 if (nfsd4_has_session(&resp->cstate)) {
3364 open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
3365
3366 if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { 4078 if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
3367 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT; 4079 open->op_delegate_type = NFS4_OPEN_DELEGATE_NONE_EXT;
3368 open->op_why_no_deleg = WND4_NOT_WANTED; 4080 open->op_why_no_deleg = WND4_NOT_WANTED;
@@ -3374,7 +4086,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
3374 * Attempt to hand out a delegation. No error return, because the 4086 * Attempt to hand out a delegation. No error return, because the
3375 * OPEN succeeds even if we fail. 4087 * OPEN succeeds even if we fail.
3376 */ 4088 */
3377 nfs4_open_delegation(SVC_NET(rqstp), current_fh, open, stp); 4089 nfs4_open_delegation(current_fh, open, stp);
3378nodeleg: 4090nodeleg:
3379 status = nfs_ok; 4091 status = nfs_ok;
3380 4092
@@ -3397,41 +4109,27 @@ out:
3397 if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) && 4109 if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
3398 !nfsd4_has_session(&resp->cstate)) 4110 !nfsd4_has_session(&resp->cstate))
3399 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 4111 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
4112 if (dp)
4113 nfs4_put_stid(&dp->dl_stid);
4114 if (stp)
4115 nfs4_put_stid(&stp->st_stid);
3400 4116
3401 return status; 4117 return status;
3402} 4118}
3403 4119
3404void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status) 4120void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
4121 struct nfsd4_open *open, __be32 status)
3405{ 4122{
3406 if (open->op_openowner) { 4123 if (open->op_openowner) {
3407 struct nfs4_openowner *oo = open->op_openowner; 4124 struct nfs4_stateowner *so = &open->op_openowner->oo_owner;
3408 4125
3409 if (!list_empty(&oo->oo_owner.so_stateids)) 4126 nfsd4_cstate_assign_replay(cstate, so);
3410 list_del_init(&oo->oo_close_lru); 4127 nfs4_put_stateowner(so);
3411 if (oo->oo_flags & NFS4_OO_NEW) {
3412 if (status) {
3413 release_openowner(oo);
3414 open->op_openowner = NULL;
3415 } else
3416 oo->oo_flags &= ~NFS4_OO_NEW;
3417 }
3418 } 4128 }
3419 if (open->op_file) 4129 if (open->op_file)
3420 nfsd4_free_file(open->op_file); 4130 nfsd4_free_file(open->op_file);
3421 if (open->op_stp) 4131 if (open->op_stp)
3422 free_generic_stateid(open->op_stp); 4132 nfs4_put_stid(&open->op_stp->st_stid);
3423}
3424
3425static __be32 lookup_clientid(clientid_t *clid, bool session, struct nfsd_net *nn, struct nfs4_client **clp)
3426{
3427 struct nfs4_client *found;
3428
3429 if (STALE_CLIENTID(clid, nn))
3430 return nfserr_stale_clientid;
3431 found = find_confirmed_client(clid, session, nn);
3432 if (clp)
3433 *clp = found;
3434 return found ? nfs_ok : nfserr_expired;
3435} 4133}
3436 4134
3437__be32 4135__be32
@@ -3442,23 +4140,22 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3442 __be32 status; 4140 __be32 status;
3443 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); 4141 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
3444 4142
3445 nfs4_lock_state();
3446 dprintk("process_renew(%08x/%08x): starting\n", 4143 dprintk("process_renew(%08x/%08x): starting\n",
3447 clid->cl_boot, clid->cl_id); 4144 clid->cl_boot, clid->cl_id);
3448 status = lookup_clientid(clid, cstate->minorversion, nn, &clp); 4145 status = lookup_clientid(clid, cstate, nn);
3449 if (status) 4146 if (status)
3450 goto out; 4147 goto out;
4148 clp = cstate->clp;
3451 status = nfserr_cb_path_down; 4149 status = nfserr_cb_path_down;
3452 if (!list_empty(&clp->cl_delegations) 4150 if (!list_empty(&clp->cl_delegations)
3453 && clp->cl_cb_state != NFSD4_CB_UP) 4151 && clp->cl_cb_state != NFSD4_CB_UP)
3454 goto out; 4152 goto out;
3455 status = nfs_ok; 4153 status = nfs_ok;
3456out: 4154out:
3457 nfs4_unlock_state();
3458 return status; 4155 return status;
3459} 4156}
3460 4157
3461static void 4158void
3462nfsd4_end_grace(struct nfsd_net *nn) 4159nfsd4_end_grace(struct nfsd_net *nn)
3463{ 4160{
3464 /* do nothing if grace period already ended */ 4161 /* do nothing if grace period already ended */
@@ -3467,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn)
3467 4164
3468 dprintk("NFSD: end of grace period\n"); 4165 dprintk("NFSD: end of grace period\n");
3469 nn->grace_ended = true; 4166 nn->grace_ended = true;
3470 nfsd4_record_grace_done(nn, nn->boot_time); 4167 /*
4168 * If the server goes down again right now, an NFSv4
4169 * client will still be allowed to reclaim after it comes back up,
4170 * even if it hasn't yet had a chance to reclaim state this time.
4171 *
4172 */
4173 nfsd4_record_grace_done(nn);
4174 /*
4175 * At this point, NFSv4 clients can still reclaim. But if the
4176 * server crashes, any that have not yet reclaimed will be out
4177 * of luck on the next boot.
4178 *
4179 * (NFSv4.1+ clients are considered to have reclaimed once they
4180 * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
4181 * have reclaimed after their first OPEN.)
4182 */
3471 locks_end_grace(&nn->nfsd4_manager); 4183 locks_end_grace(&nn->nfsd4_manager);
3472 /* 4184 /*
3473 * Now that every NFSv4 client has had the chance to recover and 4185 * At this point, and once lockd and/or any other containers
3474 * to see the (possibly new, possibly shorter) lease time, we 4186 * exit their grace period, further reclaims will fail and
3475 * can safely set the next grace time to the current lease time: 4187 * regular locking can resume.
3476 */ 4188 */
3477 nn->nfsd4_grace = nn->nfsd4_lease;
3478} 4189}
3479 4190
3480static time_t 4191static time_t
@@ -3483,12 +4194,11 @@ nfs4_laundromat(struct nfsd_net *nn)
3483 struct nfs4_client *clp; 4194 struct nfs4_client *clp;
3484 struct nfs4_openowner *oo; 4195 struct nfs4_openowner *oo;
3485 struct nfs4_delegation *dp; 4196 struct nfs4_delegation *dp;
4197 struct nfs4_ol_stateid *stp;
3486 struct list_head *pos, *next, reaplist; 4198 struct list_head *pos, *next, reaplist;
3487 time_t cutoff = get_seconds() - nn->nfsd4_lease; 4199 time_t cutoff = get_seconds() - nn->nfsd4_lease;
3488 time_t t, new_timeo = nn->nfsd4_lease; 4200 time_t t, new_timeo = nn->nfsd4_lease;
3489 4201
3490 nfs4_lock_state();
3491
3492 dprintk("NFSD: laundromat service - starting\n"); 4202 dprintk("NFSD: laundromat service - starting\n");
3493 nfsd4_end_grace(nn); 4203 nfsd4_end_grace(nn);
3494 INIT_LIST_HEAD(&reaplist); 4204 INIT_LIST_HEAD(&reaplist);
@@ -3505,13 +4215,14 @@ nfs4_laundromat(struct nfsd_net *nn)
3505 clp->cl_clientid.cl_id); 4215 clp->cl_clientid.cl_id);
3506 continue; 4216 continue;
3507 } 4217 }
3508 list_move(&clp->cl_lru, &reaplist); 4218 list_add(&clp->cl_lru, &reaplist);
3509 } 4219 }
3510 spin_unlock(&nn->client_lock); 4220 spin_unlock(&nn->client_lock);
3511 list_for_each_safe(pos, next, &reaplist) { 4221 list_for_each_safe(pos, next, &reaplist) {
3512 clp = list_entry(pos, struct nfs4_client, cl_lru); 4222 clp = list_entry(pos, struct nfs4_client, cl_lru);
3513 dprintk("NFSD: purging unused client (clientid %08x)\n", 4223 dprintk("NFSD: purging unused client (clientid %08x)\n",
3514 clp->cl_clientid.cl_id); 4224 clp->cl_clientid.cl_id);
4225 list_del_init(&clp->cl_lru);
3515 expire_client(clp); 4226 expire_client(clp);
3516 } 4227 }
3517 spin_lock(&state_lock); 4228 spin_lock(&state_lock);
@@ -3524,24 +4235,37 @@ nfs4_laundromat(struct nfsd_net *nn)
3524 new_timeo = min(new_timeo, t); 4235 new_timeo = min(new_timeo, t);
3525 break; 4236 break;
3526 } 4237 }
3527 list_move(&dp->dl_recall_lru, &reaplist); 4238 unhash_delegation_locked(dp);
4239 list_add(&dp->dl_recall_lru, &reaplist);
3528 } 4240 }
3529 spin_unlock(&state_lock); 4241 spin_unlock(&state_lock);
3530 list_for_each_safe(pos, next, &reaplist) { 4242 while (!list_empty(&reaplist)) {
3531 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 4243 dp = list_first_entry(&reaplist, struct nfs4_delegation,
4244 dl_recall_lru);
4245 list_del_init(&dp->dl_recall_lru);
3532 revoke_delegation(dp); 4246 revoke_delegation(dp);
3533 } 4247 }
3534 list_for_each_safe(pos, next, &nn->close_lru) { 4248
3535 oo = container_of(pos, struct nfs4_openowner, oo_close_lru); 4249 spin_lock(&nn->client_lock);
3536 if (time_after((unsigned long)oo->oo_time, (unsigned long)cutoff)) { 4250 while (!list_empty(&nn->close_lru)) {
4251 oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
4252 oo_close_lru);
4253 if (time_after((unsigned long)oo->oo_time,
4254 (unsigned long)cutoff)) {
3537 t = oo->oo_time - cutoff; 4255 t = oo->oo_time - cutoff;
3538 new_timeo = min(new_timeo, t); 4256 new_timeo = min(new_timeo, t);
3539 break; 4257 break;
3540 } 4258 }
3541 release_openowner(oo); 4259 list_del_init(&oo->oo_close_lru);
4260 stp = oo->oo_last_closed_stid;
4261 oo->oo_last_closed_stid = NULL;
4262 spin_unlock(&nn->client_lock);
4263 nfs4_put_stid(&stp->st_stid);
4264 spin_lock(&nn->client_lock);
3542 } 4265 }
4266 spin_unlock(&nn->client_lock);
4267
3543 new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); 4268 new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
3544 nfs4_unlock_state();
3545 return new_timeo; 4269 return new_timeo;
3546} 4270}
3547 4271
@@ -3564,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
3564 4288
3565static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4289static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
3566{ 4290{
3567 if (fhp->fh_dentry->d_inode != stp->st_file->fi_inode) 4291 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
3568 return nfserr_bad_stateid; 4292 return nfserr_bad_stateid;
3569 return nfs_ok; 4293 return nfs_ok;
3570} 4294}
@@ -3666,10 +4390,10 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3666{ 4390{
3667 struct nfs4_stid *s; 4391 struct nfs4_stid *s;
3668 struct nfs4_ol_stateid *ols; 4392 struct nfs4_ol_stateid *ols;
3669 __be32 status; 4393 __be32 status = nfserr_bad_stateid;
3670 4394
3671 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 4395 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3672 return nfserr_bad_stateid; 4396 return status;
3673 /* Client debugging aid. */ 4397 /* Client debugging aid. */
3674 if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { 4398 if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
3675 char addr_str[INET6_ADDRSTRLEN]; 4399 char addr_str[INET6_ADDRSTRLEN];
@@ -3677,53 +4401,62 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
3677 sizeof(addr_str)); 4401 sizeof(addr_str));
3678 pr_warn_ratelimited("NFSD: client %s testing state ID " 4402 pr_warn_ratelimited("NFSD: client %s testing state ID "
3679 "with incorrect client ID\n", addr_str); 4403 "with incorrect client ID\n", addr_str);
3680 return nfserr_bad_stateid; 4404 return status;
3681 } 4405 }
3682 s = find_stateid(cl, stateid); 4406 spin_lock(&cl->cl_lock);
4407 s = find_stateid_locked(cl, stateid);
3683 if (!s) 4408 if (!s)
3684 return nfserr_bad_stateid; 4409 goto out_unlock;
3685 status = check_stateid_generation(stateid, &s->sc_stateid, 1); 4410 status = check_stateid_generation(stateid, &s->sc_stateid, 1);
3686 if (status) 4411 if (status)
3687 return status; 4412 goto out_unlock;
3688 switch (s->sc_type) { 4413 switch (s->sc_type) {
3689 case NFS4_DELEG_STID: 4414 case NFS4_DELEG_STID:
3690 return nfs_ok; 4415 status = nfs_ok;
4416 break;
3691 case NFS4_REVOKED_DELEG_STID: 4417 case NFS4_REVOKED_DELEG_STID:
3692 return nfserr_deleg_revoked; 4418 status = nfserr_deleg_revoked;
4419 break;
3693 case NFS4_OPEN_STID: 4420 case NFS4_OPEN_STID:
3694 case NFS4_LOCK_STID: 4421 case NFS4_LOCK_STID:
3695 ols = openlockstateid(s); 4422 ols = openlockstateid(s);
3696 if (ols->st_stateowner->so_is_open_owner 4423 if (ols->st_stateowner->so_is_open_owner
3697 && !(openowner(ols->st_stateowner)->oo_flags 4424 && !(openowner(ols->st_stateowner)->oo_flags
3698 & NFS4_OO_CONFIRMED)) 4425 & NFS4_OO_CONFIRMED))
3699 return nfserr_bad_stateid; 4426 status = nfserr_bad_stateid;
3700 return nfs_ok; 4427 else
4428 status = nfs_ok;
4429 break;
3701 default: 4430 default:
3702 printk("unknown stateid type %x\n", s->sc_type); 4431 printk("unknown stateid type %x\n", s->sc_type);
4432 /* Fallthrough */
3703 case NFS4_CLOSED_STID: 4433 case NFS4_CLOSED_STID:
3704 return nfserr_bad_stateid; 4434 case NFS4_CLOSED_DELEG_STID:
4435 status = nfserr_bad_stateid;
3705 } 4436 }
4437out_unlock:
4438 spin_unlock(&cl->cl_lock);
4439 return status;
3706} 4440}
3707 4441
3708static __be32 nfsd4_lookup_stateid(stateid_t *stateid, unsigned char typemask, 4442static __be32
3709 struct nfs4_stid **s, bool sessions, 4443nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
3710 struct nfsd_net *nn) 4444 stateid_t *stateid, unsigned char typemask,
4445 struct nfs4_stid **s, struct nfsd_net *nn)
3711{ 4446{
3712 struct nfs4_client *cl;
3713 __be32 status; 4447 __be32 status;
3714 4448
3715 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 4449 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3716 return nfserr_bad_stateid; 4450 return nfserr_bad_stateid;
3717 status = lookup_clientid(&stateid->si_opaque.so_clid, sessions, 4451 status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn);
3718 nn, &cl);
3719 if (status == nfserr_stale_clientid) { 4452 if (status == nfserr_stale_clientid) {
3720 if (sessions) 4453 if (cstate->session)
3721 return nfserr_bad_stateid; 4454 return nfserr_bad_stateid;
3722 return nfserr_stale_stateid; 4455 return nfserr_stale_stateid;
3723 } 4456 }
3724 if (status) 4457 if (status)
3725 return status; 4458 return status;
3726 *s = find_stateid_by_type(cl, stateid, typemask); 4459 *s = find_stateid_by_type(cstate->clp, stateid, typemask);
3727 if (!*s) 4460 if (!*s)
3728 return nfserr_bad_stateid; 4461 return nfserr_bad_stateid;
3729 return nfs_ok; 4462 return nfs_ok;
@@ -3754,12 +4487,11 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3754 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 4487 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
3755 return check_special_stateids(net, current_fh, stateid, flags); 4488 return check_special_stateids(net, current_fh, stateid, flags);
3756 4489
3757 nfs4_lock_state(); 4490 status = nfsd4_lookup_stateid(cstate, stateid,
3758 4491 NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
3759 status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, 4492 &s, nn);
3760 &s, cstate->minorversion, nn);
3761 if (status) 4493 if (status)
3762 goto out; 4494 return status;
3763 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); 4495 status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate));
3764 if (status) 4496 if (status)
3765 goto out; 4497 goto out;
@@ -3770,12 +4502,13 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3770 if (status) 4502 if (status)
3771 goto out; 4503 goto out;
3772 if (filpp) { 4504 if (filpp) {
3773 file = dp->dl_file->fi_deleg_file; 4505 file = dp->dl_stid.sc_file->fi_deleg_file;
3774 if (!file) { 4506 if (!file) {
3775 WARN_ON_ONCE(1); 4507 WARN_ON_ONCE(1);
3776 status = nfserr_serverfault; 4508 status = nfserr_serverfault;
3777 goto out; 4509 goto out;
3778 } 4510 }
4511 get_file(file);
3779 } 4512 }
3780 break; 4513 break;
3781 case NFS4_OPEN_STID: 4514 case NFS4_OPEN_STID:
@@ -3791,10 +4524,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3791 if (status) 4524 if (status)
3792 goto out; 4525 goto out;
3793 if (filpp) { 4526 if (filpp) {
4527 struct nfs4_file *fp = stp->st_stid.sc_file;
4528
3794 if (flags & RD_STATE) 4529 if (flags & RD_STATE)
3795 file = find_readable_file(stp->st_file); 4530 file = find_readable_file(fp);
3796 else 4531 else
3797 file = find_writeable_file(stp->st_file); 4532 file = find_writeable_file(fp);
3798 } 4533 }
3799 break; 4534 break;
3800 default: 4535 default:
@@ -3803,28 +4538,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate,
3803 } 4538 }
3804 status = nfs_ok; 4539 status = nfs_ok;
3805 if (file) 4540 if (file)
3806 *filpp = get_file(file); 4541 *filpp = file;
3807out: 4542out:
3808 nfs4_unlock_state(); 4543 nfs4_put_stid(s);
3809 return status; 4544 return status;
3810} 4545}
3811 4546
3812static __be32
3813nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
3814{
3815 struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
3816
3817 if (check_for_locks(stp->st_file, lo))
3818 return nfserr_locks_held;
3819 /*
3820 * Currently there's a 1-1 lock stateid<->lockowner
3821 * correspondance, and we have to delete the lockowner when we
3822 * delete the lock stateid:
3823 */
3824 release_lockowner(lo);
3825 return nfs_ok;
3826}
3827
3828/* 4547/*
3829 * Test if the stateid is valid 4548 * Test if the stateid is valid
3830 */ 4549 */
@@ -3835,11 +4554,9 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3835 struct nfsd4_test_stateid_id *stateid; 4554 struct nfsd4_test_stateid_id *stateid;
3836 struct nfs4_client *cl = cstate->session->se_client; 4555 struct nfs4_client *cl = cstate->session->se_client;
3837 4556
3838 nfs4_lock_state();
3839 list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list) 4557 list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
3840 stateid->ts_id_status = 4558 stateid->ts_id_status =
3841 nfsd4_validate_stateid(cl, &stateid->ts_id_stateid); 4559 nfsd4_validate_stateid(cl, &stateid->ts_id_stateid);
3842 nfs4_unlock_state();
3843 4560
3844 return nfs_ok; 4561 return nfs_ok;
3845} 4562}
@@ -3851,37 +4568,50 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3851 stateid_t *stateid = &free_stateid->fr_stateid; 4568 stateid_t *stateid = &free_stateid->fr_stateid;
3852 struct nfs4_stid *s; 4569 struct nfs4_stid *s;
3853 struct nfs4_delegation *dp; 4570 struct nfs4_delegation *dp;
4571 struct nfs4_ol_stateid *stp;
3854 struct nfs4_client *cl = cstate->session->se_client; 4572 struct nfs4_client *cl = cstate->session->se_client;
3855 __be32 ret = nfserr_bad_stateid; 4573 __be32 ret = nfserr_bad_stateid;
3856 4574
3857 nfs4_lock_state(); 4575 spin_lock(&cl->cl_lock);
3858 s = find_stateid(cl, stateid); 4576 s = find_stateid_locked(cl, stateid);
3859 if (!s) 4577 if (!s)
3860 goto out; 4578 goto out_unlock;
3861 switch (s->sc_type) { 4579 switch (s->sc_type) {
3862 case NFS4_DELEG_STID: 4580 case NFS4_DELEG_STID:
3863 ret = nfserr_locks_held; 4581 ret = nfserr_locks_held;
3864 goto out; 4582 break;
3865 case NFS4_OPEN_STID: 4583 case NFS4_OPEN_STID:
3866 case NFS4_LOCK_STID:
3867 ret = check_stateid_generation(stateid, &s->sc_stateid, 1); 4584 ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
3868 if (ret) 4585 if (ret)
3869 goto out; 4586 break;
3870 if (s->sc_type == NFS4_LOCK_STID) 4587 ret = nfserr_locks_held;
3871 ret = nfsd4_free_lock_stateid(openlockstateid(s));
3872 else
3873 ret = nfserr_locks_held;
3874 break; 4588 break;
4589 case NFS4_LOCK_STID:
4590 ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
4591 if (ret)
4592 break;
4593 stp = openlockstateid(s);
4594 ret = nfserr_locks_held;
4595 if (check_for_locks(stp->st_stid.sc_file,
4596 lockowner(stp->st_stateowner)))
4597 break;
4598 unhash_lock_stateid(stp);
4599 spin_unlock(&cl->cl_lock);
4600 nfs4_put_stid(s);
4601 ret = nfs_ok;
4602 goto out;
3875 case NFS4_REVOKED_DELEG_STID: 4603 case NFS4_REVOKED_DELEG_STID:
3876 dp = delegstateid(s); 4604 dp = delegstateid(s);
3877 destroy_revoked_delegation(dp); 4605 list_del_init(&dp->dl_recall_lru);
4606 spin_unlock(&cl->cl_lock);
4607 nfs4_put_stid(s);
3878 ret = nfs_ok; 4608 ret = nfs_ok;
3879 break; 4609 goto out;
3880 default: 4610 /* Default falls through and returns nfserr_bad_stateid */
3881 ret = nfserr_bad_stateid;
3882 } 4611 }
4612out_unlock:
4613 spin_unlock(&cl->cl_lock);
3883out: 4614out:
3884 nfs4_unlock_state();
3885 return ret; 4615 return ret;
3886} 4616}
3887 4617
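The FREE_STATEID path above is careful about ordering: the stateid is unhashed while cl_lock is held, but the final nfs4_put_stid() happens only after the lock is dropped, because the put may free the object. The two-step split, in outline (sketch with hypothetical helpers):

#include <pthread.h>

struct stid;
void unhash_locked(struct stid *s);	/* caller holds cl_lock */
void put_stid(struct stid *s);		/* may free; call without cl_lock */

static void free_one(pthread_mutex_t *cl_lock, struct stid *s)
{
	pthread_mutex_lock(cl_lock);
	unhash_locked(s);		/* no new lookups can find it now */
	pthread_mutex_unlock(cl_lock);
	put_stid(s);			/* drop the lookup's reference last */
}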
@@ -3926,20 +4656,24 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
3926{ 4656{
3927 __be32 status; 4657 __be32 status;
3928 struct nfs4_stid *s; 4658 struct nfs4_stid *s;
4659 struct nfs4_ol_stateid *stp = NULL;
3929 4660
3930 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__, 4661 dprintk("NFSD: %s: seqid=%d stateid = " STATEID_FMT "\n", __func__,
3931 seqid, STATEID_VAL(stateid)); 4662 seqid, STATEID_VAL(stateid));
3932 4663
3933 *stpp = NULL; 4664 *stpp = NULL;
3934 status = nfsd4_lookup_stateid(stateid, typemask, &s, 4665 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
3935 cstate->minorversion, nn);
3936 if (status) 4666 if (status)
3937 return status; 4667 return status;
3938 *stpp = openlockstateid(s); 4668 stp = openlockstateid(s);
3939 if (!nfsd4_has_session(cstate)) 4669 nfsd4_cstate_assign_replay(cstate, stp->st_stateowner);
3940 cstate->replay_owner = (*stpp)->st_stateowner;
3941 4670
3942 return nfs4_seqid_op_checks(cstate, stateid, seqid, *stpp); 4671 status = nfs4_seqid_op_checks(cstate, stateid, seqid, stp);
4672 if (!status)
4673 *stpp = stp;
4674 else
4675 nfs4_put_stid(&stp->st_stid);
4676 return status;
3943} 4677}
3944 4678
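nfs4_preprocess_seqid_op() settles on a strict reference contract: on success *stpp carries a reference the caller must put, and on failure the helper puts the stateid itself and leaves *stpp NULL. That contract is why the callers below each gain a matching nfs4_put_stid(). In outline (sketch):

struct stid;
struct stid *lookup_get(void);	/* returns a referenced object or NULL */
int checks(struct stid *s);	/* nonzero on failure */
void put_stid(struct stid *s);

static int preprocess(struct stid **out)
{
	struct stid *s = lookup_get();
	int err;

	*out = NULL;
	if (!s)
		return -1;
	err = checks(s);
	if (err)
		put_stid(s);	/* failure: drop the reference ourselves */
	else
		*out = s;	/* success: caller now owns the reference */
	return err;
}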
3945static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, 4679static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
@@ -3947,14 +4681,18 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 {
 	__be32 status;
 	struct nfs4_openowner *oo;
+	struct nfs4_ol_stateid *stp;
 
 	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
-						NFS4_OPEN_STID, stpp, nn);
+						NFS4_OPEN_STID, &stp, nn);
 	if (status)
 		return status;
-	oo = openowner((*stpp)->st_stateowner);
-	if (!(oo->oo_flags & NFS4_OO_CONFIRMED))
+	oo = openowner(stp->st_stateowner);
+	if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+		nfs4_put_stid(&stp->st_stid);
 		return nfserr_bad_stateid;
+	}
+	*stpp = stp;
 	return nfs_ok;
 }
 
@@ -3974,8 +4712,6 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	nfs4_lock_state();
-
 	status = nfs4_preprocess_seqid_op(cstate,
 					oc->oc_seqid, &oc->oc_req_stateid,
 					NFS4_OPEN_STID, &stp, nn);
@@ -3984,7 +4720,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	oo = openowner(stp->st_stateowner);
 	status = nfserr_bad_stateid;
 	if (oo->oo_flags & NFS4_OO_CONFIRMED)
-		goto out;
+		goto put_stateid;
 	oo->oo_flags |= NFS4_OO_CONFIRMED;
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -3993,10 +4729,10 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfsd4_client_record_create(oo->oo_owner.so_client);
 	status = nfs_ok;
+put_stateid:
+	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
-	if (!cstate->replay_owner)
-		nfs4_unlock_state();
 	return status;
 }
 
@@ -4004,7 +4740,7 @@ static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
 {
 	if (!test_access(access, stp))
 		return;
-	nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
+	nfs4_file_put_access(stp->st_stid.sc_file, access);
 	clear_access(access, stp);
 }
 
@@ -4026,16 +4762,6 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
 	}
 }
 
-static void
-reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
-{
-	int i;
-	for (i = 0; i < 4; i++) {
-		if ((i & deny) != i)
-			clear_deny(i, stp);
-	}
-}
-
 __be32
 nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		     struct nfsd4_compound_state *cstate,
@@ -4053,21 +4779,20 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 		dprintk("NFSD: %s: od_deleg_want=0x%x ignored\n", __func__,
 			od->od_deleg_want);
 
-	nfs4_lock_state();
 	status = nfs4_preprocess_confirmed_seqid_op(cstate, od->od_seqid,
 					&od->od_stateid, &stp, nn);
 	if (status)
 		goto out;
 	status = nfserr_inval;
 	if (!test_access(od->od_share_access, stp)) {
-		dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n",
+		dprintk("NFSD: access not a subset of current bitmap: 0x%hhx, input access=%08x\n",
 			stp->st_access_bmap, od->od_share_access);
-		goto out;
+		goto put_stateid;
 	}
 	if (!test_deny(od->od_share_deny, stp)) {
-		dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n",
+		dprintk("NFSD: deny not a subset of current bitmap: 0x%hhx, input deny=%08x\n",
 			stp->st_deny_bmap, od->od_share_deny);
-		goto out;
+		goto put_stateid;
 	}
 	nfs4_stateid_downgrade(stp, od->od_share_access);
 
@@ -4076,17 +4801,31 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 	status = nfs_ok;
+put_stateid:
+	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
-	if (!cstate->replay_owner)
-		nfs4_unlock_state();
 	return status;
 }
 
 static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s)
 {
-	unhash_open_stateid(s);
+	struct nfs4_client *clp = s->st_stid.sc_client;
+	LIST_HEAD(reaplist);
+
 	s->st_stid.sc_type = NFS4_CLOSED_STID;
+	spin_lock(&clp->cl_lock);
+	unhash_open_stateid(s, &reaplist);
+
+	if (clp->cl_minorversion) {
+		put_ol_stateid_locked(s, &reaplist);
+		spin_unlock(&clp->cl_lock);
+		free_ol_stateid_reaplist(&reaplist);
+	} else {
+		spin_unlock(&clp->cl_lock);
+		free_ol_stateid_reaplist(&reaplist);
+		move_to_close_lru(s, clp->net);
+	}
 }
 
 /*
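nfsd4_close_open_stateid above introduces the reaplist idiom used by much of this patch: unlink entries onto a private list while the spinlock is held, then free them after the lock is dropped, since freeing may block. A stand-alone sketch of the idiom (invented names; a pthread mutex stands in for the spinlock):

    /*
     * Reaplist sketch: detach under the lock, tear down outside it.
     */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
    	struct node *next;
    	int id;
    };

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct node *table;		/* protected by table_lock */

    static void reap_all(void)
    {
    	struct node *reaplist;

    	pthread_mutex_lock(&table_lock);
    	reaplist = table;		/* unhash everything in O(1) */
    	table = NULL;
    	pthread_mutex_unlock(&table_lock);

    	while (reaplist) {		/* free outside the lock */
    		struct node *n = reaplist;
    		reaplist = n->next;
    		printf("reaping %d\n", n->id);
    		free(n);
    	}
    }

    int main(void)
    {
    	for (int i = 0; i < 3; i++) {
    		struct node *n = malloc(sizeof(*n));
    		n->id = i;
    		n->next = table;
    		table = n;
    	}
    	reap_all();
    	return 0;
    }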
@@ -4097,7 +4836,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_close *close)
 {
 	__be32 status;
-	struct nfs4_openowner *oo;
 	struct nfs4_ol_stateid *stp;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
@@ -4105,7 +4843,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	dprintk("NFSD: nfsd4_close on file %pd\n",
 			cstate->current_fh.fh_dentry);
 
-	nfs4_lock_state();
 	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
 					&close->cl_stateid,
 					NFS4_OPEN_STID|NFS4_CLOSED_STID,
@@ -4113,31 +4850,14 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	nfsd4_bump_seqid(cstate, status);
 	if (status)
 		goto out;
-	oo = openowner(stp->st_stateowner);
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
 
 	nfsd4_close_open_stateid(stp);
 
-	if (cstate->minorversion)
-		free_generic_stateid(stp);
-	else
-		oo->oo_last_closed_stid = stp;
-
-	if (list_empty(&oo->oo_owner.so_stateids)) {
-		if (cstate->minorversion)
-			release_openowner(oo);
-		else {
-			/*
-			 * In the 4.0 case we need to keep the owners around a
-			 * little while to handle CLOSE replay.
-			 */
-			move_to_close_lru(oo, SVC_NET(rqstp));
-		}
-	}
+	/* put reference from nfs4_preprocess_seqid_op */
+	nfs4_put_stid(&stp->st_stid);
 out:
-	if (!cstate->replay_owner)
-		nfs4_unlock_state();
 	return status;
 }
 
@@ -4154,28 +4874,24 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		return status;
 
-	nfs4_lock_state();
-	status = nfsd4_lookup_stateid(stateid, NFS4_DELEG_STID, &s,
-					cstate->minorversion, nn);
+	status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
 	if (status)
 		goto out;
 	dp = delegstateid(s);
 	status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate));
 	if (status)
-		goto out;
+		goto put_stateid;
 
 	destroy_delegation(dp);
+put_stateid:
+	nfs4_put_stid(&dp->dl_stid);
 out:
-	nfs4_unlock_state();
-
 	return status;
 }
 
 
 #define LOFF_OVERFLOW(start, len)      ((u64)(len) > ~(u64)(start))
 
-#define LOCKOWNER_INO_HASH_MASK (LOCKOWNER_INO_HASH_SIZE - 1)
-
 static inline u64
 end_offset(u64 start, u64 len)
 {
@@ -4196,13 +4912,6 @@ last_byte_offset(u64 start, u64 len)
 	return end > start ? end - 1: NFS4_MAX_UINT64;
 }
 
-static unsigned int lockowner_ino_hashval(struct inode *inode, u32 cl_id, struct xdr_netobj *ownername)
-{
-	return (file_hashval(inode) + cl_id
-		+ opaque_hashval(ownername->data, ownername->len))
-		& LOCKOWNER_INO_HASH_MASK;
-}
-
 /*
  * TODO: Linux file offsets are _signed_ 64-bit quantities, which means that
  * we can't properly handle lock requests that go beyond the (2^63 - 1)-th
@@ -4220,9 +4929,25 @@ nfs4_transform_lock_offset(struct file_lock *lock)
 		lock->fl_end = OFFSET_MAX;
 }
 
-/* Hack!: For now, we're defining this just so we can use a pointer to it
- * as a unique cookie to identify our (NFSv4's) posix locks. */
+static void nfsd4_fl_get_owner(struct file_lock *dst, struct file_lock *src)
+{
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)src->fl_owner;
+	dst->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lo->lo_owner));
+}
+
+static void nfsd4_fl_put_owner(struct file_lock *fl)
+{
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner;
+
+	if (lo) {
+		nfs4_put_stateowner(&lo->lo_owner);
+		fl->fl_owner = NULL;
+	}
+}
+
 static const struct lock_manager_operations nfsd_posix_mng_ops = {
+	.lm_get_owner = nfsd4_fl_get_owner,
+	.lm_put_owner = nfsd4_fl_put_owner,
 };
 
 static inline void
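The new lm_get_owner/lm_put_owner callbacks let the generic locks code take a lockowner reference whenever it duplicates a file_lock and drop it when the copy is freed, so fl_owner can no longer outlive the nfs4_lockowner it points to. A stand-alone sketch of the copy/free hook idea (invented names, no kernel API):

    /*
     * Copy hook pins the owner, free hook unpins it: a copied record
     * can never outlive the owner it references.
     */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct owner {
    	atomic_int refcount;
    };

    struct record {
    	struct owner *owner;
    };

    static void owner_get(struct owner *o) { atomic_fetch_add(&o->refcount, 1); }

    static void owner_put(struct owner *o)
    {
    	if (atomic_fetch_sub(&o->refcount, 1) == 1)
    		free(o);
    }

    /* cf. lm_get_owner: called whenever a record is duplicated */
    static void record_copy(struct record *dst, const struct record *src)
    {
    	dst->owner = src->owner;
    	owner_get(dst->owner);
    }

    /* cf. lm_put_owner: called when a record is torn down */
    static void record_free(struct record *r)
    {
    	owner_put(r->owner);
    	r->owner = NULL;
    }

    int main(void)
    {
    	struct owner *o = malloc(sizeof(*o));
    	struct record a = { 0 }, b = { 0 };

    	atomic_init(&o->refcount, 1);
    	a.owner = o;			/* record holds the initial ref */
    	record_copy(&b, &a);		/* the copy pins the owner again */
    	record_free(&a);
    	record_free(&b);		/* last put frees the owner */
    	printf("done\n");
    	return 0;
    }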
@@ -4255,47 +4980,54 @@ nevermind:
 		deny->ld_type = NFS4_WRITE_LT;
 }
 
-static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, clientid_t *clid, struct xdr_netobj *owner)
+static struct nfs4_lockowner *
+find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner,
+		struct nfs4_client *clp)
 {
-	struct nfs4_ol_stateid *lst;
+	unsigned int strhashval = ownerstr_hashval(owner);
+	struct nfs4_stateowner *so;
 
-	if (!same_owner_str(&lo->lo_owner, owner, clid))
-		return false;
-	if (list_empty(&lo->lo_owner.so_stateids)) {
-		WARN_ON_ONCE(1);
-		return false;
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[strhashval],
+			    so_strhash) {
+		if (so->so_is_open_owner)
+			continue;
+		if (same_owner_str(so, owner))
+			return lockowner(nfs4_get_stateowner(so));
 	}
-	lst = list_first_entry(&lo->lo_owner.so_stateids,
-			       struct nfs4_ol_stateid, st_perstateowner);
-	return lst->st_file->fi_inode == inode;
+	return NULL;
 }
 
 static struct nfs4_lockowner *
-find_lockowner_str(struct inode *inode, clientid_t *clid,
-		struct xdr_netobj *owner, struct nfsd_net *nn)
+find_lockowner_str(clientid_t *clid, struct xdr_netobj *owner,
+		struct nfs4_client *clp)
 {
-	unsigned int hashval = lockowner_ino_hashval(inode, clid->cl_id, owner);
 	struct nfs4_lockowner *lo;
 
-	list_for_each_entry(lo, &nn->lockowner_ino_hashtbl[hashval], lo_owner_ino_hash) {
-		if (same_lockowner_ino(lo, inode, clid, owner))
-			return lo;
-	}
-	return NULL;
+	spin_lock(&clp->cl_lock);
+	lo = find_lockowner_str_locked(clid, owner, clp);
+	spin_unlock(&clp->cl_lock);
+	return lo;
 }
 
-static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
+static void nfs4_unhash_lockowner(struct nfs4_stateowner *sop)
 {
-	struct inode *inode = open_stp->st_file->fi_inode;
-	unsigned int inohash = lockowner_ino_hashval(inode,
-			clp->cl_clientid.cl_id, &lo->lo_owner.so_owner);
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+	unhash_lockowner_locked(lockowner(sop));
+}
 
-	list_add(&lo->lo_owner.so_strhash, &nn->ownerstr_hashtbl[strhashval]);
-	list_add(&lo->lo_owner_ino_hash, &nn->lockowner_ino_hashtbl[inohash]);
-	list_add(&lo->lo_perstateid, &open_stp->st_lockowners);
+static void nfs4_free_lockowner(struct nfs4_stateowner *sop)
+{
+	struct nfs4_lockowner *lo = lockowner(sop);
+
+	kmem_cache_free(lockowner_slab, lo);
 }
 
+static const struct nfs4_stateowner_operations lockowner_ops = {
+	.so_unhash = nfs4_unhash_lockowner,
+	.so_free = nfs4_free_lockowner,
+};
+
 /*
  * Alloc a lock owner structure.
  * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
@@ -4303,42 +5035,106 @@ static void hash_lockowner(struct nfs4_lockowner *lo, unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp)
 *
 * strhashval = ownerstr_hashval
 */
-
 static struct nfs4_lockowner *
-alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, struct nfs4_ol_stateid *open_stp, struct nfsd4_lock *lock) {
-	struct nfs4_lockowner *lo;
+alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
+			   struct nfs4_ol_stateid *open_stp,
			   struct nfsd4_lock *lock)
+{
+	struct nfs4_lockowner *lo, *ret;
 
 	lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
 	if (!lo)
 		return NULL;
 	INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
 	lo->lo_owner.so_is_open_owner = 0;
-	/* It is the openowner seqid that will be incremented in encode in the
-	 * case of new lockowners; so increment the lock seqid manually: */
-	lo->lo_owner.so_seqid = lock->lk_new_lock_seqid + 1;
-	hash_lockowner(lo, strhashval, clp, open_stp);
+	lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
+	lo->lo_owner.so_ops = &lockowner_ops;
+	spin_lock(&clp->cl_lock);
+	ret = find_lockowner_str_locked(&clp->cl_clientid,
+			&lock->lk_new_owner, clp);
+	if (ret == NULL) {
+		list_add(&lo->lo_owner.so_strhash,
+			 &clp->cl_ownerstr_hashtbl[strhashval]);
+		ret = lo;
+	} else
+		nfs4_free_lockowner(&lo->lo_owner);
+	spin_unlock(&clp->cl_lock);
 	return lo;
 }
 
-static struct nfs4_ol_stateid *
-alloc_init_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp, struct nfs4_ol_stateid *open_stp)
+static void
+init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
+		  struct nfs4_file *fp, struct inode *inode,
+		  struct nfs4_ol_stateid *open_stp)
 {
-	struct nfs4_ol_stateid *stp;
 	struct nfs4_client *clp = lo->lo_owner.so_client;
 
-	stp = nfs4_alloc_stateid(clp);
-	if (stp == NULL)
-		return NULL;
+	lockdep_assert_held(&clp->cl_lock);
+
+	atomic_inc(&stp->st_stid.sc_count);
 	stp->st_stid.sc_type = NFS4_LOCK_STID;
-	list_add(&stp->st_perfile, &fp->fi_stateids);
-	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
-	stp->st_stateowner = &lo->lo_owner;
+	stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
 	get_nfs4_file(fp);
-	stp->st_file = fp;
+	stp->st_stid.sc_file = fp;
+	stp->st_stid.sc_free = nfs4_free_lock_stateid;
 	stp->st_access_bmap = 0;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
 	stp->st_openstp = open_stp;
-	return stp;
+	list_add(&stp->st_locks, &open_stp->st_locks);
+	list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+	spin_lock(&fp->fi_lock);
+	list_add(&stp->st_perfile, &fp->fi_stateids);
+	spin_unlock(&fp->fi_lock);
+}
+
+static struct nfs4_ol_stateid *
+find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp)
+{
+	struct nfs4_ol_stateid *lst;
+	struct nfs4_client *clp = lo->lo_owner.so_client;
+
+	lockdep_assert_held(&clp->cl_lock);
+
+	list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) {
+		if (lst->st_stid.sc_file == fp) {
+			atomic_inc(&lst->st_stid.sc_count);
+			return lst;
+		}
+	}
+	return NULL;
+}
+
+static struct nfs4_ol_stateid *
+find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi,
+			    struct inode *inode, struct nfs4_ol_stateid *ost,
+			    bool *new)
+{
+	struct nfs4_stid *ns = NULL;
+	struct nfs4_ol_stateid *lst;
+	struct nfs4_openowner *oo = openowner(ost->st_stateowner);
+	struct nfs4_client *clp = oo->oo_owner.so_client;
+
+	spin_lock(&clp->cl_lock);
+	lst = find_lock_stateid(lo, fi);
+	if (lst == NULL) {
+		spin_unlock(&clp->cl_lock);
+		ns = nfs4_alloc_stid(clp, stateid_slab);
+		if (ns == NULL)
+			return NULL;
+
+		spin_lock(&clp->cl_lock);
+		lst = find_lock_stateid(lo, fi);
+		if (likely(!lst)) {
+			lst = openlockstateid(ns);
+			init_lock_stateid(lst, lo, fi, inode, ost);
+			ns = NULL;
+			*new = true;
+		}
+	}
+	spin_unlock(&clp->cl_lock);
+	if (ns)
+		nfs4_put_stid(ns);
+	return lst;
 }
 
 static int
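find_or_create_lock_stateid above drops cl_lock to allocate, because allocation may sleep, then retakes the lock and searches again before inserting, discarding its allocation if another thread won the race. A stand-alone sketch of that double-checked lookup and insert (invented names; a pthread mutex stands in for cl_lock):

    /*
     * Lookup, allocate with the lock dropped, recheck under the lock,
     * discard the spare allocation on a lost race.
     */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct entry { int key; struct entry *next; } *head;

    static struct entry *find_locked(int key)
    {
    	for (struct entry *e = head; e; e = e->next)
    		if (e->key == key)
    			return e;
    	return NULL;
    }

    static struct entry *find_or_create(int key)
    {
    	struct entry *spare = NULL, *e;

    	pthread_mutex_lock(&lock);
    	e = find_locked(key);
    	if (!e) {
    		pthread_mutex_unlock(&lock);
    		spare = malloc(sizeof(*spare));	/* may block */
    		if (!spare)
    			return NULL;
    		spare->key = key;

    		pthread_mutex_lock(&lock);
    		e = find_locked(key);		/* did we race? */
    		if (!e) {			/* no: insert ours */
    			spare->next = head;
    			head = spare;
    			e = spare;
    			spare = NULL;
    		}
    	}
    	pthread_mutex_unlock(&lock);
    	free(spare);				/* lost race: discard */
    	return e;
    }

    int main(void)
    {
    	/* both calls must yield the same entry */
    	printf("%p %p\n", (void *)find_or_create(1), (void *)find_or_create(1));
    	return 0;
    }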
@@ -4350,46 +5146,53 @@ check_lock_length(u64 offset, u64 length)
 
 static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
 {
-	struct nfs4_file *fp = lock_stp->st_file;
-	int oflag = nfs4_access_to_omode(access);
+	struct nfs4_file *fp = lock_stp->st_stid.sc_file;
+
+	lockdep_assert_held(&fp->fi_lock);
 
 	if (test_access(access, lock_stp))
 		return;
-	nfs4_file_get_access(fp, oflag);
+	__nfs4_file_get_access(fp, access);
 	set_access(access, lock_stp);
 }
 
-static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+static __be32
+lookup_or_create_lock_state(struct nfsd4_compound_state *cstate,
+			    struct nfs4_ol_stateid *ost,
+			    struct nfsd4_lock *lock,
+			    struct nfs4_ol_stateid **lst, bool *new)
 {
-	struct nfs4_file *fi = ost->st_file;
+	__be32 status;
+	struct nfs4_file *fi = ost->st_stid.sc_file;
 	struct nfs4_openowner *oo = openowner(ost->st_stateowner);
 	struct nfs4_client *cl = oo->oo_owner.so_client;
+	struct inode *inode = cstate->current_fh.fh_dentry->d_inode;
 	struct nfs4_lockowner *lo;
 	unsigned int strhashval;
-	struct nfsd_net *nn = net_generic(cl->net, nfsd_net_id);
 
-	lo = find_lockowner_str(fi->fi_inode, &cl->cl_clientid,
-				&lock->v.new.owner, nn);
-	if (lo) {
-		if (!cstate->minorversion)
-			return nfserr_bad_seqid;
-		/* XXX: a lockowner always has exactly one stateid: */
-		*lst = list_first_entry(&lo->lo_owner.so_stateids,
-					struct nfs4_ol_stateid, st_perstateowner);
-		return nfs_ok;
+	lo = find_lockowner_str(&cl->cl_clientid, &lock->v.new.owner, cl);
+	if (!lo) {
+		strhashval = ownerstr_hashval(&lock->v.new.owner);
+		lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
+		if (lo == NULL)
+			return nfserr_jukebox;
+	} else {
+		/* with an existing lockowner, seqids must be the same */
+		status = nfserr_bad_seqid;
+		if (!cstate->minorversion &&
+		    lock->lk_new_lock_seqid != lo->lo_owner.so_seqid)
+			goto out;
 	}
-	strhashval = ownerstr_hashval(cl->cl_clientid.cl_id,
-			&lock->v.new.owner);
-	lo = alloc_init_lock_stateowner(strhashval, cl, ost, lock);
-	if (lo == NULL)
-		return nfserr_jukebox;
-	*lst = alloc_init_lock_stateid(lo, fi, ost);
+
+	*lst = find_or_create_lock_stateid(lo, fi, inode, ost, new);
 	if (*lst == NULL) {
-		release_lockowner(lo);
-		return nfserr_jukebox;
+		status = nfserr_jukebox;
+		goto out;
 	}
-	*new = true;
-	return nfs_ok;
+	status = nfs_ok;
out:
+	nfs4_put_stateowner(&lo->lo_owner);
+	return status;
 }
 
 /*
@@ -4401,14 +5204,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 {
 	struct nfs4_openowner *open_sop = NULL;
 	struct nfs4_lockowner *lock_sop = NULL;
-	struct nfs4_ol_stateid *lock_stp;
+	struct nfs4_ol_stateid *lock_stp = NULL;
+	struct nfs4_ol_stateid *open_stp = NULL;
+	struct nfs4_file *fp;
 	struct file *filp = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
 	__be32 status = 0;
-	bool new_state = false;
 	int lkflg;
 	int err;
+	bool new = false;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
@@ -4425,11 +5230,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	nfs4_lock_state();
-
 	if (lock->lk_is_new) {
-		struct nfs4_ol_stateid *open_stp = NULL;
-
 		if (nfsd4_has_session(cstate))
 			/* See rfc 5661 18.10.3: given clientid is ignored: */
 			memcpy(&lock->v.new.clientid,
@@ -4453,12 +5254,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 					&lock->v.new.clientid))
 			goto out;
 		status = lookup_or_create_lock_state(cstate, open_stp, lock,
-						&lock_stp, &new_state);
-	} else
+						&lock_stp, &new);
+	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
 				       lock->lk_old_lock_seqid,
 				       &lock->lk_old_lock_stateid,
 				       NFS4_LOCK_STID, &lock_stp, nn);
+	}
 	if (status)
 		goto out;
 	lock_sop = lockowner(lock_stp->st_stateowner);
@@ -4482,20 +5284,24 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	}
 
-	locks_init_lock(file_lock);
+	fp = lock_stp->st_stid.sc_file;
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
-			filp = find_readable_file(lock_stp->st_file);
+			spin_lock(&fp->fi_lock);
+			filp = find_readable_file_locked(fp);
 			if (filp)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
+			spin_unlock(&fp->fi_lock);
 			file_lock->fl_type = F_RDLCK;
 			break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
-			filp = find_writeable_file(lock_stp->st_file);
+			spin_lock(&fp->fi_lock);
+			filp = find_writeable_file_locked(fp);
 			if (filp)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
+			spin_unlock(&fp->fi_lock);
 			file_lock->fl_type = F_WRLCK;
 			break;
 		default:
@@ -4506,7 +5312,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_openmode;
 		goto out;
 	}
-	file_lock->fl_owner = (fl_owner_t)lock_sop;
+
+	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
 	file_lock->fl_pid = current->tgid;
 	file_lock->fl_file = filp;
 	file_lock->fl_flags = FL_POSIX;
@@ -4544,11 +5351,27 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		break;
 	}
 out:
-	if (status && new_state)
-		release_lockowner(lock_sop);
+	if (filp)
+		fput(filp);
+	if (lock_stp) {
+		/* Bump seqid manually if the 4.0 replay owner is openowner */
+		if (cstate->replay_owner &&
+		    cstate->replay_owner != &lock_sop->lo_owner &&
+		    seqid_mutating_err(ntohl(status)))
+			lock_sop->lo_owner.so_seqid++;
+
+		/*
+		 * If this is a new, never-before-used stateid, and we are
+		 * returning an error, then just go ahead and release it.
+		 */
+		if (status && new)
+			release_lock_stateid(lock_stp);
+
+		nfs4_put_stid(&lock_stp->st_stid);
+	}
+	if (open_stp)
+		nfs4_put_stid(&open_stp->st_stid);
 	nfsd4_bump_seqid(cstate, status);
-	if (!cstate->replay_owner)
-		nfs4_unlock_state();
 	if (file_lock)
 		locks_free_lock(file_lock);
 	if (conflock)
@@ -4580,9 +5403,8 @@ __be32
 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	    struct nfsd4_lockt *lockt)
 {
-	struct inode *inode;
 	struct file_lock *file_lock = NULL;
-	struct nfs4_lockowner *lo;
+	struct nfs4_lockowner *lo = NULL;
 	__be32 status;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
@@ -4592,10 +5414,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (check_lock_length(lockt->lt_offset, lockt->lt_length))
 		 return nfserr_inval;
 
-	nfs4_lock_state();
-
 	if (!nfsd4_has_session(cstate)) {
-		status = lookup_clientid(&lockt->lt_clientid, false, nn, NULL);
+		status = lookup_clientid(&lockt->lt_clientid, cstate, nn);
 		if (status)
 			goto out;
 	}
@@ -4603,14 +5423,13 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		goto out;
 
-	inode = cstate->current_fh.fh_dentry->d_inode;
 	file_lock = locks_alloc_lock();
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
 		goto out;
 	}
-	locks_init_lock(file_lock);
+
 	switch (lockt->lt_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
@@ -4626,7 +5445,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	}
 
-	lo = find_lockowner_str(inode, &lockt->lt_clientid, &lockt->lt_owner, nn);
+	lo = find_lockowner_str(&lockt->lt_clientid, &lockt->lt_owner,
+				cstate->clp);
 	if (lo)
 		file_lock->fl_owner = (fl_owner_t)lo;
 	file_lock->fl_pid = current->tgid;
@@ -4646,7 +5466,8 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
 	}
 out:
-	nfs4_unlock_state();
+	if (lo)
+		nfs4_put_stateowner(&lo->lo_owner);
 	if (file_lock)
 		locks_free_lock(file_lock);
 	return status;
@@ -4670,27 +5491,25 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (check_lock_length(locku->lu_offset, locku->lu_length))
 		 return nfserr_inval;
 
-	nfs4_lock_state();
-
 	status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
 					&locku->lu_stateid, NFS4_LOCK_STID,
 					&stp, nn);
 	if (status)
 		goto out;
-	filp = find_any_file(stp->st_file);
+	filp = find_any_file(stp->st_stid.sc_file);
 	if (!filp) {
 		status = nfserr_lock_range;
-		goto out;
+		goto put_stateid;
 	}
 	file_lock = locks_alloc_lock();
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
-		goto out;
+		goto fput;
 	}
-	locks_init_lock(file_lock);
+
 	file_lock->fl_type = F_UNLCK;
-	file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner);
+	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
 	file_lock->fl_pid = current->tgid;
 	file_lock->fl_file = filp;
 	file_lock->fl_flags = FL_POSIX;
@@ -4708,41 +5527,51 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 	update_stateid(&stp->st_stid.sc_stateid);
 	memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
-
+fput:
+	fput(filp);
+put_stateid:
+	nfs4_put_stid(&stp->st_stid);
 out:
 	nfsd4_bump_seqid(cstate, status);
-	if (!cstate->replay_owner)
-		nfs4_unlock_state();
 	if (file_lock)
 		locks_free_lock(file_lock);
 	return status;
 
 out_nfserr:
 	status = nfserrno(err);
-	goto out;
+	goto fput;
 }
 
 /*
  * returns
- *	1: locks held by lockowner
- *	0: no locks held by lockowner
+ *	true:  locks held by lockowner
+ *	false: no locks held by lockowner
  */
-static int
-check_for_locks(struct nfs4_file *filp, struct nfs4_lockowner *lowner)
+static bool
+check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
 {
 	struct file_lock **flpp;
-	struct inode *inode = filp->fi_inode;
-	int status = 0;
+	int status = false;
+	struct file *filp = find_any_file(fp);
+	struct inode *inode;
+
+	if (!filp) {
+		/* Any valid lock stateid should have some sort of access */
+		WARN_ON_ONCE(1);
+		return status;
+	}
+
+	inode = file_inode(filp);
 
 	spin_lock(&inode->i_lock);
 	for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
 		if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
-			status = 1;
-			goto out;
+			status = true;
+			break;
 		}
 	}
-out:
 	spin_unlock(&inode->i_lock);
+	fput(filp);
 	return status;
 }
 
@@ -4753,53 +5582,46 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 	clientid_t *clid = &rlockowner->rl_clientid;
 	struct nfs4_stateowner *sop;
-	struct nfs4_lockowner *lo;
+	struct nfs4_lockowner *lo = NULL;
 	struct nfs4_ol_stateid *stp;
 	struct xdr_netobj *owner = &rlockowner->rl_owner;
-	struct list_head matches;
-	unsigned int hashval = ownerstr_hashval(clid->cl_id, owner);
+	unsigned int hashval = ownerstr_hashval(owner);
 	__be32 status;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+	struct nfs4_client *clp;
 
 	dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
 		clid->cl_boot, clid->cl_id);
 
-	nfs4_lock_state();
-
-	status = lookup_clientid(clid, cstate->minorversion, nn, NULL);
+	status = lookup_clientid(clid, cstate, nn);
 	if (status)
-		goto out;
+		return status;
 
-	status = nfserr_locks_held;
-	INIT_LIST_HEAD(&matches);
+	clp = cstate->clp;
+	/* Find the matching lock stateowner */
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
+			    so_strhash) {
 
-	list_for_each_entry(sop, &nn->ownerstr_hashtbl[hashval], so_strhash) {
-		if (sop->so_is_open_owner)
+		if (sop->so_is_open_owner || !same_owner_str(sop, owner))
 			continue;
-		if (!same_owner_str(sop, owner, clid))
-			continue;
-		list_for_each_entry(stp, &sop->so_stateids,
-				st_perstateowner) {
-			lo = lockowner(sop);
-			if (check_for_locks(stp->st_file, lo))
-				goto out;
-			list_add(&lo->lo_list, &matches);
+
+		/* see if there are still any locks associated with it */
+		lo = lockowner(sop);
+		list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
+			if (check_for_locks(stp->st_stid.sc_file, lo)) {
+				status = nfserr_locks_held;
+				spin_unlock(&clp->cl_lock);
+				return status;
+			}
		}
+
+		nfs4_get_stateowner(sop);
+		break;
 	}
-	/* Clients probably won't expect us to return with some (but not all)
-	 * of the lockowner state released; so don't release any until all
-	 * have been checked. */
-	status = nfs_ok;
-	while (!list_empty(&matches)) {
-		lo = list_entry(matches.next, struct nfs4_lockowner,
-								lo_list);
-		/* unhash_stateowner deletes so_perclient only
-		 * for openowners. */
-		list_del(&lo->lo_list);
+	spin_unlock(&clp->cl_lock);
+	if (lo)
 		release_lockowner(lo);
-	}
-out:
-	nfs4_unlock_state();
 	return status;
 }
 
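The rewritten nfsd4_release_lockowner makes one pass under cl_lock: it finds the owner, bails out with nfserr_locks_held if any of its stateids still has locks, otherwise pins the stateowner and performs the teardown only after the spinlock is released. A condensed stand-alone sketch of that flow (invented names):

    /*
     * Check under the lock, pin on success, tear down after unlocking.
     */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct owner {
    	atomic_int refcount;
    	int nr_locks;			/* protected by list_lock */
    	int hashed;			/* protected by list_lock */
    };

    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

    static int release_owner(struct owner *o)
    {
    	pthread_mutex_lock(&list_lock);
    	if (o->nr_locks) {		/* cf. check_for_locks() */
    		pthread_mutex_unlock(&list_lock);
    		return -1;		/* cf. nfserr_locks_held */
    	}
    	atomic_fetch_add(&o->refcount, 1);	/* cf. nfs4_get_stateowner() */
    	o->hashed = 0;
    	pthread_mutex_unlock(&list_lock);

    	/* cf. release_lockowner(): runs without the spinlock held; the
    	 * real code drops the remaining references there as well */
    	atomic_fetch_sub(&o->refcount, 1);
    	return 0;
    }

    int main(void)
    {
    	struct owner o = { .nr_locks = 1, .hashed = 1 };
    	atomic_init(&o.refcount, 1);
    	printf("%d\n", release_owner(&o));	/* -1: locks held */
    	o.nr_locks = 0;
    	printf("%d\n", release_owner(&o));	/* 0: released */
    	return 0;
    }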
@@ -4887,34 +5709,126 @@ nfsd4_find_reclaim_client(const char *recdir, struct nfsd_net *nn)
 * Called from OPEN. Look for clientid in reclaim list.
 */
 __be32
-nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn)
+nfs4_check_open_reclaim(clientid_t *clid,
+		struct nfsd4_compound_state *cstate,
+		struct nfsd_net *nn)
 {
-	struct nfs4_client *clp;
+	__be32 status;
 
 	/* find clientid in conf_id_hashtbl */
-	clp = find_confirmed_client(clid, sessions, nn);
-	if (clp == NULL)
+	status = lookup_clientid(clid, cstate, nn);
+	if (status)
+		return nfserr_reclaim_bad;
+
+	if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
+		return nfserr_no_grace;
+
+	if (nfsd4_client_record_check(cstate->clp))
 		return nfserr_reclaim_bad;
 
-	return nfsd4_client_record_check(clp) ? nfserr_reclaim_bad : nfs_ok;
+	return nfs_ok;
 }
 
 #ifdef CONFIG_NFSD_FAULT_INJECTION
+static inline void
+put_client(struct nfs4_client *clp)
+{
+	atomic_dec(&clp->cl_refcount);
+}
 
-u64 nfsd_forget_client(struct nfs4_client *clp, u64 max)
+static struct nfs4_client *
+nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size)
 {
-	if (mark_client_expired(clp))
-		return 0;
-	expire_client(clp);
-	return 1;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return NULL;
+
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		if (memcmp(&clp->cl_addr, addr, addr_size) == 0)
+			return clp;
+	}
+	return NULL;
 }
 
-u64 nfsd_print_client(struct nfs4_client *clp, u64 num)
+u64
+nfsd_inject_print_clients(void)
 {
+	struct nfs4_client *clp;
+	u64 count = 0;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
 	char buf[INET6_ADDRSTRLEN];
-	rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
-	printk(KERN_INFO "NFS Client: %s\n", buf);
-	return 1;
+
+	if (!nfsd_netns_ready(nn))
+		return 0;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		rpc_ntop((struct sockaddr *)&clp->cl_addr, buf, sizeof(buf));
+		pr_info("NFS Client: %s\n", buf);
+		++count;
+	}
+	spin_unlock(&nn->client_lock);
+
+	return count;
+}
+
+u64
+nfsd_inject_forget_client(struct sockaddr_storage *addr, size_t addr_size)
+{
+	u64 count = 0;
+	struct nfs4_client *clp;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	clp = nfsd_find_client(addr, addr_size);
+	if (clp) {
+		if (mark_client_expired_locked(clp) == nfs_ok)
+			++count;
+		else
+			clp = NULL;
+	}
+	spin_unlock(&nn->client_lock);
+
+	if (clp)
+		expire_client(clp);
+
+	return count;
+}
+
+u64
+nfsd_inject_forget_clients(u64 max)
+{
+	u64 count = 0;
+	struct nfs4_client *clp, *next;
+	struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
+					  nfsd_net_id);
+	LIST_HEAD(reaplist);
+
+	if (!nfsd_netns_ready(nn))
+		return count;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
+		if (mark_client_expired_locked(clp) == nfs_ok) {
+			list_add(&clp->cl_lru, &reaplist);
+			if (max != 0 && ++count >= max)
+				break;
+		}
+	}
+	spin_unlock(&nn->client_lock);
+
+	list_for_each_entry_safe(clp, next, &reaplist, cl_lru)
+		expire_client(clp);
+
+	return count;
 }
 
 static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
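The fault-injection helpers that follow pin each client with atomic_inc(&clp->cl_refcount) for every object they move to a reaplist and drop the pin via put_client() as the objects are reaped; the WARN_ON_ONCE(count == (INT_MAX / 2)) guards against the 64-bit count wrapping the 32-bit per-client refcount. A stand-alone sketch of that pin-while-queued accounting (invented names):

    /*
     * Pin the client once per queued object; unpin as each is reaped.
     */
    #include <limits.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct client {
    	atomic_int refcount;
    };

    static void collect_one(struct client *clp, long long count)
    {
    	atomic_fetch_add(&clp->refcount, 1);	/* cf. atomic_inc(&clp->cl_refcount) */
    	/* cf. WARN_ON_ONCE(count == (INT_MAX / 2)): the 64-bit count must
    	 * never wrap the 32-bit per-client reference counter */
    	if (count == INT_MAX / 2)
    		fprintf(stderr, "warning: refcount nearing overflow\n");
    }

    static void reap_one(struct client *clp)
    {
    	atomic_fetch_sub(&clp->refcount, 1);	/* cf. put_client() */
    }

    int main(void)
    {
    	struct client c;
    	atomic_init(&c.refcount, 1);

    	for (long long i = 0; i < 3; i++)
    		collect_one(&c, i);
    	for (int i = 0; i < 3; i++)
    		reap_one(&c);
    	printf("refcount back to %d\n", atomic_load(&c.refcount));
    	return 0;
    }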
@@ -4925,158 +5839,484 @@ static void nfsd_print_count(struct nfs4_client *clp, unsigned int count,
4925 printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type); 5839 printk(KERN_INFO "NFS Client: %s has %u %s\n", buf, count, type);
4926} 5840}
4927 5841
4928static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_lockowner *)) 5842static void
5843nfsd_inject_add_lock_to_list(struct nfs4_ol_stateid *lst,
5844 struct list_head *collect)
5845{
5846 struct nfs4_client *clp = lst->st_stid.sc_client;
5847 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5848 nfsd_net_id);
5849
5850 if (!collect)
5851 return;
5852
5853 lockdep_assert_held(&nn->client_lock);
5854 atomic_inc(&clp->cl_refcount);
5855 list_add(&lst->st_locks, collect);
5856}
5857
5858static u64 nfsd_foreach_client_lock(struct nfs4_client *clp, u64 max,
5859 struct list_head *collect,
5860 void (*func)(struct nfs4_ol_stateid *))
4929{ 5861{
4930 struct nfs4_openowner *oop; 5862 struct nfs4_openowner *oop;
4931 struct nfs4_lockowner *lop, *lo_next;
4932 struct nfs4_ol_stateid *stp, *st_next; 5863 struct nfs4_ol_stateid *stp, *st_next;
5864 struct nfs4_ol_stateid *lst, *lst_next;
4933 u64 count = 0; 5865 u64 count = 0;
4934 5866
5867 spin_lock(&clp->cl_lock);
4935 list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) { 5868 list_for_each_entry(oop, &clp->cl_openowners, oo_perclient) {
4936 list_for_each_entry_safe(stp, st_next, &oop->oo_owner.so_stateids, st_perstateowner) { 5869 list_for_each_entry_safe(stp, st_next,
4937 list_for_each_entry_safe(lop, lo_next, &stp->st_lockowners, lo_perstateid) { 5870 &oop->oo_owner.so_stateids, st_perstateowner) {
4938 if (func) 5871 list_for_each_entry_safe(lst, lst_next,
4939 func(lop); 5872 &stp->st_locks, st_locks) {
4940 if (++count == max) 5873 if (func) {
4941 return count; 5874 func(lst);
5875 nfsd_inject_add_lock_to_list(lst,
5876 collect);
5877 }
5878 ++count;
5879 /*
5880 * Despite the fact that these functions deal
5881 * with 64-bit integers for "count", we must
5882 * ensure that it doesn't blow up the
5883 * clp->cl_refcount. Throw a warning if we
5884 * start to approach INT_MAX here.
5885 */
5886 WARN_ON_ONCE(count == (INT_MAX / 2));
5887 if (count == max)
5888 goto out;
4942 } 5889 }
4943 } 5890 }
4944 } 5891 }
5892out:
5893 spin_unlock(&clp->cl_lock);
4945 5894
4946 return count; 5895 return count;
4947} 5896}
4948 5897
4949u64 nfsd_forget_client_locks(struct nfs4_client *clp, u64 max) 5898static u64
5899nfsd_collect_client_locks(struct nfs4_client *clp, struct list_head *collect,
5900 u64 max)
4950{ 5901{
4951 return nfsd_foreach_client_lock(clp, max, release_lockowner); 5902 return nfsd_foreach_client_lock(clp, max, collect, unhash_lock_stateid);
4952} 5903}
4953 5904
4954u64 nfsd_print_client_locks(struct nfs4_client *clp, u64 max) 5905static u64
5906nfsd_print_client_locks(struct nfs4_client *clp)
4955{ 5907{
4956 u64 count = nfsd_foreach_client_lock(clp, max, NULL); 5908 u64 count = nfsd_foreach_client_lock(clp, 0, NULL, NULL);
4957 nfsd_print_count(clp, count, "locked files"); 5909 nfsd_print_count(clp, count, "locked files");
4958 return count; 5910 return count;
4959} 5911}
4960 5912
4961static u64 nfsd_foreach_client_open(struct nfs4_client *clp, u64 max, void (*func)(struct nfs4_openowner *)) 5913u64
5914nfsd_inject_print_locks(void)
5915{
5916 struct nfs4_client *clp;
5917 u64 count = 0;
5918 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5919 nfsd_net_id);
5920
5921 if (!nfsd_netns_ready(nn))
5922 return 0;
5923
5924 spin_lock(&nn->client_lock);
5925 list_for_each_entry(clp, &nn->client_lru, cl_lru)
5926 count += nfsd_print_client_locks(clp);
5927 spin_unlock(&nn->client_lock);
5928
5929 return count;
5930}
5931
5932static void
5933nfsd_reap_locks(struct list_head *reaplist)
5934{
5935 struct nfs4_client *clp;
5936 struct nfs4_ol_stateid *stp, *next;
5937
5938 list_for_each_entry_safe(stp, next, reaplist, st_locks) {
5939 list_del_init(&stp->st_locks);
5940 clp = stp->st_stid.sc_client;
5941 nfs4_put_stid(&stp->st_stid);
5942 put_client(clp);
5943 }
5944}
5945
5946u64
5947nfsd_inject_forget_client_locks(struct sockaddr_storage *addr, size_t addr_size)
5948{
5949 unsigned int count = 0;
5950 struct nfs4_client *clp;
5951 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5952 nfsd_net_id);
5953 LIST_HEAD(reaplist);
5954
5955 if (!nfsd_netns_ready(nn))
5956 return count;
5957
5958 spin_lock(&nn->client_lock);
5959 clp = nfsd_find_client(addr, addr_size);
5960 if (clp)
5961 count = nfsd_collect_client_locks(clp, &reaplist, 0);
5962 spin_unlock(&nn->client_lock);
5963 nfsd_reap_locks(&reaplist);
5964 return count;
5965}
5966
5967u64
5968nfsd_inject_forget_locks(u64 max)
5969{
5970 u64 count = 0;
5971 struct nfs4_client *clp;
5972 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5973 nfsd_net_id);
5974 LIST_HEAD(reaplist);
5975
5976 if (!nfsd_netns_ready(nn))
5977 return count;
5978
5979 spin_lock(&nn->client_lock);
5980 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
5981 count += nfsd_collect_client_locks(clp, &reaplist, max - count);
5982 if (max != 0 && count >= max)
5983 break;
5984 }
5985 spin_unlock(&nn->client_lock);
5986 nfsd_reap_locks(&reaplist);
5987 return count;
5988}
5989
5990static u64
5991nfsd_foreach_client_openowner(struct nfs4_client *clp, u64 max,
5992 struct list_head *collect,
5993 void (*func)(struct nfs4_openowner *))
4962{ 5994{
4963 struct nfs4_openowner *oop, *next; 5995 struct nfs4_openowner *oop, *next;
5996 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
5997 nfsd_net_id);
4964 u64 count = 0; 5998 u64 count = 0;
4965 5999
6000 lockdep_assert_held(&nn->client_lock);
6001
6002 spin_lock(&clp->cl_lock);
4966 list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) { 6003 list_for_each_entry_safe(oop, next, &clp->cl_openowners, oo_perclient) {
4967 if (func) 6004 if (func) {
4968 func(oop); 6005 func(oop);
4969 if (++count == max) 6006 if (collect) {
6007 atomic_inc(&clp->cl_refcount);
6008 list_add(&oop->oo_perclient, collect);
6009 }
6010 }
6011 ++count;
6012 /*
6013 * Despite the fact that these functions deal with
6014 * 64-bit integers for "count", we must ensure that
6015 * it doesn't blow up the clp->cl_refcount. Throw a
6016 * warning if we start to approach INT_MAX here.
6017 */
6018 WARN_ON_ONCE(count == (INT_MAX / 2));
6019 if (count == max)
4970 break; 6020 break;
4971 } 6021 }
6022 spin_unlock(&clp->cl_lock);
4972 6023
4973 return count; 6024 return count;
4974} 6025}
4975 6026
4976u64 nfsd_forget_client_openowners(struct nfs4_client *clp, u64 max) 6027static u64
6028nfsd_print_client_openowners(struct nfs4_client *clp)
4977{ 6029{
4978 return nfsd_foreach_client_open(clp, max, release_openowner); 6030 u64 count = nfsd_foreach_client_openowner(clp, 0, NULL, NULL);
6031
6032 nfsd_print_count(clp, count, "openowners");
6033 return count;
4979} 6034}
4980 6035
4981u64 nfsd_print_client_openowners(struct nfs4_client *clp, u64 max) 6036static u64
6037nfsd_collect_client_openowners(struct nfs4_client *clp,
6038 struct list_head *collect, u64 max)
4982{ 6039{
4983 u64 count = nfsd_foreach_client_open(clp, max, NULL); 6040 return nfsd_foreach_client_openowner(clp, max, collect,
4984 nfsd_print_count(clp, count, "open files"); 6041 unhash_openowner_locked);
4985 return count;
4986} 6042}
4987 6043
4988static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max, 6044u64
4989 struct list_head *victims) 6045nfsd_inject_print_openowners(void)
4990{ 6046{
4991 struct nfs4_delegation *dp, *next; 6047 struct nfs4_client *clp;
4992 u64 count = 0; 6048 u64 count = 0;
6049 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6050 nfsd_net_id);
6051
6052 if (!nfsd_netns_ready(nn))
6053 return 0;
6054
6055 spin_lock(&nn->client_lock);
6056 list_for_each_entry(clp, &nn->client_lru, cl_lru)
6057 count += nfsd_print_client_openowners(clp);
6058 spin_unlock(&nn->client_lock);
4993 6059
4994 lockdep_assert_held(&state_lock);
4995 list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
4996 if (victims)
4997 list_move(&dp->dl_recall_lru, victims);
4998 if (++count == max)
4999 break;
5000 }
5001 return count; 6060 return count;
5002} 6061}
5003 6062
5004u64 nfsd_forget_client_delegations(struct nfs4_client *clp, u64 max) 6063static void
6064nfsd_reap_openowners(struct list_head *reaplist)
5005{ 6065{
5006 struct nfs4_delegation *dp, *next; 6066 struct nfs4_client *clp;
5007 LIST_HEAD(victims); 6067 struct nfs4_openowner *oop, *next;
5008 u64 count;
5009 6068
5010 spin_lock(&state_lock); 6069 list_for_each_entry_safe(oop, next, reaplist, oo_perclient) {
5011 count = nfsd_find_all_delegations(clp, max, &victims); 6070 list_del_init(&oop->oo_perclient);
5012 spin_unlock(&state_lock); 6071 clp = oop->oo_owner.so_client;
6072 release_openowner(oop);
6073 put_client(clp);
6074 }
6075}
5013 6076
5014 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru) 6077u64
5015 revoke_delegation(dp); 6078nfsd_inject_forget_client_openowners(struct sockaddr_storage *addr,
6079 size_t addr_size)
6080{
6081 unsigned int count = 0;
6082 struct nfs4_client *clp;
6083 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6084 nfsd_net_id);
6085 LIST_HEAD(reaplist);
5016 6086
6087 if (!nfsd_netns_ready(nn))
6088 return count;
6089
6090 spin_lock(&nn->client_lock);
6091 clp = nfsd_find_client(addr, addr_size);
6092 if (clp)
6093 count = nfsd_collect_client_openowners(clp, &reaplist, 0);
6094 spin_unlock(&nn->client_lock);
6095 nfsd_reap_openowners(&reaplist);
5017 return count; 6096 return count;
5018} 6097}
5019 6098
5020u64 nfsd_recall_client_delegations(struct nfs4_client *clp, u64 max) 6099u64
6100nfsd_inject_forget_openowners(u64 max)
5021{ 6101{
5022 struct nfs4_delegation *dp, *next; 6102 u64 count = 0;
5023 LIST_HEAD(victims); 6103 struct nfs4_client *clp;
5024 u64 count; 6104 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6105 nfsd_net_id);
6106 LIST_HEAD(reaplist);
5025 6107
5026 spin_lock(&state_lock); 6108 if (!nfsd_netns_ready(nn))
5027 count = nfsd_find_all_delegations(clp, max, &victims); 6109 return count;
5028 list_for_each_entry_safe(dp, next, &victims, dl_recall_lru)
5029 nfsd_break_one_deleg(dp);
5030 spin_unlock(&state_lock);
5031 6110
6111 spin_lock(&nn->client_lock);
6112 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
6113 count += nfsd_collect_client_openowners(clp, &reaplist,
6114 max - count);
6115 if (max != 0 && count >= max)
6116 break;
6117 }
6118 spin_unlock(&nn->client_lock);
6119 nfsd_reap_openowners(&reaplist);
5032 return count; 6120 return count;
5033} 6121}
5034 6122
5035u64 nfsd_print_client_delegations(struct nfs4_client *clp, u64 max) 6123static u64 nfsd_find_all_delegations(struct nfs4_client *clp, u64 max,
6124 struct list_head *victims)
5036{ 6125{
6126 struct nfs4_delegation *dp, *next;
6127 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6128 nfsd_net_id);
5037 u64 count = 0; 6129 u64 count = 0;
5038 6130
6131 lockdep_assert_held(&nn->client_lock);
6132
5039 spin_lock(&state_lock); 6133 spin_lock(&state_lock);
5040 count = nfsd_find_all_delegations(clp, max, NULL); 6134 list_for_each_entry_safe(dp, next, &clp->cl_delegations, dl_perclnt) {
6135 if (victims) {
6136 /*
6137 * It's not safe to mess with delegations that have a
6138 * non-zero dl_time. They might have already been broken
6139 * and could be processed by the laundromat outside of
6140 * the state_lock. Just leave them be.
6141 */
6142 if (dp->dl_time != 0)
6143 continue;
6144
6145 atomic_inc(&clp->cl_refcount);
6146 unhash_delegation_locked(dp);
6147 list_add(&dp->dl_recall_lru, victims);
6148 }
6149 ++count;
6150 /*
6151 * Despite the fact that these functions deal with
6152 * 64-bit integers for "count", we must ensure that
6153 * it doesn't blow up the clp->cl_refcount. Throw a
6154 * warning if we start to approach INT_MAX here.
6155 */
6156 WARN_ON_ONCE(count == (INT_MAX / 2));
6157 if (count == max)
6158 break;
6159 }
5041 spin_unlock(&state_lock); 6160 spin_unlock(&state_lock);
6161 return count;
6162}
6163
6164static u64
6165nfsd_print_client_delegations(struct nfs4_client *clp)
6166{
6167 u64 count = nfsd_find_all_delegations(clp, 0, NULL);
5042 6168
5043 nfsd_print_count(clp, count, "delegations"); 6169 nfsd_print_count(clp, count, "delegations");
5044 return count; 6170 return count;
5045} 6171}
5046 6172
5047u64 nfsd_for_n_state(u64 max, u64 (*func)(struct nfs4_client *, u64)) 6173u64
6174nfsd_inject_print_delegations(void)
5048{ 6175{
5049 struct nfs4_client *clp, *next; 6176 struct nfs4_client *clp;
5050 u64 count = 0; 6177 u64 count = 0;
5051 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); 6178 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6179 nfsd_net_id);
5052 6180
5053 if (!nfsd_netns_ready(nn)) 6181 if (!nfsd_netns_ready(nn))
5054 return 0; 6182 return 0;
5055 6183
5056 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) { 6184 spin_lock(&nn->client_lock);
5057 count += func(clp, max - count); 6185 list_for_each_entry(clp, &nn->client_lru, cl_lru)
5058 if ((max != 0) && (count >= max)) 6186 count += nfsd_print_client_delegations(clp);
5059 break; 6187 spin_unlock(&nn->client_lock);
6188
6189 return count;
6190}
6191
6192static void
6193nfsd_forget_delegations(struct list_head *reaplist)
6194{
6195 struct nfs4_client *clp;
6196 struct nfs4_delegation *dp, *next;
6197
6198 list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
6199 list_del_init(&dp->dl_recall_lru);
6200 clp = dp->dl_stid.sc_client;
6201 revoke_delegation(dp);
6202 put_client(clp);
5060 } 6203 }
6204}
6205
6206u64
6207nfsd_inject_forget_client_delegations(struct sockaddr_storage *addr,
6208 size_t addr_size)
6209{
6210 u64 count = 0;
6211 struct nfs4_client *clp;
6212 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6213 nfsd_net_id);
6214 LIST_HEAD(reaplist);
5061 6215
6216 if (!nfsd_netns_ready(nn))
6217 return count;
6218
6219 spin_lock(&nn->client_lock);
6220 clp = nfsd_find_client(addr, addr_size);
6221 if (clp)
6222 count = nfsd_find_all_delegations(clp, 0, &reaplist);
6223 spin_unlock(&nn->client_lock);
6224
6225 nfsd_forget_delegations(&reaplist);
5062 return count; 6226 return count;
5063} 6227}
5064 6228
5065struct nfs4_client *nfsd_find_client(struct sockaddr_storage *addr, size_t addr_size) 6229u64
6230nfsd_inject_forget_delegations(u64 max)
5066{ 6231{
6232 u64 count = 0;
5067 struct nfs4_client *clp; 6233 struct nfs4_client *clp;
5068 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns, nfsd_net_id); 6234 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6235 nfsd_net_id);
6236 LIST_HEAD(reaplist);
5069 6237
5070 if (!nfsd_netns_ready(nn)) 6238 if (!nfsd_netns_ready(nn))
5071 return NULL; 6239 return count;
5072 6240
6241 spin_lock(&nn->client_lock);
5073 list_for_each_entry(clp, &nn->client_lru, cl_lru) { 6242 list_for_each_entry(clp, &nn->client_lru, cl_lru) {
5074 if (memcmp(&clp->cl_addr, addr, addr_size) == 0) 6243 count += nfsd_find_all_delegations(clp, max - count, &reaplist);
5075 return clp; 6244 if (max != 0 && count >= max)
6245 break;
5076 } 6246 }
5077 return NULL; 6247 spin_unlock(&nn->client_lock);
6248 nfsd_forget_delegations(&reaplist);
6249 return count;
6250}
6251
6252static void
6253nfsd_recall_delegations(struct list_head *reaplist)
6254{
6255 struct nfs4_client *clp;
6256 struct nfs4_delegation *dp, *next;
6257
6258 list_for_each_entry_safe(dp, next, reaplist, dl_recall_lru) {
6259 list_del_init(&dp->dl_recall_lru);
6260 clp = dp->dl_stid.sc_client;
6261 /*
6262 * We skipped all entries that had a zero dl_time before,
6263 * so we can now reset the dl_time back to 0. If a delegation
6264 * break comes in now, then it won't make any difference since
6265 * we're recalling it either way.
6266 */
6267 spin_lock(&state_lock);
6268 dp->dl_time = 0;
6269 spin_unlock(&state_lock);
6270 nfsd_break_one_deleg(dp);
6271 put_client(clp);
6272 }
6273}
6274
6275u64
6276nfsd_inject_recall_client_delegations(struct sockaddr_storage *addr,
6277 size_t addr_size)
6278{
6279 u64 count = 0;
6280 struct nfs4_client *clp;
6281 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6282 nfsd_net_id);
6283 LIST_HEAD(reaplist);
6284
6285 if (!nfsd_netns_ready(nn))
6286 return count;
6287
6288 spin_lock(&nn->client_lock);
6289 clp = nfsd_find_client(addr, addr_size);
6290 if (clp)
6291 count = nfsd_find_all_delegations(clp, 0, &reaplist);
6292 spin_unlock(&nn->client_lock);
6293
6294 nfsd_recall_delegations(&reaplist);
6295 return count;
5078} 6296}
5079 6297
6298u64
6299nfsd_inject_recall_delegations(u64 max)
6300{
6301 u64 count = 0;
6302 struct nfs4_client *clp, *next;
6303 struct nfsd_net *nn = net_generic(current->nsproxy->net_ns,
6304 nfsd_net_id);
6305 LIST_HEAD(reaplist);
6306
6307 if (!nfsd_netns_ready(nn))
6308 return count;
6309
6310 spin_lock(&nn->client_lock);
6311 list_for_each_entry_safe(clp, next, &nn->client_lru, cl_lru) {
6312 count += nfsd_find_all_delegations(clp, max - count, &reaplist);
6313 if (max != 0 && ++count >= max)
6314 break;
6315 }
6316 spin_unlock(&nn->client_lock);
6317 nfsd_recall_delegations(&reaplist);
6318 return count;
6319}
5080#endif /* CONFIG_NFSD_FAULT_INJECTION */ 6320#endif /* CONFIG_NFSD_FAULT_INJECTION */
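
The injection helpers above all share one shape: collect victims under the spinlock onto a private reaplist, drop the lock, then do the expensive teardown with no lock held. A minimal userspace sketch of that two-phase pattern (names and list layout are illustrative only, not nfsd's):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct victim {
	int id;
	struct victim *next;
};

static struct victim *shared_head;
static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

/* Phase 1: unlink up to max entries under the lock (0 means "all"),
 * collecting them on a private reaplist. */
static struct victim *collect_victims(unsigned long max)
{
	struct victim *v, *reaplist = NULL;
	unsigned long count = 0;

	pthread_mutex_lock(&shared_lock);
	while ((v = shared_head) != NULL && (max == 0 || count < max)) {
		shared_head = v->next;	/* "unhash" from the shared list */
		v->next = reaplist;	/* move onto the reaplist */
		reaplist = v;
		count++;
	}
	pthread_mutex_unlock(&shared_lock);
	return reaplist;
}

/* Phase 2: the expensive teardown runs with no lock held. */
static void reap(struct victim *reaplist)
{
	while (reaplist != NULL) {
		struct victim *v = reaplist;

		reaplist = v->next;
		printf("reaping %d\n", v->id);
		free(v);
	}
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct victim *v = malloc(sizeof(*v));

		v->id = i;
		v->next = shared_head;
		shared_head = v;
	}
	reap(collect_victims(0));
	return 0;
}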
5081 6321
5082/* 6322/*
@@ -5113,14 +6353,6 @@ static int nfs4_state_create_net(struct net *net)
5113 CLIENT_HASH_SIZE, GFP_KERNEL); 6353 CLIENT_HASH_SIZE, GFP_KERNEL);
5114 if (!nn->unconf_id_hashtbl) 6354 if (!nn->unconf_id_hashtbl)
5115 goto err_unconf_id; 6355 goto err_unconf_id;
5116 nn->ownerstr_hashtbl = kmalloc(sizeof(struct list_head) *
5117 OWNER_HASH_SIZE, GFP_KERNEL);
5118 if (!nn->ownerstr_hashtbl)
5119 goto err_ownerstr;
5120 nn->lockowner_ino_hashtbl = kmalloc(sizeof(struct list_head) *
5121 LOCKOWNER_INO_HASH_SIZE, GFP_KERNEL);
5122 if (!nn->lockowner_ino_hashtbl)
5123 goto err_lockowner_ino;
5124 nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) * 6356 nn->sessionid_hashtbl = kmalloc(sizeof(struct list_head) *
5125 SESSION_HASH_SIZE, GFP_KERNEL); 6357 SESSION_HASH_SIZE, GFP_KERNEL);
5126 if (!nn->sessionid_hashtbl) 6358 if (!nn->sessionid_hashtbl)
@@ -5130,10 +6362,6 @@ static int nfs4_state_create_net(struct net *net)
5130 INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]); 6362 INIT_LIST_HEAD(&nn->conf_id_hashtbl[i]);
5131 INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]); 6363 INIT_LIST_HEAD(&nn->unconf_id_hashtbl[i]);
5132 } 6364 }
5133 for (i = 0; i < OWNER_HASH_SIZE; i++)
5134 INIT_LIST_HEAD(&nn->ownerstr_hashtbl[i]);
5135 for (i = 0; i < LOCKOWNER_INO_HASH_SIZE; i++)
5136 INIT_LIST_HEAD(&nn->lockowner_ino_hashtbl[i]);
5137 for (i = 0; i < SESSION_HASH_SIZE; i++) 6365 for (i = 0; i < SESSION_HASH_SIZE; i++)
5138 INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); 6366 INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]);
5139 nn->conf_name_tree = RB_ROOT; 6367 nn->conf_name_tree = RB_ROOT;
@@ -5149,10 +6377,6 @@ static int nfs4_state_create_net(struct net *net)
5149 return 0; 6377 return 0;
5150 6378
5151err_sessionid: 6379err_sessionid:
5152 kfree(nn->lockowner_ino_hashtbl);
5153err_lockowner_ino:
5154 kfree(nn->ownerstr_hashtbl);
5155err_ownerstr:
5156 kfree(nn->unconf_id_hashtbl); 6380 kfree(nn->unconf_id_hashtbl);
5157err_unconf_id: 6381err_unconf_id:
5158 kfree(nn->conf_id_hashtbl); 6382 kfree(nn->conf_id_hashtbl);
@@ -5182,8 +6406,6 @@ nfs4_state_destroy_net(struct net *net)
5182 } 6406 }
5183 6407
5184 kfree(nn->sessionid_hashtbl); 6408 kfree(nn->sessionid_hashtbl);
5185 kfree(nn->lockowner_ino_hashtbl);
5186 kfree(nn->ownerstr_hashtbl);
5187 kfree(nn->unconf_id_hashtbl); 6409 kfree(nn->unconf_id_hashtbl);
5188 kfree(nn->conf_id_hashtbl); 6410 kfree(nn->conf_id_hashtbl);
5189 put_net(net); 6411 put_net(net);
@@ -5198,10 +6420,10 @@ nfs4_state_start_net(struct net *net)
5198 ret = nfs4_state_create_net(net); 6420 ret = nfs4_state_create_net(net);
5199 if (ret) 6421 if (ret)
5200 return ret; 6422 return ret;
5201 nfsd4_client_tracking_init(net);
5202 nn->boot_time = get_seconds(); 6423 nn->boot_time = get_seconds();
5203 locks_start_grace(net, &nn->nfsd4_manager);
5204 nn->grace_ended = false; 6424 nn->grace_ended = false;
6425 locks_start_grace(net, &nn->nfsd4_manager);
6426 nfsd4_client_tracking_init(net);
5205 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", 6427 printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n",
5206 nn->nfsd4_grace, net); 6428 nn->nfsd4_grace, net);
5207 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); 6429 queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ);
@@ -5247,22 +6469,23 @@ nfs4_state_shutdown_net(struct net *net)
5247 cancel_delayed_work_sync(&nn->laundromat_work); 6469 cancel_delayed_work_sync(&nn->laundromat_work);
5248 locks_end_grace(&nn->nfsd4_manager); 6470 locks_end_grace(&nn->nfsd4_manager);
5249 6471
5250 nfs4_lock_state();
5251 INIT_LIST_HEAD(&reaplist); 6472 INIT_LIST_HEAD(&reaplist);
5252 spin_lock(&state_lock); 6473 spin_lock(&state_lock);
5253 list_for_each_safe(pos, next, &nn->del_recall_lru) { 6474 list_for_each_safe(pos, next, &nn->del_recall_lru) {
5254 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 6475 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
5255 list_move(&dp->dl_recall_lru, &reaplist); 6476 unhash_delegation_locked(dp);
6477 list_add(&dp->dl_recall_lru, &reaplist);
5256 } 6478 }
5257 spin_unlock(&state_lock); 6479 spin_unlock(&state_lock);
5258 list_for_each_safe(pos, next, &reaplist) { 6480 list_for_each_safe(pos, next, &reaplist) {
5259 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); 6481 dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
5260 destroy_delegation(dp); 6482 list_del_init(&dp->dl_recall_lru);
6483 nfs4_put_deleg_lease(dp->dl_stid.sc_file);
6484 nfs4_put_stid(&dp->dl_stid);
5261 } 6485 }
5262 6486
5263 nfsd4_client_tracking_exit(net); 6487 nfsd4_client_tracking_exit(net);
5264 nfs4_state_destroy_net(net); 6488 nfs4_state_destroy_net(net);
5265 nfs4_unlock_state();
5266} 6489}
5267 6490
5268void 6491void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 944275c8f56d..eeea7a90eb87 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -31,13 +31,6 @@
31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 *
35 * TODO: Neil Brown made the following observation: We currently
36 * initially reserve NFSD_BUFSIZE space on the transmit queue and
37 * never release any of that until the request is complete.
38 * It would be good to calculate a new maximum response size while
39 * decoding the COMPOUND, and call svc_reserve with this number
40 * at the end of nfs4svc_decode_compoundargs.
41 */ 34 */
42 35
43#include <linux/slab.h> 36#include <linux/slab.h>
@@ -181,28 +174,43 @@ static int zero_clientid(clientid_t *clid)
181} 174}
182 175
183/** 176/**
184 * defer_free - mark an allocation as deferred freed 177 * svcxdr_tmpalloc - allocate memory to be freed after compound processing
185 * @argp: NFSv4 compound argument structure to be freed with 178 * @argp: NFSv4 compound argument structure
186 * @release: release callback to free @p, typically kfree() 179 * @p: pointer to be freed (with kfree())
187 * @p: pointer to be freed
188 * 180 *
189 * Marks @p to be freed when processing the compound operation 181 * Marks @p to be freed when processing the compound operation
190 * described in @argp finishes. 182 * described in @argp finishes.
191 */ 183 */
192static int 184static void *
193defer_free(struct nfsd4_compoundargs *argp, 185svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len)
194 void (*release)(const void *), void *p)
195{ 186{
196 struct tmpbuf *tb; 187 struct svcxdr_tmpbuf *tb;
197 188
198 tb = kmalloc(sizeof(*tb), GFP_KERNEL); 189 tb = kmalloc(sizeof(*tb) + len, GFP_KERNEL);
199 if (!tb) 190 if (!tb)
200 return -ENOMEM; 191 return NULL;
201 tb->buf = p;
202 tb->release = release;
203 tb->next = argp->to_free; 192 tb->next = argp->to_free;
204 argp->to_free = tb; 193 argp->to_free = tb;
205 return 0; 194 return tb->buf;
195}
196
197/*
198 * For xdr strings that need to be passed to other kernel api's
199 * as null-terminated strings.
200 *
201 * Note null-terminating in place usually isn't safe since the
202 * buffer might end on a page boundary.
203 */
204static char *
205svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
206{
207 char *p = svcxdr_tmpalloc(argp, len + 1);
208
209 if (!p)
210 return NULL;
211 memcpy(p, buf, len);
212 p[len] = '\0';
213 return p;
206} 214}
207 215
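svcxdr_tmpalloc() can hand back tb->buf directly because header and payload share a single kmalloc(); the buffer is presumably a flexible array member at the tail of struct svcxdr_tmpbuf. A standalone sketch of the pattern (struct layout assumed, not copied from nfsd):

#include <stdlib.h>
#include <string.h>

struct tmpbuf {
	struct tmpbuf *next;	/* chain for deferred freeing */
	char buf[];		/* payload lives in the same allocation */
};

static struct tmpbuf *to_free;

static void *tmpalloc(size_t len)
{
	struct tmpbuf *tb = malloc(sizeof(*tb) + len);

	if (!tb)
		return NULL;
	tb->next = to_free;
	to_free = tb;
	return tb->buf;
}

/* One pass at end-of-request frees every temporary at once. */
static void tmpfree_all(void)
{
	while (to_free) {
		struct tmpbuf *tb = to_free;

		to_free = tb->next;
		free(tb);
	}
}

int main(void)
{
	char *s = tmpalloc(6);

	if (s)
		memcpy(s, "hello", 6);
	tmpfree_all();
	return 0;
}

Compared with the old defer_free(), this saves one allocation per temporary and drops the per-buffer release callback, since every buffer is now freed the same way.
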
208/** 216/**
@@ -217,19 +225,13 @@ defer_free(struct nfsd4_compoundargs *argp,
217 */ 225 */
218static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes) 226static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
219{ 227{
220 if (p == argp->tmp) { 228 void *ret;
221 p = kmemdup(argp->tmp, nbytes, GFP_KERNEL); 229
222 if (!p) 230 ret = svcxdr_tmpalloc(argp, nbytes);
223 return NULL; 231 if (!ret)
224 } else {
225 BUG_ON(p != argp->tmpp);
226 argp->tmpp = NULL;
227 }
228 if (defer_free(argp, kfree, p)) {
229 kfree(p);
230 return NULL; 232 return NULL;
231 } else 233 memcpy(ret, p, nbytes);
232 return (char *)p; 234 return ret;
233} 235}
234 236
235static __be32 237static __be32
@@ -292,12 +294,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
292 if (nace > NFS4_ACL_MAX) 294 if (nace > NFS4_ACL_MAX)
293 return nfserr_fbig; 295 return nfserr_fbig;
294 296
295 *acl = nfs4_acl_new(nace); 297 *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
296 if (*acl == NULL) 298 if (*acl == NULL)
297 return nfserr_jukebox; 299 return nfserr_jukebox;
298 300
299 defer_free(argp, kfree, *acl);
300
301 (*acl)->naces = nace; 301 (*acl)->naces = nace;
302 for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) { 302 for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
303 READ_BUF(16); len += 16; 303 READ_BUF(16); len += 16;
@@ -418,12 +418,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
418 return nfserr_badlabel; 418 return nfserr_badlabel;
419 len += (XDR_QUADLEN(dummy32) << 2); 419 len += (XDR_QUADLEN(dummy32) << 2);
420 READMEM(buf, dummy32); 420 READMEM(buf, dummy32);
421 label->data = kzalloc(dummy32 + 1, GFP_KERNEL); 421 label->len = dummy32;
422 label->data = svcxdr_dupstr(argp, buf, dummy32);
422 if (!label->data) 423 if (!label->data)
423 return nfserr_jukebox; 424 return nfserr_jukebox;
424 label->len = dummy32;
425 defer_free(argp, kfree, label->data);
426 memcpy(label->data, buf, dummy32);
427 } 425 }
428#endif 426#endif
429 427
@@ -598,20 +596,11 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
598 switch (create->cr_type) { 596 switch (create->cr_type) {
599 case NF4LNK: 597 case NF4LNK:
600 READ_BUF(4); 598 READ_BUF(4);
601 create->cr_linklen = be32_to_cpup(p++); 599 create->cr_datalen = be32_to_cpup(p++);
602 READ_BUF(create->cr_linklen); 600 READ_BUF(create->cr_datalen);
603 /* 601 create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
604 * The VFS will want a null-terminated string, and 602 if (!create->cr_data)
605 * null-terminating in place isn't safe since this might
606 * end on a page boundary:
607 */
608 create->cr_linkname =
609 kmalloc(create->cr_linklen + 1, GFP_KERNEL);
610 if (!create->cr_linkname)
611 return nfserr_jukebox; 603 return nfserr_jukebox;
612 memcpy(create->cr_linkname, p, create->cr_linklen);
613 create->cr_linkname[create->cr_linklen] = '\0';
614 defer_free(argp, kfree, create->cr_linkname);
615 break; 604 break;
616 case NF4BLK: 605 case NF4BLK:
617 case NF4CHR: 606 case NF4CHR:
@@ -1481,13 +1470,12 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
1481 INIT_LIST_HEAD(&test_stateid->ts_stateid_list); 1470 INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
1482 1471
1483 for (i = 0; i < test_stateid->ts_num_ids; i++) { 1472 for (i = 0; i < test_stateid->ts_num_ids; i++) {
1484 stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL); 1473 stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
1485 if (!stateid) { 1474 if (!stateid) {
1486 status = nfserrno(-ENOMEM); 1475 status = nfserrno(-ENOMEM);
1487 goto out; 1476 goto out;
1488 } 1477 }
1489 1478
1490 defer_free(argp, kfree, stateid);
1491 INIT_LIST_HEAD(&stateid->ts_id_list); 1479 INIT_LIST_HEAD(&stateid->ts_id_list);
1492 list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list); 1480 list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
1493 1481
@@ -1526,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1526} 1514}
1527 1515
1528static __be32 1516static __be32
1517nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
1518{
1519 DECODE_HEAD;
1520
1521 status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
1522 if (status)
1523 return status;
1524
1525 READ_BUF(8 + 4);
1526 p = xdr_decode_hyper(p, &seek->seek_offset);
1527 seek->seek_whence = be32_to_cpup(p);
1528
1529 DECODE_TAIL;
1530}
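
The SEEK arguments decoded above are fixed-size XDR: a 16-byte stateid, an 8-byte big-endian offset, and a 4-byte whence, which is why a single READ_BUF(8 + 4) suffices after the stateid. A hedged userspace decoder for the tail, mirroring what xdr_decode_hyper() and be32_to_cpup() do:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode the 12 bytes that follow the stateid in a SEEK request. */
static void decode_seek_tail(const unsigned char *p,
			     uint64_t *offset, uint32_t *whence)
{
	uint32_t hi, lo;

	memcpy(&hi, p, 4);	/* an XDR hyper is two big-endian words */
	memcpy(&lo, p + 4, 4);
	*offset = ((uint64_t)ntohl(hi) << 32) | ntohl(lo);
	memcpy(whence, p + 8, 4);
	*whence = ntohl(*whence);
}

int main(void)
{
	/* offset 0x1234, whence 1, as they would appear on the wire */
	unsigned char wire[12] = { 0,0,0,0, 0,0,0x12,0x34, 0,0,0,1 };
	uint64_t off;
	uint32_t whence;

	decode_seek_tail(wire, &off, &whence);
	printf("offset=%llu whence=%u\n", (unsigned long long)off, whence);
	return 0;
}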
1531
1532static __be32
1529nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) 1533nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1530{ 1534{
1531 return nfs_ok; 1535 return nfs_ok;
@@ -1598,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1598 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1602 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1599 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, 1603 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
1600 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, 1604 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
1605
1606 /* new operations for NFSv4.2 */
1607 [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1608 [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp,
1609 [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp,
1610 [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp,
1615 [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1616 [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp,
1617 [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
1618 [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
1601}; 1619};
1602 1620
1603static inline bool 1621static inline bool
@@ -1640,7 +1658,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1640 goto xdr_error; 1658 goto xdr_error;
1641 1659
1642 if (argp->opcnt > ARRAY_SIZE(argp->iops)) { 1660 if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
1643 argp->ops = kmalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL); 1661 argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
1644 if (!argp->ops) { 1662 if (!argp->ops) {
1645 argp->ops = argp->iops; 1663 argp->ops = argp->iops;
1646 dprintk("nfsd: couldn't allocate room for COMPOUND\n"); 1664 dprintk("nfsd: couldn't allocate room for COMPOUND\n");
@@ -1675,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
1675 readbytes += nfsd4_max_reply(argp->rqstp, op); 1693 readbytes += nfsd4_max_reply(argp->rqstp, op);
1676 } else 1694 } else
1677 max_reply += nfsd4_max_reply(argp->rqstp, op); 1695 max_reply += nfsd4_max_reply(argp->rqstp, op);
1696 /*
1697 * OP_LOCK may return a conflicting lock. (Special case
1698 * because it will just skip encoding this if it runs
1699 * out of xdr buffer space, and it is the only operation
1700 * that behaves this way.)
1701 */
1702 if (op->opnum == OP_LOCK)
1703 max_reply += NFS4_OPAQUE_LIMIT;
1678 1704
1679 if (op->status) { 1705 if (op->status) {
1680 argp->opcnt = i+1; 1706 argp->opcnt = i+1;
@@ -2662,6 +2688,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2662 struct xdr_stream *xdr = cd->xdr; 2688 struct xdr_stream *xdr = cd->xdr;
2663 int start_offset = xdr->buf->len; 2689 int start_offset = xdr->buf->len;
2664 int cookie_offset; 2690 int cookie_offset;
2691 u32 name_and_cookie;
2665 int entry_bytes; 2692 int entry_bytes;
2666 __be32 nfserr = nfserr_toosmall; 2693 __be32 nfserr = nfserr_toosmall;
2667 __be64 wire_offset; 2694 __be64 wire_offset;
@@ -2723,7 +2750,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2723 cd->rd_maxcount -= entry_bytes; 2750 cd->rd_maxcount -= entry_bytes;
2724 if (!cd->rd_dircount) 2751 if (!cd->rd_dircount)
2725 goto fail; 2752 goto fail;
2726 cd->rd_dircount--; 2753 /*
2754 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2755 * let's always let through the first entry, at least:
2756 */
2757 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
2758 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2759 goto fail;
2760 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2727 cd->cookie_offset = cookie_offset; 2761 cd->cookie_offset = cookie_offset;
2728skip_entry: 2762skip_entry:
2729 cd->common.err = nfs_ok; 2763 cd->common.err = nfs_ok;
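
The name_and_cookie estimate charges each entry at its XDR-encoded size: the name rounds up to 4-byte quads (XDR_QUADLEN(l) is (l + 3) >> 2) plus an 8-byte cookie, so a 5-byte name costs 4 * 2 + 8 = 16 bytes of rd_dircount. The arithmetic as a standalone sketch:

#include <stdio.h>

#define XDR_QUADLEN(l)	(((l) + 3) >> 2)	/* bytes -> 4-byte quads */

/* XDR bytes charged per READDIR entry: padded name plus 8-byte cookie. */
static unsigned int entry_cost(unsigned int namlen)
{
	return 4 * XDR_QUADLEN(namlen) + 8;
}

int main(void)
{
	printf("%u\n", entry_cost(5));	/* prints 16 */
	return 0;
}
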
@@ -3077,11 +3111,8 @@ static __be32 nfsd4_encode_splice_read(
3077 __be32 nfserr; 3111 __be32 nfserr;
3078 __be32 *p = xdr->p - 2; 3112 __be32 *p = xdr->p - 2;
3079 3113
3080 /* 3114 /* Make sure there will be room for padding if needed */
3081 * Don't inline pages unless we know there's room for eof, 3115 if (xdr->end - xdr->p < 1)
3082 * count, and possible padding:
3083 */
3084 if (xdr->end - xdr->p < 3)
3085 return nfserr_resource; 3116 return nfserr_resource;
3086 3117
3087 nfserr = nfsd_splice_read(read->rd_rqstp, file, 3118 nfserr = nfsd_splice_read(read->rd_rqstp, file,
@@ -3104,7 +3135,8 @@ static __be32 nfsd4_encode_splice_read(
3104 3135
3105 buf->page_len = maxcount; 3136 buf->page_len = maxcount;
3106 buf->len += maxcount; 3137 buf->len += maxcount;
3107 xdr->page_ptr += (maxcount + PAGE_SIZE - 1) / PAGE_SIZE; 3138 xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
3139 / PAGE_SIZE;
3108 3140
3109 /* Use rest of head for padding and remaining ops: */ 3141 /* Use rest of head for padding and remaining ops: */
3110 buf->tail[0].iov_base = xdr->p; 3142 buf->tail[0].iov_base = xdr->p;
@@ -3147,9 +3179,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
3147 len = maxcount; 3179 len = maxcount;
3148 v = 0; 3180 v = 0;
3149 3181
3150 thislen = (void *)xdr->end - (void *)xdr->p; 3182 thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
3151 if (len < thislen)
3152 thislen = len;
3153 p = xdr_reserve_space(xdr, (thislen+3)&~3); 3183 p = xdr_reserve_space(xdr, (thislen+3)&~3);
3154 WARN_ON_ONCE(!p); 3184 WARN_ON_ONCE(!p);
3155 resp->rqstp->rq_vec[v].iov_base = p; 3185 resp->rqstp->rq_vec[v].iov_base = p;
@@ -3216,10 +3246,8 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
3216 xdr_commit_encode(xdr); 3246 xdr_commit_encode(xdr);
3217 3247
3218 maxcount = svc_max_payload(resp->rqstp); 3248 maxcount = svc_max_payload(resp->rqstp);
3219 if (maxcount > xdr->buf->buflen - xdr->buf->len) 3249 maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len));
3220 maxcount = xdr->buf->buflen - xdr->buf->len; 3250 maxcount = min_t(unsigned long, maxcount, read->rd_length);
3221 if (maxcount > read->rd_length)
3222 maxcount = read->rd_length;
3223 3251
3224 if (!read->rd_filp) { 3252 if (!read->rd_filp) {
3225 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, 3253 err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp,
@@ -3333,6 +3361,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
3333 } 3361 }
3334 maxcount = min_t(int, maxcount-16, bytes_left); 3362 maxcount = min_t(int, maxcount-16, bytes_left);
3335 3363
3364 /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
3365 if (!readdir->rd_dircount)
3366 readdir->rd_dircount = INT_MAX;
3367
3336 readdir->xdr = xdr; 3368 readdir->xdr = xdr;
3337 readdir->rd_maxcount = maxcount; 3369 readdir->rd_maxcount = maxcount;
3338 readdir->common.err = 0; 3370 readdir->common.err = 0;
@@ -3763,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3763} 3795}
3764 3796
3765static __be32 3797static __be32
3798nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3799 struct nfsd4_seek *seek)
3800{
3801 __be32 *p;
3802
3803 if (nfserr)
3804 return nfserr;
3805
3806 p = xdr_reserve_space(&resp->xdr, 4 + 8);
3807 *p++ = cpu_to_be32(seek->seek_eof);
3808 p = xdr_encode_hyper(p, seek->seek_pos);
3809
3810 return nfserr;
3811}
3812
3813static __be32
3766nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3814nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
3767{ 3815{
3768 return nfserr; 3816 return nfserr;
@@ -3834,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3834 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 3882 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3835 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, 3883 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3836 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, 3884 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
3885
3886 /* NFSv4.2 operations */
3887 [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3888 [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop,
3889 [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop,
3890 [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
3893 [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
3898 [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
3837}; 3899};
3838 3900
3839/* 3901/*
@@ -3937,8 +3999,6 @@ status:
3937 * 3999 *
3938 * XDR note: do not encode rp->rp_buflen: the buffer contains the 4000 * XDR note: do not encode rp->rp_buflen: the buffer contains the
3939 * previously sent already encoded operation. 4001 * previously sent already encoded operation.
3940 *
3941 * called with nfs4_lock_state() held
3942 */ 4002 */
3943void 4003void
3944nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) 4004nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
@@ -3977,9 +4037,8 @@ int nfsd4_release_compoundargs(void *rq, __be32 *p, void *resp)
3977 kfree(args->tmpp); 4037 kfree(args->tmpp);
3978 args->tmpp = NULL; 4038 args->tmpp = NULL;
3979 while (args->to_free) { 4039 while (args->to_free) {
3980 struct tmpbuf *tb = args->to_free; 4040 struct svcxdr_tmpbuf *tb = args->to_free;
3981 args->to_free = tb->next; 4041 args->to_free = tb->next;
3982 tb->release(tb->buf);
3983 kfree(tb); 4042 kfree(tb);
3984 } 4043 }
3985 return 1; 4044 return 1;
@@ -4012,7 +4071,6 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
4012 /* 4071 /*
4013 * All that remains is to write the tag and operation count... 4072 * All that remains is to write the tag and operation count...
4014 */ 4073 */
4015 struct nfsd4_compound_state *cs = &resp->cstate;
4016 struct xdr_buf *buf = resp->xdr.buf; 4074 struct xdr_buf *buf = resp->xdr.buf;
4017 4075
4018 WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len + 4076 WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
@@ -4026,19 +4084,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
4026 p += XDR_QUADLEN(resp->taglen); 4084 p += XDR_QUADLEN(resp->taglen);
4027 *p++ = htonl(resp->opcnt); 4085 *p++ = htonl(resp->opcnt);
4028 4086
4029 if (nfsd4_has_session(cs)) { 4087 nfsd4_sequence_done(resp);
4030 struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
4031 struct nfs4_client *clp = cs->session->se_client;
4032 if (cs->status != nfserr_replay_cache) {
4033 nfsd4_store_cache_entry(resp);
4034 cs->slot->sl_flags &= ~NFSD4_SLOT_INUSE;
4035 }
4036 /* Renew the clientid on success and on replay */
4037 spin_lock(&nn->client_lock);
4038 nfsd4_put_session(cs->session);
4039 spin_unlock(&nn->client_lock);
4040 put_client_renew(clp);
4041 }
4042 return 1; 4088 return 1;
4043} 4089}
4044 4090
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 6040da8830ff..122f69185ef5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -27,8 +27,12 @@
27 */ 27 */
28#define TARGET_BUCKET_SIZE 64 28#define TARGET_BUCKET_SIZE 64
29 29
30static struct hlist_head * cache_hash; 30struct nfsd_drc_bucket {
31static struct list_head lru_head; 31 struct list_head lru_head;
32 spinlock_t cache_lock;
33};
34
35static struct nfsd_drc_bucket *drc_hashtbl;
32static struct kmem_cache *drc_slab; 36static struct kmem_cache *drc_slab;
33 37
34/* max number of entries allowed in the cache */ 38/* max number of entries allowed in the cache */
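
Splitting the one cache_lock into a lock per nfsd_drc_bucket means two requests contend only when their XIDs hash to the same bucket. A standalone sketch of the sharding idea (the hash constant is a 32-bit golden-ratio multiplier in the spirit of the kernel's hash_32(); details assumed, not copied):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_BITS	7
#define HASH_SIZE	(1U << HASH_BITS)

struct bucket {
	pthread_mutex_t lock;	/* replaces the single global cache_lock */
	/* the per-bucket LRU list head would live here */
};

static struct bucket table[HASH_SIZE];

/* Multiplicative hash: spreads sequential XIDs across buckets. */
static uint32_t hash32(uint32_t val)
{
	return (val * 0x61C88647U) >> (32 - HASH_BITS);
}

static struct bucket *bucket_for_xid(uint32_t xid)
{
	return &table[hash32(xid)];
}

int main(void)
{
	for (unsigned int i = 0; i < HASH_SIZE; i++)
		pthread_mutex_init(&table[i].lock, NULL);

	struct bucket *b = bucket_for_xid(0xdeadbeef);

	pthread_mutex_lock(&b->lock);	/* contends only within one bucket */
	printf("bucket %ld\n", (long)(b - table));
	pthread_mutex_unlock(&b->lock);
	return 0;
}

The cost is that global counters can no longer hide behind one lock, which is why num_drc_entries becomes an atomic_t in this patch.
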
@@ -36,6 +40,7 @@ static unsigned int max_drc_entries;
36 40
37/* number of significant bits in the hash value */ 41/* number of significant bits in the hash value */
38static unsigned int maskbits; 42static unsigned int maskbits;
43static unsigned int drc_hashsize;
39 44
40/* 45/*
41 * Stats and other tracking of the duplicate reply cache. All of these and 46 * Stats and other tracking of the duplicate reply cache. All of these and
@@ -43,7 +48,7 @@ static unsigned int maskbits;
43 */ 48 */
44 49
45/* total number of entries */ 50/* total number of entries */
46static unsigned int num_drc_entries; 51static atomic_t num_drc_entries;
47 52
48/* cache misses due only to checksum comparison failures */ 53/* cache misses due only to checksum comparison failures */
49static unsigned int payload_misses; 54static unsigned int payload_misses;
@@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
75 * A cache entry is "single use" if c_state == RC_INPROG 80 * A cache entry is "single use" if c_state == RC_INPROG
76 * Otherwise, when accessing _prev or _next, the lock must be held. 81 * Otherwise, when accessing _prev or _next, the lock must be held.
77 */ 82 */
78static DEFINE_SPINLOCK(cache_lock);
79static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); 83static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
80 84
81/* 85/*
@@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit)
116 return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); 120 return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
117} 121}
118 122
123static u32
124nfsd_cache_hash(__be32 xid)
125{
126 return hash_32(be32_to_cpu(xid), maskbits);
127}
128
119static struct svc_cacherep * 129static struct svc_cacherep *
120nfsd_reply_cache_alloc(void) 130nfsd_reply_cache_alloc(void)
121{ 131{
@@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void)
126 rp->c_state = RC_UNUSED; 136 rp->c_state = RC_UNUSED;
127 rp->c_type = RC_NOCACHE; 137 rp->c_type = RC_NOCACHE;
128 INIT_LIST_HEAD(&rp->c_lru); 138 INIT_LIST_HEAD(&rp->c_lru);
129 INIT_HLIST_NODE(&rp->c_hash);
130 } 139 }
131 return rp; 140 return rp;
132} 141}
@@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
138 drc_mem_usage -= rp->c_replvec.iov_len; 147 drc_mem_usage -= rp->c_replvec.iov_len;
139 kfree(rp->c_replvec.iov_base); 148 kfree(rp->c_replvec.iov_base);
140 } 149 }
141 if (!hlist_unhashed(&rp->c_hash))
142 hlist_del(&rp->c_hash);
143 list_del(&rp->c_lru); 150 list_del(&rp->c_lru);
144 --num_drc_entries; 151 atomic_dec(&num_drc_entries);
145 drc_mem_usage -= sizeof(*rp); 152 drc_mem_usage -= sizeof(*rp);
146 kmem_cache_free(drc_slab, rp); 153 kmem_cache_free(drc_slab, rp);
147} 154}
148 155
149static void 156static void
150nfsd_reply_cache_free(struct svc_cacherep *rp) 157nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
151{ 158{
152 spin_lock(&cache_lock); 159 spin_lock(&b->cache_lock);
153 nfsd_reply_cache_free_locked(rp); 160 nfsd_reply_cache_free_locked(rp);
154 spin_unlock(&cache_lock); 161 spin_unlock(&b->cache_lock);
155} 162}
156 163
157int nfsd_reply_cache_init(void) 164int nfsd_reply_cache_init(void)
158{ 165{
159 unsigned int hashsize; 166 unsigned int hashsize;
167 unsigned int i;
160 168
161 INIT_LIST_HEAD(&lru_head);
162 max_drc_entries = nfsd_cache_size_limit(); 169 max_drc_entries = nfsd_cache_size_limit();
163 num_drc_entries = 0; 170 atomic_set(&num_drc_entries, 0);
164 hashsize = nfsd_hashsize(max_drc_entries); 171 hashsize = nfsd_hashsize(max_drc_entries);
165 maskbits = ilog2(hashsize); 172 maskbits = ilog2(hashsize);
166 173
@@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void)
170 if (!drc_slab) 177 if (!drc_slab)
171 goto out_nomem; 178 goto out_nomem;
172 179
173 cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL); 180 drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL);
174 if (!cache_hash) 181 if (!drc_hashtbl)
175 goto out_nomem; 182 goto out_nomem;
183 for (i = 0; i < hashsize; i++) {
184 INIT_LIST_HEAD(&drc_hashtbl[i].lru_head);
185 spin_lock_init(&drc_hashtbl[i].cache_lock);
186 }
187 drc_hashsize = hashsize;
176 188
177 return 0; 189 return 0;
178out_nomem: 190out_nomem:
@@ -184,17 +196,22 @@ out_nomem:
184void nfsd_reply_cache_shutdown(void) 196void nfsd_reply_cache_shutdown(void)
185{ 197{
186 struct svc_cacherep *rp; 198 struct svc_cacherep *rp;
199 unsigned int i;
187 200
188 unregister_shrinker(&nfsd_reply_cache_shrinker); 201 unregister_shrinker(&nfsd_reply_cache_shrinker);
189 cancel_delayed_work_sync(&cache_cleaner); 202 cancel_delayed_work_sync(&cache_cleaner);
190 203
191 while (!list_empty(&lru_head)) { 204 for (i = 0; i < drc_hashsize; i++) {
192 rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); 205 struct list_head *head = &drc_hashtbl[i].lru_head;
193 nfsd_reply_cache_free_locked(rp); 206 while (!list_empty(head)) {
207 rp = list_first_entry(head, struct svc_cacherep, c_lru);
208 nfsd_reply_cache_free_locked(rp);
209 }
194 } 210 }
195 211
196 kfree (cache_hash); 212 kfree (drc_hashtbl);
197 cache_hash = NULL; 213 drc_hashtbl = NULL;
214 drc_hashsize = 0;
198 215
199 if (drc_slab) { 216 if (drc_slab) {
200 kmem_cache_destroy(drc_slab); 217 kmem_cache_destroy(drc_slab);
@@ -207,56 +224,63 @@ void nfsd_reply_cache_shutdown(void)
207 * not already scheduled. 224 * not already scheduled.
208 */ 225 */
209static void 226static void
210lru_put_end(struct svc_cacherep *rp) 227lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
211{ 228{
212 rp->c_timestamp = jiffies; 229 rp->c_timestamp = jiffies;
213 list_move_tail(&rp->c_lru, &lru_head); 230 list_move_tail(&rp->c_lru, &b->lru_head);
214 schedule_delayed_work(&cache_cleaner, RC_EXPIRE); 231 schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
215} 232}
216 233
217/*
218 * Move a cache entry from one hash list to another
219 */
220static void
221hash_refile(struct svc_cacherep *rp)
222{
223 hlist_del_init(&rp->c_hash);
224 hlist_add_head(&rp->c_hash, cache_hash + hash_32(rp->c_xid, maskbits));
225}
226
227/*
228 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
229 * Also prune the oldest ones when the total exceeds the max number of entries.
230 */
231static long 234static long
232prune_cache_entries(void) 235prune_bucket(struct nfsd_drc_bucket *b)
233{ 236{
234 struct svc_cacherep *rp, *tmp; 237 struct svc_cacherep *rp, *tmp;
235 long freed = 0; 238 long freed = 0;
236 239
237 list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { 240 list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) {
238 /* 241 /*
239 * Don't free entries attached to calls that are still 242 * Don't free entries attached to calls that are still
240 * in-progress, but do keep scanning the list. 243 * in-progress, but do keep scanning the list.
241 */ 244 */
242 if (rp->c_state == RC_INPROG) 245 if (rp->c_state == RC_INPROG)
243 continue; 246 continue;
244 if (num_drc_entries <= max_drc_entries && 247 if (atomic_read(&num_drc_entries) <= max_drc_entries &&
245 time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) 248 time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
246 break; 249 break;
247 nfsd_reply_cache_free_locked(rp); 250 nfsd_reply_cache_free_locked(rp);
248 freed++; 251 freed++;
249 } 252 }
253 return freed;
254}
255
256/*
257 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
258 * Also prune the oldest ones when the total exceeds the max number of entries.
259 */
260static long
261prune_cache_entries(void)
262{
263 unsigned int i;
264 long freed = 0;
265 bool cancel = true;
266
267 for (i = 0; i < drc_hashsize; i++) {
268 struct nfsd_drc_bucket *b = &drc_hashtbl[i];
269
270 if (list_empty(&b->lru_head))
271 continue;
272 spin_lock(&b->cache_lock);
273 freed += prune_bucket(b);
274 if (!list_empty(&b->lru_head))
275 cancel = false;
276 spin_unlock(&b->cache_lock);
277 }
250 278
251 /* 279 /*
252 * Conditionally rearm the job. If we cleaned out the list, then 280 * Conditionally rearm the job to run in RC_EXPIRE since we just
253 * cancel any pending run (since there won't be any work to do). 281 * ran the pruner.
254 * Otherwise, we rearm the job or modify the existing one to run in
255 * RC_EXPIRE since we just ran the pruner.
256 */ 282 */
257 if (list_empty(&lru_head)) 283 if (!cancel)
258 cancel_delayed_work(&cache_cleaner);
259 else
260 mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); 284 mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
261 return freed; 285 return freed;
262} 286}
@@ -264,32 +288,19 @@ prune_cache_entries(void)
264static void 288static void
265cache_cleaner_func(struct work_struct *unused) 289cache_cleaner_func(struct work_struct *unused)
266{ 290{
267 spin_lock(&cache_lock);
268 prune_cache_entries(); 291 prune_cache_entries();
269 spin_unlock(&cache_lock);
270} 292}
271 293
272static unsigned long 294static unsigned long
273nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) 295nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
274{ 296{
275 unsigned long num; 297 return atomic_read(&num_drc_entries);
276
277 spin_lock(&cache_lock);
278 num = num_drc_entries;
279 spin_unlock(&cache_lock);
280
281 return num;
282} 298}
283 299
284static unsigned long 300static unsigned long
285nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) 301nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
286{ 302{
287 unsigned long freed; 303 return prune_cache_entries();
288
289 spin_lock(&cache_lock);
290 freed = prune_cache_entries();
291 spin_unlock(&cache_lock);
292 return freed;
293} 304}
294/* 305/*
295 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes 306 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
@@ -327,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp)
327static bool 338static bool
328nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) 339nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
329{ 340{
330 /* Check RPC header info first */ 341 /* Check RPC XID first */
331 if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc || 342 if (rqstp->rq_xid != rp->c_xid)
332 rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
333 rqstp->rq_arg.len != rp->c_len ||
334 !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
335 rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
336 return false; 343 return false;
337
338 /* compare checksum of NFS data */ 344 /* compare checksum of NFS data */
339 if (csum != rp->c_csum) { 345 if (csum != rp->c_csum) {
340 ++payload_misses; 346 ++payload_misses;
341 return false; 347 return false;
342 } 348 }
343 349
350 /* Other discriminators */
351 if (rqstp->rq_proc != rp->c_proc ||
352 rqstp->rq_prot != rp->c_prot ||
353 rqstp->rq_vers != rp->c_vers ||
354 rqstp->rq_arg.len != rp->c_len ||
355 !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
356 rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
357 return false;
358
344 return true; 359 return true;
345} 360}
346 361
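Reordering nfsd_cache_match() is a selectivity argument: within a bucket chosen by XID hash, a non-matching entry almost always differs in XID or payload checksum, so the cheaper and more discriminating tests now run first. Roughly (a generic sketch, not nfsd's structures):

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct key {
	uint32_t xid;
	uint32_t csum;
	uint32_t proc;
	unsigned char addr[16];
};

/* Order tests by selectivity: the XID rejects nearly every non-match,
 * the checksum most of the rest; the memcmp() rarely runs at all. */
static bool key_matches(const struct key *a, const struct key *b)
{
	if (a->xid != b->xid)
		return false;
	if (a->csum != b->csum)
		return false;
	if (a->proc != b->proc)
		return false;
	return memcmp(a->addr, b->addr, sizeof(a->addr)) == 0;
}

int main(void)
{
	struct key a = { .xid = 7, .csum = 9, .proc = 1, .addr = {0} };
	struct key b = a;

	return key_matches(&a, &b) ? 0 : 1;
}
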
@@ -350,14 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
350 * NULL on failure. 365 * NULL on failure.
351 */ 366 */
352static struct svc_cacherep * 367static struct svc_cacherep *
353nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) 368nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp,
369 __wsum csum)
354{ 370{
355 struct svc_cacherep *rp, *ret = NULL; 371 struct svc_cacherep *rp, *ret = NULL;
356 struct hlist_head *rh; 372 struct list_head *rh = &b->lru_head;
357 unsigned int entries = 0; 373 unsigned int entries = 0;
358 374
359 rh = &cache_hash[hash_32(rqstp->rq_xid, maskbits)]; 375 list_for_each_entry(rp, rh, c_lru) {
360 hlist_for_each_entry(rp, rh, c_hash) {
361 ++entries; 376 ++entries;
362 if (nfsd_cache_match(rqstp, csum, rp)) { 377 if (nfsd_cache_match(rqstp, csum, rp)) {
363 ret = rp; 378 ret = rp;
@@ -368,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
368 /* tally hash chain length stats */ 383 /* tally hash chain length stats */
369 if (entries > longest_chain) { 384 if (entries > longest_chain) {
370 longest_chain = entries; 385 longest_chain = entries;
371 longest_chain_cachesize = num_drc_entries; 386 longest_chain_cachesize = atomic_read(&num_drc_entries);
372 } else if (entries == longest_chain) { 387 } else if (entries == longest_chain) {
373 /* prefer to keep the smallest cachesize possible here */ 388 /* prefer to keep the smallest cachesize possible here */
374 longest_chain_cachesize = min(longest_chain_cachesize, 389 longest_chain_cachesize = min_t(unsigned int,
375 num_drc_entries); 390 longest_chain_cachesize,
391 atomic_read(&num_drc_entries));
376 } 392 }
377 393
378 return ret; 394 return ret;
@@ -394,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
394 vers = rqstp->rq_vers, 410 vers = rqstp->rq_vers,
395 proc = rqstp->rq_proc; 411 proc = rqstp->rq_proc;
396 __wsum csum; 412 __wsum csum;
413 u32 hash = nfsd_cache_hash(xid);
414 struct nfsd_drc_bucket *b = &drc_hashtbl[hash];
397 unsigned long age; 415 unsigned long age;
398 int type = rqstp->rq_cachetype; 416 int type = rqstp->rq_cachetype;
399 int rtn = RC_DOIT; 417 int rtn = RC_DOIT;
@@ -411,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
411 * preallocate an entry. 429 * preallocate an entry.
412 */ 430 */
413 rp = nfsd_reply_cache_alloc(); 431 rp = nfsd_reply_cache_alloc();
414 spin_lock(&cache_lock); 432 spin_lock(&b->cache_lock);
415 if (likely(rp)) { 433 if (likely(rp)) {
416 ++num_drc_entries; 434 atomic_inc(&num_drc_entries);
417 drc_mem_usage += sizeof(*rp); 435 drc_mem_usage += sizeof(*rp);
418 } 436 }
419 437
420 /* go ahead and prune the cache */ 438 /* go ahead and prune the cache */
421 prune_cache_entries(); 439 prune_bucket(b);
422 440
423 found = nfsd_cache_search(rqstp, csum); 441 found = nfsd_cache_search(b, rqstp, csum);
424 if (found) { 442 if (found) {
425 if (likely(rp)) 443 if (likely(rp))
426 nfsd_reply_cache_free_locked(rp); 444 nfsd_reply_cache_free_locked(rp);
@@ -445,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
445 rp->c_len = rqstp->rq_arg.len; 463 rp->c_len = rqstp->rq_arg.len;
446 rp->c_csum = csum; 464 rp->c_csum = csum;
447 465
448 hash_refile(rp); 466 lru_put_end(b, rp);
449 lru_put_end(rp);
450 467
451 /* release any buffer */ 468 /* release any buffer */
452 if (rp->c_type == RC_REPLBUFF) { 469 if (rp->c_type == RC_REPLBUFF) {
@@ -456,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
456 } 473 }
457 rp->c_type = RC_NOCACHE; 474 rp->c_type = RC_NOCACHE;
458 out: 475 out:
459 spin_unlock(&cache_lock); 476 spin_unlock(&b->cache_lock);
460 return rtn; 477 return rtn;
461 478
462found_entry: 479found_entry:
463 nfsdstats.rchits++; 480 nfsdstats.rchits++;
464 /* We found a matching entry which is either in progress or done. */ 481 /* We found a matching entry which is either in progress or done. */
465 age = jiffies - rp->c_timestamp; 482 age = jiffies - rp->c_timestamp;
466 lru_put_end(rp); 483 lru_put_end(b, rp);
467 484
468 rtn = RC_DROPIT; 485 rtn = RC_DROPIT;
469 /* Request being processed or excessive rexmits */ 486 /* Request being processed or excessive rexmits */
@@ -518,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
518{ 535{
519 struct svc_cacherep *rp = rqstp->rq_cacherep; 536 struct svc_cacherep *rp = rqstp->rq_cacherep;
520 struct kvec *resv = &rqstp->rq_res.head[0], *cachv; 537 struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
538 u32 hash;
539 struct nfsd_drc_bucket *b;
521 int len; 540 int len;
522 size_t bufsize = 0; 541 size_t bufsize = 0;
523 542
524 if (!rp) 543 if (!rp)
525 return; 544 return;
526 545
546 hash = nfsd_cache_hash(rp->c_xid);
547 b = &drc_hashtbl[hash];
548
527 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); 549 len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
528 len >>= 2; 550 len >>= 2;
529 551
530 /* Don't cache excessive amounts of data and XDR failures */ 552 /* Don't cache excessive amounts of data and XDR failures */
531 if (!statp || len > (256 >> 2)) { 553 if (!statp || len > (256 >> 2)) {
532 nfsd_reply_cache_free(rp); 554 nfsd_reply_cache_free(b, rp);
533 return; 555 return;
534 } 556 }
535 557
@@ -544,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
544 bufsize = len << 2; 566 bufsize = len << 2;
545 cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); 567 cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
546 if (!cachv->iov_base) { 568 if (!cachv->iov_base) {
547 nfsd_reply_cache_free(rp); 569 nfsd_reply_cache_free(b, rp);
548 return; 570 return;
549 } 571 }
550 cachv->iov_len = bufsize; 572 cachv->iov_len = bufsize;
551 memcpy(cachv->iov_base, statp, bufsize); 573 memcpy(cachv->iov_base, statp, bufsize);
552 break; 574 break;
553 case RC_NOCACHE: 575 case RC_NOCACHE:
554 nfsd_reply_cache_free(rp); 576 nfsd_reply_cache_free(b, rp);
555 return; 577 return;
556 } 578 }
557 spin_lock(&cache_lock); 579 spin_lock(&b->cache_lock);
558 drc_mem_usage += bufsize; 580 drc_mem_usage += bufsize;
559 lru_put_end(rp); 581 lru_put_end(b, rp);
560 rp->c_secure = rqstp->rq_secure; 582 rp->c_secure = rqstp->rq_secure;
561 rp->c_type = cachetype; 583 rp->c_type = cachetype;
562 rp->c_state = RC_DONE; 584 rp->c_state = RC_DONE;
563 spin_unlock(&cache_lock); 585 spin_unlock(&b->cache_lock);
564 return; 586 return;
565} 587}
566 588
@@ -591,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
591 */ 613 */
592static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) 614static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
593{ 615{
594 spin_lock(&cache_lock);
595 seq_printf(m, "max entries: %u\n", max_drc_entries); 616 seq_printf(m, "max entries: %u\n", max_drc_entries);
596 seq_printf(m, "num entries: %u\n", num_drc_entries); 617 seq_printf(m, "num entries: %u\n",
618 atomic_read(&num_drc_entries));
597 seq_printf(m, "hash buckets: %u\n", 1 << maskbits); 619 seq_printf(m, "hash buckets: %u\n", 1 << maskbits);
598 seq_printf(m, "mem usage: %u\n", drc_mem_usage); 620 seq_printf(m, "mem usage: %u\n", drc_mem_usage);
599 seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); 621 seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
@@ -602,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
602 seq_printf(m, "payload misses: %u\n", payload_misses); 624 seq_printf(m, "payload misses: %u\n", payload_misses);
603 seq_printf(m, "longest chain len: %u\n", longest_chain); 625 seq_printf(m, "longest chain len: %u\n", longest_chain);
604 seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); 626 seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize);
605 spin_unlock(&cache_lock);
606 return 0; 627 return 0;
607} 628}
608 629
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 51844048937f..ca73ca79a0ee 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -39,6 +39,7 @@ enum {
39 NFSD_Versions, 39 NFSD_Versions,
40 NFSD_Ports, 40 NFSD_Ports,
41 NFSD_MaxBlkSize, 41 NFSD_MaxBlkSize,
42 NFSD_MaxConnections,
42 NFSD_SupportedEnctypes, 43 NFSD_SupportedEnctypes,
43 /* 44 /*
44 * The below MUST come last. Otherwise we leave a hole in nfsd_files[] 45 * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
@@ -48,6 +49,7 @@ enum {
48 NFSD_Leasetime, 49 NFSD_Leasetime,
49 NFSD_Gracetime, 50 NFSD_Gracetime,
50 NFSD_RecoveryDir, 51 NFSD_RecoveryDir,
52 NFSD_V4EndGrace,
51#endif 53#endif
52}; 54};
53 55
@@ -62,10 +64,12 @@ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size);
62static ssize_t write_versions(struct file *file, char *buf, size_t size); 64static ssize_t write_versions(struct file *file, char *buf, size_t size);
63static ssize_t write_ports(struct file *file, char *buf, size_t size); 65static ssize_t write_ports(struct file *file, char *buf, size_t size);
64static ssize_t write_maxblksize(struct file *file, char *buf, size_t size); 66static ssize_t write_maxblksize(struct file *file, char *buf, size_t size);
67static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
65#ifdef CONFIG_NFSD_V4 68#ifdef CONFIG_NFSD_V4
66static ssize_t write_leasetime(struct file *file, char *buf, size_t size); 69static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
67static ssize_t write_gracetime(struct file *file, char *buf, size_t size); 70static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
68static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); 71static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
72static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
69#endif 73#endif
70 74
71static ssize_t (*write_op[])(struct file *, char *, size_t) = { 75static ssize_t (*write_op[])(struct file *, char *, size_t) = {
@@ -77,10 +81,12 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
77 [NFSD_Versions] = write_versions, 81 [NFSD_Versions] = write_versions,
78 [NFSD_Ports] = write_ports, 82 [NFSD_Ports] = write_ports,
79 [NFSD_MaxBlkSize] = write_maxblksize, 83 [NFSD_MaxBlkSize] = write_maxblksize,
84 [NFSD_MaxConnections] = write_maxconn,
80#ifdef CONFIG_NFSD_V4 85#ifdef CONFIG_NFSD_V4
81 [NFSD_Leasetime] = write_leasetime, 86 [NFSD_Leasetime] = write_leasetime,
82 [NFSD_Gracetime] = write_gracetime, 87 [NFSD_Gracetime] = write_gracetime,
83 [NFSD_RecoveryDir] = write_recoverydir, 88 [NFSD_RecoveryDir] = write_recoverydir,
89 [NFSD_V4EndGrace] = write_v4_end_grace,
84#endif 90#endif
85}; 91};
86 92
@@ -369,8 +375,7 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
369 375
370 if (maxsize < NFS_FHSIZE) 376 if (maxsize < NFS_FHSIZE)
371 return -EINVAL; 377 return -EINVAL;
372 if (maxsize > NFS3_FHSIZE) 378 maxsize = min(maxsize, NFS3_FHSIZE);
373 maxsize = NFS3_FHSIZE;
374 379
375 if (qword_get(&mesg, mesg, size)>0) 380 if (qword_get(&mesg, mesg, size)>0)
376 return -EINVAL; 381 return -EINVAL;
@@ -871,10 +876,8 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
871 /* force bsize into allowed range and 876 /* force bsize into allowed range and
872 * required alignment. 877 * required alignment.
873 */ 878 */
874 if (bsize < 1024) 879 bsize = max_t(int, bsize, 1024);
875 bsize = 1024; 880 bsize = min_t(int, bsize, NFSSVC_MAXBLKSIZE);
876 if (bsize > NFSSVC_MAXBLKSIZE)
877 bsize = NFSSVC_MAXBLKSIZE;
878 bsize &= ~(1024-1); 881 bsize &= ~(1024-1);
879 mutex_lock(&nfsd_mutex); 882 mutex_lock(&nfsd_mutex);
880 if (nn->nfsd_serv) { 883 if (nn->nfsd_serv) {
@@ -889,6 +892,44 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size)
889 nfsd_max_blksize); 892 nfsd_max_blksize);
890} 893}
891 894
895/**
896 * write_maxconn - Set or report the current max number of connections
897 *
898 * Input:
899 * buf: ignored
900 * size: zero
901 * OR
902 *
903 * Input:
904 * buf: C string containing an unsigned
905 * integer value representing the new
906 * number of max connections
907 * size: non-zero length of C string in @buf
908 * Output:
909 * On success: passed-in buffer filled with '\n'-terminated C string
910 * containing numeric value of max_connections setting
911 * for this net namespace;
912 * return code is the size in bytes of the string
913 * On error: return code is zero or a negative errno value
914 */
915static ssize_t write_maxconn(struct file *file, char *buf, size_t size)
916{
917 char *mesg = buf;
918 struct net *net = file->f_dentry->d_sb->s_fs_info;
919 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
920 unsigned int maxconn = nn->max_connections;
921
922 if (size > 0) {
923 int rv = get_uint(&mesg, &maxconn);
924
925 if (rv)
926 return rv;
927 nn->max_connections = maxconn;
928 }
929
930 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%u\n", maxconn);
931}
932
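max_connections appears as a transaction file in the nfsd control filesystem, conventionally mounted at /proc/fs/nfsd (the mount point is site configuration, not guaranteed). A hedged userspace snippet that writes a new cap and reads back the reply on the same descriptor:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Assumes the nfsd filesystem is mounted at /proc/fs/nfsd. */
	const char *path = "/proc/fs/nfsd/max_connections";
	char reply[32];
	ssize_t n;
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, "128", 3) < 0) {	/* set the new cap */
		perror("write");
		return 1;
	}
	n = read(fd, reply, sizeof(reply) - 1);	/* reads back "128\n" */
	if (n > 0) {
		reply[n] = '\0';
		fputs(reply, stdout);
	}
	close(fd);
	return 0;
}
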
892#ifdef CONFIG_NFSD_V4 933#ifdef CONFIG_NFSD_V4
893static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size, 934static ssize_t __nfsd4_write_time(struct file *file, char *buf, size_t size,
894 time_t *time, struct nfsd_net *nn) 935 time_t *time, struct nfsd_net *nn)
@@ -1039,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
1039 return rv; 1080 return rv;
1040} 1081}
1041 1082
1083/**
1084 * write_v4_end_grace - release grace period for nfsd's v4.x lock manager
1085 *
1086 * Input:
1087 * buf: ignored
1088 * size: zero
1089 * OR
1090 *
1091 * Input:
1092 * buf: any value
1093 * size: non-zero length of C string in @buf
1094 * Output:
1095 * passed-in buffer filled with "Y" or "N" with a newline
1096 * and NULL-terminated C string. This indicates whether
1097 * the grace period has ended in the current net
1098 * namespace. Return code is the size in bytes of the
1099 * string. Writing a string that starts with 'Y', 'y', or
1100 * '1' to the file will end the grace period for nfsd's v4
1101 * lock manager.
1102 */
1103static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size)
1104{
1105 struct net *net = file->f_dentry->d_sb->s_fs_info;
1106 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
1107
1108 if (size > 0) {
1109 switch(buf[0]) {
1110 case 'Y':
1111 case 'y':
1112 case '1':
1113 nfsd4_end_grace(nn);
1114 break;
1115 default:
1116 return -EINVAL;
1117 }
1118 }
1119
1120 return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n",
1121 nn->grace_ended ? 'Y' : 'N');
1122}
1123
1042#endif 1124#endif
1043 1125
1044/*----------------------------------------------------------------------------*/ 1126/*----------------------------------------------------------------------------*/
@@ -1064,6 +1146,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1064 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1146 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1065 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1147 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1066 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1148 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1149 [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
1067#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) 1150#if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
1068 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, 1151 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
1069#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ 1152#endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
@@ -1071,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1071 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1154 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1072 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1155 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
1073 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, 1156 [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
1157 [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO},
1074#endif 1158#endif
1075 /* last one */ {""} 1159 /* last one */ {""}
1076 }; 1160 };
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 847daf37e566..747f3b95bd11 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -251,7 +251,7 @@ void nfsd_lockd_shutdown(void);
251#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) 251#define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED)
252#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) 252#define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP)
253#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) 253#define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH)
254#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP) 254#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP)
255#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) 255#define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED)
256#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) 256#define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS)
257#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) 257#define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL)
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index ec8393418154..88026fc6a981 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -162,7 +162,14 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		/* deprecated, convert to type 3 */
 		len = key_len(FSID_ENCODE_DEV)/4;
 		fh->fh_fsid_type = FSID_ENCODE_DEV;
-		fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1])));
+		/*
+		 * struct knfsd_fh uses host-endian fields, which are
+		 * sometimes used to hold net-endian values. This
+		 * confuses sparse, so we must use __force here to
+		 * keep it from complaining.
+		 */
+		fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+						      ntohl((__force __be32)fh->fh_fsid[1])));
 		fh->fh_fsid[1] = fh->fh_fsid[2];
 	}
 	data_left -= len;
@@ -202,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
 		 * fix that case easily.
 		 */
 		struct cred *new = prepare_creds();
-		if (!new)
-			return nfserrno(-ENOMEM);
+		if (!new) {
+			error = nfserrno(-ENOMEM);
+			goto out;
+		}
 		new->cap_effective =
 			cap_raise_nfsd_set(new->cap_effective,
 					   new->cap_permitted);
@@ -539,8 +548,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
 		dentry);
 
 	fhp->fh_dentry = dget(dentry); /* our internal copy */
-	fhp->fh_export = exp;
-	cache_get(&exp->h);
+	fhp->fh_export = exp_get(exp);
 
 	if (fhp->fh_handle.fh_version == 0xca) {
 		/* old style filehandle please */
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 2e89e70ac15c..08236d70c667 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -73,8 +73,15 @@ enum fsid_source {
 extern enum fsid_source fsid_source(struct svc_fh *fhp);
 
 
-/* This might look a little large to "inline" but in all calls except
+/*
+ * This might look a little large to "inline" but in all calls except
  * one, 'vers' is constant so most of the function disappears.
+ *
+ * In some cases the values are considered to be host endian and in
+ * others, net endian. fsidv is always considered to be u32 as the
+ * callers don't know which it will be. So we must use __force to keep
+ * sparse from complaining. Since these values are opaque to the
+ * client, that shouldn't be a problem.
  */
 static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
 			   u32 fsid, unsigned char *uuid)
@@ -82,7 +89,7 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
 	u32 *up;
 	switch(vers) {
 	case FSID_DEV:
-		fsidv[0] = htonl((MAJOR(dev)<<16) |
+		fsidv[0] = (__force __u32)htonl((MAJOR(dev)<<16) |
 				 MINOR(dev));
 		fsidv[1] = ino_t_to_u32(ino);
 		break;
@@ -90,8 +97,8 @@ static inline void mk_fsid(int vers, u32 *fsidv, dev_t dev, ino_t ino,
 		fsidv[0] = fsid;
 		break;
 	case FSID_MAJOR_MINOR:
-		fsidv[0] = htonl(MAJOR(dev));
-		fsidv[1] = htonl(MINOR(dev));
+		fsidv[0] = (__force __u32)htonl(MAJOR(dev));
+		fsidv[1] = (__force __u32)htonl(MINOR(dev));
 		fsidv[2] = ino_t_to_u32(ino);
 		break;
 
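Both mk_fsid() hunks bolt __force onto htonl() results because fsidv is declared as plain u32 while the stored values are big-endian on the wire; __force tells sparse that the endianness-breaking cast is intentional. A minimal userspace sketch of how such bitwise annotations are typically defined away outside a sparse run (my_be32 and mk_fsid_dev are invented stand-ins, not kernel API):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htonl() */

/* Outside a sparse run the annotations compile away to nothing. */
#ifdef __CHECKER__
#define __bitwise	__attribute__((bitwise))
#define __force		__attribute__((force))
#else
#define __bitwise
#define __force
#endif

typedef uint32_t __bitwise my_be32;	/* stand-in for the kernel's __be32 */

/*
 * Hypothetical analogue of mk_fsid()'s FSID_DEV case: the output array is
 * plain uint32_t because callers treat it as opaque, so the net-endian
 * value from htonl() is __force-cast back to a plain integer type.
 */
static void mk_fsid_dev(uint32_t *fsidv, uint32_t major, uint32_t minor,
			uint32_t ino)
{
	fsidv[0] = (__force uint32_t)(my_be32)htonl((major << 16) | minor);
	fsidv[1] = ino;
}

int main(void)
{
	uint32_t fsid[2];

	mk_fsid_dev(fsid, 8, 1, 4242);	/* e.g. major 8, minor 1, inode 4242 */
	printf("fsid = %08x:%08x\n", fsid[0], fsid[1]);
	return 0;
}

Built normally the attributes vanish; under sparse, dropping the __force cast would produce the same "restricted type" warning the comment above describes.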
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 54c6b3d3cc79..b8680738f588 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -403,12 +403,13 @@ nfsd_proc_symlink(struct svc_rqst *rqstp, struct nfsd_symlinkargs *argp,
 
 	fh_init(&newfh, NFS_FHSIZE);
 	/*
-	 * Create the link, look up new file and set attrs.
+	 * Crazy hack: the request fits in a page, and already-decoded
+	 * attributes follow argp->tname, so it's safe to just write a
+	 * null to ensure it's null-terminated:
 	 */
+	argp->tname[argp->tlen] = '\0';
 	nfserr = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
-						 argp->tname, argp->tlen,
-						 &newfh, &argp->attrs);
-
+						 argp->tname, &newfh);
 
 	fh_put(&argp->ffh);
 	fh_put(&newfh);
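The rewritten nfsd_proc_symlink() relies on the decoded request layout: the target string sits in a request page with further decoded data after it, so one NUL byte can be written in place instead of copying the string (which is what the removed plen/kmalloc path in nfsd_symlink used to do). A sketch of the same trick on a counted string, assuming the buffer is known to have a spare byte:

#include <stdio.h>
#include <string.h>

/*
 * Illustrative only: an XDR string arrives as <length, bytes> with no
 * terminating NUL. If the decode buffer is guaranteed to have at least
 * one spare byte after the string (as the NFSv2 symlink args do), it
 * can be terminated in place instead of being copied.
 */
static char *terminate_in_place(char *buf, size_t len)
{
	buf[len] = '\0';	/* safe only because one spare byte exists */
	return buf;
}

int main(void)
{
	char page[64];

	memcpy(page, "target-path", 11);	/* counted string, no NUL */
	printf("%s\n", terminate_in_place(page, 11));
	return 0;
}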
@@ -716,6 +717,7 @@ nfserrno (int errno)
 	{ nfserr_noent, -ENOENT },
 	{ nfserr_io, -EIO },
 	{ nfserr_nxio, -ENXIO },
+	{ nfserr_fbig, -E2BIG },
 	{ nfserr_acces, -EACCES },
 	{ nfserr_exist, -EEXIST },
 	{ nfserr_xdev, -EXDEV },
@@ -743,6 +745,7 @@ nfserrno (int errno)
 	{ nfserr_notsupp, -EOPNOTSUPP },
 	{ nfserr_toosmall, -ETOOSMALL },
 	{ nfserr_serverfault, -ESERVERFAULT },
+	{ nfserr_serverfault, -ENFILE },
 	};
 	int	i;
 
@@ -750,7 +753,7 @@ nfserrno (int errno)
 		if (nfs_errtbl[i].syserr == errno)
 			return nfs_errtbl[i].nfserr;
 	}
-	printk (KERN_INFO "nfsd: non-standard errno: %d\n", errno);
+	WARN(1, "nfsd: non-standard errno: %d\n", errno);
 	return nfserr_io;
 }
 
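nfserrno() is a table walk from negative host errnos to wire statuses, and the two added entries plus the printk-to-WARN change only alter the table and the fallback path. A compact sketch of the pattern with invented status values:

#include <errno.h>
#include <stdio.h>

/* Hypothetical wire statuses; the real ones are cpu_to_be32() constants. */
enum wire_status { W_OK = 0, W_NOENT = 2, W_IO = 5, W_ACCES = 13 };

static const struct {
	enum wire_status wire;
	int syserr;
} errtbl[] = {
	{ W_OK,    0       },
	{ W_NOENT, -ENOENT },
	{ W_IO,    -EIO    },
	{ W_ACCES, -EACCES },
};

/* Takes a negative errno, like the kernel's nfserrno(). */
static enum wire_status wire_errno(int errno_val)
{
	size_t i;

	for (i = 0; i < sizeof(errtbl) / sizeof(errtbl[0]); i++)
		if (errtbl[i].syserr == errno_val)
			return errtbl[i].wire;
	/* Unknown errno: complain, then report a generic I/O error. */
	fprintf(stderr, "non-standard errno: %d\n", errno_val);
	return W_IO;
}

int main(void)
{
	printf("%d\n", wire_errno(-ENOENT));	/* 2 */
	printf("%d\n", wire_errno(-E2BIG));	/* unknown: falls back to 5 */
	return 0;
}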
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 1879e43f2868..752d56bbe0ba 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -221,7 +221,8 @@ static int nfsd_startup_generic(int nrservs)
 	 */
 	ret = nfsd_racache_init(2*nrservs);
 	if (ret)
-		return ret;
+		goto dec_users;
+
 	ret = nfs4_state_start();
 	if (ret)
 		goto out_racache;
@@ -229,6 +230,8 @@ static int nfsd_startup_generic(int nrservs)
 
 out_racache:
 	nfsd_racache_shutdown();
+dec_users:
+	nfsd_users--;
 	return ret;
 }
 
@@ -405,6 +408,7 @@ int nfsd_create_serv(struct net *net)
 	if (nn->nfsd_serv == NULL)
 		return -ENOMEM;
 
+	nn->nfsd_serv->sv_maxconn = nn->max_connections;
 	error = svc_bind(nn->nfsd_serv, net);
 	if (error < 0) {
 		svc_destroy(nn->nfsd_serv);
@@ -469,8 +473,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
 	/* enforce a global maximum number of threads */
 	tot = 0;
 	for (i = 0; i < n; i++) {
-		if (nthreads[i] > NFSD_MAXSERVS)
-			nthreads[i] = NFSD_MAXSERVS;
+		nthreads[i] = min(nthreads[i], NFSD_MAXSERVS);
 		tot += nthreads[i];
 	}
 	if (tot > NFSD_MAXSERVS) {
@@ -519,11 +522,11 @@ nfsd_svc(int nrservs, struct net *net)
 
 	mutex_lock(&nfsd_mutex);
 	dprintk("nfsd: creating service\n");
-	if (nrservs <= 0)
-		nrservs = 0;
-	if (nrservs > NFSD_MAXSERVS)
-		nrservs = NFSD_MAXSERVS;
+
+	nrservs = max(nrservs, 0);
+	nrservs = min(nrservs, NFSD_MAXSERVS);
 	error = 0;
+
 	if (nrservs == 0 && nn->nfsd_serv == NULL)
 		goto out;
 
@@ -564,6 +567,7 @@ nfsd(void *vrqstp)
 	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
 	struct svc_xprt *perm_sock = list_entry(rqstp->rq_server->sv_permsocks.next, typeof(struct svc_xprt), xpt_list);
 	struct net *net = perm_sock->xpt_net;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	int err;
 
 	/* Lock module and set up kernel thread */
@@ -597,6 +601,9 @@ nfsd(void *vrqstp)
 	 * The main request loop
 	 */
 	for (;;) {
+		/* Update sv_maxconn if it has changed */
+		rqstp->rq_server->sv_maxconn = nn->max_connections;
+
 		/*
 		 * Find a socket with data available and call its
 		 * recvfrom routine.
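Several hunks here replace open-coded range checks with min()/max(), e.g. the nrservs clamp in nfsd_svc(). The same shape in portable C, with simple macros standing in for the kernel's type-checked versions:

#include <stdio.h>

/* Simplified stand-ins; the kernel's min()/max() also enforce type safety
 * and avoid double evaluation of their arguments. */
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

#define MAXSERVS 8192	/* illustrative bound, mirroring NFSD_MAXSERVS */

static int clamp_nrservs(int nrservs)
{
	/* Equivalent to: if (n <= 0) n = 0; if (n > MAX) n = MAX; */
	nrservs = MAX(nrservs, 0);
	return MIN(nrservs, MAXSERVS);
}

int main(void)
{
	printf("%d %d %d\n", clamp_nrservs(-3), clamp_nrservs(64),
	       clamp_nrservs(100000));	/* prints: 0 64 8192 */
	return 0;
}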
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 1ac306b769df..412d7061f9e5 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -257,8 +257,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 	len = args->count     = ntohl(*p++);
 	p++; /* totalcount - unused */
 
-	if (len > NFSSVC_MAXBLKSIZE_V2)
-		len = NFSSVC_MAXBLKSIZE_V2;
+	len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
 
 	/* set up somewhere to store response.
 	 * We take pages, put them on reslist and include in iovec
@@ -268,7 +267,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p,
 		struct page *p = *(rqstp->rq_next_page++);
 
 		rqstp->rq_vec[v].iov_base = page_address(p);
-		rqstp->rq_vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE;
+		rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
 		len -= rqstp->rq_vec[v].iov_len;
 		v++;
 	}
@@ -400,9 +399,7 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p,
 		return 0;
 	args->cookie = ntohl(*p++);
 	args->count  = ntohl(*p++);
-	if (args->count > PAGE_SIZE)
-		args->count = PAGE_SIZE;
-
+	args->count  = min_t(u32, args->count, PAGE_SIZE);
 	args->buffer = page_address(*(rqstp->rq_next_page++));
 
 	return xdr_argsize_check(rqstp, p);
@@ -516,10 +513,11 @@ nfssvc_encode_entry(void *ccdv, const char *name,
 	}
 	if (cd->offset)
 		*cd->offset = htonl(offset);
-	if (namlen > NFS2_MAXNAMLEN)
-		namlen = NFS2_MAXNAMLEN;/* truncate filename */
 
+	/* truncate filename */
+	namlen = min(namlen, NFS2_MAXNAMLEN);
 	slen = XDR_QUADLEN(namlen);
+
 	if ((buflen = cd->buflen - slen - 4) < 0) {
 		cd->common.err = nfserr_toosmall;
 		return -EINVAL;
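The readargs conversion keeps the existing loop that chops a byte count into page-sized iovec entries, just expressed with min_t(). A standalone sketch of that splitting loop (struct iovec_entry is a stand-in):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SZ 4096	/* stand-in for PAGE_SIZE */

struct iovec_entry {
	size_t iov_len;	/* iov_base omitted for brevity */
};

/* Split len bytes into page-sized iovec entries; returns entries used. */
static int fill_vec(struct iovec_entry *vec, int max, size_t len)
{
	int v = 0;

	while (len > 0 && v < max) {
		vec[v].iov_len = len < PAGE_SZ ? len : PAGE_SZ;
		len -= vec[v].iov_len;
		v++;
	}
	return v;
}

int main(void)
{
	struct iovec_entry vec[4];
	int n = fill_vec(vec, 4, 10000);

	printf("%d entries: %zu %zu %zu\n", n,
	       vec[0].iov_len, vec[1].iov_len, vec[2].iov_len);
	/* prints: 3 entries: 4096 4096 1808 */
	return 0;
}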
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 374c66283ac5..2712042a66b1 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -62,17 +62,28 @@ typedef struct {
 	(s)->si_generation
 
 struct nfsd4_callback {
-	void *cb_op;
 	struct nfs4_client *cb_clp;
 	struct list_head cb_per_client;
 	u32 cb_minorversion;
 	struct rpc_message cb_msg;
-	const struct rpc_call_ops *cb_ops;
+	struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
 	bool cb_done;
 };
 
+struct nfsd4_callback_ops {
+	void (*prepare)(struct nfsd4_callback *);
+	int (*done)(struct nfsd4_callback *, struct rpc_task *);
+	void (*release)(struct nfsd4_callback *);
+};
+
+/*
+ * A core object that represents a "common" stateid. These are generally
+ * embedded within the different (more specific) stateid objects and contain
+ * fields that are of general use to any stateid.
+ */
 struct nfs4_stid {
+	atomic_t sc_count;
 #define NFS4_OPEN_STID 1
 #define NFS4_LOCK_STID 2
 #define NFS4_DELEG_STID 4
@@ -80,26 +91,50 @@ struct nfs4_stid {
 #define NFS4_CLOSED_STID 8
 /* For a deleg stateid kept around only to process free_stateid's: */
 #define NFS4_REVOKED_DELEG_STID 16
+#define NFS4_CLOSED_DELEG_STID 32
 	unsigned char sc_type;
 	stateid_t sc_stateid;
 	struct nfs4_client *sc_client;
+	struct nfs4_file *sc_file;
+	void (*sc_free)(struct nfs4_stid *);
 };
 
+/*
+ * Represents a delegation stateid. The nfs4_client holds references to these
+ * and they are put when it is being destroyed or when the delegation is
+ * returned by the client:
+ *
+ * o 1 reference as long as a delegation is still in force (taken when it's
+ *   alloc'd, put when it's returned or revoked)
+ *
+ * o 1 reference as long as a recall rpc is in progress (taken when the lease
+ *   is broken, put when the rpc exits)
+ *
+ * o 1 more ephemeral reference for each nfsd thread currently doing something
+ *   with that delegation without holding the cl_lock
+ *
+ * If the server attempts to recall a delegation and the client doesn't do so
+ * before a timeout, the server may also revoke the delegation. In that case,
+ * the object will either be destroyed (v4.0) or moved to a per-client list of
+ * revoked delegations (v4.1+).
+ *
+ * This object is a superset of the nfs4_stid.
+ */
 struct nfs4_delegation {
 	struct nfs4_stid dl_stid; /* must be first field */
 	struct list_head dl_perfile;
 	struct list_head dl_perclnt;
 	struct list_head dl_recall_lru; /* delegation recalled */
-	atomic_t dl_count; /* ref count */
-	struct nfs4_file *dl_file;
 	u32 dl_type;
 	time_t dl_time;
 /* For recall: */
-	struct knfsd_fh dl_fh;
 	int dl_retries;
 	struct nfsd4_callback dl_recall;
 };
 
+#define cb_to_delegation(cb) \
+	container_of(cb, struct nfs4_delegation, dl_recall)
+
 /* client delegation callback info */
 struct nfs4_cb_conn {
 	/* SETCLIENTID info */
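The new comment above spells out exactly which references keep a delegation alive. A toy model of that counting scheme with C11 atomics (deleg_get/deleg_put are invented names; the real code pins nfs4_stid.sc_count):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy model of the refcount rules described in the comment above. */
struct deleg {
	atomic_int count;
};

static struct deleg *deleg_alloc(void)
{
	struct deleg *dp = malloc(sizeof(*dp));

	if (!dp)
		return NULL;
	atomic_init(&dp->count, 1);	/* ref held while in force */
	return dp;
}

static void deleg_get(struct deleg *dp)
{
	atomic_fetch_add(&dp->count, 1);
}

static void deleg_put(struct deleg *dp)
{
	/* fetch_sub returns the old value: 1 means we dropped the last ref */
	if (atomic_fetch_sub(&dp->count, 1) == 1) {
		printf("destroying delegation\n");
		free(dp);
	}
}

int main(void)
{
	struct deleg *dp = deleg_alloc();

	if (!dp)
		return 1;
	deleg_get(dp);	/* recall rpc in flight */
	deleg_put(dp);	/* rpc exits */
	deleg_put(dp);	/* delegation returned or revoked: destroyed */
	return 0;
}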
@@ -194,6 +229,11 @@ struct nfsd4_conn {
 	unsigned char cn_flags;
 };
 
+/*
+ * Representation of a v4.1+ session. These are refcounted in a similar fashion
+ * to the nfs4_client. References are only taken when the server is actively
+ * working on the object (primarily during the processing of compounds).
+ */
 struct nfsd4_session {
 	atomic_t se_ref;
 	struct list_head se_hash;	/* hash by sessionid */
@@ -212,8 +252,6 @@ struct nfsd4_session {
 	struct nfsd4_slot *se_slots[];	/* forward channel slots */
 };
 
-extern void nfsd4_put_session(struct nfsd4_session *ses);
-
 /* formatted contents of nfs4_sessionid */
 struct nfsd4_sessionid {
 	clientid_t	clientid;
@@ -225,17 +263,35 @@ struct nfsd4_sessionid {
 
 /*
  * struct nfs4_client - one per client. Clientids live here.
- * o Each nfs4_client is hashed by clientid.
  *
- * o Each nfs4_clients is also hashed by name
- *   (the opaque quantity initially sent by the client to identify itself).
+ * The initial object created by an NFS client using SETCLIENTID (for NFSv4.0)
+ * or EXCHANGE_ID (for NFSv4.1+). These objects are refcounted and timestamped.
+ * Each nfsd_net_ns object contains a set of these and they are tracked via
+ * short and long form clientid. They are hashed and searched for under the
+ * per-nfsd_net client_lock spinlock.
+ *
+ * References to it are only held during the processing of compounds, and in
+ * certain other operations. In their "resting state" they have a refcount of
+ * 0. If they are not renewed within a lease period, they become eligible for
+ * destruction by the laundromat.
+ *
+ * These objects can also be destroyed prematurely by the fault injection code,
+ * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
+ * Care is taken *not* to do this however when the objects have an elevated
+ * refcount.
+ *
+ * o Each nfs4_client is hashed by clientid
+ *
+ * o Each nfs4_clients is also hashed by name (the opaque quantity initially
+ *   sent by the client to identify itself).
  *
  * o cl_perclient list is used to ensure no dangling stateowner references
  *   when we expire the nfs4_client
  */
 struct nfs4_client {
 	struct list_head	cl_idhash;	/* hash by cl_clientid.id */
 	struct rb_node		cl_namenode;	/* link into by-name trees */
+	struct list_head	*cl_ownerstr_hashtbl;
 	struct list_head	cl_openowners;
 	struct idr		cl_stateids;	/* stateid lookup */
 	struct list_head	cl_delegations;
@@ -258,6 +314,7 @@ struct nfs4_client {
 #define NFSD4_CLIENT_STABLE		(2)	/* client on stable storage */
 #define NFSD4_CLIENT_RECLAIM_COMPLETE	(3)	/* reclaim_complete done */
 #define NFSD4_CLIENT_CONFIRMED		(4)	/* client is confirmed */
+#define NFSD4_CLIENT_UPCALL_LOCK	(5)	/* upcall serialization */
 #define NFSD4_CLIENT_CB_FLAG_MASK	(1 << NFSD4_CLIENT_CB_UPDATE | \
 					 1 << NFSD4_CLIENT_CB_KILL)
 	unsigned long	cl_flags;
@@ -329,21 +386,43 @@ struct nfs4_replay {
 	unsigned int		rp_buflen;
 	char			*rp_buf;
 	struct knfsd_fh		rp_openfh;
+	struct mutex		rp_mutex;
 	char			rp_ibuf[NFSD4_REPLAY_ISIZE];
 };
 
+struct nfs4_stateowner;
+
+struct nfs4_stateowner_operations {
+	void (*so_unhash)(struct nfs4_stateowner *);
+	void (*so_free)(struct nfs4_stateowner *);
+};
+
+/*
+ * A core object that represents either an open or lock owner. The open and
+ * lock owner objects have one of these embedded within them. Refcounts and
+ * other fields common to both owner types are contained within these
+ * structures.
+ */
 struct nfs4_stateowner {
-	struct list_head	so_strhash;	/* hash by op_name */
-	struct list_head	so_stateids;
-	struct nfs4_client	*so_client;
-	/* after increment in ENCODE_SEQID_OP_TAIL, represents the next
+	struct list_head			so_strhash;
+	struct list_head			so_stateids;
+	struct nfs4_client			*so_client;
+	const struct nfs4_stateowner_operations	*so_ops;
+	/* after increment in nfsd4_bump_seqid, represents the next
 	 * sequence id expected from the client: */
-	u32			so_seqid;
-	struct xdr_netobj	so_owner;	/* open owner name */
-	struct nfs4_replay	so_replay;
-	bool			so_is_open_owner;
+	atomic_t				so_count;
+	u32					so_seqid;
+	struct xdr_netobj			so_owner; /* open owner name */
+	struct nfs4_replay			so_replay;
+	bool					so_is_open_owner;
 };
 
+/*
+ * When a file is opened, the client provides an open state owner opaque string
+ * that indicates the "owner" of that open. These objects are refcounted.
+ * References to it are held by each open state associated with it. This object
+ * is a superset of the nfs4_stateowner struct.
+ */
 struct nfs4_openowner {
 	struct nfs4_stateowner	oo_owner; /* must be first field */
 	struct list_head	oo_perclient;
@@ -358,15 +437,17 @@ struct nfs4_openowner {
 	struct nfs4_ol_stateid *oo_last_closed_stid;
 	time_t			oo_time; /* time of placement on so_close_lru */
 #define NFS4_OO_CONFIRMED   1
-#define NFS4_OO_NEW          4
 	unsigned char		oo_flags;
 };
 
+/*
+ * Represents a generic "lockowner". Similar to an openowner. References to it
+ * are held by the lock stateids that are created on its behalf. This object is
+ * a superset of the nfs4_stateowner struct (or would be if it needed any extra
+ * fields).
+ */
 struct nfs4_lockowner {
 	struct nfs4_stateowner	lo_owner; /* must be first element */
-	struct list_head	lo_owner_ino_hash; /* hash by owner,file */
-	struct list_head	lo_perstateid;
-	struct list_head	lo_list; /* for temporary uses */
 };
 
 static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
@@ -379,9 +460,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so)
 	return container_of(so, struct nfs4_lockowner, lo_owner);
 }
 
-/* nfs4_file: a file opened by some number of (open) nfs4_stateowners. */
+/*
+ * nfs4_file: a file opened by some number of (open) nfs4_stateowners.
+ *
+ * These objects are global. nfsd only keeps one instance of a nfs4_file per
+ * inode (though it may keep multiple file descriptors open per inode). These
+ * are tracked in the file_hashtbl which is protected by the state_lock
+ * spinlock.
+ */
 struct nfs4_file {
 	atomic_t		fi_ref;
+	spinlock_t		fi_lock;
 	struct hlist_node	fi_hash; /* hash by "struct inode *" */
 	struct list_head	fi_stateids;
 	struct list_head	fi_delegations;
@@ -395,49 +484,35 @@ struct nfs4_file {
 	 * + 1 to both of the above if NFS4_SHARE_ACCESS_BOTH is set.
 	 */
 	atomic_t		fi_access[2];
+	u32			fi_share_deny;
 	struct file		*fi_deleg_file;
-	struct file_lock	*fi_lease;
 	atomic_t		fi_delegees;
-	struct inode		*fi_inode;
+	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
 };
 
-/* XXX: for first cut may fall back on returning file that doesn't work
- * at all? */
-static inline struct file *find_writeable_file(struct nfs4_file *f)
-{
-	if (f->fi_fds[O_WRONLY])
-		return f->fi_fds[O_WRONLY];
-	return f->fi_fds[O_RDWR];
-}
-
-static inline struct file *find_readable_file(struct nfs4_file *f)
-{
-	if (f->fi_fds[O_RDONLY])
-		return f->fi_fds[O_RDONLY];
-	return f->fi_fds[O_RDWR];
-}
-
-static inline struct file *find_any_file(struct nfs4_file *f)
-{
-	if (f->fi_fds[O_RDWR])
-		return f->fi_fds[O_RDWR];
-	else if (f->fi_fds[O_WRONLY])
-		return f->fi_fds[O_WRONLY];
-	else
-		return f->fi_fds[O_RDONLY];
-}
-
-/* "ol" stands for "Open or Lock". Better suggestions welcome. */
+/*
+ * A generic struct representing either an open or lock stateid. The
+ * nfs4_client holds a reference to each of these objects, and they in turn
+ * hold a reference to their respective stateowners. The client's reference is
+ * released in response to a close or unlock (depending on whether it's an open
+ * or lock stateid) or when the client is being destroyed.
+ *
+ * In the case of v4.0 open stateids, these objects are preserved for a little
+ * while after close in order to handle CLOSE replays. Those are eventually
+ * reclaimed via an LRU scheme by the laundromat.
+ *
+ * This object is a superset of the nfs4_stid. "ol" stands for "Open or Lock".
+ * Better suggestions welcome.
+ */
 struct nfs4_ol_stateid {
 	struct nfs4_stid		st_stid; /* must be first field */
 	struct list_head		st_perfile;
 	struct list_head		st_perstateowner;
-	struct list_head		st_lockowners;
+	struct list_head		st_locks;
 	struct nfs4_stateowner		*st_stateowner;
-	struct nfs4_file		*st_file;
-	unsigned long			st_access_bmap;
-	unsigned long			st_deny_bmap;
+	unsigned char			st_access_bmap;
+	unsigned char			st_deny_bmap;
 	struct nfs4_ol_stateid		*st_openstp;
 };
 
@@ -450,33 +525,43 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
 #define RD_STATE	        0x00000010
 #define WR_STATE	        0x00000020
 
+enum nfsd4_cb_op {
+	NFSPROC4_CLNT_CB_NULL = 0,
+	NFSPROC4_CLNT_CB_RECALL,
+	NFSPROC4_CLNT_CB_SEQUENCE,
+};
+
+
 struct nfsd4_compound_state;
 struct nfsd_net;
 
 extern __be32 nfs4_preprocess_stateid_op(struct net *net,
 		struct nfsd4_compound_state *cstate,
 		stateid_t *stateid, int flags, struct file **filp);
-extern void nfs4_lock_state(void);
-extern void nfs4_unlock_state(void);
+void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
 extern void nfs4_release_reclaim(struct nfsd_net *);
 extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
 						struct nfsd_net *nn);
-extern __be32 nfs4_check_open_reclaim(clientid_t *clid, bool sessions, struct nfsd_net *nn);
+extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
+		struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
 extern int set_callback_cred(void);
-extern void nfsd4_init_callback(struct nfsd4_callback *);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
-extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
+extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+extern void nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
 extern void nfsd4_shutdown_callback(struct nfs4_client *);
-extern void nfs4_put_delegation(struct nfs4_delegation *dp);
+extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
 						struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
-extern void put_client_renew(struct nfs4_client *clp);
+
+/* grace period management */
+void nfsd4_end_grace(struct nfsd_net *nn);
 
 /* nfs4recover operations */
 extern int nfsd4_client_tracking_init(struct net *net);
@@ -484,25 +569,30 @@ extern void nfsd4_client_tracking_exit(struct net *net);
 extern void nfsd4_client_record_create(struct nfs4_client *clp);
 extern void nfsd4_client_record_remove(struct nfs4_client *clp);
 extern int nfsd4_client_record_check(struct nfs4_client *clp);
-extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time);
+extern void nfsd4_record_grace_done(struct nfsd_net *nn);
 
 /* nfs fault injection functions */
 #ifdef CONFIG_NFSD_FAULT_INJECTION
 int nfsd_fault_inject_init(void);
 void nfsd_fault_inject_cleanup(void);
-u64 nfsd_for_n_state(u64, u64 (*)(struct nfs4_client *, u64));
-struct nfs4_client *nfsd_find_client(struct sockaddr_storage *, size_t);
-
-u64 nfsd_forget_client(struct nfs4_client *, u64);
-u64 nfsd_forget_client_locks(struct nfs4_client*, u64);
-u64 nfsd_forget_client_openowners(struct nfs4_client *, u64);
-u64 nfsd_forget_client_delegations(struct nfs4_client *, u64);
-u64 nfsd_recall_client_delegations(struct nfs4_client *, u64);
-
-u64 nfsd_print_client(struct nfs4_client *, u64);
-u64 nfsd_print_client_locks(struct nfs4_client *, u64);
-u64 nfsd_print_client_openowners(struct nfs4_client *, u64);
-u64 nfsd_print_client_delegations(struct nfs4_client *, u64);
+
+u64 nfsd_inject_print_clients(void);
+u64 nfsd_inject_forget_client(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_clients(u64);
+
+u64 nfsd_inject_print_locks(void);
+u64 nfsd_inject_forget_client_locks(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_locks(u64);
+
+u64 nfsd_inject_print_openowners(void);
+u64 nfsd_inject_forget_client_openowners(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_openowners(u64);
+
+u64 nfsd_inject_print_delegations(void);
+u64 nfsd_inject_forget_client_delegations(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_forget_delegations(u64);
+u64 nfsd_inject_recall_client_delegations(struct sockaddr_storage *, size_t);
+u64 nfsd_inject_recall_delegations(u64);
 #else /* CONFIG_NFSD_FAULT_INJECTION */
 static inline int nfsd_fault_inject_init(void) { return 0; }
 static inline void nfsd_fault_inject_cleanup(void) {}
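A pattern running through this header is the per-type operations table: nfsd4_callback_ops and nfs4_stateowner_operations both let generic code drive type-specific hooks from an embedded base object. A minimal sketch of that vtable idiom (all names here are hypothetical):

#include <stdio.h>

struct callback;

/* Per-type hooks, analogous to nfsd4_callback_ops. */
struct callback_ops {
	void (*prepare)(struct callback *);
	int (*done)(struct callback *);
	void (*release)(struct callback *);
};

struct callback {
	const struct callback_ops *ops;
};

/* Generic driver: runs any callback through its own hook table. */
static void run_callback(struct callback *cb)
{
	cb->ops->prepare(cb);
	if (cb->ops->done(cb) == 0)
		cb->ops->release(cb);
}

static void recall_prepare(struct callback *cb)
{
	(void)cb;
	printf("prepare recall\n");
}

static int recall_done(struct callback *cb)
{
	(void)cb;
	printf("recall done\n");
	return 0;
}

static void recall_release(struct callback *cb)
{
	(void)cb;
	printf("release\n");
}

static const struct callback_ops recall_ops = {
	.prepare = recall_prepare,
	.done = recall_done,
	.release = recall_release,
};

int main(void)
{
	struct callback recall = { .ops = &recall_ops };

	run_callback(&recall);
	return 0;
}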
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 140c496f612c..989129e2d6ea 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -189,8 +189,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
 
 	dparent = fhp->fh_dentry;
-	exp = fhp->fh_export;
-	exp_get(exp);
+	exp = exp_get(fhp->fh_export);
 
 	/* Lookup the name, but don't follow links */
 	if (isdotent(name, len)) {
@@ -446,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
 		if (err)
 			goto out;
 		size_change = 1;
+
+		/*
+		 * RFC5661, Section 18.30.4:
+		 * Changing the size of a file with SETATTR indirectly
+		 * changes the time_modify and change attributes.
+		 *
+		 * (and similar for the older RFCs)
+		 */
+		if (iap->ia_size != i_size_read(inode))
+			iap->ia_valid |= ATTR_MTIME;
 	}
 
 	iap->ia_valid |= ATTR_CTIME;
@@ -464,7 +473,7 @@ out_put_write_access:
 	if (size_change)
 		put_write_access(inode);
 	if (!err)
-		commit_metadata(fhp);
+		err = nfserrno(commit_metadata(fhp));
 out:
 	return err;
 }
@@ -650,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 {
 	struct path	path;
 	struct inode	*inode;
+	struct file	*file;
 	int		flags = O_RDONLY|O_LARGEFILE;
 	__be32		err;
 	int		host_err = 0;
@@ -704,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		else
 			flags = O_WRONLY|O_LARGEFILE;
 	}
-	*filp = dentry_open(&path, flags, current_cred());
-	if (IS_ERR(*filp)) {
-		host_err = PTR_ERR(*filp);
-		*filp = NULL;
-	} else {
-		host_err = ima_file_check(*filp, may_flags);
 
-		if (may_flags & NFSD_MAY_64BIT_COOKIE)
-			(*filp)->f_mode |= FMODE_64BITHASH;
-		else
-			(*filp)->f_mode |= FMODE_32BITHASH;
+	file = dentry_open(&path, flags, current_cred());
+	if (IS_ERR(file)) {
+		host_err = PTR_ERR(file);
+		goto out_nfserr;
 	}
 
+	host_err = ima_file_check(file, may_flags, 0);
+	if (host_err) {
+		nfsd_close(file);
+		goto out_nfserr;
+	}
+
+	if (may_flags & NFSD_MAY_64BIT_COOKIE)
+		file->f_mode |= FMODE_64BITHASH;
+	else
+		file->f_mode |= FMODE_32BITHASH;
+
+	*filp = file;
 out_nfserr:
 	err = nfserrno(host_err);
 out:
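The nfsd_open() rework above replaces a nested if/else with straight-line checks that jump to a single error label, publishing the file pointer only after every check passes. The same control-flow shape in plain C, with open/lseek/close standing in for dentry_open, ima_file_check and nfsd_close:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Sequential checks, one exit path: the shape nfsd_open now follows. */
static int open_checked(const char *path, int *fd_out)
{
	int err = 0;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		err = -1;
		goto out;
	}
	if (lseek(fd, 0, SEEK_END) < 0) {	/* stand-in for a post-open check */
		close(fd);
		err = -1;
		goto out;
	}
	*fd_out = fd;	/* publish the handle only once fully validated */
out:
	return err;
}

int main(void)
{
	int fd;

	if (open_checked("/etc/hostname", &fd) == 0) {
		printf("opened fd %d\n", fd);
		close(fd);
	}
	return 0;
}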
@@ -820,7 +836,8 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
 		return __splice_from_pipe(pipe, sd, nfsd_splice_actor);
 }
 
-__be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
+static __be32
+nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
 {
 	if (host_err >= 0) {
 		nfsdstats.io_read += host_err;
@@ -831,7 +848,7 @@ __be32 nfsd_finish_read(struct file *file, unsigned long *count, int host_err)
 		return nfserrno(host_err);
 }
 
-int nfsd_splice_read(struct svc_rqst *rqstp,
+__be32 nfsd_splice_read(struct svc_rqst *rqstp,
 		     struct file *file, loff_t offset, unsigned long *count)
 {
 	struct splice_desc sd = {
@@ -847,7 +864,7 @@ int nfsd_splice_read(struct svc_rqst *rqstp,
 	return nfsd_finish_read(file, count, host_err);
 }
 
-int nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
+__be32 nfsd_readv(struct file *file, loff_t offset, struct kvec *vec, int vlen,
 		unsigned long *count)
 {
 	mm_segment_t oldfs;
@@ -1121,7 +1138,8 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
 	iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
 	if (iap->ia_valid)
 		return nfsd_setattr(rqstp, resfhp, iap, 0, (time_t)0);
-	return 0;
+	/* Callers expect file metadata to be committed here */
+	return nfserrno(commit_metadata(resfhp));
 }
 
 /* HPUX client sometimes creates a file in mode 000, and sets size to 0.
@@ -1253,9 +1271,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = nfsd_create_setattr(rqstp, resfhp, iap);
 
 	/*
-	 * nfsd_setattr already committed the child. Transactional filesystems
-	 * had a chance to commit changes for both parent and child
-	 * simultaneously making the following commit_metadata a noop.
+	 * nfsd_create_setattr already committed the child. Transactional
+	 * filesystems had a chance to commit changes for both parent and
+	 * child simultaneously making the following commit_metadata a
+	 * noop.
 	 */
 	err2 = nfserrno(commit_metadata(fhp));
 	if (err2)
@@ -1426,7 +1445,8 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	err = nfsd_create_setattr(rqstp, resfhp, iap);
 
 	/*
-	 * nfsd_setattr already committed the child (and possibly also the parent).
+	 * nfsd_create_setattr already committed the child
+	 * (and possibly also the parent).
 	 */
 	if (!err)
 		err = nfserrno(commit_metadata(fhp));
@@ -1504,16 +1524,15 @@ out_nfserr:
 __be32
 nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 				char *fname, int flen,
-				char *path, int plen,
-				struct svc_fh *resfhp,
-				struct iattr *iap)
+				char *path,
+				struct svc_fh *resfhp)
 {
 	struct dentry	*dentry, *dnew;
 	__be32		err, cerr;
 	int		host_err;
 
 	err = nfserr_noent;
-	if (!flen || !plen)
+	if (!flen || path[0] == '\0')
 		goto out;
 	err = nfserr_exist;
 	if (isdotent(fname, flen))
@@ -1534,18 +1553,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	if (IS_ERR(dnew))
 		goto out_nfserr;
 
-	if (unlikely(path[plen] != 0)) {
-		char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
-		if (path_alloced == NULL)
-			host_err = -ENOMEM;
-		else {
-			strncpy(path_alloced, path, plen);
-			path_alloced[plen] = 0;
-			host_err = vfs_symlink(dentry->d_inode, dnew, path_alloced);
-			kfree(path_alloced);
-		}
-	} else
-		host_err = vfs_symlink(dentry->d_inode, dnew, path);
+	host_err = vfs_symlink(dentry->d_inode, dnew, path);
 	err = nfserrno(host_err);
 	if (!err)
 		err = nfserrno(commit_metadata(fhp));
@@ -2093,8 +2101,7 @@ nfsd_racache_init(int cache_size)
 	if (raparm_hash[0].pb_head)
 		return 0;
 	nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE);
-	if (nperbucket < 2)
-		nperbucket = 2;
+	nperbucket = max(2, nperbucket);
 	cache_size = nperbucket * RAPARM_HASH_SIZE;
 
 	dprintk("nfsd: allocating %d readahead buffers.\n", cache_size);
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 91b6ae3f658b..c2ff3f14e5f6 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -74,9 +74,9 @@ struct raparms;
 __be32		nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *,
 				struct file **, struct raparms **);
 void		nfsd_put_tmp_read_open(struct file *, struct raparms *);
-int		nfsd_splice_read(struct svc_rqst *,
+__be32		nfsd_splice_read(struct svc_rqst *,
 				struct file *, loff_t, unsigned long *);
-int		nfsd_readv(struct file *, loff_t, struct kvec *, int,
+__be32		nfsd_readv(struct file *, loff_t, struct kvec *, int,
 				unsigned long *);
 __be32		nfsd_read(struct svc_rqst *, struct svc_fh *,
 				loff_t, struct kvec *, int, unsigned long *);
@@ -85,8 +85,8 @@ __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *,
 __be32		nfsd_readlink(struct svc_rqst *, struct svc_fh *,
 				char *, int *);
 __be32		nfsd_symlink(struct svc_rqst *, struct svc_fh *,
-				char *name, int len, char *path, int plen,
-				struct svc_fh *res, struct iattr *);
+				char *name, int len, char *path,
+				struct svc_fh *res);
 __be32		nfsd_link(struct svc_rqst *, struct svc_fh *,
 				char *, int, struct svc_fh *);
 __be32		nfsd_rename(struct svc_rqst *,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 18cbb6d9c8a9..5720e9457f33 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -55,6 +55,7 @@ struct nfsd4_compound_state {
 	struct svc_fh		current_fh;
 	struct svc_fh		save_fh;
 	struct nfs4_stateowner	*replay_owner;
+	struct nfs4_client	*clp;
 	/* For sessions DRC */
 	struct nfsd4_session	*session;
 	struct nfsd4_slot	*slot;
@@ -107,8 +108,8 @@ struct nfsd4_create {
 	u32		cr_type;            /* request */
 	union {                             /* request */
 		struct {
-			u32 namelen;
-			char *name;
+			u32 datalen;
+			char *data;
 		} link;   /* NF4LNK */
 		struct {
 			u32 specdata1;
@@ -121,8 +122,8 @@ struct nfsd4_create {
 	struct nfs4_acl *cr_acl;
 	struct xdr_netobj cr_label;
 };
-#define cr_linklen	u.link.namelen
-#define cr_linkname	u.link.name
+#define cr_datalen	u.link.datalen
+#define cr_data		u.link.data
 #define cr_specdata1	u.dev.specdata1
 #define cr_specdata2	u.dev.specdata2
 
@@ -427,6 +428,17 @@ struct nfsd4_reclaim_complete {
 	u32	rca_one_fs;
 };
 
+struct nfsd4_seek {
+	/* request */
+	stateid_t	seek_stateid;
+	loff_t		seek_offset;
+	u32		seek_whence;
+
+	/* response */
+	u32		seek_eof;
+	loff_t		seek_pos;
+};
+
 struct nfsd4_op {
 	int					opnum;
 	__be32					status;
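nfsd4_seek carries the NFSv4.2 SEEK operation: the request names an offset and a whence of hole or data, and the reply returns the found position plus an EOF flag. A userspace sketch of the equivalent semantics via lseek's SEEK_HOLE/SEEK_DATA (struct seek_res mirrors the response half of the struct above; do_seek is an invented helper):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

struct seek_res {
	off_t pos;
	int eof;
};

/* whence is SEEK_HOLE or SEEK_DATA, as in the NFSv4.2 SEEK operation. */
static int do_seek(int fd, off_t offset, int whence, struct seek_res *res)
{
	off_t end = lseek(fd, 0, SEEK_END);
	off_t pos = lseek(fd, offset, whence);

	if (end < 0 || pos < 0)
		return -1;
	res->pos = pos;
	res->eof = (pos >= end);
	return 0;
}

int main(void)
{
	struct seek_res res;
	int fd = open("/etc/hostname", O_RDONLY);

	if (fd >= 0 && do_seek(fd, 0, SEEK_DATA, &res) == 0)
		printf("data at %lld, eof=%d\n", (long long)res.pos, res.eof);
	if (fd >= 0)
		close(fd);
	return 0;
}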
@@ -472,12 +484,23 @@ struct nfsd4_op {
 		struct nfsd4_reclaim_complete	reclaim_complete;
 		struct nfsd4_test_stateid	test_stateid;
 		struct nfsd4_free_stateid	free_stateid;
+
+		/* NFSv4.2 */
+		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
 };
 
 bool nfsd4_cache_this_op(struct nfsd4_op *);
 
+/*
+ * Memory needed just for the duration of processing one compound:
+ */
+struct svcxdr_tmpbuf {
+	struct svcxdr_tmpbuf *next;
+	char buf[];
+};
+
 struct nfsd4_compoundargs {
 	/* scratch variables for XDR decode */
 	__be32 *			p;
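svcxdr_tmpbuf replaces the old tmpbuf with a simpler scheme: each allocation embeds its payload via a flexible array member and is chained onto to_free, so everything decoded for one compound is released in a single pass. A self-contained sketch (tmp_alloc/tmp_free_all are invented helper names):

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct tmpbuf {
	struct tmpbuf *next;
	char buf[];		/* payload allocated in the same chunk */
};

/* Allocate len bytes that live until the whole list is released. */
static void *tmp_alloc(struct tmpbuf **head, size_t len)
{
	struct tmpbuf *tb = malloc(sizeof(*tb) + len);

	if (!tb)
		return NULL;
	tb->next = *head;
	*head = tb;
	return tb->buf;
}

static void tmp_free_all(struct tmpbuf **head)
{
	while (*head) {
		struct tmpbuf *tb = *head;

		*head = tb->next;
		free(tb);
	}
}

int main(void)
{
	struct tmpbuf *to_free = NULL;
	char *s = tmp_alloc(&to_free, 6);

	if (s) {
		memcpy(s, "hello", 6);
		printf("%s\n", s);
	}
	tmp_free_all(&to_free);	/* one pass at end-of-compound */
	return 0;
}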
@@ -486,11 +509,7 @@ struct nfsd4_compoundargs {
 	int				pagelen;
 	__be32				tmp[8];
 	__be32 *			tmpp;
-	struct tmpbuf {
-		struct tmpbuf *next;
-		void (*release)(const void *);
-		void *buf;
-	}				*to_free;
+	struct svcxdr_tmpbuf		*to_free;
 
 	struct svc_rqst			*rqstp;
 
@@ -574,7 +593,6 @@ extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
 extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *,
 		struct nfsd4_setclientid_confirm *setclientid_confirm);
-extern void nfsd4_store_cache_entry(struct nfsd4_compoundres *resp);
 extern __be32 nfsd4_exchange_id(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_exchange_id *);
 extern __be32 nfsd4_backchannel_ctl(struct svc_rqst *, struct nfsd4_compound_state *, struct nfsd4_backchannel_ctl *);
@@ -585,6 +603,7 @@ extern __be32 nfsd4_create_session(struct svc_rqst *,
 extern __be32 nfsd4_sequence(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_sequence *);
+extern void nfsd4_sequence_done(struct nfsd4_compoundres *resp);
 extern __be32 nfsd4_destroy_session(struct svc_rqst *,
 		struct nfsd4_compound_state *,
 		struct nfsd4_destroy_session *);
@@ -594,7 +613,9 @@ extern __be32 nfsd4_process_open1(struct nfsd4_compound_state *,
 		struct nfsd4_open *open, struct nfsd_net *nn);
 extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp,
 		struct svc_fh *current_fh, struct nfsd4_open *open);
-extern void nfsd4_cleanup_open_state(struct nfsd4_open *open, __be32 status);
+extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate);
+extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate,
+		struct nfsd4_open *open, __be32 status);
 extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc);
 extern __be32 nfsd4_close(struct svc_rqst *rqstp,
@@ -625,6 +646,7 @@ extern __be32 nfsd4_test_stateid(struct svc_rqst *rqstp,
 extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, struct nfsd4_free_stateid *free_stateid);
 extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr);
+
 #endif
 
 /*
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
index 85c98737a146..fc603e0431bb 100644
--- a/fs/nilfs2/Makefile
+++ b/fs/nilfs2/Makefile
@@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o
 nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
 	btnode.o bmap.o btree.o direct.o dat.o recovery.o \
 	the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
-	ifile.o alloc.o gcinode.o ioctl.o
+	ifile.o alloc.o gcinode.o ioctl.o sysfs.o
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 24978153c0c4..e9e3325f29f3 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -56,11 +56,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	mutex_unlock(&inode->i_mutex);
 
 	nilfs = inode->i_sb->s_fs_info;
-	if (!err && nilfs_test_opt(nilfs, BARRIER)) {
-		err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-		if (err != -EIO)
-			err = 0;
-	}
+	if (!err)
+		err = nilfs_flush_device(nilfs);
+
 	return err;
 }
 
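Both this hunk and the matching one in ioctl.c fold the open-coded barrier check into a nilfs_flush_device() helper; judging from the ns_flushed_device reset added in segment.c below, the helper presumably skips the flush when barriers are off or the device has already been flushed since the last log write. A hedged toy model of that consolidation (all names and fields here are illustrative, not the real nilfs2 API):

#include <stdio.h>

/* Toy model; the real helper would check nilfs_test_opt(nilfs, BARRIER)
 * and an ns_flushed_device-style flag before blkdev_issue_flush(). */
struct toy_nilfs {
	int barrier_enabled;
	int flushed_device;	/* cleared whenever new logs hit disk */
};

static int issue_flush(void)
{
	printf("FLUSH issued\n");
	return 0;
}

static int flush_device(struct toy_nilfs *nilfs)
{
	int err;

	if (!nilfs->barrier_enabled || nilfs->flushed_device)
		return 0;	/* nothing to do */
	err = issue_flush();
	if (!err)
		nilfs->flushed_device = 1;
	return err;
}

int main(void)
{
	struct toy_nilfs nilfs = { .barrier_enabled = 1, .flushed_device = 0 };

	flush_device(&nilfs);	/* issues the flush */
	flush_device(&nilfs);	/* no-op: already flushed */
	return 0;
}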
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b173a465..e1fa69b341b9 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -24,6 +24,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/mpage.h>
+#include <linux/pagemap.h>
 #include <linux/writeback.h>
 #include <linux/aio.h>
 #include "nilfs.h"
@@ -125,7 +126,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 		nilfs_transaction_abort(inode->i_sb);
 		goto out;
 	}
-	nilfs_mark_inode_dirty(inode);
+	nilfs_mark_inode_dirty_sync(inode);
 	nilfs_transaction_commit(inode->i_sb); /* never fails */
 	/* Error handling should be detailed */
 	set_buffer_new(bh_result);
@@ -219,10 +220,10 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
 
 static int nilfs_set_page_dirty(struct page *page)
 {
+	struct inode *inode = page->mapping->host;
 	int ret = __set_page_dirty_nobuffers(page);
 
 	if (page_has_buffers(page)) {
-		struct inode *inode = page->mapping->host;
 		unsigned nr_dirty = 0;
 		struct buffer_head *bh, *head;
 
@@ -245,6 +246,10 @@ static int nilfs_set_page_dirty(struct page *page)
 
 		if (nr_dirty)
 			nilfs_set_file_dirty(inode, nr_dirty);
+	} else if (ret) {
+		unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+		nilfs_set_file_dirty(inode, nr_dirty);
 	}
 	return ret;
 }
@@ -667,7 +672,7 @@ void nilfs_write_inode_common(struct inode *inode,
 	   for substitutions of appended fields */
 }
 
-void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
+void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
 {
 	ino_t ino = inode->i_ino;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -678,7 +683,8 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
 
 	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
 		memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
-	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+	if (flags & I_DIRTY_DATASYNC)
+		set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 
 	nilfs_write_inode_common(inode, raw_inode, 0);
 		/* XXX: call with has_bmap = 0 is a workaround to avoid
@@ -934,7 +940,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
 	return 0;
 }
 
-int nilfs_mark_inode_dirty(struct inode *inode)
+int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct buffer_head *ibh;
 	int err;
@@ -945,7 +951,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 			  "failed to reget inode block.\n");
 		return err;
 	}
-	nilfs_update_inode(inode, ibh);
+	nilfs_update_inode(inode, ibh, flags);
 	mark_buffer_dirty(ibh);
 	nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
 	brelse(ibh);
@@ -978,7 +984,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
 		return;
 	}
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
-	nilfs_mark_inode_dirty(inode);
+	__nilfs_mark_inode_dirty(inode, flags);
 	nilfs_transaction_commit(inode->i_sb); /* never fails */
 }
 
984 990
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 422fb54b7377..9a20e513d7eb 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1022,11 +1022,9 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
 		return ret;
 
 	nilfs = inode->i_sb->s_fs_info;
-	if (nilfs_test_opt(nilfs, BARRIER)) {
-		ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-		if (ret == -EIO)
-			return ret;
-	}
+	ret = nilfs_flush_device(nilfs);
+	if (ret < 0)
+		return ret;
 
 	if (argp != NULL) {
 		down_read(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 9bc72dec3fa6..91093cd74f0d 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -104,7 +104,7 @@ enum {
 			   constructor */
 	NILFS_I_COLLECTED,	/* All dirty blocks are collected */
 	NILFS_I_UPDATED,	/* The file has been written back */
-	NILFS_I_INODE_DIRTY,	/* write_inode is requested */
+	NILFS_I_INODE_SYNC,	/* dsync is not allowed for inode */
 	NILFS_I_BMAP,		/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,	/* inode for GC, on memory only */
 };
@@ -273,7 +273,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
 			 unsigned long ino);
 extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
 				       unsigned long ino, __u64 cno);
-extern void nilfs_update_inode(struct inode *, struct buffer_head *);
+extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
@@ -282,10 +282,18 @@ int nilfs_permission(struct inode *inode, int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
-extern int nilfs_mark_inode_dirty(struct inode *);
+extern int __nilfs_mark_inode_dirty(struct inode *, int);
 extern void nilfs_dirty_inode(struct inode *, int flags);
 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 __u64 start, __u64 len);
+static inline int nilfs_mark_inode_dirty(struct inode *inode)
+{
+	return __nilfs_mark_inode_dirty(inode, I_DIRTY);
+}
+static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
+{
+	return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC);
+}
 
 /* super.c */
 extern struct inode *nilfs_alloc_inode(struct super_block *);
@@ -320,6 +328,14 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
 int nilfs_init_gcinode(struct inode *inode);
 void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs);
 
+/* sysfs.c */
+int __init nilfs_sysfs_init(void);
+void nilfs_sysfs_exit(void);
+int nilfs_sysfs_create_device_group(struct super_block *);
+void nilfs_sysfs_delete_device_group(struct the_nilfs *);
+int nilfs_sysfs_create_snapshot_group(struct nilfs_root *);
+void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *);
+
 /*
  * Inodes and files operations
 */
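
[Note on the hunk above: the old single entry point is split into a flag-taking core, __nilfs_mark_inode_dirty(), plus two inline wrappers, so callers can say whether a change matters to fdatasync(). Below is a minimal user-space sketch of that dispatch. It assumes, as the renamed NILFS_I_INODE_SYNC bit's comment suggests, that only changes beyond a timestamp-level I_DIRTY_SYNC forbid the lightweight dsync path; the core's actual body is not part of this diff, and the helper names here are illustrative, not kernel API.]

#include <stdio.h>

/* VFS-style dirty flags; values mirror <linux/fs.h> of this era */
#define I_DIRTY_SYNC     (1 << 0)   /* timestamps etc. */
#define I_DIRTY_DATASYNC (1 << 1)   /* metadata that fdatasync() needs */
#define I_DIRTY_PAGES    (1 << 2)   /* dirty data pages */
#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)

/* toy stand-in for __nilfs_mark_inode_dirty(): records whether the
 * inode may still use the fast dsync segment path (assumption) */
static int mark_inode_dirty_core(int flags, int *needs_full_sync)
{
        if (flags & I_DIRTY_DATASYNC)
                *needs_full_sync = 1;   /* ~ set_bit(NILFS_I_INODE_SYNC) */
        return 0;
}

static int mark_inode_dirty(int *s)      { return mark_inode_dirty_core(I_DIRTY, s); }
static int mark_inode_dirty_sync(int *s) { return mark_inode_dirty_core(I_DIRTY_SYNC, s); }

int main(void)
{
        int full = 0;

        mark_inode_dirty_sync(&full);   /* timestamp-only: dsync still allowed */
        printf("after sync-only dirty: %d\n", full);
        mark_inode_dirty(&full);        /* full dirty: dsync no longer allowed */
        printf("after full dirty:      %d\n", full);
        return 0;
}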
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a191634abc..7ef18fc656c2 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -930,7 +930,7 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
 		if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
 			continue;
 
-		clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+		clear_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 		set_bit(NILFS_I_UPDATED, &ii->i_state);
 	}
 }
@@ -1833,6 +1833,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
 		nilfs_set_next_segment(nilfs, segbuf);
 
 		if (update_sr) {
+			nilfs->ns_flushed_device = 0;
 			nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
 					       segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
 
@@ -2194,7 +2195,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	nilfs_transaction_lock(sb, &ti, 0);
 
 	ii = NILFS_I(inode);
-	if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
+	if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) ||
 	    nilfs_test_opt(nilfs, STRICT_ORDER) ||
 	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
 	    nilfs_discontinued(nilfs)) {
@@ -2216,6 +2217,8 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	sci->sc_dsync_end = end;
 
 	err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
+	if (!err)
+		nilfs->ns_flushed_device = 0;
 
 	nilfs_transaction_unlock(sb);
 	return err;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 8c532b2ca3ab..2e5b3ec85b8f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -310,6 +310,9 @@ int nilfs_commit_super(struct super_block *sb, int flag)
 						     nilfs->ns_sbsize));
 	}
 	clear_nilfs_sb_dirty(nilfs);
+	nilfs->ns_flushed_device = 1;
+	/* make sure store to ns_flushed_device cannot be reordered */
+	smp_wmb();
 	return nilfs_sync_super(sb, flag);
 }
 
@@ -514,6 +517,9 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 	}
 	up_write(&nilfs->ns_sem);
 
+	if (!err)
+		err = nilfs_flush_device(nilfs);
+
 	return err;
 }
 
@@ -942,7 +948,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
 			iput(inode);
 		}
 	} else {
-		dentry = d_obtain_alias(inode);
+		dentry = d_obtain_root(inode);
 		if (IS_ERR(dentry)) {
 			ret = PTR_ERR(dentry);
 			goto failed_dentry;
@@ -1452,13 +1458,19 @@ static int __init init_nilfs_fs(void)
 	if (err)
 		goto fail;
 
-	err = register_filesystem(&nilfs_fs_type);
+	err = nilfs_sysfs_init();
 	if (err)
 		goto free_cachep;
 
+	err = register_filesystem(&nilfs_fs_type);
+	if (err)
+		goto deinit_sysfs_entry;
+
 	printk(KERN_INFO "NILFS version 2 loaded\n");
 	return 0;
 
+deinit_sysfs_entry:
+	nilfs_sysfs_exit();
 free_cachep:
 	nilfs_destroy_cachep();
 fail:
@@ -1468,6 +1480,7 @@ fail:
 static void __exit exit_nilfs_fs(void)
 {
 	nilfs_destroy_cachep();
+	nilfs_sysfs_exit();
 	unregister_filesystem(&nilfs_fs_type);
 }
 
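
[Note on the init_nilfs_fs() hunk above: it follows the standard kernel error-unwind pattern, where each setup step that succeeds earns a matching label and a failure jumps past its own undo, tearing down in reverse order. A self-contained sketch of the idiom; step names are illustrative, not the real functions:]

#include <stdio.h>

static int step(const char *name, int fail)
{
        printf("%-24s %s\n", name, fail ? "-> failed" : "-> ok");
        return fail ? -1 : 0;
}

/* mirrors the shape of init_nilfs_fs(): whatever already succeeded
 * is undone in reverse order when a later step fails */
static int init_sketch(int fail_at)
{
        int err;

        err = step("create caches", fail_at == 1);
        if (err)
                goto fail;

        err = step("sysfs init", fail_at == 2);
        if (err)
                goto free_cachep;

        err = step("register filesystem", fail_at == 3);
        if (err)
                goto deinit_sysfs_entry;

        return 0;

deinit_sysfs_entry:
        step("sysfs exit (undo)", 0);
free_cachep:
        step("destroy caches (undo)", 0);
fail:
        return err;
}

int main(void)
{
        return init_sketch(3) ? 1 : 0;  /* fails at step 3, undoes 2 then 1 */
}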
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
new file mode 100644
index 000000000000..bbb0dcc35905
--- /dev/null
+++ b/fs/nilfs2/sysfs.c
@@ -0,0 +1,1137 @@
1/*
2 * sysfs.c - sysfs support implementation.
3 *
4 * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
5 * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
18 */
19
20#include <linux/kobject.h>
21
22#include "nilfs.h"
23#include "mdt.h"
24#include "sufile.h"
25#include "cpfile.h"
26#include "sysfs.h"
27
28/* /sys/fs/<nilfs>/ */
29static struct kset *nilfs_kset;
30
31#define NILFS_SHOW_TIME(time_t_val, buf) ({ \
32 struct tm res; \
33 int count = 0; \
34 time_to_tm(time_t_val, 0, &res); \
35 res.tm_year += 1900; \
36 res.tm_mon += 1; \
37 count = scnprintf(buf, PAGE_SIZE, \
38 "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \
39 res.tm_year, res.tm_mon, res.tm_mday, \
40 res.tm_hour, res.tm_min, res.tm_sec);\
41 count; \
42})
43
44#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \
45static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \
46 struct attribute *attr, char *buf) \
47{ \
48 struct the_nilfs *nilfs = container_of(kobj->parent, \
49 struct the_nilfs, \
50 ns_##parent_name##_kobj); \
51 struct nilfs_##name##_attr *a = container_of(attr, \
52 struct nilfs_##name##_attr, \
53 attr); \
54 return a->show ? a->show(a, nilfs, buf) : 0; \
55} \
56static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \
57 struct attribute *attr, \
58 const char *buf, size_t len) \
59{ \
60 struct the_nilfs *nilfs = container_of(kobj->parent, \
61 struct the_nilfs, \
62 ns_##parent_name##_kobj); \
63 struct nilfs_##name##_attr *a = container_of(attr, \
64 struct nilfs_##name##_attr, \
65 attr); \
66 return a->store ? a->store(a, nilfs, buf, len) : 0; \
67} \
68static const struct sysfs_ops nilfs_##name##_attr_ops = { \
69 .show = nilfs_##name##_attr_show, \
70 .store = nilfs_##name##_attr_store, \
71};
72
73#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \
74static void nilfs_##name##_attr_release(struct kobject *kobj) \
75{ \
76 struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
77 struct the_nilfs *nilfs = container_of(kobj->parent, \
78 struct the_nilfs, \
79 ns_##parent_name##_kobj); \
80 subgroups = nilfs->ns_##parent_name##_subgroups; \
81 complete(&subgroups->sg_##name##_kobj_unregister); \
82} \
83static struct kobj_type nilfs_##name##_ktype = { \
84 .default_attrs = nilfs_##name##_attrs, \
85 .sysfs_ops = &nilfs_##name##_attr_ops, \
86 .release = nilfs_##name##_attr_release, \
87};
88
89#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \
90static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \
91{ \
92 struct kobject *parent; \
93 struct kobject *kobj; \
94 struct completion *kobj_unregister; \
95 struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \
96 int err; \
97 subgroups = nilfs->ns_##parent_name##_subgroups; \
98 kobj = &subgroups->sg_##name##_kobj; \
99 kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \
100 parent = &nilfs->ns_##parent_name##_kobj; \
101 kobj->kset = nilfs_kset; \
102 init_completion(kobj_unregister); \
103 err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \
104 #name); \
105 if (err) \
106 return err; \
107 return 0; \
108} \
109static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \
110{ \
111 kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \
112}
113
114/************************************************************************
115 * NILFS snapshot attrs *
116 ************************************************************************/
117
118static ssize_t
119nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr,
120 struct nilfs_root *root, char *buf)
121{
122 return snprintf(buf, PAGE_SIZE, "%llu\n",
123 (unsigned long long)atomic64_read(&root->inodes_count));
124}
125
126static ssize_t
127nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr,
128 struct nilfs_root *root, char *buf)
129{
130 return snprintf(buf, PAGE_SIZE, "%llu\n",
131 (unsigned long long)atomic64_read(&root->blocks_count));
132}
133
134static const char snapshot_readme_str[] =
135 "The group contains details about mounted snapshot.\n\n"
136 "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n"
137 "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n";
138
139static ssize_t
140nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr,
141 struct nilfs_root *root, char *buf)
142{
143 return snprintf(buf, PAGE_SIZE, snapshot_readme_str);
144}
145
146NILFS_SNAPSHOT_RO_ATTR(inodes_count);
147NILFS_SNAPSHOT_RO_ATTR(blocks_count);
148NILFS_SNAPSHOT_RO_ATTR(README);
149
150static struct attribute *nilfs_snapshot_attrs[] = {
151 NILFS_SNAPSHOT_ATTR_LIST(inodes_count),
152 NILFS_SNAPSHOT_ATTR_LIST(blocks_count),
153 NILFS_SNAPSHOT_ATTR_LIST(README),
154 NULL,
155};
156
157static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj,
158 struct attribute *attr, char *buf)
159{
160 struct nilfs_root *root =
161 container_of(kobj, struct nilfs_root, snapshot_kobj);
162 struct nilfs_snapshot_attr *a =
163 container_of(attr, struct nilfs_snapshot_attr, attr);
164
165 return a->show ? a->show(a, root, buf) : 0;
166}
167
168static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj,
169 struct attribute *attr,
170 const char *buf, size_t len)
171{
172 struct nilfs_root *root =
173 container_of(kobj, struct nilfs_root, snapshot_kobj);
174 struct nilfs_snapshot_attr *a =
175 container_of(attr, struct nilfs_snapshot_attr, attr);
176
177 return a->store ? a->store(a, root, buf, len) : 0;
178}
179
180static void nilfs_snapshot_attr_release(struct kobject *kobj)
181{
182 struct nilfs_root *root = container_of(kobj, struct nilfs_root,
183 snapshot_kobj);
184 complete(&root->snapshot_kobj_unregister);
185}
186
187static const struct sysfs_ops nilfs_snapshot_attr_ops = {
188 .show = nilfs_snapshot_attr_show,
189 .store = nilfs_snapshot_attr_store,
190};
191
192static struct kobj_type nilfs_snapshot_ktype = {
193 .default_attrs = nilfs_snapshot_attrs,
194 .sysfs_ops = &nilfs_snapshot_attr_ops,
195 .release = nilfs_snapshot_attr_release,
196};
197
198int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root)
199{
200 struct the_nilfs *nilfs;
201 struct kobject *parent;
202 int err;
203
204 nilfs = root->nilfs;
205 parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj;
206 root->snapshot_kobj.kset = nilfs_kset;
207 init_completion(&root->snapshot_kobj_unregister);
208
209 if (root->cno == NILFS_CPTREE_CURRENT_CNO) {
210 err = kobject_init_and_add(&root->snapshot_kobj,
211 &nilfs_snapshot_ktype,
212 &nilfs->ns_dev_kobj,
213 "current_checkpoint");
214 } else {
215 err = kobject_init_and_add(&root->snapshot_kobj,
216 &nilfs_snapshot_ktype,
217 parent,
218 "%llu", root->cno);
219 }
220
221 if (err)
222 return err;
223
224 return 0;
225}
226
227void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root)
228{
229 kobject_del(&root->snapshot_kobj);
230}
231
232/************************************************************************
233 * NILFS mounted snapshots attrs *
234 ************************************************************************/
235
236static const char mounted_snapshots_readme_str[] =
237 "The mounted_snapshots group contains group for\n"
238 "every mounted snapshot.\n";
239
240static ssize_t
241nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr,
242 struct the_nilfs *nilfs, char *buf)
243{
244 return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str);
245}
246
247NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README);
248
249static struct attribute *nilfs_mounted_snapshots_attrs[] = {
250 NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README),
251 NULL,
252};
253
254NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev);
255NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev);
256NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev);
257
258/************************************************************************
259 * NILFS checkpoints attrs *
260 ************************************************************************/
261
262static ssize_t
263nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
264 struct the_nilfs *nilfs,
265 char *buf)
266{
267 __u64 ncheckpoints;
268 struct nilfs_cpstat cpstat;
269 int err;
270
271 down_read(&nilfs->ns_segctor_sem);
272 err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
273 up_read(&nilfs->ns_segctor_sem);
274 if (err < 0) {
275 printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
276 err);
277 return err;
278 }
279
280 ncheckpoints = cpstat.cs_ncps;
281
282 return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints);
283}
284
285static ssize_t
286nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
287 struct the_nilfs *nilfs,
288 char *buf)
289{
290 __u64 nsnapshots;
291 struct nilfs_cpstat cpstat;
292 int err;
293
294 down_read(&nilfs->ns_segctor_sem);
295 err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
296 up_read(&nilfs->ns_segctor_sem);
297 if (err < 0) {
298 printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
299 err);
300 return err;
301 }
302
303 nsnapshots = cpstat.cs_nsss;
304
305 return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots);
306}
307
308static ssize_t
309nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr,
310 struct the_nilfs *nilfs,
311 char *buf)
312{
313 __u64 last_cno;
314
315 spin_lock(&nilfs->ns_last_segment_lock);
316 last_cno = nilfs->ns_last_cno;
317 spin_unlock(&nilfs->ns_last_segment_lock);
318
319 return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
320}
321
322static ssize_t
323nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
324 struct the_nilfs *nilfs,
325 char *buf)
326{
327 __u64 cno;
328
329 down_read(&nilfs->ns_sem);
330 cno = nilfs->ns_cno;
331 up_read(&nilfs->ns_sem);
332
333 return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
334}
335
336static const char checkpoints_readme_str[] =
337 "The checkpoints group contains attributes that describe\n"
338 "details about volume's checkpoints.\n\n"
339 "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n"
340 "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n"
341 "(3) last_seg_checkpoint\n"
342 "\tshow checkpoint number of the latest segment.\n\n"
343 "(4) next_checkpoint\n\tshow next checkpoint number.\n\n";
344
345static ssize_t
346nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr,
347 struct the_nilfs *nilfs, char *buf)
348{
349 return snprintf(buf, PAGE_SIZE, checkpoints_readme_str);
350}
351
352NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number);
353NILFS_CHECKPOINTS_RO_ATTR(snapshots_number);
354NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint);
355NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint);
356NILFS_CHECKPOINTS_RO_ATTR(README);
357
358static struct attribute *nilfs_checkpoints_attrs[] = {
359 NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number),
360 NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number),
361 NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint),
362 NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint),
363 NILFS_CHECKPOINTS_ATTR_LIST(README),
364 NULL,
365};
366
367NILFS_DEV_INT_GROUP_OPS(checkpoints, dev);
368NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev);
369NILFS_DEV_INT_GROUP_FNS(checkpoints, dev);
370
371/************************************************************************
372 * NILFS segments attrs *
373 ************************************************************************/
374
375static ssize_t
376nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr,
377 struct the_nilfs *nilfs,
378 char *buf)
379{
380 return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments);
381}
382
383static ssize_t
384nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr,
385 struct the_nilfs *nilfs,
386 char *buf)
387{
388 return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment);
389}
390
391static ssize_t
392nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr,
393 struct the_nilfs *nilfs,
394 char *buf)
395{
396 unsigned long ncleansegs;
397
398 down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
399 ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
400 up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
401
402 return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs);
403}
404
405static ssize_t
406nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
407 struct the_nilfs *nilfs,
408 char *buf)
409{
410 struct nilfs_sustat sustat;
411 int err;
412
413 down_read(&nilfs->ns_segctor_sem);
414 err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
415 up_read(&nilfs->ns_segctor_sem);
416 if (err < 0) {
417 printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
418 err);
419 return err;
420 }
421
422 return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs);
423}
424
425static const char segments_readme_str[] =
426 "The segments group contains attributes that describe\n"
427 "details about volume's segments.\n\n"
428 "(1) segments_number\n\tshow number of segments on volume.\n\n"
429 "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n"
430 "(3) clean_segments\n\tshow count of clean segments.\n\n"
431 "(4) dirty_segments\n\tshow count of dirty segments.\n\n";
432
433static ssize_t
434nilfs_segments_README_show(struct nilfs_segments_attr *attr,
435 struct the_nilfs *nilfs,
436 char *buf)
437{
438 return snprintf(buf, PAGE_SIZE, segments_readme_str);
439}
440
441NILFS_SEGMENTS_RO_ATTR(segments_number);
442NILFS_SEGMENTS_RO_ATTR(blocks_per_segment);
443NILFS_SEGMENTS_RO_ATTR(clean_segments);
444NILFS_SEGMENTS_RO_ATTR(dirty_segments);
445NILFS_SEGMENTS_RO_ATTR(README);
446
447static struct attribute *nilfs_segments_attrs[] = {
448 NILFS_SEGMENTS_ATTR_LIST(segments_number),
449 NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment),
450 NILFS_SEGMENTS_ATTR_LIST(clean_segments),
451 NILFS_SEGMENTS_ATTR_LIST(dirty_segments),
452 NILFS_SEGMENTS_ATTR_LIST(README),
453 NULL,
454};
455
456NILFS_DEV_INT_GROUP_OPS(segments, dev);
457NILFS_DEV_INT_GROUP_TYPE(segments, dev);
458NILFS_DEV_INT_GROUP_FNS(segments, dev);
459
460/************************************************************************
461 * NILFS segctor attrs *
462 ************************************************************************/
463
464static ssize_t
465nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr,
466 struct the_nilfs *nilfs,
467 char *buf)
468{
469 sector_t last_pseg;
470
471 spin_lock(&nilfs->ns_last_segment_lock);
472 last_pseg = nilfs->ns_last_pseg;
473 spin_unlock(&nilfs->ns_last_segment_lock);
474
475 return snprintf(buf, PAGE_SIZE, "%llu\n",
476 (unsigned long long)last_pseg);
477}
478
479static ssize_t
480nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr,
481 struct the_nilfs *nilfs,
482 char *buf)
483{
484 u64 last_seq;
485
486 spin_lock(&nilfs->ns_last_segment_lock);
487 last_seq = nilfs->ns_last_seq;
488 spin_unlock(&nilfs->ns_last_segment_lock);
489
490 return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq);
491}
492
493static ssize_t
494nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr,
495 struct the_nilfs *nilfs,
496 char *buf)
497{
498 __u64 last_cno;
499
500 spin_lock(&nilfs->ns_last_segment_lock);
501 last_cno = nilfs->ns_last_cno;
502 spin_unlock(&nilfs->ns_last_segment_lock);
503
504 return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno);
505}
506
507static ssize_t
508nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
509 struct the_nilfs *nilfs,
510 char *buf)
511{
512 u64 seg_seq;
513
514 down_read(&nilfs->ns_sem);
515 seg_seq = nilfs->ns_seg_seq;
516 up_read(&nilfs->ns_sem);
517
518 return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
519}
520
521static ssize_t
522nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
523 struct the_nilfs *nilfs,
524 char *buf)
525{
526 __u64 segnum;
527
528 down_read(&nilfs->ns_sem);
529 segnum = nilfs->ns_segnum;
530 up_read(&nilfs->ns_sem);
531
532 return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
533}
534
535static ssize_t
536nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
537 struct the_nilfs *nilfs,
538 char *buf)
539{
540 __u64 nextnum;
541
542 down_read(&nilfs->ns_sem);
543 nextnum = nilfs->ns_nextnum;
544 up_read(&nilfs->ns_sem);
545
546 return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
547}
548
549static ssize_t
550nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
551 struct the_nilfs *nilfs,
552 char *buf)
553{
554 unsigned long pseg_offset;
555
556 down_read(&nilfs->ns_sem);
557 pseg_offset = nilfs->ns_pseg_offset;
558 up_read(&nilfs->ns_sem);
559
560 return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
561}
562
563static ssize_t
564nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
565 struct the_nilfs *nilfs,
566 char *buf)
567{
568 __u64 cno;
569
570 down_read(&nilfs->ns_sem);
571 cno = nilfs->ns_cno;
572 up_read(&nilfs->ns_sem);
573
574 return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
575}
576
577static ssize_t
578nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
579 struct the_nilfs *nilfs,
580 char *buf)
581{
582 time_t ctime;
583
584 down_read(&nilfs->ns_sem);
585 ctime = nilfs->ns_ctime;
586 up_read(&nilfs->ns_sem);
587
588 return NILFS_SHOW_TIME(ctime, buf);
589}
590
591static ssize_t
592nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
593 struct the_nilfs *nilfs,
594 char *buf)
595{
596 time_t ctime;
597
598 down_read(&nilfs->ns_sem);
599 ctime = nilfs->ns_ctime;
600 up_read(&nilfs->ns_sem);
601
602 return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
603}
604
605static ssize_t
606nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
607 struct the_nilfs *nilfs,
608 char *buf)
609{
610 time_t nongc_ctime;
611
612 down_read(&nilfs->ns_sem);
613 nongc_ctime = nilfs->ns_nongc_ctime;
614 up_read(&nilfs->ns_sem);
615
616 return NILFS_SHOW_TIME(nongc_ctime, buf);
617}
618
619static ssize_t
620nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
621 struct the_nilfs *nilfs,
622 char *buf)
623{
624 time_t nongc_ctime;
625
626 down_read(&nilfs->ns_sem);
627 nongc_ctime = nilfs->ns_nongc_ctime;
628 up_read(&nilfs->ns_sem);
629
630 return snprintf(buf, PAGE_SIZE, "%llu\n",
631 (unsigned long long)nongc_ctime);
632}
633
634static ssize_t
635nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
636 struct the_nilfs *nilfs,
637 char *buf)
638{
639 u32 ndirtyblks;
640
641 down_read(&nilfs->ns_sem);
642 ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
643 up_read(&nilfs->ns_sem);
644
645 return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
646}
647
648static const char segctor_readme_str[] =
649 "The segctor group contains attributes that describe\n"
650 "segctor thread activity details.\n\n"
651 "(1) last_pseg_block\n"
652 "\tshow start block number of the latest segment.\n\n"
653 "(2) last_seg_sequence\n"
654 "\tshow sequence value of the latest segment.\n\n"
655 "(3) last_seg_checkpoint\n"
656 "\tshow checkpoint number of the latest segment.\n\n"
657 "(4) current_seg_sequence\n\tshow segment sequence counter.\n\n"
658 "(5) current_last_full_seg\n"
659 "\tshow index number of the latest full segment.\n\n"
660 "(6) next_full_seg\n"
661 "\tshow index number of the full segment index to be used next.\n\n"
662 "(7) next_pseg_offset\n"
663 "\tshow offset of next partial segment in the current full segment.\n\n"
664 "(8) next_checkpoint\n\tshow next checkpoint number.\n\n"
665 "(9) last_seg_write_time\n"
666 "\tshow write time of the last segment in human-readable format.\n\n"
667 "(10) last_seg_write_time_secs\n"
668 "\tshow write time of the last segment in seconds.\n\n"
669 "(11) last_nongc_write_time\n"
670 "\tshow write time of the last segment not for cleaner operation "
671 "in human-readable format.\n\n"
672 "(12) last_nongc_write_time_secs\n"
673 "\tshow write time of the last segment not for cleaner operation "
674 "in seconds.\n\n"
675 "(13) dirty_data_blocks_count\n"
676 "\tshow number of dirty data blocks.\n\n";
677
678static ssize_t
679nilfs_segctor_README_show(struct nilfs_segctor_attr *attr,
680 struct the_nilfs *nilfs, char *buf)
681{
682 return snprintf(buf, PAGE_SIZE, segctor_readme_str);
683}
684
685NILFS_SEGCTOR_RO_ATTR(last_pseg_block);
686NILFS_SEGCTOR_RO_ATTR(last_seg_sequence);
687NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint);
688NILFS_SEGCTOR_RO_ATTR(current_seg_sequence);
689NILFS_SEGCTOR_RO_ATTR(current_last_full_seg);
690NILFS_SEGCTOR_RO_ATTR(next_full_seg);
691NILFS_SEGCTOR_RO_ATTR(next_pseg_offset);
692NILFS_SEGCTOR_RO_ATTR(next_checkpoint);
693NILFS_SEGCTOR_RO_ATTR(last_seg_write_time);
694NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs);
695NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time);
696NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs);
697NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count);
698NILFS_SEGCTOR_RO_ATTR(README);
699
700static struct attribute *nilfs_segctor_attrs[] = {
701 NILFS_SEGCTOR_ATTR_LIST(last_pseg_block),
702 NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence),
703 NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint),
704 NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence),
705 NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg),
706 NILFS_SEGCTOR_ATTR_LIST(next_full_seg),
707 NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset),
708 NILFS_SEGCTOR_ATTR_LIST(next_checkpoint),
709 NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time),
710 NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs),
711 NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time),
712 NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs),
713 NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count),
714 NILFS_SEGCTOR_ATTR_LIST(README),
715 NULL,
716};
717
718NILFS_DEV_INT_GROUP_OPS(segctor, dev);
719NILFS_DEV_INT_GROUP_TYPE(segctor, dev);
720NILFS_DEV_INT_GROUP_FNS(segctor, dev);
721
722/************************************************************************
723 * NILFS superblock attrs *
724 ************************************************************************/
725
726static ssize_t
727nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr,
728 struct the_nilfs *nilfs,
729 char *buf)
730{
731 time_t sbwtime;
732
733 down_read(&nilfs->ns_sem);
734 sbwtime = nilfs->ns_sbwtime;
735 up_read(&nilfs->ns_sem);
736
737 return NILFS_SHOW_TIME(sbwtime, buf);
738}
739
740static ssize_t
741nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr,
742 struct the_nilfs *nilfs,
743 char *buf)
744{
745 time_t sbwtime;
746
747 down_read(&nilfs->ns_sem);
748 sbwtime = nilfs->ns_sbwtime;
749 up_read(&nilfs->ns_sem);
750
751 return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime);
752}
753
754static ssize_t
755nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr,
756 struct the_nilfs *nilfs,
757 char *buf)
758{
759 unsigned sbwcount;
760
761 down_read(&nilfs->ns_sem);
762 sbwcount = nilfs->ns_sbwcount;
763 up_read(&nilfs->ns_sem);
764
765 return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount);
766}
767
768static ssize_t
769nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr,
770 struct the_nilfs *nilfs,
771 char *buf)
772{
773 unsigned sb_update_freq;
774
775 down_read(&nilfs->ns_sem);
776 sb_update_freq = nilfs->ns_sb_update_freq;
777 up_read(&nilfs->ns_sem);
778
779 return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq);
780}
781
782static ssize_t
783nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
784 struct the_nilfs *nilfs,
785 const char *buf, size_t count)
786{
787 unsigned val;
788 int err;
789
790 err = kstrtouint(skip_spaces(buf), 0, &val);
791 if (err) {
792 printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
793 err);
794 return err;
795 }
796
797 if (val < NILFS_SB_FREQ) {
798 val = NILFS_SB_FREQ;
799 printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
800 }
801
802 down_write(&nilfs->ns_sem);
803 nilfs->ns_sb_update_freq = val;
804 up_write(&nilfs->ns_sem);
805
806 return count;
807}
808
809static const char sb_readme_str[] =
810 "The superblock group contains attributes that describe\n"
811 "superblock's details.\n\n"
812 "(1) sb_write_time\n\tshow previous write time of super block "
813 "in human-readable format.\n\n"
814 "(2) sb_write_time_secs\n\tshow previous write time of super block "
815 "in seconds.\n\n"
816 "(3) sb_write_count\n\tshow write count of super block.\n\n"
817 "(4) sb_update_frequency\n"
818 "\tshow/set interval of periodical update of superblock (in seconds).\n\n"
819 "\tYou can set preferable frequency of superblock update by command:\n\n"
820 "\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n";
821
822static ssize_t
823nilfs_superblock_README_show(struct nilfs_superblock_attr *attr,
824 struct the_nilfs *nilfs, char *buf)
825{
826 return snprintf(buf, PAGE_SIZE, sb_readme_str);
827}
828
829NILFS_SUPERBLOCK_RO_ATTR(sb_write_time);
830NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs);
831NILFS_SUPERBLOCK_RO_ATTR(sb_write_count);
832NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency);
833NILFS_SUPERBLOCK_RO_ATTR(README);
834
835static struct attribute *nilfs_superblock_attrs[] = {
836 NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time),
837 NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs),
838 NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count),
839 NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency),
840 NILFS_SUPERBLOCK_ATTR_LIST(README),
841 NULL,
842};
843
844NILFS_DEV_INT_GROUP_OPS(superblock, dev);
845NILFS_DEV_INT_GROUP_TYPE(superblock, dev);
846NILFS_DEV_INT_GROUP_FNS(superblock, dev);
847
848/************************************************************************
849 * NILFS device attrs *
850 ************************************************************************/
851
852static
853ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr,
854 struct the_nilfs *nilfs,
855 char *buf)
856{
857 struct nilfs_super_block **sbp = nilfs->ns_sbp;
858 u32 major = le32_to_cpu(sbp[0]->s_rev_level);
859 u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level);
860
861 return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor);
862}
863
864static
865ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr,
866 struct the_nilfs *nilfs,
867 char *buf)
868{
869 return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize);
870}
871
872static
873ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr,
874 struct the_nilfs *nilfs,
875 char *buf)
876{
877 struct nilfs_super_block **sbp = nilfs->ns_sbp;
878 u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size);
879
880 return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size);
881}
882
883static
884ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr,
885 struct the_nilfs *nilfs,
886 char *buf)
887{
888 sector_t free_blocks = 0;
889
890 nilfs_count_free_blocks(nilfs, &free_blocks);
891 return snprintf(buf, PAGE_SIZE, "%llu\n",
892 (unsigned long long)free_blocks);
893}
894
895static
896ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr,
897 struct the_nilfs *nilfs,
898 char *buf)
899{
900 struct nilfs_super_block **sbp = nilfs->ns_sbp;
901
902 return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid);
903}
904
905static
906ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr,
907 struct the_nilfs *nilfs,
908 char *buf)
909{
910 struct nilfs_super_block **sbp = nilfs->ns_sbp;
911
912 return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n",
913 sbp[0]->s_volume_name);
914}
915
916static const char dev_readme_str[] =
917 "The <device> group contains attributes that describe file system\n"
918 "partition's details.\n\n"
919 "(1) revision\n\tshow NILFS file system revision.\n\n"
920 "(2) blocksize\n\tshow volume block size in bytes.\n\n"
921 "(3) device_size\n\tshow volume size in bytes.\n\n"
922 "(4) free_blocks\n\tshow count of free blocks on volume.\n\n"
923 "(5) uuid\n\tshow volume's UUID.\n\n"
924 "(6) volume_name\n\tshow volume's name.\n\n";
925
926static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr,
927 struct the_nilfs *nilfs,
928 char *buf)
929{
930 return snprintf(buf, PAGE_SIZE, dev_readme_str);
931}
932
933NILFS_DEV_RO_ATTR(revision);
934NILFS_DEV_RO_ATTR(blocksize);
935NILFS_DEV_RO_ATTR(device_size);
936NILFS_DEV_RO_ATTR(free_blocks);
937NILFS_DEV_RO_ATTR(uuid);
938NILFS_DEV_RO_ATTR(volume_name);
939NILFS_DEV_RO_ATTR(README);
940
941static struct attribute *nilfs_dev_attrs[] = {
942 NILFS_DEV_ATTR_LIST(revision),
943 NILFS_DEV_ATTR_LIST(blocksize),
944 NILFS_DEV_ATTR_LIST(device_size),
945 NILFS_DEV_ATTR_LIST(free_blocks),
946 NILFS_DEV_ATTR_LIST(uuid),
947 NILFS_DEV_ATTR_LIST(volume_name),
948 NILFS_DEV_ATTR_LIST(README),
949 NULL,
950};
951
952static ssize_t nilfs_dev_attr_show(struct kobject *kobj,
953 struct attribute *attr, char *buf)
954{
955 struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
956 ns_dev_kobj);
957 struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
958 attr);
959
960 return a->show ? a->show(a, nilfs, buf) : 0;
961}
962
963static ssize_t nilfs_dev_attr_store(struct kobject *kobj,
964 struct attribute *attr,
965 const char *buf, size_t len)
966{
967 struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
968 ns_dev_kobj);
969 struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr,
970 attr);
971
972 return a->store ? a->store(a, nilfs, buf, len) : 0;
973}
974
975static void nilfs_dev_attr_release(struct kobject *kobj)
976{
977 struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs,
978 ns_dev_kobj);
979 complete(&nilfs->ns_dev_kobj_unregister);
980}
981
982static const struct sysfs_ops nilfs_dev_attr_ops = {
983 .show = nilfs_dev_attr_show,
984 .store = nilfs_dev_attr_store,
985};
986
987static struct kobj_type nilfs_dev_ktype = {
988 .default_attrs = nilfs_dev_attrs,
989 .sysfs_ops = &nilfs_dev_attr_ops,
990 .release = nilfs_dev_attr_release,
991};
992
993int nilfs_sysfs_create_device_group(struct super_block *sb)
994{
995 struct the_nilfs *nilfs = sb->s_fs_info;
996 size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups);
997 int err;
998
999 nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
1000 if (unlikely(!nilfs->ns_dev_subgroups)) {
1001 err = -ENOMEM;
1002 printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
1003 goto failed_create_device_group;
1004 }
1005
1006 nilfs->ns_dev_kobj.kset = nilfs_kset;
1007 init_completion(&nilfs->ns_dev_kobj_unregister);
1008 err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL,
1009 "%s", sb->s_id);
1010 if (err)
1011 goto free_dev_subgroups;
1012
1013 err = nilfs_sysfs_create_mounted_snapshots_group(nilfs);
1014 if (err)
1015 goto cleanup_dev_kobject;
1016
1017 err = nilfs_sysfs_create_checkpoints_group(nilfs);
1018 if (err)
1019 goto delete_mounted_snapshots_group;
1020
1021 err = nilfs_sysfs_create_segments_group(nilfs);
1022 if (err)
1023 goto delete_checkpoints_group;
1024
1025 err = nilfs_sysfs_create_superblock_group(nilfs);
1026 if (err)
1027 goto delete_segments_group;
1028
1029 err = nilfs_sysfs_create_segctor_group(nilfs);
1030 if (err)
1031 goto delete_superblock_group;
1032
1033 return 0;
1034
1035delete_superblock_group:
1036 nilfs_sysfs_delete_superblock_group(nilfs);
1037
1038delete_segments_group:
1039 nilfs_sysfs_delete_segments_group(nilfs);
1040
1041delete_checkpoints_group:
1042 nilfs_sysfs_delete_checkpoints_group(nilfs);
1043
1044delete_mounted_snapshots_group:
1045 nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
1046
1047cleanup_dev_kobject:
1048 kobject_del(&nilfs->ns_dev_kobj);
1049
1050free_dev_subgroups:
1051 kfree(nilfs->ns_dev_subgroups);
1052
1053failed_create_device_group:
1054 return err;
1055}
1056
1057void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs)
1058{
1059 nilfs_sysfs_delete_mounted_snapshots_group(nilfs);
1060 nilfs_sysfs_delete_checkpoints_group(nilfs);
1061 nilfs_sysfs_delete_segments_group(nilfs);
1062 nilfs_sysfs_delete_superblock_group(nilfs);
1063 nilfs_sysfs_delete_segctor_group(nilfs);
1064 kobject_del(&nilfs->ns_dev_kobj);
1065 kfree(nilfs->ns_dev_subgroups);
1066}
1067
1068/************************************************************************
1069 * NILFS feature attrs *
1070 ************************************************************************/
1071
1072static ssize_t nilfs_feature_revision_show(struct kobject *kobj,
1073 struct attribute *attr, char *buf)
1074{
1075 return snprintf(buf, PAGE_SIZE, "%d.%d\n",
1076 NILFS_CURRENT_REV, NILFS_MINOR_REV);
1077}
1078
1079static const char features_readme_str[] =
1080 "The features group contains attributes that describe NILFS file\n"
1081 "system driver features.\n\n"
1082 "(1) revision\n\tshow current revision of NILFS file system driver.\n";
1083
1084static ssize_t nilfs_feature_README_show(struct kobject *kobj,
1085 struct attribute *attr,
1086 char *buf)
1087{
1088 return snprintf(buf, PAGE_SIZE, features_readme_str);
1089}
1090
1091NILFS_FEATURE_RO_ATTR(revision);
1092NILFS_FEATURE_RO_ATTR(README);
1093
1094static struct attribute *nilfs_feature_attrs[] = {
1095 NILFS_FEATURE_ATTR_LIST(revision),
1096 NILFS_FEATURE_ATTR_LIST(README),
1097 NULL,
1098};
1099
1100static const struct attribute_group nilfs_feature_attr_group = {
1101 .name = "features",
1102 .attrs = nilfs_feature_attrs,
1103};
1104
1105int __init nilfs_sysfs_init(void)
1106{
1107 int err;
1108
1109 nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
1110 if (!nilfs_kset) {
1111 err = -ENOMEM;
1112 printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
1113 err);
1114 goto failed_sysfs_init;
1115 }
1116
1117 err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
1118 if (unlikely(err)) {
1119 printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
1120 err);
1121 goto cleanup_sysfs_init;
1122 }
1123
1124 return 0;
1125
1126cleanup_sysfs_init:
1127 kset_unregister(nilfs_kset);
1128
1129failed_sysfs_init:
1130 return err;
1131}
1132
1133void nilfs_sysfs_exit(void)
1134{
1135 sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
1136 kset_unregister(nilfs_kset);
1137}
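
[Note on nilfs_dev_attr_show()/store() above: sysfs hands the generic struct attribute to the kobj_type's sysfs_ops, and container_of() recovers the typed wrapper that carries the real callback. A minimal user-space model of that double dispatch; the types and names here are illustrative, not kernel API:]

#include <stddef.h>
#include <stdio.h>

struct attribute { const char *name; };

/* typed wrapper embedding the generic attribute, like nilfs_dev_attr */
struct dev_attr {
        struct attribute attr;
        int (*show)(void);
};

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static int revision_show(void) { return 2; }

static struct dev_attr revision = { { "revision" }, revision_show };

/* generic sysfs-style entry point: receives only the embedded attribute
 * and climbs back to the wrapper to find the typed callback */
static int generic_show(struct attribute *attr)
{
        struct dev_attr *a = container_of(attr, struct dev_attr, attr);

        return a->show ? a->show() : 0;
}

int main(void)
{
        printf("%s = %d\n", revision.attr.name, generic_show(&revision.attr));
        return 0;
}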
diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h
new file mode 100644
index 000000000000..677e3a1a8370
--- /dev/null
+++ b/fs/nilfs2/sysfs.h
@@ -0,0 +1,176 @@
1/*
2 * sysfs.h - sysfs support declarations.
3 *
4 * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation.
5 * Copyright (C) 2014 HGST, Inc., a Western Digital Company.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com>
18 */
19
20#ifndef _NILFS_SYSFS_H
21#define _NILFS_SYSFS_H
22
23#include <linux/sysfs.h>
24
25#define NILFS_ROOT_GROUP_NAME "nilfs2"
26
27/*
28 * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects
29 * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock
30 * @sg_superblock_kobj_unregister: completion state
31 * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor
32 * @sg_segctor_kobj_unregister: completion state
33 * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots
34 * @sg_mounted_snapshots_kobj_unregister: completion state
35 * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints
36 * @sg_checkpoints_kobj_unregister: completion state
37 * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments
38 * @sg_segments_kobj_unregister: completion state
39 */
40struct nilfs_sysfs_dev_subgroups {
41 /* /sys/fs/<nilfs>/<device>/superblock */
42 struct kobject sg_superblock_kobj;
43 struct completion sg_superblock_kobj_unregister;
44
45 /* /sys/fs/<nilfs>/<device>/segctor */
46 struct kobject sg_segctor_kobj;
47 struct completion sg_segctor_kobj_unregister;
48
49 /* /sys/fs/<nilfs>/<device>/mounted_snapshots */
50 struct kobject sg_mounted_snapshots_kobj;
51 struct completion sg_mounted_snapshots_kobj_unregister;
52
53 /* /sys/fs/<nilfs>/<device>/checkpoints */
54 struct kobject sg_checkpoints_kobj;
55 struct completion sg_checkpoints_kobj_unregister;
56
57 /* /sys/fs/<nilfs>/<device>/segments */
58 struct kobject sg_segments_kobj;
59 struct completion sg_segments_kobj_unregister;
60};
61
62#define NILFS_COMMON_ATTR_STRUCT(name) \
63struct nilfs_##name##_attr { \
64 struct attribute attr; \
65 ssize_t (*show)(struct kobject *, struct attribute *, \
66 char *); \
67 ssize_t (*store)(struct kobject *, struct attribute *, \
68 const char *, size_t); \
69};
70
71NILFS_COMMON_ATTR_STRUCT(feature);
72
73#define NILFS_DEV_ATTR_STRUCT(name) \
74struct nilfs_##name##_attr { \
75 struct attribute attr; \
76 ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \
77 char *); \
78 ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \
79 const char *, size_t); \
80};
81
82NILFS_DEV_ATTR_STRUCT(dev);
83NILFS_DEV_ATTR_STRUCT(segments);
84NILFS_DEV_ATTR_STRUCT(mounted_snapshots);
85NILFS_DEV_ATTR_STRUCT(checkpoints);
86NILFS_DEV_ATTR_STRUCT(superblock);
87NILFS_DEV_ATTR_STRUCT(segctor);
88
89#define NILFS_CP_ATTR_STRUCT(name) \
90struct nilfs_##name##_attr { \
91 struct attribute attr; \
92 ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \
93 char *); \
94 ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \
95 const char *, size_t); \
96};
97
98NILFS_CP_ATTR_STRUCT(snapshot);
99
100#define NILFS_ATTR(type, name, mode, show, store) \
101 static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \
102 __ATTR(name, mode, show, store)
103
104#define NILFS_INFO_ATTR(type, name) \
105 NILFS_ATTR(type, name, 0444, NULL, NULL)
106#define NILFS_RO_ATTR(type, name) \
107 NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL)
108#define NILFS_RW_ATTR(type, name) \
109 NILFS_ATTR(type, name, 0644, \
110 nilfs_##type##_##name##_show, \
111 nilfs_##type##_##name##_store)
112
113#define NILFS_FEATURE_INFO_ATTR(name) \
114 NILFS_INFO_ATTR(feature, name)
115#define NILFS_FEATURE_RO_ATTR(name) \
116 NILFS_RO_ATTR(feature, name)
117#define NILFS_FEATURE_RW_ATTR(name) \
118 NILFS_RW_ATTR(feature, name)
119
120#define NILFS_DEV_INFO_ATTR(name) \
121 NILFS_INFO_ATTR(dev, name)
122#define NILFS_DEV_RO_ATTR(name) \
123 NILFS_RO_ATTR(dev, name)
124#define NILFS_DEV_RW_ATTR(name) \
125 NILFS_RW_ATTR(dev, name)
126
127#define NILFS_SEGMENTS_RO_ATTR(name) \
128 NILFS_RO_ATTR(segments, name)
129#define NILFS_SEGMENTS_RW_ATTR(name) \
130 NILFS_RW_ATTR(segs_info, name)
131
132#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \
133 NILFS_RO_ATTR(mounted_snapshots, name)
134
135#define NILFS_CHECKPOINTS_RO_ATTR(name) \
136 NILFS_RO_ATTR(checkpoints, name)
137#define NILFS_CHECKPOINTS_RW_ATTR(name) \
138 NILFS_RW_ATTR(checkpoints, name)
139
140#define NILFS_SNAPSHOT_INFO_ATTR(name) \
141 NILFS_INFO_ATTR(snapshot, name)
142#define NILFS_SNAPSHOT_RO_ATTR(name) \
143 NILFS_RO_ATTR(snapshot, name)
144#define NILFS_SNAPSHOT_RW_ATTR(name) \
145 NILFS_RW_ATTR(snapshot, name)
146
147#define NILFS_SUPERBLOCK_RO_ATTR(name) \
148 NILFS_RO_ATTR(superblock, name)
149#define NILFS_SUPERBLOCK_RW_ATTR(name) \
150 NILFS_RW_ATTR(superblock, name)
151
152#define NILFS_SEGCTOR_INFO_ATTR(name) \
153 NILFS_INFO_ATTR(segctor, name)
154#define NILFS_SEGCTOR_RO_ATTR(name) \
155 NILFS_RO_ATTR(segctor, name)
156#define NILFS_SEGCTOR_RW_ATTR(name) \
157 NILFS_RW_ATTR(segctor, name)
158
159#define NILFS_FEATURE_ATTR_LIST(name) \
160 (&nilfs_feature_attr_##name.attr)
161#define NILFS_DEV_ATTR_LIST(name) \
162 (&nilfs_dev_attr_##name.attr)
163#define NILFS_SEGMENTS_ATTR_LIST(name) \
164 (&nilfs_segments_attr_##name.attr)
165#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \
166 (&nilfs_mounted_snapshots_attr_##name.attr)
167#define NILFS_CHECKPOINTS_ATTR_LIST(name) \
168 (&nilfs_checkpoints_attr_##name.attr)
169#define NILFS_SNAPSHOT_ATTR_LIST(name) \
170 (&nilfs_snapshot_attr_##name.attr)
171#define NILFS_SUPERBLOCK_ATTR_LIST(name) \
172 (&nilfs_superblock_attr_##name.attr)
173#define NILFS_SEGCTOR_ATTR_LIST(name) \
174 (&nilfs_segctor_attr_##name.attr)
175
176#endif /* _NILFS_SYSFS_H */
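
[Note on the macro stack above: it bottoms out in the kernel's __ATTR() initializer, and tracing one invocation by hand helps when reading the attribute tables in sysfs.c. A standalone sketch of roughly what NILFS_DEV_RO_ATTR(revision) unfolds to, using shortened replicas of the declarations; the exact field spelling is __ATTR()'s concern:]

#include <stdio.h>
#include <sys/types.h>

/* shortened replicas of the structures declared above */
struct attribute { const char *name; unsigned short mode; };

struct nilfs_dev_attr {
        struct attribute attr;
        ssize_t (*show)(char *buf);
        ssize_t (*store)(const char *buf, size_t len);
};

static ssize_t nilfs_dev_revision_show(char *buf)
{
        return (ssize_t)sprintf(buf, "2.0\n");
}

/* approximately what NILFS_DEV_RO_ATTR(revision) expands to once
 * NILFS_RO_ATTR, NILFS_ATTR and __ATTR() are unfolded */
static struct nilfs_dev_attr nilfs_dev_attr_revision = {
        .attr  = { .name = "revision", .mode = 0444 },
        .show  = nilfs_dev_revision_show,
        .store = NULL,                  /* read-only attribute */
};

int main(void)
{
        char buf[16];

        /* NILFS_DEV_ATTR_LIST(revision) == &nilfs_dev_attr_revision.attr */
        nilfs_dev_attr_revision.show(buf);
        printf("%s(mode %o)\n", buf, nilfs_dev_attr_revision.attr.mode);
        return 0;
}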
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 8ba8229ba076..9da25fe9ea61 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -85,6 +85,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	nilfs->ns_cptree = RB_ROOT;
 	spin_lock_init(&nilfs->ns_cptree_lock);
 	init_rwsem(&nilfs->ns_segctor_sem);
+	nilfs->ns_sb_update_freq = NILFS_SB_FREQ;
 
 	return nilfs;
 }
@@ -97,6 +98,7 @@ void destroy_nilfs(struct the_nilfs *nilfs)
 {
 	might_sleep();
 	if (nilfs_init(nilfs)) {
+		nilfs_sysfs_delete_device_group(nilfs);
 		brelse(nilfs->ns_sbh[0]);
 		brelse(nilfs->ns_sbh[1]);
 	}
@@ -640,6 +642,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
 	if (err)
 		goto failed_sbh;
 
+	err = nilfs_sysfs_create_device_group(sb);
+	if (err)
+		goto failed_sbh;
+
 	set_nilfs_init(nilfs);
 	err = 0;
  out:
@@ -740,12 +746,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 {
 	struct rb_node **p, *parent;
 	struct nilfs_root *root, *new;
+	int err;
 
 	root = nilfs_lookup_root(nilfs, cno);
 	if (root)
 		return root;
 
-	new = kmalloc(sizeof(*root), GFP_KERNEL);
+	new = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!new)
 		return NULL;
 
@@ -782,6 +789,12 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno)
 
 	spin_unlock(&nilfs->ns_cptree_lock);
 
+	err = nilfs_sysfs_create_snapshot_group(new);
+	if (err) {
+		kfree(new);
+		new = NULL;
+	}
+
 	return new;
 }
 
@@ -790,6 +803,8 @@ void nilfs_put_root(struct nilfs_root *root)
 	if (atomic_dec_and_test(&root->count)) {
 		struct the_nilfs *nilfs = root->nilfs;
 
+		nilfs_sysfs_delete_snapshot_group(root);
+
 		spin_lock(&nilfs->ns_cptree_lock);
 		rb_erase(&root->rb_node, &nilfs->ns_cptree);
 		spin_unlock(&nilfs->ns_cptree_lock);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index de8cc53b4a5c..23778d385836 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -33,6 +33,7 @@
 #include <linux/slab.h>
 
 struct nilfs_sc_info;
+struct nilfs_sysfs_dev_subgroups;
 
 /* the_nilfs struct */
 enum {
@@ -45,6 +46,7 @@ enum {
 /**
  * struct the_nilfs - struct to supervise multiple nilfs mount points
  * @ns_flags: flags
+ * @ns_flushed_device: flag indicating if all volatile data was flushed
  * @ns_bdev: block device
  * @ns_sem: semaphore for shared states
  * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts
@@ -54,6 +56,7 @@ enum {
  * @ns_sbwcount: write count of super block
  * @ns_sbsize: size of valid data in super block
  * @ns_mount_state: file system state
+ * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds)
  * @ns_seg_seq: segment sequence counter
  * @ns_segnum: index number of the latest full segment.
  * @ns_nextnum: index number of the full segment index to be used next
@@ -95,9 +98,13 @@ enum {
  * @ns_inode_size: size of on-disk inode
  * @ns_first_ino: first not-special inode number
  * @ns_crc_seed: seed value of CRC32 calculation
+ * @ns_dev_kobj: /sys/fs/<nilfs>/<device>
+ * @ns_dev_kobj_unregister: completion state
+ * @ns_dev_subgroups: <device> subgroups pointer
  */
 struct the_nilfs {
 	unsigned long		ns_flags;
+	int			ns_flushed_device;
 
 	struct block_device    *ns_bdev;
 	struct rw_semaphore	ns_sem;
@@ -114,6 +121,7 @@ struct the_nilfs {
 	unsigned		ns_sbwcount;
 	unsigned		ns_sbsize;
 	unsigned		ns_mount_state;
+	unsigned		ns_sb_update_freq;
 
 	/*
 	 * Following fields are dedicated to a writable FS-instance.
@@ -188,6 +196,11 @@ struct the_nilfs {
 	int			ns_inode_size;
 	int			ns_first_ino;
 	u32			ns_crc_seed;
+
+	/* /sys/fs/<nilfs>/<device> */
+	struct kobject ns_dev_kobj;
+	struct completion ns_dev_kobj_unregister;
+	struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups;
 };
 
 #define THE_NILFS_FNS(bit, name) \
@@ -232,6 +245,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty)
 * @ifile: inode file
 * @inodes_count: number of inodes
 * @blocks_count: number of blocks
+ * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot>
+ * @snapshot_kobj_unregister: completion state for kernel object
 */
 struct nilfs_root {
 	__u64 cno;
@@ -243,6 +258,10 @@ struct nilfs_root {
 
 	atomic64_t inodes_count;
 	atomic64_t blocks_count;
+
+	/* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */
+	struct kobject snapshot_kobj;
+	struct completion snapshot_kobj_unregister;
 };
 
 /* Special checkpoint number */
@@ -254,7 +273,8 @@ struct nilfs_root {
 static inline int nilfs_sb_need_update(struct the_nilfs *nilfs)
 {
 	u64 t = get_seconds();
-	return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ;
+	return t < nilfs->ns_sbwtime ||
+		t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq;
 }
 
 static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
@@ -353,4 +373,24 @@ static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
 	return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
 }
 
+static inline int nilfs_flush_device(struct the_nilfs *nilfs)
+{
+	int err;
+
+	if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device)
+		return 0;
+
+	nilfs->ns_flushed_device = 1;
+	/*
+	 * the store to ns_flushed_device must not be reordered after
+	 * blkdev_issue_flush().
+	 */
+	smp_wmb();
+
+	err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL);
+	if (err != -EIO)
+		err = 0;
+	return err;
+}
+
 #endif /* _THE_NILFS_H */
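
[Note on nilfs_flush_device() above: it coalesces redundant FLUSH requests. The flag is set optimistically before the flush is issued, so a writer racing in between merely clears it again and re-arms the next flush; nothing is ever lost, at worst one extra flush happens. A portable C11 model of the same coalescing; names are illustrative, and the kernel uses plain stores plus smp_wmb() rather than these atomics:]

#include <stdatomic.h>
#include <stdio.h>

static atomic_int flushed_device;       /* ~ nilfs->ns_flushed_device */

static void segment_written(void)       /* ~ nilfs_segctor_complete_write() */
{
        atomic_store(&flushed_device, 0);  /* new data may sit in volatile cache */
}

static int issue_flush(void)            /* stand-in for blkdev_issue_flush() */
{
        puts("FLUSH");
        return 0;
}

static int flush_device(void)           /* ~ nilfs_flush_device() */
{
        if (atomic_load(&flushed_device))
                return 0;               /* nothing written since the last flush */

        /* set the flag *before* flushing: a writer that clears it while the
         * flush is in flight just forces one extra (harmless) flush later */
        atomic_store(&flushed_device, 1);
        return issue_flush();
}

int main(void)
{
        segment_written();
        flush_device();         /* issues FLUSH */
        flush_device();         /* coalesced away */
        segment_written();
        flush_device();         /* issues FLUSH again */
        return 0;
}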
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index abc8cbcfe90e..caaaf9dfe353 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -346,13 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 		goto out;
 	}
 
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	if (error) {
-		/* if we added, we must shoot */
-		if (dn_mark == new_dn_mark)
-			destroy = 1;
-		goto out;
-	}
+	__f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
 
 	error = attach_dn(dn, dn_mark, id, fd, filp, mask);
 	/* !error means that we attached the dn to the dn_mark, so don't free it */
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index ee9cb3795c2b..30d3addfad75 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -70,8 +70,15 @@ static int fanotify_get_response(struct fsnotify_group *group,
70 wait_event(group->fanotify_data.access_waitq, event->response || 70 wait_event(group->fanotify_data.access_waitq, event->response ||
71 atomic_read(&group->fanotify_data.bypass_perm)); 71 atomic_read(&group->fanotify_data.bypass_perm));
72 72
73 if (!event->response) /* bypass_perm set */ 73 if (!event->response) { /* bypass_perm set */
74 /*
75 * Event was canceled because group is being destroyed. Remove
76 * it from group's event list because we are responsible for
77 * freeing the permission event.
78 */
79 fsnotify_remove_event(group, &event->fae.fse);
74 return 0; 80 return 0;
81 }
75 82
76 /* userspace responded, convert to something usable */ 83 /* userspace responded, convert to something usable */
77 switch (event->response) { 84 switch (event->response) {
@@ -210,7 +217,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
210 return -ENOMEM; 217 return -ENOMEM;
211 218
212 fsn_event = &event->fse; 219 fsn_event = &event->fse;
213 ret = fsnotify_add_notify_event(group, fsn_event, fanotify_merge); 220 ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
214 if (ret) { 221 if (ret) {
215 /* Permission events shouldn't be merged */ 222 /* Permission events shouldn't be merged */
216 BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS); 223 BUG_ON(ret == 1 && mask & FAN_ALL_PERM_EVENTS);
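The wait in fanotify_get_response() blocks on two conditions, a userspace response or the group-wide bypass_perm flag, and the release path wakes all waiters after setting the flag. A sketch of the same two-condition wait in userspace pthreads; all names here are illustrative:

	#include <pthread.h>
	#include <stdbool.h>

	struct perm_event {
		pthread_mutex_t lock;
		pthread_cond_t cond;
		int response;			/* 0 = no answer from userspace yet */
		bool bypass;			/* group is being torn down */
	};

	int wait_for_response(struct perm_event *ev)
	{
		int resp;

		pthread_mutex_lock(&ev->lock);
		while (!ev->response && !ev->bypass)
			pthread_cond_wait(&ev->cond, &ev->lock);
		resp = ev->response;		/* 0 means: canceled by bypass */
		pthread_mutex_unlock(&ev->lock);
		return resp;
	}

	void bypass_all(struct perm_event *ev)
	{
		pthread_mutex_lock(&ev->lock);
		ev->bypass = true;		/* like atomic_inc(&bypass_perm) ... */
		pthread_cond_broadcast(&ev->cond);	/* ... followed by wake_up() */
		pthread_mutex_unlock(&ev->lock);
	}

A zero return is the "canceled" case that the new fsnotify_remove_event() call above cleans up after.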
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 3fdc8a3e1134..c991616acca9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -66,7 +66,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
66 66
67 /* held the notification_mutex the whole time, so this is the 67 /* held the notification_mutex the whole time, so this is the
68 * same event we peeked above */ 68 * same event we peeked above */
69 return fsnotify_remove_notify_event(group); 69 return fsnotify_remove_first_event(group);
70} 70}
71 71
72static int create_fd(struct fsnotify_group *group, 72static int create_fd(struct fsnotify_group *group,
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group,
78 78
79 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 79 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
80 80
81 client_fd = get_unused_fd(); 81 client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
82 if (client_fd < 0) 82 if (client_fd < 0)
83 return client_fd; 83 return client_fd;
84 84
@@ -359,6 +359,11 @@ static int fanotify_release(struct inode *ignored, struct file *file)
359#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS 359#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
360 struct fanotify_perm_event_info *event, *next; 360 struct fanotify_perm_event_info *event, *next;
361 361

362 /*
363 * There may still be new events arriving in the notification queue,
364 * but since userspace cannot use the fanotify fd anymore, no event can
365 * enter or leave access_list anymore.
366 */
362 spin_lock(&group->fanotify_data.access_lock); 367 spin_lock(&group->fanotify_data.access_lock);
363 368
364 atomic_inc(&group->fanotify_data.bypass_perm); 369 atomic_inc(&group->fanotify_data.bypass_perm);
@@ -373,6 +378,13 @@ static int fanotify_release(struct inode *ignored, struct file *file)
373 } 378 }
374 spin_unlock(&group->fanotify_data.access_lock); 379 spin_unlock(&group->fanotify_data.access_lock);
375 380
381 /*
382 * Since bypass_perm is set, newly queued events will not wait for
383 * access response. Wake up the already sleeping ones now.
384 * synchronize_srcu() in fsnotify_destroy_group() will wait for all
385 * processes sleeping in fanotify_handle_event() waiting for access
386 * response and thus also for all permission events to be freed.
387 */
376 wake_up(&group->fanotify_data.access_waitq); 388 wake_up(&group->fanotify_data.access_waitq);
377#endif 389#endif
378 390
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
42{ 42{
43 struct { 43 struct {
44 struct file_handle handle; 44 struct file_handle handle;
45 u8 pad[64]; 45 u8 pad[MAX_HANDLE_SZ];
46 } f; 46 } f;
47 int size, ret, i; 47 int size, ret, i;
48 48
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
50 size = f.handle.handle_bytes >> 2; 50 size = f.handle.handle_bytes >> 2;
51 51
52 ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); 52 ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
53 if ((ret == 255) || (ret == -ENOSPC)) { 53 if ((ret == FILEID_INVALID) || (ret < 0)) {
54 WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); 54 WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
55 return 0; 55 return 0;
56 } 56 }
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 85e7d2b431d9..9c0898c4cfe1 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
23 struct fsnotify_group *group, struct vfsmount *mnt, 23 struct fsnotify_group *group, struct vfsmount *mnt,
24 int allow_dups); 24 int allow_dups);
25 25
26/* final kfree of a group */
27extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
28
29/* vfsmount specific destruction of a mark */ 26/* vfsmount specific destruction of a mark */
30extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); 27extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark);
31/* inode specific destruction of a mark */ 28/* inode specific destruction of a mark */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index ad1995980456..d16b62cb2854 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -31,7 +31,7 @@
31/* 31/*
32 * Final freeing of a group 32 * Final freeing of a group
33 */ 33 */
34void fsnotify_final_destroy_group(struct fsnotify_group *group) 34static void fsnotify_final_destroy_group(struct fsnotify_group *group)
35{ 35{
36 if (group->ops->free_group_priv) 36 if (group->ops->free_group_priv)
37 group->ops->free_group_priv(group); 37 group->ops->free_group_priv(group);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 74825be65b7b..9ce062218de9 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
232 232
233 BUG_ON(last == NULL); 233 BUG_ON(last == NULL);
234 /* mark should be the last entry. last is the current last entry */ 234 /* mark should be the last entry. last is the current last entry */
235 hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); 235 hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list);
236out: 236out:
237 fsnotify_recalc_inode_mask_locked(inode); 237 fsnotify_recalc_inode_mask_locked(inode);
238 spin_unlock(&inode->i_lock); 238 spin_unlock(&inode->i_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 43ab1e1a07a2..7d888d77d59a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -108,7 +108,7 @@ int inotify_handle_event(struct fsnotify_group *group,
108 if (len) 108 if (len)
109 strcpy(event->name, file_name); 109 strcpy(event->name, file_name);
110 110
111 ret = fsnotify_add_notify_event(group, fsn_event, inotify_merge); 111 ret = fsnotify_add_event(group, fsn_event, inotify_merge);
112 if (ret) { 112 if (ret) {
113 /* Our event wasn't used in the end. Free it. */ 113 /* Our event wasn't used in the end. Free it. */
114 fsnotify_destroy_event(group, fsn_event); 114 fsnotify_destroy_event(group, fsn_event);
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
165 /* ideally the idr is empty and we won't hit the BUG in the callback */ 165 /* ideally the idr is empty and we won't hit the BUG in the callback */
166 idr_for_each(&group->inotify_data.idr, idr_callback, group); 166 idr_for_each(&group->inotify_data.idr, idr_callback, group);
167 idr_destroy(&group->inotify_data.idr); 167 idr_destroy(&group->inotify_data.idr);
168 atomic_dec(&group->inotify_data.user->inotify_devs); 168 if (group->inotify_data.user) {
169 free_uid(group->inotify_data.user); 169 atomic_dec(&group->inotify_data.user->inotify_devs);
170 free_uid(group->inotify_data.user);
171 }
170} 172}
171 173
172static void inotify_free_event(struct fsnotify_event *fsn_event) 174static void inotify_free_event(struct fsnotify_event *fsn_event)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cc423a30a0c8..daf76652fe58 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -149,7 +149,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
149 if (fsnotify_notify_queue_is_empty(group)) 149 if (fsnotify_notify_queue_is_empty(group))
150 return NULL; 150 return NULL;
151 151
152 event = fsnotify_peek_notify_event(group); 152 event = fsnotify_peek_first_event(group);
153 153
154 pr_debug("%s: group=%p event=%p\n", __func__, group, event); 154 pr_debug("%s: group=%p event=%p\n", __func__, group, event);
155 155
@@ -159,7 +159,7 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
159 159
160 /* held the notification_mutex the whole time, so this is the 160 /* held the notification_mutex the whole time, so this is the
161 * same event we peeked above */ 161 * same event we peeked above */
162 fsnotify_remove_notify_event(group); 162 fsnotify_remove_first_event(group);
163 163
164 return event; 164 return event;
165} 165}
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 1e58402171a5..a95d8e037aeb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -73,7 +73,8 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
73 /* Overflow events are per-group and we don't want to free them */ 73 /* Overflow events are per-group and we don't want to free them */
74 if (!event || event->mask == FS_Q_OVERFLOW) 74 if (!event || event->mask == FS_Q_OVERFLOW)
75 return; 75 return;
76 76 /* If the event is still queued, we have a problem... */
77 WARN_ON(!list_empty(&event->list));
77 group->ops->free_event(event); 78 group->ops->free_event(event);
78} 79}
79 80
@@ -83,10 +84,10 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
83 * added to the queue, 1 if the event was merged with some other queued event, 84 * added to the queue, 1 if the event was merged with some other queued event,
84 * 2 if the queue of events has overflown. 85 * 2 if the queue of events has overflown.
85 */ 86 */
86int fsnotify_add_notify_event(struct fsnotify_group *group, 87int fsnotify_add_event(struct fsnotify_group *group,
87 struct fsnotify_event *event, 88 struct fsnotify_event *event,
88 int (*merge)(struct list_head *, 89 int (*merge)(struct list_head *,
89 struct fsnotify_event *)) 90 struct fsnotify_event *))
90{ 91{
91 int ret = 0; 92 int ret = 0;
92 struct list_head *list = &group->notification_list; 93 struct list_head *list = &group->notification_list;
@@ -125,10 +126,25 @@ queue:
125} 126}
126 127
127/* 128/*
129 * Remove @event from group's notification queue. It is the responsibility of
130 * the caller to destroy the event.
131 */
132void fsnotify_remove_event(struct fsnotify_group *group,
133 struct fsnotify_event *event)
134{
135 mutex_lock(&group->notification_mutex);
136 if (!list_empty(&event->list)) {
137 list_del_init(&event->list);
138 group->q_len--;
139 }
140 mutex_unlock(&group->notification_mutex);
141}
142
143/*
128 * Remove and return the first event from the notification list. It is the 144 * Remove and return the first event from the notification list. It is the
129 * responsibility of the caller to destroy the obtained event 145 * responsibility of the caller to destroy the obtained event
130 */ 146 */
131struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group) 147struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
132{ 148{
133 struct fsnotify_event *event; 149 struct fsnotify_event *event;
134 150
@@ -140,7 +156,7 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
140 struct fsnotify_event, list); 156 struct fsnotify_event, list);
141 /* 157 /*
142 * We need to init list head for the case of overflow event so that 158 * We need to init list head for the case of overflow event so that
143 * check in fsnotify_add_notify_events() works 159 * check in fsnotify_add_event() works
144 */ 160 */
145 list_del_init(&event->list); 161 list_del_init(&event->list);
146 group->q_len--; 162 group->q_len--;
@@ -149,9 +165,10 @@ struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group
149} 165}
150 166
151/* 167/*
152 * This will not remove the event, that must be done with fsnotify_remove_notify_event() 168 * This will not remove the event, that must be done with
169 * fsnotify_remove_first_event()
153 */ 170 */
154struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group) 171struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
155{ 172{
156 BUG_ON(!mutex_is_locked(&group->notification_mutex)); 173 BUG_ON(!mutex_is_locked(&group->notification_mutex));
157 174
@@ -169,7 +186,7 @@ void fsnotify_flush_notify(struct fsnotify_group *group)
169 186
170 mutex_lock(&group->notification_mutex); 187 mutex_lock(&group->notification_mutex);
171 while (!fsnotify_notify_queue_is_empty(group)) { 188 while (!fsnotify_notify_queue_is_empty(group)) {
172 event = fsnotify_remove_notify_event(group); 189 event = fsnotify_remove_first_event(group);
173 fsnotify_destroy_event(group, event); 190 fsnotify_destroy_event(group, event);
174 } 191 }
175 mutex_unlock(&group->notification_mutex); 192 mutex_unlock(&group->notification_mutex);
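The new fsnotify_remove_event() is safe to call whether or not the event is still queued, because list_del_init() leaves the node self-linked, which is exactly what list_empty() tests. A standalone C sketch of that idempotent-unlink idea; the list here is hand-rolled for illustration, not the kernel's <linux/list.h>:

	struct node {
		struct node *prev, *next;
	};

	static int node_unlinked(const struct node *n)	/* list_empty() on the node */
	{
		return n->next == n;
	}

	static void unlink_init(struct node *n)		/* list_del_init() */
	{
		n->prev->next = n->next;
		n->next->prev = n->prev;
		n->next = n->prev = n;			/* self-linked: safe to retest */
	}

	void remove_event(struct node *n, int *q_len)	/* caller holds the queue lock */
	{
		if (!node_unlinked(n)) {
			unlink_init(n);
			(*q_len)--;			/* accounting happens only once */
		}
	}

A second remover sees the node already self-linked and skips both the unlink and the q_len accounting.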
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 68ca5a8704b5..ac851e8376b1 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark,
191 191
192 BUG_ON(last == NULL); 192 BUG_ON(last == NULL);
193 /* mark should be the last entry. last is the current last entry */ 193 /* mark should be the last entry. last is the current last entry */
194 hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); 194 hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list);
195out: 195out:
196 fsnotify_recalc_vfsmount_mask_locked(mnt); 196 fsnotify_recalc_vfsmount_mask_locked(mnt);
197 spin_unlock(&mnt->mnt_root->d_lock); 197 spin_unlock(&mnt->mnt_root->d_lock);
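Note that hlist_add_behind_rcu() is not just a rename of hlist_add_after_rcu(): the argument order is swapped, with the node being inserted now coming first. The two hunks above are equivalent to:

	hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list);		/* old API: (prev, new) */
	hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list);		/* new API: (new, prev) */

That is, "add n behind prev" rather than "after prev, add n".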
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 30206b238433..36ae529511c4 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
8 8
9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o 9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
10 10
11ccflags-y := -DNTFS_VERSION=\"2.1.30\" 11ccflags-y := -DNTFS_VERSION=\"2.1.31\"
12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG 12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW 13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
14 14
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index d267ea6aa1a0..7521e11db728 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1,8 +1,7 @@
1/** 1/**
2 * aops.c - NTFS kernel address space operations and page cache handling. 2 * aops.c - NTFS kernel address space operations and page cache handling.
3 * Part of the Linux-NTFS project.
4 * 3 *
5 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
6 * Copyright (c) 2002 Richard Russon 5 * Copyright (c) 2002 Richard Russon
7 * 6 *
8 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -1539,16 +1538,157 @@ err_out:
1539#endif /* NTFS_RW */ 1538#endif /* NTFS_RW */
1540 1539
1541/** 1540/**
1542 * ntfs_aops - general address space operations for inodes and attributes 1541 * ntfs_bmap - map logical file block to physical device block
1542 * @mapping: address space mapping to which the block to be mapped belongs
1543 * @block: logical block to map to its physical device block
1544 *
1545 * For regular, non-resident files (i.e. not compressed and not encrypted), map
1546 * the logical @block belonging to the file described by the address space
1547 * mapping @mapping to its physical device block.
1548 *
1549 * The size of the block is equal to the @s_blocksize field of the super block
1550 * of the mounted file system, which is guaranteed to be smaller than or
1551 * equal to the cluster size; the block is thus guaranteed to fit entirely
1552 * inside one cluster, so we do not need to care how many contiguous bytes are
1553 * available after the beginning of the block.
1554 *
1555 * Return the physical device block if the mapping succeeded or 0 if the block
1556 * is sparse or there was an error.
1557 *
1558 * Note: This is a problem if someone tries to run bmap() on the $Boot system
1559 * file, as that really is in block zero, but there is nothing we can do. bmap() is
1560 * just broken in that respect (just like it cannot distinguish sparse from
1561 * not available or error).
1543 */ 1562 */
1544const struct address_space_operations ntfs_aops = { 1563static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
1545 .readpage = ntfs_readpage, /* Fill page with data. */ 1564{
1565 s64 ofs, size;
1566 loff_t i_size;
1567 LCN lcn;
1568 unsigned long blocksize, flags;
1569 ntfs_inode *ni = NTFS_I(mapping->host);
1570 ntfs_volume *vol = ni->vol;
1571 unsigned delta;
1572 unsigned char blocksize_bits, cluster_size_shift;
1573
1574 ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
1575 ni->mft_no, (unsigned long long)block);
1576 if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
1577 ntfs_error(vol->sb, "BMAP does not make sense for %s "
1578 "attributes, returning 0.",
1579 (ni->type != AT_DATA) ? "non-data" :
1580 (!NInoNonResident(ni) ? "resident" :
1581 "encrypted"));
1582 return 0;
1583 }
1584 /* None of these can happen. */
1585 BUG_ON(NInoCompressed(ni));
1586 BUG_ON(NInoMstProtected(ni));
1587 blocksize = vol->sb->s_blocksize;
1588 blocksize_bits = vol->sb->s_blocksize_bits;
1589 ofs = (s64)block << blocksize_bits;
1590 read_lock_irqsave(&ni->size_lock, flags);
1591 size = ni->initialized_size;
1592 i_size = i_size_read(VFS_I(ni));
1593 read_unlock_irqrestore(&ni->size_lock, flags);
1594 /*
1595 * If the offset is outside the initialized size or the block straddles
1596 * the initialized size then pretend it is a hole unless the
1597 * initialized size equals the file size.
1598 */
1599 if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
1600 goto hole;
1601 cluster_size_shift = vol->cluster_size_bits;
1602 down_read(&ni->runlist.lock);
1603 lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
1604 up_read(&ni->runlist.lock);
1605 if (unlikely(lcn < LCN_HOLE)) {
1606 /*
1607 * Step down to an integer to avoid gcc doing a long long
1608 * comparison in the switch when we know @lcn is between
1609 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
1610 *
1611 * Otherwise older gcc (at least on some architectures) will
1612 * try to use __cmpdi2() which is of course not available in
1613 * the kernel.
1614 */
1615 switch ((int)lcn) {
1616 case LCN_ENOENT:
1617 /*
1618 * If the offset is out of bounds then pretend it is a
1619 * hole.
1620 */
1621 goto hole;
1622 case LCN_ENOMEM:
1623 ntfs_error(vol->sb, "Not enough memory to complete "
1624 "mapping for inode 0x%lx. "
1625 "Returning 0.", ni->mft_no);
1626 break;
1627 default:
1628 ntfs_error(vol->sb, "Failed to complete mapping for "
1629 "inode 0x%lx. Run chkdsk. "
1630 "Returning 0.", ni->mft_no);
1631 break;
1632 }
1633 return 0;
1634 }
1635 if (lcn < 0) {
1636 /* It is a hole. */
1637hole:
1638 ntfs_debug("Done (returning hole).");
1639 return 0;
1640 }
1641 /*
1642 * The block is really allocated and fulfills all our criteria.
1643 * Convert the cluster to units of block size and return the result.
1644 */
1645 delta = ofs & vol->cluster_size_mask;
1646 if (unlikely(sizeof(block) < sizeof(lcn))) {
1647 block = lcn = ((lcn << cluster_size_shift) + delta) >>
1648 blocksize_bits;
1649 /* If the block number was truncated return 0. */
1650 if (unlikely(block != lcn)) {
1651 ntfs_error(vol->sb, "Physical block 0x%llx is too "
1652 "large to be returned, returning 0.",
1653 (long long)lcn);
1654 return 0;
1655 }
1656 } else
1657 block = ((lcn << cluster_size_shift) + delta) >>
1658 blocksize_bits;
1659 ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
1660 return block;
1661}
1662
1663/**
1664 * ntfs_normal_aops - address space operations for normal inodes and attributes
1665 *
1666 * Note these are not used for compressed or mst protected inodes and
1667 * attributes.
1668 */
1669const struct address_space_operations ntfs_normal_aops = {
1670 .readpage = ntfs_readpage,
1546#ifdef NTFS_RW 1671#ifdef NTFS_RW
1547 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1672 .writepage = ntfs_writepage,
1673 .set_page_dirty = __set_page_dirty_buffers,
1674#endif /* NTFS_RW */
1675 .bmap = ntfs_bmap,
1676 .migratepage = buffer_migrate_page,
1677 .is_partially_uptodate = block_is_partially_uptodate,
1678 .error_remove_page = generic_error_remove_page,
1679};
1680
1681/**
1682 * ntfs_compressed_aops - address space operations for compressed inodes
1683 */
1684const struct address_space_operations ntfs_compressed_aops = {
1685 .readpage = ntfs_readpage,
1686#ifdef NTFS_RW
1687 .writepage = ntfs_writepage,
1688 .set_page_dirty = __set_page_dirty_buffers,
1548#endif /* NTFS_RW */ 1689#endif /* NTFS_RW */
1549 .migratepage = buffer_migrate_page, /* Move a page cache page from 1690 .migratepage = buffer_migrate_page,
1550 one physical page to an 1691 .is_partially_uptodate = block_is_partially_uptodate,
1551 other. */
1552 .error_remove_page = generic_error_remove_page, 1692 .error_remove_page = generic_error_remove_page,
1553}; 1693};
1554 1694
@@ -1564,9 +1704,8 @@ const struct address_space_operations ntfs_mst_aops = {
1564 without touching the buffers 1704 without touching the buffers
1565 belonging to the page. */ 1705 belonging to the page. */
1566#endif /* NTFS_RW */ 1706#endif /* NTFS_RW */
1567 .migratepage = buffer_migrate_page, /* Move a page cache page from 1707 .migratepage = buffer_migrate_page,
1568 one physical page to an 1708 .is_partially_uptodate = block_is_partially_uptodate,
1569 other. */
1570 .error_remove_page = generic_error_remove_page, 1709 .error_remove_page = generic_error_remove_page,
1571}; 1710};
1572 1711
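The conversion at the heart of ntfs_bmap() above is plain shift arithmetic: scale the LCN up to a byte address, add the byte offset within the cluster, then scale down to device blocks. A standalone sketch with made-up geometry (4 KiB clusters, i.e. shift 12, and 512-byte device blocks, shift 9):

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t lcn_to_block(int64_t lcn, uint64_t ofs,
				     unsigned cluster_shift, unsigned block_shift)
	{
		uint64_t cluster_size_mask = (1ULL << cluster_shift) - 1;
		uint64_t delta = ofs & cluster_size_mask;	/* byte offset inside the cluster */

		/* cluster number -> bytes, add intra-cluster offset, bytes -> blocks */
		return (((uint64_t)lcn << cluster_shift) + delta) >> block_shift;
	}

	int main(void)
	{
		/*
		 * File byte offset 0x3200 falls in the file's cluster 3; assume that
		 * cluster maps to LCN 100. delta = 0x200, so the physical block is
		 * (100 * 4096 + 512) / 512 = 801.
		 */
		printf("%llu\n", (unsigned long long)lcn_to_block(100, 0x3200, 12, 9));
		return 0;
	}

The truncation check in the real function only matters when sector_t is narrower than the 64-bit LCN, which the sizeof() comparison detects at compile time.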
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
index dd6103cc93c1..825a54e8f490 100644
--- a/fs/ntfs/debug.c
+++ b/fs/ntfs/debug.c
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb,
112/* If 1, output debug messages, and if 0, don't. */ 112/* If 1, output debug messages, and if 0, don't. */
113int debug_msgs = 0; 113int debug_msgs = 0;
114 114
115void __ntfs_debug (const char *file, int line, const char *function, 115void __ntfs_debug(const char *file, int line, const char *function,
116 const char *fmt, ...) 116 const char *fmt, ...)
117{ 117{
118 struct va_format vaf; 118 struct va_format vaf;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 5c9e2c81cb11..643faa44f22b 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. 2 * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
3 * 3 *
4 * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. 4 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
74 * ntfs_attr_extend_initialized - extend the initialized size of an attribute 74 * ntfs_attr_extend_initialized - extend the initialized size of an attribute
75 * @ni: ntfs inode of the attribute to extend 75 * @ni: ntfs inode of the attribute to extend
76 * @new_init_size: requested new initialized size in bytes 76 * @new_init_size: requested new initialized size in bytes
77 * @cached_page: store any allocated but unused page here
78 * @lru_pvec: lru-buffering pagevec of the caller
79 * 77 *
80 * Extend the initialized size of an attribute described by the ntfs inode @ni 78 * Extend the initialized size of an attribute described by the ntfs inode @ni
81 * to @new_init_size bytes. This involves zeroing any non-sparse space between 79 * to @new_init_size bytes. This involves zeroing any non-sparse space between
@@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
395 * @nr_pages: number of page cache pages to obtain 393 * @nr_pages: number of page cache pages to obtain
396 * @pages: array of pages in which to return the obtained page cache pages 394 * @pages: array of pages in which to return the obtained page cache pages
397 * @cached_page: allocated but as yet unused page 395 * @cached_page: allocated but as yet unused page
398 * @lru_pvec: lru-buffering pagevec of caller
399 * 396 *
400 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 397 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
401 * starting at index @index. 398 * starting at index @index.
@@ -413,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
413 BUG_ON(!nr_pages); 410 BUG_ON(!nr_pages);
414 err = nr = 0; 411 err = nr = 0;
415 do { 412 do {
416 pages[nr] = find_lock_page(mapping, index); 413 pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
414 FGP_ACCESSED);
417 if (!pages[nr]) { 415 if (!pages[nr]) {
418 if (!*cached_page) { 416 if (!*cached_page) {
419 *cached_page = page_cache_alloc(mapping); 417 *cached_page = page_cache_alloc(mapping);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f47af5e6e230..898b9949d363 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1,7 +1,7 @@
1/** 1/**
2 * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project. 2 * inode.c - NTFS kernel inode handling.
3 * 3 *
4 * Copyright (c) 2001-2007 Anton Altaparmakov 4 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
5 * 5 *
6 * This program/include file is free software; you can redistribute it and/or 6 * This program/include file is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as published 7 * modify it under the terms of the GNU General Public License as published
@@ -1012,6 +1012,7 @@ skip_large_dir_stuff:
1012 /* Setup the operations for this inode. */ 1012 /* Setup the operations for this inode. */
1013 vi->i_op = &ntfs_dir_inode_ops; 1013 vi->i_op = &ntfs_dir_inode_ops;
1014 vi->i_fop = &ntfs_dir_ops; 1014 vi->i_fop = &ntfs_dir_ops;
1015 vi->i_mapping->a_ops = &ntfs_mst_aops;
1015 } else { 1016 } else {
1016 /* It is a file. */ 1017 /* It is a file. */
1017 ntfs_attr_reinit_search_ctx(ctx); 1018 ntfs_attr_reinit_search_ctx(ctx);
@@ -1160,11 +1161,12 @@ no_data_attr_special_case:
1160 /* Setup the operations for this inode. */ 1161 /* Setup the operations for this inode. */
1161 vi->i_op = &ntfs_file_inode_ops; 1162 vi->i_op = &ntfs_file_inode_ops;
1162 vi->i_fop = &ntfs_file_ops; 1163 vi->i_fop = &ntfs_file_ops;
1164 vi->i_mapping->a_ops = &ntfs_normal_aops;
1165 if (NInoMstProtected(ni))
1166 vi->i_mapping->a_ops = &ntfs_mst_aops;
1167 else if (NInoCompressed(ni))
1168 vi->i_mapping->a_ops = &ntfs_compressed_aops;
1163 } 1169 }
1164 if (NInoMstProtected(ni))
1165 vi->i_mapping->a_ops = &ntfs_mst_aops;
1166 else
1167 vi->i_mapping->a_ops = &ntfs_aops;
1168 /* 1170 /*
1169 * The number of 512-byte blocks used on disk (for stat). This is in so 1171 * The number of 512-byte blocks used on disk (for stat). This is in so
1170 * far inaccurate as it doesn't account for any named streams or other 1172 * far inaccurate as it doesn't account for any named streams or other
@@ -1414,10 +1416,11 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
1414 ni->allocated_size = sle64_to_cpu( 1416 ni->allocated_size = sle64_to_cpu(
1415 a->data.non_resident.allocated_size); 1417 a->data.non_resident.allocated_size);
1416 } 1418 }
1419 vi->i_mapping->a_ops = &ntfs_normal_aops;
1417 if (NInoMstProtected(ni)) 1420 if (NInoMstProtected(ni))
1418 vi->i_mapping->a_ops = &ntfs_mst_aops; 1421 vi->i_mapping->a_ops = &ntfs_mst_aops;
1419 else 1422 else if (NInoCompressed(ni))
1420 vi->i_mapping->a_ops = &ntfs_aops; 1423 vi->i_mapping->a_ops = &ntfs_compressed_aops;
1421 if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) 1424 if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
1422 vi->i_blocks = ni->itype.compressed.size >> 9; 1425 vi->i_blocks = ni->itype.compressed.size >> 9;
1423 else 1426 else
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
index d6a340bf80fc..c581e26a350d 100644
--- a/fs/ntfs/ntfs.h
+++ b/fs/ntfs/ntfs.h
@@ -1,8 +1,7 @@
1/* 1/*
2 * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS 2 * ntfs.h - Defines for NTFS Linux kernel driver.
3 * project.
4 * 3 *
5 * Copyright (c) 2001-2005 Anton Altaparmakov 4 * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
6 * Copyright (C) 2002 Richard Russon 5 * Copyright (C) 2002 Richard Russon
7 * 6 *
8 * This program/include file is free software; you can redistribute it and/or 7 * This program/include file is free software; you can redistribute it and/or
@@ -57,7 +56,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache;
57extern struct kmem_cache *ntfs_index_ctx_cache; 56extern struct kmem_cache *ntfs_index_ctx_cache;
58 57
59/* The various operations structs defined throughout the driver files. */ 58/* The various operations structs defined throughout the driver files. */
60extern const struct address_space_operations ntfs_aops; 59extern const struct address_space_operations ntfs_normal_aops;
60extern const struct address_space_operations ntfs_compressed_aops;
61extern const struct address_space_operations ntfs_mst_aops; 61extern const struct address_space_operations ntfs_mst_aops;
62 62
63extern const struct file_operations ntfs_file_ops; 63extern const struct file_operations ntfs_file_ops;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6c3296e546c3..9e1e112074fb 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void)
3208} 3208}
3209 3209
3210MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); 3210MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
3211MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); 3211MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
3212MODULE_VERSION(NTFS_VERSION); 3212MODULE_VERSION(NTFS_VERSION);
3213MODULE_LICENSE("GPL"); 3213MODULE_LICENSE("GPL");
3214#ifdef DEBUG 3214#ifdef DEBUG
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 9d8fcf2f3b94..a93bf9892256 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4961,6 +4961,15 @@ leftright:
4961 4961
4962 el = path_leaf_el(path); 4962 el = path_leaf_el(path);
4963 split_index = ocfs2_search_extent_list(el, cpos); 4963 split_index = ocfs2_search_extent_list(el, cpos);
4964 if (split_index == -1) {
4965 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
4966 "Owner %llu has an extent at cpos %u "
4967 "which can no longer be found.\n",
4968 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4969 cpos);
4970 ret = -EROFS;
4971 goto out;
4972 }
4964 goto leftright; 4973 goto leftright;
4965 } 4974 }
4966out: 4975out:
@@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle,
5135 el = path_leaf_el(left_path); 5144 el = path_leaf_el(left_path);
5136 5145
5137 index = ocfs2_search_extent_list(el, cpos); 5146 index = ocfs2_search_extent_list(el, cpos);
5138 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5147 if (index == -1) {
5139 ocfs2_error(sb, 5148 ocfs2_error(sb,
5140 "Owner %llu has an extent at cpos %u which can no " 5149 "Owner %llu has an extent at cpos %u which can no "
5141 "longer be found.\n", 5150 "longer be found.\n",
@@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle,
5491 5500
5492 el = path_leaf_el(path); 5501 el = path_leaf_el(path);
5493 index = ocfs2_search_extent_list(el, cpos); 5502 index = ocfs2_search_extent_list(el, cpos);
5494 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5503 if (index == -1) {
5495 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5504 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5496 "Owner %llu has an extent at cpos %u which can no " 5505 "Owner %llu has an extent at cpos %u which can no "
5497 "longer be found.\n", 5506 "longer be found.\n",
@@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle,
5557 5566
5558 el = path_leaf_el(path); 5567 el = path_leaf_el(path);
5559 index = ocfs2_search_extent_list(el, cpos); 5568 index = ocfs2_search_extent_list(el, cpos);
5560 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5569 if (index == -1) {
5561 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), 5570 ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5562 "Owner %llu: split at cpos %u lost record.", 5571 "Owner %llu: split at cpos %u lost record.",
5563 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5572 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
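All three alloc.c hunks above rely on ocfs2_search_extent_list() itself rejecting an index at or past l_next_free_rec, so that -1 becomes the single "no usable record" answer and every caller checks one condition. A self-contained sketch of that calling convention; the struct layout and names are illustrative, not the on-disk ocfs2 format:

	struct extent_list {
		int next_free;		/* number of valid records */
		unsigned cpos[16];	/* start cluster of each record */
		unsigned clusters[16];	/* length of each record */
	};

	int search_extent_list(const struct extent_list *el, unsigned cpos)
	{
		int i;

		for (i = 0; i < el->next_free; i++) {
			if (cpos >= el->cpos[i] &&
			    cpos < el->cpos[i] + el->clusters[i])
				return i;
		}
		return -1;	/* not found and out of range collapse to one failure mode */
	}

Folding the bounds test into the helper removes the easy-to-forget `index >= le16_to_cpu(el->l_next_free_rec)` half of the old check.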
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 4a231a166cf8..1ef547e49373 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1481 handle_t *handle; 1481 handle_t *handle;
1482 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1482 struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1483 1483
1484 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1485 if (IS_ERR(handle)) {
1486 ret = PTR_ERR(handle);
1487 mlog_errno(ret);
1488 goto out;
1489 }
1490
1484 page = find_or_create_page(mapping, 0, GFP_NOFS); 1491 page = find_or_create_page(mapping, 0, GFP_NOFS);
1485 if (!page) { 1492 if (!page) {
1493 ocfs2_commit_trans(osb, handle);
1486 ret = -ENOMEM; 1494 ret = -ENOMEM;
1487 mlog_errno(ret); 1495 mlog_errno(ret);
1488 goto out; 1496 goto out;
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
1494 wc->w_pages[0] = wc->w_target_page = page; 1502 wc->w_pages[0] = wc->w_target_page = page;
1495 wc->w_num_pages = 1; 1503 wc->w_num_pages = 1;
1496 1504
1497 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1498 if (IS_ERR(handle)) {
1499 ret = PTR_ERR(handle);
1500 mlog_errno(ret);
1501 goto out;
1502 }
1503
1504 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, 1505 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh,
1505 OCFS2_JOURNAL_ACCESS_WRITE); 1506 OCFS2_JOURNAL_ACCESS_WRITE);
1506 if (ret) { 1507 if (ret) {
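The reordering in ocfs2_write_begin_inline() above follows a general rule: acquire the resource whose failure is cheapest to unwind last. Starting the transaction before taking the page means a transaction failure has nothing to undo, and a page failure only has to commit the transaction. A runnable sketch with stub allocators; every name here is illustrative:

	#include <stdlib.h>

	typedef struct { int unused; } handle_t;	/* stub journal handle */
	typedef struct { int unused; } page_t;		/* stub page */

	static handle_t *start_trans(void) { return malloc(sizeof(handle_t)); }
	static void commit_trans(handle_t *h) { free(h); }
	static page_t *grab_page(void) { return malloc(sizeof(page_t)); }

	int write_begin(handle_t **handle, page_t **page)
	{
		*handle = start_trans();	/* journal transaction first ... */
		if (!*handle)
			return -1;		/* nothing acquired yet, nothing to unwind */

		*page = grab_page();
		if (!*page) {
			commit_trans(*handle);	/* ... so only it needs undoing here */
			*handle = NULL;
			return -1;
		}
		return 0;			/* both held, acquired in a fixed order */
	}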
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 73039295d0d1..eb9d48746ab4 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -2244,7 +2244,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group,
2244 return -EINVAL; 2244 return -EINVAL;
2245 2245
2246 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { 2246 for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) {
2247 if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) 2247 if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len))
2248 continue; 2248 continue;
2249 2249
2250 ret = o2hb_global_heartbeat_mode_set(i); 2250 ret = o2hb_global_heartbeat_mode_set(i);
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num)
2572} 2572}
2573EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); 2573EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating);
2574 2574
2575int o2hb_check_node_heartbeating_no_sem(u8 node_num)
2576{
2577 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
2578 unsigned long flags;
2579
2580 spin_lock_irqsave(&o2hb_live_lock, flags);
2581 o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map));
2582 spin_unlock_irqrestore(&o2hb_live_lock, flags);
2583 if (!test_bit(node_num, testing_map)) {
2584 mlog(ML_HEARTBEAT,
2585 "node (%u) does not have heartbeating enabled.\n",
2586 node_num);
2587 return 0;
2588 }
2589
2590 return 1;
2591}
2592EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem);
2593
2575int o2hb_check_node_heartbeating_from_callback(u8 node_num) 2594int o2hb_check_node_heartbeating_from_callback(u8 node_num)
2576{ 2595{
2577 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 2596 unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h
index 00ad8e8fea51..3ef5137dc362 100644
--- a/fs/ocfs2/cluster/heartbeat.h
+++ b/fs/ocfs2/cluster/heartbeat.h
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map,
80void o2hb_exit(void); 80void o2hb_exit(void);
81int o2hb_init(void); 81int o2hb_init(void);
82int o2hb_check_node_heartbeating(u8 node_num); 82int o2hb_check_node_heartbeating(u8 node_num);
83int o2hb_check_node_heartbeating_no_sem(u8 node_num);
83int o2hb_check_node_heartbeating_from_callback(u8 node_num); 84int o2hb_check_node_heartbeating_from_callback(u8 node_num);
84int o2hb_check_local_node_heartbeating(void); 85int o2hb_check_local_node_heartbeating(void);
85void o2hb_stop_all_regions(void); 86void o2hb_stop_all_regions(void);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 07ac24fd9252..af7598bff1b5 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -49,13 +49,13 @@ static ssize_t mlog_mask_show(u64 mask, char *buf)
49 49
50static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) 50static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count)
51{ 51{
52 if (!strnicmp(buf, "allow", 5)) { 52 if (!strncasecmp(buf, "allow", 5)) {
53 __mlog_set_u64(mask, mlog_and_bits); 53 __mlog_set_u64(mask, mlog_and_bits);
54 __mlog_clear_u64(mask, mlog_not_bits); 54 __mlog_clear_u64(mask, mlog_not_bits);
55 } else if (!strnicmp(buf, "deny", 4)) { 55 } else if (!strncasecmp(buf, "deny", 4)) {
56 __mlog_set_u64(mask, mlog_not_bits); 56 __mlog_set_u64(mask, mlog_not_bits);
57 __mlog_clear_u64(mask, mlog_and_bits); 57 __mlog_clear_u64(mask, mlog_and_bits);
58 } else if (!strnicmp(buf, "off", 3)) { 58 } else if (!strncasecmp(buf, "off", 3)) {
59 __mlog_clear_u64(mask, mlog_not_bits); 59 __mlog_clear_u64(mask, mlog_not_bits);
60 __mlog_clear_u64(mask, mlog_and_bits); 60 __mlog_clear_u64(mask, mlog_and_bits);
61 } else 61 } else
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 73ba81928bce..27d1242c8383 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = {
185static int nst_fop_open(struct inode *inode, struct file *file) 185static int nst_fop_open(struct inode *inode, struct file *file)
186{ 186{
187 struct o2net_send_tracking *dummy_nst; 187 struct o2net_send_tracking *dummy_nst;
188 struct seq_file *seq;
189 int ret;
190 188
191 dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); 189 dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst));
192 if (dummy_nst == NULL) { 190 if (!dummy_nst)
193 ret = -ENOMEM; 191 return -ENOMEM;
194 goto out;
195 }
196 dummy_nst->st_task = NULL;
197
198 ret = seq_open(file, &nst_seq_ops);
199 if (ret)
200 goto out;
201
202 seq = file->private_data;
203 seq->private = dummy_nst;
204 o2net_debug_add_nst(dummy_nst); 192 o2net_debug_add_nst(dummy_nst);
205 193
206 dummy_nst = NULL; 194 return 0;
207
208out:
209 kfree(dummy_nst);
210 return ret;
211} 195}
212 196
213static int nst_fop_release(struct inode *inode, struct file *file) 197static int nst_fop_release(struct inode *inode, struct file *file)
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = {
412 .show = sc_seq_show, 396 .show = sc_seq_show,
413}; 397};
414 398
415static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) 399static int sc_common_open(struct file *file, int ctxt)
416{ 400{
401 struct o2net_sock_debug *sd;
417 struct o2net_sock_container *dummy_sc; 402 struct o2net_sock_container *dummy_sc;
418 struct seq_file *seq;
419 int ret;
420 403
421 dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); 404 dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL);
422 if (dummy_sc == NULL) { 405 if (!dummy_sc)
423 ret = -ENOMEM; 406 return -ENOMEM;
424 goto out;
425 }
426 dummy_sc->sc_page = NULL;
427 407
428 ret = seq_open(file, &sc_seq_ops); 408 sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd));
429 if (ret) 409 if (!sd) {
430 goto out; 410 kfree(dummy_sc);
411 return -ENOMEM;
412 }
431 413
432 seq = file->private_data; 414 sd->dbg_ctxt = ctxt;
433 seq->private = sd;
434 sd->dbg_sock = dummy_sc; 415 sd->dbg_sock = dummy_sc;
435 o2net_debug_add_sc(dummy_sc);
436 416
437 dummy_sc = NULL; 417 o2net_debug_add_sc(dummy_sc);
438 418
439out: 419 return 0;
440 kfree(dummy_sc);
441 return ret;
442} 420}
443 421
444static int sc_fop_release(struct inode *inode, struct file *file) 422static int sc_fop_release(struct inode *inode, struct file *file)
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
453 431
454static int stats_fop_open(struct inode *inode, struct file *file) 432static int stats_fop_open(struct inode *inode, struct file *file)
455{ 433{
456 struct o2net_sock_debug *sd; 434 return sc_common_open(file, SHOW_SOCK_STATS);
457
458 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
459 if (sd == NULL)
460 return -ENOMEM;
461
462 sd->dbg_ctxt = SHOW_SOCK_STATS;
463 sd->dbg_sock = NULL;
464
465 return sc_common_open(file, sd);
466} 435}
467 436
468static const struct file_operations stats_seq_fops = { 437static const struct file_operations stats_seq_fops = {
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = {
474 443
475static int sc_fop_open(struct inode *inode, struct file *file) 444static int sc_fop_open(struct inode *inode, struct file *file)
476{ 445{
477 struct o2net_sock_debug *sd; 446 return sc_common_open(file, SHOW_SOCK_CONTAINERS);
478
479 sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL);
480 if (sd == NULL)
481 return -ENOMEM;
482
483 sd->dbg_ctxt = SHOW_SOCK_CONTAINERS;
484 sd->dbg_sock = NULL;
485
486 return sc_common_open(file, sd);
487} 447}
488 448
489static const struct file_operations sc_seq_fops = { 449static const struct file_operations sc_seq_fops = {
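Both converted open handlers lean on __seq_open_private(), which allocates a zeroed private area, calls seq_open(), and stores the area in seq_file->private in one step. A kernel-style sketch of the resulting shape; my_private, my_seq_ops, and the ctxt field are illustrative, and this is not buildable outside the kernel:

	static int example_fop_open(struct inode *inode, struct file *file)
	{
		struct my_private *priv;

		priv = __seq_open_private(file, &my_seq_ops, sizeof(*priv));
		if (!priv)
			return -ENOMEM;

		priv->ctxt = inode->i_private;	/* any extra setup goes here */
		return 0;
	}

The matching release side must free the private area, typically via seq_release_private(), otherwise the combined allocation leaks.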
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 1ec141e758d7..62e8ec619b4c 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work)
160 } 160 }
161 161
162out: 162out:
163 spin_unlock(&qs->qs_lock); 163 if (fence) {
164 if (fence) 164 spin_unlock(&qs->qs_lock);
165 o2quo_fence_self(); 165 o2quo_fence_self();
166 } else {
167 mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
168 "connected: %d, lowest: %d (%sreachable)\n",
169 qs->qs_heartbeating, qs->qs_connected, lowest_hb,
170 lowest_reachable ? "" : "un");
171 spin_unlock(&qs->qs_lock);
172
173 }
174
166} 175}
167 176
168static void o2quo_set_hold(struct o2quo_state *qs, u8 node) 177static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 681691bc233a..97de0fbd9f78 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
536 if (nn->nn_persistent_error || nn->nn_sc_valid) 536 if (nn->nn_persistent_error || nn->nn_sc_valid)
537 wake_up(&nn->nn_sc_wq); 537 wake_up(&nn->nn_sc_wq);
538 538
539 if (!was_err && nn->nn_persistent_error) { 539 if (was_valid && !was_err && nn->nn_persistent_error) {
540 o2quo_conn_err(o2net_num_from_nn(nn)); 540 o2quo_conn_err(o2net_num_from_nn(nn));
541 queue_delayed_work(o2net_wq, &nn->nn_still_up, 541 queue_delayed_work(o2net_wq, &nn->nn_still_up,
542 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); 542 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
@@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock)
1480 return ret; 1480 return ret;
1481} 1481}
1482 1482
1483static int o2net_set_usertimeout(struct socket *sock)
1484{
1485 int user_timeout = O2NET_TCP_USER_TIMEOUT;
1486
1487 return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT,
1488 (char *)&user_timeout, sizeof(user_timeout));
1489}
1490
1483static void o2net_initialize_handshake(void) 1491static void o2net_initialize_handshake(void)
1484{ 1492{
1485 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1493 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
@@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data)
1536#endif 1544#endif
1537 1545
1538 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " 1546 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
1539 "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), 1547 "idle for %lu.%lu secs.\n",
1540 msecs / 1000, msecs % 1000); 1548 SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000);
1541 1549
1542 /* 1550 /* idle timeout happened; don't shut down the connection, but
1543 * Initialize the nn_timeout so that the next connection attempt 1551 * make the fence decision. Maybe the connection can recover before
1544 * will continue in o2net_start_connect. 1552 * the decision is made.
1545 */ 1553 */
1546 atomic_set(&nn->nn_timeout, 1); 1554 atomic_set(&nn->nn_timeout, 1);
1555 o2quo_conn_err(o2net_num_from_nn(nn));
1556 queue_delayed_work(o2net_wq, &nn->nn_still_up,
1557 msecs_to_jiffies(O2NET_QUORUM_DELAY_MS));
1558
1559 o2net_sc_reset_idle_timer(sc);
1547 1560
1548 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1549} 1561}
1550 1562
1551static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) 1563static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
@@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1560 1572
1561static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) 1573static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
1562{ 1574{
1575 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1576
1577 /* clear the fence decision since the connection recovered from timeout */
1578 if (atomic_read(&nn->nn_timeout)) {
1579 o2quo_conn_up(o2net_num_from_nn(nn));
1580 cancel_delayed_work(&nn->nn_still_up);
1581 atomic_set(&nn->nn_timeout, 0);
1582 }
1583
1563 /* Only push out an existing timer */ 1584 /* Only push out an existing timer */
1564 if (timer_pending(&sc->sc_idle_timeout)) 1585 if (timer_pending(&sc->sc_idle_timeout))
1565 o2net_sc_reset_idle_timer(sc); 1586 o2net_sc_reset_idle_timer(sc);
@@ -1580,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work)
1580 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1601 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1581 int ret = 0, stop; 1602 int ret = 0, stop;
1582 unsigned int timeout; 1603 unsigned int timeout;
1604 unsigned int noio_flag;
1583 1605
1606 /*
1607 * sock_create allocates the sock with GFP_KERNEL. We must set
1608 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
1609 * by this process are done as if GFP_NOIO was specified. So we
1610 * are not reentering filesystem while doing memory reclaim.
1611 */
1612 noio_flag = memalloc_noio_save();
1584 /* if we're greater we initiate tx, otherwise we accept */ 1613 /* if we're greater we initiate tx, otherwise we accept */
1585 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1614 if (o2nm_this_node() <= o2net_num_from_nn(nn))
1586 goto out; 1615 goto out;
@@ -1650,6 +1679,12 @@ static void o2net_start_connect(struct work_struct *work)
1650 goto out; 1679 goto out;
1651 } 1680 }
1652 1681
1682 ret = o2net_set_usertimeout(sock);
1683 if (ret) {
1684 mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
1685 goto out;
1686 }
1687
1653 o2net_register_callbacks(sc->sc_sock->sk, sc); 1688 o2net_register_callbacks(sc->sc_sock->sk, sc);
1654 1689
1655 spin_lock(&nn->nn_lock); 1690 spin_lock(&nn->nn_lock);
@@ -1683,6 +1718,7 @@ out:
1683 if (mynode) 1718 if (mynode)
1684 o2nm_node_put(mynode); 1719 o2nm_node_put(mynode);
1685 1720
1721 memalloc_noio_restore(noio_flag);
1686 return; 1722 return;
1687} 1723}
1688 1724
@@ -1694,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work)
1694 spin_lock(&nn->nn_lock); 1730 spin_lock(&nn->nn_lock);
1695 if (!nn->nn_sc_valid) { 1731 if (!nn->nn_sc_valid) {
1696 printk(KERN_NOTICE "o2net: No connection established with " 1732 printk(KERN_NOTICE "o2net: No connection established with "
1697 "node %u after %u.%u seconds, giving up.\n", 1733 "node %u after %u.%u seconds, check network and"
1734 " cluster configuration.\n",
1698 o2net_num_from_nn(nn), 1735 o2net_num_from_nn(nn),
1699 o2net_idle_timeout() / 1000, 1736 o2net_idle_timeout() / 1000,
1700 o2net_idle_timeout() % 1000); 1737 o2net_idle_timeout() % 1000);
@@ -1808,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more)
1808 struct o2nm_node *local_node = NULL; 1845 struct o2nm_node *local_node = NULL;
1809 struct o2net_sock_container *sc = NULL; 1846 struct o2net_sock_container *sc = NULL;
1810 struct o2net_node *nn; 1847 struct o2net_node *nn;
1848 unsigned int noio_flag;
1849
1850 /*
1851 * sock_create_lite allocates the sock with GFP_KERNEL. We must set
1852 * per-process flag PF_MEMALLOC_NOIO so that all allocations done
1853 * by this process are done as if GFP_NOIO was specified. So we
1854 * are not reentering filesystem while doing memory reclaim.
1855 */
1856 noio_flag = memalloc_noio_save();
1811 1857
1812 BUG_ON(sock == NULL); 1858 BUG_ON(sock == NULL);
1813 *more = 0; 1859 *more = 0;
@@ -1831,6 +1877,12 @@ static int o2net_accept_one(struct socket *sock, int *more)
1831 goto out; 1877 goto out;
1832 } 1878 }
1833 1879
1880 ret = o2net_set_usertimeout(new_sock);
1881 if (ret) {
1882 mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret);
1883 goto out;
1884 }
1885
1834 slen = sizeof(sin); 1886 slen = sizeof(sin);
1835 ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1887 ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin,
1836 &slen, 1); 1888 &slen, 1);
@@ -1918,6 +1970,8 @@ out:
1918 o2nm_node_put(local_node); 1970 o2nm_node_put(local_node);
1919 if (sc) 1971 if (sc)
1920 sc_put(sc); 1972 sc_put(sc);
1973
1974 memalloc_noio_restore(noio_flag);
1921 return ret; 1975 return ret;
1922} 1976}
1923 1977
@@ -2113,17 +2167,13 @@ int o2net_init(void)
2113 o2quo_init(); 2167 o2quo_init();
2114 2168
2115 if (o2net_debugfs_init()) 2169 if (o2net_debugfs_init())
2116 return -ENOMEM; 2170 goto out;
2117 2171
2118 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); 2172 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
2119 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2173 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
2120 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2174 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
2121 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { 2175 if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp)
2122 kfree(o2net_hand); 2176 goto out;
2123 kfree(o2net_keep_req);
2124 kfree(o2net_keep_resp);
2125 return -ENOMEM;
2126 }
2127 2177
2128 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); 2178 o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION);
2129 o2net_hand->connector_id = cpu_to_be64(1); 2179 o2net_hand->connector_id = cpu_to_be64(1);
@@ -2148,6 +2198,14 @@ int o2net_init(void)
2148 } 2198 }
2149 2199
2150 return 0; 2200 return 0;
2201
2202out:
2203 kfree(o2net_hand);
2204 kfree(o2net_keep_req);
2205 kfree(o2net_keep_resp);
2206
2207 o2quo_exit();
2208 return -ENOMEM;
2151} 2209}
2152 2210
2153void o2net_exit(void) 2211void o2net_exit(void)
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index 5bada2a69b50..c571e849fda4 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data,
63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 63#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000
64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 64#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000
65 65
66#define O2NET_TCP_USER_TIMEOUT 0x7fffffff
66 67
67/* TODO: figure this out.... */ 68/* TODO: figure this out.... */
68static inline int o2net_link_down(int err, struct socket *sock) 69static inline int o2net_link_down(int err, struct socket *sock)
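TCP_USER_TIMEOUT (RFC 5482) caps how long transmitted data may remain unacknowledged before the kernel forcibly closes the connection; the 0x7fffffff chosen above in practice disables that timer, so o2net's own idle timer, not TCP, decides when a peer is dead. The userspace equivalent of o2net_set_usertimeout() is a plain setsockopt() with the value in milliseconds:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	int set_user_timeout(int fd, unsigned int timeout_ms)
	{
		return setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
				  &timeout_ms, sizeof(timeout_ms));
	}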
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 18f13c2e4a10..149eb556b8c6 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = {
647static int debug_lockres_open(struct inode *inode, struct file *file) 647static int debug_lockres_open(struct inode *inode, struct file *file)
648{ 648{
649 struct dlm_ctxt *dlm = inode->i_private; 649 struct dlm_ctxt *dlm = inode->i_private;
650 int ret = -ENOMEM; 650 struct debug_lockres *dl;
651 struct seq_file *seq; 651 void *buf;
652 struct debug_lockres *dl = NULL;
653 652
654 dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); 653 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
655 if (!dl) { 654 if (!buf)
656 mlog_errno(ret);
657 goto bail; 655 goto bail;
658 }
659 656
660 dl->dl_len = PAGE_SIZE; 657 dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl));
661 dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); 658 if (!dl)
662 if (!dl->dl_buf) { 659 goto bailfree;
663 mlog_errno(ret);
664 goto bail;
665 }
666 660
667 ret = seq_open(file, &debug_lockres_ops); 661 dl->dl_len = PAGE_SIZE;
668 if (ret) { 662 dl->dl_buf = buf;
669 mlog_errno(ret);
670 goto bail;
671 }
672
673 seq = file->private_data;
674 seq->private = dl;
675 663
676 dlm_grab(dlm); 664 dlm_grab(dlm);
677 dl->dl_ctxt = dlm; 665 dl->dl_ctxt = dlm;
678 666
679 return 0; 667 return 0;
668
669bailfree:
670 kfree(buf);
680bail: 671bail:
681 if (dl) 672 mlog_errno(-ENOMEM);
682 kfree(dl->dl_buf); 673 return -ENOMEM;
683 kfree(dl);
684 return ret;
685} 674}
686 675
687static int debug_lockres_release(struct inode *inode, struct file *file) 676static int debug_lockres_release(struct inode *inode, struct file *file)
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 39efc5057a36..02d315fef432 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data,
839 * to back off and try again. This gives heartbeat a chance 839 * to back off and try again. This gives heartbeat a chance
840 * to catch up. 840 * to catch up.
841 */ 841 */
842 if (!o2hb_check_node_heartbeating(query->node_idx)) { 842 if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) {
843 mlog(0, "node %u is not in our live map yet\n", 843 mlog(0, "node %u is not in our live map yet\n",
844 query->node_idx); 844 query->node_idx);
845 845
@@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1923 goto bail; 1923 goto bail;
1924 } 1924 }
1925 1925
1926 if (total_backoff > 1926 if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) {
1927 msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) {
1928 status = -ERESTARTSYS; 1927 status = -ERESTARTSYS;
1929 mlog(ML_NOTICE, "Timed out joining dlm domain " 1928 mlog(ML_NOTICE, "Timed out joining dlm domain "
1930 "%s after %u msecs\n", dlm->name, 1929 "%s after %u msecs\n", dlm->name,
1931 jiffies_to_msecs(total_backoff)); 1930 total_backoff);
1932 goto bail; 1931 goto bail;
1933 } 1932 }
1934 1933
@@ -1976,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1976 1975
1977 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); 1976 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
1978 if (!dlm) { 1977 if (!dlm) {
1979 mlog_errno(-ENOMEM); 1978 ret = -ENOMEM;
1979 mlog_errno(ret);
1980 goto leave; 1980 goto leave;
1981 } 1981 }
1982 1982
1983 dlm->name = kstrdup(domain, GFP_KERNEL); 1983 dlm->name = kstrdup(domain, GFP_KERNEL);
1984 if (dlm->name == NULL) { 1984 if (dlm->name == NULL) {
1985 mlog_errno(-ENOMEM); 1985 ret = -ENOMEM;
1986 kfree(dlm); 1986 mlog_errno(ret);
1987 dlm = NULL;
1988 goto leave; 1987 goto leave;
1989 } 1988 }
1990 1989
1991 dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); 1990 dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
1992 if (!dlm->lockres_hash) { 1991 if (!dlm->lockres_hash) {
1993 mlog_errno(-ENOMEM); 1992 ret = -ENOMEM;
1994 kfree(dlm->name); 1993 mlog_errno(ret);
1995 kfree(dlm);
1996 dlm = NULL;
1997 goto leave; 1994 goto leave;
1998 } 1995 }
1999 1996
@@ -2003,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2003 dlm->master_hash = (struct hlist_head **) 2000 dlm->master_hash = (struct hlist_head **)
2004 dlm_alloc_pagevec(DLM_HASH_PAGES); 2001 dlm_alloc_pagevec(DLM_HASH_PAGES);
2005 if (!dlm->master_hash) { 2002 if (!dlm->master_hash) {
2006 mlog_errno(-ENOMEM); 2003 ret = -ENOMEM;
2007 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 2004 mlog_errno(ret);
2008 kfree(dlm->name);
2009 kfree(dlm);
2010 dlm = NULL;
2011 goto leave; 2005 goto leave;
2012 } 2006 }
2013 2007
@@ -2018,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2018 dlm->node_num = o2nm_this_node(); 2012 dlm->node_num = o2nm_this_node();
2019 2013
2020 ret = dlm_create_debugfs_subroot(dlm); 2014 ret = dlm_create_debugfs_subroot(dlm);
2021 if (ret < 0) { 2015 if (ret < 0)
2022 dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES);
2023 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
2024 kfree(dlm->name);
2025 kfree(dlm);
2026 dlm = NULL;
2027 goto leave; 2016 goto leave;
2028 }
2029 2017
2030 spin_lock_init(&dlm->spinlock); 2018 spin_lock_init(&dlm->spinlock);
2031 spin_lock_init(&dlm->master_lock); 2019 spin_lock_init(&dlm->master_lock);
@@ -2086,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2086 atomic_read(&dlm->dlm_refs.refcount)); 2074 atomic_read(&dlm->dlm_refs.refcount));
2087 2075
2088leave: 2076leave:
2077 if (ret < 0 && dlm) {
2078 if (dlm->master_hash)
2079 dlm_free_pagevec((void **)dlm->master_hash,
2080 DLM_HASH_PAGES);
2081
2082 if (dlm->lockres_hash)
2083 dlm_free_pagevec((void **)dlm->lockres_hash,
2084 DLM_HASH_PAGES);
2085
2086 kfree(dlm->name);
2087 kfree(dlm);
2088 dlm = NULL;
2089 }
2089 return dlm; 2090 return dlm;
2090} 2091}
2091 2092
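[Editor's note] The dlm_alloc_ctxt() rework above replaces per-allocation unwind code with a single cleanup block under the leave: label. It relies on kzalloc() zero-initialization, so every pointer can be tested before freeing, and on kfree(NULL) being a no-op. A condensed sketch of the shape, with hypothetical names:

static struct foo *foo_alloc(const char *name)
{
	struct foo *f;
	int ret = 0;

	f = kzalloc(sizeof(*f), GFP_KERNEL);	/* all fields start NULL */
	if (!f) {
		ret = -ENOMEM;
		goto leave;
	}

	f->name = kstrdup(name, GFP_KERNEL);
	if (!f->name) {
		ret = -ENOMEM;
		goto leave;
	}

leave:
	if (ret < 0 && f) {		/* one NULL-safe teardown path */
		kfree(f->name);		/* kfree(NULL) is a no-op */
		kfree(f);
		f = NULL;
	}
	return f;
}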
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 82abf0cc9a12..215e41abf101 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
625 return res; 625 return res;
626 626
627error: 627error:
628 if (res && res->lockname.name)
629 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
630
631 if (res) 628 if (res)
632 kmem_cache_free(dlm_lockres_cache, res); 629 kmem_cache_free(dlm_lockres_cache, res);
633 return NULL; 630 return NULL;
@@ -655,12 +652,9 @@ void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
655 clear_bit(bit, res->refmap); 652 clear_bit(bit, res->refmap);
656} 653}
657 654
658 655static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
659void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
660 struct dlm_lock_resource *res) 656 struct dlm_lock_resource *res)
661{ 657{
662 assert_spin_locked(&res->spinlock);
663
664 res->inflight_locks++; 658 res->inflight_locks++;
665 659
666 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name, 660 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
@@ -668,6 +662,13 @@ void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
668 __builtin_return_address(0)); 662 __builtin_return_address(0));
669} 663}
670 664
665void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
666 struct dlm_lock_resource *res)
667{
668 assert_spin_locked(&res->spinlock);
669 __dlm_lockres_grab_inflight_ref(dlm, res);
670}
671
671void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 672void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
672 struct dlm_lock_resource *res) 673 struct dlm_lock_resource *res)
673{ 674{
@@ -894,10 +895,8 @@ lookup:
894 /* finally add the lockres to its hash bucket */ 895 /* finally add the lockres to its hash bucket */
895 __dlm_insert_lockres(dlm, res); 896 __dlm_insert_lockres(dlm, res);
896 897
897 /* Grab inflight ref to pin the resource */ 898 /* since this lockres is new it doesn't require the spinlock */
898 spin_lock(&res->spinlock); 899 __dlm_lockres_grab_inflight_ref(dlm, res);
899 dlm_lockres_grab_inflight_ref(dlm, res);
900 spin_unlock(&res->spinlock);
901 900
902 /* get an extra ref on the mle in case this is a BLOCK 901 /* get an extra ref on the mle in case this is a BLOCK
903 * if so, the creator of the BLOCK may try to put the last 902 * if so, the creator of the BLOCK may try to put the last
@@ -2037,6 +2036,10 @@ kill:
2037 "and killing the other node now! This node is OK and can continue.\n"); 2036 "and killing the other node now! This node is OK and can continue.\n");
2038 __dlm_print_one_lock_resource(res); 2037 __dlm_print_one_lock_resource(res);
2039 spin_unlock(&res->spinlock); 2038 spin_unlock(&res->spinlock);
2039 spin_lock(&dlm->master_lock);
2040 if (mle)
2041 __dlm_put_mle(mle);
2042 spin_unlock(&dlm->master_lock);
2040 spin_unlock(&dlm->spinlock); 2043 spin_unlock(&dlm->spinlock);
2041 *ret_data = (void *)res; 2044 *ret_data = (void *)res;
2042 dlm_put(dlm); 2045 dlm_put(dlm);
@@ -2405,6 +2408,10 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2405 if (res->state & DLM_LOCK_RES_MIGRATING) 2408 if (res->state & DLM_LOCK_RES_MIGRATING)
2406 return 0; 2409 return 0;
2407 2410
2411 /* delay migration when the lockres is in RECOVERING state */
2412 if (res->state & DLM_LOCK_RES_RECOVERING)
2413 return 0;
2414
2408 if (res->owner != dlm->node_num) 2415 if (res->owner != dlm->node_num)
2409 return 0; 2416 return 0;
2410 2417
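[Editor's note] The split into __dlm_lockres_grab_inflight_ref() and its locked wrapper follows the kernel's double-underscore convention: the __ helper assumes the caller already holds the relevant lock (as in the new-lockres path above, where no other CPU can see the resource yet), while the plain-named version asserts the locking requirement for ordinary callers. A sketch with hypothetical names:

struct foo {
	spinlock_t lock;
	unsigned int refs;
};

/* caller must hold f->lock (or otherwise have exclusive access) */
static void __foo_grab_ref(struct foo *f)
{
	f->refs++;
}

void foo_grab_ref(struct foo *f)
{
	assert_spin_locked(&f->lock);	/* document and check the rule */
	__foo_grab_ref(f);
}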
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 45067faf5695..3365839d2971 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
1710 BUG(); 1710 BUG();
1711 } else 1711 } else
1712 __dlm_lockres_grab_inflight_worker(dlm, res); 1712 __dlm_lockres_grab_inflight_worker(dlm, res);
1713 } else /* put.. incase we are not the master */ 1713 spin_unlock(&res->spinlock);
1714 } else {
1715 /* put, in case we are not the master */
1716 spin_unlock(&res->spinlock);
1714 dlm_lockres_put(res); 1717 dlm_lockres_put(res);
1715 spin_unlock(&res->spinlock); 1718 }
1716 } 1719 }
1717 spin_unlock(&dlm->spinlock); 1720 spin_unlock(&dlm->spinlock);
1718 1721
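[Editor's note] The dlmrecovery.c hunk above reorders an unlock against a reference drop: dlm_lockres_put() may release the last reference and free the resource, so res->spinlock must be dropped before the put, never after. The general rule, sketched with a hypothetical kref-counted object:

struct res {
	spinlock_t lock;
	struct kref ref;
};

static void res_release(struct kref *ref)
{
	kfree(container_of(ref, struct res, ref));
}

static void res_done(struct res *r)
{
	spin_lock(&r->lock);
	/* ... last use of fields protected by r->lock ... */
	spin_unlock(&r->lock);		/* unlock first ... */
	kref_put(&r->ref, res_release);	/* ... the put may free r,
					 * lock included */
}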
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 52cfe99ae056..21262f2b1654 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
2892 2892
2893static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) 2893static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
2894{ 2894{
2895 int ret;
2896 struct ocfs2_dlm_seq_priv *priv; 2895 struct ocfs2_dlm_seq_priv *priv;
2897 struct seq_file *seq;
2898 struct ocfs2_super *osb; 2896 struct ocfs2_super *osb;
2899 2897
2900 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); 2898 priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv));
2901 if (!priv) { 2899 if (!priv) {
2902 ret = -ENOMEM; 2900 mlog_errno(-ENOMEM);
2903 mlog_errno(ret); 2901 return -ENOMEM;
2904 goto out;
2905 } 2902 }
2903
2906 osb = inode->i_private; 2904 osb = inode->i_private;
2907 ocfs2_get_dlm_debug(osb->osb_dlm_debug); 2905 ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2908 priv->p_dlm_debug = osb->osb_dlm_debug; 2906 priv->p_dlm_debug = osb->osb_dlm_debug;
2909 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); 2907 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2910 2908
2911 ret = seq_open(file, &ocfs2_dlm_seq_ops);
2912 if (ret) {
2913 kfree(priv);
2914 mlog_errno(ret);
2915 goto out;
2916 }
2917
2918 seq = file->private_data;
2919 seq->private = priv;
2920
2921 ocfs2_add_lockres_tracking(&priv->p_iter_res, 2909 ocfs2_add_lockres_tracking(&priv->p_iter_res,
2922 priv->p_dlm_debug); 2910 priv->p_dlm_debug);
2923 2911
2924out: 2912 return 0;
2925 return ret;
2926} 2913}
2927 2914
2928static const struct file_operations ocfs2_dlm_debug_fops = { 2915static const struct file_operations ocfs2_dlm_debug_fops = {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 2930e231f3f9..324dc93ac896 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
760 struct address_space *mapping = inode->i_mapping; 760 struct address_space *mapping = inode->i_mapping;
761 struct page *page; 761 struct page *page;
762 unsigned long index = abs_from >> PAGE_CACHE_SHIFT; 762 unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
763 handle_t *handle = NULL; 763 handle_t *handle;
764 int ret = 0; 764 int ret = 0;
765 unsigned zero_from, zero_to, block_start, block_end; 765 unsigned zero_from, zero_to, block_start, block_end;
766 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 766 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); 769 BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
770 BUG_ON(abs_from & (inode->i_blkbits - 1)); 770 BUG_ON(abs_from & (inode->i_blkbits - 1));
771 771
772 handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
773 if (IS_ERR(handle)) {
774 ret = PTR_ERR(handle);
775 goto out;
776 }
777
772 page = find_or_create_page(mapping, index, GFP_NOFS); 778 page = find_or_create_page(mapping, index, GFP_NOFS);
773 if (!page) { 779 if (!page) {
774 ret = -ENOMEM; 780 ret = -ENOMEM;
775 mlog_errno(ret); 781 mlog_errno(ret);
776 goto out; 782 goto out_commit_trans;
777 } 783 }
778 784
779 /* Get the offsets within the page that we want to zero */ 785 /* Get the offsets within the page that we want to zero */
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
805 goto out_unlock; 811 goto out_unlock;
806 } 812 }
807 813
808 if (!handle) {
809 handle = ocfs2_zero_start_ordered_transaction(inode,
810 di_bh);
811 if (IS_ERR(handle)) {
812 ret = PTR_ERR(handle);
813 handle = NULL;
814 break;
815 }
816 }
817 814
818 /* must not update i_size! */ 815 /* must not update i_size! */
819 ret = block_commit_write(page, block_start + 1, 816 ret = block_commit_write(page, block_start + 1,
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
824 ret = 0; 821 ret = 0;
825 } 822 }
826 823
824 /*
825 * fs-writeback will release the dirty pages without page lock
826 * whose offset are over inode size, the release happens at
827 * block_write_full_page().
828 */
829 i_size_write(inode, abs_to);
830 inode->i_blocks = ocfs2_inode_sector_count(inode);
831 di->i_size = cpu_to_le64((u64)i_size_read(inode));
832 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
833 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
834 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
835 di->i_mtime_nsec = di->i_ctime_nsec;
827 if (handle) { 836 if (handle) {
828 /*
829 * fs-writeback will release the dirty pages without page lock
830 * whose offset are over inode size, the release happens at
831 * block_write_full_page().
832 */
833 i_size_write(inode, abs_to);
834 inode->i_blocks = ocfs2_inode_sector_count(inode);
835 di->i_size = cpu_to_le64((u64)i_size_read(inode));
836 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
837 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
838 di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
839 di->i_mtime_nsec = di->i_ctime_nsec;
840 ocfs2_journal_dirty(handle, di_bh); 837 ocfs2_journal_dirty(handle, di_bh);
841 ocfs2_update_inode_fsync_trans(handle, inode, 1); 838 ocfs2_update_inode_fsync_trans(handle, inode, 1);
842 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
843 } 839 }
844 840
845out_unlock: 841out_unlock:
846 unlock_page(page); 842 unlock_page(page);
847 page_cache_release(page); 843 page_cache_release(page);
844out_commit_trans:
845 if (handle)
846 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
848out: 847out:
849 return ret; 848 return ret;
850} 849}
@@ -1253,7 +1252,7 @@ bail:
1253 brelse(bh); 1252 brelse(bh);
1254 1253
1255 /* Release quota pointers in case we acquired them */ 1254 /* Release quota pointers in case we acquired them */
1256 for (qtype = 0; qtype < MAXQUOTAS; qtype++) 1255 for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++)
1257 dqput(transfer_to[qtype]); 1256 dqput(transfer_to[qtype]);
1258 1257
1259 if (!status && attr->ia_valid & ATTR_MODE) { 1258 if (!status && attr->ia_valid & ATTR_MODE) {
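[Editor's note] The ocfs2_write_zero_page() change above is a lock-ordering fix: the journal handle is now started before the page lock is taken and committed only after the page is unlocked, so a transaction start that blocks waiting for journal space can never stall while this path holds a page lock that writeback needs. The resulting skeleton, condensed from the hunk (error paths abbreviated):

	handle = ocfs2_zero_start_ordered_transaction(inode, di_bh);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	page = find_or_create_page(mapping, index, GFP_NOFS); /* locks page */
	if (!page) {
		ret = -ENOMEM;
		goto out_commit_trans;
	}

	/* ... zero the range and mark the page and inode dirty ... */

	unlock_page(page);
	page_cache_release(page);
out_commit_trans:
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
	return ret;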
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index a6c991c0fc98..a9b76de46047 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
162{ 162{
163 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; 163 int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
164 164
165 return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); 165 return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits;
166} 166}
167 167
168/* Validate that a bh contains a valid inode */ 168/* Validate that a bh contains a valid inode */
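[Editor's note] The one-character ocfs2_inode_sector_count() fix above (and the matching cast in move_extents.c below) addresses a classic C pitfall: ip_clusters is 32-bit, so the shift is computed in 32 bits and truncated before the cast to blkcnt_t can widen the result. Casting the operand first makes the shift happen at the wider type. An illustrative helper:

static inline u64 clusters_to_sectors(u32 clusters, int c_to_s_bits)
{
	/* (u64)(clusters << bits) would wrap in 32 bits before widening;
	 * widening the operand first keeps the high bits */
	return (u64)clusters << c_to_s_bits;
}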
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 6f66b3751ace..53e6c40ed4c6 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -35,9 +35,8 @@
35 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) 35 copy_to_user((typeof(a) __user *)b, &(a), sizeof(a))
36 36
37/* 37/*
38 * This call is void because we are already reporting an error that may 38 * This is just a best-effort to tell userspace that this request
39 * be -EFAULT. The error will be returned from the ioctl(2) call. It's 39 * caused the error.
40 * just a best-effort to tell userspace that this request caused the error.
41 */ 40 */
42static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, 41static inline void o2info_set_request_error(struct ocfs2_info_request *kreq,
43 struct ocfs2_info_request __user *req) 42 struct ocfs2_info_request __user *req)
@@ -146,136 +145,105 @@ bail:
146static int ocfs2_info_handle_blocksize(struct inode *inode, 145static int ocfs2_info_handle_blocksize(struct inode *inode,
147 struct ocfs2_info_request __user *req) 146 struct ocfs2_info_request __user *req)
148{ 147{
149 int status = -EFAULT;
150 struct ocfs2_info_blocksize oib; 148 struct ocfs2_info_blocksize oib;
151 149
152 if (o2info_from_user(oib, req)) 150 if (o2info_from_user(oib, req))
153 goto bail; 151 return -EFAULT;
154 152
155 oib.ib_blocksize = inode->i_sb->s_blocksize; 153 oib.ib_blocksize = inode->i_sb->s_blocksize;
156 154
157 o2info_set_request_filled(&oib.ib_req); 155 o2info_set_request_filled(&oib.ib_req);
158 156
159 if (o2info_to_user(oib, req)) 157 if (o2info_to_user(oib, req))
160 goto bail; 158 return -EFAULT;
161
162 status = 0;
163bail:
164 if (status)
165 o2info_set_request_error(&oib.ib_req, req);
166 159
167 return status; 160 return 0;
168} 161}
169 162
170static int ocfs2_info_handle_clustersize(struct inode *inode, 163static int ocfs2_info_handle_clustersize(struct inode *inode,
171 struct ocfs2_info_request __user *req) 164 struct ocfs2_info_request __user *req)
172{ 165{
173 int status = -EFAULT;
174 struct ocfs2_info_clustersize oic; 166 struct ocfs2_info_clustersize oic;
175 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 167 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
176 168
177 if (o2info_from_user(oic, req)) 169 if (o2info_from_user(oic, req))
178 goto bail; 170 return -EFAULT;
179 171
180 oic.ic_clustersize = osb->s_clustersize; 172 oic.ic_clustersize = osb->s_clustersize;
181 173
182 o2info_set_request_filled(&oic.ic_req); 174 o2info_set_request_filled(&oic.ic_req);
183 175
184 if (o2info_to_user(oic, req)) 176 if (o2info_to_user(oic, req))
185 goto bail; 177 return -EFAULT;
186
187 status = 0;
188bail:
189 if (status)
190 o2info_set_request_error(&oic.ic_req, req);
191 178
192 return status; 179 return 0;
193} 180}
194 181
195static int ocfs2_info_handle_maxslots(struct inode *inode, 182static int ocfs2_info_handle_maxslots(struct inode *inode,
196 struct ocfs2_info_request __user *req) 183 struct ocfs2_info_request __user *req)
197{ 184{
198 int status = -EFAULT;
199 struct ocfs2_info_maxslots oim; 185 struct ocfs2_info_maxslots oim;
200 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 186 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
201 187
202 if (o2info_from_user(oim, req)) 188 if (o2info_from_user(oim, req))
203 goto bail; 189 return -EFAULT;
204 190
205 oim.im_max_slots = osb->max_slots; 191 oim.im_max_slots = osb->max_slots;
206 192
207 o2info_set_request_filled(&oim.im_req); 193 o2info_set_request_filled(&oim.im_req);
208 194
209 if (o2info_to_user(oim, req)) 195 if (o2info_to_user(oim, req))
210 goto bail; 196 return -EFAULT;
211 197
212 status = 0; 198 return 0;
213bail:
214 if (status)
215 o2info_set_request_error(&oim.im_req, req);
216
217 return status;
218} 199}
219 200
220static int ocfs2_info_handle_label(struct inode *inode, 201static int ocfs2_info_handle_label(struct inode *inode,
221 struct ocfs2_info_request __user *req) 202 struct ocfs2_info_request __user *req)
222{ 203{
223 int status = -EFAULT;
224 struct ocfs2_info_label oil; 204 struct ocfs2_info_label oil;
225 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
226 206
227 if (o2info_from_user(oil, req)) 207 if (o2info_from_user(oil, req))
228 goto bail; 208 return -EFAULT;
229 209
230 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 210 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
231 211
232 o2info_set_request_filled(&oil.il_req); 212 o2info_set_request_filled(&oil.il_req);
233 213
234 if (o2info_to_user(oil, req)) 214 if (o2info_to_user(oil, req))
235 goto bail; 215 return -EFAULT;
236 216
237 status = 0; 217 return 0;
238bail:
239 if (status)
240 o2info_set_request_error(&oil.il_req, req);
241
242 return status;
243} 218}
244 219
245static int ocfs2_info_handle_uuid(struct inode *inode, 220static int ocfs2_info_handle_uuid(struct inode *inode,
246 struct ocfs2_info_request __user *req) 221 struct ocfs2_info_request __user *req)
247{ 222{
248 int status = -EFAULT;
249 struct ocfs2_info_uuid oiu; 223 struct ocfs2_info_uuid oiu;
250 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 224 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
251 225
252 if (o2info_from_user(oiu, req)) 226 if (o2info_from_user(oiu, req))
253 goto bail; 227 return -EFAULT;
254 228
255 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 229 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
256 230
257 o2info_set_request_filled(&oiu.iu_req); 231 o2info_set_request_filled(&oiu.iu_req);
258 232
259 if (o2info_to_user(oiu, req)) 233 if (o2info_to_user(oiu, req))
260 goto bail; 234 return -EFAULT;
261
262 status = 0;
263bail:
264 if (status)
265 o2info_set_request_error(&oiu.iu_req, req);
266 235
267 return status; 236 return 0;
268} 237}
269 238
270static int ocfs2_info_handle_fs_features(struct inode *inode, 239static int ocfs2_info_handle_fs_features(struct inode *inode,
271 struct ocfs2_info_request __user *req) 240 struct ocfs2_info_request __user *req)
272{ 241{
273 int status = -EFAULT;
274 struct ocfs2_info_fs_features oif; 242 struct ocfs2_info_fs_features oif;
275 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 243 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
276 244
277 if (o2info_from_user(oif, req)) 245 if (o2info_from_user(oif, req))
278 goto bail; 246 return -EFAULT;
279 247
280 oif.if_compat_features = osb->s_feature_compat; 248 oif.if_compat_features = osb->s_feature_compat;
281 oif.if_incompat_features = osb->s_feature_incompat; 249 oif.if_incompat_features = osb->s_feature_incompat;
@@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode,
284 o2info_set_request_filled(&oif.if_req); 252 o2info_set_request_filled(&oif.if_req);
285 253
286 if (o2info_to_user(oif, req)) 254 if (o2info_to_user(oif, req))
287 goto bail; 255 return -EFAULT;
288 256
289 status = 0; 257 return 0;
290bail:
291 if (status)
292 o2info_set_request_error(&oif.if_req, req);
293
294 return status;
295} 258}
296 259
297static int ocfs2_info_handle_journal_size(struct inode *inode, 260static int ocfs2_info_handle_journal_size(struct inode *inode,
298 struct ocfs2_info_request __user *req) 261 struct ocfs2_info_request __user *req)
299{ 262{
300 int status = -EFAULT;
301 struct ocfs2_info_journal_size oij; 263 struct ocfs2_info_journal_size oij;
302 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 264 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
303 265
304 if (o2info_from_user(oij, req)) 266 if (o2info_from_user(oij, req))
305 goto bail; 267 return -EFAULT;
306 268
307 oij.ij_journal_size = i_size_read(osb->journal->j_inode); 269 oij.ij_journal_size = i_size_read(osb->journal->j_inode);
308 270
309 o2info_set_request_filled(&oij.ij_req); 271 o2info_set_request_filled(&oij.ij_req);
310 272
311 if (o2info_to_user(oij, req)) 273 if (o2info_to_user(oij, req))
312 goto bail; 274 return -EFAULT;
313 275
314 status = 0; 276 return 0;
315bail:
316 if (status)
317 o2info_set_request_error(&oij.ij_req, req);
318
319 return status;
320} 277}
321 278
322static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, 279static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
@@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
373 u32 i; 330 u32 i;
374 u64 blkno = -1; 331 u64 blkno = -1;
375 char namebuf[40]; 332 char namebuf[40];
376 int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; 333 int status, type = INODE_ALLOC_SYSTEM_INODE;
377 struct ocfs2_info_freeinode *oifi = NULL; 334 struct ocfs2_info_freeinode *oifi = NULL;
378 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 335 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
379 struct inode *inode_alloc = NULL; 336 struct inode *inode_alloc = NULL;
@@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
385 goto out_err; 342 goto out_err;
386 } 343 }
387 344
388 if (o2info_from_user(*oifi, req)) 345 if (o2info_from_user(*oifi, req)) {
389 goto bail; 346 status = -EFAULT;
347 goto out_free;
348 }
390 349
391 oifi->ifi_slotnum = osb->max_slots; 350 oifi->ifi_slotnum = osb->max_slots;
392 351
@@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode,
424 383
425 o2info_set_request_filled(&oifi->ifi_req); 384 o2info_set_request_filled(&oifi->ifi_req);
426 385
427 if (o2info_to_user(*oifi, req)) 386 if (o2info_to_user(*oifi, req)) {
428 goto bail; 387 status = -EFAULT;
388 goto out_free;
389 }
429 390
430 status = 0; 391 status = 0;
431bail: 392bail:
432 if (status) 393 if (status)
433 o2info_set_request_error(&oifi->ifi_req, req); 394 o2info_set_request_error(&oifi->ifi_req, req);
434 395out_free:
435 kfree(oifi); 396 kfree(oifi);
436out_err: 397out_err:
437 return status; 398 return status;
@@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
658{ 619{
659 u64 blkno = -1; 620 u64 blkno = -1;
660 char namebuf[40]; 621 char namebuf[40];
661 int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; 622 int status, type = GLOBAL_BITMAP_SYSTEM_INODE;
662 623
663 struct ocfs2_info_freefrag *oiff; 624 struct ocfs2_info_freefrag *oiff;
664 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 625 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
671 goto out_err; 632 goto out_err;
672 } 633 }
673 634
674 if (o2info_from_user(*oiff, req)) 635 if (o2info_from_user(*oiff, req)) {
675 goto bail; 636 status = -EFAULT;
637 goto out_free;
638 }
676 /* 639 /*
677 * chunksize from userspace should be power of 2. 640 * chunksize from userspace should be power of 2.
678 */ 641 */
@@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode,
711 674
712 if (o2info_to_user(*oiff, req)) { 675 if (o2info_to_user(*oiff, req)) {
713 status = -EFAULT; 676 status = -EFAULT;
714 goto bail; 677 goto out_free;
715 } 678 }
716 679
717 status = 0; 680 status = 0;
718bail: 681bail:
719 if (status) 682 if (status)
720 o2info_set_request_error(&oiff->iff_req, req); 683 o2info_set_request_error(&oiff->iff_req, req);
721 684out_free:
722 kfree(oiff); 685 kfree(oiff);
723out_err: 686out_err:
724 return status; 687 return status;
@@ -727,23 +690,17 @@ out_err:
727static int ocfs2_info_handle_unknown(struct inode *inode, 690static int ocfs2_info_handle_unknown(struct inode *inode,
728 struct ocfs2_info_request __user *req) 691 struct ocfs2_info_request __user *req)
729{ 692{
730 int status = -EFAULT;
731 struct ocfs2_info_request oir; 693 struct ocfs2_info_request oir;
732 694
733 if (o2info_from_user(oir, req)) 695 if (o2info_from_user(oir, req))
734 goto bail; 696 return -EFAULT;
735 697
736 o2info_clear_request_filled(&oir); 698 o2info_clear_request_filled(&oir);
737 699
738 if (o2info_to_user(oir, req)) 700 if (o2info_to_user(oir, req))
739 goto bail; 701 return -EFAULT;
740 702
741 status = 0; 703 return 0;
742bail:
743 if (status)
744 o2info_set_request_error(&oir, req);
745
746 return status;
747} 704}
748 705
749/* 706/*
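[Editor's note] The ocfs2_info handler rewrites above all follow one recipe: when the only failure mode is -EFAULT from a user copy, return it directly instead of threading a status variable through a bail: label. The best-effort o2info_set_request_error() call is dropped for these handlers; the error still reaches userspace as the ioctl(2) return value. A skeleton with hypothetical names:

static int foo_info_handler(struct inode *inode,
			    struct foo_info __user *req)
{
	struct foo_info info;

	if (copy_from_user(&info, req, sizeof(info)))
		return -EFAULT;

	info.value = 42;		/* fill in the reply */

	if (copy_to_user(req, &info, sizeof(info)))
		return -EFAULT;

	return 0;
}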
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 599eb4c4c8be..74caffeeee1d 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle,
98 el = path_leaf_el(path); 98 el = path_leaf_el(path);
99 99
100 index = ocfs2_search_extent_list(el, cpos); 100 index = ocfs2_search_extent_list(el, cpos);
101 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 101 if (index == -1) {
102 ocfs2_error(inode->i_sb, 102 ocfs2_error(inode->i_sb,
103 "Inode %llu has an extent at cpos %u which can no " 103 "Inode %llu has an extent at cpos %u which can no "
104 "longer be found.\n", 104 "longer be found.\n",
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
404 * 'vict_blkno' was out of the valid range. 404 * 'vict_blkno' was out of the valid range.
405 */ 405 */
406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 407 (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
408 bits_per_unit))) { 408 bits_per_unit))) {
409 ret = -EINVAL; 409 ret = -EINVAL;
410 goto out; 410 goto out;
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index f266d67df3c6..1eae330193a6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -17,6 +17,9 @@
17 17
18#include "ocfs2.h" 18#include "ocfs2.h"
19 19
20/* Number of quota types we support */
21#define OCFS2_MAXQUOTAS 2
22
20/* 23/*
21 * In-memory structures 24 * In-memory structures
22 */ 25 */
@@ -39,7 +42,7 @@ struct ocfs2_recovery_chunk {
39}; 42};
40 43
41struct ocfs2_quota_recovery { 44struct ocfs2_quota_recovery {
42 struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ 45 struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */
43}; 46};
44 47
45/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index b990a62cff50..c93d67220887 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -336,8 +336,8 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
336int ocfs2_global_read_info(struct super_block *sb, int type) 336int ocfs2_global_read_info(struct super_block *sb, int type)
337{ 337{
338 struct inode *gqinode = NULL; 338 struct inode *gqinode = NULL;
339 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, 339 unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
340 GROUP_QUOTA_SYSTEM_INODE }; 340 GROUP_QUOTA_SYSTEM_INODE };
341 struct ocfs2_global_disk_dqinfo dinfo; 341 struct ocfs2_global_disk_dqinfo dinfo;
342 struct mem_dqinfo *info = sb_dqinfo(sb, type); 342 struct mem_dqinfo *info = sb_dqinfo(sb, type);
343 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; 343 struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2001862bf2b1..10b653930ee2 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -166,12 +166,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
166/* Check whether we understand format of quota files */ 166/* Check whether we understand format of quota files */
167static int ocfs2_local_check_quota_file(struct super_block *sb, int type) 167static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
168{ 168{
169 unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; 169 unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
170 unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; 170 unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
171 unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; 171 unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
172 unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; 172 unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
173 unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, 173 unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
174 GROUP_QUOTA_SYSTEM_INODE }; 174 GROUP_QUOTA_SYSTEM_INODE };
175 struct buffer_head *bh = NULL; 175 struct buffer_head *bh = NULL;
176 struct inode *linode = sb_dqopt(sb)->files[type]; 176 struct inode *linode = sb_dqopt(sb)->files[type];
177 struct inode *ginode = NULL; 177 struct inode *ginode = NULL;
@@ -336,7 +336,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
336{ 336{
337 int type; 337 int type;
338 338
339 for (type = 0; type < MAXQUOTAS; type++) 339 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
340 free_recovery_list(&(rec->r_list[type])); 340 free_recovery_list(&(rec->r_list[type]));
341 kfree(rec); 341 kfree(rec);
342} 342}
@@ -382,7 +382,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
382 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); 382 rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS);
383 if (!rec) 383 if (!rec)
384 return NULL; 384 return NULL;
385 for (type = 0; type < MAXQUOTAS; type++) 385 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
386 INIT_LIST_HEAD(&(rec->r_list[type])); 386 INIT_LIST_HEAD(&(rec->r_list[type]));
387 return rec; 387 return rec;
388} 388}
@@ -392,10 +392,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
392 struct ocfs2_super *osb, 392 struct ocfs2_super *osb,
393 int slot_num) 393 int slot_num)
394{ 394{
395 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 395 unsigned int feature[OCFS2_MAXQUOTAS] = {
396 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 396 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
397 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 397 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
398 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 398 unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
399 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
399 struct super_block *sb = osb->sb; 400 struct super_block *sb = osb->sb;
400 struct ocfs2_local_disk_dqinfo *ldinfo; 401 struct ocfs2_local_disk_dqinfo *ldinfo;
401 struct inode *lqinode; 402 struct inode *lqinode;
@@ -412,7 +413,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
412 return ERR_PTR(-ENOMEM); 413 return ERR_PTR(-ENOMEM);
413 /* First init... */ 414 /* First init... */
414 415
415 for (type = 0; type < MAXQUOTAS; type++) { 416 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
416 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 417 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
417 continue; 418 continue;
418 /* At this point, journal of the slot is already replayed so 419 /* At this point, journal of the slot is already replayed so
@@ -589,8 +590,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
589 struct ocfs2_quota_recovery *rec, 590 struct ocfs2_quota_recovery *rec,
590 int slot_num) 591 int slot_num)
591{ 592{
592 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 593 unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
593 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 594 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
594 struct super_block *sb = osb->sb; 595 struct super_block *sb = osb->sb;
595 struct ocfs2_local_disk_dqinfo *ldinfo; 596 struct ocfs2_local_disk_dqinfo *ldinfo;
596 struct buffer_head *bh; 597 struct buffer_head *bh;
@@ -604,7 +605,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
604 "slot %u\n", osb->dev_str, slot_num); 605 "slot %u\n", osb->dev_str, slot_num);
605 606
606 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 607 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
607 for (type = 0; type < MAXQUOTAS; type++) { 608 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
608 if (list_empty(&(rec->r_list[type]))) 609 if (list_empty(&(rec->r_list[type])))
609 continue; 610 continue;
610 trace_ocfs2_finish_quota_recovery(slot_num); 611 trace_ocfs2_finish_quota_recovery(slot_num);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 636aab69ead5..d81f6e2a97f5 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
3109 el = path_leaf_el(path); 3109 el = path_leaf_el(path);
3110 3110
3111 index = ocfs2_search_extent_list(el, cpos); 3111 index = ocfs2_search_extent_list(el, cpos);
3112 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 3112 if (index == -1) {
3113 ocfs2_error(sb, 3113 ocfs2_error(sb,
3114 "Inode %llu has an extent at cpos %u which can no " 3114 "Inode %llu has an extent at cpos %u which can no "
3115 "longer be found.\n", 3115 "longer be found.\n",
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 1424c151cccc..a88b2a4fcc85 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
382 382
383 trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); 383 trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
384 384
385 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, 385 si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *),
386 GFP_KERNEL); 386 GFP_KERNEL);
387 if (!si->si_bh) { 387 if (!si->si_bh) {
388 status = -ENOMEM; 388 status = -ENOMEM;
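[Editor's note] The kcalloc() conversion above (repeated in fs/omfs below) is not cosmetic: kzalloc(n * size, ...) can overflow the multiplication and silently return a buffer shorter than intended, while kcalloc(n, size, ...) checks the product and returns NULL on overflow. The preferred shape for any zeroed array allocation:

	struct buffer_head **bhs;

	bhs = kcalloc(nr_blocks, sizeof(*bhs), GFP_KERNEL); /* overflow-checked */
	if (!bhs)
		return -ENOMEM;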
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 13a8537d8e8b..720aa389e0ea 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file)
591 */ 591 */
592 ocfs2_control_this_node = -1; 592 ocfs2_control_this_node = -1;
593 running_proto.pv_major = 0; 593 running_proto.pv_major = 0;
594 running_proto.pv_major = 0; 594 running_proto.pv_minor = 0;
595 } 595 }
596 596
597out: 597out:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index ddb662b32447..93c85bc745e1 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -899,11 +899,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
899{ 899{
900 int type; 900 int type;
901 struct super_block *sb = osb->sb; 901 struct super_block *sb = osb->sb;
902 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 902 unsigned int feature[OCFS2_MAXQUOTAS] = {
903 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 903 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
904 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
904 int status = 0; 905 int status = 0;
905 906
906 for (type = 0; type < MAXQUOTAS; type++) { 907 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
907 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 908 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
908 continue; 909 continue;
909 if (unsuspend) 910 if (unsuspend)
@@ -927,17 +928,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
927 928
928static int ocfs2_enable_quotas(struct ocfs2_super *osb) 929static int ocfs2_enable_quotas(struct ocfs2_super *osb)
929{ 930{
930 struct inode *inode[MAXQUOTAS] = { NULL, NULL }; 931 struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL };
931 struct super_block *sb = osb->sb; 932 struct super_block *sb = osb->sb;
932 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 933 unsigned int feature[OCFS2_MAXQUOTAS] = {
933 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 934 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
934 unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, 935 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
936 unsigned int ino[OCFS2_MAXQUOTAS] = {
937 LOCAL_USER_QUOTA_SYSTEM_INODE,
935 LOCAL_GROUP_QUOTA_SYSTEM_INODE }; 938 LOCAL_GROUP_QUOTA_SYSTEM_INODE };
936 int status; 939 int status;
937 int type; 940 int type;
938 941
939 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; 942 sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
940 for (type = 0; type < MAXQUOTAS; type++) { 943 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
941 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 944 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
942 continue; 945 continue;
943 inode[type] = ocfs2_get_system_file_inode(osb, ino[type], 946 inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
@@ -952,12 +955,12 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
952 goto out_quota_off; 955 goto out_quota_off;
953 } 956 }
954 957
955 for (type = 0; type < MAXQUOTAS; type++) 958 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
956 iput(inode[type]); 959 iput(inode[type]);
957 return 0; 960 return 0;
958out_quota_off: 961out_quota_off:
959 ocfs2_disable_quotas(osb); 962 ocfs2_disable_quotas(osb);
960 for (type = 0; type < MAXQUOTAS; type++) 963 for (type = 0; type < OCFS2_MAXQUOTAS; type++)
961 iput(inode[type]); 964 iput(inode[type]);
962 mlog_errno(status); 965 mlog_errno(status);
963 return status; 966 return status;
@@ -972,7 +975,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
972 975
973 /* We mostly ignore errors in this function because there's not much 976 /* We mostly ignore errors in this function because there's not much
974 * we can do when we see them */ 977 * we can do when we see them */
975 for (type = 0; type < MAXQUOTAS; type++) { 978 for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
976 if (!sb_has_quota_loaded(sb, type)) 979 if (!sb_has_quota_loaded(sb, type))
977 continue; 980 continue;
978 /* Cancel periodic syncing before we grab dqonoff_mutex */ 981 /* Cancel periodic syncing before we grab dqonoff_mutex */
@@ -993,8 +996,9 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
993/* Handle quota on quotactl */ 996/* Handle quota on quotactl */
994static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) 997static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
995{ 998{
996 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 999 unsigned int feature[OCFS2_MAXQUOTAS] = {
997 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 1000 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1001 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
998 1002
999 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 1003 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1000 return -EINVAL; 1004 return -EINVAL;
@@ -2532,6 +2536,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2532 kfree(osb->journal); 2536 kfree(osb->journal);
2533 kfree(osb->local_alloc_copy); 2537 kfree(osb->local_alloc_copy);
2534 kfree(osb->uuid_str); 2538 kfree(osb->uuid_str);
2539 kfree(osb->vol_label);
2535 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2540 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2536 memset(osb, 0, sizeof(struct ocfs2_super)); 2541 memset(osb, 0, sizeof(struct ocfs2_super));
2537} 2542}
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index ec58c7659183..138321b0c6c2 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -306,9 +306,7 @@ static const struct super_operations omfs_sops = {
306 */ 306 */
307static int omfs_get_imap(struct super_block *sb) 307static int omfs_get_imap(struct super_block *sb)
308{ 308{
309 int bitmap_size; 309 unsigned int bitmap_size, count, array_size;
310 int array_size;
311 int count;
312 struct omfs_sb_info *sbi = OMFS_SB(sb); 310 struct omfs_sb_info *sbi = OMFS_SB(sb);
313 struct buffer_head *bh; 311 struct buffer_head *bh;
314 unsigned long **ptr; 312 unsigned long **ptr;
@@ -321,7 +319,7 @@ static int omfs_get_imap(struct super_block *sb)
321 goto out; 319 goto out;
322 320
323 sbi->s_imap_size = array_size; 321 sbi->s_imap_size = array_size;
324 sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); 322 sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL);
325 if (!sbi->s_imap) 323 if (!sbi->s_imap)
326 goto nomem; 324 goto nomem;
327 325
@@ -473,6 +471,12 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent)
473 sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); 471 sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize);
474 mutex_init(&sbi->s_bitmap_lock); 472 mutex_init(&sbi->s_bitmap_lock);
475 473
474 if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) {
475 printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n",
476 (unsigned long long)sbi->s_num_blocks);
477 goto out_brelse_bh;
478 }
479
476 if (sbi->s_sys_blocksize > PAGE_SIZE) { 480 if (sbi->s_sys_blocksize > PAGE_SIZE) {
477 printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", 481 printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n",
478 sbi->s_sys_blocksize); 482 sbi->s_sys_blocksize);
diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h
index ee5e4327de92..83a98330ed66 100644
--- a/fs/omfs/omfs_fs.h
+++ b/fs/omfs/omfs_fs.h
@@ -18,6 +18,7 @@
18#define OMFS_XOR_COUNT 19 18#define OMFS_XOR_COUNT 19
19#define OMFS_MAX_BLOCK_SIZE 8192 19#define OMFS_MAX_BLOCK_SIZE 8192
20#define OMFS_MAX_CLUSTER_SIZE 8 20#define OMFS_MAX_CLUSTER_SIZE 8
21#define OMFS_MAX_BLOCKS (1ul << 31)
21 22
22struct omfs_super_block { 23struct omfs_super_block {
23 char s_fill1[256]; 24 char s_fill1[256];
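[Editor's note] OMFS_MAX_BLOCKS pairs with the new range check in omfs_fill_super() above: every count read from on-disk metadata is validated against a hard limit before it feeds later shifts and allocations, so a corrupted or crafted superblock cannot push the bitmap math out of range. The general shape, with hypothetical foo_* names:

	if (sbi->s_num_blocks > FOO_MAX_BLOCKS ||
	    sbi->s_sys_blocksize > PAGE_SIZE) {
		printk(KERN_ERR "foo: superblock values out of range\n");
		goto out_brelse_bh;	/* refuse to mount untrusted sizes */
	}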
diff --git a/fs/pnode.c b/fs/pnode.c
index 302bf22c4a30..aae331a5d03b 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -381,6 +381,7 @@ static void __propagate_umount(struct mount *mnt)
381 * other children 381 * other children
382 */ 382 */
383 if (child && list_empty(&child->mnt_mounts)) { 383 if (child && list_empty(&child->mnt_mounts)) {
384 list_del_init(&child->mnt_child);
384 hlist_del_init_rcu(&child->mnt_hash); 385 hlist_del_init_rcu(&child->mnt_hash);
385 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); 386 hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
386 } 387 }
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 239493ec718e..7151ea428041 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -23,6 +23,7 @@ proc-y += version.o
23proc-y += softirqs.o 23proc-y += softirqs.o
24proc-y += namespaces.o 24proc-y += namespaces.o
25proc-y += self.o 25proc-y += self.o
26proc-y += thread_self.o
26proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o 27proc-$(CONFIG_PROC_SYSCTL) += proc_sysctl.o
27proc-$(CONFIG_NET) += proc_net.o 28proc-$(CONFIG_NET) += proc_net.o
28proc-$(CONFIG_PROC_KCORE) += kcore.o 29proc-$(CONFIG_PROC_KCORE) += kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 64db2bceac59..cd3653e4f35c 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -297,15 +297,11 @@ static void render_cap_t(struct seq_file *m, const char *header,
297 seq_puts(m, header); 297 seq_puts(m, header);
298 CAP_FOR_EACH_U32(__capi) { 298 CAP_FOR_EACH_U32(__capi) {
299 seq_printf(m, "%08x", 299 seq_printf(m, "%08x",
300 a->cap[(_KERNEL_CAPABILITY_U32S-1) - __capi]); 300 a->cap[CAP_LAST_U32 - __capi]);
301 } 301 }
302 seq_putc(m, '\n'); 302 seq_putc(m, '\n');
303} 303}
304 304
305/* Remove non-existent capabilities */
306#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
307 CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
308
309static inline void task_cap(struct seq_file *m, struct task_struct *p) 305static inline void task_cap(struct seq_file *m, struct task_struct *p)
310{ 306{
311 const struct cred *cred; 307 const struct cred *cred;
@@ -319,11 +315,6 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
319 cap_bset = cred->cap_bset; 315 cap_bset = cred->cap_bset;
320 rcu_read_unlock(); 316 rcu_read_unlock();
321 317
322 NORM_CAPS(cap_inheritable);
323 NORM_CAPS(cap_permitted);
324 NORM_CAPS(cap_effective);
325 NORM_CAPS(cap_bset);
326
327 render_cap_t(m, "CapInh:\t", &cap_inheritable); 318 render_cap_t(m, "CapInh:\t", &cap_inheritable);
328 render_cap_t(m, "CapPrm:\t", &cap_permitted); 319 render_cap_t(m, "CapPrm:\t", &cap_permitted);
329 render_cap_t(m, "CapEff:\t", &cap_effective); 320 render_cap_t(m, "CapEff:\t", &cap_effective);
@@ -473,13 +464,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
473 priority = task_prio(task); 464 priority = task_prio(task);
474 nice = task_nice(task); 465 nice = task_nice(task);
475 466
476 /* Temporary variable needed for gcc-2.96 */
477 /* convert timespec -> nsec*/
478 start_time =
479 (unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
480 + task->real_start_time.tv_nsec;
481 /* convert nsec -> ticks */ 467 /* convert nsec -> ticks */
482 start_time = nsec_to_clock_t(start_time); 468 start_time = nsec_to_clock_t(task->real_start_time);
483 469
484 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state); 470 seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
485 seq_put_decimal_ll(m, ' ', ppid); 471 seq_put_decimal_ll(m, ' ', ppid);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2d696b0c93bf..772efa45a452 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -105,7 +105,7 @@
105 */ 105 */
106 106
107struct pid_entry { 107struct pid_entry {
108 char *name; 108 const char *name;
109 int len; 109 int len;
110 umode_t mode; 110 umode_t mode;
111 const struct inode_operations *iop; 111 const struct inode_operations *iop;
@@ -130,10 +130,6 @@ struct pid_entry {
130 { .proc_get_link = get_link } ) 130 { .proc_get_link = get_link } )
131#define REG(NAME, MODE, fops) \ 131#define REG(NAME, MODE, fops) \
132 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) 132 NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
133#define INF(NAME, MODE, read) \
134 NOD(NAME, (S_IFREG|(MODE)), \
135 NULL, &proc_info_file_operations, \
136 { .proc_read = read } )
137#define ONE(NAME, MODE, show) \ 133#define ONE(NAME, MODE, show) \
138 NOD(NAME, (S_IFREG|(MODE)), \ 134 NOD(NAME, (S_IFREG|(MODE)), \
139 NULL, &proc_single_file_operations, \ 135 NULL, &proc_single_file_operations, \
@@ -200,27 +196,32 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
200 return result; 196 return result;
201} 197}
202 198
203static int proc_pid_cmdline(struct task_struct *task, char *buffer) 199static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
200 struct pid *pid, struct task_struct *task)
204{ 201{
205 return get_cmdline(task, buffer, PAGE_SIZE); 202 /*
203 * Rely on struct seq_operations::show() being called once
204 * per internal buffer allocation. See single_open(), traverse().
205 */
206 BUG_ON(m->size < PAGE_SIZE);
207 m->count += get_cmdline(task, m->buf, PAGE_SIZE);
208 return 0;
206} 209}
207 210
208static int proc_pid_auxv(struct task_struct *task, char *buffer) 211static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
212 struct pid *pid, struct task_struct *task)
209{ 213{
210 struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ); 214 struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
211 int res = PTR_ERR(mm);
212 if (mm && !IS_ERR(mm)) { 215 if (mm && !IS_ERR(mm)) {
213 unsigned int nwords = 0; 216 unsigned int nwords = 0;
214 do { 217 do {
215 nwords += 2; 218 nwords += 2;
216 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ 219 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
217 res = nwords * sizeof(mm->saved_auxv[0]); 220 seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
218 if (res > PAGE_SIZE)
219 res = PAGE_SIZE;
220 memcpy(buffer, mm->saved_auxv, res);
221 mmput(mm); 221 mmput(mm);
222 } 222 return 0;
223 return res; 223 } else
224 return PTR_ERR(mm);
224} 225}
225 226
226 227
@@ -229,7 +230,8 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer)
229 * Provides a wchan file via kallsyms in a proper one-value-per-file format. 230 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
230 * Returns the resolved symbol. If that fails, simply return the address. 231 * Returns the resolved symbol. If that fails, simply return the address.
231 */ 232 */
232static int proc_pid_wchan(struct task_struct *task, char *buffer) 233static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
234 struct pid *pid, struct task_struct *task)
233{ 235{
234 unsigned long wchan; 236 unsigned long wchan;
235 char symname[KSYM_NAME_LEN]; 237 char symname[KSYM_NAME_LEN];
@@ -240,9 +242,9 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
240 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 242 if (!ptrace_may_access(task, PTRACE_MODE_READ))
241 return 0; 243 return 0;
242 else 244 else
243 return sprintf(buffer, "%lu", wchan); 245 return seq_printf(m, "%lu", wchan);
244 else 246 else
245 return sprintf(buffer, "%s", symname); 247 return seq_printf(m, "%s", symname);
246} 248}
247#endif /* CONFIG_KALLSYMS */ 249#endif /* CONFIG_KALLSYMS */
248 250
@@ -304,9 +306,10 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
304/* 306/*
305 * Provides /proc/PID/schedstat 307 * Provides /proc/PID/schedstat
306 */ 308 */
307static int proc_pid_schedstat(struct task_struct *task, char *buffer) 309static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
310 struct pid *pid, struct task_struct *task)
308{ 311{
309 return sprintf(buffer, "%llu %llu %lu\n", 312 return seq_printf(m, "%llu %llu %lu\n",
310 (unsigned long long)task->se.sum_exec_runtime, 313 (unsigned long long)task->se.sum_exec_runtime,
311 (unsigned long long)task->sched_info.run_delay, 314 (unsigned long long)task->sched_info.run_delay,
312 task->sched_info.pcount); 315 task->sched_info.pcount);
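[Editor's note] This hunk and the following ones move /proc/PID handlers from the old proc_read(task, buffer) interface, which formatted into a raw page and returned a byte count, to seq_file-based ONE() entries: the handler writes through the seq_file and returns 0, and seq_read() takes over buffering, offsets, and short reads. A minimal converted handler, with an invented file name:

static int proc_pid_foo(struct seq_file *m, struct pid_namespace *ns,
			struct pid *pid, struct task_struct *task)
{
	/* was: return sprintf(buffer, "%d\n", ...); */
	seq_printf(m, "%d\n", task->pid);
	return 0;
}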
@@ -373,38 +376,8 @@ static const struct file_operations proc_lstats_operations = {
373 376
374#endif 377#endif
375 378
376#ifdef CONFIG_CGROUPS 379static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
377static int cgroup_open(struct inode *inode, struct file *file) 380 struct pid *pid, struct task_struct *task)
378{
379 struct pid *pid = PROC_I(inode)->pid;
380 return single_open(file, proc_cgroup_show, pid);
381}
382
383static const struct file_operations proc_cgroup_operations = {
384 .open = cgroup_open,
385 .read = seq_read,
386 .llseek = seq_lseek,
387 .release = single_release,
388};
389#endif
390
391#ifdef CONFIG_PROC_PID_CPUSET
392
393static int cpuset_open(struct inode *inode, struct file *file)
394{
395 struct pid *pid = PROC_I(inode)->pid;
396 return single_open(file, proc_cpuset_show, pid);
397}
398
399static const struct file_operations proc_cpuset_operations = {
400 .open = cpuset_open,
401 .read = seq_read,
402 .llseek = seq_lseek,
403 .release = single_release,
404};
405#endif
406
407static int proc_oom_score(struct task_struct *task, char *buffer)
408{ 381{
409 unsigned long totalpages = totalram_pages + total_swap_pages; 382 unsigned long totalpages = totalram_pages + total_swap_pages;
410 unsigned long points = 0; 383 unsigned long points = 0;
@@ -414,12 +387,12 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
414 points = oom_badness(task, NULL, NULL, totalpages) * 387 points = oom_badness(task, NULL, NULL, totalpages) *
415 1000 / totalpages; 388 1000 / totalpages;
416 read_unlock(&tasklist_lock); 389 read_unlock(&tasklist_lock);
417 return sprintf(buffer, "%lu\n", points); 390 return seq_printf(m, "%lu\n", points);
418} 391}
419 392
420struct limit_names { 393struct limit_names {
421 char *name; 394 const char *name;
422 char *unit; 395 const char *unit;
423}; 396};
424 397
425static const struct limit_names lnames[RLIM_NLIMITS] = { 398static const struct limit_names lnames[RLIM_NLIMITS] = {
@@ -442,12 +415,11 @@ static const struct limit_names lnames[RLIM_NLIMITS] = {
442}; 415};
443 416
444/* Display limits for a process */ 417/* Display limits for a process */
445static int proc_pid_limits(struct task_struct *task, char *buffer) 418static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
419 struct pid *pid, struct task_struct *task)
446{ 420{
447 unsigned int i; 421 unsigned int i;
448 int count = 0;
449 unsigned long flags; 422 unsigned long flags;
450 char *bufptr = buffer;
451 423
452 struct rlimit rlim[RLIM_NLIMITS]; 424 struct rlimit rlim[RLIM_NLIMITS];
453 425
@@ -459,35 +431,34 @@ static int proc_pid_limits(struct task_struct *task, char *buffer)
459 /* 431 /*
460 * print the file header 432 * print the file header
461 */ 433 */
462 count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n", 434 seq_printf(m, "%-25s %-20s %-20s %-10s\n",
463 "Limit", "Soft Limit", "Hard Limit", "Units"); 435 "Limit", "Soft Limit", "Hard Limit", "Units");
464 436
465 for (i = 0; i < RLIM_NLIMITS; i++) { 437 for (i = 0; i < RLIM_NLIMITS; i++) {
466 if (rlim[i].rlim_cur == RLIM_INFINITY) 438 if (rlim[i].rlim_cur == RLIM_INFINITY)
467 count += sprintf(&bufptr[count], "%-25s %-20s ", 439 seq_printf(m, "%-25s %-20s ",
468 lnames[i].name, "unlimited"); 440 lnames[i].name, "unlimited");
469 else 441 else
470 count += sprintf(&bufptr[count], "%-25s %-20lu ", 442 seq_printf(m, "%-25s %-20lu ",
471 lnames[i].name, rlim[i].rlim_cur); 443 lnames[i].name, rlim[i].rlim_cur);
472 444
473 if (rlim[i].rlim_max == RLIM_INFINITY) 445 if (rlim[i].rlim_max == RLIM_INFINITY)
474 count += sprintf(&bufptr[count], "%-20s ", "unlimited"); 446 seq_printf(m, "%-20s ", "unlimited");
475 else 447 else
476 count += sprintf(&bufptr[count], "%-20lu ", 448 seq_printf(m, "%-20lu ", rlim[i].rlim_max);
477 rlim[i].rlim_max);
478 449
479 if (lnames[i].unit) 450 if (lnames[i].unit)
480 count += sprintf(&bufptr[count], "%-10s\n", 451 seq_printf(m, "%-10s\n", lnames[i].unit);
481 lnames[i].unit);
482 else 452 else
483 count += sprintf(&bufptr[count], "\n"); 453 seq_putc(m, '\n');
484 } 454 }
485 455
486 return count; 456 return 0;
487} 457}
488 458
489#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 459#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
490static int proc_pid_syscall(struct task_struct *task, char *buffer) 460static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
461 struct pid *pid, struct task_struct *task)
491{ 462{
492 long nr; 463 long nr;
493 unsigned long args[6], sp, pc; 464 unsigned long args[6], sp, pc;
@@ -496,11 +467,11 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
496 return res; 467 return res;
497 468
498 if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) 469 if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
499 res = sprintf(buffer, "running\n"); 470 seq_puts(m, "running\n");
500 else if (nr < 0) 471 else if (nr < 0)
501 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); 472 seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
502 else 473 else
503 res = sprintf(buffer, 474 seq_printf(m,
504 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", 475 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
505 nr, 476 nr,
506 args[0], args[1], args[2], args[3], args[4], args[5], 477 args[0], args[1], args[2], args[3], args[4], args[5],
@@ -598,43 +569,6 @@ static const struct inode_operations proc_def_inode_operations = {
598 .setattr = proc_setattr, 569 .setattr = proc_setattr,
599}; 570};
600 571
601#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
602
603static ssize_t proc_info_read(struct file * file, char __user * buf,
604 size_t count, loff_t *ppos)
605{
606 struct inode * inode = file_inode(file);
607 unsigned long page;
608 ssize_t length;
609 struct task_struct *task = get_proc_task(inode);
610
611 length = -ESRCH;
612 if (!task)
613 goto out_no_task;
614
615 if (count > PROC_BLOCK_SIZE)
616 count = PROC_BLOCK_SIZE;
617
618 length = -ENOMEM;
619 if (!(page = __get_free_page(GFP_TEMPORARY)))
620 goto out;
621
622 length = PROC_I(inode)->op.proc_read(task, (char*)page);
623
624 if (length >= 0)
625 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
626 free_page(page);
627out:
628 put_task_struct(task);
629out_no_task:
630 return length;
631}
632
633static const struct file_operations proc_info_file_operations = {
634 .read = proc_info_read,
635 .llseek = generic_file_llseek,
636};
637
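
With every user of op.proc_read converted, the single-page read path above can go: it capped output at PROC_BLOCK_SIZE (3KB) and silently truncated anything longer, whereas the seq_file core reallocates a larger buffer and retries on overflow. The converted entries are wired up through the ONE() table macro; its definition in base.c looks roughly like this (quoted from memory of this era's tree, so treat the exact spelling as an assumption):

	#define ONE(NAME, MODE, show)				\
		NOD(NAME, (S_IFREG|(MODE)),			\
			NULL, &proc_single_file_operations,	\
			{ .proc_show = show } )
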
638static int proc_single_show(struct seq_file *m, void *v) 572static int proc_single_show(struct seq_file *m, void *v)
639{ 573{
640 struct inode *inode = m->private; 574 struct inode *inode = m->private;
@@ -667,29 +601,35 @@ static const struct file_operations proc_single_file_operations = {
667 .release = single_release, 601 .release = single_release,
668}; 602};
669 603
670static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) 604
605struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
671{ 606{
672 struct task_struct *task = get_proc_task(file_inode(file)); 607 struct task_struct *task = get_proc_task(inode);
673 struct mm_struct *mm; 608 struct mm_struct *mm = ERR_PTR(-ESRCH);
674 609
675 if (!task) 610 if (task) {
676 return -ESRCH; 611 mm = mm_access(task, mode);
612 put_task_struct(task);
677 613
678 mm = mm_access(task, mode); 614 if (!IS_ERR_OR_NULL(mm)) {
679 put_task_struct(task); 615 /* ensure this mm_struct can't be freed */
616 atomic_inc(&mm->mm_count);
617 /* but do not pin its memory */
618 mmput(mm);
619 }
620 }
621
622 return mm;
623}
624
625static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
626{
627 struct mm_struct *mm = proc_mem_open(inode, mode);
680 628
681 if (IS_ERR(mm)) 629 if (IS_ERR(mm))
682 return PTR_ERR(mm); 630 return PTR_ERR(mm);
683 631
684 if (mm) {
685 /* ensure this mm_struct can't be freed */
686 atomic_inc(&mm->mm_count);
687 /* but do not pin its memory */
688 mmput(mm);
689 }
690
691 file->private_data = mm; 632 file->private_data = mm;
692
693 return 0; 633 return 0;
694} 634}
695 635
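
proc_mem_open() is factored out of __mem_open() so the maps, smaps, numa_maps and pagemap openers later in this series can share it. The trick it encapsulates: after the mm_access() permission check, mm_count is raised and mm_users dropped, so the mm_struct stays allocated for as long as the file is open without pinning the task's address space. A sketch of the resulting caller pattern (example_open is illustrative):

	static int example_open(struct inode *inode, struct file *file)
	{
		struct mm_struct *mm = proc_mem_open(inode, PTRACE_MODE_READ);

		if (IS_ERR(mm))
			return PTR_ERR(mm);
		/* mm may be NULL for kernel threads; readers must
		 * atomic_inc_not_zero(&mm->mm_users) before walking it,
		 * and the holder must mmdrop() the reference on release */
		file->private_data = mm;
		return 0;
	}
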
@@ -1625,7 +1565,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1625 put_task_struct(task); 1565 put_task_struct(task);
1626 return 1; 1566 return 1;
1627 } 1567 }
1628 d_drop(dentry);
1629 return 0; 1568 return 0;
1630} 1569}
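
The explicit d_drop() is gone because the VFS now invalidates a dentry itself whenever ->d_revalidate() returns 0 (the same cleanup is applied to tid_fd_revalidate() in fs/proc/fd.c below). A revalidate method therefore reduces to a pure predicate; a sketch, with still_valid() standing in for whatever check applies:

	static int example_revalidate(struct dentry *dentry, unsigned int flags)
	{
		if (flags & LOOKUP_RCU)
			return -ECHILD;
		if (still_valid(dentry))	/* hypothetical helper */
			return 1;
		return 0;	/* the VFS drops and invalidates the dentry */
	}
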
1631 1570
@@ -1762,9 +1701,6 @@ out:
1762 put_task_struct(task); 1701 put_task_struct(task);
1763 1702
1764out_notask: 1703out_notask:
1765 if (status <= 0)
1766 d_drop(dentry);
1767
1768 return status; 1704 return status;
1769} 1705}
1770 1706
@@ -2056,7 +1992,7 @@ static int show_timer(struct seq_file *m, void *v)
2056 struct k_itimer *timer; 1992 struct k_itimer *timer;
2057 struct timers_private *tp = m->private; 1993 struct timers_private *tp = m->private;
2058 int notify; 1994 int notify;
2059 static char *nstr[] = { 1995 static const char * const nstr[] = {
2060 [SIGEV_SIGNAL] = "signal", 1996 [SIGEV_SIGNAL] = "signal",
2061 [SIGEV_NONE] = "none", 1997 [SIGEV_NONE] = "none",
2062 [SIGEV_THREAD] = "thread", 1998 [SIGEV_THREAD] = "thread",
@@ -2392,7 +2328,7 @@ static const struct file_operations proc_coredump_filter_operations = {
2392#endif 2328#endif
2393 2329
2394#ifdef CONFIG_TASK_IO_ACCOUNTING 2330#ifdef CONFIG_TASK_IO_ACCOUNTING
2395static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2331static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
2396{ 2332{
2397 struct task_io_accounting acct = task->ioac; 2333 struct task_io_accounting acct = task->ioac;
2398 unsigned long flags; 2334 unsigned long flags;
@@ -2416,7 +2352,7 @@ static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2416 2352
2417 unlock_task_sighand(task, &flags); 2353 unlock_task_sighand(task, &flags);
2418 } 2354 }
2419 result = sprintf(buffer, 2355 result = seq_printf(m,
2420 "rchar: %llu\n" 2356 "rchar: %llu\n"
2421 "wchar: %llu\n" 2357 "wchar: %llu\n"
2422 "syscr: %llu\n" 2358 "syscr: %llu\n"
@@ -2436,20 +2372,22 @@ out_unlock:
2436 return result; 2372 return result;
2437} 2373}
2438 2374
2439static int proc_tid_io_accounting(struct task_struct *task, char *buffer) 2375static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2376 struct pid *pid, struct task_struct *task)
2440{ 2377{
2441 return do_io_accounting(task, buffer, 0); 2378 return do_io_accounting(task, m, 0);
2442} 2379}
2443 2380
2444static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) 2381static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
2382 struct pid *pid, struct task_struct *task)
2445{ 2383{
2446 return do_io_accounting(task, buffer, 1); 2384 return do_io_accounting(task, m, 1);
2447} 2385}
2448#endif /* CONFIG_TASK_IO_ACCOUNTING */ 2386#endif /* CONFIG_TASK_IO_ACCOUNTING */
2449 2387
2450#ifdef CONFIG_USER_NS 2388#ifdef CONFIG_USER_NS
2451static int proc_id_map_open(struct inode *inode, struct file *file, 2389static int proc_id_map_open(struct inode *inode, struct file *file,
2452 struct seq_operations *seq_ops) 2390 const struct seq_operations *seq_ops)
2453{ 2391{
2454 struct user_namespace *ns = NULL; 2392 struct user_namespace *ns = NULL;
2455 struct task_struct *task; 2393 struct task_struct *task;
@@ -2557,10 +2495,10 @@ static const struct pid_entry tgid_base_stuff[] = {
2557 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2495 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2558#endif 2496#endif
2559 REG("environ", S_IRUSR, proc_environ_operations), 2497 REG("environ", S_IRUSR, proc_environ_operations),
2560 INF("auxv", S_IRUSR, proc_pid_auxv), 2498 ONE("auxv", S_IRUSR, proc_pid_auxv),
2561 ONE("status", S_IRUGO, proc_pid_status), 2499 ONE("status", S_IRUGO, proc_pid_status),
2562 ONE("personality", S_IRUSR, proc_pid_personality), 2500 ONE("personality", S_IRUSR, proc_pid_personality),
2563 INF("limits", S_IRUGO, proc_pid_limits), 2501 ONE("limits", S_IRUGO, proc_pid_limits),
2564#ifdef CONFIG_SCHED_DEBUG 2502#ifdef CONFIG_SCHED_DEBUG
2565 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2503 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2566#endif 2504#endif
@@ -2569,9 +2507,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2569#endif 2507#endif
2570 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2508 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2571#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2509#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2572 INF("syscall", S_IRUSR, proc_pid_syscall), 2510 ONE("syscall", S_IRUSR, proc_pid_syscall),
2573#endif 2511#endif
2574 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2512 ONE("cmdline", S_IRUGO, proc_pid_cmdline),
2575 ONE("stat", S_IRUGO, proc_tgid_stat), 2513 ONE("stat", S_IRUGO, proc_tgid_stat),
2576 ONE("statm", S_IRUGO, proc_pid_statm), 2514 ONE("statm", S_IRUGO, proc_pid_statm),
2577 REG("maps", S_IRUGO, proc_pid_maps_operations), 2515 REG("maps", S_IRUGO, proc_pid_maps_operations),
@@ -2594,24 +2532,24 @@ static const struct pid_entry tgid_base_stuff[] = {
2594 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2532 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2595#endif 2533#endif
2596#ifdef CONFIG_KALLSYMS 2534#ifdef CONFIG_KALLSYMS
2597 INF("wchan", S_IRUGO, proc_pid_wchan), 2535 ONE("wchan", S_IRUGO, proc_pid_wchan),
2598#endif 2536#endif
2599#ifdef CONFIG_STACKTRACE 2537#ifdef CONFIG_STACKTRACE
2600 ONE("stack", S_IRUSR, proc_pid_stack), 2538 ONE("stack", S_IRUSR, proc_pid_stack),
2601#endif 2539#endif
2602#ifdef CONFIG_SCHEDSTATS 2540#ifdef CONFIG_SCHEDSTATS
2603 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2541 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
2604#endif 2542#endif
2605#ifdef CONFIG_LATENCYTOP 2543#ifdef CONFIG_LATENCYTOP
2606 REG("latency", S_IRUGO, proc_lstats_operations), 2544 REG("latency", S_IRUGO, proc_lstats_operations),
2607#endif 2545#endif
2608#ifdef CONFIG_PROC_PID_CPUSET 2546#ifdef CONFIG_PROC_PID_CPUSET
2609 REG("cpuset", S_IRUGO, proc_cpuset_operations), 2547 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2610#endif 2548#endif
2611#ifdef CONFIG_CGROUPS 2549#ifdef CONFIG_CGROUPS
2612 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2550 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2613#endif 2551#endif
2614 INF("oom_score", S_IRUGO, proc_oom_score), 2552 ONE("oom_score", S_IRUGO, proc_oom_score),
2615 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2553 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2616 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2554 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2617#ifdef CONFIG_AUDITSYSCALL 2555#ifdef CONFIG_AUDITSYSCALL
@@ -2625,10 +2563,10 @@ static const struct pid_entry tgid_base_stuff[] = {
2625 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), 2563 REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2626#endif 2564#endif
2627#ifdef CONFIG_TASK_IO_ACCOUNTING 2565#ifdef CONFIG_TASK_IO_ACCOUNTING
2628 INF("io", S_IRUSR, proc_tgid_io_accounting), 2566 ONE("io", S_IRUSR, proc_tgid_io_accounting),
2629#endif 2567#endif
2630#ifdef CONFIG_HARDWALL 2568#ifdef CONFIG_HARDWALL
2631 INF("hardwall", S_IRUGO, proc_pid_hardwall), 2569 ONE("hardwall", S_IRUGO, proc_pid_hardwall),
2632#endif 2570#endif
2633#ifdef CONFIG_USER_NS 2571#ifdef CONFIG_USER_NS
2634 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2572 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@ -2676,8 +2614,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2676 /* no ->d_hash() rejects on procfs */ 2614 /* no ->d_hash() rejects on procfs */
2677 dentry = d_hash_and_lookup(mnt->mnt_root, &name); 2615 dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2678 if (dentry) { 2616 if (dentry) {
2679 shrink_dcache_parent(dentry); 2617 d_invalidate(dentry);
2680 d_drop(dentry);
2681 dput(dentry); 2618 dput(dentry);
2682 } 2619 }
2683 2620
@@ -2697,8 +2634,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2697 name.len = snprintf(buf, sizeof(buf), "%d", pid); 2634 name.len = snprintf(buf, sizeof(buf), "%d", pid);
2698 dentry = d_hash_and_lookup(dir, &name); 2635 dentry = d_hash_and_lookup(dir, &name);
2699 if (dentry) { 2636 if (dentry) {
2700 shrink_dcache_parent(dentry); 2637 d_invalidate(dentry);
2701 d_drop(dentry);
2702 dput(dentry); 2638 dput(dentry);
2703 } 2639 }
2704 2640
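
Same simplification in both proc_flush_task_mnt() hunks: d_invalidate() now unhashes the dentry and shrinks its children in one call, so the open-coded shrink_dcache_parent() + d_drop() pair is redundant.
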
@@ -2780,12 +2716,12 @@ out:
2780 2716
2781struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 2717struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
2782{ 2718{
2783 int result = 0; 2719 int result = -ENOENT;
2784 struct task_struct *task; 2720 struct task_struct *task;
2785 unsigned tgid; 2721 unsigned tgid;
2786 struct pid_namespace *ns; 2722 struct pid_namespace *ns;
2787 2723
2788 tgid = name_to_int(dentry); 2724 tgid = name_to_int(&dentry->d_name);
2789 if (tgid == ~0U) 2725 if (tgid == ~0U)
2790 goto out; 2726 goto out;
2791 2727
@@ -2847,7 +2783,7 @@ retry:
2847 return iter; 2783 return iter;
2848} 2784}
2849 2785
2850#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 1) 2786#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)
2851 2787
2852/* for the /proc/ directory itself, after non-process stuff has been done */ 2788/* for the /proc/ directory itself, after non-process stuff has been done */
2853int proc_pid_readdir(struct file *file, struct dir_context *ctx) 2789int proc_pid_readdir(struct file *file, struct dir_context *ctx)
@@ -2859,14 +2795,19 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx)
2859 if (pos >= PID_MAX_LIMIT + TGID_OFFSET) 2795 if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
2860 return 0; 2796 return 0;
2861 2797
2862 if (pos == TGID_OFFSET - 1) { 2798 if (pos == TGID_OFFSET - 2) {
2863 struct inode *inode = ns->proc_self->d_inode; 2799 struct inode *inode = ns->proc_self->d_inode;
2864 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) 2800 if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
2865 return 0; 2801 return 0;
2866 iter.tgid = 0; 2802 ctx->pos = pos = pos + 1;
2867 } else {
2868 iter.tgid = pos - TGID_OFFSET;
2869 } 2803 }
2804 if (pos == TGID_OFFSET - 1) {
2805 struct inode *inode = ns->proc_thread_self->d_inode;
2806 if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
2807 return 0;
2808 ctx->pos = pos = pos + 1;
2809 }
2810 iter.tgid = pos - TGID_OFFSET;
2870 iter.task = NULL; 2811 iter.task = NULL;
2871 for (iter = next_tgid(ns, iter); 2812 for (iter = next_tgid(ns, iter);
2872 iter.task; 2813 iter.task;
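
TGID_OFFSET grows by one because the readdir path now emits two fixed symlinks before the PID entries. The position mapping after this hunk, in miniature:

	/*
	 *   pos == TGID_OFFSET - 2  ->  emit "self"
	 *   pos == TGID_OFFSET - 1  ->  emit "thread-self"
	 *   pos >= TGID_OFFSET      ->  iter.tgid = pos - TGID_OFFSET
	 */

Advancing ctx->pos after each fixed entry lets a buffer-full dir_emit() restart exactly where it left off.
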
@@ -2895,19 +2836,22 @@ static const struct pid_entry tid_base_stuff[] = {
2895 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2836 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2896 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2837 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2897 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2838 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2839#ifdef CONFIG_NET
2840 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2841#endif
2898 REG("environ", S_IRUSR, proc_environ_operations), 2842 REG("environ", S_IRUSR, proc_environ_operations),
2899 INF("auxv", S_IRUSR, proc_pid_auxv), 2843 ONE("auxv", S_IRUSR, proc_pid_auxv),
2900 ONE("status", S_IRUGO, proc_pid_status), 2844 ONE("status", S_IRUGO, proc_pid_status),
2901 ONE("personality", S_IRUSR, proc_pid_personality), 2845 ONE("personality", S_IRUSR, proc_pid_personality),
2902 INF("limits", S_IRUGO, proc_pid_limits), 2846 ONE("limits", S_IRUGO, proc_pid_limits),
2903#ifdef CONFIG_SCHED_DEBUG 2847#ifdef CONFIG_SCHED_DEBUG
2904 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2848 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2905#endif 2849#endif
2906 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2850 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2907#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2851#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2908 INF("syscall", S_IRUSR, proc_pid_syscall), 2852 ONE("syscall", S_IRUSR, proc_pid_syscall),
2909#endif 2853#endif
2910 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2854 ONE("cmdline", S_IRUGO, proc_pid_cmdline),
2911 ONE("stat", S_IRUGO, proc_tid_stat), 2855 ONE("stat", S_IRUGO, proc_tid_stat),
2912 ONE("statm", S_IRUGO, proc_pid_statm), 2856 ONE("statm", S_IRUGO, proc_pid_statm),
2913 REG("maps", S_IRUGO, proc_tid_maps_operations), 2857 REG("maps", S_IRUGO, proc_tid_maps_operations),
@@ -2932,24 +2876,24 @@ static const struct pid_entry tid_base_stuff[] = {
2932 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2876 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2933#endif 2877#endif
2934#ifdef CONFIG_KALLSYMS 2878#ifdef CONFIG_KALLSYMS
2935 INF("wchan", S_IRUGO, proc_pid_wchan), 2879 ONE("wchan", S_IRUGO, proc_pid_wchan),
2936#endif 2880#endif
2937#ifdef CONFIG_STACKTRACE 2881#ifdef CONFIG_STACKTRACE
2938 ONE("stack", S_IRUSR, proc_pid_stack), 2882 ONE("stack", S_IRUSR, proc_pid_stack),
2939#endif 2883#endif
2940#ifdef CONFIG_SCHEDSTATS 2884#ifdef CONFIG_SCHEDSTATS
2941 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2885 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
2942#endif 2886#endif
2943#ifdef CONFIG_LATENCYTOP 2887#ifdef CONFIG_LATENCYTOP
2944 REG("latency", S_IRUGO, proc_lstats_operations), 2888 REG("latency", S_IRUGO, proc_lstats_operations),
2945#endif 2889#endif
2946#ifdef CONFIG_PROC_PID_CPUSET 2890#ifdef CONFIG_PROC_PID_CPUSET
2947 REG("cpuset", S_IRUGO, proc_cpuset_operations), 2891 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2948#endif 2892#endif
2949#ifdef CONFIG_CGROUPS 2893#ifdef CONFIG_CGROUPS
2950 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2894 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2951#endif 2895#endif
2952 INF("oom_score", S_IRUGO, proc_oom_score), 2896 ONE("oom_score", S_IRUGO, proc_oom_score),
2953 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2897 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2954 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2898 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2955#ifdef CONFIG_AUDITSYSCALL 2899#ifdef CONFIG_AUDITSYSCALL
@@ -2960,10 +2904,10 @@ static const struct pid_entry tid_base_stuff[] = {
2960 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 2904 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2961#endif 2905#endif
2962#ifdef CONFIG_TASK_IO_ACCOUNTING 2906#ifdef CONFIG_TASK_IO_ACCOUNTING
2963 INF("io", S_IRUSR, proc_tid_io_accounting), 2907 ONE("io", S_IRUSR, proc_tid_io_accounting),
2964#endif 2908#endif
2965#ifdef CONFIG_HARDWALL 2909#ifdef CONFIG_HARDWALL
2966 INF("hardwall", S_IRUGO, proc_pid_hardwall), 2910 ONE("hardwall", S_IRUGO, proc_pid_hardwall),
2967#endif 2911#endif
2968#ifdef CONFIG_USER_NS 2912#ifdef CONFIG_USER_NS
2969 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2913 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
@@ -3033,7 +2977,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
3033 if (!leader) 2977 if (!leader)
3034 goto out_no_task; 2978 goto out_no_task;
3035 2979
3036 tid = name_to_int(dentry); 2980 tid = name_to_int(&dentry->d_name);
3037 if (tid == ~0U) 2981 if (tid == ~0U)
3038 goto out; 2982 goto out;
3039 2983
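
Switching name_to_int() from a dentry to a qstr (see the fs/proc/internal.h hunk below) is what enables the generic.c hardening: callers that only have a name string can now build a qstr and ask whether it is a well-formed number. A sketch:

	struct qstr q = QSTR_INIT("123", 3);	/* example name, not from the patch */
	unsigned v = name_to_int(&q);		/* ~0U unless a canonical decimal */
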
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 0788d093f5d8..e11d7c590bb0 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -129,8 +129,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
129 } 129 }
130 put_task_struct(task); 130 put_task_struct(task);
131 } 131 }
132
133 d_drop(dentry);
134 return 0; 132 return 0;
135} 133}
136 134
@@ -206,7 +204,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir,
206{ 204{
207 struct task_struct *task = get_proc_task(dir); 205 struct task_struct *task = get_proc_task(dir);
208 int result = -ENOENT; 206 int result = -ENOENT;
209 unsigned fd = name_to_int(dentry); 207 unsigned fd = name_to_int(&dentry->d_name);
210 208
211 if (!task) 209 if (!task)
212 goto out_no_task; 210 goto out_no_task;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index b7f268eb5f45..317b72641ebf 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -27,7 +27,7 @@
27 27
28#include "internal.h" 28#include "internal.h"
29 29
30DEFINE_SPINLOCK(proc_subdir_lock); 30static DEFINE_SPINLOCK(proc_subdir_lock);
31 31
32static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) 32static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
33{ 33{
@@ -330,28 +330,28 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
330 nlink_t nlink) 330 nlink_t nlink)
331{ 331{
332 struct proc_dir_entry *ent = NULL; 332 struct proc_dir_entry *ent = NULL;
333 const char *fn = name; 333 const char *fn;
334 unsigned int len; 334 struct qstr qstr;
335
336 /* make sure name is valid */
337 if (!name || !strlen(name))
338 goto out;
339 335
340 if (xlate_proc_name(name, parent, &fn) != 0) 336 if (xlate_proc_name(name, parent, &fn) != 0)
341 goto out; 337 goto out;
338 qstr.name = fn;
339 qstr.len = strlen(fn);
340 if (qstr.len == 0 || qstr.len >= 256) {
341 WARN(1, "name len %u\n", qstr.len);
342 return NULL;
343 }
344 if (*parent == &proc_root && name_to_int(&qstr) != ~0U) {
345 WARN(1, "create '/proc/%s' by hand\n", qstr.name);
346 return NULL;
347 }
342 348
343 /* At this point there must not be any '/' characters beyond *fn */ 349 ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL);
344 if (strchr(fn, '/'))
345 goto out;
346
347 len = strlen(fn);
348
349 ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
350 if (!ent) 350 if (!ent)
351 goto out; 351 goto out;
352 352
353 memcpy(ent->name, fn, len + 1); 353 memcpy(ent->name, fn, qstr.len + 1);
354 ent->namelen = len; 354 ent->namelen = qstr.len;
355 ent->mode = mode; 355 ent->mode = mode;
356 ent->nlink = nlink; 356 ent->nlink = nlink;
357 atomic_set(&ent->count, 1); 357 atomic_set(&ent->count, 1);
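
__proc_create() now rejects two classes of bad names with a WARN instead of quietly proceeding: empty or overlong names, and top-level entries whose name parses as a number, since those would collide with PID directories (and would break the lookup reordering in fs/proc/root.c below). A usage sketch of what the guard refuses (example_fops is hypothetical):

	/* would shadow /proc/42 if some process got that PID */
	if (!proc_create("42", 0444, NULL, &example_fops))
		pr_warn("top-level numeric /proc name refused, as expected\n");
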
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 0adbc02d60e3..333080d7a671 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -442,6 +442,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
442int proc_fill_super(struct super_block *s) 442int proc_fill_super(struct super_block *s)
443{ 443{
444 struct inode *root_inode; 444 struct inode *root_inode;
445 int ret;
445 446
446 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; 447 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
447 s->s_blocksize = 1024; 448 s->s_blocksize = 1024;
@@ -463,5 +464,9 @@ int proc_fill_super(struct super_block *s)
463 return -ENOMEM; 464 return -ENOMEM;
464 } 465 }
465 466
466 return proc_setup_self(s); 467 ret = proc_setup_self(s);
468 if (ret) {
469 return ret;
470 }
471 return proc_setup_thread_self(s);
467} 472}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 3ab6d14e71c5..aa7a0ee182e1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -52,7 +52,6 @@ struct proc_dir_entry {
52 52
53union proc_op { 53union proc_op {
54 int (*proc_get_link)(struct dentry *, struct path *); 54 int (*proc_get_link)(struct dentry *, struct path *);
55 int (*proc_read)(struct task_struct *task, char *page);
56 int (*proc_show)(struct seq_file *m, 55 int (*proc_show)(struct seq_file *m,
57 struct pid_namespace *ns, struct pid *pid, 56 struct pid_namespace *ns, struct pid *pid,
58 struct task_struct *task); 57 struct task_struct *task);
@@ -112,10 +111,10 @@ static inline int task_dumpable(struct task_struct *task)
112 return 0; 111 return 0;
113} 112}
114 113
115static inline unsigned name_to_int(struct dentry *dentry) 114static inline unsigned name_to_int(const struct qstr *qstr)
116{ 115{
117 const char *name = dentry->d_name.name; 116 const char *name = qstr->name;
118 int len = dentry->d_name.len; 117 int len = qstr->len;
119 unsigned n = 0; 118 unsigned n = 0;
120 119
121 if (len > 1 && *name == '0') 120 if (len > 1 && *name == '0')
@@ -178,8 +177,6 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i
178/* 177/*
179 * generic.c 178 * generic.c
180 */ 179 */
181extern spinlock_t proc_subdir_lock;
182
183extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 180extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
184extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, 181extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *,
185 struct dentry *); 182 struct dentry *);
@@ -234,6 +231,12 @@ static inline int proc_net_init(void) { return 0; }
234extern int proc_setup_self(struct super_block *); 231extern int proc_setup_self(struct super_block *);
235 232
236/* 233/*
234 * proc_thread_self.c
235 */
236extern int proc_setup_thread_self(struct super_block *);
237extern void proc_thread_self_init(void);
238
239/*
237 * proc_sysctl.c 240 * proc_sysctl.c
238 */ 241 */
239#ifdef CONFIG_PROC_SYSCTL 242#ifdef CONFIG_PROC_SYSCTL
@@ -265,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *);
265 * task_[no]mmu.c 268 * task_[no]mmu.c
266 */ 269 */
267struct proc_maps_private { 270struct proc_maps_private {
268 struct pid *pid; 271 struct inode *inode;
269 struct task_struct *task; 272 struct task_struct *task;
273 struct mm_struct *mm;
270#ifdef CONFIG_MMU 274#ifdef CONFIG_MMU
271 struct vm_area_struct *tail_vma; 275 struct vm_area_struct *tail_vma;
272#endif 276#endif
@@ -275,6 +279,8 @@ struct proc_maps_private {
275#endif 279#endif
276}; 280};
277 281
282struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode);
283
278extern const struct file_operations proc_pid_maps_operations; 284extern const struct file_operations proc_pid_maps_operations;
279extern const struct file_operations proc_tid_maps_operations; 285extern const struct file_operations proc_tid_maps_operations;
280extern const struct file_operations proc_pid_numa_maps_operations; 286extern const struct file_operations proc_pid_numa_maps_operations;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 39e6ef32f0bd..91a4e6426321 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -172,7 +172,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
172 172
173 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; 173 start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK;
174 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; 174 end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1;
175 end = ALIGN(end, PAGE_SIZE); 175 end = PAGE_ALIGN(end);
176 /* overlap check (because we have to align page */ 176 /* overlap check (because we have to align page */
177 list_for_each_entry(tmp, head, list) { 177 list_for_each_entry(tmp, head, list) {
178 if (tmp->type != KCORE_VMEMMAP) 178 if (tmp->type != KCORE_VMEMMAP)
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void)
610struct kcore_list kcore_modules; 610struct kcore_list kcore_modules;
611static void __init add_modules_range(void) 611static void __init add_modules_range(void)
612{ 612{
613 kclist_add(&kcore_modules, (void *)MODULES_VADDR, 613 if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) {
614 kclist_add(&kcore_modules, (void *)MODULES_VADDR,
614 MODULES_END - MODULES_VADDR, KCORE_VMALLOC); 615 MODULES_END - MODULES_VADDR, KCORE_VMALLOC);
616 }
615} 617}
616#else 618#else
617static void __init add_modules_range(void) 619static void __init add_modules_range(void)
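
On architectures where the module area is simply carved out of the vmalloc range, MODULES_VADDR/MODULES_END alias VMALLOC_START/VMALLOC_END, and registering both would put two overlapping KCORE_VMALLOC entries on the list; the new guard skips the duplicate.
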
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 7445af0b1aa3..aa1eee06420f 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
168 K(global_page_state(NR_WRITEBACK)), 168 K(global_page_state(NR_WRITEBACK)),
169 K(global_page_state(NR_ANON_PAGES)), 169 K(global_page_state(NR_ANON_PAGES)),
170 K(global_page_state(NR_FILE_MAPPED)), 170 K(global_page_state(NR_FILE_MAPPED)),
171 K(global_page_state(NR_SHMEM)), 171 K(i.sharedram),
172 K(global_page_state(NR_SLAB_RECLAIMABLE) + 172 K(global_page_state(NR_SLAB_RECLAIMABLE) +
173 global_page_state(NR_SLAB_UNRECLAIMABLE)), 173 global_page_state(NR_SLAB_UNRECLAIMABLE)),
174 K(global_page_state(NR_SLAB_RECLAIMABLE)), 174 K(global_page_state(NR_SLAB_RECLAIMABLE)),
diff --git a/fs/proc/page.c b/fs/proc/page.c
index e647c55275d9..1e3187da1fed 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page)
133 if (PageBuddy(page)) 133 if (PageBuddy(page))
134 u |= 1 << KPF_BUDDY; 134 u |= 1 << KPF_BUDDY;
135 135
136 if (PageBalloon(page))
137 u |= 1 << KPF_BALLOON;
138
136 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 139 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
137 140
138 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 141 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
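
Userspace can observe the new bit through /proc/kpageflags, which exports one u64 bitmask per PFN. A minimal reader, assuming KPF_BALLOON is bit 23 as in this era's include/uapi/linux/kernel-page-flags.h:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t flags;
		int fd = open("/proc/kpageflags", O_RDONLY);

		if (fd < 0)
			return 1;
		/* entry 0 describes PFN 0; index by pfn * sizeof(uint64_t) */
		if (pread(fd, &flags, sizeof(flags), 0) == sizeof(flags))
			printf("balloon: %llu\n",
			       (unsigned long long)((flags >> 23) & 1));
		close(fd);
		return 0;
	}
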
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4677bb7dc7c2..a63af3e0a612 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -113,9 +113,11 @@ static struct net *get_proc_task_net(struct inode *dir)
113 rcu_read_lock(); 113 rcu_read_lock();
114 task = pid_task(proc_pid(dir), PIDTYPE_PID); 114 task = pid_task(proc_pid(dir), PIDTYPE_PID);
115 if (task != NULL) { 115 if (task != NULL) {
116 ns = task_nsproxy(task); 116 task_lock(task);
117 ns = task->nsproxy;
117 if (ns != NULL) 118 if (ns != NULL)
118 net = get_net(ns->net_ns); 119 net = get_net(ns->net_ns);
120 task_unlock(task);
119 } 121 }
120 rcu_read_unlock(); 122 rcu_read_unlock();
121 123
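
task_nsproxy() disappears here because nsproxy is no longer considered safe to dereference under RCU alone; it is protected by task_lock() instead, and whatever namespace is needed must be pinned (here via get_net()) before the lock is dropped.
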
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 71290463a1d3..f92d5dd578a4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -632,7 +632,7 @@ out:
632 return ret; 632 return ret;
633} 633}
634 634
635static int scan(struct ctl_table_header *head, ctl_table *table, 635static int scan(struct ctl_table_header *head, struct ctl_table *table,
636 unsigned long *pos, struct file *file, 636 unsigned long *pos, struct file *file,
637 struct dir_context *ctx) 637 struct dir_context *ctx)
638{ 638{
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index cb761f010300..15f327bed8c6 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -18,7 +18,7 @@
18/* 18/*
19 * The /proc/tty directory inodes... 19 * The /proc/tty directory inodes...
20 */ 20 */
21static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; 21static struct proc_dir_entry *proc_tty_driver;
22 22
23/* 23/*
24 * This is the handler for /proc/tty/drivers 24 * This is the handler for /proc/tty/drivers
@@ -176,7 +176,7 @@ void __init proc_tty_init(void)
176{ 176{
177 if (!proc_mkdir("tty", NULL)) 177 if (!proc_mkdir("tty", NULL))
178 return; 178 return;
179 proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); 179 proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */
180 /* 180 /*
181 * /proc/tty/driver/serial reveals the exact character counts for 181 * /proc/tty/driver/serial reveals the exact character counts for
182 * serial links which is just too easy to abuse for inferring 182 * serial links which is just too easy to abuse for inferring
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5dbadecb234d..094e44d4a6be 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -149,6 +149,8 @@ static void proc_kill_sb(struct super_block *sb)
149 ns = (struct pid_namespace *)sb->s_fs_info; 149 ns = (struct pid_namespace *)sb->s_fs_info;
150 if (ns->proc_self) 150 if (ns->proc_self)
151 dput(ns->proc_self); 151 dput(ns->proc_self);
152 if (ns->proc_thread_self)
153 dput(ns->proc_thread_self);
152 kill_anon_super(sb); 154 kill_anon_super(sb);
153 put_pid_ns(ns); 155 put_pid_ns(ns);
154} 156}
@@ -170,6 +172,7 @@ void __init proc_root_init(void)
170 return; 172 return;
171 173
172 proc_self_init(); 174 proc_self_init();
175 proc_thread_self_init();
173 proc_symlink("mounts", NULL, "self/mounts"); 176 proc_symlink("mounts", NULL, "self/mounts");
174 177
175 proc_net_init(); 178 proc_net_init();
@@ -199,10 +202,10 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
199 202
200static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) 203static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
201{ 204{
202 if (!proc_lookup(dir, dentry, flags)) 205 if (!proc_pid_lookup(dir, dentry, flags))
203 return NULL; 206 return NULL;
204 207
205 return proc_pid_lookup(dir, dentry, flags); 208 return proc_lookup(dir, dentry, flags);
206} 209}
207 210
208static int proc_root_readdir(struct file *file, struct dir_context *ctx) 211static int proc_root_readdir(struct file *file, struct dir_context *ctx)
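
The lookup order flips so that a numeric name is tried as a PID first and only then against registered /proc entries; together with the __proc_create() guard above, this keeps the two name spaces disjoint, and a number under /proc always means a process.
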
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index cfa63ee92c96..4e0388cffe3d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm,
87 87
88#ifdef CONFIG_NUMA 88#ifdef CONFIG_NUMA
89/* 89/*
90 * These functions are for numa_maps but called in generic **maps seq_file 90 * Save get_task_policy() for show_numa_map().
91 * ->start(), ->stop() ops.
92 *
93 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
94 * Each mempolicy object is controlled by reference counting. The problem here
95 * is how to avoid accessing dead mempolicy object.
96 *
97 * Because we're holding mmap_sem while reading seq_file, it's safe to access
98 * each vma's mempolicy, no vma objects will never drop refs to mempolicy.
99 *
100 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
101 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
102 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
103 * gurantee the task never exits under us. But taking task_lock() around
104 * get_vma_plicy() causes lock order problem.
105 *
106 * To access task->mempolicy without lock, we hold a reference count of an
107 * object pointed by task->mempolicy and remember it. This will guarantee
108 * that task->mempolicy points to an alive object or NULL in numa_maps accesses.
109 */ 91 */
110static void hold_task_mempolicy(struct proc_maps_private *priv) 92static void hold_task_mempolicy(struct proc_maps_private *priv)
111{ 93{
112 struct task_struct *task = priv->task; 94 struct task_struct *task = priv->task;
113 95
114 task_lock(task); 96 task_lock(task);
115 priv->task_mempolicy = task->mempolicy; 97 priv->task_mempolicy = get_task_policy(task);
116 mpol_get(priv->task_mempolicy); 98 mpol_get(priv->task_mempolicy);
117 task_unlock(task); 99 task_unlock(task);
118} 100}
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
129} 111}
130#endif 112#endif
131 113
132static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 114static void vma_stop(struct proc_maps_private *priv)
133{ 115{
134 if (vma && vma != priv->tail_vma) { 116 struct mm_struct *mm = priv->mm;
135 struct mm_struct *mm = vma->vm_mm; 117
136 release_task_mempolicy(priv); 118 release_task_mempolicy(priv);
137 up_read(&mm->mmap_sem); 119 up_read(&mm->mmap_sem);
138 mmput(mm); 120 mmput(mm);
139 } 121}
122
123static struct vm_area_struct *
124m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
125{
126 if (vma == priv->tail_vma)
127 return NULL;
128 return vma->vm_next ?: priv->tail_vma;
140} 129}
141 130
142static void *m_start(struct seq_file *m, loff_t *pos) 131static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
132{
133 if (m->count < m->size) /* vma is copied successfully */
134 m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
135}
136
137static void *m_start(struct seq_file *m, loff_t *ppos)
143{ 138{
144 struct proc_maps_private *priv = m->private; 139 struct proc_maps_private *priv = m->private;
145 unsigned long last_addr = m->version; 140 unsigned long last_addr = m->version;
146 struct mm_struct *mm; 141 struct mm_struct *mm;
147 struct vm_area_struct *vma, *tail_vma = NULL; 142 struct vm_area_struct *vma;
148 loff_t l = *pos; 143 unsigned int pos = *ppos;
149
150 /* Clear the per syscall fields in priv */
151 priv->task = NULL;
152 priv->tail_vma = NULL;
153
154 /*
155 * We remember last_addr rather than next_addr to hit with
156 * vmacache most of the time. We have zero last_addr at
157 * the beginning and also after lseek. We will have -1 last_addr
158 * after the end of the vmas.
159 */
160 144
145 /* See m_cache_vma(). Zero at the start or after lseek. */
161 if (last_addr == -1UL) 146 if (last_addr == -1UL)
162 return NULL; 147 return NULL;
163 148
164 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 149 priv->task = get_proc_task(priv->inode);
165 if (!priv->task) 150 if (!priv->task)
166 return ERR_PTR(-ESRCH); 151 return ERR_PTR(-ESRCH);
167 152
168 mm = mm_access(priv->task, PTRACE_MODE_READ); 153 mm = priv->mm;
169 if (!mm || IS_ERR(mm)) 154 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
170 return mm; 155 return NULL;
171 down_read(&mm->mmap_sem);
172 156
173 tail_vma = get_gate_vma(priv->task->mm); 157 down_read(&mm->mmap_sem);
174 priv->tail_vma = tail_vma;
175 hold_task_mempolicy(priv); 158 hold_task_mempolicy(priv);
176 /* Start with last addr hint */ 159 priv->tail_vma = get_gate_vma(mm);
177 vma = find_vma(mm, last_addr); 160
178 if (last_addr && vma) { 161 if (last_addr) {
179 vma = vma->vm_next; 162 vma = find_vma(mm, last_addr);
180 goto out; 163 if (vma && (vma = m_next_vma(priv, vma)))
164 return vma;
181 } 165 }
182 166
183 /* 167 m->version = 0;
184 * Check the vma index is within the range and do 168 if (pos < mm->map_count) {
185 * sequential scan until m_index. 169 for (vma = mm->mmap; pos; pos--) {
186 */ 170 m->version = vma->vm_start;
187 vma = NULL;
188 if ((unsigned long)l < mm->map_count) {
189 vma = mm->mmap;
190 while (l-- && vma)
191 vma = vma->vm_next; 171 vma = vma->vm_next;
192 goto out; 172 }
173 return vma;
193 } 174 }
194 175
195 if (l != mm->map_count) 176 /* we do not bother to update m->version in this case */
196 tail_vma = NULL; /* After gate vma */ 177 if (pos == mm->map_count && priv->tail_vma)
197 178 return priv->tail_vma;
198out:
199 if (vma)
200 return vma;
201 179
202 release_task_mempolicy(priv); 180 vma_stop(priv);
203 /* End of vmas has been reached */ 181 return NULL;
204 m->version = (tail_vma != NULL)? 0: -1UL;
205 up_read(&mm->mmap_sem);
206 mmput(mm);
207 return tail_vma;
208} 182}
209 183
210static void *m_next(struct seq_file *m, void *v, loff_t *pos) 184static void *m_next(struct seq_file *m, void *v, loff_t *pos)
211{ 185{
212 struct proc_maps_private *priv = m->private; 186 struct proc_maps_private *priv = m->private;
213 struct vm_area_struct *vma = v; 187 struct vm_area_struct *next;
214 struct vm_area_struct *tail_vma = priv->tail_vma;
215 188
216 (*pos)++; 189 (*pos)++;
217 if (vma && (vma != tail_vma) && vma->vm_next) 190 next = m_next_vma(priv, v);
218 return vma->vm_next; 191 if (!next)
219 vma_stop(priv, vma); 192 vma_stop(priv);
220 return (vma != tail_vma)? tail_vma: NULL; 193 return next;
221} 194}
222 195
223static void m_stop(struct seq_file *m, void *v) 196static void m_stop(struct seq_file *m, void *v)
224{ 197{
225 struct proc_maps_private *priv = m->private; 198 struct proc_maps_private *priv = m->private;
226 struct vm_area_struct *vma = v;
227 199
228 if (!IS_ERR(vma)) 200 if (!IS_ERR_OR_NULL(v))
229 vma_stop(priv, vma); 201 vma_stop(priv);
230 if (priv->task) 202 if (priv->task) {
231 put_task_struct(priv->task); 203 put_task_struct(priv->task);
204 priv->task = NULL;
205 }
206}
207
208static int proc_maps_open(struct inode *inode, struct file *file,
209 const struct seq_operations *ops, int psize)
210{
211 struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
212
213 if (!priv)
214 return -ENOMEM;
215
216 priv->inode = inode;
217 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
218 if (IS_ERR(priv->mm)) {
219 int err = PTR_ERR(priv->mm);
220
221 seq_release_private(inode, file);
222 return err;
223 }
224
225 return 0;
226}
227
228static int proc_map_release(struct inode *inode, struct file *file)
229{
230 struct seq_file *seq = file->private_data;
231 struct proc_maps_private *priv = seq->private;
232
233 if (priv->mm)
234 mmdrop(priv->mm);
235
236 return seq_release_private(inode, file);
232} 237}
233 238
234static int do_maps_open(struct inode *inode, struct file *file, 239static int do_maps_open(struct inode *inode, struct file *file,
235 const struct seq_operations *ops) 240 const struct seq_operations *ops)
236{ 241{
237 struct proc_maps_private *priv; 242 return proc_maps_open(inode, file, ops,
238 int ret = -ENOMEM; 243 sizeof(struct proc_maps_private));
239 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 244}
240 if (priv) { 245
241 priv->pid = proc_pid(inode); 246static pid_t pid_of_stack(struct proc_maps_private *priv,
242 ret = seq_open(file, ops); 247 struct vm_area_struct *vma, bool is_pid)
243 if (!ret) { 248{
244 struct seq_file *m = file->private_data; 249 struct inode *inode = priv->inode;
245 m->private = priv; 250 struct task_struct *task;
246 } else { 251 pid_t ret = 0;
247 kfree(priv); 252
248 } 253 rcu_read_lock();
254 task = pid_task(proc_pid(inode), PIDTYPE_PID);
255 if (task) {
256 task = task_of_stack(task, vma, is_pid);
257 if (task)
258 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
249 } 259 }
260 rcu_read_unlock();
261
250 return ret; 262 return ret;
251} 263}
252 264
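
The rewritten iterator leans on seq_file's version field as a resume cursor: m_cache_vma() stores the next vma's start address once a record has been emitted in full, and the next m_start() jumps there with find_vma() instead of rescanning the list from the head. The lifetime split also changes: priv->mm is held via mm_count for the whole open (see proc_maps_open()/proc_map_release()), while mm_users and mmap_sem are taken only inside a start/stop cycle. The contract, in miniature:

	/*
	 * m_start: atomic_inc_not_zero(&mm->mm_users), down_read(mmap_sem),
	 *          return the vma at *ppos, resuming at m->version if set
	 * m_next : m_next_vma(), falling through to the gate vma once;
	 *          vma_stop() drops the locks when the walk ends
	 * m_stop : vma_stop() unless already stopped, then release the task
	 */
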
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
256 struct mm_struct *mm = vma->vm_mm; 268 struct mm_struct *mm = vma->vm_mm;
257 struct file *file = vma->vm_file; 269 struct file *file = vma->vm_file;
258 struct proc_maps_private *priv = m->private; 270 struct proc_maps_private *priv = m->private;
259 struct task_struct *task = priv->task;
260 vm_flags_t flags = vma->vm_flags; 271 vm_flags_t flags = vma->vm_flags;
261 unsigned long ino = 0; 272 unsigned long ino = 0;
262 unsigned long long pgoff = 0; 273 unsigned long long pgoff = 0;
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
321 goto done; 332 goto done;
322 } 333 }
323 334
324 tid = vm_is_stack(task, vma, is_pid); 335 tid = pid_of_stack(priv, vma, is_pid);
325
326 if (tid != 0) { 336 if (tid != 0) {
327 /* 337 /*
328 * Thread stack in /proc/PID/task/TID/maps or 338 * Thread stack in /proc/PID/task/TID/maps or
@@ -349,15 +359,8 @@ done:
349 359
350static int show_map(struct seq_file *m, void *v, int is_pid) 360static int show_map(struct seq_file *m, void *v, int is_pid)
351{ 361{
352 struct vm_area_struct *vma = v; 362 show_map_vma(m, v, is_pid);
353 struct proc_maps_private *priv = m->private; 363 m_cache_vma(m, v);
354 struct task_struct *task = priv->task;
355
356 show_map_vma(m, vma, is_pid);
357
358 if (m->count < m->size) /* vma is copied successfully */
359 m->version = (vma != get_gate_vma(task->mm))
360 ? vma->vm_start : 0;
361 return 0; 364 return 0;
362} 365}
363 366
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = {
399 .open = pid_maps_open, 402 .open = pid_maps_open,
400 .read = seq_read, 403 .read = seq_read,
401 .llseek = seq_lseek, 404 .llseek = seq_lseek,
402 .release = seq_release_private, 405 .release = proc_map_release,
403}; 406};
404 407
405const struct file_operations proc_tid_maps_operations = { 408const struct file_operations proc_tid_maps_operations = {
406 .open = tid_maps_open, 409 .open = tid_maps_open,
407 .read = seq_read, 410 .read = seq_read,
408 .llseek = seq_lseek, 411 .llseek = seq_lseek,
409 .release = seq_release_private, 412 .release = proc_map_release,
410}; 413};
411 414
412/* 415/*
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
583 586
584static int show_smap(struct seq_file *m, void *v, int is_pid) 587static int show_smap(struct seq_file *m, void *v, int is_pid)
585{ 588{
586 struct proc_maps_private *priv = m->private;
587 struct task_struct *task = priv->task;
588 struct vm_area_struct *vma = v; 589 struct vm_area_struct *vma = v;
589 struct mem_size_stats mss; 590 struct mem_size_stats mss;
590 struct mm_walk smaps_walk = { 591 struct mm_walk smaps_walk = {
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
637 mss.nonlinear >> 10); 638 mss.nonlinear >> 10);
638 639
639 show_smap_vma_flags(m, vma); 640 show_smap_vma_flags(m, vma);
640 641 m_cache_vma(m, vma);
641 if (m->count < m->size) /* vma is copied successfully */
642 m->version = (vma != get_gate_vma(task->mm))
643 ? vma->vm_start : 0;
644 return 0; 642 return 0;
645} 643}
646 644
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = {
682 .open = pid_smaps_open, 680 .open = pid_smaps_open,
683 .read = seq_read, 681 .read = seq_read,
684 .llseek = seq_lseek, 682 .llseek = seq_lseek,
685 .release = seq_release_private, 683 .release = proc_map_release,
686}; 684};
687 685
688const struct file_operations proc_tid_smaps_operations = { 686const struct file_operations proc_tid_smaps_operations = {
689 .open = tid_smaps_open, 687 .open = tid_smaps_open,
690 .read = seq_read, 688 .read = seq_read,
691 .llseek = seq_lseek, 689 .llseek = seq_lseek,
692 .release = seq_release_private, 690 .release = proc_map_release,
693}; 691};
694 692
695/* 693/*
@@ -829,8 +827,21 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
829 .private = &cp, 827 .private = &cp,
830 }; 828 };
831 down_read(&mm->mmap_sem); 829 down_read(&mm->mmap_sem);
832 if (type == CLEAR_REFS_SOFT_DIRTY) 830 if (type == CLEAR_REFS_SOFT_DIRTY) {
831 for (vma = mm->mmap; vma; vma = vma->vm_next) {
832 if (!(vma->vm_flags & VM_SOFTDIRTY))
833 continue;
834 up_read(&mm->mmap_sem);
835 down_write(&mm->mmap_sem);
836 for (vma = mm->mmap; vma; vma = vma->vm_next) {
837 vma->vm_flags &= ~VM_SOFTDIRTY;
838 vma_set_page_prot(vma);
839 }
840 downgrade_write(&mm->mmap_sem);
841 break;
842 }
833 mmu_notifier_invalidate_range_start(mm, 0, -1); 843 mmu_notifier_invalidate_range_start(mm, 0, -1);
844 }
834 for (vma = mm->mmap; vma; vma = vma->vm_next) { 845 for (vma = mm->mmap; vma; vma = vma->vm_next) {
835 cp.vma = vma; 846 cp.vma = vma;
836 if (is_vm_hugetlb_page(vma)) 847 if (is_vm_hugetlb_page(vma))
@@ -850,10 +861,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
850 continue; 861 continue;
851 if (type == CLEAR_REFS_MAPPED && !vma->vm_file) 862 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
852 continue; 863 continue;
853 if (type == CLEAR_REFS_SOFT_DIRTY) {
854 if (vma->vm_flags & VM_SOFTDIRTY)
855 vma->vm_flags &= ~VM_SOFTDIRTY;
856 }
857 walk_page_range(vma->vm_start, vma->vm_end, 864 walk_page_range(vma->vm_start, vma->vm_end,
858 &clear_refs_walk); 865 &clear_refs_walk);
859 } 866 }
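
Clearing VM_SOFTDIRTY modifies vma flags and, via vma_set_page_prot(), the cached page protections, so it must run under the write lock, while the pte walk below only needs read. Since mmap_sem cannot be upgraded in place, the hunk drops the read lock, retakes it for write, mutates every vma, then uses downgrade_write() so the walk still runs under read without ever fully releasing the lock; the inner loop restarts from mm->mmap after reacquiring because the vma list may have changed in the window.
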
@@ -925,15 +932,39 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
925 struct mm_walk *walk) 932 struct mm_walk *walk)
926{ 933{
927 struct pagemapread *pm = walk->private; 934 struct pagemapread *pm = walk->private;
928 unsigned long addr; 935 unsigned long addr = start;
929 int err = 0; 936 int err = 0;
930 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
931 937
932 for (addr = start; addr < end; addr += PAGE_SIZE) { 938 while (addr < end) {
933 err = add_to_pagemap(addr, &pme, pm); 939 struct vm_area_struct *vma = find_vma(walk->mm, addr);
934 if (err) 940 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
941 /* End of address space hole, which we mark as non-present. */
942 unsigned long hole_end;
943
944 if (vma)
945 hole_end = min(end, vma->vm_start);
946 else
947 hole_end = end;
948
949 for (; addr < hole_end; addr += PAGE_SIZE) {
950 err = add_to_pagemap(addr, &pme, pm);
951 if (err)
952 goto out;
953 }
954
955 if (!vma)
935 break; 956 break;
957
958 /* Addresses in the VMA. */
959 if (vma->vm_flags & VM_SOFTDIRTY)
960 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY);
961 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
962 err = add_to_pagemap(addr, &pme, pm);
963 if (err)
964 goto out;
965 }
936 } 966 }
967out:
937 return err; 968 return err;
938} 969}
939 970
@@ -1005,7 +1036,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1005 spinlock_t *ptl; 1036 spinlock_t *ptl;
1006 pte_t *pte; 1037 pte_t *pte;
1007 int err = 0; 1038 int err = 0;
1008 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1009 1039
1010 /* find the first VMA at or above 'addr' */ 1040 /* find the first VMA at or above 'addr' */
1011 vma = find_vma(walk->mm, addr); 1041 vma = find_vma(walk->mm, addr);
@@ -1019,6 +1049,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1019 1049
1020 for (; addr != end; addr += PAGE_SIZE) { 1050 for (; addr != end; addr += PAGE_SIZE) {
1021 unsigned long offset; 1051 unsigned long offset;
1052 pagemap_entry_t pme;
1022 1053
1023 offset = (addr & ~PAGEMAP_WALK_MASK) >> 1054 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1024 PAGE_SHIFT; 1055 PAGE_SHIFT;
@@ -1033,32 +1064,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1033 1064
1034 if (pmd_trans_unstable(pmd)) 1065 if (pmd_trans_unstable(pmd))
1035 return 0; 1066 return 0;
1036 for (; addr != end; addr += PAGE_SIZE) { 1067
1037 int flags2; 1068 while (1) {
1038 1069 /* End of address space hole, which we mark as non-present. */
1039 /* check to see if we've left 'vma' behind 1070 unsigned long hole_end;
1040 * and need a new, higher one */ 1071
1041 if (vma && (addr >= vma->vm_end)) { 1072 if (vma)
1042 vma = find_vma(walk->mm, addr); 1073 hole_end = min(end, vma->vm_start);
1043 if (vma && (vma->vm_flags & VM_SOFTDIRTY)) 1074 else
1044 flags2 = __PM_SOFT_DIRTY; 1075 hole_end = end;
1045 else 1076
1046 flags2 = 0; 1077 for (; addr < hole_end; addr += PAGE_SIZE) {
1047 pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); 1078 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1079
1080 err = add_to_pagemap(addr, &pme, pm);
1081 if (err)
1082 return err;
1048 } 1083 }
1049 1084
1050 /* check that 'vma' actually covers this address, 1085 if (!vma || vma->vm_start >= end)
1051 * and that it isn't a huge page vma */ 1086 break;
1052 if (vma && (vma->vm_start <= addr) && 1087 /*
1053 !is_vm_hugetlb_page(vma)) { 1088 * We can't possibly be in a hugetlb VMA. In general,
1089 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1090 * the pmd_entry can only be called on addresses in a
1091 * hugetlb if the walk starts in a non-hugetlb VMA and
1092 * spans a hugepage VMA. Since pagemap_read walks are
1093 * PMD-sized and PMD-aligned, this will never be true.
1094 */
1095 BUG_ON(is_vm_hugetlb_page(vma));
1096
1097 /* Addresses in the VMA. */
1098 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1099 pagemap_entry_t pme;
1054 pte = pte_offset_map(pmd, addr); 1100 pte = pte_offset_map(pmd, addr);
1055 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1101 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1056 /* unmap before userspace copy */
1057 pte_unmap(pte); 1102 pte_unmap(pte);
1103 err = add_to_pagemap(addr, &pme, pm);
1104 if (err)
1105 return err;
1058 } 1106 }
1059 err = add_to_pagemap(addr, &pme, pm); 1107
1060 if (err) 1108 if (addr == end)
1061 return err; 1109 break;
1110
1111 vma = find_vma(walk->mm, addr);
1062 } 1112 }
1063 1113
1064 cond_resched(); 1114 cond_resched();
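
Both pagemap walkers now share the same two-phase structure: emit not-present entries for the hole before the current vma, then per-page entries inside it, with the soft-dirty status taken per vma. This addresses the old code's tendency to reuse a pme built from a previous vma's flags, which could misreport unmapped ranges between vmas as soft-dirty. The mapping, in miniature:

	/*
	 *   [addr, vma->vm_start)         ->  PM_NOT_PRESENT (hole)
	 *   [vma->vm_start, vma->vm_end)  ->  pte_to_pagemap_entry() per page,
	 *                                     __PM_SOFT_DIRTY if VM_SOFTDIRTY
	 */
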
@@ -1391,7 +1441,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1391 struct vm_area_struct *vma = v; 1441 struct vm_area_struct *vma = v;
1392 struct numa_maps *md = &numa_priv->md; 1442 struct numa_maps *md = &numa_priv->md;
1393 struct file *file = vma->vm_file; 1443 struct file *file = vma->vm_file;
1394 struct task_struct *task = proc_priv->task;
1395 struct mm_struct *mm = vma->vm_mm; 1444 struct mm_struct *mm = vma->vm_mm;
1396 struct mm_walk walk = {}; 1445 struct mm_walk walk = {};
1397 struct mempolicy *pol; 1446 struct mempolicy *pol;
@@ -1411,9 +1460,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1411 walk.private = md; 1460 walk.private = md;
1412 walk.mm = mm; 1461 walk.mm = mm;
1413 1462
1414 pol = get_vma_policy(task, vma, vma->vm_start); 1463 pol = __get_vma_policy(vma, vma->vm_start);
1415 mpol_to_str(buffer, sizeof(buffer), pol); 1464 if (pol) {
1416 mpol_cond_put(pol); 1465 mpol_to_str(buffer, sizeof(buffer), pol);
1466 mpol_cond_put(pol);
1467 } else {
1468 mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1469 }
1417 1470
1418 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1471 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1419 1472
@@ -1423,7 +1476,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1423 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1476 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1424 seq_puts(m, " heap"); 1477 seq_puts(m, " heap");
1425 } else { 1478 } else {
1426 pid_t tid = vm_is_stack(task, vma, is_pid); 1479 pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
1427 if (tid != 0) { 1480 if (tid != 0) {
1428 /* 1481 /*
1429 * Thread stack in /proc/PID/task/TID/maps or 1482 * Thread stack in /proc/PID/task/TID/maps or
@@ -1471,9 +1524,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1471 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1524 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1472out: 1525out:
1473 seq_putc(m, '\n'); 1526 seq_putc(m, '\n');
1474 1527 m_cache_vma(m, vma);
1475 if (m->count < m->size)
1476 m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0;
1477 return 0; 1528 return 0;
1478} 1529}
1479 1530
@@ -1504,20 +1555,8 @@ static const struct seq_operations proc_tid_numa_maps_op = {
1504static int numa_maps_open(struct inode *inode, struct file *file, 1555static int numa_maps_open(struct inode *inode, struct file *file,
1505 const struct seq_operations *ops) 1556 const struct seq_operations *ops)
1506{ 1557{
1507 struct numa_maps_private *priv; 1558 return proc_maps_open(inode, file, ops,
1508 int ret = -ENOMEM; 1559 sizeof(struct numa_maps_private));
1509 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1510 if (priv) {
1511 priv->proc_maps.pid = proc_pid(inode);
1512 ret = seq_open(file, ops);
1513 if (!ret) {
1514 struct seq_file *m = file->private_data;
1515 m->private = priv;
1516 } else {
1517 kfree(priv);
1518 }
1519 }
1520 return ret;
1521} 1560}
1522 1561
1523static int pid_numa_maps_open(struct inode *inode, struct file *file) 1562static int pid_numa_maps_open(struct inode *inode, struct file *file)
@@ -1534,13 +1573,13 @@ const struct file_operations proc_pid_numa_maps_operations = {
1534 .open = pid_numa_maps_open, 1573 .open = pid_numa_maps_open,
1535 .read = seq_read, 1574 .read = seq_read,
1536 .llseek = seq_lseek, 1575 .llseek = seq_lseek,
1537 .release = seq_release_private, 1576 .release = proc_map_release,
1538}; 1577};
1539 1578
1540const struct file_operations proc_tid_numa_maps_operations = { 1579const struct file_operations proc_tid_numa_maps_operations = {
1541 .open = tid_numa_maps_open, 1580 .open = tid_numa_maps_open,
1542 .read = seq_read, 1581 .read = seq_read,
1543 .llseek = seq_lseek, 1582 .llseek = seq_lseek,
1544 .release = seq_release_private, 1583 .release = proc_map_release,
1545}; 1584};
1546#endif /* CONFIG_NUMA */ 1585#endif /* CONFIG_NUMA */
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 678455d2d683..599ec2e20104 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm,
123 return size; 123 return size;
124} 124}
125 125
126static pid_t pid_of_stack(struct proc_maps_private *priv,
127 struct vm_area_struct *vma, bool is_pid)
128{
129 struct inode *inode = priv->inode;
130 struct task_struct *task;
131 pid_t ret = 0;
132
133 rcu_read_lock();
134 task = pid_task(proc_pid(inode), PIDTYPE_PID);
135 if (task) {
136 task = task_of_stack(task, vma, is_pid);
137 if (task)
138 ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
139 }
140 rcu_read_unlock();
141
142 return ret;
143}
144
126/* 145/*
127 * display a single VMA to a sequenced file 146 * display a single VMA to a sequenced file
128 */ 147 */
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
163 seq_pad(m, ' '); 182 seq_pad(m, ' ');
164 seq_path(m, &file->f_path, ""); 183 seq_path(m, &file->f_path, "");
165 } else if (mm) { 184 } else if (mm) {
166 pid_t tid = vm_is_stack(priv->task, vma, is_pid); 185 pid_t tid = pid_of_stack(priv, vma, is_pid);
167 186
168 if (tid != 0) { 187 if (tid != 0) {
169 seq_pad(m, ' '); 188 seq_pad(m, ' ');
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos)
212 loff_t n = *pos; 231 loff_t n = *pos;
213 232
214 /* pin the task and mm whilst we play with them */ 233 /* pin the task and mm whilst we play with them */
215 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 234 priv->task = get_proc_task(priv->inode);
216 if (!priv->task) 235 if (!priv->task)
217 return ERR_PTR(-ESRCH); 236 return ERR_PTR(-ESRCH);
218 237
219 mm = mm_access(priv->task, PTRACE_MODE_READ); 238 mm = priv->mm;
220 if (!mm || IS_ERR(mm)) { 239 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
221 put_task_struct(priv->task); 240 return NULL;
222 priv->task = NULL;
223 return mm;
224 }
225 down_read(&mm->mmap_sem);
226 241
242 down_read(&mm->mmap_sem);
227 /* start from the Nth VMA */ 243 /* start from the Nth VMA */
228 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) 244 for (p = rb_first(&mm->mm_rb); p; p = rb_next(p))
229 if (n-- == 0) 245 if (n-- == 0)
230 return p; 246 return p;
247
248 up_read(&mm->mmap_sem);
249 mmput(mm);
231 return NULL; 250 return NULL;
232} 251}
233 252
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml)
235{ 254{
236 struct proc_maps_private *priv = m->private; 255 struct proc_maps_private *priv = m->private;
237 256
257 if (!IS_ERR_OR_NULL(_vml)) {
258 up_read(&priv->mm->mmap_sem);
259 mmput(priv->mm);
260 }
238 if (priv->task) { 261 if (priv->task) {
239 struct mm_struct *mm = priv->task->mm;
240 up_read(&mm->mmap_sem);
241 mmput(mm);
242 put_task_struct(priv->task); 262 put_task_struct(priv->task);
263 priv->task = NULL;
243 } 264 }
244} 265}
245 266
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file,
269 const struct seq_operations *ops) 290 const struct seq_operations *ops)
270{ 291{
271 struct proc_maps_private *priv; 292 struct proc_maps_private *priv;
272 int ret = -ENOMEM; 293
273 294 priv = __seq_open_private(file, ops, sizeof(*priv));
274 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 295 if (!priv)
275 if (priv) { 296 return -ENOMEM;
276 priv->pid = proc_pid(inode); 297
277 ret = seq_open(file, ops); 298 priv->inode = inode;
278 if (!ret) { 299 priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
279 struct seq_file *m = file->private_data; 300 if (IS_ERR(priv->mm)) {
280 m->private = priv; 301 int err = PTR_ERR(priv->mm);
281 } else { 302
282 kfree(priv); 303 seq_release_private(inode, file);
283 } 304 return err;
284 } 305 }
285 return ret; 306
307 return 0;
308}
309
310
311static int map_release(struct inode *inode, struct file *file)
312{
313 struct seq_file *seq = file->private_data;
314 struct proc_maps_private *priv = seq->private;
315
316 if (priv->mm)
317 mmdrop(priv->mm);
318
319 return seq_release_private(inode, file);
286} 320}
287 321
288static int pid_maps_open(struct inode *inode, struct file *file) 322static int pid_maps_open(struct inode *inode, struct file *file)
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = {
299 .open = pid_maps_open, 333 .open = pid_maps_open,
300 .read = seq_read, 334 .read = seq_read,
301 .llseek = seq_lseek, 335 .llseek = seq_lseek,
302 .release = seq_release_private, 336 .release = map_release,
303}; 337};
304 338
305const struct file_operations proc_tid_maps_operations = { 339const struct file_operations proc_tid_maps_operations = {
306 .open = tid_maps_open, 340 .open = tid_maps_open,
307 .read = seq_read, 341 .read = seq_read,
308 .llseek = seq_lseek, 342 .llseek = seq_lseek,
309 .release = seq_release_private, 343 .release = map_release,
310}; 344};
311 345
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
new file mode 100644
index 000000000000..59075b509df3
--- /dev/null
+++ b/fs/proc/thread_self.c
@@ -0,0 +1,85 @@
1#include <linux/sched.h>
2#include <linux/namei.h>
3#include <linux/slab.h>
4#include <linux/pid_namespace.h>
5#include "internal.h"
6
7/*
8 * /proc/thread_self:
9 */
10static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
11 int buflen)
12{
13 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
14 pid_t tgid = task_tgid_nr_ns(current, ns);
15 pid_t pid = task_pid_nr_ns(current, ns);
16 char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF];
17 if (!pid)
18 return -ENOENT;
19 sprintf(tmp, "%d/task/%d", tgid, pid);
20 return readlink_copy(buffer, buflen, tmp);
21}
22
23static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd)
24{
25 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
26 pid_t tgid = task_tgid_nr_ns(current, ns);
27 pid_t pid = task_pid_nr_ns(current, ns);
28 char *name = ERR_PTR(-ENOENT);
29 if (pid) {
30 name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
31 if (!name)
32 name = ERR_PTR(-ENOMEM);
33 else
34 sprintf(name, "%d/task/%d", tgid, pid);
35 }
36 nd_set_link(nd, name);
37 return NULL;
38}
39
40static const struct inode_operations proc_thread_self_inode_operations = {
41 .readlink = proc_thread_self_readlink,
42 .follow_link = proc_thread_self_follow_link,
43 .put_link = kfree_put_link,
44};
45
46static unsigned thread_self_inum;
47
48int proc_setup_thread_self(struct super_block *s)
49{
50 struct inode *root_inode = s->s_root->d_inode;
51 struct pid_namespace *ns = s->s_fs_info;
52 struct dentry *thread_self;
53
54 mutex_lock(&root_inode->i_mutex);
55 thread_self = d_alloc_name(s->s_root, "thread-self");
56 if (thread_self) {
57 struct inode *inode = new_inode_pseudo(s);
58 if (inode) {
59 inode->i_ino = thread_self_inum;
60 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
61 inode->i_mode = S_IFLNK | S_IRWXUGO;
62 inode->i_uid = GLOBAL_ROOT_UID;
63 inode->i_gid = GLOBAL_ROOT_GID;
64 inode->i_op = &proc_thread_self_inode_operations;
65 d_add(thread_self, inode);
66 } else {
67 dput(thread_self);
68 thread_self = ERR_PTR(-ENOMEM);
69 }
70 } else {
71 thread_self = ERR_PTR(-ENOMEM);
72 }
73 mutex_unlock(&root_inode->i_mutex);
74 if (IS_ERR(thread_self)) {
75 pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
76 return PTR_ERR(thread_self);
77 }
78 ns->proc_thread_self = thread_self;
79 return 0;
80}
81
82void __init proc_thread_self_init(void)
83{
84 proc_alloc_inum(&thread_self_inum);
85}
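
The symlink created here resolves per-thread, unlike /proc/self which resolves to the thread group leader. A small userspace check (illustrative only; assumes a kernel carrying this patch) shows the expected target format:

/* Build: cc -pthread demo.c.  Each thread should read back its own
 * "<tgid>/task/<tid>" from the new symlink. */
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static void *show(void *arg)
{
	char buf[64];
	ssize_t n = readlink("/proc/thread-self", buf, sizeof(buf) - 1);

	if (n < 0) {
		perror("readlink");
		return NULL;
	}
	buf[n] = '\0';
	printf("%s\n", buf);	/* e.g. "1234/task/1235" */
	return NULL;
}

int main(void)
{
	pthread_t t;

	show(NULL);			/* main thread: tgid == tid */
	pthread_create(&t, NULL, show, NULL);
	pthread_join(t, NULL);
	return 0;
}
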
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 382aa890e228..a90d6d354199 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -328,6 +328,82 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz)
328 * virtually contiguous user-space in ELF layout. 328 * virtually contiguous user-space in ELF layout.
329 */ 329 */
330#ifdef CONFIG_MMU 330#ifdef CONFIG_MMU
331/*
332 * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
333 * reported as not being ram with the zero page.
334 *
335 * @vma: vm_area_struct describing requested mapping
336 * @from: start remapping from
337 * @pfn: page frame number to start remapping to
338 * @size: remapping size
339 * @prot: protection bits
340 *
341 * Returns zero on success, -EAGAIN on failure.
342 */
343static int remap_oldmem_pfn_checked(struct vm_area_struct *vma,
344 unsigned long from, unsigned long pfn,
345 unsigned long size, pgprot_t prot)
346{
347 unsigned long map_size;
348 unsigned long pos_start, pos_end, pos;
349 unsigned long zeropage_pfn = my_zero_pfn(0);
350 size_t len = 0;
351
352 pos_start = pfn;
353 pos_end = pfn + (size >> PAGE_SHIFT);
354
355 for (pos = pos_start; pos < pos_end; ++pos) {
356 if (!pfn_is_ram(pos)) {
357 /*
358 * We hit a page which is not ram. Remap the continuous
359 * region between pos_start and pos-1 and replace
360 * the non-ram page at pos with the zero page.
361 */
362 if (pos > pos_start) {
363 /* Remap continuous region */
364 map_size = (pos - pos_start) << PAGE_SHIFT;
365 if (remap_oldmem_pfn_range(vma, from + len,
366 pos_start, map_size,
367 prot))
368 goto fail;
369 len += map_size;
370 }
371 /* Remap the zero page */
372 if (remap_oldmem_pfn_range(vma, from + len,
373 zeropage_pfn,
374 PAGE_SIZE, prot))
375 goto fail;
376 len += PAGE_SIZE;
377 pos_start = pos + 1;
378 }
379 }
380 if (pos > pos_start) {
381 /* Remap the rest */
382 map_size = (pos - pos_start) << PAGE_SHIFT;
383 if (remap_oldmem_pfn_range(vma, from + len, pos_start,
384 map_size, prot))
385 goto fail;
386 }
387 return 0;
388fail:
389 do_munmap(vma->vm_mm, from, len);
390 return -EAGAIN;
391}
392
393static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma,
394 unsigned long from, unsigned long pfn,
395 unsigned long size, pgprot_t prot)
396{
397 /*
398 * Check if oldmem_pfn_is_ram was registered to avoid
399 * looping over all pages without a reason.
400 */
401 if (oldmem_pfn_is_ram)
402 return remap_oldmem_pfn_checked(vma, from, pfn, size, prot);
403 else
404 return remap_oldmem_pfn_range(vma, from, pfn, size, prot);
405}
406
331static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) 407static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
332{ 408{
333 size_t size = vma->vm_end - vma->vm_start; 409 size_t size = vma->vm_end - vma->vm_start;
@@ -387,9 +463,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
387 463
388 tsz = min_t(size_t, m->offset + m->size - start, size); 464 tsz = min_t(size_t, m->offset + m->size - start, size);
389 paddr = m->paddr + start - m->offset; 465 paddr = m->paddr + start - m->offset;
390 if (remap_oldmem_pfn_range(vma, vma->vm_start + len, 466 if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len,
391 paddr >> PAGE_SHIFT, tsz, 467 paddr >> PAGE_SHIFT, tsz,
392 vma->vm_page_prot)) 468 vma->vm_page_prot))
393 goto fail; 469 goto fail;
394 size -= tsz; 470 size -= tsz;
395 start += tsz; 471 start += tsz;
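
The loop in remap_oldmem_pfn_checked() is easiest to read as a run-splitter: maximal runs of pfns that are ram get remapped as single ranges, and each non-ram pfn is replaced by one zero-page mapping. A standalone model of that logic, with hypothetical emit_* callbacks standing in for remap_oldmem_pfn_range():

/* Model only: mirrors the split logic above without the mm plumbing. */
static void split_ram_runs(unsigned long pfn, unsigned long npages,
			   int (*is_ram)(unsigned long pfn),
			   void (*emit_ram)(unsigned long start,
					    unsigned long count),
			   void (*emit_zero)(void))
{
	unsigned long pos, start = pfn, end = pfn + npages;

	for (pos = start; pos < end; pos++) {
		if (!is_ram(pos)) {
			if (pos > start)		/* flush the ram run */
				emit_ram(start, pos - start);
			emit_zero();			/* hole -> zero page */
			start = pos + 1;
		}
	}
	if (pos > start)				/* trailing ram run */
		emit_ram(start, pos - start);
}

On failure the real function additionally do_munmap()s the partially built range and returns -EAGAIN; the model omits that cleanup.
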
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 1a81373947f3..73ca1740d839 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -232,17 +232,15 @@ static int mounts_open_common(struct inode *inode, struct file *file,
232 if (!task) 232 if (!task)
233 goto err; 233 goto err;
234 234
235 rcu_read_lock(); 235 task_lock(task);
236 nsp = task_nsproxy(task); 236 nsp = task->nsproxy;
237 if (!nsp || !nsp->mnt_ns) { 237 if (!nsp || !nsp->mnt_ns) {
238 rcu_read_unlock(); 238 task_unlock(task);
239 put_task_struct(task); 239 put_task_struct(task);
240 goto err; 240 goto err;
241 } 241 }
242 ns = nsp->mnt_ns; 242 ns = nsp->mnt_ns;
243 get_mnt_ns(ns); 243 get_mnt_ns(ns);
244 rcu_read_unlock();
245 task_lock(task);
246 if (!task->fs) { 244 if (!task->fs) {
247 task_unlock(task); 245 task_unlock(task);
248 put_task_struct(task); 246 put_task_struct(task);
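
The mounts_open_common() hunk follows the tree-wide switch that protects task->nsproxy with task_lock() rather than RCU; under that assumption the access pattern becomes:

/* Sketch: reading another task's mount namespace post-conversion. */
task_lock(task);
nsp = task->nsproxy;		/* stable only while task_lock is held */
if (nsp && nsp->mnt_ns)
	get_mnt_ns(nsp->mnt_ns);	/* take our own ref before unlocking */
task_unlock(task);
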
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 192297b0090d..fafb7a02a5d6 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -320,10 +320,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
320 compressed ? ".enc.z" : ""); 320 compressed ? ".enc.z" : "");
321 break; 321 break;
322 case PSTORE_TYPE_CONSOLE: 322 case PSTORE_TYPE_CONSOLE:
323 sprintf(name, "console-%s", psname); 323 sprintf(name, "console-%s-%lld", psname, id);
324 break; 324 break;
325 case PSTORE_TYPE_FTRACE: 325 case PSTORE_TYPE_FTRACE:
326 sprintf(name, "ftrace-%s", psname); 326 sprintf(name, "ftrace-%s-%lld", psname, id);
327 break; 327 break;
328 case PSTORE_TYPE_MCE: 328 case PSTORE_TYPE_MCE:
329 sprintf(name, "mce-%s-%lld", psname, id); 329 sprintf(name, "mce-%s-%lld", psname, id);
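
Appending the record id gives console and ftrace records the same unique naming that mce entries already had, so multiple records of one type no longer collide in pstorefs. Roughly:

/* Illustration: with psname "ramoops", successive console records now
 * surface as distinct pstorefs entries. */
sprintf(name, "console-%s-%lld", psname, id);
/* id 0 -> "console-ramoops-0", id 1 -> "console-ramoops-1", ...
 * where previously every record landed on "console-ramoops". */
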
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 34a1e5aa848c..9d7b9a83699e 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -394,7 +394,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size)
394 394
395 prot = pgprot_noncached(PAGE_KERNEL); 395 prot = pgprot_noncached(PAGE_KERNEL);
396 396
397 pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); 397 pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL);
398 if (!pages) { 398 if (!pages) {
399 pr_err("%s: Failed to allocate array for %u pages\n", 399 pr_err("%s: Failed to allocate array for %u pages\n",
400 __func__, page_count); 400 __func__, page_count);
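
The kmalloc_array() conversion is behavior-preserving except that the size multiplication can no longer wrap. A sketch of the guard the generic helper adds over the open-coded multiply it replaces (not the exact kernel source):

/* Sketch: what kmalloc_array(n, size, flags) adds over
 * kmalloc(n * size, flags). */
static inline void *my_kmalloc_array(size_t n, size_t size, gfp_t flags)
{
	if (size != 0 && n > SIZE_MAX / size)
		return NULL;		/* n * size would overflow */
	return kmalloc(n * size, flags);
}
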
diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile
index 9dd06199afc9..5e6bae6fae50 100644
--- a/fs/qnx6/Makefile
+++ b/fs/qnx6/Makefile
@@ -5,3 +5,4 @@
5obj-$(CONFIG_QNX6FS_FS) += qnx6.o 5obj-$(CONFIG_QNX6FS_FS) += qnx6.o
6 6
7qnx6-objs := inode.o dir.o namei.o super_mmi.o 7qnx6-objs := inode.o dir.o namei.o super_mmi.o
8ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG
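
This ccflags line is what arms the pr_debug() calls introduced throughout the qnx6 hunks below: unless DEBUG is defined (or dynamic debug is enabled), pr_debug() compiles to a no-op. Simplified from printk.h, with the dynamic-debug variant omitted:

#ifdef DEBUG
#define pr_debug(fmt, ...) \
	printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) \
	no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)   /* no-op */
#endif
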
diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c
index 15b7d92ed60d..8d64bb5366bf 100644
--- a/fs/qnx6/dir.c
+++ b/fs/qnx6/dir.c
@@ -77,21 +77,20 @@ static int qnx6_dir_longfilename(struct inode *inode,
77 if (de->de_size != 0xff) { 77 if (de->de_size != 0xff) {
78 /* error - long filename entries always have size 0xff 78 /* error - long filename entries always have size 0xff
79 in direntry */ 79 in direntry */
80 printk(KERN_ERR "qnx6: invalid direntry size (%i).\n", 80 pr_err("invalid direntry size (%i).\n", de->de_size);
81 de->de_size);
82 return 0; 81 return 0;
83 } 82 }
84 lf = qnx6_longname(s, de, &page); 83 lf = qnx6_longname(s, de, &page);
85 if (IS_ERR(lf)) { 84 if (IS_ERR(lf)) {
86 printk(KERN_ERR "qnx6:Error reading longname\n"); 85 pr_err("Error reading longname\n");
87 return 0; 86 return 0;
88 } 87 }
89 88
90 lf_size = fs16_to_cpu(sbi, lf->lf_size); 89 lf_size = fs16_to_cpu(sbi, lf->lf_size);
91 90
92 if (lf_size > QNX6_LONG_NAME_MAX) { 91 if (lf_size > QNX6_LONG_NAME_MAX) {
93 QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname)); 92 pr_debug("file %s\n", lf->lf_fname);
94 printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size); 93 pr_err("Filename too long (%i)\n", lf_size);
95 qnx6_put_page(page); 94 qnx6_put_page(page);
96 return 0; 95 return 0;
97 } 96 }
@@ -100,10 +99,10 @@ static int qnx6_dir_longfilename(struct inode *inode,
100 mmi 3g filesystem does not have that checksum */ 99 mmi 3g filesystem does not have that checksum */
101 if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != 100 if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) !=
102 qnx6_lfile_checksum(lf->lf_fname, lf_size)) 101 qnx6_lfile_checksum(lf->lf_fname, lf_size))
103 printk(KERN_INFO "qnx6: long filename checksum error.\n"); 102 pr_info("long filename checksum error.\n");
104 103
105 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", 104 pr_debug("qnx6_readdir:%.*s inode:%u\n",
106 lf_size, lf->lf_fname, de_inode)); 105 lf_size, lf->lf_fname, de_inode);
107 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { 106 if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) {
108 qnx6_put_page(page); 107 qnx6_put_page(page);
109 return 0; 108 return 0;
@@ -136,7 +135,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
136 int i = start; 135 int i = start;
137 136
138 if (IS_ERR(page)) { 137 if (IS_ERR(page)) {
139 printk(KERN_ERR "qnx6_readdir: read failed\n"); 138 pr_err("%s(): read failed\n", __func__);
140 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; 139 ctx->pos = (n + 1) << PAGE_CACHE_SHIFT;
141 return PTR_ERR(page); 140 return PTR_ERR(page);
142 } 141 }
@@ -159,9 +158,9 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx)
159 break; 158 break;
160 } 159 }
161 } else { 160 } else {
162 QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" 161 pr_debug("%s():%.*s inode:%u\n",
163 " inode:%u\n", size, de->de_fname, 162 __func__, size, de->de_fname,
164 no_inode)); 163 no_inode);
165 if (!dir_emit(ctx, de->de_fname, size, 164 if (!dir_emit(ctx, de->de_fname, size,
166 no_inode, DT_UNKNOWN)) { 165 no_inode, DT_UNKNOWN)) {
167 done = true; 166 done = true;
@@ -259,8 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name,
259 if (ino) 258 if (ino)
260 goto found; 259 goto found;
261 } else 260 } else
262 printk(KERN_ERR "qnx6: undefined " 261 pr_err("undefined filename size in inode.\n");
263 "filename size in inode.\n");
264 } 262 }
265 qnx6_put_page(page); 263 qnx6_put_page(page);
266 } 264 }
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 65cdaab3ed49..44e73923670d 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -73,8 +73,8 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock,
73{ 73{
74 unsigned phys; 74 unsigned phys;
75 75
76 QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n", 76 pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n",
77 inode->i_ino, (unsigned long)iblock)); 77 inode->i_ino, (unsigned long)iblock);
78 78
79 phys = qnx6_block_map(inode, iblock); 79 phys = qnx6_block_map(inode, iblock);
80 if (phys) { 80 if (phys) {
@@ -87,7 +87,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock,
87static int qnx6_check_blockptr(__fs32 ptr) 87static int qnx6_check_blockptr(__fs32 ptr)
88{ 88{
89 if (ptr == ~(__fs32)0) { 89 if (ptr == ~(__fs32)0) {
90 printk(KERN_ERR "qnx6: hit unused blockpointer.\n"); 90 pr_err("hit unused blockpointer.\n");
91 return 0; 91 return 0;
92 } 92 }
93 return 1; 93 return 1;
@@ -127,8 +127,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no)
127 levelptr = no >> bitdelta; 127 levelptr = no >> bitdelta;
128 128
129 if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { 129 if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) {
130 printk(KERN_ERR "qnx6:Requested file block number (%u) too big.", 130 pr_err("Requested file block number (%u) too big.", no);
131 no);
132 return 0; 131 return 0;
133 } 132 }
134 133
@@ -137,8 +136,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no)
137 for (i = 0; i < depth; i++) { 136 for (i = 0; i < depth; i++) {
138 bh = sb_bread(s, block); 137 bh = sb_bread(s, block);
139 if (!bh) { 138 if (!bh) {
140 printk(KERN_ERR "qnx6:Error reading block (%u)\n", 139 pr_err("Error reading block (%u)\n", block);
141 block);
142 return 0; 140 return 0;
143 } 141 }
144 bitdelta -= ptrbits; 142 bitdelta -= ptrbits;
@@ -207,26 +205,16 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s)
207{ 205{
208 struct qnx6_sb_info *sbi = QNX6_SB(s); 206 struct qnx6_sb_info *sbi = QNX6_SB(s);
209 207
210 QNX6DEBUG((KERN_INFO "magic: %08x\n", 208 pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic));
211 fs32_to_cpu(sbi, sb->sb_magic))); 209 pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum));
212 QNX6DEBUG((KERN_INFO "checksum: %08x\n", 210 pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial));
213 fs32_to_cpu(sbi, sb->sb_checksum))); 211 pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags));
214 QNX6DEBUG((KERN_INFO "serial: %llx\n", 212 pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize));
215 fs64_to_cpu(sbi, sb->sb_serial))); 213 pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes));
216 QNX6DEBUG((KERN_INFO "flags: %08x\n", 214 pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes));
217 fs32_to_cpu(sbi, sb->sb_flags))); 215 pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks));
218 QNX6DEBUG((KERN_INFO "blocksize: %08x\n", 216 pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks));
219 fs32_to_cpu(sbi, sb->sb_blocksize))); 217 pr_debug("inode_levels: %02x\n", sb->Inode.levels);
220 QNX6DEBUG((KERN_INFO "num_inodes: %08x\n",
221 fs32_to_cpu(sbi, sb->sb_num_inodes)));
222 QNX6DEBUG((KERN_INFO "free_inodes: %08x\n",
223 fs32_to_cpu(sbi, sb->sb_free_inodes)));
224 QNX6DEBUG((KERN_INFO "num_blocks: %08x\n",
225 fs32_to_cpu(sbi, sb->sb_num_blocks)));
226 QNX6DEBUG((KERN_INFO "free_blocks: %08x\n",
227 fs32_to_cpu(sbi, sb->sb_free_blocks)));
228 QNX6DEBUG((KERN_INFO "inode_levels: %02x\n",
229 sb->Inode.levels));
230} 218}
231#endif 219#endif
232 220
@@ -277,7 +265,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
277 start with the first superblock */ 265 start with the first superblock */
278 bh = sb_bread(s, offset); 266 bh = sb_bread(s, offset);
279 if (!bh) { 267 if (!bh) {
280 printk(KERN_ERR "qnx6: unable to read the first superblock\n"); 268 pr_err("unable to read the first superblock\n");
281 return NULL; 269 return NULL;
282 } 270 }
283 sb = (struct qnx6_super_block *)bh->b_data; 271 sb = (struct qnx6_super_block *)bh->b_data;
@@ -285,20 +273,16 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s,
285 sbi->s_bytesex = BYTESEX_BE; 273 sbi->s_bytesex = BYTESEX_BE;
286 if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { 274 if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) {
287 /* we got a big endian fs */ 275 /* we got a big endian fs */
288 QNX6DEBUG((KERN_INFO "qnx6: fs got different" 276 pr_debug("fs got different endianness.\n");
289 " endianness.\n"));
290 return bh; 277 return bh;
291 } else 278 } else
292 sbi->s_bytesex = BYTESEX_LE; 279 sbi->s_bytesex = BYTESEX_LE;
293 if (!silent) { 280 if (!silent) {
294 if (offset == 0) { 281 if (offset == 0) {
295 printk(KERN_ERR "qnx6: wrong signature (magic)" 282 pr_err("wrong signature (magic) in superblock #1.\n");
296 " in superblock #1.\n");
297 } else { 283 } else {
298 printk(KERN_INFO "qnx6: wrong signature (magic)" 284 pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n",
299 " at position (0x%lx) - will try" 285 offset * s->s_blocksize);
300 " alternative position (0x0000).\n",
301 offset * s->s_blocksize);
302 } 286 }
303 } 287 }
304 brelse(bh); 288 brelse(bh);
@@ -329,13 +313,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
329 313
330 /* Superblock always is 512 Byte long */ 314 /* Superblock always is 512 Byte long */
331 if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { 315 if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) {
332 printk(KERN_ERR "qnx6: unable to set blocksize\n"); 316 pr_err("unable to set blocksize\n");
333 goto outnobh; 317 goto outnobh;
334 } 318 }
335 319
336 /* parse the mount-options */ 320 /* parse the mount-options */
337 if (!qnx6_parse_options((char *) data, s)) { 321 if (!qnx6_parse_options((char *) data, s)) {
338 printk(KERN_ERR "qnx6: invalid mount options.\n"); 322 pr_err("invalid mount options.\n");
339 goto outnobh; 323 goto outnobh;
340 } 324 }
341 if (test_opt(s, MMI_FS)) { 325 if (test_opt(s, MMI_FS)) {
@@ -355,7 +339,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
355 /* try again without bootblock offset */ 339 /* try again without bootblock offset */
356 bh1 = qnx6_check_first_superblock(s, 0, silent); 340 bh1 = qnx6_check_first_superblock(s, 0, silent);
357 if (!bh1) { 341 if (!bh1) {
358 printk(KERN_ERR "qnx6: unable to read the first superblock\n"); 342 pr_err("unable to read the first superblock\n");
359 goto outnobh; 343 goto outnobh;
360 } 344 }
361 /* seems that no bootblock at partition start */ 345 /* seems that no bootblock at partition start */
@@ -370,13 +354,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
370 /* checksum check - start at byte 8 and end at byte 512 */ 354 /* checksum check - start at byte 8 and end at byte 512 */
371 if (fs32_to_cpu(sbi, sb1->sb_checksum) != 355 if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
372 crc32_be(0, (char *)(bh1->b_data + 8), 504)) { 356 crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
373 printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); 357 pr_err("superblock #1 checksum error\n");
374 goto out; 358 goto out;
375 } 359 }
376 360
377 /* set new blocksize */ 361 /* set new blocksize */
378 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { 362 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
379 printk(KERN_ERR "qnx6: unable to set blocksize\n"); 363 pr_err("unable to set blocksize\n");
380 goto out; 364 goto out;
381 } 365 }
382 /* blocksize invalidates bh - pull it back in */ 366 /* blocksize invalidates bh - pull it back in */
@@ -398,21 +382,20 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
398 /* next the second superblock */ 382 /* next the second superblock */
399 bh2 = sb_bread(s, offset); 383 bh2 = sb_bread(s, offset);
400 if (!bh2) { 384 if (!bh2) {
401 printk(KERN_ERR "qnx6: unable to read the second superblock\n"); 385 pr_err("unable to read the second superblock\n");
402 goto out; 386 goto out;
403 } 387 }
404 sb2 = (struct qnx6_super_block *)bh2->b_data; 388 sb2 = (struct qnx6_super_block *)bh2->b_data;
405 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { 389 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
406 if (!silent) 390 if (!silent)
407 printk(KERN_ERR "qnx6: wrong signature (magic)" 391 pr_err("wrong signature (magic) in superblock #2.\n");
408 " in superblock #2.\n");
409 goto out; 392 goto out;
410 } 393 }
411 394
412 /* checksum check - start at byte 8 and end at byte 512 */ 395 /* checksum check - start at byte 8 and end at byte 512 */
413 if (fs32_to_cpu(sbi, sb2->sb_checksum) != 396 if (fs32_to_cpu(sbi, sb2->sb_checksum) !=
414 crc32_be(0, (char *)(bh2->b_data + 8), 504)) { 397 crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
415 printk(KERN_ERR "qnx6: superblock #2 checksum error\n"); 398 pr_err("superblock #2 checksum error\n");
416 goto out; 399 goto out;
417 } 400 }
418 401
@@ -422,25 +405,24 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent)
422 sbi->sb_buf = bh1; 405 sbi->sb_buf = bh1;
423 sbi->sb = (struct qnx6_super_block *)bh1->b_data; 406 sbi->sb = (struct qnx6_super_block *)bh1->b_data;
424 brelse(bh2); 407 brelse(bh2);
425 printk(KERN_INFO "qnx6: superblock #1 active\n"); 408 pr_info("superblock #1 active\n");
426 } else { 409 } else {
427 /* superblock #2 active */ 410 /* superblock #2 active */
428 sbi->sb_buf = bh2; 411 sbi->sb_buf = bh2;
429 sbi->sb = (struct qnx6_super_block *)bh2->b_data; 412 sbi->sb = (struct qnx6_super_block *)bh2->b_data;
430 brelse(bh1); 413 brelse(bh1);
431 printk(KERN_INFO "qnx6: superblock #2 active\n"); 414 pr_info("superblock #2 active\n");
432 } 415 }
433mmi_success: 416mmi_success:
434 /* sanity check - limit maximum indirect pointer levels */ 417 /* sanity check - limit maximum indirect pointer levels */
435 if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { 418 if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) {
436 printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n", 419 pr_err("too many inode levels (max %i, sb %i)\n",
437 QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); 420 QNX6_PTR_MAX_LEVELS, sb1->Inode.levels);
438 goto out; 421 goto out;
439 } 422 }
440 if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { 423 if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) {
441 printk(KERN_ERR "qnx6: too many longfilename levels" 424 pr_err("too many longfilename levels (max %i, sb %i)\n",
442 " (max %i, sb %i)\n", 425 QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
443 QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels);
444 goto out; 426 goto out;
445 } 427 }
446 s->s_op = &qnx6_sops; 428 s->s_op = &qnx6_sops;
@@ -460,7 +442,7 @@ mmi_success:
460 /* prefetch root inode */ 442 /* prefetch root inode */
461 root = qnx6_iget(s, QNX6_ROOT_INO); 443 root = qnx6_iget(s, QNX6_ROOT_INO);
462 if (IS_ERR(root)) { 444 if (IS_ERR(root)) {
463 printk(KERN_ERR "qnx6: get inode failed\n"); 445 pr_err("get inode failed\n");
464 ret = PTR_ERR(root); 446 ret = PTR_ERR(root);
465 goto out2; 447 goto out2;
466 } 448 }
@@ -474,7 +456,7 @@ mmi_success:
474 errmsg = qnx6_checkroot(s); 456 errmsg = qnx6_checkroot(s);
475 if (errmsg != NULL) { 457 if (errmsg != NULL) {
476 if (!silent) 458 if (!silent)
477 printk(KERN_ERR "qnx6: %s\n", errmsg); 459 pr_err("%s\n", errmsg);
478 goto out3; 460 goto out3;
479 } 461 }
480 return 0; 462 return 0;
@@ -555,8 +537,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
555 inode->i_mode = 0; 537 inode->i_mode = 0;
556 538
557 if (ino == 0) { 539 if (ino == 0) {
558 printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is " 540 pr_err("bad inode number on dev %s: %u is out of range\n",
559 "out of range\n",
560 sb->s_id, ino); 541 sb->s_id, ino);
561 iget_failed(inode); 542 iget_failed(inode);
562 return ERR_PTR(-EIO); 543 return ERR_PTR(-EIO);
@@ -566,8 +547,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
566 mapping = sbi->inodes->i_mapping; 547 mapping = sbi->inodes->i_mapping;
567 page = read_mapping_page(mapping, n, NULL); 548 page = read_mapping_page(mapping, n, NULL);
568 if (IS_ERR(page)) { 549 if (IS_ERR(page)) {
569 printk(KERN_ERR "qnx6: major problem: unable to read inode from " 550 pr_err("major problem: unable to read inode from dev %s\n",
570 "dev %s\n", sb->s_id); 551 sb->s_id);
571 iget_failed(inode); 552 iget_failed(inode);
572 return ERR_CAST(page); 553 return ERR_CAST(page);
573 } 554 }
@@ -689,7 +670,7 @@ static int __init init_qnx6_fs(void)
689 return err; 670 return err;
690 } 671 }
691 672
692 printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n"); 673 pr_info("QNX6 filesystem 1.0.0 registered.\n");
693 return 0; 674 return 0;
694} 675}
695 676
diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c
index 0561326a94f5..6c1a323137dd 100644
--- a/fs/qnx6/namei.c
+++ b/fs/qnx6/namei.c
@@ -29,12 +29,12 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry,
29 foundinode = qnx6_iget(dir->i_sb, ino); 29 foundinode = qnx6_iget(dir->i_sb, ino);
30 qnx6_put_page(page); 30 qnx6_put_page(page);
31 if (IS_ERR(foundinode)) { 31 if (IS_ERR(foundinode)) {
32 QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> " 32 pr_debug("lookup->iget -> error %ld\n",
33 " error %ld\n", PTR_ERR(foundinode))); 33 PTR_ERR(foundinode));
34 return ERR_CAST(foundinode); 34 return ERR_CAST(foundinode);
35 } 35 }
36 } else { 36 } else {
37 QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name)); 37 pr_debug("%s(): not found %s\n", __func__, name);
38 return NULL; 38 return NULL;
39 } 39 }
40 d_add(dentry, foundinode); 40 d_add(dentry, foundinode);
diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h
index b00fcc960d37..d3fb2b698800 100644
--- a/fs/qnx6/qnx6.h
+++ b/fs/qnx6/qnx6.h
@@ -10,6 +10,12 @@
10 * 10 *
11 */ 11 */
12 12
13#ifdef pr_fmt
14#undef pr_fmt
15#endif
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
13#include <linux/fs.h> 19#include <linux/fs.h>
14#include <linux/pagemap.h> 20#include <linux/pagemap.h>
15 21
@@ -19,12 +25,6 @@ typedef __u64 __bitwise __fs64;
19 25
20#include <linux/qnx6_fs.h> 26#include <linux/qnx6_fs.h>
21 27
22#ifdef CONFIG_QNX6FS_DEBUG
23#define QNX6DEBUG(X) printk X
24#else
25#define QNX6DEBUG(X) (void) 0
26#endif
27
28struct qnx6_sb_info { 28struct qnx6_sb_info {
29 struct buffer_head *sb_buf; /* superblock buffer */ 29 struct buffer_head *sb_buf; /* superblock buffer */
30 struct qnx6_super_block *sb; /* our superblock */ 30 struct qnx6_super_block *sb; /* our superblock */
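
Defining pr_fmt() before the printk machinery is pulled in is why every qnx6 message string above could drop its hand-written "qnx6: " prefix: the pr_*() macros pass their format through pr_fmt() at expansion time. An expansion sketch using the definition from this hunk:

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)

pr_err("unable to set blocksize\n");
/* expands to: printk(KERN_ERR "qnx6" ": " "unable to set blocksize\n") */
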
diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c
index 29c32cba62d6..62aaf3e3126a 100644
--- a/fs/qnx6/super_mmi.c
+++ b/fs/qnx6/super_mmi.c
@@ -44,15 +44,14 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
44 start with the first superblock */ 44 start with the first superblock */
45 bh1 = sb_bread(s, 0); 45 bh1 = sb_bread(s, 0);
46 if (!bh1) { 46 if (!bh1) {
47 printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n"); 47 pr_err("Unable to read first mmi superblock\n");
48 return NULL; 48 return NULL;
49 } 49 }
50 sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; 50 sb1 = (struct qnx6_mmi_super_block *)bh1->b_data;
51 sbi = QNX6_SB(s); 51 sbi = QNX6_SB(s);
52 if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { 52 if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) {
53 if (!silent) { 53 if (!silent) {
54 printk(KERN_ERR "qnx6: wrong signature (magic) in" 54 pr_err("wrong signature (magic) in superblock #1.\n");
55 " superblock #1.\n");
56 goto out; 55 goto out;
57 } 56 }
58 } 57 }
@@ -60,7 +59,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
60 /* checksum check - start at byte 8 and end at byte 512 */ 59 /* checksum check - start at byte 8 and end at byte 512 */
61 if (fs32_to_cpu(sbi, sb1->sb_checksum) != 60 if (fs32_to_cpu(sbi, sb1->sb_checksum) !=
62 crc32_be(0, (char *)(bh1->b_data + 8), 504)) { 61 crc32_be(0, (char *)(bh1->b_data + 8), 504)) {
63 printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); 62 pr_err("superblock #1 checksum error\n");
64 goto out; 63 goto out;
65 } 64 }
66 65
@@ -70,7 +69,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
70 69
71 /* set new blocksize */ 70 /* set new blocksize */
72 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { 71 if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) {
73 printk(KERN_ERR "qnx6: unable to set blocksize\n"); 72 pr_err("unable to set blocksize\n");
74 goto out; 73 goto out;
75 } 74 }
76 /* blocksize invalidates bh - pull it back in */ 75 /* blocksize invalidates bh - pull it back in */
@@ -83,27 +82,26 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
83 /* read second superblock */ 82 /* read second superblock */
84 bh2 = sb_bread(s, offset); 83 bh2 = sb_bread(s, offset);
85 if (!bh2) { 84 if (!bh2) {
86 printk(KERN_ERR "qnx6: unable to read the second superblock\n"); 85 pr_err("unable to read the second superblock\n");
87 goto out; 86 goto out;
88 } 87 }
89 sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; 88 sb2 = (struct qnx6_mmi_super_block *)bh2->b_data;
90 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { 89 if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) {
91 if (!silent) 90 if (!silent)
92 printk(KERN_ERR "qnx6: wrong signature (magic) in" 91 pr_err("wrong signature (magic) in superblock #2.\n");
93 " superblock #2.\n");
94 goto out; 92 goto out;
95 } 93 }
96 94
97 /* checksum check - start at byte 8 and end at byte 512 */ 95 /* checksum check - start at byte 8 and end at byte 512 */
98 if (fs32_to_cpu(sbi, sb2->sb_checksum) 96 if (fs32_to_cpu(sbi, sb2->sb_checksum)
99 != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { 97 != crc32_be(0, (char *)(bh2->b_data + 8), 504)) {
100 printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); 98 pr_err("superblock #1 checksum error\n");
101 goto out; 99 goto out;
102 } 100 }
103 101
104 qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); 102 qsb = kmalloc(sizeof(*qsb), GFP_KERNEL);
105 if (!qsb) { 103 if (!qsb) {
106 printk(KERN_ERR "qnx6: unable to allocate memory.\n"); 104 pr_err("unable to allocate memory.\n");
107 goto out; 105 goto out;
108 } 106 }
109 107
@@ -119,7 +117,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
119 sbi->sb_buf = bh1; 117 sbi->sb_buf = bh1;
120 sbi->sb = (struct qnx6_super_block *)bh1->b_data; 118 sbi->sb = (struct qnx6_super_block *)bh1->b_data;
121 brelse(bh2); 119 brelse(bh2);
122 printk(KERN_INFO "qnx6: superblock #1 active\n"); 120 pr_info("superblock #1 active\n");
123 } else { 121 } else {
124 /* superblock #2 active */ 122 /* superblock #2 active */
125 qnx6_mmi_copy_sb(qsb, sb2); 123 qnx6_mmi_copy_sb(qsb, sb2);
@@ -131,7 +129,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent)
131 sbi->sb_buf = bh2; 129 sbi->sb_buf = bh2;
132 sbi->sb = (struct qnx6_super_block *)bh2->b_data; 130 sbi->sb = (struct qnx6_super_block *)bh2->b_data;
133 brelse(bh1); 131 brelse(bh1);
134 printk(KERN_INFO "qnx6: superblock #2 active\n"); 132 pr_info("superblock #2 active\n");
135 } 133 }
136 kfree(qsb); 134 kfree(qsb);
137 135
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 7f30bdc57d13..8b663b2d9562 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -96,13 +96,16 @@
96 * Note that some things (eg. sb pointer, type, id) doesn't change during 96 * Note that some things (eg. sb pointer, type, id) doesn't change during
97 * the life of the dquot structure and so needn't to be protected by a lock 97 * the life of the dquot structure and so needn't to be protected by a lock
98 * 98 *
99 * Any operation working on dquots via inode pointers must hold dqptr_sem. If 99 * Operation accessing dquots via inode pointers are protected by dquot_srcu.
100 * operation is just reading pointers from inode (or not using them at all) the 100 * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
101 * read lock is enough. If pointers are altered function must hold write lock. 101 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
102 * inode and before dropping dquot references to avoid use of dquots after
103 * they are freed. dq_data_lock is used to serialize the pointer setting and
104 * clearing operations.
102 * Special care needs to be taken about S_NOQUOTA inode flag (marking that 105 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
103 * inode is a quota file). Functions adding pointers from inode to dquots have 106 * inode is a quota file). Functions adding pointers from inode to dquots have
104 * to check this flag under dqptr_sem and then (if S_NOQUOTA is not set) they 107 * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
105 * have to do all pointer modifications before dropping dqptr_sem. This makes 108 * have to do all pointer modifications before dropping dq_data_lock. This makes
106 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and 109 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
107 * then drops all pointers to dquots from an inode. 110 * then drops all pointers to dquots from an inode.
108 * 111 *
@@ -116,21 +119,15 @@
116 * spinlock to internal buffers before writing. 119 * spinlock to internal buffers before writing.
117 * 120 *
118 * Lock ordering (including related VFS locks) is the following: 121 * Lock ordering (including related VFS locks) is the following:
119 * dqonoff_mutex > i_mutex > journal_lock > dqptr_sem > dquot->dq_lock > 122 * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex
120 * dqio_mutex
121 * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. 123 * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc.
122 * The lock ordering of dqptr_sem imposed by quota code is only dqonoff_sem >
123 * dqptr_sem. But filesystem has to count with the fact that functions such as
124 * dquot_alloc_space() acquire dqptr_sem and they usually have to be called
125 * from inside a transaction to keep filesystem consistency after a crash. Also
126 * filesystems usually want to do some IO on dquot from ->mark_dirty which is
127 * called with dqptr_sem held.
128 */ 124 */
129 125
130static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); 126static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
131static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); 127static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
132__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); 128__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
133EXPORT_SYMBOL(dq_data_lock); 129EXPORT_SYMBOL(dq_data_lock);
130DEFINE_STATIC_SRCU(dquot_srcu);
134 131
135void __quota_error(struct super_block *sb, const char *func, 132void __quota_error(struct super_block *sb, const char *func,
136 const char *fmt, ...) 133 const char *fmt, ...)
@@ -733,7 +730,6 @@ static struct shrinker dqcache_shrinker = {
733 730
734/* 731/*
735 * Put reference to dquot 732 * Put reference to dquot
736 * NOTE: If you change this function please check whether dqput_blocks() works right...
737 */ 733 */
738void dqput(struct dquot *dquot) 734void dqput(struct dquot *dquot)
739{ 735{
@@ -963,46 +959,33 @@ static void add_dquot_ref(struct super_block *sb, int type)
963} 959}
964 960
965/* 961/*
966 * Return 0 if dqput() won't block.
967 * (note that 1 doesn't necessarily mean blocking)
968 */
969static inline int dqput_blocks(struct dquot *dquot)
970{
971 if (atomic_read(&dquot->dq_count) <= 1)
972 return 1;
973 return 0;
974}
975
976/*
977 * Remove references to dquots from inode and add dquot to list for freeing 962 * Remove references to dquots from inode and add dquot to list for freeing
978 * if we have the last reference to dquot 963 * if we have the last reference to dquot
979 * We can't race with anybody because we hold dqptr_sem for writing...
980 */ 964 */
981static int remove_inode_dquot_ref(struct inode *inode, int type, 965static void remove_inode_dquot_ref(struct inode *inode, int type,
982 struct list_head *tofree_head) 966 struct list_head *tofree_head)
983{ 967{
984 struct dquot *dquot = inode->i_dquot[type]; 968 struct dquot *dquot = inode->i_dquot[type];
985 969
986 inode->i_dquot[type] = NULL; 970 inode->i_dquot[type] = NULL;
987 if (dquot) { 971 if (!dquot)
988 if (dqput_blocks(dquot)) { 972 return;
989#ifdef CONFIG_QUOTA_DEBUG 973
990 if (atomic_read(&dquot->dq_count) != 1) 974 if (list_empty(&dquot->dq_free)) {
991 quota_error(inode->i_sb, "Adding dquot with " 975 /*
992 "dq_count %d to dispose list", 976 * The inode still has reference to dquot so it can't be in the
993 atomic_read(&dquot->dq_count)); 977 * free list
994#endif 978 */
995 spin_lock(&dq_list_lock); 979 spin_lock(&dq_list_lock);
996 /* As dquot must have currently users it can't be on 980 list_add(&dquot->dq_free, tofree_head);
997 * the free list... */ 981 spin_unlock(&dq_list_lock);
998 list_add(&dquot->dq_free, tofree_head); 982 } else {
999 spin_unlock(&dq_list_lock); 983 /*
1000 return 1; 984 * Dquot is already in a list to put so we won't drop the last
1001 } 985 * reference here.
1002 else 986 */
1003 dqput(dquot); /* We have guaranteed we won't block */ 987 dqput(dquot);
1004 } 988 }
1005 return 0;
1006} 989}
1007 990
1008/* 991/*
@@ -1037,13 +1020,15 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1037 * We have to scan also I_NEW inodes because they can already 1020 * We have to scan also I_NEW inodes because they can already
1038 * have quota pointer initialized. Luckily, we need to touch 1021 * have quota pointer initialized. Luckily, we need to touch
1039 * only quota pointers and these have separate locking 1022 * only quota pointers and these have separate locking
1040 * (dqptr_sem). 1023 * (dq_data_lock).
1041 */ 1024 */
1025 spin_lock(&dq_data_lock);
1042 if (!IS_NOQUOTA(inode)) { 1026 if (!IS_NOQUOTA(inode)) {
1043 if (unlikely(inode_get_rsv_space(inode) > 0)) 1027 if (unlikely(inode_get_rsv_space(inode) > 0))
1044 reserved = 1; 1028 reserved = 1;
1045 remove_inode_dquot_ref(inode, type, tofree_head); 1029 remove_inode_dquot_ref(inode, type, tofree_head);
1046 } 1030 }
1031 spin_unlock(&dq_data_lock);
1047 } 1032 }
1048 spin_unlock(&inode_sb_list_lock); 1033 spin_unlock(&inode_sb_list_lock);
1049#ifdef CONFIG_QUOTA_DEBUG 1034#ifdef CONFIG_QUOTA_DEBUG
@@ -1061,9 +1046,8 @@ static void drop_dquot_ref(struct super_block *sb, int type)
1061 LIST_HEAD(tofree_head); 1046 LIST_HEAD(tofree_head);
1062 1047
1063 if (sb->dq_op) { 1048 if (sb->dq_op) {
1064 down_write(&sb_dqopt(sb)->dqptr_sem);
1065 remove_dquot_ref(sb, type, &tofree_head); 1049 remove_dquot_ref(sb, type, &tofree_head);
1066 up_write(&sb_dqopt(sb)->dqptr_sem); 1050 synchronize_srcu(&dquot_srcu);
1067 put_dquot_list(&tofree_head); 1051 put_dquot_list(&tofree_head);
1068 } 1052 }
1069} 1053}
@@ -1394,21 +1378,16 @@ static int dquot_active(const struct inode *inode)
1394/* 1378/*
1395 * Initialize quota pointers in inode 1379 * Initialize quota pointers in inode
1396 * 1380 *
1397 * We do things in a bit complicated way but by that we avoid calling
1398 * dqget() and thus filesystem callbacks under dqptr_sem.
1399 *
1400 * It is better to call this function outside of any transaction as it 1381 * It is better to call this function outside of any transaction as it
1401 * might need a lot of space in journal for dquot structure allocation. 1382 * might need a lot of space in journal for dquot structure allocation.
1402 */ 1383 */
1403static void __dquot_initialize(struct inode *inode, int type) 1384static void __dquot_initialize(struct inode *inode, int type)
1404{ 1385{
1405 int cnt; 1386 int cnt, init_needed = 0;
1406 struct dquot *got[MAXQUOTAS]; 1387 struct dquot *got[MAXQUOTAS];
1407 struct super_block *sb = inode->i_sb; 1388 struct super_block *sb = inode->i_sb;
1408 qsize_t rsv; 1389 qsize_t rsv;
1409 1390
1410 /* First test before acquiring mutex - solves deadlocks when we
1411 * re-enter the quota code and are already holding the mutex */
1412 if (!dquot_active(inode)) 1391 if (!dquot_active(inode))
1413 return; 1392 return;
1414 1393
@@ -1418,6 +1397,15 @@ static void __dquot_initialize(struct inode *inode, int type)
1418 got[cnt] = NULL; 1397 got[cnt] = NULL;
1419 if (type != -1 && cnt != type) 1398 if (type != -1 && cnt != type)
1420 continue; 1399 continue;
1400 /*
1401 * The i_dquot should have been initialized in most cases,
1402 * we check it without locking here to avoid unnecessary
1403 * dqget()/dqput() calls.
1404 */
1405 if (inode->i_dquot[cnt])
1406 continue;
1407 init_needed = 1;
1408
1421 switch (cnt) { 1409 switch (cnt) {
1422 case USRQUOTA: 1410 case USRQUOTA:
1423 qid = make_kqid_uid(inode->i_uid); 1411 qid = make_kqid_uid(inode->i_uid);
@@ -1429,7 +1417,11 @@ static void __dquot_initialize(struct inode *inode, int type)
1429 got[cnt] = dqget(sb, qid); 1417 got[cnt] = dqget(sb, qid);
1430 } 1418 }
1431 1419
1432 down_write(&sb_dqopt(sb)->dqptr_sem); 1420 /* All required i_dquot has been initialized */
1421 if (!init_needed)
1422 return;
1423
1424 spin_lock(&dq_data_lock);
1433 if (IS_NOQUOTA(inode)) 1425 if (IS_NOQUOTA(inode))
1434 goto out_err; 1426 goto out_err;
1435 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1427 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1449,15 +1441,12 @@ static void __dquot_initialize(struct inode *inode, int type)
1449 * did a write before quota was turned on 1441 * did a write before quota was turned on
1450 */ 1442 */
1451 rsv = inode_get_rsv_space(inode); 1443 rsv = inode_get_rsv_space(inode);
1452 if (unlikely(rsv)) { 1444 if (unlikely(rsv))
1453 spin_lock(&dq_data_lock);
1454 dquot_resv_space(inode->i_dquot[cnt], rsv); 1445 dquot_resv_space(inode->i_dquot[cnt], rsv);
1455 spin_unlock(&dq_data_lock);
1456 }
1457 } 1446 }
1458 } 1447 }
1459out_err: 1448out_err:
1460 up_write(&sb_dqopt(sb)->dqptr_sem); 1449 spin_unlock(&dq_data_lock);
1461 /* Drop unused references */ 1450 /* Drop unused references */
1462 dqput_all(got); 1451 dqput_all(got);
1463} 1452}
@@ -1469,19 +1458,24 @@ void dquot_initialize(struct inode *inode)
1469EXPORT_SYMBOL(dquot_initialize); 1458EXPORT_SYMBOL(dquot_initialize);
1470 1459
1471/* 1460/*
1472 * Release all quotas referenced by inode 1461 * Release all quotas referenced by inode.
1462 *
1463 * This function only be called on inode free or converting
1464 * a file to quota file, no other users for the i_dquot in
1465 * both cases, so we needn't call synchronize_srcu() after
1466 * clearing i_dquot.
1473 */ 1467 */
1474static void __dquot_drop(struct inode *inode) 1468static void __dquot_drop(struct inode *inode)
1475{ 1469{
1476 int cnt; 1470 int cnt;
1477 struct dquot *put[MAXQUOTAS]; 1471 struct dquot *put[MAXQUOTAS];
1478 1472
1479 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1473 spin_lock(&dq_data_lock);
1480 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1474 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1481 put[cnt] = inode->i_dquot[cnt]; 1475 put[cnt] = inode->i_dquot[cnt];
1482 inode->i_dquot[cnt] = NULL; 1476 inode->i_dquot[cnt] = NULL;
1483 } 1477 }
1484 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1478 spin_unlock(&dq_data_lock);
1485 dqput_all(put); 1479 dqput_all(put);
1486} 1480}
1487 1481
@@ -1599,15 +1593,11 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1599 */ 1593 */
1600int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) 1594int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1601{ 1595{
1602 int cnt, ret = 0; 1596 int cnt, ret = 0, index;
1603 struct dquot_warn warn[MAXQUOTAS]; 1597 struct dquot_warn warn[MAXQUOTAS];
1604 struct dquot **dquots = inode->i_dquot; 1598 struct dquot **dquots = inode->i_dquot;
1605 int reserve = flags & DQUOT_SPACE_RESERVE; 1599 int reserve = flags & DQUOT_SPACE_RESERVE;
1606 1600
1607 /*
1608 * First test before acquiring mutex - solves deadlocks when we
1609 * re-enter the quota code and are already holding the mutex
1610 */
1611 if (!dquot_active(inode)) { 1601 if (!dquot_active(inode)) {
1612 inode_incr_space(inode, number, reserve); 1602 inode_incr_space(inode, number, reserve);
1613 goto out; 1603 goto out;
@@ -1616,7 +1606,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1616 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1606 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1617 warn[cnt].w_type = QUOTA_NL_NOWARN; 1607 warn[cnt].w_type = QUOTA_NL_NOWARN;
1618 1608
1619 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1609 index = srcu_read_lock(&dquot_srcu);
1620 spin_lock(&dq_data_lock); 1610 spin_lock(&dq_data_lock);
1621 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1611 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1622 if (!dquots[cnt]) 1612 if (!dquots[cnt])
@@ -1643,7 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1643 goto out_flush_warn; 1633 goto out_flush_warn;
1644 mark_all_dquot_dirty(dquots); 1634 mark_all_dquot_dirty(dquots);
1645out_flush_warn: 1635out_flush_warn:
1646 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1636 srcu_read_unlock(&dquot_srcu, index);
1647 flush_warnings(warn); 1637 flush_warnings(warn);
1648out: 1638out:
1649 return ret; 1639 return ret;
@@ -1655,17 +1645,16 @@ EXPORT_SYMBOL(__dquot_alloc_space);
1655 */ 1645 */
1656int dquot_alloc_inode(const struct inode *inode) 1646int dquot_alloc_inode(const struct inode *inode)
1657{ 1647{
1658 int cnt, ret = 0; 1648 int cnt, ret = 0, index;
1659 struct dquot_warn warn[MAXQUOTAS]; 1649 struct dquot_warn warn[MAXQUOTAS];
1660 struct dquot * const *dquots = inode->i_dquot; 1650 struct dquot * const *dquots = inode->i_dquot;
1661 1651
1662 /* First test before acquiring mutex - solves deadlocks when we
1663 * re-enter the quota code and are already holding the mutex */
1664 if (!dquot_active(inode)) 1652 if (!dquot_active(inode))
1665 return 0; 1653 return 0;
1666 for (cnt = 0; cnt < MAXQUOTAS; cnt++) 1654 for (cnt = 0; cnt < MAXQUOTAS; cnt++)
1667 warn[cnt].w_type = QUOTA_NL_NOWARN; 1655 warn[cnt].w_type = QUOTA_NL_NOWARN;
1668 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1656
1657 index = srcu_read_lock(&dquot_srcu);
1669 spin_lock(&dq_data_lock); 1658 spin_lock(&dq_data_lock);
1670 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1659 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1671 if (!dquots[cnt]) 1660 if (!dquots[cnt])
@@ -1685,7 +1674,7 @@ warn_put_all:
1685 spin_unlock(&dq_data_lock); 1674 spin_unlock(&dq_data_lock);
1686 if (ret == 0) 1675 if (ret == 0)
1687 mark_all_dquot_dirty(dquots); 1676 mark_all_dquot_dirty(dquots);
1688 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1677 srcu_read_unlock(&dquot_srcu, index);
1689 flush_warnings(warn); 1678 flush_warnings(warn);
1690 return ret; 1679 return ret;
1691} 1680}
@@ -1696,14 +1685,14 @@ EXPORT_SYMBOL(dquot_alloc_inode);
1696 */ 1685 */
1697int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) 1686int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1698{ 1687{
1699 int cnt; 1688 int cnt, index;
1700 1689
1701 if (!dquot_active(inode)) { 1690 if (!dquot_active(inode)) {
1702 inode_claim_rsv_space(inode, number); 1691 inode_claim_rsv_space(inode, number);
1703 return 0; 1692 return 0;
1704 } 1693 }
1705 1694
1706 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1695 index = srcu_read_lock(&dquot_srcu);
1707 spin_lock(&dq_data_lock); 1696 spin_lock(&dq_data_lock);
1708 /* Claim reserved quotas to allocated quotas */ 1697 /* Claim reserved quotas to allocated quotas */
1709 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1698 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1715,7 +1704,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
1715 inode_claim_rsv_space(inode, number); 1704 inode_claim_rsv_space(inode, number);
1716 spin_unlock(&dq_data_lock); 1705 spin_unlock(&dq_data_lock);
1717 mark_all_dquot_dirty(inode->i_dquot); 1706 mark_all_dquot_dirty(inode->i_dquot);
1718 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1707 srcu_read_unlock(&dquot_srcu, index);
1719 return 0; 1708 return 0;
1720} 1709}
1721EXPORT_SYMBOL(dquot_claim_space_nodirty); 1710EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -1725,14 +1714,14 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1725 */ 1714 */
1726void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) 1715void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
1727{ 1716{
1728 int cnt; 1717 int cnt, index;
1729 1718
1730 if (!dquot_active(inode)) { 1719 if (!dquot_active(inode)) {
1731 inode_reclaim_rsv_space(inode, number); 1720 inode_reclaim_rsv_space(inode, number);
1732 return; 1721 return;
1733 } 1722 }
1734 1723
1735 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1724 index = srcu_read_lock(&dquot_srcu);
1736 spin_lock(&dq_data_lock); 1725 spin_lock(&dq_data_lock);
1737 /* Claim reserved quotas to allocated quotas */ 1726 /* Claim reserved quotas to allocated quotas */
1738 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1727 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1744,7 +1733,7 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
1744 inode_reclaim_rsv_space(inode, number); 1733 inode_reclaim_rsv_space(inode, number);
1745 spin_unlock(&dq_data_lock); 1734 spin_unlock(&dq_data_lock);
1746 mark_all_dquot_dirty(inode->i_dquot); 1735 mark_all_dquot_dirty(inode->i_dquot);
1747 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1736 srcu_read_unlock(&dquot_srcu, index);
1748 return; 1737 return;
1749} 1738}
1750EXPORT_SYMBOL(dquot_reclaim_space_nodirty); 1739EXPORT_SYMBOL(dquot_reclaim_space_nodirty);
@@ -1757,16 +1746,14 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1757 unsigned int cnt; 1746 unsigned int cnt;
1758 struct dquot_warn warn[MAXQUOTAS]; 1747 struct dquot_warn warn[MAXQUOTAS];
1759 struct dquot **dquots = inode->i_dquot; 1748 struct dquot **dquots = inode->i_dquot;
1760 int reserve = flags & DQUOT_SPACE_RESERVE; 1749 int reserve = flags & DQUOT_SPACE_RESERVE, index;
1761 1750
1762 /* First test before acquiring mutex - solves deadlocks when we
1763 * re-enter the quota code and are already holding the mutex */
1764 if (!dquot_active(inode)) { 1751 if (!dquot_active(inode)) {
1765 inode_decr_space(inode, number, reserve); 1752 inode_decr_space(inode, number, reserve);
1766 return; 1753 return;
1767 } 1754 }
1768 1755
1769 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1756 index = srcu_read_lock(&dquot_srcu);
1770 spin_lock(&dq_data_lock); 1757 spin_lock(&dq_data_lock);
1771 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1758 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1772 int wtype; 1759 int wtype;
@@ -1789,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1789 goto out_unlock; 1776 goto out_unlock;
1790 mark_all_dquot_dirty(dquots); 1777 mark_all_dquot_dirty(dquots);
1791out_unlock: 1778out_unlock:
1792 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1779 srcu_read_unlock(&dquot_srcu, index);
1793 flush_warnings(warn); 1780 flush_warnings(warn);
1794} 1781}
1795EXPORT_SYMBOL(__dquot_free_space); 1782EXPORT_SYMBOL(__dquot_free_space);
@@ -1802,13 +1789,12 @@ void dquot_free_inode(const struct inode *inode)
1802 unsigned int cnt; 1789 unsigned int cnt;
1803 struct dquot_warn warn[MAXQUOTAS]; 1790 struct dquot_warn warn[MAXQUOTAS];
1804 struct dquot * const *dquots = inode->i_dquot; 1791 struct dquot * const *dquots = inode->i_dquot;
1792 int index;
1805 1793
1806 /* First test before acquiring mutex - solves deadlocks when we
1807 * re-enter the quota code and are already holding the mutex */
1808 if (!dquot_active(inode)) 1794 if (!dquot_active(inode))
1809 return; 1795 return;
1810 1796
1811 down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1797 index = srcu_read_lock(&dquot_srcu);
1812 spin_lock(&dq_data_lock); 1798 spin_lock(&dq_data_lock);
1813 for (cnt = 0; cnt < MAXQUOTAS; cnt++) { 1799 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
1814 int wtype; 1800 int wtype;
@@ -1823,7 +1809,7 @@ void dquot_free_inode(const struct inode *inode)
1823 } 1809 }
1824 spin_unlock(&dq_data_lock); 1810 spin_unlock(&dq_data_lock);
1825 mark_all_dquot_dirty(dquots); 1811 mark_all_dquot_dirty(dquots);
1826 up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); 1812 srcu_read_unlock(&dquot_srcu, index);
1827 flush_warnings(warn); 1813 flush_warnings(warn);
1828} 1814}
1829EXPORT_SYMBOL(dquot_free_inode); 1815EXPORT_SYMBOL(dquot_free_inode);
@@ -1837,6 +1823,8 @@ EXPORT_SYMBOL(dquot_free_inode);
1837 * This operation can block, but only after everything is updated 1823 * This operation can block, but only after everything is updated
1838 * A transaction must be started when entering this function. 1824 * A transaction must be started when entering this function.
1839 * 1825 *
1826 * We are holding reference on transfer_from & transfer_to, no need to
1827 * protect them by srcu_read_lock().
1840 */ 1828 */
1841int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) 1829int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1842{ 1830{
@@ -1849,8 +1837,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1849 struct dquot_warn warn_from_inodes[MAXQUOTAS]; 1837 struct dquot_warn warn_from_inodes[MAXQUOTAS];
1850 struct dquot_warn warn_from_space[MAXQUOTAS]; 1838 struct dquot_warn warn_from_space[MAXQUOTAS];
1851 1839
1852 /* First test before acquiring mutex - solves deadlocks when we
1853 * re-enter the quota code and are already holding the mutex */
1854 if (IS_NOQUOTA(inode)) 1840 if (IS_NOQUOTA(inode))
1855 return 0; 1841 return 0;
1856 /* Initialize the arrays */ 1842 /* Initialize the arrays */
@@ -1859,12 +1845,12 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1859 warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN; 1845 warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
1860 warn_from_space[cnt].w_type = QUOTA_NL_NOWARN; 1846 warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
1861 } 1847 }
1862 down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1848
1849 spin_lock(&dq_data_lock);
1863 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ 1850 if (IS_NOQUOTA(inode)) { /* File without quota accounting? */
1864 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); 1851 spin_unlock(&dq_data_lock);
1865 return 0; 1852 return 0;
1866 } 1853 }
1867 spin_lock(&dq_data_lock);
1868 cur_space = inode_get_bytes(inode); 1854 cur_space = inode_get_bytes(inode);
1869 rsv_space = inode_get_rsv_space(inode); 1855 rsv_space = inode_get_rsv_space(inode);
1870 space = cur_space + rsv_space; 1856 space = cur_space + rsv_space;
@@ -1918,7 +1904,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1918 inode->i_dquot[cnt] = transfer_to[cnt]; 1904 inode->i_dquot[cnt] = transfer_to[cnt];
1919 } 1905 }
1920 spin_unlock(&dq_data_lock); 1906 spin_unlock(&dq_data_lock);
1921 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1922 1907
1923 mark_all_dquot_dirty(transfer_from); 1908 mark_all_dquot_dirty(transfer_from);
1924 mark_all_dquot_dirty(transfer_to); 1909 mark_all_dquot_dirty(transfer_to);
@@ -1932,7 +1917,6 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
1932 return 0; 1917 return 0;
1933over_quota: 1918over_quota:
1934 spin_unlock(&dq_data_lock); 1919 spin_unlock(&dq_data_lock);
1935 up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
1936 flush_warnings(warn_to); 1920 flush_warnings(warn_to);
1937 return ret; 1921 return ret;
1938} 1922}
@@ -2741,7 +2725,7 @@ static int __init dquot_init(void)
2741 panic("Cannot create dquot hash table"); 2725 panic("Cannot create dquot hash table");
2742 2726
2743 for (i = 0; i < _DQST_DQSTAT_LAST; i++) { 2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2744 ret = percpu_counter_init(&dqstats.counter[i], 0); 2728 ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
2745 if (ret) 2729 if (ret)
2746 panic("Cannot create dquot stat counters"); 2730 panic("Cannot create dquot stat counters");
2747 } 2731 }
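[editor's note] The dquot.c hunks above replace the dqptr_sem reader-writer semaphore with SRCU on the read side: each reader saves the index returned by srcu_read_lock() and hands it back to srcu_read_unlock(). A minimal sketch of that read-side pattern, with a made-up example_srcu domain standing in for dquot_srcu:

	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(example_srcu);	/* stands in for dquot_srcu */

	static void example_reader(void)
	{
		int idx;

		/* srcu_read_lock() never blocks; updaters wait for readers
		 * with synchronize_srcu() instead of excluding them. */
		idx = srcu_read_lock(&example_srcu);
		/* ... dereference SRCU-protected pointers here ... */
		srcu_read_unlock(&example_srcu, idx);
	}

This is a sketch of the idiom, not the actual quota code; the point is that read-side sections become wait-free, removing the dqptr_sem contention.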
diff --git a/fs/quota/kqid.c b/fs/quota/kqid.c
index 2f97b0e2c501..ebc5e6285800 100644
--- a/fs/quota/kqid.c
+++ b/fs/quota/kqid.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(qid_lt);
55/** 55/**
56 * from_kqid - Create a qid from a kqid user-namespace pair. 56 * from_kqid - Create a qid from a kqid user-namespace pair.
57 * @targ: The user namespace we want a qid in. 57 * @targ: The user namespace we want a qid in.
58 * @kuid: The kernel internal quota identifier to start with. 58 * @kqid: The kernel internal quota identifier to start with.
59 * 59 *
60 * Map @kqid into the user-namespace specified by @targ and 60 * Map @kqid into the user-namespace specified by @targ and
61 * return the resulting qid. 61 * return the resulting qid.
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index 72d29177998e..bb2869f5dfd8 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -32,8 +32,7 @@ static struct genl_family quota_genl_family = {
32 32
33/** 33/**
34 * quota_send_warning - Send warning to userspace about exceeded quota 34 * quota_send_warning - Send warning to userspace about exceeded quota
35 * @type: The quota type: USRQQUOTA, GRPQUOTA,... 35 * @qid: The kernel internal quota identifier.
36 * @id: The user or group id of the quota that was exceeded
37 * @dev: The device on which the fs is mounted (sb->s_dev) 36 * @dev: The device on which the fs is mounted (sb->s_dev)
38 * @warntype: The type of the warning: QUOTA_NL_... 37 * @warntype: The type of the warning: QUOTA_NL_...
39 * 38 *
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ff3f0b3cfdb3..75621649dbd7 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -79,13 +79,13 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
79{ 79{
80 __u32 fmt; 80 __u32 fmt;
81 81
82 down_read(&sb_dqopt(sb)->dqptr_sem); 82 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
83 if (!sb_has_quota_active(sb, type)) { 83 if (!sb_has_quota_active(sb, type)) {
84 up_read(&sb_dqopt(sb)->dqptr_sem); 84 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
85 return -ESRCH; 85 return -ESRCH;
86 } 86 }
87 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; 87 fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id;
88 up_read(&sb_dqopt(sb)->dqptr_sem); 88 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
89 if (copy_to_user(addr, &fmt, sizeof(fmt))) 89 if (copy_to_user(addr, &fmt, sizeof(fmt)))
90 return -EFAULT; 90 return -EFAULT;
91 return 0; 91 return 0;
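[editor's note] quota_getfmt() now serializes against quota being turned on or off with dqonoff_mutex instead of dqptr_sem, and the copy_to_user() happens only after the lock is dropped, keeping the critical section minimal. A sketch of the shape (wrapper name is hypothetical; field names follow the hunk):

	static int example_getfmt(struct super_block *sb, int type, __u32 *fmt)
	{
		struct quota_info *dqopt = sb_dqopt(sb);

		mutex_lock(&dqopt->dqonoff_mutex);
		if (!sb_has_quota_active(sb, type)) {
			mutex_unlock(&dqopt->dqonoff_mutex);
			return -ESRCH;
		}
		*fmt = dqopt->info[type].dqi_format->qf_fmt_id;
		mutex_unlock(&dqopt->dqonoff_mutex);
		return 0;	/* caller does copy_to_user() outside the lock */
	}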
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index dda012ad4208..bbafbde3471a 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -222,7 +222,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
222 222
223 /* gang-find the pages */ 223 /* gang-find the pages */
224 ret = -ENOMEM; 224 ret = -ENOMEM;
225 pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); 225 pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
226 if (!pages) 226 if (!pages)
227 goto out_free; 227 goto out_free;
228 228
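[editor's note] kcalloc() is the overflow-safe spelling of "zeroed array allocation": it rejects an lpages * sizeof(struct page *) product that would overflow, which the open-coded kzalloc() multiplication above did not. Sketch of the idiom:

	#include <linux/slab.h>

	static struct page **alloc_page_array(unsigned long lpages)
	{
		/* kcalloc(n, size, flags) returns NULL if n * size overflows,
		 * and zeroes the memory like kzalloc() */
		return kcalloc(lpages, sizeof(struct page *), GFP_KERNEL);
	}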
diff --git a/fs/read_write.c b/fs/read_write.c
index 009d8542a889..7d9318c3d43c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -513,6 +513,8 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t
513 return ret; 513 return ret;
514} 514}
515 515
516EXPORT_SYMBOL(__kernel_write);
517
516ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 518ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
517{ 519{
518 ssize_t ret; 520 ssize_t ret;
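[editor's note] Adding EXPORT_SYMBOL(__kernel_write) makes the function linkable from loadable modules; without an export record, a module referencing the symbol fails to load. The general pattern, with a made-up function name:

	#include <linux/export.h>

	int example_helper(int x)
	{
		return x + 1;
	}
	/* the export record conventionally sits right after the definition */
	EXPORT_SYMBOL(example_helper);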
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index d9f5a60dd59b..0a7dc941aaf4 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -9,7 +9,7 @@
9#include <linux/stat.h> 9#include <linux/stat.h>
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <asm/uaccess.h> 12#include <linux/uaccess.h>
13 13
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
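[editor's note] The reiserfs files below all switch from <asm/uaccess.h> to <linux/uaccess.h>; the linux/ header is the sanctioned entry point, wrapping the per-architecture header and adding generic helpers. A sketch of typical usage under the canonical include:

	#include <linux/uaccess.h>	/* not <asm/uaccess.h> */

	static int copy_out(void __user *dst, const void *src, size_t n)
	{
		/* copy_to_user() returns the number of bytes NOT copied */
		return copy_to_user(dst, src, n) ? -EFAULT : 0;
	}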
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 54fdf196bfb2..9c02d96d3a42 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -10,7 +10,7 @@
10 * and using buffers obtained after all above. 10 * and using buffers obtained after all above.
11 */ 11 */
12 12
13#include <asm/uaccess.h> 13#include <linux/uaccess.h>
14#include <linux/time.h> 14#include <linux/time.h>
15#include "reiserfs.h" 15#include "reiserfs.h"
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
@@ -286,12 +286,14 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
286 return 0; 286 return 0;
287} 287}
288 288
289static void balance_leaf_insert_left(struct tree_balance *tb, 289static unsigned int balance_leaf_insert_left(struct tree_balance *tb,
290 struct item_head *ih, const char *body) 290 struct item_head *const ih,
291 const char * const body)
291{ 292{
292 int ret; 293 int ret;
293 struct buffer_info bi; 294 struct buffer_info bi;
294 int n = B_NR_ITEMS(tb->L[0]); 295 int n = B_NR_ITEMS(tb->L[0]);
296 unsigned body_shift_bytes = 0;
295 297
296 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { 298 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) {
297 /* part of new item falls into L[0] */ 299 /* part of new item falls into L[0] */
@@ -329,7 +331,7 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
329 331
330 put_ih_item_len(ih, new_item_len); 332 put_ih_item_len(ih, new_item_len);
331 if (tb->lbytes > tb->zeroes_num) { 333 if (tb->lbytes > tb->zeroes_num) {
332 body += (tb->lbytes - tb->zeroes_num); 334 body_shift_bytes = tb->lbytes - tb->zeroes_num;
333 tb->zeroes_num = 0; 335 tb->zeroes_num = 0;
334 } else 336 } else
335 tb->zeroes_num -= tb->lbytes; 337 tb->zeroes_num -= tb->lbytes;
@@ -349,11 +351,12 @@ static void balance_leaf_insert_left(struct tree_balance *tb,
349 tb->insert_size[0] = 0; 351 tb->insert_size[0] = 0;
350 tb->zeroes_num = 0; 352 tb->zeroes_num = 0;
351 } 353 }
354 return body_shift_bytes;
352} 355}
353 356
354static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb, 357static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
355 struct item_head *ih, 358 struct item_head * const ih,
356 const char *body) 359 const char * const body)
357{ 360{
358 int n = B_NR_ITEMS(tb->L[0]); 361 int n = B_NR_ITEMS(tb->L[0]);
359 struct buffer_info bi; 362 struct buffer_info bi;
@@ -413,17 +416,18 @@ static void balance_leaf_paste_left_shift_dirent(struct tree_balance *tb,
413 tb->pos_in_item -= tb->lbytes; 416 tb->pos_in_item -= tb->lbytes;
414} 417}
415 418
416static void balance_leaf_paste_left_shift(struct tree_balance *tb, 419static unsigned int balance_leaf_paste_left_shift(struct tree_balance *tb,
417 struct item_head *ih, 420 struct item_head * const ih,
418 const char *body) 421 const char * const body)
419{ 422{
420 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 423 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
421 int n = B_NR_ITEMS(tb->L[0]); 424 int n = B_NR_ITEMS(tb->L[0]);
422 struct buffer_info bi; 425 struct buffer_info bi;
426 int body_shift_bytes = 0;
423 427
424 if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) { 428 if (is_direntry_le_ih(item_head(tbS0, tb->item_pos))) {
425 balance_leaf_paste_left_shift_dirent(tb, ih, body); 429 balance_leaf_paste_left_shift_dirent(tb, ih, body);
426 return; 430 return 0;
427 } 431 }
428 432
429 RFALSE(tb->lbytes <= 0, 433 RFALSE(tb->lbytes <= 0,
@@ -497,7 +501,7 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
497 * insert_size[0] 501 * insert_size[0]
498 */ 502 */
499 if (l_n > tb->zeroes_num) { 503 if (l_n > tb->zeroes_num) {
500 body += (l_n - tb->zeroes_num); 504 body_shift_bytes = l_n - tb->zeroes_num;
501 tb->zeroes_num = 0; 505 tb->zeroes_num = 0;
502 } else 506 } else
503 tb->zeroes_num -= l_n; 507 tb->zeroes_num -= l_n;
@@ -526,13 +530,14 @@ static void balance_leaf_paste_left_shift(struct tree_balance *tb,
526 */ 530 */
527 leaf_shift_left(tb, tb->lnum[0], tb->lbytes); 531 leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
528 } 532 }
533 return body_shift_bytes;
529} 534}
530 535
531 536
532/* appended item will be in L[0] in whole */ 537/* appended item will be in L[0] in whole */
533static void balance_leaf_paste_left_whole(struct tree_balance *tb, 538static void balance_leaf_paste_left_whole(struct tree_balance *tb,
534 struct item_head *ih, 539 struct item_head * const ih,
535 const char *body) 540 const char * const body)
536{ 541{
537 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 542 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
538 int n = B_NR_ITEMS(tb->L[0]); 543 int n = B_NR_ITEMS(tb->L[0]);
@@ -584,39 +589,44 @@ static void balance_leaf_paste_left_whole(struct tree_balance *tb,
584 tb->zeroes_num = 0; 589 tb->zeroes_num = 0;
585} 590}
586 591
587static void balance_leaf_paste_left(struct tree_balance *tb, 592static unsigned int balance_leaf_paste_left(struct tree_balance *tb,
588 struct item_head *ih, const char *body) 593 struct item_head * const ih,
594 const char * const body)
589{ 595{
590 /* we must shift the part of the appended item */ 596 /* we must shift the part of the appended item */
591 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) 597 if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1)
592 balance_leaf_paste_left_shift(tb, ih, body); 598 return balance_leaf_paste_left_shift(tb, ih, body);
593 else 599 else
594 balance_leaf_paste_left_whole(tb, ih, body); 600 balance_leaf_paste_left_whole(tb, ih, body);
601 return 0;
595} 602}
596 603
597/* Shift lnum[0] items from S[0] to the left neighbor L[0] */ 604/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
598static void balance_leaf_left(struct tree_balance *tb, struct item_head *ih, 605static unsigned int balance_leaf_left(struct tree_balance *tb,
599 const char *body, int flag) 606 struct item_head * const ih,
607 const char * const body, int flag)
600{ 608{
601 if (tb->lnum[0] <= 0) 609 if (tb->lnum[0] <= 0)
602 return; 610 return 0;
603 611
604 /* new item or it part falls to L[0], shift it too */ 612 /* new item or it part falls to L[0], shift it too */
605 if (tb->item_pos < tb->lnum[0]) { 613 if (tb->item_pos < tb->lnum[0]) {
606 BUG_ON(flag != M_INSERT && flag != M_PASTE); 614 BUG_ON(flag != M_INSERT && flag != M_PASTE);
607 615
608 if (flag == M_INSERT) 616 if (flag == M_INSERT)
609 balance_leaf_insert_left(tb, ih, body); 617 return balance_leaf_insert_left(tb, ih, body);
610 else /* M_PASTE */ 618 else /* M_PASTE */
611 balance_leaf_paste_left(tb, ih, body); 619 return balance_leaf_paste_left(tb, ih, body);
612 } else 620 } else
613 /* new item doesn't fall into L[0] */ 621 /* new item doesn't fall into L[0] */
614 leaf_shift_left(tb, tb->lnum[0], tb->lbytes); 622 leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
623 return 0;
615} 624}
616 625
617 626
618static void balance_leaf_insert_right(struct tree_balance *tb, 627static void balance_leaf_insert_right(struct tree_balance *tb,
619 struct item_head *ih, const char *body) 628 struct item_head * const ih,
629 const char * const body)
620{ 630{
621 631
622 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 632 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
@@ -704,7 +714,8 @@ static void balance_leaf_insert_right(struct tree_balance *tb,
704 714
705 715
706static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb, 716static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
707 struct item_head *ih, const char *body) 717 struct item_head * const ih,
718 const char * const body)
708{ 719{
709 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 720 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
710 struct buffer_info bi; 721 struct buffer_info bi;
@@ -754,7 +765,8 @@ static void balance_leaf_paste_right_shift_dirent(struct tree_balance *tb,
754} 765}
755 766
756static void balance_leaf_paste_right_shift(struct tree_balance *tb, 767static void balance_leaf_paste_right_shift(struct tree_balance *tb,
757 struct item_head *ih, const char *body) 768 struct item_head * const ih,
769 const char * const body)
758{ 770{
759 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 771 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
760 int n_shift, n_rem, r_zeroes_number, version; 772 int n_shift, n_rem, r_zeroes_number, version;
@@ -831,7 +843,8 @@ static void balance_leaf_paste_right_shift(struct tree_balance *tb,
831} 843}
832 844
833static void balance_leaf_paste_right_whole(struct tree_balance *tb, 845static void balance_leaf_paste_right_whole(struct tree_balance *tb,
834 struct item_head *ih, const char *body) 846 struct item_head * const ih,
847 const char * const body)
835{ 848{
836 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 849 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
837 int n = B_NR_ITEMS(tbS0); 850 int n = B_NR_ITEMS(tbS0);
@@ -874,7 +887,8 @@ static void balance_leaf_paste_right_whole(struct tree_balance *tb,
874} 887}
875 888
876static void balance_leaf_paste_right(struct tree_balance *tb, 889static void balance_leaf_paste_right(struct tree_balance *tb,
877 struct item_head *ih, const char *body) 890 struct item_head * const ih,
891 const char * const body)
878{ 892{
879 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 893 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
880 int n = B_NR_ITEMS(tbS0); 894 int n = B_NR_ITEMS(tbS0);
@@ -896,8 +910,9 @@ static void balance_leaf_paste_right(struct tree_balance *tb,
896} 910}
897 911
898/* shift rnum[0] items from S[0] to the right neighbor R[0] */ 912/* shift rnum[0] items from S[0] to the right neighbor R[0] */
899static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih, 913static void balance_leaf_right(struct tree_balance *tb,
900 const char *body, int flag) 914 struct item_head * const ih,
915 const char * const body, int flag)
901{ 916{
902 if (tb->rnum[0] <= 0) 917 if (tb->rnum[0] <= 0)
903 return; 918 return;
@@ -911,8 +926,8 @@ static void balance_leaf_right(struct tree_balance *tb, struct item_head *ih,
911} 926}
912 927
913static void balance_leaf_new_nodes_insert(struct tree_balance *tb, 928static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
914 struct item_head *ih, 929 struct item_head * const ih,
915 const char *body, 930 const char * const body,
916 struct item_head *insert_key, 931 struct item_head *insert_key,
917 struct buffer_head **insert_ptr, 932 struct buffer_head **insert_ptr,
918 int i) 933 int i)
@@ -1003,8 +1018,8 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb,
1003 1018
1004/* we append to directory item */ 1019/* we append to directory item */
1005static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb, 1020static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
1006 struct item_head *ih, 1021 struct item_head * const ih,
1007 const char *body, 1022 const char * const body,
1008 struct item_head *insert_key, 1023 struct item_head *insert_key,
1009 struct buffer_head **insert_ptr, 1024 struct buffer_head **insert_ptr,
1010 int i) 1025 int i)
@@ -1058,8 +1073,8 @@ static void balance_leaf_new_nodes_paste_dirent(struct tree_balance *tb,
1058} 1073}
1059 1074
1060static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb, 1075static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
1061 struct item_head *ih, 1076 struct item_head * const ih,
1062 const char *body, 1077 const char * const body,
1063 struct item_head *insert_key, 1078 struct item_head *insert_key,
1064 struct buffer_head **insert_ptr, 1079 struct buffer_head **insert_ptr,
1065 int i) 1080 int i)
@@ -1131,8 +1146,8 @@ static void balance_leaf_new_nodes_paste_shift(struct tree_balance *tb,
1131} 1146}
1132 1147
1133static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb, 1148static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
1134 struct item_head *ih, 1149 struct item_head * const ih,
1135 const char *body, 1150 const char * const body,
1136 struct item_head *insert_key, 1151 struct item_head *insert_key,
1137 struct buffer_head **insert_ptr, 1152 struct buffer_head **insert_ptr,
1138 int i) 1153 int i)
@@ -1184,8 +1199,8 @@ static void balance_leaf_new_nodes_paste_whole(struct tree_balance *tb,
1184 1199
1185} 1200}
1186static void balance_leaf_new_nodes_paste(struct tree_balance *tb, 1201static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
1187 struct item_head *ih, 1202 struct item_head * const ih,
1188 const char *body, 1203 const char * const body,
1189 struct item_head *insert_key, 1204 struct item_head *insert_key,
1190 struct buffer_head **insert_ptr, 1205 struct buffer_head **insert_ptr,
1191 int i) 1206 int i)
@@ -1214,8 +1229,8 @@ static void balance_leaf_new_nodes_paste(struct tree_balance *tb,
1214 1229
1215/* Fill new nodes that appear in place of S[0] */ 1230/* Fill new nodes that appear in place of S[0] */
1216static void balance_leaf_new_nodes(struct tree_balance *tb, 1231static void balance_leaf_new_nodes(struct tree_balance *tb,
1217 struct item_head *ih, 1232 struct item_head * const ih,
1218 const char *body, 1233 const char * const body,
1219 struct item_head *insert_key, 1234 struct item_head *insert_key,
1220 struct buffer_head **insert_ptr, 1235 struct buffer_head **insert_ptr,
1221 int flag) 1236 int flag)
@@ -1254,8 +1269,8 @@ static void balance_leaf_new_nodes(struct tree_balance *tb,
1254} 1269}
1255 1270
1256static void balance_leaf_finish_node_insert(struct tree_balance *tb, 1271static void balance_leaf_finish_node_insert(struct tree_balance *tb,
1257 struct item_head *ih, 1272 struct item_head * const ih,
1258 const char *body) 1273 const char * const body)
1259{ 1274{
1260 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 1275 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
1261 struct buffer_info bi; 1276 struct buffer_info bi;
@@ -1271,8 +1286,8 @@ static void balance_leaf_finish_node_insert(struct tree_balance *tb,
1271} 1286}
1272 1287
1273static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb, 1288static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
1274 struct item_head *ih, 1289 struct item_head * const ih,
1275 const char *body) 1290 const char * const body)
1276{ 1291{
1277 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 1292 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
1278 struct item_head *pasted = item_head(tbS0, tb->item_pos); 1293 struct item_head *pasted = item_head(tbS0, tb->item_pos);
@@ -1305,8 +1320,8 @@ static void balance_leaf_finish_node_paste_dirent(struct tree_balance *tb,
1305} 1320}
1306 1321
1307static void balance_leaf_finish_node_paste(struct tree_balance *tb, 1322static void balance_leaf_finish_node_paste(struct tree_balance *tb,
1308 struct item_head *ih, 1323 struct item_head * const ih,
1309 const char *body) 1324 const char * const body)
1310{ 1325{
1311 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); 1326 struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
1312 struct buffer_info bi; 1327 struct buffer_info bi;
@@ -1349,8 +1364,8 @@ static void balance_leaf_finish_node_paste(struct tree_balance *tb,
1349 * of the affected item which remains in S 1364 * of the affected item which remains in S
1350 */ 1365 */
1351static void balance_leaf_finish_node(struct tree_balance *tb, 1366static void balance_leaf_finish_node(struct tree_balance *tb,
1352 struct item_head *ih, 1367 struct item_head * const ih,
1353 const char *body, int flag) 1368 const char * const body, int flag)
1354{ 1369{
1355 /* if we must insert or append into buffer S[0] */ 1370 /* if we must insert or append into buffer S[0] */
1356 if (0 <= tb->item_pos && tb->item_pos < tb->s0num) { 1371 if (0 <= tb->item_pos && tb->item_pos < tb->s0num) {
@@ -1402,7 +1417,7 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih,
1402 && is_indirect_le_ih(item_head(tbS0, tb->item_pos))) 1417 && is_indirect_le_ih(item_head(tbS0, tb->item_pos)))
1403 tb->pos_in_item *= UNFM_P_SIZE; 1418 tb->pos_in_item *= UNFM_P_SIZE;
1404 1419
1405 balance_leaf_left(tb, ih, body, flag); 1420 body += balance_leaf_left(tb, ih, body, flag);
1406 1421
1407 /* tb->lnum[0] > 0 */ 1422 /* tb->lnum[0] > 0 */
1408 /* Calculate new item position */ 1423 /* Calculate new item position */
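[editor's note] The do_balan.c refactor makes body a const pointer throughout: rather than advancing body inside each helper, functions such as balance_leaf_insert_left() now return the number of bytes consumed, and balance_leaf() advances the pointer once at the top (body += balance_leaf_left(...)). A reduced sketch of the pattern with hypothetical names:

	/* helper reports how far the caller should advance the buffer
	 * instead of mutating its own copy of the pointer */
	static unsigned int consume_prefix(const char * const body,
					   unsigned int shifted,
					   unsigned int zeroes)
	{
		unsigned int body_shift_bytes = 0;

		if (shifted > zeroes)
			body_shift_bytes = shifted - zeroes;
		return body_shift_bytes;
	}

	static void caller(const char *body, unsigned int shifted,
			   unsigned int zeroes)
	{
		body += consume_prefix(body, shifted, zeroes);
		/* continue processing from the advanced position */
	}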
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index db9e80ba53a0..751dd3f4346b 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -6,7 +6,7 @@
6#include "reiserfs.h" 6#include "reiserfs.h"
7#include "acl.h" 7#include "acl.h"
8#include "xattr.h" 8#include "xattr.h"
9#include <asm/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/pagemap.h> 10#include <linux/pagemap.h>
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/writeback.h> 12#include <linux/writeback.h>
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 73231b1ebdbe..b751eea32e20 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -2,7 +2,7 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <asm/uaccess.h> 5#include <linux/uaccess.h>
6#include <linux/string.h> 6#include <linux/string.h>
7#include <linux/time.h> 7#include <linux/time.h>
8#include "reiserfs.h" 8#include "reiserfs.h"
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 63b2b0ec49e6..a7eec9888f10 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -11,7 +11,7 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/highmem.h> 12#include <linux/highmem.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <asm/uaccess.h> 14#include <linux/uaccess.h>
15#include <asm/unaligned.h> 15#include <asm/unaligned.h>
16#include <linux/buffer_head.h> 16#include <linux/buffer_head.h>
17#include <linux/mpage.h> 17#include <linux/mpage.h>
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 501ed6811a2b..6ec8a30a0911 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -7,7 +7,7 @@
7#include <linux/mount.h> 7#include <linux/mount.h>
8#include "reiserfs.h" 8#include "reiserfs.h"
9#include <linux/time.h> 9#include <linux/time.h>
10#include <asm/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/compat.h> 12#include <linux/compat.h>
13 13
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index cfaee912ee09..aca73dd73906 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -54,7 +54,7 @@ static void sd_print_item(struct item_head *ih, char *item)
54 } else { 54 } else {
55 struct stat_data *sd = (struct stat_data *)item; 55 struct stat_data *sd = (struct stat_data *)item;
56 56
57 printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), 57 printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd),
58 (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), 58 (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
59 sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); 59 sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
60 } 60 }
@@ -408,7 +408,7 @@ static void direntry_print_item(struct item_head *ih, char *item)
408 namebuf[namelen + 2] = 0; 408 namebuf[namelen + 2] = 0;
409 } 409 }
410 410
411 printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", 411 printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n",
412 i, namebuf, 412 i, namebuf,
413 deh_dir_id(deh), deh_objectid(deh), 413 deh_dir_id(deh), deh_objectid(deh),
414 GET_HASH_VALUE(deh_offset(deh)), 414 GET_HASH_VALUE(deh_offset(deh)),
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e8870de4627e..d571e173a990 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -699,11 +699,13 @@ static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
699 chunk->bh[chunk->nr++] = bh; 699 chunk->bh[chunk->nr++] = bh;
700 if (chunk->nr >= CHUNK_SIZE) { 700 if (chunk->nr >= CHUNK_SIZE) {
701 ret = 1; 701 ret = 1;
702 if (lock) 702 if (lock) {
703 spin_unlock(lock); 703 spin_unlock(lock);
704 fn(chunk); 704 fn(chunk);
705 if (lock)
706 spin_lock(lock); 705 spin_lock(lock);
706 } else {
707 fn(chunk);
708 }
707 } 709 }
708 return ret; 710 return ret;
709} 711}
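[editor's note] The add_to_chunk() hunk above restructures the "drop the caller's lock around the flush callback" logic so unlock and relock are visibly paired in one branch, instead of being guarded by two separate if (lock) tests. A standalone sketch:

	#include <linux/spinlock.h>

	static void flush_locked(spinlock_t *lock, void (*fn)(void *), void *arg)
	{
		if (lock) {
			/* the callback may block or take other locks, so
			 * release the caller's spinlock for its duration */
			spin_unlock(lock);
			fn(arg);
			spin_lock(lock);
		} else {
			fn(arg);
		}
	}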
@@ -1947,8 +1949,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1947 } 1949 }
1948 } 1950 }
1949 1951
1950 /* wait for all commits to finish */
1951 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
1952 1952
1953 /* 1953 /*
1954 * We must release the write lock here because 1954 * We must release the write lock here because
@@ -1956,8 +1956,14 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1956 */ 1956 */
1957 reiserfs_write_unlock(sb); 1957 reiserfs_write_unlock(sb);
1958 1958
1959 /*
1960 * Cancel flushing of old commits. Note that neither of these works
1961 * will be requeued because superblock is being shutdown and doesn't
1962 * have MS_ACTIVE set.
1963 */
1959 cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); 1964 cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
1960 flush_workqueue(REISERFS_SB(sb)->commit_wq); 1965 /* wait for all commits to finish */
1966 cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work);
1961 1967
1962 free_journal_ram(sb); 1968 free_journal_ram(sb);
1963 1969
@@ -4292,9 +4298,15 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags)
4292 if (flush) { 4298 if (flush) {
4293 flush_commit_list(sb, jl, 1); 4299 flush_commit_list(sb, jl, 1);
4294 flush_journal_list(sb, jl, 1); 4300 flush_journal_list(sb, jl, 1);
4295 } else if (!(jl->j_state & LIST_COMMIT_PENDING)) 4301 } else if (!(jl->j_state & LIST_COMMIT_PENDING)) {
4296 queue_delayed_work(REISERFS_SB(sb)->commit_wq, 4302 /*
4297 &journal->j_work, HZ / 10); 4303 * Avoid queueing work when sb is being shut down. Transaction
4304 * will be flushed on journal shutdown.
4305 */
4306 if (sb->s_flags & MS_ACTIVE)
4307 queue_delayed_work(REISERFS_SB(sb)->commit_wq,
4308 &journal->j_work, HZ / 10);
4309 }
4298 4310
4299 /* 4311 /*
4300 * if the next transaction has any chance of wrapping, flush 4312 * if the next transaction has any chance of wrapping, flush
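[editor's note] Taken together, the journal.c and super.c hunks close a shutdown race: work is only (re)queued while the superblock still has MS_ACTIVE set, and do_journal_release() cancels with the _sync variants after the flag is cleared, so nothing can requeue behind the cancel. A sketch of the two sides with hypothetical names (the real code queues onto REISERFS_SB(sb)->commit_wq):

	#include <linux/fs.h>
	#include <linux/workqueue.h>

	static void maybe_queue_commit(struct super_block *sb,
				       struct delayed_work *work)
	{
		/* never queue once the sb is shutting down */
		if (sb->s_flags & MS_ACTIVE)
			queue_delayed_work(system_wq, work, HZ / 10);
	}

	static void journal_shutdown(struct delayed_work *work)
	{
		/* MS_ACTIVE is already clear here: cancel and wait; no new
		 * instance can be queued after this returns */
		cancel_delayed_work_sync(work);
	}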
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index d6744c8b24e1..249594a821e0 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -2,7 +2,7 @@
2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README 2 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3 */ 3 */
4 4
5#include <asm/uaccess.h> 5#include <linux/uaccess.h>
6#include <linux/string.h> 6#include <linux/string.h>
7#include <linux/time.h> 7#include <linux/time.h>
8#include "reiserfs.h" 8#include "reiserfs.h"
@@ -899,8 +899,9 @@ void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
899 899
900/* insert item into the leaf node in position before */ 900/* insert item into the leaf node in position before */
901void leaf_insert_into_buf(struct buffer_info *bi, int before, 901void leaf_insert_into_buf(struct buffer_info *bi, int before,
902 struct item_head *inserted_item_ih, 902 struct item_head * const inserted_item_ih,
903 const char *inserted_item_body, int zeros_number) 903 const char * const inserted_item_body,
904 int zeros_number)
904{ 905{
905 struct buffer_head *bh = bi->bi_bh; 906 struct buffer_head *bh = bi->bi_bh;
906 int nr, free_space; 907 int nr, free_space;
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index c9b47e91baf8..ae1dc841db3a 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -17,7 +17,7 @@ static char off_buf[80];
17static char *reiserfs_cpu_offset(struct cpu_key *key) 17static char *reiserfs_cpu_offset(struct cpu_key *key)
18{ 18{
19 if (cpu_key_k_type(key) == TYPE_DIRENTRY) 19 if (cpu_key_k_type(key) == TYPE_DIRENTRY)
20 sprintf(off_buf, "%Lu(%Lu)", 20 sprintf(off_buf, "%llu(%llu)",
21 (unsigned long long) 21 (unsigned long long)
22 GET_HASH_VALUE(cpu_key_k_offset(key)), 22 GET_HASH_VALUE(cpu_key_k_offset(key)),
23 (unsigned long long) 23 (unsigned long long)
@@ -34,7 +34,7 @@ static char *le_offset(struct reiserfs_key *key)
34 34
35 version = le_key_version(key); 35 version = le_key_version(key);
36 if (le_key_k_type(version, key) == TYPE_DIRENTRY) 36 if (le_key_k_type(version, key) == TYPE_DIRENTRY)
37 sprintf(off_buf, "%Lu(%Lu)", 37 sprintf(off_buf, "%llu(%llu)",
38 (unsigned long long) 38 (unsigned long long)
39 GET_HASH_VALUE(le_key_k_offset(version, key)), 39 GET_HASH_VALUE(le_key_k_offset(version, key)),
40 (unsigned long long) 40 (unsigned long long)
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 02b0b7d0f7d5..621b9f381fe1 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -11,7 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/seq_file.h> 13#include <linux/seq_file.h>
14#include <asm/uaccess.h> 14#include <linux/uaccess.h>
15#include "reiserfs.h" 15#include "reiserfs.h"
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index bf53888c7f59..1894d96ccb7c 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -506,6 +506,9 @@ typedef struct reiserfs_proc_info_data {
506} reiserfs_proc_info_data_t; 506} reiserfs_proc_info_data_t;
507#endif 507#endif
508 508
509/* Number of quota types we support */
510#define REISERFS_MAXQUOTAS 2
511
509/* reiserfs union of in-core super block data */ 512/* reiserfs union of in-core super block data */
510struct reiserfs_sb_info { 513struct reiserfs_sb_info {
511 /* Buffer containing the super block */ 514 /* Buffer containing the super block */
@@ -615,7 +618,7 @@ struct reiserfs_sb_info {
615 spinlock_t old_work_lock; /* protects old_work and work_queued */ 618 spinlock_t old_work_lock; /* protects old_work and work_queued */
616 619
617#ifdef CONFIG_QUOTA 620#ifdef CONFIG_QUOTA
618 char *s_qf_names[MAXQUOTAS]; 621 char *s_qf_names[REISERFS_MAXQUOTAS];
619 int s_jquota_fmt; 622 int s_jquota_fmt;
620#endif 623#endif
621 char *s_jdev; /* Stored jdev for mount option showing */ 624 char *s_jdev; /* Stored jdev for mount option showing */
@@ -3216,11 +3219,12 @@ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
3216void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, 3219void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
3217 int del_num, int del_bytes); 3220 int del_num, int del_bytes);
3218void leaf_insert_into_buf(struct buffer_info *bi, int before, 3221void leaf_insert_into_buf(struct buffer_info *bi, int before,
3219 struct item_head *inserted_item_ih, 3222 struct item_head * const inserted_item_ih,
3220 const char *inserted_item_body, int zeros_number); 3223 const char * const inserted_item_body,
3221void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
3222 int pos_in_item, int paste_size, const char *body,
3223 int zeros_number); 3224 int zeros_number);
3225void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
3226 int pos_in_item, int paste_size,
3227 const char * const body, int zeros_number);
3224void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, 3228void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
3225 int pos_in_item, int cut_size); 3229 int pos_in_item, int cut_size);
3226void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, 3230void leaf_paste_entries(struct buffer_info *bi, int item_num, int before,
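[editor's note] REISERFS_MAXQUOTAS pins the on-stack quota arrays at the two types reiserfs actually implements (user and group), decoupling them from the VFS-wide MAXQUOTAS, which can grow as new quota types are added. The sizing pattern, names made up:

	#define EXAMPLE_MAXQUOTAS 2	/* USRQUOTA + GRPQUOTA only */

	struct example_sb_info {
		char *s_qf_names[EXAMPLE_MAXQUOTAS];	/* fs-specific bound */
	};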
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index dd44468edc2b..24cbe013240f 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2006,7 +2006,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
2006 &s_search_path) == POSITION_FOUND); 2006 &s_search_path) == POSITION_FOUND);
2007 2007
2008 RFALSE(file_size > ROUND_UP(new_file_size), 2008 RFALSE(file_size > ROUND_UP(new_file_size),
2009 "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", 2009 "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d",
2010 new_file_size, file_size, s_item_key.on_disk_key.k_objectid); 2010 new_file_size, file_size, s_item_key.on_disk_key.k_objectid);
2011 2011
2012update_and_out: 2012update_and_out:
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index a392cef6acc6..f1376c92cf74 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -15,7 +15,7 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
17#include <linux/time.h> 17#include <linux/time.h>
18#include <asm/uaccess.h> 18#include <linux/uaccess.h>
19#include "reiserfs.h" 19#include "reiserfs.h"
20#include "acl.h" 20#include "acl.h"
21#include "xattr.h" 21#include "xattr.h"
@@ -100,7 +100,11 @@ void reiserfs_schedule_old_flush(struct super_block *s)
100 struct reiserfs_sb_info *sbi = REISERFS_SB(s); 100 struct reiserfs_sb_info *sbi = REISERFS_SB(s);
101 unsigned long delay; 101 unsigned long delay;
102 102
103 if (s->s_flags & MS_RDONLY) 103 /*
104 * Avoid scheduling flush when sb is being shut down. It can race
105 * with journal shutdown and free still queued delayed work.
106 */
107 if (s->s_flags & MS_RDONLY || !(s->s_flags & MS_ACTIVE))
104 return; 108 return;
105 109
106 spin_lock(&sbi->old_work_lock); 110 spin_lock(&sbi->old_work_lock);
@@ -202,7 +206,7 @@ static int finish_unfinished(struct super_block *s)
202#ifdef CONFIG_QUOTA 206#ifdef CONFIG_QUOTA
203 int i; 207 int i;
204 int ms_active_set; 208 int ms_active_set;
205 int quota_enabled[MAXQUOTAS]; 209 int quota_enabled[REISERFS_MAXQUOTAS];
206#endif 210#endif
207 211
208 /* compose key to look for "save" links */ 212 /* compose key to look for "save" links */
@@ -223,7 +227,7 @@ static int finish_unfinished(struct super_block *s)
223 s->s_flags |= MS_ACTIVE; 227 s->s_flags |= MS_ACTIVE;
224 } 228 }
225 /* Turn on quotas so that they are updated correctly */ 229 /* Turn on quotas so that they are updated correctly */
226 for (i = 0; i < MAXQUOTAS; i++) { 230 for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
227 quota_enabled[i] = 1; 231 quota_enabled[i] = 1;
228 if (REISERFS_SB(s)->s_qf_names[i]) { 232 if (REISERFS_SB(s)->s_qf_names[i]) {
229 int ret; 233 int ret;
@@ -331,7 +335,7 @@ static int finish_unfinished(struct super_block *s)
331 * not completed truncate found. New size was 335 * not completed truncate found. New size was
332 * committed together with "save" link 336 * committed together with "save" link
333 */ 337 */
334 reiserfs_info(s, "Truncating %k to %Ld ..", 338 reiserfs_info(s, "Truncating %k to %lld ..",
335 INODE_PKEY(inode), inode->i_size); 339 INODE_PKEY(inode), inode->i_size);
336 340
337 /* don't update modification time */ 341 /* don't update modification time */
@@ -366,7 +370,7 @@ static int finish_unfinished(struct super_block *s)
366#ifdef CONFIG_QUOTA 370#ifdef CONFIG_QUOTA
367 /* Turn quotas off */ 371 /* Turn quotas off */
368 reiserfs_write_unlock(s); 372 reiserfs_write_unlock(s);
369 for (i = 0; i < MAXQUOTAS; i++) { 373 for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
370 if (sb_dqopt(s)->files[i] && quota_enabled[i]) 374 if (sb_dqopt(s)->files[i] && quota_enabled[i])
371 dquot_quota_off(s, i); 375 dquot_quota_off(s, i);
372 } 376 }
@@ -1356,7 +1360,7 @@ static void handle_quota_files(struct super_block *s, char **qf_names,
1356{ 1360{
1357 int i; 1361 int i;
1358 1362
1359 for (i = 0; i < MAXQUOTAS; i++) { 1363 for (i = 0; i < REISERFS_MAXQUOTAS; i++) {
1360 if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) 1364 if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
1361 kfree(REISERFS_SB(s)->s_qf_names[i]); 1365 kfree(REISERFS_SB(s)->s_qf_names[i]);
1362 REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; 1366 REISERFS_SB(s)->s_qf_names[i] = qf_names[i];
@@ -1377,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1377 struct reiserfs_journal *journal = SB_JOURNAL(s); 1381 struct reiserfs_journal *journal = SB_JOURNAL(s);
1378 char *new_opts = kstrdup(arg, GFP_KERNEL); 1382 char *new_opts = kstrdup(arg, GFP_KERNEL);
1379 int err; 1383 int err;
1380 char *qf_names[MAXQUOTAS]; 1384 char *qf_names[REISERFS_MAXQUOTAS];
1381 unsigned int qfmt = 0; 1385 unsigned int qfmt = 0;
1382#ifdef CONFIG_QUOTA 1386#ifdef CONFIG_QUOTA
1383 int i; 1387 int i;
@@ -1396,7 +1400,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1396 (s, arg, &mount_options, &blocks, NULL, &commit_max_age, 1400 (s, arg, &mount_options, &blocks, NULL, &commit_max_age,
1397 qf_names, &qfmt)) { 1401 qf_names, &qfmt)) {
1398#ifdef CONFIG_QUOTA 1402#ifdef CONFIG_QUOTA
1399 for (i = 0; i < MAXQUOTAS; i++) 1403 for (i = 0; i < REISERFS_MAXQUOTAS; i++)
1400 if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) 1404 if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i])
1401 kfree(qf_names[i]); 1405 kfree(qf_names[i]);
1402#endif 1406#endif
@@ -1577,7 +1581,7 @@ static int read_super_block(struct super_block *s, int offset)
1577 rs = (struct reiserfs_super_block *)bh->b_data; 1581 rs = (struct reiserfs_super_block *)bh->b_data;
1578 if (sb_blocksize(rs) != s->s_blocksize) { 1582 if (sb_blocksize(rs) != s->s_blocksize) {
1579 reiserfs_warning(s, "sh-2011", "can't find a reiserfs " 1583 reiserfs_warning(s, "sh-2011", "can't find a reiserfs "
1580 "filesystem on (dev %s, block %Lu, size %lu)", 1584 "filesystem on (dev %s, block %llu, size %lu)",
1581 s->s_id, 1585 s->s_id,
1582 (unsigned long long)bh->b_blocknr, 1586 (unsigned long long)bh->b_blocknr,
1583 s->s_blocksize); 1587 s->s_blocksize);
@@ -1840,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1840 char *jdev_name; 1844 char *jdev_name;
1841 struct reiserfs_sb_info *sbi; 1845 struct reiserfs_sb_info *sbi;
1842 int errval = -EINVAL; 1846 int errval = -EINVAL;
1843 char *qf_names[MAXQUOTAS] = {}; 1847 char *qf_names[REISERFS_MAXQUOTAS] = {};
1844 unsigned int qfmt = 0; 1848 unsigned int qfmt = 0;
1845 1849
1846 save_mount_options(s, data); 1850 save_mount_options(s, data);
@@ -2165,7 +2169,7 @@ error_unlocked:
2165#ifdef CONFIG_QUOTA 2169#ifdef CONFIG_QUOTA
2166 { 2170 {
2167 int j; 2171 int j;
2168 for (j = 0; j < MAXQUOTAS; j++) 2172 for (j = 0; j < REISERFS_MAXQUOTAS; j++)
2169 kfree(qf_names[j]); 2173 kfree(qf_names[j]);
2170 } 2174 }
2171#endif 2175#endif
@@ -2441,8 +2445,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
2441 struct buffer_head tmp_bh, *bh; 2445 struct buffer_head tmp_bh, *bh;
2442 2446
2443 if (!current->journal_info) { 2447 if (!current->journal_info) {
2444 printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" 2448 printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n",
2445 " cancelled because transaction is not started.\n",
2446 (unsigned long long)off, (unsigned long long)len); 2449 (unsigned long long)off, (unsigned long long)len);
2447 return -EIO; 2450 return -EIO;
2448 } 2451 }
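[editor's note] Several hunks above replace the nonstandard %Lu/%Ld length modifier with the C99 %llu/%lld, keeping the explicit (unsigned long long) cast so the format stays correct regardless of how the 64-bit type is defined on a given architecture. Sketch:

	#include <linux/printk.h>
	#include <linux/types.h>

	static void print_size(u64 bytes)
	{
		/* the cast pins the promotion; %llu is the standard modifier */
		printk(KERN_INFO "size=%llu\n", (unsigned long long)bytes);
	}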
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index ca416d099e7d..7c36898af402 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -45,7 +45,7 @@
45#include <linux/xattr.h> 45#include <linux/xattr.h>
46#include "xattr.h" 46#include "xattr.h"
47#include "acl.h" 47#include "acl.h"
48#include <asm/uaccess.h> 48#include <linux/uaccess.h>
49#include <net/checksum.h> 49#include <net/checksum.h>
50#include <linux/stat.h> 50#include <linux/stat.h>
51#include <linux/quotaops.h> 51#include <linux/quotaops.h>
@@ -84,6 +84,7 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
84static int xattr_unlink(struct inode *dir, struct dentry *dentry) 84static int xattr_unlink(struct inode *dir, struct dentry *dentry)
85{ 85{
86 int error; 86 int error;
87
87 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 88 BUG_ON(!mutex_is_locked(&dir->i_mutex));
88 89
89 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 90 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -98,6 +99,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
98static int xattr_rmdir(struct inode *dir, struct dentry *dentry) 99static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
99{ 100{
100 int error; 101 int error;
102
101 BUG_ON(!mutex_is_locked(&dir->i_mutex)); 103 BUG_ON(!mutex_is_locked(&dir->i_mutex));
102 104
103 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); 105 mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD);
@@ -117,6 +119,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
117{ 119{
118 struct dentry *privroot = REISERFS_SB(sb)->priv_root; 120 struct dentry *privroot = REISERFS_SB(sb)->priv_root;
119 struct dentry *xaroot; 121 struct dentry *xaroot;
122
120 if (!privroot->d_inode) 123 if (!privroot->d_inode)
121 return ERR_PTR(-ENODATA); 124 return ERR_PTR(-ENODATA);
122 125
@@ -127,6 +130,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
127 xaroot = ERR_PTR(-ENODATA); 130 xaroot = ERR_PTR(-ENODATA);
128 else if (!xaroot->d_inode) { 131 else if (!xaroot->d_inode) {
129 int err = -ENODATA; 132 int err = -ENODATA;
133
130 if (xattr_may_create(flags)) 134 if (xattr_may_create(flags))
131 err = xattr_mkdir(privroot->d_inode, xaroot, 0700); 135 err = xattr_mkdir(privroot->d_inode, xaroot, 0700);
132 if (err) { 136 if (err) {
@@ -157,6 +161,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
157 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); 161 xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
158 if (!IS_ERR(xadir) && !xadir->d_inode) { 162 if (!IS_ERR(xadir) && !xadir->d_inode) {
159 int err = -ENODATA; 163 int err = -ENODATA;
164
160 if (xattr_may_create(flags)) 165 if (xattr_may_create(flags))
161 err = xattr_mkdir(xaroot->d_inode, xadir, 0700); 166 err = xattr_mkdir(xaroot->d_inode, xadir, 0700);
162 if (err) { 167 if (err) {
@@ -188,6 +193,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset,
188{ 193{
189 struct reiserfs_dentry_buf *dbuf = buf; 194 struct reiserfs_dentry_buf *dbuf = buf;
190 struct dentry *dentry; 195 struct dentry *dentry;
196
191 WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); 197 WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex));
192 198
193 if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) 199 if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
@@ -218,6 +224,7 @@ static void
218cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) 224cleanup_dentry_buf(struct reiserfs_dentry_buf *buf)
219{ 225{
220 int i; 226 int i;
227
221 for (i = 0; i < buf->count; i++) 228 for (i = 0; i < buf->count; i++)
222 if (buf->dentries[i]) 229 if (buf->dentries[i])
223 dput(buf->dentries[i]); 230 dput(buf->dentries[i]);
@@ -283,11 +290,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
283 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 290 int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 +
284 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); 291 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
285 struct reiserfs_transaction_handle th; 292 struct reiserfs_transaction_handle th;
293
286 reiserfs_write_lock(inode->i_sb); 294 reiserfs_write_lock(inode->i_sb);
287 err = journal_begin(&th, inode->i_sb, blocks); 295 err = journal_begin(&th, inode->i_sb, blocks);
288 reiserfs_write_unlock(inode->i_sb); 296 reiserfs_write_unlock(inode->i_sb);
289 if (!err) { 297 if (!err) {
290 int jerror; 298 int jerror;
299
291 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, 300 mutex_lock_nested(&dir->d_parent->d_inode->i_mutex,
292 I_MUTEX_XATTR); 301 I_MUTEX_XATTR);
293 err = action(dir, data); 302 err = action(dir, data);
@@ -340,6 +349,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data)
340int reiserfs_delete_xattrs(struct inode *inode) 349int reiserfs_delete_xattrs(struct inode *inode)
341{ 350{
342 int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); 351 int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL);
352
343 if (err) 353 if (err)
344 reiserfs_warning(inode->i_sb, "jdm-20004", 354 reiserfs_warning(inode->i_sb, "jdm-20004",
345 "Couldn't delete all xattrs (%d)\n", err); 355 "Couldn't delete all xattrs (%d)\n", err);
@@ -350,6 +360,7 @@ int reiserfs_delete_xattrs(struct inode *inode)
350int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) 360int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
351{ 361{
352 int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); 362 int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs);
363
353 if (err) 364 if (err)
354 reiserfs_warning(inode->i_sb, "jdm-20007", 365 reiserfs_warning(inode->i_sb, "jdm-20007",
355 "Couldn't chown all xattrs (%d)\n", err); 366 "Couldn't chown all xattrs (%d)\n", err);
@@ -439,6 +450,7 @@ int reiserfs_commit_write(struct file *f, struct page *page,
439static void update_ctime(struct inode *inode) 450static void update_ctime(struct inode *inode)
440{ 451{
441 struct timespec now = current_fs_time(inode->i_sb); 452 struct timespec now = current_fs_time(inode->i_sb);
453
442 if (inode_unhashed(inode) || !inode->i_nlink || 454 if (inode_unhashed(inode) || !inode->i_nlink ||
443 timespec_equal(&inode->i_ctime, &now)) 455 timespec_equal(&inode->i_ctime, &now))
444 return; 456 return;
@@ -514,6 +526,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
514 size_t chunk; 526 size_t chunk;
515 size_t skip = 0; 527 size_t skip = 0;
516 size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); 528 size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
529
517 if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) 530 if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
518 chunk = PAGE_CACHE_SIZE; 531 chunk = PAGE_CACHE_SIZE;
519 else 532 else
@@ -530,6 +543,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
530 543
531 if (file_pos == 0) { 544 if (file_pos == 0) {
532 struct reiserfs_xattr_header *rxh; 545 struct reiserfs_xattr_header *rxh;
546
533 skip = file_pos = sizeof(struct reiserfs_xattr_header); 547 skip = file_pos = sizeof(struct reiserfs_xattr_header);
534 if (chunk + skip > PAGE_CACHE_SIZE) 548 if (chunk + skip > PAGE_CACHE_SIZE)
535 chunk = PAGE_CACHE_SIZE - skip; 549 chunk = PAGE_CACHE_SIZE - skip;
@@ -659,6 +673,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer,
659 size_t chunk; 673 size_t chunk;
660 char *data; 674 char *data;
661 size_t skip = 0; 675 size_t skip = 0;
676
662 if (isize - file_pos > PAGE_CACHE_SIZE) 677 if (isize - file_pos > PAGE_CACHE_SIZE)
663 chunk = PAGE_CACHE_SIZE; 678 chunk = PAGE_CACHE_SIZE;
664 else 679 else
@@ -792,6 +807,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
792int reiserfs_removexattr(struct dentry *dentry, const char *name) 807int reiserfs_removexattr(struct dentry *dentry, const char *name)
793{ 808{
794 const struct xattr_handler *handler; 809 const struct xattr_handler *handler;
810
795 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 811 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
796 812
797 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 813 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -813,9 +829,11 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
813{ 829{
814 struct listxattr_buf *b = (struct listxattr_buf *)buf; 830 struct listxattr_buf *b = (struct listxattr_buf *)buf;
815 size_t size; 831 size_t size;
832
816 if (name[0] != '.' || 833 if (name[0] != '.' ||
817 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 834 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
818 const struct xattr_handler *handler; 835 const struct xattr_handler *handler;
836
819 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 837 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
820 name); 838 name);
821 if (!handler) /* Unsupported xattr name */ 839 if (!handler) /* Unsupported xattr name */
@@ -885,6 +903,7 @@ static int create_privroot(struct dentry *dentry)
885{ 903{
886 int err; 904 int err;
887 struct inode *inode = dentry->d_parent->d_inode; 905 struct inode *inode = dentry->d_parent->d_inode;
906
888 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); 907 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
889 908
890 err = xattr_mkdir(inode, dentry, 0700); 909 err = xattr_mkdir(inode, dentry, 0700);
@@ -1015,6 +1034,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1015 mutex_lock(&privroot->d_inode->i_mutex); 1034 mutex_lock(&privroot->d_inode->i_mutex);
1016 if (!REISERFS_SB(s)->xattr_root) { 1035 if (!REISERFS_SB(s)->xattr_root) {
1017 struct dentry *dentry; 1036 struct dentry *dentry;
1037
1018 dentry = lookup_one_len(XAROOT_NAME, privroot, 1038 dentry = lookup_one_len(XAROOT_NAME, privroot,
1019 strlen(XAROOT_NAME)); 1039 strlen(XAROOT_NAME));
1020 if (!IS_ERR(dentry)) 1040 if (!IS_ERR(dentry))
diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h
index 857ec7e3016f..f620e9678dd5 100644
--- a/fs/reiserfs/xattr.h
+++ b/fs/reiserfs/xattr.h
@@ -7,7 +7,6 @@ struct inode;
7struct dentry; 7struct dentry;
8struct iattr; 8struct iattr;
9struct super_block; 9struct super_block;
10struct nameidata;
11 10
12int reiserfs_xattr_register_handlers(void) __init; 11int reiserfs_xattr_register_handlers(void) __init;
13void reiserfs_xattr_unregister_handlers(void); 12void reiserfs_xattr_unregister_handlers(void);
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 44503e293790..4b34b9dc03dd 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -9,7 +9,7 @@
9#include <linux/posix_acl_xattr.h> 9#include <linux/posix_acl_xattr.h>
10#include "xattr.h" 10#include "xattr.h"
11#include "acl.h" 11#include "acl.h"
12#include <asm/uaccess.h> 12#include <linux/uaccess.h>
13 13
14static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, 14static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th,
15 struct inode *inode, int type, 15 struct inode *inode, int type,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 800a3cef6f62..e7f8939a4cb5 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -6,7 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include "xattr.h" 7#include "xattr.h"
8#include <linux/security.h> 8#include <linux/security.h>
9#include <asm/uaccess.h> 9#include <linux/uaccess.h>
10 10
11static int 11static int
12security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, 12security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a0035719f66b..5eeb0c48ba46 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -5,7 +5,7 @@
5#include <linux/pagemap.h> 5#include <linux/pagemap.h>
6#include <linux/xattr.h> 6#include <linux/xattr.h>
7#include "xattr.h" 7#include "xattr.h"
8#include <asm/uaccess.h> 8#include <linux/uaccess.h>
9 9
10static int 10static int
11trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, 11trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 8667491ae7c3..e50eab046471 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -4,7 +4,7 @@
4#include <linux/pagemap.h> 4#include <linux/pagemap.h>
5#include <linux/xattr.h> 5#include <linux/xattr.h>
6#include "xattr.h" 6#include "xattr.h"
7#include <asm/uaccess.h> 7#include <linux/uaccess.h>
8 8
9static int 9static int
10user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, 10user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ef90e8bca95a..e98dd88197d5 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -56,6 +56,8 @@
56 * 2 of the Licence, or (at your option) any later version. 56 * 2 of the Licence, or (at your option) any later version.
57 */ 57 */
58 58
59#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
60
59#include <linux/module.h> 61#include <linux/module.h>
60#include <linux/string.h> 62#include <linux/string.h>
61#include <linux/fs.h> 63#include <linux/fs.h>
@@ -380,7 +382,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
380eio: 382eio:
381 ret = -EIO; 383 ret = -EIO;
382error: 384error:
383 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); 385 pr_err("read error for inode 0x%lx\n", pos);
384 return ERR_PTR(ret); 386 return ERR_PTR(ret);
385} 387}
386 388
@@ -390,6 +392,7 @@ error:
390static struct inode *romfs_alloc_inode(struct super_block *sb) 392static struct inode *romfs_alloc_inode(struct super_block *sb)
391{ 393{
392 struct romfs_inode_info *inode; 394 struct romfs_inode_info *inode;
395
393 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); 396 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
394 return inode ? &inode->vfs_inode : NULL; 397 return inode ? &inode->vfs_inode : NULL;
395} 398}
@@ -400,6 +403,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb)
400static void romfs_i_callback(struct rcu_head *head) 403static void romfs_i_callback(struct rcu_head *head)
401{ 404{
402 struct inode *inode = container_of(head, struct inode, i_rcu); 405 struct inode *inode = container_of(head, struct inode, i_rcu);
406
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); 407 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404} 408}
405 409
@@ -507,15 +511,13 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
507 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || 511 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
508 img_size < ROMFH_SIZE) { 512 img_size < ROMFH_SIZE) {
509 if (!silent) 513 if (!silent)
510 printk(KERN_WARNING "VFS:" 514 pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n",
511 " Can't find a romfs filesystem on dev %s.\n",
512 sb->s_id); 515 sb->s_id);
513 goto error_rsb_inval; 516 goto error_rsb_inval;
514 } 517 }
515 518
516 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { 519 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
517 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", 520 pr_err("bad initial checksum on dev %s.\n", sb->s_id);
518 sb->s_id);
519 goto error_rsb_inval; 521 goto error_rsb_inval;
520 } 522 }
521 523
@@ -523,8 +525,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent)
 
     len = strnlen(rsb->name, ROMFS_MAXFN);
     if (!silent)
-        printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
-               (unsigned) len, (unsigned) len, rsb->name, storage);
+        pr_notice("Mounting image '%*.*s' through %s\n",
+              (unsigned) len, (unsigned) len, rsb->name, storage);
 
     kfree(rsb);
     rsb = NULL;
@@ -614,7 +616,7 @@ static int __init init_romfs_fs(void)
 {
     int ret;
 
-    printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
+    pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n");
 
     romfs_inode_cachep =
         kmem_cache_create("romfs_i",
@@ -623,13 +625,12 @@ static int __init init_romfs_fs(void)
                   romfs_i_init_once);
 
     if (!romfs_inode_cachep) {
-        printk(KERN_ERR
-               "ROMFS error: Failed to initialise inode cache\n");
+        pr_err("Failed to initialise inode cache\n");
         return -ENOMEM;
     }
     ret = register_filesystem(&romfs_fs_type);
     if (ret) {
-        printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
+        pr_err("Failed to register filesystem\n");
         goto error_register;
     }
     return 0;
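
Note on the conversion above: pr_err() and friends expand pr_fmt(fmt) in front of every format string at compile time, which is why the open-coded "ROMFS:" prefixes can be dropped once pr_fmt is defined before the includes. A minimal userspace sketch of the idiom (printf stands in for printk, and the hard-coded "romfs" stands in for KBUILD_MODNAME):

    #include <stdio.h>

    /* Userspace model of the kernel's pr_fmt prefixing. */
    #define pr_fmt(fmt) "romfs" ": " fmt
    #define pr_err(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

    int main(void)
    {
        /* Prints "romfs: read error for inode 0x2a" */
        pr_err("read error for inode 0x%lx\n", 0x2aUL);
        return 0;
    }

Because the prefix is pasted by the preprocessor, defining pr_fmt once at the top of the file retags every message in it.
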
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 62a0de6632e1..43e7a7eddac0 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
 
     pages = end_index - start_index + 1;
 
-    page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
+    page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
     if (page == NULL)
         return res;
 
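
The kmalloc_array() call above is the overflow-safe form of kmalloc(n * size): the multiplication is checked before anything is allocated. A userspace sketch of the check it performs (alloc_array() is a made-up stand-in; the real helper also takes GFP flags):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void *alloc_array(size_t n, size_t size)
    {
        if (size != 0 && n > SIZE_MAX / size)
            return NULL;    /* n * size would wrap around */
        return malloc(n * size);
    }

    int main(void)
    {
        void **pages = alloc_array(16, sizeof(void *));

        if (!pages)
            return 1;
        printf("allocated %zu bytes\n", 16 * sizeof(void *));
        free(pages);
        return 0;
    }
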
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 031c8d67fd51..5056babe00df 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -27,6 +27,8 @@
  * the filesystem.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/slab.h>
@@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void)
         return err;
     }
 
-    printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) "
-        "Phillip Lougher\n");
+    pr_info("version 4.0 (2009/01/31) Phillip Lougher\n");
 
     return 0;
 }
diff --git a/fs/stack.c b/fs/stack.c
index 5b5388250e29..a54e33ed10f1 100644
--- a/fs/stack.c
+++ b/fs/stack.c
@@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src)
  * include/linux/fs.h). We don't necessarily hold i_mutex when this
  * is called, so take i_lock for that case.
  *
- * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the
+ * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the
  * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock
  * for that case too, and do both at once by combining the tests.
  *
diff --git a/fs/super.c b/fs/super.c
index d20d5b11dedf..eae088f6aaae 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,7 +22,6 @@
 
 #include <linux/export.h>
 #include <linux/slab.h>
-#include <linux/acct.h>
 #include <linux/blkdev.h>
 #include <linux/mount.h>
 #include <linux/security.h>
@@ -81,6 +80,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
     inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
     dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
     total_objects = dentries + inodes + fs_objects + 1;
+    if (!total_objects)
+        total_objects = 1;
 
     /* proportion the scan between the caches */
     dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
@@ -176,7 +177,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
         goto fail;
 
     for (i = 0; i < SB_FREEZE_LEVELS; i++) {
-        if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0)
+        if (percpu_counter_init(&s->s_writers.counter[i], 0,
+                    GFP_KERNEL) < 0)
             goto fail;
         lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
                  &type->s_writers_key[i], 0);
@@ -218,7 +220,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
     lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
     mutex_init(&s->s_dquot.dqio_mutex);
     mutex_init(&s->s_dquot.dqonoff_mutex);
-    init_rwsem(&s->s_dquot.dqptr_sem);
     s->s_maxbytes = MAX_NON_LFS;
     s->s_op = &default_op;
     s->s_time_gran = 1000000000;
@@ -702,12 +703,22 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
         return -EACCES;
 #endif
 
-    if (flags & MS_RDONLY)
-        acct_auto_close(sb);
-    shrink_dcache_sb(sb);
-
     remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
 
+    if (remount_ro) {
+        if (sb->s_pins.first) {
+            up_write(&sb->s_umount);
+            sb_pin_kill(sb);
+            down_write(&sb->s_umount);
+            if (!sb->s_root)
+                return 0;
+            if (sb->s_writers.frozen != SB_UNFROZEN)
+                return -EBUSY;
+            remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
+        }
+    }
+    shrink_dcache_sb(sb);
+
     /* If we are remounting RDONLY and current sb is read/write,
        make sure there are no rw files opened */
     if (remount_ro) {
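
Two of the fs/super.c hunks above interact: super_cache_scan() splits sc->nr_to_scan between the dentry and inode LRUs in proportion to their counts, and the new clamp keeps the mult_frac() denominator nonzero. A rough userspace model of that split (mult_frac is simplified here to a plain multiply-then-divide; the kernel macro is written to avoid intermediate overflow):

    #include <stdio.h>

    #define mult_frac(x, numer, denom) ((x) * (numer) / (denom))

    int main(void)
    {
        unsigned long dentries = 300, inodes = 100, fs_objects = 0;
        unsigned long nr_to_scan = 128;
        unsigned long total = dentries + inodes + fs_objects + 1;

        if (!total)         /* the guard added by the patch */
            total = 1;

        printf("scan %lu dentries, %lu inodes\n",
               mult_frac(nr_to_scan, dentries, total),
               mult_frac(nr_to_scan, inodes, total));
        return 0;
    }
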
diff --git a/fs/sync.c b/fs/sync.c
index b28d1dd10e8b..bdc729d80e5e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -65,7 +65,7 @@ int sync_filesystem(struct super_block *sb)
         return ret;
     return __sync_filesystem(sb, 1);
 }
-EXPORT_SYMBOL_GPL(sync_filesystem);
+EXPORT_SYMBOL(sync_filesystem);
 
 static void sync_inodes_one_sb(struct super_block *sb, void *arg)
 {
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 0013142c0475..b46ffa94372a 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -35,8 +35,9 @@ struct timerfd_ctx {
     ktime_t moffs;
     wait_queue_head_t wqh;
     u64 ticks;
-    int expired;
     int clockid;
+    short unsigned expired;
+    short unsigned settime_flags;    /* to show in fdinfo */
     struct rcu_head rcu;
     struct list_head clist;
     bool might_cancel;
@@ -92,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm,
  */
 void timerfd_clock_was_set(void)
 {
-    ktime_t moffs = ktime_get_monotonic_offset();
+    ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
     struct timerfd_ctx *ctx;
     unsigned long flags;
 
@@ -125,7 +126,7 @@ static bool timerfd_canceled(struct timerfd_ctx *ctx)
 {
     if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX)
         return false;
-    ctx->moffs = ktime_get_monotonic_offset();
+    ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
     return true;
 }
 
@@ -196,6 +197,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags,
         if (timerfd_canceled(ctx))
             return -ECANCELED;
     }
+
+    ctx->settime_flags = flags & TFD_SETTIME_FLAGS;
     return 0;
 }
 
@@ -284,11 +287,76 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count,
     return res;
 }
 
+#ifdef CONFIG_PROC_FS
+static int timerfd_show(struct seq_file *m, struct file *file)
+{
+    struct timerfd_ctx *ctx = file->private_data;
+    struct itimerspec t;
+
+    spin_lock_irq(&ctx->wqh.lock);
+    t.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
+    t.it_interval = ktime_to_timespec(ctx->tintv);
+    spin_unlock_irq(&ctx->wqh.lock);
+
+    return seq_printf(m,
+              "clockid: %d\n"
+              "ticks: %llu\n"
+              "settime flags: 0%o\n"
+              "it_value: (%llu, %llu)\n"
+              "it_interval: (%llu, %llu)\n",
+              ctx->clockid, (unsigned long long)ctx->ticks,
+              ctx->settime_flags,
+              (unsigned long long)t.it_value.tv_sec,
+              (unsigned long long)t.it_value.tv_nsec,
+              (unsigned long long)t.it_interval.tv_sec,
+              (unsigned long long)t.it_interval.tv_nsec);
+}
+#else
+#define timerfd_show NULL
+#endif
+
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+    struct timerfd_ctx *ctx = file->private_data;
+    int ret = 0;
+
+    switch (cmd) {
+    case TFD_IOC_SET_TICKS: {
+        u64 ticks;
+
+        if (copy_from_user(&ticks, (u64 __user *)arg, sizeof(ticks)))
+            return -EFAULT;
+        if (!ticks)
+            return -EINVAL;
+
+        spin_lock_irq(&ctx->wqh.lock);
+        if (!timerfd_canceled(ctx)) {
+            ctx->ticks = ticks;
+            wake_up_locked(&ctx->wqh);
+        } else
+            ret = -ECANCELED;
+        spin_unlock_irq(&ctx->wqh.lock);
+        break;
+    }
+    default:
+        ret = -ENOTTY;
+        break;
+    }
+
+    return ret;
+}
+#else
+#define timerfd_ioctl NULL
+#endif
+
 static const struct file_operations timerfd_fops = {
     .release    = timerfd_release,
     .poll       = timerfd_poll,
     .read       = timerfd_read,
     .llseek     = noop_llseek,
+    .show_fdinfo    = timerfd_show,
+    .unlocked_ioctl = timerfd_ioctl,
 };
 
 static int timerfd_fget(int fd, struct fd *p)
@@ -336,7 +404,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
     else
         hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS);
 
-    ctx->moffs = ktime_get_monotonic_offset();
+    ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 });
 
     ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx,
                    O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
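
With .show_fdinfo wired up, the timer state above becomes visible through procfs. A small userspace demo that arms a timerfd and dumps its fdinfo; this assumes a kernel carrying this patch, and error handling is kept minimal:

    #include <stdio.h>
    #include <sys/timerfd.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = timerfd_create(CLOCK_MONOTONIC, 0);
        struct itimerspec its = {
            .it_interval = { .tv_sec = 1 },
            .it_value    = { .tv_sec = 5 },
        };
        char path[64], buf[256];
        FILE *f;

        if (fd < 0 || timerfd_settime(fd, 0, &its, NULL) < 0)
            return 1;

        /* New fields: clockid, ticks, settime flags, it_value, it_interval. */
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
        f = fopen(path, "r");
        if (!f)
            return 1;
        while (fgets(buf, sizeof(buf), f))
            fputs(buf, stdout);
        fclose(f);
        close(fd);
        return 0;
    }
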
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ff8229340cd5..26b69b2d4a45 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -166,15 +166,10 @@ static int do_commit(struct ubifs_info *c)
     err = ubifs_orphan_end_commit(c);
     if (err)
         goto out;
-    old_ltail_lnum = c->ltail_lnum;
-    err = ubifs_log_end_commit(c, new_ltail_lnum);
-    if (err)
-        goto out;
     err = dbg_check_old_index(c, &zroot);
     if (err)
         goto out;
 
-    mutex_lock(&c->mst_mutex);
     c->mst_node->cmt_no = cpu_to_le64(c->cmt_no);
     c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum);
     c->mst_node->root_lnum = cpu_to_le32(zroot.lnum);
@@ -203,8 +198,9 @@ static int do_commit(struct ubifs_info *c)
         c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS);
     else
         c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS);
-    err = ubifs_write_master(c);
-    mutex_unlock(&c->mst_mutex);
+
+    old_ltail_lnum = c->ltail_lnum;
+    err = ubifs_log_end_commit(c, new_ltail_lnum);
     if (err)
         goto out;
 
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 177b0152fef4..7ed13e1e216a 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -334,9 +334,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node)
         pr_err("\tkey_fmt %d (%s)\n",
                (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
         pr_err("\tflags %#x\n", sup_flags);
-        pr_err("\t big_lpt %u\n",
+        pr_err("\tbig_lpt %u\n",
                !!(sup_flags & UBIFS_FLG_BIGLPT));
-        pr_err("\t space_fixup %u\n",
+        pr_err("\tspace_fixup %u\n",
                !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
         pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size));
         pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size));
@@ -2462,7 +2462,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write)
 
     if (chance(1, 2)) {
         d->pc_delay = 1;
-        /* Fail withing 1 minute */
+        /* Fail within 1 minute */
         delay = prandom_u32() % 60000;
         d->pc_timeout = jiffies;
         d->pc_timeout += msecs_to_jiffies(delay);
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 2290d5866725..fb08b0c514b6 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last)
 
 /**
  * wbuf_timer_callback - write-buffer timer callback function.
- * @data: timer data (write-buffer descriptor)
+ * @timer: timer data (write-buffer descriptor)
  *
  * This function is called when the write-buffer timer expires.
  */
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 0e045e75abd8..fb166e204441 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -546,15 +546,14 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
     int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
     int last_reference = !!(deletion && inode->i_nlink == 0);
     struct ubifs_inode *ui = ubifs_inode(inode);
-    struct ubifs_inode *dir_ui = ubifs_inode(dir);
+    struct ubifs_inode *host_ui = ubifs_inode(dir);
     struct ubifs_dent_node *dent;
     struct ubifs_ino_node *ino;
     union ubifs_key dent_key, ino_key;
 
     dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu",
         inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino);
-    ubifs_assert(dir_ui->data_len == 0);
-    ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex));
+    ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));
 
     dlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
     ilen = UBIFS_INO_NODE_SZ;
@@ -658,7 +657,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
     ui->synced_i_size = ui->ui_size;
     spin_unlock(&ui->ui_lock);
     mark_inode_clean(c, ui);
-    mark_inode_clean(c, dir_ui);
+    mark_inode_clean(c, host_ui);
     return 0;
 
 out_finish:
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index a902c5919e42..c14628fbeee2 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -106,10 +106,14 @@ static inline long long empty_log_bytes(const struct ubifs_info *c)
     h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
     t = (long long)c->ltail_lnum * c->leb_size;
 
-    if (h >= t)
+    if (h > t)
         return c->log_bytes - h + t;
-    else
+    else if (h != t)
         return t - h;
+    else if (c->lhead_lnum != c->ltail_lnum)
+        return 0;
+    else
+        return c->log_bytes;
 }
 
 /**
@@ -240,6 +244,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
 
     if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
         c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+        ubifs_assert(c->lhead_lnum != c->ltail_lnum);
         c->lhead_offs = 0;
     }
 
@@ -404,15 +409,14 @@ int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
     /* Switch to the next log LEB */
     if (c->lhead_offs) {
         c->lhead_lnum = ubifs_next_log_lnum(c, c->lhead_lnum);
+        ubifs_assert(c->lhead_lnum != c->ltail_lnum);
         c->lhead_offs = 0;
     }
 
-    if (c->lhead_offs == 0) {
-        /* Must ensure next LEB has been unmapped */
-        err = ubifs_leb_unmap(c, c->lhead_lnum);
-        if (err)
-            goto out;
-    }
+    /* Must ensure next LEB has been unmapped */
+    err = ubifs_leb_unmap(c, c->lhead_lnum);
+    if (err)
+        goto out;
 
     len = ALIGN(len, c->min_io_size);
     dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
@@ -447,9 +451,9 @@ out:
  * @ltail_lnum: new log tail LEB number
  *
  * This function is called on when the commit operation was finished. It
- * moves log tail to new position and unmaps LEBs which contain obsolete data.
- * Returns zero in case of success and a negative error code in case of
- * failure.
+ * moves log tail to new position and updates the master node so that it stores
+ * the new log tail LEB number. Returns zero in case of success and a negative
+ * error code in case of failure.
  */
 int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
 {
@@ -477,7 +481,12 @@ int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
     spin_unlock(&c->buds_lock);
 
     err = dbg_check_bud_bytes(c);
+    if (err)
+        goto out;
+
+    err = ubifs_write_master(c);
 
+out:
     mutex_unlock(&c->log_mutex);
     return err;
 }
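
The empty_log_bytes() change above fixes an ambiguity in the circular log accounting: with head and tail expressed as byte offsets, h == t can mean either "empty" or "completely full", so the patch disambiguates using the LEB numbers. A standalone sketch of the fixed logic (the numbers in main() are made up for illustration):

    #include <stdio.h>

    static long long empty_log_bytes(long long h, long long t,
                                     int lhead_lnum, int ltail_lnum,
                                     long long log_bytes)
    {
        if (h > t)
            return log_bytes - h + t;
        else if (h != t)
            return t - h;
        else if (lhead_lnum != ltail_lnum)
            return 0;               /* same offset, different LEBs: full */
        else
            return log_bytes;       /* head meets tail in one LEB: empty */
    }

    int main(void)
    {
        printf("%lld\n", empty_log_bytes(4096, 4096, 3, 3, 65536));
        return 0;
    }
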
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index d46b19ec1815..421bd0a80424 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1464,7 +1464,6 @@ struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
         return ERR_CAST(nnode);
     }
     iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-    shft -= UBIFS_LPT_FANOUT_SHIFT;
     pnode = ubifs_get_pnode(c, nnode, iip);
     if (IS_ERR(pnode))
         return ERR_CAST(pnode);
@@ -1604,7 +1603,6 @@ struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
         return ERR_CAST(nnode);
     }
     iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-    shft -= UBIFS_LPT_FANOUT_SHIFT;
     pnode = ubifs_get_pnode(c, nnode, iip);
     if (IS_ERR(pnode))
         return ERR_CAST(pnode);
@@ -1964,7 +1962,6 @@ again:
         }
     }
     iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
-    shft -= UBIFS_LPT_FANOUT_SHIFT;
     pnode = scan_get_pnode(c, path + h, nnode, iip);
     if (IS_ERR(pnode)) {
         err = PTR_ERR(pnode);
@@ -2198,6 +2195,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
                   lprops->dirty);
             return -EINVAL;
         }
+        break;
     case LPROPS_FREEABLE:
     case LPROPS_FRDI_IDX:
         if (lprops->free + lprops->dirty != c->leb_size) {
@@ -2206,6 +2204,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
                   lprops->dirty);
             return -EINVAL;
         }
+        break;
     }
     }
     return 0;
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 45d4e96a6bac..d9c02928e992 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -304,7 +304,6 @@ static int layout_cnodes(struct ubifs_info *c)
             ubifs_assert(lnum >= c->lpt_first &&
                      lnum <= c->lpt_last);
         }
-        done_ltab = 1;
         c->ltab_lnum = lnum;
         c->ltab_offs = offs;
         offs += c->ltab_sz;
@@ -514,7 +513,6 @@ static int write_cnodes(struct ubifs_info *c)
             if (err)
                 return err;
         }
-        done_ltab = 1;
         ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
         offs += c->ltab_sz;
         dbg_chk_lpt_sz(c, 1, c->ltab_sz);
@@ -1941,6 +1939,11 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
             pr_err("LEB %d:%d, nnode, ",
                    lnum, offs);
             err = ubifs_unpack_nnode(c, p, &nnode);
+            if (err) {
+                pr_err("failed to unpack_node, error %d\n",
+                       err);
+                break;
+            }
             for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
                 pr_cont("%d:%d", nnode.nbranch[i].lnum,
                     nnode.nbranch[i].offs);
diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c
index ab83ace9910a..1a4bb9e8b3b8 100644
--- a/fs/ubifs/master.c
+++ b/fs/ubifs/master.c
@@ -352,10 +352,9 @@ int ubifs_read_master(struct ubifs_info *c)
  * ubifs_write_master - write master node.
  * @c: UBIFS file-system description object
  *
- * This function writes the master node. The caller has to take the
- * @c->mst_mutex lock before calling this function. Returns zero in case of
- * success and a negative error code in case of failure. The master node is
- * written twice to enable recovery.
+ * This function writes the master node. Returns zero in case of success and a
+ * negative error code in case of failure. The master node is written twice to
+ * enable recovery.
  */
 int ubifs_write_master(struct ubifs_info *c)
 {
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index f1c3e5a1b315..4409f486ecef 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -346,7 +346,6 @@ static int write_orph_nodes(struct ubifs_info *c, int atomic)
     int lnum;
 
     /* Unmap any unused LEBs after consolidation */
-    lnum = c->ohead_lnum + 1;
     for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) {
         err = ubifs_leb_unmap(c, lnum);
         if (err)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index c14adb2f420c..c640938f62f0 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs)
  * drop_last_node - drop the last node.
  * @sleb: scanned LEB information
  * @offs: offset of dropped nodes is returned here
- * @grouped: non-zero if whole group of nodes have to be dropped
  *
  * This is a helper function for 'ubifs_recover_leb()' which drops the last
  * node of the scanned LEB.
@@ -629,8 +628,8 @@ static void drop_last_node(struct ubifs_scan_leb *sleb, int *offs)
  *
  * This function does a scan of a LEB, but caters for errors that might have
  * been caused by the unclean unmount from which we are attempting to recover.
- * Returns %0 in case of success, %-EUCLEAN if an unrecoverable corruption is
- * found, and a negative error code in case of failure.
+ * Returns the scanned information on success and a negative error code on
+ * failure.
  */
 struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
                      int offs, void *sbuf, int jhead)
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 4c37607a958e..79c6dbbc0e04 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -332,6 +332,8 @@ static int create_default_filesystem(struct ubifs_info *c)
     cs->ch.node_type = UBIFS_CS_NODE;
     err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
     kfree(cs);
+    if (err)
+        return err;
 
     ubifs_msg("default file-system created");
     return 0;
@@ -447,7 +449,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup)
         goto failed;
     }
 
-    if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
+    if (c->default_compr >= UBIFS_COMPR_TYPES_CNT) {
         err = 13;
         goto failed;
     }
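
The validate_sb() change above (and the matching one in fs/ubifs/super.c below) drops the "< 0" half of a range check because the field is unsigned, so that comparison is always false and only triggers compiler warnings. A tiny illustration with a hypothetical unsigned field:

    #include <stdio.h>

    #define COMPR_TYPES_CNT 3

    int main(void)
    {
        unsigned int compr_type = (unsigned int)-1;  /* corrupted value */

        /* "compr_type < 0" would be compiled away; the upper-bound
         * check alone still catches it, since -1 wraps to UINT_MAX. */
        if (compr_type >= COMPR_TYPES_CNT)
            printf("unknown compression type %u\n", compr_type);
        return 0;
    }
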
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 58aa05df2bb6..89adbc4d08ac 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -131,7 +131,8 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum,
  * @offs: offset to start at (usually zero)
  * @sbuf: scan buffer (must be c->leb_size)
  *
- * This function returns %0 on success and a negative error code on failure.
+ * This function returns the scanned information on success and a negative error
+ * code on failure.
  */
 struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
                     int offs, void *sbuf)
@@ -157,9 +158,10 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
         return ERR_PTR(err);
     }
 
-    if (err == -EBADMSG)
-        sleb->ecc = 1;
-
+    /*
+     * Note, we ignore integrity errors (EBASMSG) because all the nodes are
+     * protected by CRC checksums.
+     */
     return sleb;
 }
 
@@ -169,8 +171,6 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum,
  * @sleb: scanning information
  * @lnum: logical eraseblock number
  * @offs: offset to start at (usually zero)
- *
- * This function returns %0 on success and a negative error code on failure.
  */
 void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb,
             int lnum, int offs)
@@ -257,7 +257,7 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs,
  * @quiet: print no messages
  *
  * This function scans LEB number @lnum and returns complete information about
- * its contents. Returns the scaned information in case of success and,
+ * its contents. Returns the scanned information in case of success and,
  * %-EUCLEAN if the LEB neads recovery, and other negative error codes in case
  * of failure.
  *
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3904c8574ef9..106bf20629ce 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -75,7 +75,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode)
         return 1;
     }
 
-    if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
+    if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) {
         ubifs_err("unknown compression type %d", ui->compr_type);
         return 2;
     }
@@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root)
     struct ubifs_info *c = root->d_sb->s_fs_info;
 
     if (c->mount_opts.unmount_mode == 2)
-        seq_printf(s, ",fast_unmount");
+        seq_puts(s, ",fast_unmount");
     else if (c->mount_opts.unmount_mode == 1)
-        seq_printf(s, ",norm_unmount");
+        seq_puts(s, ",norm_unmount");
 
     if (c->mount_opts.bulk_read == 2)
-        seq_printf(s, ",bulk_read");
+        seq_puts(s, ",bulk_read");
     else if (c->mount_opts.bulk_read == 1)
-        seq_printf(s, ",no_bulk_read");
+        seq_puts(s, ",no_bulk_read");
 
     if (c->mount_opts.chk_data_crc == 2)
-        seq_printf(s, ",chk_data_crc");
+        seq_puts(s, ",chk_data_crc");
     else if (c->mount_opts.chk_data_crc == 1)
-        seq_printf(s, ",no_chk_data_crc");
+        seq_puts(s, ",no_chk_data_crc");
 
     if (c->mount_opts.override_compr) {
         seq_printf(s, ",compr=%s",
@@ -796,8 +796,8 @@ static int alloc_wbufs(struct ubifs_info *c)
 {
     int i, err;
 
-    c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead),
-                GFP_KERNEL);
+    c->jheads = kcalloc(c->jhead_cnt, sizeof(struct ubifs_jhead),
+                GFP_KERNEL);
     if (!c->jheads)
         return -ENOMEM;
 
@@ -1963,7 +1963,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
     mutex_init(&c->lp_mutex);
     mutex_init(&c->tnc_mutex);
     mutex_init(&c->log_mutex);
-    mutex_init(&c->mst_mutex);
     mutex_init(&c->umount_mutex);
     mutex_init(&c->bu_mutex);
     mutex_init(&c->write_reserve_mutex);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 8a40cf9c02d7..6793db0754f6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3294,7 +3294,6 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
         goto out_unlock;
 
     if (err) {
-        err = -EINVAL;
         key = &from_key;
         goto out_dump;
     }
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 3600994f8411..7a205e046776 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -389,7 +389,6 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
             ubifs_dump_lprops(c);
         }
         /* Try to commit anyway */
-        err = 0;
         break;
     }
     p++;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c1f71fe17cc0..c4fe900c67ab 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -314,7 +314,6 @@ struct ubifs_scan_node {
  * @nodes_cnt: number of nodes scanned
  * @nodes: list of struct ubifs_scan_node
  * @endpt: end point (and therefore the start of empty space)
- * @ecc: read returned -EBADMSG
  * @buf: buffer containing entire LEB scanned
  */
 struct ubifs_scan_leb {
@@ -322,7 +321,6 @@ struct ubifs_scan_leb {
     int nodes_cnt;
     struct list_head nodes;
     int endpt;
-    int ecc;
     void *buf;
 };
 
@@ -1051,7 +1049,6 @@ struct ubifs_debug_info;
  *
  * @mst_node: master node
  * @mst_offs: offset of valid master node
- * @mst_mutex: protects the master node area, @mst_node, and @mst_offs
  *
  * @max_bu_buf_len: maximum bulk-read buffer length
  * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
@@ -1292,7 +1289,6 @@ struct ubifs_info {
 
     struct ubifs_mst_node *mst_node;
     int mst_offs;
-    struct mutex mst_mutex;
 
     int max_bu_buf_len;
     struct mutex bu_mutex;
diff --git a/fs/udf/file.c b/fs/udf/file.c
index d80738fdf424..bb15771b92ae 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,7 @@
 
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/string.h>        /* memset */
 #include <linux/capability.h>
@@ -100,24 +100,6 @@ static int udf_adinicb_write_begin(struct file *file,
     return 0;
 }
 
-static int udf_adinicb_write_end(struct file *file,
-            struct address_space *mapping,
-            loff_t pos, unsigned len, unsigned copied,
-            struct page *page, void *fsdata)
-{
-    struct inode *inode = mapping->host;
-    unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-    char *kaddr;
-    struct udf_inode_info *iinfo = UDF_I(inode);
-
-    kaddr = kmap_atomic(page);
-    memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset,
-        kaddr + offset, copied);
-    kunmap_atomic(kaddr);
-
-    return simple_write_end(file, mapping, pos, len, copied, page, fsdata);
-}
-
 static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb,
                      struct iov_iter *iter,
                      loff_t offset)
@@ -130,7 +112,7 @@ const struct address_space_operations udf_adinicb_aops = {
     .readpage   = udf_adinicb_readpage,
     .writepage  = udf_adinicb_writepage,
     .write_begin    = udf_adinicb_write_begin,
-    .write_end  = udf_adinicb_write_end,
+    .write_end  = simple_write_end,
     .direct_IO  = udf_adinicb_direct_IO,
 };
 
@@ -241,11 +223,18 @@ out:
 
 static int udf_release_file(struct inode *inode, struct file *filp)
 {
-    if (filp->f_mode & FMODE_WRITE) {
+    if (filp->f_mode & FMODE_WRITE &&
+        atomic_read(&inode->i_writecount) > 1) {
+        /*
+         * Grab i_mutex to avoid races with writes changing i_size
+         * while we are running.
+         */
+        mutex_lock(&inode->i_mutex);
         down_write(&UDF_I(inode)->i_data_sem);
         udf_discard_prealloc(inode);
         udf_truncate_tail_extent(inode);
         up_write(&UDF_I(inode)->i_data_sem);
+        mutex_unlock(&inode->i_mutex);
     }
     return 0;
 }
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6eaf5edf1ea1..e77db621ec89 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode)
     udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
-struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 {
     struct super_block *sb = dir->i_sb;
     struct udf_sb_info *sbi = UDF_SB(sb);
@@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
     struct udf_inode_info *iinfo;
     struct udf_inode_info *dinfo = UDF_I(dir);
     struct logicalVolIntegrityDescImpUse *lvidiu;
+    int err;
 
     inode = new_inode(sb);
 
-    if (!inode) {
-        *err = -ENOMEM;
-        return NULL;
-    }
-    *err = -ENOSPC;
+    if (!inode)
+        return ERR_PTR(-ENOMEM);
 
     iinfo = UDF_I(inode);
     if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
@@ -80,21 +78,22 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
     }
     if (!iinfo->i_ext.i_data) {
         iput(inode);
-        *err = -ENOMEM;
-        return NULL;
+        return ERR_PTR(-ENOMEM);
     }
 
+    err = -ENOSPC;
     block = udf_new_block(dir->i_sb, NULL,
                   dinfo->i_location.partitionReferenceNum,
-                  start, err);
-    if (*err) {
+                  start, &err);
+    if (err) {
         iput(inode);
-        return NULL;
+        return ERR_PTR(err);
     }
 
     lvidiu = udf_sb_lvidiu(sb);
     if (lvidiu) {
         iinfo->i_unique = lvid_get_unique_id(sb);
+        inode->i_generation = iinfo->i_unique;
         mutex_lock(&sbi->s_alloc_mutex);
         if (S_ISDIR(mode))
             le32_add_cpu(&lvidiu->numDirs, 1);
@@ -123,9 +122,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
     iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
     inode->i_mtime = inode->i_atime = inode->i_ctime =
         iinfo->i_crtime = current_fs_time(inode->i_sb);
-    insert_inode_hash(inode);
+    if (unlikely(insert_inode_locked(inode) < 0)) {
+        make_bad_inode(inode);
+        iput(inode);
+        return ERR_PTR(-EIO);
+    }
     mark_inode_dirty(inode);
 
-    *err = 0;
     return inode;
 }
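
udf_new_inode() now reports failure through the returned pointer itself rather than a separate int *err argument, following the kernel's ERR_PTR convention. A minimal userspace reimplementation of the helpers to show how callers decode such a pointer (new_object() is a made-up stand-in):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        /* Errors live in the top 4095 values of the address space. */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    static void *new_object(int fail)
    {
        if (fail)
            return ERR_PTR(-ENOSPC);
        return malloc(16);
    }

    int main(void)
    {
        void *obj = new_object(1);

        if (IS_ERR(obj)) {
            printf("allocation failed: %ld\n", PTR_ERR(obj));
            return 1;
        }
        free(obj);
        return 0;
    }
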
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 236cd48184c2..c9b4df5810d5 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -51,7 +51,6 @@ MODULE_LICENSE("GPL");
 
 static umode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
-static void udf_fill_inode(struct inode *, struct buffer_head *);
 static int udf_sync_inode(struct inode *inode);
 static int udf_alloc_i_data(struct inode *inode, size_t size);
 static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
@@ -1271,12 +1270,33 @@ update_time:
     return 0;
 }
 
-static void __udf_read_inode(struct inode *inode)
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
+static int udf_read_inode(struct inode *inode, bool hidden_inode)
 {
     struct buffer_head *bh = NULL;
     struct fileEntry *fe;
+    struct extendedFileEntry *efe;
     uint16_t ident;
     struct udf_inode_info *iinfo = UDF_I(inode);
+    struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+    struct kernel_lb_addr *iloc = &iinfo->i_location;
+    unsigned int link_count;
+    unsigned int indirections = 0;
+    int ret = -EIO;
+
+reread:
+    if (iloc->logicalBlockNum >=
+        sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+        udf_debug("block=%d, partition=%d out of range\n",
+              iloc->logicalBlockNum, iloc->partitionReferenceNum);
+        return -EIO;
+    }
 
     /*
      * Set defaults, but the inode is still incomplete!
@@ -1290,78 +1310,54 @@ static void __udf_read_inode(struct inode *inode)
      * i_nlink = 1
      * i_op = NULL;
      */
-    bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
+    bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
     if (!bh) {
         udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
-        make_bad_inode(inode);
-        return;
+        return -EIO;
     }
 
     if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
         ident != TAG_IDENT_USE) {
         udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
             inode->i_ino, ident);
-        brelse(bh);
-        make_bad_inode(inode);
-        return;
+        goto out;
     }
 
     fe = (struct fileEntry *)bh->b_data;
+    efe = (struct extendedFileEntry *)bh->b_data;
 
     if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
         struct buffer_head *ibh;
 
-        ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
-                    &ident);
+        ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
         if (ident == TAG_IDENT_IE && ibh) {
-            struct buffer_head *nbh = NULL;
             struct kernel_lb_addr loc;
             struct indirectEntry *ie;
 
             ie = (struct indirectEntry *)ibh->b_data;
             loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
-            if (ie->indirectICB.extLength &&
-                (nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
-                            &ident))) {
-                if (ident == TAG_IDENT_FE ||
-                    ident == TAG_IDENT_EFE) {
-                    memcpy(&iinfo->i_location,
-                        &loc,
-                        sizeof(struct kernel_lb_addr));
-                    brelse(bh);
-                    brelse(ibh);
-                    brelse(nbh);
-                    __udf_read_inode(inode);
-                    return;
+            if (ie->indirectICB.extLength) {
+                brelse(ibh);
+                memcpy(&iinfo->i_location, &loc,
+                       sizeof(struct kernel_lb_addr));
+                if (++indirections > UDF_MAX_ICB_NESTING) {
+                    udf_err(inode->i_sb,
+                        "too many ICBs in ICB hierarchy"
+                        " (max %d supported)\n",
+                        UDF_MAX_ICB_NESTING);
+                    goto out;
                 }
-                brelse(nbh);
+                brelse(bh);
+                goto reread;
             }
         }
         brelse(ibh);
     } else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
         udf_err(inode->i_sb, "unsupported strategy type: %d\n",
             le16_to_cpu(fe->icbTag.strategyType));
-        brelse(bh);
-        make_bad_inode(inode);
-        return;
+        goto out;
     }
-    udf_fill_inode(inode, bh);
-
-    brelse(bh);
-}
-
-static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
-{
-    struct fileEntry *fe;
-    struct extendedFileEntry *efe;
-    struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
-    struct udf_inode_info *iinfo = UDF_I(inode);
-    unsigned int link_count;
-
-    fe = (struct fileEntry *)bh->b_data;
-    efe = (struct extendedFileEntry *)bh->b_data;
-
     if (fe->icbTag.strategyType == cpu_to_le16(4))
         iinfo->i_strat4096 = 0;
     else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
@@ -1378,11 +1374,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
     if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
         iinfo->i_efe = 1;
         iinfo->i_use = 0;
-        if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-                    sizeof(struct extendedFileEntry))) {
-            make_bad_inode(inode);
-            return;
-        }
+        ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+                       sizeof(struct extendedFileEntry));
+        if (ret)
+            goto out;
         memcpy(iinfo->i_ext.i_data,
                bh->b_data + sizeof(struct extendedFileEntry),
                inode->i_sb->s_blocksize -
@@ -1390,11 +1385,10 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
     } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
         iinfo->i_efe = 0;
         iinfo->i_use = 0;
-        if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-                     sizeof(struct fileEntry))) {
-            make_bad_inode(inode);
-            return;
-        }
+        ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+                       sizeof(struct fileEntry));
+        if (ret)
+            goto out;
         memcpy(iinfo->i_ext.i_data,
                bh->b_data + sizeof(struct fileEntry),
                inode->i_sb->s_blocksize - sizeof(struct fileEntry));
@@ -1404,18 +1398,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
         iinfo->i_lenAlloc = le32_to_cpu(
                 ((struct unallocSpaceEntry *)bh->b_data)->
                 lengthAllocDescs);
-        if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-                    sizeof(struct unallocSpaceEntry))) {
-            make_bad_inode(inode);
-            return;
-        }
+        ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+                       sizeof(struct unallocSpaceEntry));
+        if (ret)
+            goto out;
         memcpy(iinfo->i_ext.i_data,
                bh->b_data + sizeof(struct unallocSpaceEntry),
                inode->i_sb->s_blocksize -
               sizeof(struct unallocSpaceEntry));
-        return;
+        return 0;
     }
 
+    ret = -EIO;
     read_lock(&sbi->s_cred_lock);
     i_uid_write(inode, le32_to_cpu(fe->uid));
     if (!uid_valid(inode->i_uid) ||
@@ -1441,8 +1435,13 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
     read_unlock(&sbi->s_cred_lock);
 
     link_count = le16_to_cpu(fe->fileLinkCount);
-    if (!link_count)
+    if (!link_count) {
+        if (!hidden_inode) {
+            ret = -ESTALE;
+            goto out;
+        }
         link_count = 1;
+    }
     set_nlink(inode, link_count);
 
     inode->i_size = le64_to_cpu(fe->informationLength);
@@ -1488,6 +1487,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
         iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
         iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
     }
+    inode->i_generation = iinfo->i_unique;
 
     switch (fe->icbTag.fileType) {
     case ICBTAG_FILE_TYPE_DIRECTORY:
@@ -1537,8 +1537,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
     default:
         udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
             inode->i_ino, fe->icbTag.fileType);
-        make_bad_inode(inode);
-        return;
+        goto out;
     }
     if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
         struct deviceSpec *dsea =
@@ -1549,8 +1548,12 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
                       le32_to_cpu(dsea->minorDeviceIdent)));
         /* Developer ID ??? */
     } else
-        make_bad_inode(inode);
+        goto out;
     }
+    ret = 0;
+out:
+    brelse(bh);
+    return ret;
 }
 
 static int udf_alloc_i_data(struct inode *inode, size_t size)
@@ -1664,7 +1667,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
                FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
     fe->permissions = cpu_to_le32(udfperms);
 
-    if (S_ISDIR(inode->i_mode))
+    if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
         fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
     else
         fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
@@ -1826,36 +1829,28 @@ out:
     return err;
 }
 
-struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
+struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
+             bool hidden_inode)
 {
     unsigned long block = udf_get_lb_pblock(sb, ino, 0);
     struct inode *inode = iget_locked(sb, block);
+    int err;
 
     if (!inode)
-        return NULL;
-
-    if (inode->i_state & I_NEW) {
-        memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
-        __udf_read_inode(inode);
-        unlock_new_inode(inode);
-    }
+        return ERR_PTR(-ENOMEM);
 
-    if (is_bad_inode(inode))
-        goto out_iput;
+    if (!(inode->i_state & I_NEW))
+        return inode;
 
-    if (ino->logicalBlockNum >= UDF_SB(sb)->
-            s_partmaps[ino->partitionReferenceNum].s_partition_len) {
-        udf_debug("block=%d, partition=%d out of range\n",
-              ino->logicalBlockNum, ino->partitionReferenceNum);
-        make_bad_inode(inode);
-        goto out_iput;
-    }
+    memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+    err = udf_read_inode(inode, hidden_inode);
+    if (err < 0) {
+        iget_failed(inode);
+        return ERR_PTR(err);
+    }
+    unlock_new_inode(inode);
 
     return inode;
-
- out_iput:
-    iput(inode);
-    return NULL;
 }
 
 int udf_add_aext(struct inode *inode, struct extent_position *epos,
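
The udf_read_inode() rework above replaces one level of recursion per indirect ICB with a "goto reread" loop capped at UDF_MAX_ICB_NESTING, so a corrupted or hostile ICB chain can no longer recurse without bound. A simplified standalone model of that shape (struct icb is a made-up stand-in for the on-disk hierarchy):

    #include <stdio.h>

    #define MAX_NESTING 1024

    struct icb { struct icb *indirect; };  /* stand-in for an on-disk ICB */

    static int read_inode(struct icb *icb)
    {
        unsigned int indirections = 0;

    reread:
        if (icb->indirect) {
            if (++indirections > MAX_NESTING)
                return -1;          /* corrupted or hostile chain */
            icb = icb->indirect;
            goto reread;
        }
        return 0;                   /* reached the real file entry */
    }

    int main(void)
    {
        struct icb tail = { 0 }, head = { &tail };

        printf("%d\n", read_inode(&head));
        return 0;
    }
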
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 6583fe9b0645..6ad5a453af97 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -21,7 +21,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/cdrom.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "udf_sb.h"
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 9737cba1357d..c12e260fd6c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
270 NULL, 0), 270 NULL, 0),
271 }; 271 };
272 inode = udf_iget(dir->i_sb, lb); 272 inode = udf_iget(dir->i_sb, lb);
273 if (!inode) { 273 if (IS_ERR(inode))
274 return ERR_PTR(-EACCES); 274 return inode;
275 }
276 } else 275 } else
277#endif /* UDF_RECOVERY */ 276#endif /* UDF_RECOVERY */
278 277
@@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
285 284
286 loc = lelb_to_cpu(cfi.icb.extLocation); 285 loc = lelb_to_cpu(cfi.icb.extLocation);
287 inode = udf_iget(dir->i_sb, &loc); 286 inode = udf_iget(dir->i_sb, &loc);
288 if (!inode) { 287 if (IS_ERR(inode))
289 return ERR_PTR(-EACCES); 288 return ERR_CAST(inode);
290 }
291 } 289 }
292 290
293 return d_splice_alias(inode, dentry); 291 return d_splice_alias(inode, dentry);
@@ -550,32 +548,18 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
550 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); 548 return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
551} 549}
552 550
553static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, 551static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
554 bool excl)
555{ 552{
553 struct udf_inode_info *iinfo = UDF_I(inode);
554 struct inode *dir = dentry->d_parent->d_inode;
556 struct udf_fileident_bh fibh; 555 struct udf_fileident_bh fibh;
557 struct inode *inode;
558 struct fileIdentDesc cfi, *fi; 556 struct fileIdentDesc cfi, *fi;
559 int err; 557 int err;
560 struct udf_inode_info *iinfo;
561
562 inode = udf_new_inode(dir, mode, &err);
563 if (!inode) {
564 return err;
565 }
566
567 iinfo = UDF_I(inode);
568 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
569 inode->i_data.a_ops = &udf_adinicb_aops;
570 else
571 inode->i_data.a_ops = &udf_aops;
572 inode->i_op = &udf_file_inode_operations;
573 inode->i_fop = &udf_file_operations;
574 mark_inode_dirty(inode);
575 558
576 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 559 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
577 if (!fi) { 560 if (unlikely(!fi)) {
578 inode_dec_link_count(inode); 561 inode_dec_link_count(inode);
562 unlock_new_inode(inode);
579 iput(inode); 563 iput(inode);
580 return err; 564 return err;
581 } 565 }
@@ -589,23 +573,21 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
589 if (fibh.sbh != fibh.ebh) 573 if (fibh.sbh != fibh.ebh)
590 brelse(fibh.ebh); 574 brelse(fibh.ebh);
591 brelse(fibh.sbh); 575 brelse(fibh.sbh);
576 unlock_new_inode(inode);
592 d_instantiate(dentry, inode); 577 d_instantiate(dentry, inode);
593 578
594 return 0; 579 return 0;
595} 580}
596 581
597static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) 582static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
583 bool excl)
598{ 584{
599 struct inode *inode; 585 struct inode *inode = udf_new_inode(dir, mode);
600 struct udf_inode_info *iinfo;
601 int err;
602 586
603 inode = udf_new_inode(dir, mode, &err); 587 if (IS_ERR(inode))
604 if (!inode) 588 return PTR_ERR(inode);
605 return err;
606 589
607 iinfo = UDF_I(inode); 590 if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
608 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
609 inode->i_data.a_ops = &udf_adinicb_aops; 591 inode->i_data.a_ops = &udf_adinicb_aops;
610 else 592 else
611 inode->i_data.a_ops = &udf_aops; 593 inode->i_data.a_ops = &udf_aops;
@@ -613,7 +595,25 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
613 inode->i_fop = &udf_file_operations; 595 inode->i_fop = &udf_file_operations;
614 mark_inode_dirty(inode); 596 mark_inode_dirty(inode);
615 597
598 return udf_add_nondir(dentry, inode);
599}
600
601static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
602{
603 struct inode *inode = udf_new_inode(dir, mode);
604
605 if (IS_ERR(inode))
606 return PTR_ERR(inode);
607
608 if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
609 inode->i_data.a_ops = &udf_adinicb_aops;
610 else
611 inode->i_data.a_ops = &udf_aops;
612 inode->i_op = &udf_file_inode_operations;
613 inode->i_fop = &udf_file_operations;
614 mark_inode_dirty(inode);
616 d_tmpfile(dentry, inode); 615 d_tmpfile(dentry, inode);
616 unlock_new_inode(inode);
617 return 0; 617 return 0;
618} 618}
619 619
@@ -621,44 +621,16 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
621 dev_t rdev) 621 dev_t rdev)
622{ 622{
623 struct inode *inode; 623 struct inode *inode;
624 struct udf_fileident_bh fibh;
625 struct fileIdentDesc cfi, *fi;
626 int err;
627 struct udf_inode_info *iinfo;
628 624
629 if (!old_valid_dev(rdev)) 625 if (!old_valid_dev(rdev))
630 return -EINVAL; 626 return -EINVAL;
631 627
632 err = -EIO; 628 inode = udf_new_inode(dir, mode);
633 inode = udf_new_inode(dir, mode, &err); 629 if (IS_ERR(inode))
634 if (!inode) 630 return PTR_ERR(inode);
635 goto out;
636 631
637 iinfo = UDF_I(inode);
638 init_special_inode(inode, mode, rdev); 632 init_special_inode(inode, mode, rdev);
639 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 633 return udf_add_nondir(dentry, inode);
640 if (!fi) {
641 inode_dec_link_count(inode);
642 iput(inode);
643 return err;
644 }
645 cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
646 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
647 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
648 cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
649 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
650 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
651 mark_inode_dirty(dir);
652 mark_inode_dirty(inode);
653
654 if (fibh.sbh != fibh.ebh)
655 brelse(fibh.ebh);
656 brelse(fibh.sbh);
657 d_instantiate(dentry, inode);
658 err = 0;
659
660out:
661 return err;
662} 634}
663 635
664static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 636static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -670,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
670 struct udf_inode_info *dinfo = UDF_I(dir); 642 struct udf_inode_info *dinfo = UDF_I(dir);
671 struct udf_inode_info *iinfo; 643 struct udf_inode_info *iinfo;
672 644
673 err = -EIO; 645 inode = udf_new_inode(dir, S_IFDIR | mode);
674 inode = udf_new_inode(dir, S_IFDIR | mode, &err); 646 if (IS_ERR(inode))
675 if (!inode) 647 return PTR_ERR(inode);
676 goto out;
677 648
678 iinfo = UDF_I(inode); 649 iinfo = UDF_I(inode);
679 inode->i_op = &udf_dir_inode_operations; 650 inode->i_op = &udf_dir_inode_operations;
@@ -681,6 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
681 fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); 652 fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
682 if (!fi) { 653 if (!fi) {
683 inode_dec_link_count(inode); 654 inode_dec_link_count(inode);
655 unlock_new_inode(inode);
684 iput(inode); 656 iput(inode);
685 goto out; 657 goto out;
686 } 658 }
@@ -699,6 +671,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
699 if (!fi) { 671 if (!fi) {
700 clear_nlink(inode); 672 clear_nlink(inode);
701 mark_inode_dirty(inode); 673 mark_inode_dirty(inode);
674 unlock_new_inode(inode);
702 iput(inode); 675 iput(inode);
703 goto out; 676 goto out;
704 } 677 }
@@ -710,6 +683,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
710 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); 683 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
711 inc_nlink(dir); 684 inc_nlink(dir);
712 mark_inode_dirty(dir); 685 mark_inode_dirty(dir);
686 unlock_new_inode(inode);
713 d_instantiate(dentry, inode); 687 d_instantiate(dentry, inode);
714 if (fibh.sbh != fibh.ebh) 688 if (fibh.sbh != fibh.ebh)
715 brelse(fibh.ebh); 689 brelse(fibh.ebh);
@@ -876,14 +850,11 @@ out:
876static int udf_symlink(struct inode *dir, struct dentry *dentry, 850static int udf_symlink(struct inode *dir, struct dentry *dentry,
877 const char *symname) 851 const char *symname)
878{ 852{
879 struct inode *inode; 853 struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
880 struct pathComponent *pc; 854 struct pathComponent *pc;
881 const char *compstart; 855 const char *compstart;
882 struct udf_fileident_bh fibh;
883 struct extent_position epos = {}; 856 struct extent_position epos = {};
884 int eoffset, elen = 0; 857 int eoffset, elen = 0;
885 struct fileIdentDesc *fi;
886 struct fileIdentDesc cfi;
887 uint8_t *ea; 858 uint8_t *ea;
888 int err; 859 int err;
889 int block; 860 int block;
@@ -892,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
892 struct udf_inode_info *iinfo; 863 struct udf_inode_info *iinfo;
893 struct super_block *sb = dir->i_sb; 864 struct super_block *sb = dir->i_sb;
894 865
895 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 866 if (IS_ERR(inode))
896 if (!inode) 867 return PTR_ERR(inode);
897 goto out;
898 868
899 iinfo = UDF_I(inode); 869 iinfo = UDF_I(inode);
900 down_write(&iinfo->i_data_sem); 870 down_write(&iinfo->i_data_sem);
@@ -1012,24 +982,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1012 mark_inode_dirty(inode); 982 mark_inode_dirty(inode);
1013 up_write(&iinfo->i_data_sem); 983 up_write(&iinfo->i_data_sem);
1014 984
1015 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 985 err = udf_add_nondir(dentry, inode);
1016 if (!fi)
1017 goto out_no_entry;
1018 cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
1019 cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
1020 if (UDF_SB(inode->i_sb)->s_lvid_bh) {
1021 *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
1022 cpu_to_le32(lvid_get_unique_id(sb));
1023 }
1024 udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
1025 if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1026 mark_inode_dirty(dir);
1027 if (fibh.sbh != fibh.ebh)
1028 brelse(fibh.ebh);
1029 brelse(fibh.sbh);
1030 d_instantiate(dentry, inode);
1031 err = 0;
1032
1033out: 986out:
1034 kfree(name); 987 kfree(name);
1035 return err; 988 return err;
@@ -1037,6 +990,7 @@ out:
1037out_no_entry: 990out_no_entry:
1038 up_write(&iinfo->i_data_sem); 991 up_write(&iinfo->i_data_sem);
1039 inode_dec_link_count(inode); 992 inode_dec_link_count(inode);
993 unlock_new_inode(inode);
1040 iput(inode); 994 iput(inode);
1041 goto out; 995 goto out;
1042} 996}
@@ -1221,7 +1175,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
1221 struct udf_fileident_bh fibh; 1175 struct udf_fileident_bh fibh;
1222 1176
1223 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) 1177 if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
1224 goto out_unlock; 1178 return ERR_PTR(-EACCES);
1225 1179
1226 if (fibh.sbh != fibh.ebh) 1180 if (fibh.sbh != fibh.ebh)
1227 brelse(fibh.ebh); 1181 brelse(fibh.ebh);
@@ -1229,12 +1183,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
1229 1183
1230 tloc = lelb_to_cpu(cfi.icb.extLocation); 1184 tloc = lelb_to_cpu(cfi.icb.extLocation);
1231 inode = udf_iget(child->d_inode->i_sb, &tloc); 1185 inode = udf_iget(child->d_inode->i_sb, &tloc);
1232 if (!inode) 1186 if (IS_ERR(inode))
1233 goto out_unlock; 1187 return ERR_CAST(inode);
1234 1188
1235 return d_obtain_alias(inode); 1189 return d_obtain_alias(inode);
1236out_unlock:
1237 return ERR_PTR(-EACCES);
1238} 1190}
1239 1191
1240 1192
@@ -1251,8 +1203,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
1251 loc.partitionReferenceNum = partref; 1203 loc.partitionReferenceNum = partref;
1252 inode = udf_iget(sb, &loc); 1204 inode = udf_iget(sb, &loc);
1253 1205
1254 if (inode == NULL) 1206 if (IS_ERR(inode))
1255 return ERR_PTR(-ENOMEM); 1207 return ERR_CAST(inode);
1256 1208
1257 if (generation && inode->i_generation != generation) { 1209 if (generation && inode->i_generation != generation) {
1258 iput(inode); 1210 iput(inode);
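
All of the namei paths above now share one calling convention: udf_new_inode()
returns an ERR_PTR() on failure and a still-locked (I_NEW) inode on success,
so every exit path has to pair unlock_new_inode() with iput() or
d_instantiate(). A hedged sketch of that shape, with foo_add_dir_entry() as a
hypothetical stand-in for the directory-entry step:

	static int foo_create(struct inode *dir, struct dentry *dentry,
			      umode_t mode)
	{
		struct inode *inode = udf_new_inode(dir, mode);
		int err;

		if (IS_ERR(inode))
			return PTR_ERR(inode);

		/* ... set i_op/i_fop, mark_inode_dirty() ... */
		err = foo_add_dir_entry(dir, dentry, inode);
		if (err) {
			inode_dec_link_count(inode);
			unlock_new_inode(inode);	/* must precede iput() */
			iput(inode);
			return err;
		}
		unlock_new_inode(inode);
		d_instantiate(dentry, inode);
		return 0;
	}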
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3286db047a40..e229315bbf7a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -63,7 +63,7 @@
63#include "udf_i.h" 63#include "udf_i.h"
64 64
65#include <linux/init.h> 65#include <linux/init.h>
66#include <asm/uaccess.h> 66#include <linux/uaccess.h>
67 67
68#define VDS_POS_PRIMARY_VOL_DESC 0 68#define VDS_POS_PRIMARY_VOL_DESC 0
69#define VDS_POS_UNALLOC_SPACE_DESC 1 69#define VDS_POS_UNALLOC_SPACE_DESC 1
@@ -959,14 +959,16 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
959 addr.logicalBlockNum = meta_file_loc; 959 addr.logicalBlockNum = meta_file_loc;
960 addr.partitionReferenceNum = partition_num; 960 addr.partitionReferenceNum = partition_num;
961 961
962 metadata_fe = udf_iget(sb, &addr); 962 metadata_fe = udf_iget_special(sb, &addr);
963 963
964 if (metadata_fe == NULL) 964 if (IS_ERR(metadata_fe)) {
965 udf_warn(sb, "metadata inode efe not found\n"); 965 udf_warn(sb, "metadata inode efe not found\n");
966 else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { 966 return metadata_fe;
967 }
968 if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
967 udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); 969 udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
968 iput(metadata_fe); 970 iput(metadata_fe);
969 metadata_fe = NULL; 971 return ERR_PTR(-EIO);
970 } 972 }
971 973
972 return metadata_fe; 974 return metadata_fe;
@@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
978 struct udf_part_map *map; 980 struct udf_part_map *map;
979 struct udf_meta_data *mdata; 981 struct udf_meta_data *mdata;
980 struct kernel_lb_addr addr; 982 struct kernel_lb_addr addr;
983 struct inode *fe;
981 984
982 map = &sbi->s_partmaps[partition]; 985 map = &sbi->s_partmaps[partition];
983 mdata = &map->s_type_specific.s_metadata; 986 mdata = &map->s_type_specific.s_metadata;
@@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
986 udf_debug("Metadata file location: block = %d part = %d\n", 989 udf_debug("Metadata file location: block = %d part = %d\n",
987 mdata->s_meta_file_loc, map->s_partition_num); 990 mdata->s_meta_file_loc, map->s_partition_num);
988 991
989 mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, 992 fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
990 mdata->s_meta_file_loc, map->s_partition_num); 993 map->s_partition_num);
991 994 if (IS_ERR(fe)) {
992 if (mdata->s_metadata_fe == NULL) {
993 /* mirror file entry */ 995 /* mirror file entry */
994 udf_debug("Mirror metadata file location: block = %d part = %d\n", 996 udf_debug("Mirror metadata file location: block = %d part = %d\n",
995 mdata->s_mirror_file_loc, map->s_partition_num); 997 mdata->s_mirror_file_loc, map->s_partition_num);
996 998
997 mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, 999 fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
998 mdata->s_mirror_file_loc, map->s_partition_num); 1000 map->s_partition_num);
999 1001
1000 if (mdata->s_mirror_fe == NULL) { 1002 if (IS_ERR(fe)) {
1001 udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); 1003 udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
1002 return -EIO; 1004 return PTR_ERR(fe);
1003 } 1005 }
1004 } 1006 mdata->s_mirror_fe = fe;
1007 } else
1008 mdata->s_metadata_fe = fe;
1009
1005 1010
1006 /* 1011 /*
1007 * bitmap file entry 1012 * bitmap file entry
@@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
1015 udf_debug("Bitmap file location: block = %d part = %d\n", 1020 udf_debug("Bitmap file location: block = %d part = %d\n",
1016 addr.logicalBlockNum, addr.partitionReferenceNum); 1021 addr.logicalBlockNum, addr.partitionReferenceNum);
1017 1022
1018 mdata->s_bitmap_fe = udf_iget(sb, &addr); 1023 fe = udf_iget_special(sb, &addr);
1019 if (mdata->s_bitmap_fe == NULL) { 1024 if (IS_ERR(fe)) {
1020 if (sb->s_flags & MS_RDONLY) 1025 if (sb->s_flags & MS_RDONLY)
1021 udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); 1026 udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
1022 else { 1027 else {
1023 udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); 1028 udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
1024 return -EIO; 1029 return PTR_ERR(fe);
1025 } 1030 }
1026 } 1031 } else
1032 mdata->s_bitmap_fe = fe;
1027 } 1033 }
1028 1034
1029 udf_debug("udf_load_metadata_files Ok\n"); 1035 udf_debug("udf_load_metadata_files Ok\n");
@@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1111 phd->unallocSpaceTable.extPosition), 1117 phd->unallocSpaceTable.extPosition),
1112 .partitionReferenceNum = p_index, 1118 .partitionReferenceNum = p_index,
1113 }; 1119 };
1120 struct inode *inode;
1114 1121
1115 map->s_uspace.s_table = udf_iget(sb, &loc); 1122 inode = udf_iget_special(sb, &loc);
1116 if (!map->s_uspace.s_table) { 1123 if (IS_ERR(inode)) {
1117 udf_debug("cannot load unallocSpaceTable (part %d)\n", 1124 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1118 p_index); 1125 p_index);
1119 return -EIO; 1126 return PTR_ERR(inode);
1120 } 1127 }
1128 map->s_uspace.s_table = inode;
1121 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; 1129 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
1122 udf_debug("unallocSpaceTable (part %d) @ %ld\n", 1130 udf_debug("unallocSpaceTable (part %d) @ %ld\n",
1123 p_index, map->s_uspace.s_table->i_ino); 1131 p_index, map->s_uspace.s_table->i_ino);
@@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
1144 phd->freedSpaceTable.extPosition), 1152 phd->freedSpaceTable.extPosition),
1145 .partitionReferenceNum = p_index, 1153 .partitionReferenceNum = p_index,
1146 }; 1154 };
1155 struct inode *inode;
1147 1156
1148 map->s_fspace.s_table = udf_iget(sb, &loc); 1157 inode = udf_iget_special(sb, &loc);
1149 if (!map->s_fspace.s_table) { 1158 if (IS_ERR(inode)) {
1150 udf_debug("cannot load freedSpaceTable (part %d)\n", 1159 udf_debug("cannot load freedSpaceTable (part %d)\n",
1151 p_index); 1160 p_index);
1152 return -EIO; 1161 return PTR_ERR(inode);
1153 } 1162 }
1154 1163 map->s_fspace.s_table = inode;
1155 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; 1164 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
1156 udf_debug("freedSpaceTable (part %d) @ %ld\n", 1165 udf_debug("freedSpaceTable (part %d) @ %ld\n",
1157 p_index, map->s_fspace.s_table->i_ino); 1166 p_index, map->s_fspace.s_table->i_ino);
@@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
1178 struct udf_part_map *map = &sbi->s_partmaps[p_index]; 1187 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1179 sector_t vat_block; 1188 sector_t vat_block;
1180 struct kernel_lb_addr ino; 1189 struct kernel_lb_addr ino;
1190 struct inode *inode;
1181 1191
1182 /* 1192 /*
1183 * VAT file entry is in the last recorded block. Some broken disks have 1193 * VAT file entry is in the last recorded block. Some broken disks have
@@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
1186 ino.partitionReferenceNum = type1_index; 1196 ino.partitionReferenceNum = type1_index;
1187 for (vat_block = start_block; 1197 for (vat_block = start_block;
1188 vat_block >= map->s_partition_root && 1198 vat_block >= map->s_partition_root &&
1189 vat_block >= start_block - 3 && 1199 vat_block >= start_block - 3; vat_block--) {
1190 !sbi->s_vat_inode; vat_block--) {
1191 ino.logicalBlockNum = vat_block - map->s_partition_root; 1200 ino.logicalBlockNum = vat_block - map->s_partition_root;
1192 sbi->s_vat_inode = udf_iget(sb, &ino); 1201 inode = udf_iget_special(sb, &ino);
1202 if (!IS_ERR(inode)) {
1203 sbi->s_vat_inode = inode;
1204 break;
1205 }
1193 } 1206 }
1194} 1207}
1195 1208
@@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
2205 /* assign inodes by physical block number */ 2218 /* assign inodes by physical block number */
2206 /* perhaps it's not extensible enough, but for now ... */ 2219 /* perhaps it's not extensible enough, but for now ... */
2207 inode = udf_iget(sb, &rootdir); 2220 inode = udf_iget(sb, &rootdir);
2208 if (!inode) { 2221 if (IS_ERR(inode)) {
2209 udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", 2222 udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
2210 rootdir.logicalBlockNum, rootdir.partitionReferenceNum); 2223 rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
2211 ret = -EIO; 2224 ret = PTR_ERR(inode);
2212 goto error_out; 2225 goto error_out;
2213 } 2226 }
2214 2227
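
A pattern worth noting in the super.c hunks above: the ERR_PTR() from
udf_iget_special() is kept in a local and only stored into long-lived
superblock state once it is known to be a valid pointer, and the real errno is
propagated instead of a blanket -EIO. A sketch of that shape (the function
name and the *slot argument are illustrative):

	static int foo_load_system_inode(struct super_block *sb,
					 struct kernel_lb_addr *addr,
					 struct inode **slot)
	{
		struct inode *fe = udf_iget_special(sb, addr);

		if (IS_ERR(fe))
			return PTR_ERR(fe);	/* real errno, not -EIO */
		*slot = fe;	/* publish only a known-good pointer */
		return 0;
	}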
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index d7c6dbe4194b..6fb7945c1e6e 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23#include <asm/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/time.h> 26#include <linux/time.h>
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index be7dabbbcb49..1cc3c993ebd0 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -138,12 +138,22 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
138/* file.c */ 138/* file.c */
139extern long udf_ioctl(struct file *, unsigned int, unsigned long); 139extern long udf_ioctl(struct file *, unsigned int, unsigned long);
140/* inode.c */ 140/* inode.c */
141extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 141extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *,
142 bool hidden_inode);
143static inline struct inode *udf_iget_special(struct super_block *sb,
144 struct kernel_lb_addr *ino)
145{
146 return __udf_iget(sb, ino, true);
147}
148static inline struct inode *udf_iget(struct super_block *sb,
149 struct kernel_lb_addr *ino)
150{
151 return __udf_iget(sb, ino, false);
152}
142extern int udf_expand_file_adinicb(struct inode *); 153extern int udf_expand_file_adinicb(struct inode *);
143extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 154extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
144extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 155extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
145extern int udf_setsize(struct inode *, loff_t); 156extern int udf_setsize(struct inode *, loff_t);
146extern void udf_read_inode(struct inode *);
147extern void udf_evict_inode(struct inode *); 157extern void udf_evict_inode(struct inode *);
148extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 158extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
149extern long udf_block_map(struct inode *, sector_t); 159extern long udf_block_map(struct inode *, sector_t);
@@ -209,7 +219,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
209 219
210/* ialloc.c */ 220/* ialloc.c */
211extern void udf_free_inode(struct inode *); 221extern void udf_free_inode(struct inode *);
212extern struct inode *udf_new_inode(struct inode *, umode_t, int *); 222extern struct inode *udf_new_inode(struct inode *, umode_t);
213 223
214/* truncate.c */ 224/* truncate.c */
215extern void udf_truncate_tail_extent(struct inode *); 225extern void udf_truncate_tail_extent(struct inode *);
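
The header change above is the usual "one worker plus thin inline wrappers"
idiom for a boolean behaviour flag: callers write udf_iget() or
udf_iget_special() and a bare true/false never appears at call sites. In
generic form (names hypothetical):

	extern struct inode *__foo_iget(struct super_block *sb,
					unsigned long ino, bool hidden);

	static inline struct inode *foo_iget(struct super_block *sb,
					     unsigned long ino)
	{
		return __foo_iget(sb, ino, false);
	}

	static inline struct inode *foo_iget_hidden(struct super_block *sb,
						    unsigned long ino)
	{
		return __foo_iget(sb, ino, true);
	}

In this diff, the "special" flavour is used for filesystem-internal inodes
(the metadata, bitmap and VAT file entries in super.c), the plain flavour for
inodes reachable through the directory tree.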
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index 1f11483eba6a..77c331f1a770 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -81,8 +81,6 @@ static time_t year_seconds[MAX_YEAR_SECONDS] = {
81/*2038*/ SPY(68, 17, 0) 81/*2038*/ SPY(68, 17, 0)
82}; 82};
83 83
84extern struct timezone sys_tz;
85
86#define SECS_PER_HOUR (60 * 60) 84#define SECS_PER_HOUR (60 * 60)
87#define SECS_PER_DAY (SECS_PER_HOUR * 24) 85#define SECS_PER_DAY (SECS_PER_HOUR * 24)
88 86
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 44b815e57f94..afd470e588ff 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -412,7 +412,6 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
412 int extIndex = 0, newExtIndex = 0, hasExt = 0; 412 int extIndex = 0, newExtIndex = 0, hasExt = 0;
413 unsigned short valueCRC; 413 unsigned short valueCRC;
414 uint8_t curr; 414 uint8_t curr;
415 const uint8_t hexChar[] = "0123456789ABCDEF";
416 415
417 if (udfName[0] == '.' && 416 if (udfName[0] == '.' &&
418 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) { 417 (udfLen == 1 || (udfLen == 2 && udfName[1] == '.'))) {
@@ -477,10 +476,10 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
477 newIndex = 250; 476 newIndex = 250;
478 newName[newIndex++] = CRC_MARK; 477 newName[newIndex++] = CRC_MARK;
479 valueCRC = crc_itu_t(0, fidName, fidNameLen); 478 valueCRC = crc_itu_t(0, fidName, fidNameLen);
480 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; 479 newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
481 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; 480 newName[newIndex++] = hex_asc_upper_lo(valueCRC >> 8);
482 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; 481 newName[newIndex++] = hex_asc_upper_hi(valueCRC);
483 newName[newIndex++] = hexChar[(valueCRC & 0x000f)]; 482 newName[newIndex++] = hex_asc_upper_lo(valueCRC);
484 483
485 if (hasExt) { 484 if (hasExt) {
486 newName[newIndex++] = EXT_MARK; 485 newName[newIndex++] = EXT_MARK;
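
The unicode.c hunk drops the local hexChar[] table in favour of the kernel's
shared hex_asc_upper_hi()/hex_asc_upper_lo() helpers (historically defined in
<linux/kernel.h>), each of which takes a byte and returns one uppercase hex
digit for the high or low nibble. Worked example for a 16-bit CRC:

	u16 crc = 0xBEEF;
	char buf[4];

	buf[0] = hex_asc_upper_hi(crc >> 8);	/* 'B' */
	buf[1] = hex_asc_upper_lo(crc >> 8);	/* 'E' */
	buf[2] = hex_asc_upper_hi(crc);		/* 'E' */
	buf[3] = hex_asc_upper_lo(crc);		/* 'F' */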
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index dd39980437fc..4d0e02b022b3 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_UFS_FS) += ufs.o
6 6
7ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ 7ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
8 namei.o super.o symlink.o truncate.o util.o 8 namei.o super.o symlink.o truncate.o util.o
9ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG
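
The Makefile line above defines DEBUG for every object in fs/ufs/ when
CONFIG_UFS_DEBUG is set. That matters for the printk-to-pr_debug conversions
later in this diff: with -DDEBUG, pr_debug() expands to a real
printk(KERN_DEBUG ...); without it (and without CONFIG_DYNAMIC_DEBUG) the call
compiles away. Illustrative call, with hypothetical arguments:

	pr_debug("cg %u: %u free blocks\n", cgno, nbfree);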
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 7bc20809c99e..2c1036080d52 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -784,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
784 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe 784 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe
785 }; 785 };
786 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 786 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
787 struct ufs_cylinder_group *ucg;
788 unsigned start, length, loc; 787 unsigned start, length, loc;
789 unsigned pos, want, blockmap, mask, end; 788 unsigned pos, want, blockmap, mask, end;
790 u64 result; 789 u64 result;
@@ -792,8 +791,6 @@ static u64 ufs_bitmap_search(struct super_block *sb,
792 UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, 791 UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx,
793 (unsigned long long)goal, count); 792 (unsigned long long)goal, count);
794 793
795 ucg = ubh_get_ucg(UCPI_UBH(ucpi));
796
797 if (goal) 794 if (goal)
798 start = ufs_dtogd(uspi, goal) >> 3; 795 start = ufs_dtogd(uspi, goal) >> 3;
799 else 796 else
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index a9cc75ffa925..7caa01652888 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -298,7 +298,10 @@ cg_found:
298 ufsi->i_oeftflag = 0; 298 ufsi->i_oeftflag = 0;
299 ufsi->i_dir_start_lookup = 0; 299 ufsi->i_dir_start_lookup = 0;
300 memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1)); 300 memset(&ufsi->i_u1, 0, sizeof(ufsi->i_u1));
301 insert_inode_hash(inode); 301 if (insert_inode_locked(inode) < 0) {
302 err = -EIO;
303 goto failed;
304 }
302 mark_inode_dirty(inode); 305 mark_inode_dirty(inode);
303 306
304 if (uspi->fs_magic == UFS2_MAGIC) { 307 if (uspi->fs_magic == UFS2_MAGIC) {
@@ -337,6 +340,7 @@ cg_found:
337fail_remove_inode: 340fail_remove_inode:
338 unlock_ufs(sb); 341 unlock_ufs(sb);
339 clear_nlink(inode); 342 clear_nlink(inode);
343 unlock_new_inode(inode);
340 iput(inode); 344 iput(inode);
341 UFSD("EXIT (FAILED): err %d\n", err); 345 UFSD("EXIT (FAILED): err %d\n", err);
342 return ERR_PTR(err); 346 return ERR_PTR(err);
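
Switching from insert_inode_hash() to insert_inode_locked() hashes the new
inode while it is still marked I_NEW, so a concurrent iget() (an NFS
file-handle lookup, say) waits for initialisation to finish instead of seeing
a half-built inode; the cost is that failure paths must unlock_new_inode()
before the final iput(), as the fail_remove_inode hunk adds. Sketch of the
success-path pairing (foo-fs names are illustrative):

	inode->i_ino = ino;
	if (insert_inode_locked(inode) < 0) {
		iput(inode);		/* never exposed; just drop it */
		return ERR_PTR(-EIO);
	}
	/* ... write the initial on-disk state ... */
	unlock_new_inode(inode);	/* publish the fully set-up inode */
	return inode;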
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 61e8a9b021dd..be7d42c7d938 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -158,16 +158,16 @@ out:
158 158
159/** 159/**
160 * ufs_inode_getfrag() - allocate new fragment(s) 160 * ufs_inode_getfrag() - allocate new fragment(s)
161 * @inode - pointer to inode 161 * @inode: pointer to inode
162 * @fragment - number of `fragment' which hold pointer 162 * @fragment: number of `fragment' which hold pointer
163 * to new allocated fragment(s) 163 * to new allocated fragment(s)
164 * @new_fragment - number of new allocated fragment(s) 164 * @new_fragment: number of new allocated fragment(s)
165 * @required - how many fragment(s) we require 165 * @required: how many fragment(s) we require
166 * @err - we set it if something wrong 166 * @err: we set it if something wrong
167 * @phys - pointer to where we save physical number of new allocated fragments, 167 * @phys: pointer to where we save physical number of new allocated fragments,
168 * NULL if we allocate not data(indirect blocks for example). 168 * NULL if we allocate not data(indirect blocks for example).
169 * @new - we set it if we allocate new block 169 * @new: we set it if we allocate new block
170 * @locked_page - for ufs_new_fragments() 170 * @locked_page: for ufs_new_fragments()
171 */ 171 */
172static struct buffer_head * 172static struct buffer_head *
173ufs_inode_getfrag(struct inode *inode, u64 fragment, 173ufs_inode_getfrag(struct inode *inode, u64 fragment,
@@ -315,16 +315,16 @@ repeat2:
315 315
316/** 316/**
317 * ufs_inode_getblock() - allocate new block 317 * ufs_inode_getblock() - allocate new block
318 * @inode - pointer to inode 318 * @inode: pointer to inode
319 * @bh - pointer to block which hold "pointer" to new allocated block 319 * @bh: pointer to block which hold "pointer" to new allocated block
320 * @fragment - number of `fragment' which hold pointer 320 * @fragment: number of `fragment' which hold pointer
321 * to new allocated block 321 * to new allocated block
322 * @new_fragment - number of new allocated fragment 322 * @new_fragment: number of new allocated fragment
323 * (block will hold this fragment and also uspi->s_fpb-1) 323 * (block will hold this fragment and also uspi->s_fpb-1)
324 * @err - see ufs_inode_getfrag() 324 * @err: see ufs_inode_getfrag()
325 * @phys - see ufs_inode_getfrag() 325 * @phys: see ufs_inode_getfrag()
326 * @new - see ufs_inode_getfrag() 326 * @new: see ufs_inode_getfrag()
327 * @locked_page - see ufs_inode_getfrag() 327 * @locked_page: see ufs_inode_getfrag()
328 */ 328 */
329static struct buffer_head * 329static struct buffer_head *
330ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, 330ufs_inode_getblock(struct inode *inode, struct buffer_head *bh,
@@ -902,9 +902,6 @@ void ufs_evict_inode(struct inode * inode)
902 invalidate_inode_buffers(inode); 902 invalidate_inode_buffers(inode);
903 clear_inode(inode); 903 clear_inode(inode);
904 904
905 if (want_delete) { 905 if (want_delete)
906 lock_ufs(inode->i_sb); 906 ufs_free_inode(inode);
907 ufs_free_inode (inode);
908 unlock_ufs(inode->i_sb);
909 }
910} 907}
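
The ufs_evict_inode() hunk drops the lock_ufs() pair around ufs_free_inode(),
consistent with ufs_free_inode() taking the lock itself (see the namei.c
reordering below). The comment hunks above fix the kernel-doc parameter
syntax: scripts/kernel-doc parses "@name: description", not
"@name - description". Minimal well-formed example (contents illustrative):

	/**
	 * foo_getfrag() - allocate new fragment(s)
	 * @inode: pointer to inode
	 * @err: set to a negative errno if something goes wrong
	 *
	 * Return: buffer head for the new fragment, or NULL on failure.
	 */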
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 90d74b8f8eba..fd65deb4b5f0 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -38,10 +38,12 @@ static inline int ufs_add_nondir(struct dentry *dentry, struct inode *inode)
38{ 38{
39 int err = ufs_add_link(dentry, inode); 39 int err = ufs_add_link(dentry, inode);
40 if (!err) { 40 if (!err) {
41 unlock_new_inode(inode);
41 d_instantiate(dentry, inode); 42 d_instantiate(dentry, inode);
42 return 0; 43 return 0;
43 } 44 }
44 inode_dec_link_count(inode); 45 inode_dec_link_count(inode);
46 unlock_new_inode(inode);
45 iput(inode); 47 iput(inode);
46 return err; 48 return err;
47} 49}
@@ -126,12 +128,12 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
126 if (l > sb->s_blocksize) 128 if (l > sb->s_blocksize)
127 goto out_notlocked; 129 goto out_notlocked;
128 130
129 lock_ufs(dir->i_sb);
130 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 131 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
131 err = PTR_ERR(inode); 132 err = PTR_ERR(inode);
132 if (IS_ERR(inode)) 133 if (IS_ERR(inode))
133 goto out; 134 goto out_notlocked;
134 135
136 lock_ufs(dir->i_sb);
135 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { 137 if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
136 /* slow symlink */ 138 /* slow symlink */
137 inode->i_op = &ufs_symlink_inode_operations; 139 inode->i_op = &ufs_symlink_inode_operations;
@@ -155,6 +157,7 @@ out_notlocked:
155 157
156out_fail: 158out_fail:
157 inode_dec_link_count(inode); 159 inode_dec_link_count(inode);
160 unlock_new_inode(inode);
158 iput(inode); 161 iput(inode);
159 goto out; 162 goto out;
160} 163}
@@ -181,13 +184,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
181 struct inode * inode; 184 struct inode * inode;
182 int err; 185 int err;
183 186
184 lock_ufs(dir->i_sb);
185 inode_inc_link_count(dir);
186
187 inode = ufs_new_inode(dir, S_IFDIR|mode); 187 inode = ufs_new_inode(dir, S_IFDIR|mode);
188 err = PTR_ERR(inode);
189 if (IS_ERR(inode)) 188 if (IS_ERR(inode))
190 goto out_dir; 189 return PTR_ERR(inode);
191 190
192 inode->i_op = &ufs_dir_inode_operations; 191 inode->i_op = &ufs_dir_inode_operations;
193 inode->i_fop = &ufs_dir_operations; 192 inode->i_fop = &ufs_dir_operations;
@@ -195,6 +194,9 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode)
195 194
196 inode_inc_link_count(inode); 195 inode_inc_link_count(inode);
197 196
197 lock_ufs(dir->i_sb);
198 inode_inc_link_count(dir);
199
198 err = ufs_make_empty(inode, dir); 200 err = ufs_make_empty(inode, dir);
199 if (err) 201 if (err)
200 goto out_fail; 202 goto out_fail;
@@ -211,8 +213,8 @@ out:
211out_fail: 213out_fail:
212 inode_dec_link_count(inode); 214 inode_dec_link_count(inode);
213 inode_dec_link_count(inode); 215 inode_dec_link_count(inode);
216 unlock_new_inode(inode);
214 iput (inode); 217 iput (inode);
215out_dir:
216 inode_dec_link_count(dir); 218 inode_dec_link_count(dir);
217 unlock_ufs(dir->i_sb); 219 unlock_ufs(dir->i_sb);
218 goto out; 220 goto out;
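
In ufs_symlink() and ufs_mkdir() above, ufs_new_inode() is now called before
lock_ufs() rather than under it. The pattern (and the matching
ufs_evict_inode() change earlier) suggests ufs_new_inode()/ufs_free_inode()
take lock_ufs() internally, so calling them with the lock already held would
self-deadlock on SMP. Ordering sketch:

	struct inode *inode = ufs_new_inode(dir, S_IFDIR | mode);

	if (IS_ERR(inode))			/* allocator locks internally */
		return PTR_ERR(inode);

	lock_ufs(dir->i_sb);		/* only now is the lock safe to take */
	/* ... link counts and directory manipulation under the lock ... */
	unlock_ufs(dir->i_sb);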
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index b879f1ba3439..da73801301d5 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -65,7 +65,6 @@
65 * Evgeniy Dushistov <dushistov@mail.ru>, 2007 65 * Evgeniy Dushistov <dushistov@mail.ru>, 2007
66 */ 66 */
67 67
68
69#include <linux/exportfs.h> 68#include <linux/exportfs.h>
70#include <linux/module.h> 69#include <linux/module.h>
71#include <linux/bitops.h> 70#include <linux/bitops.h>
@@ -172,73 +171,73 @@ static void ufs_print_super_stuff(struct super_block *sb,
172{ 171{
173 u32 magic = fs32_to_cpu(sb, usb3->fs_magic); 172 u32 magic = fs32_to_cpu(sb, usb3->fs_magic);
174 173
175 printk("ufs_print_super_stuff\n"); 174 pr_debug("ufs_print_super_stuff\n");
176 printk(" magic: 0x%x\n", magic); 175 pr_debug(" magic: 0x%x\n", magic);
177 if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { 176 if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) {
178 printk(" fs_size: %llu\n", (unsigned long long) 177 pr_debug(" fs_size: %llu\n", (unsigned long long)
179 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); 178 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size));
180 printk(" fs_dsize: %llu\n", (unsigned long long) 179 pr_debug(" fs_dsize: %llu\n", (unsigned long long)
181 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); 180 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize));
182 printk(" bsize: %u\n", 181 pr_debug(" bsize: %u\n",
183 fs32_to_cpu(sb, usb1->fs_bsize)); 182 fs32_to_cpu(sb, usb1->fs_bsize));
184 printk(" fsize: %u\n", 183 pr_debug(" fsize: %u\n",
185 fs32_to_cpu(sb, usb1->fs_fsize)); 184 fs32_to_cpu(sb, usb1->fs_fsize));
186 printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); 185 pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname);
187 printk(" fs_sblockloc: %llu\n", (unsigned long long) 186 pr_debug(" fs_sblockloc: %llu\n", (unsigned long long)
188 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); 187 fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc));
189 printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) 188 pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long)
190 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); 189 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir));
191 printk(" cs_nbfree(No of free blocks): %llu\n", 190 pr_debug(" cs_nbfree(No of free blocks): %llu\n",
192 (unsigned long long) 191 (unsigned long long)
193 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); 192 fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree));
194 printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", 193 pr_info(" cs_nifree(Num of free inodes): %llu\n",
195 (unsigned long long) 194 (unsigned long long)
196 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); 195 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree));
197 printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", 196 pr_info(" cs_nffree(Num of free frags): %llu\n",
198 (unsigned long long) 197 (unsigned long long)
199 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); 198 fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree));
200 printk(KERN_INFO" fs_maxsymlinklen: %u\n", 199 pr_info(" fs_maxsymlinklen: %u\n",
201 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); 200 fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen));
202 } else { 201 } else {
203 printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); 202 pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno));
204 printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); 203 pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno));
205 printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); 204 pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno));
206 printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); 205 pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno));
207 printk(" cgoffset: %u\n", 206 pr_debug(" cgoffset: %u\n",
208 fs32_to_cpu(sb, usb1->fs_cgoffset)); 207 fs32_to_cpu(sb, usb1->fs_cgoffset));
209 printk(" ~cgmask: 0x%x\n", 208 pr_debug(" ~cgmask: 0x%x\n",
210 ~fs32_to_cpu(sb, usb1->fs_cgmask)); 209 ~fs32_to_cpu(sb, usb1->fs_cgmask));
211 printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); 210 pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size));
212 printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); 211 pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize));
213 printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); 212 pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg));
214 printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); 213 pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize));
215 printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); 214 pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize));
216 printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); 215 pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag));
217 printk(" fragshift: %u\n", 216 pr_debug(" fragshift: %u\n",
218 fs32_to_cpu(sb, usb1->fs_fragshift)); 217 fs32_to_cpu(sb, usb1->fs_fragshift));
219 printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); 218 pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask));
220 printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); 219 pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift));
221 printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); 220 pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize));
222 printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); 221 pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc));
223 printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); 222 pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg));
224 printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); 223 pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg));
225 printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); 224 pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg));
226 printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); 225 pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr));
227 printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); 226 pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize));
228 printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); 227 pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize));
229 printk(" fstodb: %u\n", 228 pr_debug(" fstodb: %u\n",
230 fs32_to_cpu(sb, usb1->fs_fsbtodb)); 229 fs32_to_cpu(sb, usb1->fs_fsbtodb));
231 printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); 230 pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos));
232 printk(" ndir %u\n", 231 pr_debug(" ndir %u\n",
233 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); 232 fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir));
234 printk(" nifree %u\n", 233 pr_debug(" nifree %u\n",
235 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); 234 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree));
236 printk(" nbfree %u\n", 235 pr_debug(" nbfree %u\n",
237 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); 236 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree));
238 printk(" nffree %u\n", 237 pr_debug(" nffree %u\n",
239 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); 238 fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree));
240 } 239 }
241 printk("\n"); 240 pr_debug("\n");
242} 241}
243 242
244/* 243/*
@@ -247,38 +246,38 @@ static void ufs_print_super_stuff(struct super_block *sb,
247static void ufs_print_cylinder_stuff(struct super_block *sb, 246static void ufs_print_cylinder_stuff(struct super_block *sb,
248 struct ufs_cylinder_group *cg) 247 struct ufs_cylinder_group *cg)
249{ 248{
250 printk("\nufs_print_cylinder_stuff\n"); 249 pr_debug("\nufs_print_cylinder_stuff\n");
251 printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); 250 pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group));
252 printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); 251 pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic));
253 printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); 252 pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time));
254 printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); 253 pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx));
255 printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); 254 pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl));
256 printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); 255 pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk));
257 printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); 256 pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk));
258 printk(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); 257 pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir));
259 printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); 258 pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree));
260 printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); 259 pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree));
261 printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); 260 pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree));
262 printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); 261 pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor));
263 printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); 262 pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor));
264 printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); 263 pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor));
265 printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", 264 pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n",
266 fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), 265 fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]),
267 fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), 266 fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]),
268 fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), 267 fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]),
269 fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); 268 fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7]));
270 printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); 269 pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff));
271 printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); 270 pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff));
272 printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); 271 pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff));
273 printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); 272 pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff));
274 printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); 273 pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff));
275 printk(" clustersumoff %u\n", 274 pr_debug(" clustersumoff %u\n",
276 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); 275 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff));
277 printk(" clusteroff %u\n", 276 pr_debug(" clusteroff %u\n",
278 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); 277 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff));
279 printk(" nclusterblks %u\n", 278 pr_debug(" nclusterblks %u\n",
280 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); 279 fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks));
281 printk("\n"); 280 pr_debug("\n");
282} 281}
283#else 282#else
284# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ 283# define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/
@@ -287,13 +286,12 @@ static void ufs_print_cylinder_stuff(struct super_block *sb,
287 286
288static const struct super_operations ufs_super_ops; 287static const struct super_operations ufs_super_ops;
289 288
290static char error_buf[1024];
291
292void ufs_error (struct super_block * sb, const char * function, 289void ufs_error (struct super_block * sb, const char * function,
293 const char * fmt, ...) 290 const char * fmt, ...)
294{ 291{
295 struct ufs_sb_private_info * uspi; 292 struct ufs_sb_private_info * uspi;
296 struct ufs_super_block_first * usb1; 293 struct ufs_super_block_first * usb1;
294 struct va_format vaf;
297 va_list args; 295 va_list args;
298 296
299 uspi = UFS_SB(sb)->s_uspi; 297 uspi = UFS_SB(sb)->s_uspi;
@@ -305,20 +303,21 @@ void ufs_error (struct super_block * sb, const char * function,
305 ufs_mark_sb_dirty(sb); 303 ufs_mark_sb_dirty(sb);
306 sb->s_flags |= MS_RDONLY; 304 sb->s_flags |= MS_RDONLY;
307 } 305 }
308 va_start (args, fmt); 306 va_start(args, fmt);
309 vsnprintf (error_buf, sizeof(error_buf), fmt, args); 307 vaf.fmt = fmt;
310 va_end (args); 308 vaf.va = &args;
311 switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { 309 switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) {
312 case UFS_MOUNT_ONERROR_PANIC: 310 case UFS_MOUNT_ONERROR_PANIC:
313 panic ("UFS-fs panic (device %s): %s: %s\n", 311 panic("panic (device %s): %s: %pV\n",
314 sb->s_id, function, error_buf); 312 sb->s_id, function, &vaf);
315 313
316 case UFS_MOUNT_ONERROR_LOCK: 314 case UFS_MOUNT_ONERROR_LOCK:
317 case UFS_MOUNT_ONERROR_UMOUNT: 315 case UFS_MOUNT_ONERROR_UMOUNT:
318 case UFS_MOUNT_ONERROR_REPAIR: 316 case UFS_MOUNT_ONERROR_REPAIR:
319 printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", 317 pr_crit("error (device %s): %s: %pV\n",
320 sb->s_id, function, error_buf); 318 sb->s_id, function, &vaf);
321 } 319 }
320 va_end(args);
322} 321}
323 322
324void ufs_panic (struct super_block * sb, const char * function, 323void ufs_panic (struct super_block * sb, const char * function,
@@ -326,6 +325,7 @@ void ufs_panic (struct super_block * sb, const char * function,
326{ 325{
327 struct ufs_sb_private_info * uspi; 326 struct ufs_sb_private_info * uspi;
328 struct ufs_super_block_first * usb1; 327 struct ufs_super_block_first * usb1;
328 struct va_format vaf;
329 va_list args; 329 va_list args;
330 330
331 uspi = UFS_SB(sb)->s_uspi; 331 uspi = UFS_SB(sb)->s_uspi;
@@ -336,24 +336,27 @@ void ufs_panic (struct super_block * sb, const char * function,
336 ubh_mark_buffer_dirty(USPI_UBH(uspi)); 336 ubh_mark_buffer_dirty(USPI_UBH(uspi));
337 ufs_mark_sb_dirty(sb); 337 ufs_mark_sb_dirty(sb);
338 } 338 }
339 va_start (args, fmt); 339 va_start(args, fmt);
340 vsnprintf (error_buf, sizeof(error_buf), fmt, args); 340 vaf.fmt = fmt;
341 va_end (args); 341 vaf.va = &args;
342 sb->s_flags |= MS_RDONLY; 342 sb->s_flags |= MS_RDONLY;
343 printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", 343 pr_crit("panic (device %s): %s: %pV\n",
344 sb->s_id, function, error_buf); 344 sb->s_id, function, &vaf);
345 va_end(args);
345} 346}
346 347
347void ufs_warning (struct super_block * sb, const char * function, 348void ufs_warning (struct super_block * sb, const char * function,
348 const char * fmt, ...) 349 const char * fmt, ...)
349{ 350{
351 struct va_format vaf;
350 va_list args; 352 va_list args;
351 353
352 va_start (args, fmt); 354 va_start(args, fmt);
353 vsnprintf (error_buf, sizeof(error_buf), fmt, args); 355 vaf.fmt = fmt;
354 va_end (args); 356 vaf.va = &args;
355 printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", 357 pr_warn("(device %s): %s: %pV\n",
356 sb->s_id, function, error_buf); 358 sb->s_id, function, &vaf);
359 va_end(args);
357} 360}
358 361
359enum { 362enum {
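
The three helpers above trade a shared static error_buf (a race between
concurrent reporters) for struct va_format and printk's %pV extension, which
formats the varargs in a single pass with no intermediate buffer. Note that
va_end() has to move after the last printk that consumes vaf.va. The pattern,
as used in ufs_warning():

	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;
	pr_warn("(device %s): %s: %pV\n", sb->s_id, function, &vaf);
	va_end(args);	/* only after the last use of &vaf */

The "UFS-fs" prefix disappearing from the strings suggests a pr_fmt()
definition added elsewhere in this series supplies it.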
@@ -464,14 +467,12 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
464 ufs_set_opt (*mount_options, ONERROR_UMOUNT); 467 ufs_set_opt (*mount_options, ONERROR_UMOUNT);
465 break; 468 break;
466 case Opt_onerror_repair: 469 case Opt_onerror_repair:
467 printk("UFS-fs: Unable to do repair on error, " 470 pr_err("Unable to do repair on error, will lock lock instead\n");
468 "will lock lock instead\n");
469 ufs_clear_opt (*mount_options, ONERROR); 471 ufs_clear_opt (*mount_options, ONERROR);
470 ufs_set_opt (*mount_options, ONERROR_REPAIR); 472 ufs_set_opt (*mount_options, ONERROR_REPAIR);
471 break; 473 break;
472 default: 474 default:
473 printk("UFS-fs: Invalid option: \"%s\" " 475 pr_err("Invalid option: \"%s\" or missing value\n", p);
474 "or missing value\n", p);
475 return 0; 476 return 0;
476 } 477 }
477 } 478 }
@@ -788,8 +789,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
788 789
789#ifndef CONFIG_UFS_FS_WRITE 790#ifndef CONFIG_UFS_FS_WRITE
790 if (!(sb->s_flags & MS_RDONLY)) { 791 if (!(sb->s_flags & MS_RDONLY)) {
791 printk("ufs was compiled with read-only support, " 792 pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
792 "can't be mounted as read-write\n");
793 return -EROFS; 793 return -EROFS;
794 } 794 }
795#endif 795#endif
@@ -812,12 +812,12 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
812 sbi->s_mount_opt = 0; 812 sbi->s_mount_opt = 0;
813 ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); 813 ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK);
814 if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { 814 if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) {
815 printk("wrong mount options\n"); 815 pr_err("wrong mount options\n");
816 goto failed; 816 goto failed;
817 } 817 }
818 if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { 818 if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) {
819 if (!silent) 819 if (!silent)
820 printk("You didn't specify the type of your ufs filesystem\n\n" 820 pr_err("You didn't specify the type of your ufs filesystem\n\n"
821 "mount -t ufs -o ufstype=" 821 "mount -t ufs -o ufstype="
822 "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" 822 "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n"
823 ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " 823 ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, "
@@ -868,7 +868,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
868 break; 868 break;
869 869
870 case UFS_MOUNT_UFSTYPE_SUNOS: 870 case UFS_MOUNT_UFSTYPE_SUNOS:
871 UFSD(("ufstype=sunos\n")) 871 UFSD("ufstype=sunos\n");
872 uspi->s_fsize = block_size = 1024; 872 uspi->s_fsize = block_size = 1024;
873 uspi->s_fmask = ~(1024 - 1); 873 uspi->s_fmask = ~(1024 - 1);
874 uspi->s_fshift = 10; 874 uspi->s_fshift = 10;
@@ -900,7 +900,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
900 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 900 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
901 if (!(sb->s_flags & MS_RDONLY)) { 901 if (!(sb->s_flags & MS_RDONLY)) {
902 if (!silent) 902 if (!silent)
903 printk(KERN_INFO "ufstype=old is supported read-only\n"); 903 pr_info("ufstype=old is supported read-only\n");
904 sb->s_flags |= MS_RDONLY; 904 sb->s_flags |= MS_RDONLY;
905 } 905 }
906 break; 906 break;
@@ -916,7 +916,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
916 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 916 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
917 if (!(sb->s_flags & MS_RDONLY)) { 917 if (!(sb->s_flags & MS_RDONLY)) {
918 if (!silent) 918 if (!silent)
919 printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); 919 pr_info("ufstype=nextstep is supported read-only\n");
920 sb->s_flags |= MS_RDONLY; 920 sb->s_flags |= MS_RDONLY;
921 } 921 }
922 break; 922 break;
@@ -932,7 +932,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
932 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 932 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
933 if (!(sb->s_flags & MS_RDONLY)) { 933 if (!(sb->s_flags & MS_RDONLY)) {
934 if (!silent) 934 if (!silent)
935 printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); 935 pr_info("ufstype=nextstep-cd is supported read-only\n");
936 sb->s_flags |= MS_RDONLY; 936 sb->s_flags |= MS_RDONLY;
937 } 937 }
938 break; 938 break;
@@ -948,7 +948,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
948 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; 948 flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD;
949 if (!(sb->s_flags & MS_RDONLY)) { 949 if (!(sb->s_flags & MS_RDONLY)) {
950 if (!silent) 950 if (!silent)
951 printk(KERN_INFO "ufstype=openstep is supported read-only\n"); 951 pr_info("ufstype=openstep is supported read-only\n");
952 sb->s_flags |= MS_RDONLY; 952 sb->s_flags |= MS_RDONLY;
953 } 953 }
954 break; 954 break;
@@ -963,19 +963,19 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
963 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; 963 flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD;
964 if (!(sb->s_flags & MS_RDONLY)) { 964 if (!(sb->s_flags & MS_RDONLY)) {
965 if (!silent) 965 if (!silent)
966 printk(KERN_INFO "ufstype=hp is supported read-only\n"); 966 pr_info("ufstype=hp is supported read-only\n");
967 sb->s_flags |= MS_RDONLY; 967 sb->s_flags |= MS_RDONLY;
968 } 968 }
969 break; 969 break;
970 default: 970 default:
971 if (!silent) 971 if (!silent)
972 printk("unknown ufstype\n"); 972 pr_err("unknown ufstype\n");
973 goto failed; 973 goto failed;
974 } 974 }
975 975
976again: 976again:
977 if (!sb_set_blocksize(sb, block_size)) { 977 if (!sb_set_blocksize(sb, block_size)) {
978 printk(KERN_ERR "UFS: failed to set blocksize\n"); 978 pr_err("failed to set blocksize\n");
979 goto failed; 979 goto failed;
980 } 980 }
981 981
@@ -1034,7 +1034,7 @@ again:
1034 goto again; 1034 goto again;
1035 } 1035 }
1036 if (!silent) 1036 if (!silent)
1037 printk("ufs_read_super: bad magic number\n"); 1037 pr_err("%s(): bad magic number\n", __func__);
1038 goto failed; 1038 goto failed;
1039 1039
1040magic_found: 1040magic_found:
@@ -1048,33 +1048,33 @@ magic_found:
1048 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); 1048 uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift);
1049 1049
1050 if (!is_power_of_2(uspi->s_fsize)) { 1050 if (!is_power_of_2(uspi->s_fsize)) {
1051 printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", 1051 pr_err("%s(): fragment size %u is not a power of 2\n",
1052 uspi->s_fsize); 1052 __func__, uspi->s_fsize);
1053 goto failed; 1053 goto failed;
1054 } 1054 }
1055 if (uspi->s_fsize < 512) { 1055 if (uspi->s_fsize < 512) {
1056 printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", 1056 pr_err("%s(): fragment size %u is too small\n",
1057 uspi->s_fsize); 1057 __func__, uspi->s_fsize);
1058 goto failed; 1058 goto failed;
1059 } 1059 }
1060 if (uspi->s_fsize > 4096) { 1060 if (uspi->s_fsize > 4096) {
1061 printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", 1061 pr_err("%s(): fragment size %u is too large\n",
1062 uspi->s_fsize); 1062 __func__, uspi->s_fsize);
1063 goto failed; 1063 goto failed;
1064 } 1064 }
1065 if (!is_power_of_2(uspi->s_bsize)) { 1065 if (!is_power_of_2(uspi->s_bsize)) {
1066 printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", 1066 pr_err("%s(): block size %u is not a power of 2\n",
1067 uspi->s_bsize); 1067 __func__, uspi->s_bsize);
1068 goto failed; 1068 goto failed;
1069 } 1069 }
1070 if (uspi->s_bsize < 4096) { 1070 if (uspi->s_bsize < 4096) {
1071 printk(KERN_ERR "ufs_read_super: block size %u is too small\n", 1071 pr_err("%s(): block size %u is too small\n",
1072 uspi->s_bsize); 1072 __func__, uspi->s_bsize);
1073 goto failed; 1073 goto failed;
1074 } 1074 }
1075 if (uspi->s_bsize / uspi->s_fsize > 8) { 1075 if (uspi->s_bsize / uspi->s_fsize > 8) {
1076 printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", 1076 pr_err("%s(): too many fragments per block (%u)\n",
1077 uspi->s_bsize / uspi->s_fsize); 1077 __func__, uspi->s_bsize / uspi->s_fsize);
1078 goto failed; 1078 goto failed;
1079 } 1079 }
1080 if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { 1080 if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) {
@@ -1113,20 +1113,21 @@ magic_found:
1113 UFSD("fs is DEC OSF/1\n"); 1113 UFSD("fs is DEC OSF/1\n");
1114 break; 1114 break;
1115 case UFS_FSACTIVE: 1115 case UFS_FSACTIVE:
1116 printk("ufs_read_super: fs is active\n"); 1116 pr_err("%s(): fs is active\n", __func__);
1117 sb->s_flags |= MS_RDONLY; 1117 sb->s_flags |= MS_RDONLY;
1118 break; 1118 break;
1119 case UFS_FSBAD: 1119 case UFS_FSBAD:
1120 printk("ufs_read_super: fs is bad\n"); 1120 pr_err("%s(): fs is bad\n", __func__);
1121 sb->s_flags |= MS_RDONLY; 1121 sb->s_flags |= MS_RDONLY;
1122 break; 1122 break;
1123 default: 1123 default:
1124 printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); 1124 pr_err("%s(): can't grok fs_clean 0x%x\n",
1125 __func__, usb1->fs_clean);
1125 sb->s_flags |= MS_RDONLY; 1126 sb->s_flags |= MS_RDONLY;
1126 break; 1127 break;
1127 } 1128 }
1128 } else { 1129 } else {
1129 printk("ufs_read_super: fs needs fsck\n"); 1130 pr_err("%s(): fs needs fsck\n", __func__);
1130 sb->s_flags |= MS_RDONLY; 1131 sb->s_flags |= MS_RDONLY;
1131 } 1132 }
1132 1133
@@ -1299,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1299 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1300 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
1300 new_mount_opt |= ufstype; 1301 new_mount_opt |= ufstype;
1301 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1302 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1302 printk("ufstype can't be changed during remount\n"); 1303 pr_err("ufstype can't be changed during remount\n");
1303 unlock_ufs(sb); 1304 unlock_ufs(sb);
1304 return -EINVAL; 1305 return -EINVAL;
1305 } 1306 }
@@ -1328,8 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1328 * fs was mounted as ro, remounting rw 1329 * fs was mounted as ro, remounting rw
1329 */ 1330 */
1330#ifndef CONFIG_UFS_FS_WRITE 1331#ifndef CONFIG_UFS_FS_WRITE
1331 printk("ufs was compiled with read-only support, " 1332 pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n");
1332 "can't be mounted as read-write\n");
1333 unlock_ufs(sb); 1333 unlock_ufs(sb);
1334 return -EINVAL; 1334 return -EINVAL;
1335#else 1335#else
@@ -1338,12 +1338,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1338 ufstype != UFS_MOUNT_UFSTYPE_44BSD && 1338 ufstype != UFS_MOUNT_UFSTYPE_44BSD &&
1339 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && 1339 ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
1340 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1340 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1341 printk("this ufstype is read-only supported\n"); 1341 pr_err("this ufstype is read-only supported\n");
1342 unlock_ufs(sb); 1342 unlock_ufs(sb);
1343 return -EINVAL; 1343 return -EINVAL;
1344 } 1344 }
1345 if (!ufs_read_cylinder_structures(sb)) { 1345 if (!ufs_read_cylinder_structures(sb)) {
1346 printk("failed during remounting\n"); 1346 pr_err("failed during remounting\n");
1347 unlock_ufs(sb); 1347 unlock_ufs(sb);
1348 return -EPERM; 1348 return -EPERM;
1349 } 1349 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 343e6fc571e5..2a07396d5f9e 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -1,6 +1,12 @@
1#ifndef _UFS_UFS_H 1#ifndef _UFS_UFS_H
2#define _UFS_UFS_H 1 2#define _UFS_UFS_H 1
3 3
4#ifdef pr_fmt
5#undef pr_fmt
6#endif
7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
4#define UFS_MAX_GROUP_LOADED 8 10#define UFS_MAX_GROUP_LOADED 8
5#define UFS_CGNO_EMPTY ((unsigned)-1) 11#define UFS_CGNO_EMPTY ((unsigned)-1)
6 12
@@ -71,9 +77,9 @@ struct ufs_inode_info {
71 */ 77 */
72#ifdef CONFIG_UFS_DEBUG 78#ifdef CONFIG_UFS_DEBUG
73# define UFSD(f, a...) { \ 79# define UFSD(f, a...) { \
74 printk ("UFSD (%s, %d): %s:", \ 80 pr_debug("UFSD (%s, %d): %s:", \
75 __FILE__, __LINE__, __func__); \ 81 __FILE__, __LINE__, __func__); \
76 printk (f, ## a); \ 82 pr_debug(f, ## a); \
77 } 83 }
78#else 84#else
79# define UFSD(f, a...) /**/ 85# define UFSD(f, a...) /**/
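
The UFSD()/printk() conversion above leans on the kernel's pr_fmt() hook: every pr_err()/pr_info()/pr_debug() call expands to printk() with pr_fmt(fmt) spliced into the format string, so redefining pr_fmt() near the top of ufs.h prefixes every message from the module. The #undef guard is needed because include/linux/printk.h installs a default pass-through pr_fmt() when none has been defined yet. A minimal sketch of the expansion (the pr_err() definition below is the shape of the real one in printk.h, abbreviated):

/* How the prefix gets attached -- sketch of the printk.h machinery. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define pr_err(fmt, ...) \
	printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)

/* Built into ufs.ko, KBUILD_MODNAME is "ufs", so this call ... */
pr_err("failed to set blocksize\n");
/* ... logs "ufs: failed to set blocksize" at KERN_ERR severity. */
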
diff --git a/fs/xattr.c b/fs/xattr.c
index c69e6d43a0d2..64e83efb742d 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -364,13 +364,12 @@ out:
364 return error; 364 return error;
365} 365}
366 366
367SYSCALL_DEFINE5(setxattr, const char __user *, pathname, 367static int path_setxattr(const char __user *pathname,
368 const char __user *, name, const void __user *, value, 368 const char __user *name, const void __user *value,
369 size_t, size, int, flags) 369 size_t size, int flags, unsigned int lookup_flags)
370{ 370{
371 struct path path; 371 struct path path;
372 int error; 372 int error;
373 unsigned int lookup_flags = LOOKUP_FOLLOW;
374retry: 373retry:
375 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); 374 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
376 if (error) 375 if (error)
@@ -388,28 +387,18 @@ retry:
388 return error; 387 return error;
389} 388}
390 389
390SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
391 const char __user *, name, const void __user *, value,
392 size_t, size, int, flags)
393{
394 return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
395}
396
391SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, 397SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
392 const char __user *, name, const void __user *, value, 398 const char __user *, name, const void __user *, value,
393 size_t, size, int, flags) 399 size_t, size, int, flags)
394{ 400{
395 struct path path; 401 return path_setxattr(pathname, name, value, size, flags, 0);
396 int error;
397 unsigned int lookup_flags = 0;
398retry:
399 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
400 if (error)
401 return error;
402 error = mnt_want_write(path.mnt);
403 if (!error) {
404 error = setxattr(path.dentry, name, value, size, flags);
405 mnt_drop_write(path.mnt);
406 }
407 path_put(&path);
408 if (retry_estale(error, lookup_flags)) {
409 lookup_flags |= LOOKUP_REVAL;
410 goto retry;
411 }
412 return error;
413} 402}
414 403
415SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, 404SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
@@ -481,12 +470,12 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
481 return error; 470 return error;
482} 471}
483 472
484SYSCALL_DEFINE4(getxattr, const char __user *, pathname, 473static ssize_t path_getxattr(const char __user *pathname,
485 const char __user *, name, void __user *, value, size_t, size) 474 const char __user *name, void __user *value,
475 size_t size, unsigned int lookup_flags)
486{ 476{
487 struct path path; 477 struct path path;
488 ssize_t error; 478 ssize_t error;
489 unsigned int lookup_flags = LOOKUP_FOLLOW;
490retry: 479retry:
491 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); 480 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
492 if (error) 481 if (error)
@@ -500,23 +489,16 @@ retry:
500 return error; 489 return error;
501} 490}
502 491
492SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
493 const char __user *, name, void __user *, value, size_t, size)
494{
495 return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
496}
497
503SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, 498SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
504 const char __user *, name, void __user *, value, size_t, size) 499 const char __user *, name, void __user *, value, size_t, size)
505{ 500{
506 struct path path; 501 return path_getxattr(pathname, name, value, size, 0);
507 ssize_t error;
508 unsigned int lookup_flags = 0;
509retry:
510 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
511 if (error)
512 return error;
513 error = getxattr(path.dentry, name, value, size);
514 path_put(&path);
515 if (retry_estale(error, lookup_flags)) {
516 lookup_flags |= LOOKUP_REVAL;
517 goto retry;
518 }
519 return error;
520} 502}
521 503
522SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, 504SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
@@ -571,12 +553,11 @@ listxattr(struct dentry *d, char __user *list, size_t size)
571 return error; 553 return error;
572} 554}
573 555
574SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, 556static ssize_t path_listxattr(const char __user *pathname, char __user *list,
575 size_t, size) 557 size_t size, unsigned int lookup_flags)
576{ 558{
577 struct path path; 559 struct path path;
578 ssize_t error; 560 ssize_t error;
579 unsigned int lookup_flags = LOOKUP_FOLLOW;
580retry: 561retry:
581 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); 562 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
582 if (error) 563 if (error)
@@ -590,23 +571,16 @@ retry:
590 return error; 571 return error;
591} 572}
592 573
574SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
575 size_t, size)
576{
577 return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
578}
579
593SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, 580SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
594 size_t, size) 581 size_t, size)
595{ 582{
596 struct path path; 583 return path_listxattr(pathname, list, size, 0);
597 ssize_t error;
598 unsigned int lookup_flags = 0;
599retry:
600 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
601 if (error)
602 return error;
603 error = listxattr(path.dentry, list, size);
604 path_put(&path);
605 if (retry_estale(error, lookup_flags)) {
606 lookup_flags |= LOOKUP_REVAL;
607 goto retry;
608 }
609 return error;
610} 584}
611 585
612SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) 586SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
@@ -640,12 +614,11 @@ removexattr(struct dentry *d, const char __user *name)
640 return vfs_removexattr(d, kname); 614 return vfs_removexattr(d, kname);
641} 615}
642 616
643SYSCALL_DEFINE2(removexattr, const char __user *, pathname, 617static int path_removexattr(const char __user *pathname,
644 const char __user *, name) 618 const char __user *name, unsigned int lookup_flags)
645{ 619{
646 struct path path; 620 struct path path;
647 int error; 621 int error;
648 unsigned int lookup_flags = LOOKUP_FOLLOW;
649retry: 622retry:
650 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); 623 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
651 if (error) 624 if (error)
@@ -663,27 +636,16 @@ retry:
663 return error; 636 return error;
664} 637}
665 638
639SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
640 const char __user *, name)
641{
642 return path_removexattr(pathname, name, LOOKUP_FOLLOW);
643}
644
666SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, 645SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
667 const char __user *, name) 646 const char __user *, name)
668{ 647{
669 struct path path; 648 return path_removexattr(pathname, name, 0);
670 int error;
671 unsigned int lookup_flags = 0;
672retry:
673 error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
674 if (error)
675 return error;
676 error = mnt_want_write(path.mnt);
677 if (!error) {
678 error = removexattr(path.dentry, name);
679 mnt_drop_write(path.mnt);
680 }
681 path_put(&path);
682 if (retry_estale(error, lookup_flags)) {
683 lookup_flags |= LOOKUP_REVAL;
684 goto retry;
685 }
686 return error;
687} 649}
688 650
689SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) 651SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
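
The xattr refactoring above folds two nearly identical syscall bodies per operation into a single path_*() helper; the l-prefixed variants differ from their plain counterparts only in whether LOOKUP_FOLLOW is passed, i.e. whether a trailing symlink is dereferenced. Each helper keeps the shared ESTALE retry: when a lookup on a network filesystem succeeds but the operation then fails with -ESTALE, retry_estale() allows exactly one more pass with LOOKUP_REVAL set, forcing revalidation of every cached dentry on the path. A hedged sketch of that shape (user_path_at(), path_put() and retry_estale() are the real interfaces; the do_work() callback is invented for illustration, and the write-side helpers additionally bracket the work with mnt_want_write()/mnt_drop_write()):

/* Generic lookup-and-retry pattern behind the path_*xattr() helpers. */
static long xattr_op(const char __user *pathname,
		     unsigned int lookup_flags,
		     long (*do_work)(struct path *path))
{
	struct path path;
	long error;
retry:
	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
	if (error)
		return error;
	error = do_work(&path);
	path_put(&path);
	if (retry_estale(error, lookup_flags)) {
		/* First try hit -ESTALE: retry once, revalidating
		 * every dentry on the path instead of trusting caches. */
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}
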
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 399e8cec6e60..5d47b4df61ea 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -1,6 +1,7 @@
1config XFS_FS 1config XFS_FS
2 tristate "XFS filesystem support" 2 tristate "XFS filesystem support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on (64BIT || LBDAF)
4 select EXPORTFS 5 select EXPORTFS
5 select LIBCRC32C 6 select LIBCRC32C
6 help 7 help
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index c21f43506661..d61799949580 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -17,6 +17,7 @@
17# 17#
18 18
19ccflags-y += -I$(src) # needed for trace events 19ccflags-y += -I$(src) # needed for trace events
20ccflags-y += -I$(src)/libxfs
20 21
21ccflags-$(CONFIG_XFS_DEBUG) += -g 22ccflags-$(CONFIG_XFS_DEBUG) += -g
22 23
@@ -25,6 +26,39 @@ obj-$(CONFIG_XFS_FS) += xfs.o
25# this one should be compiled first, as the tracing macros can easily blow up 26# this one should be compiled first, as the tracing macros can easily blow up
26xfs-y += xfs_trace.o 27xfs-y += xfs_trace.o
27 28
29# build the libxfs code first
30xfs-y += $(addprefix libxfs/, \
31 xfs_alloc.o \
32 xfs_alloc_btree.o \
33 xfs_attr.o \
34 xfs_attr_leaf.o \
35 xfs_attr_remote.o \
36 xfs_bmap.o \
37 xfs_bmap_btree.o \
38 xfs_btree.o \
39 xfs_da_btree.o \
40 xfs_da_format.o \
41 xfs_dir2.o \
42 xfs_dir2_block.o \
43 xfs_dir2_data.o \
44 xfs_dir2_leaf.o \
45 xfs_dir2_node.o \
46 xfs_dir2_sf.o \
47 xfs_dquot_buf.o \
48 xfs_ialloc.o \
49 xfs_ialloc_btree.o \
50 xfs_inode_fork.o \
51 xfs_inode_buf.o \
52 xfs_log_rlimit.o \
53 xfs_sb.o \
54 xfs_symlink_remote.o \
55 xfs_trans_resv.o \
56 )
57# xfs_rtbitmap is shared with libxfs
58xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \
59 xfs_rtbitmap.o \
60 )
61
28# highlevel code 62# highlevel code
29xfs-y += xfs_aops.o \ 63xfs-y += xfs_aops.o \
30 xfs_attr_inactive.o \ 64 xfs_attr_inactive.o \
@@ -45,53 +79,27 @@ xfs-y += xfs_aops.o \
45 xfs_ioctl.o \ 79 xfs_ioctl.o \
46 xfs_iomap.o \ 80 xfs_iomap.o \
47 xfs_iops.o \ 81 xfs_iops.o \
82 xfs_inode.o \
48 xfs_itable.o \ 83 xfs_itable.o \
49 xfs_message.o \ 84 xfs_message.o \
50 xfs_mount.o \ 85 xfs_mount.o \
51 xfs_mru_cache.o \ 86 xfs_mru_cache.o \
52 xfs_super.o \ 87 xfs_super.o \
53 xfs_symlink.o \ 88 xfs_symlink.o \
89 xfs_sysfs.o \
54 xfs_trans.o \ 90 xfs_trans.o \
55 xfs_xattr.o \ 91 xfs_xattr.o \
56 kmem.o \ 92 kmem.o \
57 uuid.o 93 uuid.o
58 94
59# code shared with libxfs
60xfs-y += xfs_alloc.o \
61 xfs_alloc_btree.o \
62 xfs_attr.o \
63 xfs_attr_leaf.o \
64 xfs_attr_remote.o \
65 xfs_bmap.o \
66 xfs_bmap_btree.o \
67 xfs_btree.o \
68 xfs_da_btree.o \
69 xfs_da_format.o \
70 xfs_dir2.o \
71 xfs_dir2_block.o \
72 xfs_dir2_data.o \
73 xfs_dir2_leaf.o \
74 xfs_dir2_node.o \
75 xfs_dir2_sf.o \
76 xfs_dquot_buf.o \
77 xfs_ialloc.o \
78 xfs_ialloc_btree.o \
79 xfs_icreate_item.o \
80 xfs_inode.o \
81 xfs_inode_fork.o \
82 xfs_inode_buf.o \
83 xfs_log_recover.o \
84 xfs_log_rlimit.o \
85 xfs_sb.o \
86 xfs_symlink_remote.o \
87 xfs_trans_resv.o
88
89# low-level transaction/log code 95# low-level transaction/log code
90xfs-y += xfs_log.o \ 96xfs-y += xfs_log.o \
91 xfs_log_cil.o \ 97 xfs_log_cil.o \
92 xfs_buf_item.o \ 98 xfs_buf_item.o \
93 xfs_extfree_item.o \ 99 xfs_extfree_item.o \
100 xfs_icreate_item.o \
94 xfs_inode_item.o \ 101 xfs_inode_item.o \
102 xfs_log_recover.o \
95 xfs_trans_ail.o \ 103 xfs_trans_ail.o \
96 xfs_trans_buf.o \ 104 xfs_trans_buf.o \
97 xfs_trans_extfree.o \ 105 xfs_trans_extfree.o \
@@ -107,8 +115,7 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
107 xfs_quotaops.o 115 xfs_quotaops.o
108 116
109# xfs_rtbitmap is shared with libxfs 117# xfs_rtbitmap is shared with libxfs
110xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ 118xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
111 xfs_rtbitmap.o
112 119
113xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o 120xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
114xfs-$(CONFIG_PROC_FS) += xfs_stats.o 121xfs-$(CONFIG_PROC_FS) += xfs_stats.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 844e288b9576..53e95b2a1369 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -21,7 +21,6 @@
21#include <linux/swap.h> 21#include <linux/swap.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h"
25#include "kmem.h" 24#include "kmem.h"
26#include "xfs_message.h" 25#include "xfs_message.h"
27 26
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 6e247a99f5db..6e247a99f5db 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index d43813267a80..eff34218f405 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -483,9 +483,9 @@ xfs_agfl_read_verify(
483 return; 483 return;
484 484
485 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 485 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
486 xfs_buf_ioerror(bp, EFSBADCRC); 486 xfs_buf_ioerror(bp, -EFSBADCRC);
487 else if (!xfs_agfl_verify(bp)) 487 else if (!xfs_agfl_verify(bp))
488 xfs_buf_ioerror(bp, EFSCORRUPTED); 488 xfs_buf_ioerror(bp, -EFSCORRUPTED);
489 489
490 if (bp->b_error) 490 if (bp->b_error)
491 xfs_verifier_error(bp); 491 xfs_verifier_error(bp);
@@ -503,7 +503,7 @@ xfs_agfl_write_verify(
503 return; 503 return;
504 504
505 if (!xfs_agfl_verify(bp)) { 505 if (!xfs_agfl_verify(bp)) {
506 xfs_buf_ioerror(bp, EFSCORRUPTED); 506 xfs_buf_ioerror(bp, -EFSCORRUPTED);
507 xfs_verifier_error(bp); 507 xfs_verifier_error(bp);
508 return; 508 return;
509 } 509 }
@@ -559,7 +559,7 @@ xfs_alloc_update_counters(
559 xfs_trans_agblocks_delta(tp, len); 559 xfs_trans_agblocks_delta(tp, len);
560 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 560 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
561 be32_to_cpu(agf->agf_length))) 561 be32_to_cpu(agf->agf_length)))
562 return EFSCORRUPTED; 562 return -EFSCORRUPTED;
563 563
564 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 564 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
565 return 0; 565 return 0;
@@ -2209,6 +2209,10 @@ xfs_agf_verify(
2209 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) 2209 be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
2210 return false; 2210 return false;
2211 2211
2212 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
2213 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
2214 return false;
2215
2212 /* 2216 /*
2213 * during growfs operations, the perag is not fully initialised, 2217 * during growfs operations, the perag is not fully initialised,
2214 * so we can't use it for any useful checking. growfs ensures we can't 2218 * so we can't use it for any useful checking. growfs ensures we can't
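
The bounds check added above hardens xfs_agf_verify() against crafted or corrupted AGFs: the on-disk agf_levels[] values later size the walk through the free-space btrees, while the in-memory cursor only carries XFS_BTREE_MAXLEVELS worth of per-level state. Refusing over-deep trees in the verifier converts a potential out-of-bounds access into a clean -EFSCORRUPTED buffer error. Why the bound matters, sketched (field names abbreviated from this era's struct xfs_btree_cur; treat this as illustrative, not the full definition):

/* Per-level cursor state is statically sized ... */
struct xfs_btree_cur_sketch {
	struct xfs_buf	*bc_bufs[XFS_BTREE_MAXLEVELS];	/* buffer per level */
	int		bc_ptrs[XFS_BTREE_MAXLEVELS];	/* index per level */
};
/* ... so a corrupt agf_levels[] beyond XFS_BTREE_MAXLEVELS would walk
 * the tree deeper than these arrays allow. The verifier refuses first. */
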
@@ -2234,11 +2238,11 @@ xfs_agf_read_verify(
2234 2238
2235 if (xfs_sb_version_hascrc(&mp->m_sb) && 2239 if (xfs_sb_version_hascrc(&mp->m_sb) &&
2236 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 2240 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2237 xfs_buf_ioerror(bp, EFSBADCRC); 2241 xfs_buf_ioerror(bp, -EFSBADCRC);
2238 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp, 2242 else if (XFS_TEST_ERROR(!xfs_agf_verify(mp, bp), mp,
2239 XFS_ERRTAG_ALLOC_READ_AGF, 2243 XFS_ERRTAG_ALLOC_READ_AGF,
2240 XFS_RANDOM_ALLOC_READ_AGF)) 2244 XFS_RANDOM_ALLOC_READ_AGF))
2241 xfs_buf_ioerror(bp, EFSCORRUPTED); 2245 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2242 2246
2243 if (bp->b_error) 2247 if (bp->b_error)
2244 xfs_verifier_error(bp); 2248 xfs_verifier_error(bp);
@@ -2252,7 +2256,7 @@ xfs_agf_write_verify(
2252 struct xfs_buf_log_item *bip = bp->b_fspriv; 2256 struct xfs_buf_log_item *bip = bp->b_fspriv;
2253 2257
2254 if (!xfs_agf_verify(mp, bp)) { 2258 if (!xfs_agf_verify(mp, bp)) {
2255 xfs_buf_ioerror(bp, EFSCORRUPTED); 2259 xfs_buf_ioerror(bp, -EFSCORRUPTED);
2256 xfs_verifier_error(bp); 2260 xfs_verifier_error(bp);
2257 return; 2261 return;
2258 } 2262 }
@@ -2601,11 +2605,11 @@ xfs_free_extent(
2601 */ 2605 */
2602 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2606 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2603 if (args.agno >= args.mp->m_sb.sb_agcount) 2607 if (args.agno >= args.mp->m_sb.sb_agcount)
2604 return EFSCORRUPTED; 2608 return -EFSCORRUPTED;
2605 2609
2606 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2610 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2607 if (args.agbno >= args.mp->m_sb.sb_agblocks) 2611 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2608 return EFSCORRUPTED; 2612 return -EFSCORRUPTED;
2609 2613
2610 args.pag = xfs_perag_get(args.mp, args.agno); 2614 args.pag = xfs_perag_get(args.mp, args.agno);
2611 ASSERT(args.pag); 2615 ASSERT(args.pag);
@@ -2617,7 +2621,7 @@ xfs_free_extent(
2617 /* validate the extent size is legal now we have the agf locked */ 2621 /* validate the extent size is legal now we have the agf locked */
2618 if (args.agbno + len > 2622 if (args.agbno + len >
2619 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) { 2623 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2620 error = EFSCORRUPTED; 2624 error = -EFSCORRUPTED;
2621 goto error0; 2625 goto error0;
2622 } 2626 }
2623 2627
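
The sign flips on EFSCORRUPTED/EFSBADCRC in this file are one slice of a tree-wide conversion: XFS historically passed positive errno values around internally and negated them at the VFS boundary, whereas the rest of the kernel returns negative errnos everywhere. Returning -EFSCORRUPTED and friends directly drops that translation layer (and, as the attr hunks below show, the XFS_ERROR() wrapper goes with it). The convention in miniature (the helper name is invented for illustration):

/* Kernel-wide error convention this series moves XFS onto:
 * return 0 on success, a negative errno on failure. */
static int check_agno_sketch(struct xfs_mount *mp, xfs_fsblock_t bno)
{
	if (XFS_FSB_TO_AGNO(mp, bno) >= mp->m_sb.sb_agcount)
		return -EFSCORRUPTED;	/* negative at the source; no
					 * later negation for the VFS */
	return 0;
}
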
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index feacb061bab7..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 8358f1ded94d..e0e83e24d3ef 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -355,9 +355,9 @@ xfs_allocbt_read_verify(
355 struct xfs_buf *bp) 355 struct xfs_buf *bp)
356{ 356{
357 if (!xfs_btree_sblock_verify_crc(bp)) 357 if (!xfs_btree_sblock_verify_crc(bp))
358 xfs_buf_ioerror(bp, EFSBADCRC); 358 xfs_buf_ioerror(bp, -EFSBADCRC);
359 else if (!xfs_allocbt_verify(bp)) 359 else if (!xfs_allocbt_verify(bp))
360 xfs_buf_ioerror(bp, EFSCORRUPTED); 360 xfs_buf_ioerror(bp, -EFSCORRUPTED);
361 361
362 if (bp->b_error) { 362 if (bp->b_error) {
363 trace_xfs_btree_corrupt(bp, _RET_IP_); 363 trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -371,7 +371,7 @@ xfs_allocbt_write_verify(
371{ 371{
372 if (!xfs_allocbt_verify(bp)) { 372 if (!xfs_allocbt_verify(bp)) {
373 trace_xfs_btree_corrupt(bp, _RET_IP_); 373 trace_xfs_btree_corrupt(bp, _RET_IP_);
374 xfs_buf_ioerror(bp, EFSCORRUPTED); 374 xfs_buf_ioerror(bp, -EFSCORRUPTED);
375 xfs_verifier_error(bp); 375 xfs_verifier_error(bp);
376 return; 376 return;
377 } 377 }
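
xfs_allocbt_read_verify() and xfs_allocbt_write_verify() above show the standard xfs_buf_ops verifier pairing: on read completion, check the CRC first (if the checksum is bad, the structural checks are meaningless), then the structural invariants; on write submission, check structure only, since the generic write path recomputes the CRC once the block is known good. Condensed from the hunks above (error flow as shown; the tracepoint and CRC stamping are elided):

/* Read side: CRC, then structure; errors stick to the buffer. */
static void sketch_read_verify(struct xfs_buf *bp)
{
	if (!xfs_btree_sblock_verify_crc(bp))
		xfs_buf_ioerror(bp, -EFSBADCRC);	/* torn/stale media */
	else if (!xfs_allocbt_verify(bp))
		xfs_buf_ioerror(bp, -EFSCORRUPTED);	/* logic corruption */
	if (bp->b_error)
		xfs_verifier_error(bp);
}

/* Write side: never let a structurally bad block reach disk. */
static void sketch_write_verify(struct xfs_buf *bp)
{
	if (!xfs_allocbt_verify(bp)) {
		xfs_buf_ioerror(bp, -EFSCORRUPTED);
		xfs_verifier_error(bp);
		return;
	}
	/* CRC recomputation happens in the generic write path (elided). */
}
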
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45e189e7e81c..45e189e7e81c 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index bfe36fc2cdc2..353fb425faef 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -85,7 +85,7 @@ xfs_attr_args_init(
85{ 85{
86 86
87 if (!name) 87 if (!name)
88 return EINVAL; 88 return -EINVAL;
89 89
90 memset(args, 0, sizeof(*args)); 90 memset(args, 0, sizeof(*args));
91 args->geo = dp->i_mount->m_attr_geo; 91 args->geo = dp->i_mount->m_attr_geo;
@@ -95,7 +95,7 @@ xfs_attr_args_init(
95 args->name = name; 95 args->name = name;
96 args->namelen = strlen((const char *)name); 96 args->namelen = strlen((const char *)name);
97 if (args->namelen >= MAXNAMELEN) 97 if (args->namelen >= MAXNAMELEN)
98 return EFAULT; /* match IRIX behaviour */ 98 return -EFAULT; /* match IRIX behaviour */
99 99
100 args->hashval = xfs_da_hashname(args->name, args->namelen); 100 args->hashval = xfs_da_hashname(args->name, args->namelen);
101 return 0; 101 return 0;
@@ -131,10 +131,10 @@ xfs_attr_get(
131 XFS_STATS_INC(xs_attr_get); 131 XFS_STATS_INC(xs_attr_get);
132 132
133 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 133 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
134 return EIO; 134 return -EIO;
135 135
136 if (!xfs_inode_hasattr(ip)) 136 if (!xfs_inode_hasattr(ip))
137 return ENOATTR; 137 return -ENOATTR;
138 138
139 error = xfs_attr_args_init(&args, ip, name, flags); 139 error = xfs_attr_args_init(&args, ip, name, flags);
140 if (error) 140 if (error)
@@ -145,7 +145,7 @@ xfs_attr_get(
145 145
146 lock_mode = xfs_ilock_attr_map_shared(ip); 146 lock_mode = xfs_ilock_attr_map_shared(ip);
147 if (!xfs_inode_hasattr(ip)) 147 if (!xfs_inode_hasattr(ip))
148 error = ENOATTR; 148 error = -ENOATTR;
149 else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) 149 else if (ip->i_d.di_aformat == XFS_DINODE_FMT_LOCAL)
150 error = xfs_attr_shortform_getvalue(&args); 150 error = xfs_attr_shortform_getvalue(&args);
151 else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK)) 151 else if (xfs_bmap_one_block(ip, XFS_ATTR_FORK))
@@ -155,7 +155,7 @@ xfs_attr_get(
155 xfs_iunlock(ip, lock_mode); 155 xfs_iunlock(ip, lock_mode);
156 156
157 *valuelenp = args.valuelen; 157 *valuelenp = args.valuelen;
158 return error == EEXIST ? 0 : error; 158 return error == -EEXIST ? 0 : error;
159} 159}
160 160
161/* 161/*
@@ -213,7 +213,7 @@ xfs_attr_set(
213 XFS_STATS_INC(xs_attr_set); 213 XFS_STATS_INC(xs_attr_set);
214 214
215 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 215 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
216 return EIO; 216 return -EIO;
217 217
218 error = xfs_attr_args_init(&args, dp, name, flags); 218 error = xfs_attr_args_init(&args, dp, name, flags);
219 if (error) 219 if (error)
@@ -304,7 +304,7 @@ xfs_attr_set(
304 * the inode. 304 * the inode.
305 */ 305 */
306 error = xfs_attr_shortform_addname(&args); 306 error = xfs_attr_shortform_addname(&args);
307 if (error != ENOSPC) { 307 if (error != -ENOSPC) {
308 /* 308 /*
309 * Commit the shortform mods, and we're done. 309 * Commit the shortform mods, and we're done.
310 * NOTE: this is also the error path (EEXIST, etc). 310 * NOTE: this is also the error path (EEXIST, etc).
@@ -419,10 +419,10 @@ xfs_attr_remove(
419 XFS_STATS_INC(xs_attr_remove); 419 XFS_STATS_INC(xs_attr_remove);
420 420
421 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 421 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
422 return EIO; 422 return -EIO;
423 423
424 if (!xfs_inode_hasattr(dp)) 424 if (!xfs_inode_hasattr(dp))
425 return ENOATTR; 425 return -ENOATTR;
426 426
427 error = xfs_attr_args_init(&args, dp, name, flags); 427 error = xfs_attr_args_init(&args, dp, name, flags);
428 if (error) 428 if (error)
@@ -477,7 +477,7 @@ xfs_attr_remove(
477 xfs_trans_ijoin(args.trans, dp, 0); 477 xfs_trans_ijoin(args.trans, dp, 0);
478 478
479 if (!xfs_inode_hasattr(dp)) { 479 if (!xfs_inode_hasattr(dp)) {
480 error = XFS_ERROR(ENOATTR); 480 error = -ENOATTR;
481 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { 481 } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
482 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE); 482 ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
483 error = xfs_attr_shortform_remove(&args); 483 error = xfs_attr_shortform_remove(&args);
@@ -534,28 +534,28 @@ xfs_attr_shortform_addname(xfs_da_args_t *args)
534 trace_xfs_attr_sf_addname(args); 534 trace_xfs_attr_sf_addname(args);
535 535
536 retval = xfs_attr_shortform_lookup(args); 536 retval = xfs_attr_shortform_lookup(args);
537 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 537 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
538 return(retval); 538 return retval;
539 } else if (retval == EEXIST) { 539 } else if (retval == -EEXIST) {
540 if (args->flags & ATTR_CREATE) 540 if (args->flags & ATTR_CREATE)
541 return(retval); 541 return retval;
542 retval = xfs_attr_shortform_remove(args); 542 retval = xfs_attr_shortform_remove(args);
543 ASSERT(retval == 0); 543 ASSERT(retval == 0);
544 } 544 }
545 545
546 if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX || 546 if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
547 args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX) 547 args->valuelen >= XFS_ATTR_SF_ENTSIZE_MAX)
548 return(XFS_ERROR(ENOSPC)); 548 return -ENOSPC;
549 549
550 newsize = XFS_ATTR_SF_TOTSIZE(args->dp); 550 newsize = XFS_ATTR_SF_TOTSIZE(args->dp);
551 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen); 551 newsize += XFS_ATTR_SF_ENTSIZE_BYNAME(args->namelen, args->valuelen);
552 552
553 forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize); 553 forkoff = xfs_attr_shortform_bytesfit(args->dp, newsize);
554 if (!forkoff) 554 if (!forkoff)
555 return(XFS_ERROR(ENOSPC)); 555 return -ENOSPC;
556 556
557 xfs_attr_shortform_add(args, forkoff); 557 xfs_attr_shortform_add(args, forkoff);
558 return(0); 558 return 0;
559} 559}
560 560
561 561
@@ -592,10 +592,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
592 * the given flags produce an error or call for an atomic rename. 592 * the given flags produce an error or call for an atomic rename.
593 */ 593 */
594 retval = xfs_attr3_leaf_lookup_int(bp, args); 594 retval = xfs_attr3_leaf_lookup_int(bp, args);
595 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 595 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
596 xfs_trans_brelse(args->trans, bp); 596 xfs_trans_brelse(args->trans, bp);
597 return retval; 597 return retval;
598 } else if (retval == EEXIST) { 598 } else if (retval == -EEXIST) {
599 if (args->flags & ATTR_CREATE) { /* pure create op */ 599 if (args->flags & ATTR_CREATE) { /* pure create op */
600 xfs_trans_brelse(args->trans, bp); 600 xfs_trans_brelse(args->trans, bp);
601 return retval; 601 return retval;
@@ -626,7 +626,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
626 * if required. 626 * if required.
627 */ 627 */
628 retval = xfs_attr3_leaf_add(bp, args); 628 retval = xfs_attr3_leaf_add(bp, args);
629 if (retval == ENOSPC) { 629 if (retval == -ENOSPC) {
630 /* 630 /*
631 * Promote the attribute list to the Btree format, then 631 * Promote the attribute list to the Btree format, then
632 * Commit that transaction so that the node_addname() call 632 * Commit that transaction so that the node_addname() call
@@ -642,7 +642,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
642 ASSERT(committed); 642 ASSERT(committed);
643 args->trans = NULL; 643 args->trans = NULL;
644 xfs_bmap_cancel(args->flist); 644 xfs_bmap_cancel(args->flist);
645 return(error); 645 return error;
646 } 646 }
647 647
648 /* 648 /*
@@ -658,13 +658,13 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
658 */ 658 */
659 error = xfs_trans_roll(&args->trans, dp); 659 error = xfs_trans_roll(&args->trans, dp);
660 if (error) 660 if (error)
661 return (error); 661 return error;
662 662
663 /* 663 /*
664 * Fob the whole rest of the problem off on the Btree code. 664 * Fob the whole rest of the problem off on the Btree code.
665 */ 665 */
666 error = xfs_attr_node_addname(args); 666 error = xfs_attr_node_addname(args);
667 return(error); 667 return error;
668 } 668 }
669 669
670 /* 670 /*
@@ -673,7 +673,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
673 */ 673 */
674 error = xfs_trans_roll(&args->trans, dp); 674 error = xfs_trans_roll(&args->trans, dp);
675 if (error) 675 if (error)
676 return (error); 676 return error;
677 677
678 /* 678 /*
679 * If there was an out-of-line value, allocate the blocks we 679 * If there was an out-of-line value, allocate the blocks we
@@ -684,7 +684,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
684 if (args->rmtblkno > 0) { 684 if (args->rmtblkno > 0) {
685 error = xfs_attr_rmtval_set(args); 685 error = xfs_attr_rmtval_set(args);
686 if (error) 686 if (error)
687 return(error); 687 return error;
688 } 688 }
689 689
690 /* 690 /*
@@ -700,7 +700,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
700 */ 700 */
701 error = xfs_attr3_leaf_flipflags(args); 701 error = xfs_attr3_leaf_flipflags(args);
702 if (error) 702 if (error)
703 return(error); 703 return error;
704 704
705 /* 705 /*
706 * Dismantle the "old" attribute/value pair by removing 706 * Dismantle the "old" attribute/value pair by removing
@@ -714,7 +714,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
714 if (args->rmtblkno) { 714 if (args->rmtblkno) {
715 error = xfs_attr_rmtval_remove(args); 715 error = xfs_attr_rmtval_remove(args);
716 if (error) 716 if (error)
717 return(error); 717 return error;
718 } 718 }
719 719
720 /* 720 /*
@@ -744,7 +744,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
744 ASSERT(committed); 744 ASSERT(committed);
745 args->trans = NULL; 745 args->trans = NULL;
746 xfs_bmap_cancel(args->flist); 746 xfs_bmap_cancel(args->flist);
747 return(error); 747 return error;
748 } 748 }
749 749
750 /* 750 /*
@@ -795,7 +795,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
795 return error; 795 return error;
796 796
797 error = xfs_attr3_leaf_lookup_int(bp, args); 797 error = xfs_attr3_leaf_lookup_int(bp, args);
798 if (error == ENOATTR) { 798 if (error == -ENOATTR) {
799 xfs_trans_brelse(args->trans, bp); 799 xfs_trans_brelse(args->trans, bp);
800 return error; 800 return error;
801 } 801 }
@@ -850,7 +850,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
850 return error; 850 return error;
851 851
852 error = xfs_attr3_leaf_lookup_int(bp, args); 852 error = xfs_attr3_leaf_lookup_int(bp, args);
853 if (error != EEXIST) { 853 if (error != -EEXIST) {
854 xfs_trans_brelse(args->trans, bp); 854 xfs_trans_brelse(args->trans, bp);
855 return error; 855 return error;
856 } 856 }
@@ -906,9 +906,9 @@ restart:
906 goto out; 906 goto out;
907 blk = &state->path.blk[ state->path.active-1 ]; 907 blk = &state->path.blk[ state->path.active-1 ];
908 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 908 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
909 if ((args->flags & ATTR_REPLACE) && (retval == ENOATTR)) { 909 if ((args->flags & ATTR_REPLACE) && (retval == -ENOATTR)) {
910 goto out; 910 goto out;
911 } else if (retval == EEXIST) { 911 } else if (retval == -EEXIST) {
912 if (args->flags & ATTR_CREATE) 912 if (args->flags & ATTR_CREATE)
913 goto out; 913 goto out;
914 914
@@ -933,7 +933,7 @@ restart:
933 } 933 }
934 934
935 retval = xfs_attr3_leaf_add(blk->bp, state->args); 935 retval = xfs_attr3_leaf_add(blk->bp, state->args);
936 if (retval == ENOSPC) { 936 if (retval == -ENOSPC) {
937 if (state->path.active == 1) { 937 if (state->path.active == 1) {
938 /* 938 /*
939 * Its really a single leaf node, but it had 939 * Its really a single leaf node, but it had
@@ -1031,7 +1031,7 @@ restart:
1031 if (args->rmtblkno > 0) { 1031 if (args->rmtblkno > 0) {
1032 error = xfs_attr_rmtval_set(args); 1032 error = xfs_attr_rmtval_set(args);
1033 if (error) 1033 if (error)
1034 return(error); 1034 return error;
1035 } 1035 }
1036 1036
1037 /* 1037 /*
@@ -1061,7 +1061,7 @@ restart:
1061 if (args->rmtblkno) { 1061 if (args->rmtblkno) {
1062 error = xfs_attr_rmtval_remove(args); 1062 error = xfs_attr_rmtval_remove(args);
1063 if (error) 1063 if (error)
1064 return(error); 1064 return error;
1065 } 1065 }
1066 1066
1067 /* 1067 /*
@@ -1134,8 +1134,8 @@ out:
1134 if (state) 1134 if (state)
1135 xfs_da_state_free(state); 1135 xfs_da_state_free(state);
1136 if (error) 1136 if (error)
1137 return(error); 1137 return error;
1138 return(retval); 1138 return retval;
1139} 1139}
1140 1140
1141/* 1141/*
@@ -1168,7 +1168,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1168 * Search to see if name exists, and get back a pointer to it. 1168 * Search to see if name exists, and get back a pointer to it.
1169 */ 1169 */
1170 error = xfs_da3_node_lookup_int(state, &retval); 1170 error = xfs_da3_node_lookup_int(state, &retval);
1171 if (error || (retval != EEXIST)) { 1171 if (error || (retval != -EEXIST)) {
1172 if (error == 0) 1172 if (error == 0)
1173 error = retval; 1173 error = retval;
1174 goto out; 1174 goto out;
@@ -1297,7 +1297,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
1297 1297
1298out: 1298out:
1299 xfs_da_state_free(state); 1299 xfs_da_state_free(state);
1300 return(error); 1300 return error;
1301} 1301}
1302 1302
1303/* 1303/*
@@ -1345,7 +1345,7 @@ xfs_attr_fillstate(xfs_da_state_t *state)
1345 } 1345 }
1346 } 1346 }
1347 1347
1348 return(0); 1348 return 0;
1349} 1349}
1350 1350
1351/* 1351/*
@@ -1376,7 +1376,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1376 blk->blkno, blk->disk_blkno, 1376 blk->blkno, blk->disk_blkno,
1377 &blk->bp, XFS_ATTR_FORK); 1377 &blk->bp, XFS_ATTR_FORK);
1378 if (error) 1378 if (error)
1379 return(error); 1379 return error;
1380 } else { 1380 } else {
1381 blk->bp = NULL; 1381 blk->bp = NULL;
1382 } 1382 }
@@ -1395,13 +1395,13 @@ xfs_attr_refillstate(xfs_da_state_t *state)
1395 blk->blkno, blk->disk_blkno, 1395 blk->blkno, blk->disk_blkno,
1396 &blk->bp, XFS_ATTR_FORK); 1396 &blk->bp, XFS_ATTR_FORK);
1397 if (error) 1397 if (error)
1398 return(error); 1398 return error;
1399 } else { 1399 } else {
1400 blk->bp = NULL; 1400 blk->bp = NULL;
1401 } 1401 }
1402 } 1402 }
1403 1403
1404 return(0); 1404 return 0;
1405} 1405}
1406 1406
1407/* 1407/*
@@ -1431,7 +1431,7 @@ xfs_attr_node_get(xfs_da_args_t *args)
1431 error = xfs_da3_node_lookup_int(state, &retval); 1431 error = xfs_da3_node_lookup_int(state, &retval);
1432 if (error) { 1432 if (error) {
1433 retval = error; 1433 retval = error;
1434 } else if (retval == EEXIST) { 1434 } else if (retval == -EEXIST) {
1435 blk = &state->path.blk[ state->path.active-1 ]; 1435 blk = &state->path.blk[ state->path.active-1 ];
1436 ASSERT(blk->bp != NULL); 1436 ASSERT(blk->bp != NULL);
1437 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); 1437 ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -1455,5 +1455,5 @@ xfs_attr_node_get(xfs_da_args_t *args)
1455 } 1455 }
1456 1456
1457 xfs_da_state_free(state); 1457 xfs_da_state_free(state);
1458 return(retval); 1458 return retval;
1459} 1459}
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 28712d29e43c..b1f73dbbf3d8 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -214,7 +214,7 @@ xfs_attr3_leaf_write_verify(
214 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr; 214 struct xfs_attr3_leaf_hdr *hdr3 = bp->b_addr;
215 215
216 if (!xfs_attr3_leaf_verify(bp)) { 216 if (!xfs_attr3_leaf_verify(bp)) {
217 xfs_buf_ioerror(bp, EFSCORRUPTED); 217 xfs_buf_ioerror(bp, -EFSCORRUPTED);
218 xfs_verifier_error(bp); 218 xfs_verifier_error(bp);
219 return; 219 return;
220 } 220 }
@@ -242,9 +242,9 @@ xfs_attr3_leaf_read_verify(
242 242
243 if (xfs_sb_version_hascrc(&mp->m_sb) && 243 if (xfs_sb_version_hascrc(&mp->m_sb) &&
244 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) 244 !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF))
245 xfs_buf_ioerror(bp, EFSBADCRC); 245 xfs_buf_ioerror(bp, -EFSBADCRC);
246 else if (!xfs_attr3_leaf_verify(bp)) 246 else if (!xfs_attr3_leaf_verify(bp))
247 xfs_buf_ioerror(bp, EFSCORRUPTED); 247 xfs_buf_ioerror(bp, -EFSCORRUPTED);
248 248
249 if (bp->b_error) 249 if (bp->b_error)
250 xfs_verifier_error(bp); 250 xfs_verifier_error(bp);
@@ -547,7 +547,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
547 break; 547 break;
548 } 548 }
549 if (i == end) 549 if (i == end)
550 return(XFS_ERROR(ENOATTR)); 550 return -ENOATTR;
551 551
552 /* 552 /*
553 * Fix up the attribute fork data, covering the hole 553 * Fix up the attribute fork data, covering the hole
@@ -582,7 +582,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args)
582 582
583 xfs_sbversion_add_attr2(mp, args->trans); 583 xfs_sbversion_add_attr2(mp, args->trans);
584 584
585 return(0); 585 return 0;
586} 586}
587 587
588/* 588/*
@@ -611,9 +611,9 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
611 continue; 611 continue;
612 if (!xfs_attr_namesp_match(args->flags, sfe->flags)) 612 if (!xfs_attr_namesp_match(args->flags, sfe->flags))
613 continue; 613 continue;
614 return(XFS_ERROR(EEXIST)); 614 return -EEXIST;
615 } 615 }
616 return(XFS_ERROR(ENOATTR)); 616 return -ENOATTR;
617} 617}
618 618
619/* 619/*
@@ -640,18 +640,18 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args)
640 continue; 640 continue;
641 if (args->flags & ATTR_KERNOVAL) { 641 if (args->flags & ATTR_KERNOVAL) {
642 args->valuelen = sfe->valuelen; 642 args->valuelen = sfe->valuelen;
643 return(XFS_ERROR(EEXIST)); 643 return -EEXIST;
644 } 644 }
645 if (args->valuelen < sfe->valuelen) { 645 if (args->valuelen < sfe->valuelen) {
646 args->valuelen = sfe->valuelen; 646 args->valuelen = sfe->valuelen;
647 return(XFS_ERROR(ERANGE)); 647 return -ERANGE;
648 } 648 }
649 args->valuelen = sfe->valuelen; 649 args->valuelen = sfe->valuelen;
650 memcpy(args->value, &sfe->nameval[args->namelen], 650 memcpy(args->value, &sfe->nameval[args->namelen],
651 args->valuelen); 651 args->valuelen);
652 return(XFS_ERROR(EEXIST)); 652 return -EEXIST;
653 } 653 }
654 return(XFS_ERROR(ENOATTR)); 654 return -ENOATTR;
655} 655}
656 656
657/* 657/*
@@ -691,7 +691,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
691 * If we hit an IO error middle of the transaction inside 691 * If we hit an IO error middle of the transaction inside
692 * grow_inode(), we may have inconsistent data. Bail out. 692 * grow_inode(), we may have inconsistent data. Bail out.
693 */ 693 */
694 if (error == EIO) 694 if (error == -EIO)
695 goto out; 695 goto out;
696 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ 696 xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */
697 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ 697 memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */
@@ -730,9 +730,9 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
730 sfe->namelen); 730 sfe->namelen);
731 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags); 731 nargs.flags = XFS_ATTR_NSP_ONDISK_TO_ARGS(sfe->flags);
732 error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ 732 error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
733 ASSERT(error == ENOATTR); 733 ASSERT(error == -ENOATTR);
734 error = xfs_attr3_leaf_add(bp, &nargs); 734 error = xfs_attr3_leaf_add(bp, &nargs);
735 ASSERT(error != ENOSPC); 735 ASSERT(error != -ENOSPC);
736 if (error) 736 if (error)
737 goto out; 737 goto out;
738 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 738 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
@@ -741,7 +741,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args)
741 741
742out: 742out:
743 kmem_free(tmpbuffer); 743 kmem_free(tmpbuffer);
744 return(error); 744 return error;
745} 745}
746 746
747/* 747/*
@@ -769,12 +769,12 @@ xfs_attr_shortform_allfit(
769 if (entry->flags & XFS_ATTR_INCOMPLETE) 769 if (entry->flags & XFS_ATTR_INCOMPLETE)
770 continue; /* don't copy partial entries */ 770 continue; /* don't copy partial entries */
771 if (!(entry->flags & XFS_ATTR_LOCAL)) 771 if (!(entry->flags & XFS_ATTR_LOCAL))
772 return(0); 772 return 0;
773 name_loc = xfs_attr3_leaf_name_local(leaf, i); 773 name_loc = xfs_attr3_leaf_name_local(leaf, i);
774 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX) 774 if (name_loc->namelen >= XFS_ATTR_SF_ENTSIZE_MAX)
775 return(0); 775 return 0;
776 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX) 776 if (be16_to_cpu(name_loc->valuelen) >= XFS_ATTR_SF_ENTSIZE_MAX)
777 return(0); 777 return 0;
778 bytes += sizeof(struct xfs_attr_sf_entry) - 1 778 bytes += sizeof(struct xfs_attr_sf_entry) - 1
779 + name_loc->namelen 779 + name_loc->namelen
780 + be16_to_cpu(name_loc->valuelen); 780 + be16_to_cpu(name_loc->valuelen);
@@ -809,7 +809,7 @@ xfs_attr3_leaf_to_shortform(
809 809
810 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP); 810 tmpbuffer = kmem_alloc(args->geo->blksize, KM_SLEEP);
811 if (!tmpbuffer) 811 if (!tmpbuffer)
812 return ENOMEM; 812 return -ENOMEM;
813 813
814 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 814 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
815 815
@@ -1017,10 +1017,10 @@ xfs_attr3_leaf_split(
1017 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); 1017 ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC);
1018 error = xfs_da_grow_inode(state->args, &blkno); 1018 error = xfs_da_grow_inode(state->args, &blkno);
1019 if (error) 1019 if (error)
1020 return(error); 1020 return error;
1021 error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp); 1021 error = xfs_attr3_leaf_create(state->args, blkno, &newblk->bp);
1022 if (error) 1022 if (error)
1023 return(error); 1023 return error;
1024 newblk->blkno = blkno; 1024 newblk->blkno = blkno;
1025 newblk->magic = XFS_ATTR_LEAF_MAGIC; 1025 newblk->magic = XFS_ATTR_LEAF_MAGIC;
1026 1026
@@ -1031,7 +1031,7 @@ xfs_attr3_leaf_split(
1031 xfs_attr3_leaf_rebalance(state, oldblk, newblk); 1031 xfs_attr3_leaf_rebalance(state, oldblk, newblk);
1032 error = xfs_da3_blk_link(state, oldblk, newblk); 1032 error = xfs_da3_blk_link(state, oldblk, newblk);
1033 if (error) 1033 if (error)
1034 return(error); 1034 return error;
1035 1035
1036 /* 1036 /*
1037 * Save info on "old" attribute for "atomic rename" ops, leaf_add() 1037 * Save info on "old" attribute for "atomic rename" ops, leaf_add()
@@ -1053,7 +1053,7 @@ xfs_attr3_leaf_split(
1053 */ 1053 */
1054 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); 1054 oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL);
1055 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); 1055 newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL);
1056 return(error); 1056 return error;
1057} 1057}
1058 1058
1059/* 1059/*
@@ -1108,7 +1108,7 @@ xfs_attr3_leaf_add(
1108 * no good and we should just give up. 1108 * no good and we should just give up.
1109 */ 1109 */
1110 if (!ichdr.holes && sum < entsize) 1110 if (!ichdr.holes && sum < entsize)
1111 return XFS_ERROR(ENOSPC); 1111 return -ENOSPC;
1112 1112
1113 /* 1113 /*
1114 * Compact the entries to coalesce free space. 1114 * Compact the entries to coalesce free space.
@@ -1121,7 +1121,7 @@ xfs_attr3_leaf_add(
1121 * free region, in freemap[0]. If it is not big enough, give up. 1121 * free region, in freemap[0]. If it is not big enough, give up.
1122 */ 1122 */
1123 if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { 1123 if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) {
1124 tmp = ENOSPC; 1124 tmp = -ENOSPC;
1125 goto out_log_hdr; 1125 goto out_log_hdr;
1126 } 1126 }
1127 1127
@@ -1692,7 +1692,7 @@ xfs_attr3_leaf_toosmall(
1692 ichdr.usedbytes; 1692 ichdr.usedbytes;
1693 if (bytes > (state->args->geo->blksize >> 1)) { 1693 if (bytes > (state->args->geo->blksize >> 1)) {
1694 *action = 0; /* blk over 50%, don't try to join */ 1694 *action = 0; /* blk over 50%, don't try to join */
1695 return(0); 1695 return 0;
1696 } 1696 }
1697 1697
1698 /* 1698 /*
@@ -1711,7 +1711,7 @@ xfs_attr3_leaf_toosmall(
1711 error = xfs_da3_path_shift(state, &state->altpath, forward, 1711 error = xfs_da3_path_shift(state, &state->altpath, forward,
1712 0, &retval); 1712 0, &retval);
1713 if (error) 1713 if (error)
1714 return(error); 1714 return error;
1715 if (retval) { 1715 if (retval) {
1716 *action = 0; 1716 *action = 0;
1717 } else { 1717 } else {
@@ -1740,7 +1740,7 @@ xfs_attr3_leaf_toosmall(
1740 error = xfs_attr3_leaf_read(state->args->trans, state->args->dp, 1740 error = xfs_attr3_leaf_read(state->args->trans, state->args->dp,
1741 blkno, -1, &bp); 1741 blkno, -1, &bp);
1742 if (error) 1742 if (error)
1743 return(error); 1743 return error;
1744 1744
1745 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); 1745 xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr);
1746 1746
@@ -1757,7 +1757,7 @@ xfs_attr3_leaf_toosmall(
1757 } 1757 }
1758 if (i >= 2) { 1758 if (i >= 2) {
1759 *action = 0; 1759 *action = 0;
1760 return(0); 1760 return 0;
1761 } 1761 }
1762 1762
1763 /* 1763 /*
@@ -1773,13 +1773,13 @@ xfs_attr3_leaf_toosmall(
1773 0, &retval); 1773 0, &retval);
1774 } 1774 }
1775 if (error) 1775 if (error)
1776 return(error); 1776 return error;
1777 if (retval) { 1777 if (retval) {
1778 *action = 0; 1778 *action = 0;
1779 } else { 1779 } else {
1780 *action = 1; 1780 *action = 1;
1781 } 1781 }
1782 return(0); 1782 return 0;
1783} 1783}
1784 1784
1785/* 1785/*
@@ -2123,7 +2123,7 @@ xfs_attr3_leaf_lookup_int(
2123 } 2123 }
2124 if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) { 2124 if (probe == ichdr.count || be32_to_cpu(entry->hashval) != hashval) {
2125 args->index = probe; 2125 args->index = probe;
2126 return XFS_ERROR(ENOATTR); 2126 return -ENOATTR;
2127 } 2127 }
2128 2128
2129 /* 2129 /*
@@ -2152,7 +2152,7 @@ xfs_attr3_leaf_lookup_int(
2152 if (!xfs_attr_namesp_match(args->flags, entry->flags)) 2152 if (!xfs_attr_namesp_match(args->flags, entry->flags))
2153 continue; 2153 continue;
2154 args->index = probe; 2154 args->index = probe;
2155 return XFS_ERROR(EEXIST); 2155 return -EEXIST;
2156 } else { 2156 } else {
2157 name_rmt = xfs_attr3_leaf_name_remote(leaf, probe); 2157 name_rmt = xfs_attr3_leaf_name_remote(leaf, probe);
2158 if (name_rmt->namelen != args->namelen) 2158 if (name_rmt->namelen != args->namelen)
@@ -2168,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
2168 args->rmtblkcnt = xfs_attr3_rmt_blocks( 2168 args->rmtblkcnt = xfs_attr3_rmt_blocks(
2169 args->dp->i_mount, 2169 args->dp->i_mount,
2170 args->rmtvaluelen); 2170 args->rmtvaluelen);
2171 return XFS_ERROR(EEXIST); 2171 return -EEXIST;
2172 } 2172 }
2173 } 2173 }
2174 args->index = probe; 2174 args->index = probe;
2175 return XFS_ERROR(ENOATTR); 2175 return -ENOATTR;
2176} 2176}
2177 2177
2178/* 2178/*
@@ -2208,7 +2208,7 @@ xfs_attr3_leaf_getvalue(
2208 } 2208 }
2209 if (args->valuelen < valuelen) { 2209 if (args->valuelen < valuelen) {
2210 args->valuelen = valuelen; 2210 args->valuelen = valuelen;
2211 return XFS_ERROR(ERANGE); 2211 return -ERANGE;
2212 } 2212 }
2213 args->valuelen = valuelen; 2213 args->valuelen = valuelen;
2214 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen); 2214 memcpy(args->value, &name_loc->nameval[args->namelen], valuelen);
@@ -2226,7 +2226,7 @@ xfs_attr3_leaf_getvalue(
2226 } 2226 }
2227 if (args->valuelen < args->rmtvaluelen) { 2227 if (args->valuelen < args->rmtvaluelen) {
2228 args->valuelen = args->rmtvaluelen; 2228 args->valuelen = args->rmtvaluelen;
2229 return XFS_ERROR(ERANGE); 2229 return -ERANGE;
2230 } 2230 }
2231 args->valuelen = args->rmtvaluelen; 2231 args->valuelen = args->rmtvaluelen;
2232 } 2232 }
@@ -2481,7 +2481,7 @@ xfs_attr3_leaf_clearflag(
2481 */ 2481 */
2482 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); 2482 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2483 if (error) 2483 if (error)
2484 return(error); 2484 return error;
2485 2485
2486 leaf = bp->b_addr; 2486 leaf = bp->b_addr;
2487 entry = &xfs_attr3_leaf_entryp(leaf)[args->index]; 2487 entry = &xfs_attr3_leaf_entryp(leaf)[args->index];
@@ -2548,7 +2548,7 @@ xfs_attr3_leaf_setflag(
2548 */ 2548 */
2549 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); 2549 error = xfs_attr3_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
2550 if (error) 2550 if (error)
2551 return(error); 2551 return error;
2552 2552
2553 leaf = bp->b_addr; 2553 leaf = bp->b_addr;
2554#ifdef DEBUG 2554#ifdef DEBUG
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index e2929da7c3ba..e2929da7c3ba 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index b5adfecbb8ee..7510ab8058a4 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -138,11 +138,11 @@ xfs_attr3_rmt_read_verify(
138 138
139 while (len > 0) { 139 while (len > 0) {
140 if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) { 140 if (!xfs_verify_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF)) {
141 xfs_buf_ioerror(bp, EFSBADCRC); 141 xfs_buf_ioerror(bp, -EFSBADCRC);
142 break; 142 break;
143 } 143 }
144 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { 144 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
145 xfs_buf_ioerror(bp, EFSCORRUPTED); 145 xfs_buf_ioerror(bp, -EFSCORRUPTED);
146 break; 146 break;
147 } 147 }
148 len -= blksize; 148 len -= blksize;
@@ -178,7 +178,7 @@ xfs_attr3_rmt_write_verify(
178 178
179 while (len > 0) { 179 while (len > 0) {
180 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { 180 if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) {
181 xfs_buf_ioerror(bp, EFSCORRUPTED); 181 xfs_buf_ioerror(bp, -EFSCORRUPTED);
182 xfs_verifier_error(bp); 182 xfs_verifier_error(bp);
183 return; 183 return;
184 } 184 }
@@ -257,7 +257,7 @@ xfs_attr_rmtval_copyout(
257 xfs_alert(mp, 257 xfs_alert(mp,
258"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", 258"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
259 bno, *offset, byte_cnt, ino); 259 bno, *offset, byte_cnt, ino);
260 return EFSCORRUPTED; 260 return -EFSCORRUPTED;
261 } 261 }
262 hdr_size = sizeof(struct xfs_attr3_rmt_hdr); 262 hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
263 } 263 }
@@ -452,7 +452,7 @@ xfs_attr_rmtval_set(
452 ASSERT(committed); 452 ASSERT(committed);
453 args->trans = NULL; 453 args->trans = NULL;
454 xfs_bmap_cancel(args->flist); 454 xfs_bmap_cancel(args->flist);
455 return(error); 455 return error;
456 } 456 }
457 457
458 /* 458 /*
@@ -473,7 +473,7 @@ xfs_attr_rmtval_set(
473 */ 473 */
474 error = xfs_trans_roll(&args->trans, dp); 474 error = xfs_trans_roll(&args->trans, dp);
475 if (error) 475 if (error)
476 return (error); 476 return error;
477 } 477 }
478 478
479 /* 479 /*
@@ -498,7 +498,7 @@ xfs_attr_rmtval_set(
498 blkcnt, &map, &nmap, 498 blkcnt, &map, &nmap,
499 XFS_BMAPI_ATTRFORK); 499 XFS_BMAPI_ATTRFORK);
500 if (error) 500 if (error)
501 return(error); 501 return error;
502 ASSERT(nmap == 1); 502 ASSERT(nmap == 1);
503 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 503 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
504 (map.br_startblock != HOLESTARTBLOCK)); 504 (map.br_startblock != HOLESTARTBLOCK));
@@ -508,7 +508,7 @@ xfs_attr_rmtval_set(
508 508
509 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); 509 bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0);
510 if (!bp) 510 if (!bp)
511 return ENOMEM; 511 return -ENOMEM;
512 bp->b_ops = &xfs_attr3_rmt_buf_ops; 512 bp->b_ops = &xfs_attr3_rmt_buf_ops;
513 513
514 xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, 514 xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset,
@@ -563,7 +563,7 @@ xfs_attr_rmtval_remove(
563 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, 563 error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
564 blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); 564 blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
565 if (error) 565 if (error)
566 return(error); 566 return error;
567 ASSERT(nmap == 1); 567 ASSERT(nmap == 1);
568 ASSERT((map.br_startblock != DELAYSTARTBLOCK) && 568 ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
569 (map.br_startblock != HOLESTARTBLOCK)); 569 (map.br_startblock != HOLESTARTBLOCK));
@@ -622,7 +622,7 @@ xfs_attr_rmtval_remove(
622 */ 622 */
623 error = xfs_trans_roll(&args->trans, args->dp); 623 error = xfs_trans_roll(&args->trans, args->dp);
624 if (error) 624 if (error)
625 return (error); 625 return error;
626 } 626 }
627 return(0); 627 return 0;
628} 628}
diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 5a9acfa156d7..5a9acfa156d7 100644
--- a/fs/xfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
diff --git a/fs/xfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 919756e3ba53..919756e3ba53 100644
--- a/fs/xfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/libxfs/xfs_bit.h
index e1649c0d3e02..e1649c0d3e02 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/libxfs/xfs_bit.h
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 75c3fe5f3d9d..79c981984dca 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -392,7 +392,7 @@ xfs_bmap_check_leaf_extents(
392 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); 392 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
393 bno = be64_to_cpu(*pp); 393 bno = be64_to_cpu(*pp);
394 394
395 ASSERT(bno != NULLDFSBNO); 395 ASSERT(bno != NULLFSBLOCK);
396 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 396 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
397 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); 397 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
398 398
@@ -1033,7 +1033,7 @@ xfs_bmap_add_attrfork_btree(
1033 goto error0; 1033 goto error0;
1034 if (stat == 0) { 1034 if (stat == 0) {
1035 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); 1035 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1036 return XFS_ERROR(ENOSPC); 1036 return -ENOSPC;
1037 } 1037 }
1038 *firstblock = cur->bc_private.b.firstblock; 1038 *firstblock = cur->bc_private.b.firstblock;
1039 cur->bc_private.b.allocated = 0; 1039 cur->bc_private.b.allocated = 0;
@@ -1115,7 +1115,7 @@ xfs_bmap_add_attrfork_local(
1115 1115
1116 /* should only be called for types that support local format data */ 1116 /* should only be called for types that support local format data */
1117 ASSERT(0); 1117 ASSERT(0);
1118 return EFSCORRUPTED; 1118 return -EFSCORRUPTED;
1119} 1119}
1120 1120
1121/* 1121/*
@@ -1192,7 +1192,7 @@ xfs_bmap_add_attrfork(
1192 break; 1192 break;
1193 default: 1193 default:
1194 ASSERT(0); 1194 ASSERT(0);
1195 error = XFS_ERROR(EINVAL); 1195 error = -EINVAL;
1196 goto trans_cancel; 1196 goto trans_cancel;
1197 } 1197 }
1198 1198
@@ -1299,7 +1299,7 @@ xfs_bmap_read_extents(
1299 ASSERT(level > 0); 1299 ASSERT(level > 0);
1300 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); 1300 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
1301 bno = be64_to_cpu(*pp); 1301 bno = be64_to_cpu(*pp);
1302 ASSERT(bno != NULLDFSBNO); 1302 ASSERT(bno != NULLFSBLOCK);
1303 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 1303 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
1304 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); 1304 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
1305 /* 1305 /*
@@ -1399,7 +1399,7 @@ xfs_bmap_read_extents(
1399 return 0; 1399 return 0;
1400error0: 1400error0:
1401 xfs_trans_brelse(tp, bp); 1401 xfs_trans_brelse(tp, bp);
1402 return XFS_ERROR(EFSCORRUPTED); 1402 return -EFSCORRUPTED;
1403} 1403}
1404 1404
1405 1405
@@ -1429,11 +1429,7 @@ xfs_bmap_search_multi_extents(
1429 gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; 1429 gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL;
1430 gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; 1430 gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL;
1431 gotp->br_state = XFS_EXT_INVALID; 1431 gotp->br_state = XFS_EXT_INVALID;
1432#if XFS_BIG_BLKNOS
1433 gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; 1432 gotp->br_startblock = 0xffffa5a5a5a5a5a5LL;
1434#else
1435 gotp->br_startblock = 0xffffa5a5;
1436#endif
1437 prevp->br_startoff = NULLFILEOFF; 1433 prevp->br_startoff = NULLFILEOFF;
1438 1434
1439 ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); 1435 ep = xfs_iext_bno_to_ext(ifp, bno, &lastx);
@@ -1576,7 +1572,7 @@ xfs_bmap_last_before(
1576 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 1572 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
1577 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && 1573 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
1578 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) 1574 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL)
1579 return XFS_ERROR(EIO); 1575 return -EIO;
1580 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { 1576 if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
1581 *last_block = 0; 1577 *last_block = 0;
1582 return 0; 1578 return 0;
@@ -1690,7 +1686,7 @@ xfs_bmap_last_offset(
1690 1686
1691 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && 1687 if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
1692 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) 1688 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
1693 return XFS_ERROR(EIO); 1689 return -EIO;
1694 1690
1695 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty); 1691 error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
1696 if (error || is_empty) 1692 if (error || is_empty)
@@ -3323,7 +3319,7 @@ xfs_bmap_extsize_align(
3323 if (orig_off < align_off || 3319 if (orig_off < align_off ||
3324 orig_end > align_off + align_alen || 3320 orig_end > align_off + align_alen ||
3325 align_alen - temp < orig_alen) 3321 align_alen - temp < orig_alen)
3326 return XFS_ERROR(EINVAL); 3322 return -EINVAL;
3327 /* 3323 /*
3328 * Try to fix it by moving the start up. 3324 * Try to fix it by moving the start up.
3329 */ 3325 */
@@ -3348,7 +3344,7 @@ xfs_bmap_extsize_align(
3348 * Result doesn't cover the request, fail it. 3344 * Result doesn't cover the request, fail it.
3349 */ 3345 */
3350 if (orig_off < align_off || orig_end > align_off + align_alen) 3346 if (orig_off < align_off || orig_end > align_off + align_alen)
3351 return XFS_ERROR(EINVAL); 3347 return -EINVAL;
3352 } else { 3348 } else {
3353 ASSERT(orig_off >= align_off); 3349 ASSERT(orig_off >= align_off);
3354 ASSERT(orig_end <= align_off + align_alen); 3350 ASSERT(orig_end <= align_off + align_alen);
@@ -4051,11 +4047,11 @@ xfs_bmapi_read(
4051 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), 4047 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4052 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4048 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4053 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp); 4049 XFS_ERROR_REPORT("xfs_bmapi_read", XFS_ERRLEVEL_LOW, mp);
4054 return XFS_ERROR(EFSCORRUPTED); 4050 return -EFSCORRUPTED;
4055 } 4051 }
4056 4052
4057 if (XFS_FORCED_SHUTDOWN(mp)) 4053 if (XFS_FORCED_SHUTDOWN(mp))
4058 return XFS_ERROR(EIO); 4054 return -EIO;
4059 4055
4060 XFS_STATS_INC(xs_blk_mapr); 4056 XFS_STATS_INC(xs_blk_mapr);
4061 4057
@@ -4246,11 +4242,11 @@ xfs_bmapi_delay(
4246 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), 4242 XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
4247 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4243 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4248 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp); 4244 XFS_ERROR_REPORT("xfs_bmapi_delay", XFS_ERRLEVEL_LOW, mp);
4249 return XFS_ERROR(EFSCORRUPTED); 4245 return -EFSCORRUPTED;
4250 } 4246 }
4251 4247
4252 if (XFS_FORCED_SHUTDOWN(mp)) 4248 if (XFS_FORCED_SHUTDOWN(mp))
4253 return XFS_ERROR(EIO); 4249 return -EIO;
4254 4250
4255 XFS_STATS_INC(xs_blk_mapw); 4251 XFS_STATS_INC(xs_blk_mapw);
4256 4252
@@ -4469,7 +4465,7 @@ xfs_bmapi_convert_unwritten(
4469 * so generate another request. 4465 * so generate another request.
4470 */ 4466 */
4471 if (mval->br_blockcount < len) 4467 if (mval->br_blockcount < len)
4472 return EAGAIN; 4468 return -EAGAIN;
4473 return 0; 4469 return 0;
4474} 4470}
4475 4471
@@ -4540,11 +4536,11 @@ xfs_bmapi_write(
4540 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), 4536 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
4541 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 4537 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
4542 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp); 4538 XFS_ERROR_REPORT("xfs_bmapi_write", XFS_ERRLEVEL_LOW, mp);
4543 return XFS_ERROR(EFSCORRUPTED); 4539 return -EFSCORRUPTED;
4544 } 4540 }
4545 4541
4546 if (XFS_FORCED_SHUTDOWN(mp)) 4542 if (XFS_FORCED_SHUTDOWN(mp))
4547 return XFS_ERROR(EIO); 4543 return -EIO;
4548 4544
4549 ifp = XFS_IFORK_PTR(ip, whichfork); 4545 ifp = XFS_IFORK_PTR(ip, whichfork);
4550 4546
@@ -4620,7 +4616,7 @@ xfs_bmapi_write(
4620 4616
4621 /* Execute unwritten extent conversion if necessary */ 4617 /* Execute unwritten extent conversion if necessary */
4622 error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags); 4618 error = xfs_bmapi_convert_unwritten(&bma, mval, len, flags);
4623 if (error == EAGAIN) 4619 if (error == -EAGAIN)
4624 continue; 4620 continue;
4625 if (error) 4621 if (error)
4626 goto error0; 4622 goto error0;
@@ -4922,7 +4918,7 @@ xfs_bmap_del_extent(
4922 goto done; 4918 goto done;
4923 cur->bc_rec.b = new; 4919 cur->bc_rec.b = new;
4924 error = xfs_btree_insert(cur, &i); 4920 error = xfs_btree_insert(cur, &i);
4925 if (error && error != ENOSPC) 4921 if (error && error != -ENOSPC)
4926 goto done; 4922 goto done;
4927 /* 4923 /*
 4928 * If we get no space back from the btree insert, 4924 * If we get no space back from the btree insert,
@@ -4930,7 +4926,7 @@ xfs_bmap_del_extent(
4930 * block reservation. 4926 * block reservation.
4931 * Fix up our state and return the error. 4927 * Fix up our state and return the error.
4932 */ 4928 */
4933 if (error == ENOSPC) { 4929 if (error == -ENOSPC) {
4934 /* 4930 /*
4935 * Reset the cursor, don't trust 4931 * Reset the cursor, don't trust
4936 * it after any insert operation. 4932 * it after any insert operation.
@@ -4958,7 +4954,7 @@ xfs_bmap_del_extent(
4958 xfs_bmbt_set_blockcount(ep, 4954 xfs_bmbt_set_blockcount(ep,
4959 got.br_blockcount); 4955 got.br_blockcount);
4960 flags = 0; 4956 flags = 0;
4961 error = XFS_ERROR(ENOSPC); 4957 error = -ENOSPC;
4962 goto done; 4958 goto done;
4963 } 4959 }
4964 XFS_WANT_CORRUPTED_GOTO(i == 1, done); 4960 XFS_WANT_CORRUPTED_GOTO(i == 1, done);
@@ -5076,11 +5072,11 @@ xfs_bunmapi(
5076 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) { 5072 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
5077 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW, 5073 XFS_ERROR_REPORT("xfs_bunmapi", XFS_ERRLEVEL_LOW,
5078 ip->i_mount); 5074 ip->i_mount);
5079 return XFS_ERROR(EFSCORRUPTED); 5075 return -EFSCORRUPTED;
5080 } 5076 }
5081 mp = ip->i_mount; 5077 mp = ip->i_mount;
5082 if (XFS_FORCED_SHUTDOWN(mp)) 5078 if (XFS_FORCED_SHUTDOWN(mp))
5083 return XFS_ERROR(EIO); 5079 return -EIO;
5084 5080
5085 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 5081 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5086 ASSERT(len > 0); 5082 ASSERT(len > 0);
@@ -5325,7 +5321,7 @@ xfs_bunmapi(
5325 del.br_startoff > got.br_startoff && 5321 del.br_startoff > got.br_startoff &&
5326 del.br_startoff + del.br_blockcount < 5322 del.br_startoff + del.br_blockcount <
5327 got.br_startoff + got.br_blockcount) { 5323 got.br_startoff + got.br_blockcount) {
5328 error = XFS_ERROR(ENOSPC); 5324 error = -ENOSPC;
5329 goto error0; 5325 goto error0;
5330 } 5326 }
5331 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del, 5327 error = xfs_bmap_del_extent(ip, tp, &lastx, flist, cur, &del,
@@ -5408,39 +5404,237 @@ error0:
5408} 5404}
5409 5405
5410/* 5406/*
5407 * Determine whether an extent shift can be accomplished by a merge with the
5408 * extent that precedes the target hole of the shift.
5409 */
5410STATIC bool
5411xfs_bmse_can_merge(
5412 struct xfs_bmbt_irec *left, /* preceding extent */
5413 struct xfs_bmbt_irec *got, /* current extent to shift */
5414 xfs_fileoff_t shift) /* shift fsb */
5415{
5416 xfs_fileoff_t startoff;
5417
5418 startoff = got->br_startoff - shift;
5419
5420 /*
5421 * The extent, once shifted, must be adjacent in-file and on-disk with
5422 * the preceding extent.
5423 */
5424 if ((left->br_startoff + left->br_blockcount != startoff) ||
5425 (left->br_startblock + left->br_blockcount != got->br_startblock) ||
5426 (left->br_state != got->br_state) ||
5427 (left->br_blockcount + got->br_blockcount > MAXEXTLEN))
5428 return false;
5429
5430 return true;
5431}
5432
5433/*
5434 * A bmap extent shift adjusts the file offset of an extent to fill a preceding
5435 * hole in the file. If an extent shift would result in the extent being fully
5436 * adjacent to the extent that currently precedes the hole, we can merge with
5437 * the preceding extent rather than do the shift.
5438 *
5439 * This function assumes the caller has verified a shift-by-merge is possible
5440 * with the provided extents via xfs_bmse_can_merge().
5441 */
5442STATIC int
5443xfs_bmse_merge(
5444 struct xfs_inode *ip,
5445 int whichfork,
5446 xfs_fileoff_t shift, /* shift fsb */
5447 int current_ext, /* idx of gotp */
5448 struct xfs_bmbt_rec_host *gotp, /* extent to shift */
5449 struct xfs_bmbt_rec_host *leftp, /* preceding extent */
5450 struct xfs_btree_cur *cur,
5451 int *logflags) /* output */
5452{
5453 struct xfs_ifork *ifp;
5454 struct xfs_bmbt_irec got;
5455 struct xfs_bmbt_irec left;
5456 xfs_filblks_t blockcount;
5457 int error, i;
5458
5459 ifp = XFS_IFORK_PTR(ip, whichfork);
5460 xfs_bmbt_get_all(gotp, &got);
5461 xfs_bmbt_get_all(leftp, &left);
5462 blockcount = left.br_blockcount + got.br_blockcount;
5463
5464 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
5465 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5466 ASSERT(xfs_bmse_can_merge(&left, &got, shift));
5467
5468 /*
5469 * Merge the in-core extents. Note that the host record pointers and
5470 * current_ext index are invalid once the extent has been removed via
5471 * xfs_iext_remove().
5472 */
5473 xfs_bmbt_set_blockcount(leftp, blockcount);
5474 xfs_iext_remove(ip, current_ext, 1, 0);
5475
5476 /*
5477 * Update the on-disk extent count, the btree if necessary and log the
5478 * inode.
5479 */
5480 XFS_IFORK_NEXT_SET(ip, whichfork,
5481 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
5482 *logflags |= XFS_ILOG_CORE;
5483 if (!cur) {
5484 *logflags |= XFS_ILOG_DEXT;
5485 return 0;
5486 }
5487
5488 /* lookup and remove the extent to merge */
5489 error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
5490 got.br_blockcount, &i);
5491 if (error)
5492 goto out_error;
5493 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
5494
5495 error = xfs_btree_delete(cur, &i);
5496 if (error)
5497 goto out_error;
5498 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
5499
5500 /* lookup and update size of the previous extent */
5501 error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock,
5502 left.br_blockcount, &i);
5503 if (error)
5504 goto out_error;
5505 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
5506
5507 left.br_blockcount = blockcount;
5508
5509 error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock,
5510 left.br_blockcount, left.br_state);
5511 if (error)
5512 goto out_error;
5513
5514 return 0;
5515
5516out_error:
5517 return error;
5518}
5519
5520/*
5521 * Shift a single extent.
5522 */
5523STATIC int
5524xfs_bmse_shift_one(
5525 struct xfs_inode *ip,
5526 int whichfork,
5527 xfs_fileoff_t offset_shift_fsb,
5528 int *current_ext,
5529 struct xfs_bmbt_rec_host *gotp,
5530 struct xfs_btree_cur *cur,
5531 int *logflags)
5532{
5533 struct xfs_ifork *ifp;
5534 xfs_fileoff_t startoff;
5535 struct xfs_bmbt_rec_host *leftp;
5536 struct xfs_bmbt_irec got;
5537 struct xfs_bmbt_irec left;
5538 int error;
5539 int i;
5540
5541 ifp = XFS_IFORK_PTR(ip, whichfork);
5542
5543 xfs_bmbt_get_all(gotp, &got);
5544 startoff = got.br_startoff - offset_shift_fsb;
5545
5546 /* delalloc extents should be prevented by caller */
5547 XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock),
5548 out_error);
5549
5550 /*
5551 * If this is the first extent in the file, make sure there's enough
5552 * room at the start of the file and jump right to the shift as there's
5553 * no left extent to merge.
5554 */
5555 if (*current_ext == 0) {
5556 if (got.br_startoff < offset_shift_fsb)
5557 return -EINVAL;
5558 goto shift_extent;
5559 }
5560
5561 /* grab the left extent and check for a large enough hole */
5562 leftp = xfs_iext_get_ext(ifp, *current_ext - 1);
5563 xfs_bmbt_get_all(leftp, &left);
5564
5565 if (startoff < left.br_startoff + left.br_blockcount)
5566 return -EINVAL;
5567
5568 /* check whether to merge the extent or shift it down */
5569 if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb))
5570 goto shift_extent;
5571
5572 return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext,
5573 gotp, leftp, cur, logflags);
5574
5575shift_extent:
5576 /*
5577 * Increment the extent index for the next iteration, update the start
5578 * offset of the in-core extent and update the btree if applicable.
5579 */
5580 (*current_ext)++;
5581 xfs_bmbt_set_startoff(gotp, startoff);
5582 *logflags |= XFS_ILOG_CORE;
5583 if (!cur) {
5584 *logflags |= XFS_ILOG_DEXT;
5585 return 0;
5586 }
5587
5588 error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock,
5589 got.br_blockcount, &i);
5590 if (error)
5591 return error;
5592 XFS_WANT_CORRUPTED_GOTO(i == 1, out_error);
5593
5594 got.br_startoff = startoff;
5595 error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock,
5596 got.br_blockcount, got.br_state);
5597 if (error)
5598 return error;
5599
5600 return 0;
5601
5602out_error:
5603 return error;
5604}
5605
5606/*
5411 * Shift extent records to the left to cover a hole. 5607 * Shift extent records to the left to cover a hole.
5412 * 5608 *
5413 * The maximum number of extents to be shifted in a single operation 5609 * The maximum number of extents to be shifted in a single operation is
5414 * is @num_exts, and @current_ext keeps track of the current extent 5610 * @num_exts. @start_fsb specifies the file offset to start the shift and the
5415 * index we have shifted. @offset_shift_fsb is the length by which each 5611 * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb
5416 * extent is shifted. If there is no hole to shift the extents 5612 * is the length by which each extent is shifted. If there is no hole to shift
 5417 into, this will be considered an invalid operation and we abort immediately. 5613 the extents into, this will be considered an invalid operation and we abort
5614 * immediately.
5418 */ 5615 */
5419int 5616int
5420xfs_bmap_shift_extents( 5617xfs_bmap_shift_extents(
5421 struct xfs_trans *tp, 5618 struct xfs_trans *tp,
5422 struct xfs_inode *ip, 5619 struct xfs_inode *ip,
5423 int *done,
5424 xfs_fileoff_t start_fsb, 5620 xfs_fileoff_t start_fsb,
5425 xfs_fileoff_t offset_shift_fsb, 5621 xfs_fileoff_t offset_shift_fsb,
5426 xfs_extnum_t *current_ext, 5622 int *done,
5623 xfs_fileoff_t *next_fsb,
5427 xfs_fsblock_t *firstblock, 5624 xfs_fsblock_t *firstblock,
5428 struct xfs_bmap_free *flist, 5625 struct xfs_bmap_free *flist,
5429 int num_exts) 5626 int num_exts)
5430{ 5627{
5431 struct xfs_btree_cur *cur; 5628 struct xfs_btree_cur *cur = NULL;
5432 struct xfs_bmbt_rec_host *gotp; 5629 struct xfs_bmbt_rec_host *gotp;
5433 struct xfs_bmbt_irec got; 5630 struct xfs_bmbt_irec got;
5434 struct xfs_bmbt_irec left;
5435 struct xfs_mount *mp = ip->i_mount; 5631 struct xfs_mount *mp = ip->i_mount;
5436 struct xfs_ifork *ifp; 5632 struct xfs_ifork *ifp;
5437 xfs_extnum_t nexts = 0; 5633 xfs_extnum_t nexts = 0;
5438 xfs_fileoff_t startoff; 5634 xfs_extnum_t current_ext;
5439 int error = 0; 5635 int error = 0;
5440 int i;
5441 int whichfork = XFS_DATA_FORK; 5636 int whichfork = XFS_DATA_FORK;
5442 int logflags; 5637 int logflags = 0;
5443 xfs_filblks_t blockcount = 0;
5444 int total_extents; 5638 int total_extents;
5445 5639
5446 if (unlikely(XFS_TEST_ERROR( 5640 if (unlikely(XFS_TEST_ERROR(
@@ -5449,13 +5643,14 @@ xfs_bmap_shift_extents(
5449 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { 5643 mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
5450 XFS_ERROR_REPORT("xfs_bmap_shift_extents", 5644 XFS_ERROR_REPORT("xfs_bmap_shift_extents",
5451 XFS_ERRLEVEL_LOW, mp); 5645 XFS_ERRLEVEL_LOW, mp);
5452 return XFS_ERROR(EFSCORRUPTED); 5646 return -EFSCORRUPTED;
5453 } 5647 }
5454 5648
5455 if (XFS_FORCED_SHUTDOWN(mp)) 5649 if (XFS_FORCED_SHUTDOWN(mp))
5456 return XFS_ERROR(EIO); 5650 return -EIO;
5457 5651
5458 ASSERT(current_ext != NULL); 5652 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
5653 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
5459 5654
5460 ifp = XFS_IFORK_PTR(ip, whichfork); 5655 ifp = XFS_IFORK_PTR(ip, whichfork);
5461 if (!(ifp->if_flags & XFS_IFEXTENTS)) { 5656 if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -5465,142 +5660,62 @@ xfs_bmap_shift_extents(
5465 return error; 5660 return error;
5466 } 5661 }
5467 5662
5468 /*
5469 * If *current_ext is 0, we would need to lookup the extent
5470 * from where we would start shifting and store it in gotp.
5471 */
5472 if (!*current_ext) {
5473 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext);
5474 /*
5475 * gotp can be null in 2 cases: 1) if there are no extents
5476 * or 2) start_fsb lies in a hole beyond which there are
5477 * no extents. Either way, we are done.
5478 */
5479 if (!gotp) {
5480 *done = 1;
5481 return 0;
5482 }
5483 }
5484
5485 /* We are going to change core inode */
5486 logflags = XFS_ILOG_CORE;
5487 if (ifp->if_flags & XFS_IFBROOT) { 5663 if (ifp->if_flags & XFS_IFBROOT) {
5488 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); 5664 cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
5489 cur->bc_private.b.firstblock = *firstblock; 5665 cur->bc_private.b.firstblock = *firstblock;
5490 cur->bc_private.b.flist = flist; 5666 cur->bc_private.b.flist = flist;
5491 cur->bc_private.b.flags = 0; 5667 cur->bc_private.b.flags = 0;
5492 } else { 5668 }
5493 cur = NULL; 5669
5494 logflags |= XFS_ILOG_DEXT; 5670 /*
5671 * Look up the extent index for the fsb where we start shifting. We can
5672 * henceforth iterate with current_ext as extent list changes are locked
5673 * out via ilock.
5674 *
5675 * gotp can be null in 2 cases: 1) if there are no extents or 2)
5676 * start_fsb lies in a hole beyond which there are no extents. Either
5677 * way, we are done.
5678 */
5679 gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext);
5680 if (!gotp) {
5681 *done = 1;
5682 goto del_cursor;
5495 } 5683 }
5496 5684
5497 /* 5685 /*
5498 * There may be delalloc extents in the data fork before the range we 5686 * There may be delalloc extents in the data fork before the range we
5499 * are collapsing out, so we cannot 5687 * are collapsing out, so we cannot use the count of real extents here.
5500 * use the count of real extents here. Instead we have to calculate it 5688 * Instead we have to calculate it from the incore fork.
5501 * from the incore fork.
5502 */ 5689 */
5503 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5690 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5504 while (nexts++ < num_exts && *current_ext < total_extents) { 5691 while (nexts++ < num_exts && current_ext < total_extents) {
5505 5692 error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb,
5506 gotp = xfs_iext_get_ext(ifp, *current_ext); 5693 &current_ext, gotp, cur, &logflags);
5507 xfs_bmbt_get_all(gotp, &got);
5508 startoff = got.br_startoff - offset_shift_fsb;
5509
5510 /*
 5511 * Before shifting an extent into the hole, make sure that the hole
 5512 * is large enough to accommodate the shift.
5513 */
5514 if (*current_ext) {
5515 xfs_bmbt_get_all(xfs_iext_get_ext(ifp,
5516 *current_ext - 1), &left);
5517
5518 if (startoff < left.br_startoff + left.br_blockcount)
5519 error = XFS_ERROR(EINVAL);
5520 } else if (offset_shift_fsb > got.br_startoff) {
5521 /*
 5522 * When the first extent is shifted, offset_shift_fsb
 5523 * should be less than the starting offset of
5524 * the first extent.
5525 */
5526 error = XFS_ERROR(EINVAL);
5527 }
5528
5529 if (error) 5694 if (error)
5530 goto del_cursor; 5695 goto del_cursor;
5531 5696
5532 if (cur) { 5697 /* update total extent count and grab the next record */
5533 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5534 got.br_startblock,
5535 got.br_blockcount,
5536 &i);
5537 if (error)
5538 goto del_cursor;
5539 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5540 }
5541
5542 /* Check if we can merge 2 adjacent extents */
5543 if (*current_ext &&
5544 left.br_startoff + left.br_blockcount == startoff &&
5545 left.br_startblock + left.br_blockcount ==
5546 got.br_startblock &&
5547 left.br_state == got.br_state &&
5548 left.br_blockcount + got.br_blockcount <= MAXEXTLEN) {
5549 blockcount = left.br_blockcount +
5550 got.br_blockcount;
5551 xfs_iext_remove(ip, *current_ext, 1, 0);
5552 if (cur) {
5553 error = xfs_btree_delete(cur, &i);
5554 if (error)
5555 goto del_cursor;
5556 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5557 }
5558 XFS_IFORK_NEXT_SET(ip, whichfork,
5559 XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
5560 gotp = xfs_iext_get_ext(ifp, --*current_ext);
5561 xfs_bmbt_get_all(gotp, &got);
5562
5563 /* Make cursor point to the extent we will update */
5564 if (cur) {
5565 error = xfs_bmbt_lookup_eq(cur, got.br_startoff,
5566 got.br_startblock,
5567 got.br_blockcount,
5568 &i);
5569 if (error)
5570 goto del_cursor;
5571 XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor);
5572 }
5573
5574 xfs_bmbt_set_blockcount(gotp, blockcount);
5575 got.br_blockcount = blockcount;
5576 } else {
5577 /* We have to update the startoff */
5578 xfs_bmbt_set_startoff(gotp, startoff);
5579 got.br_startoff = startoff;
5580 }
5581
5582 if (cur) {
5583 error = xfs_bmbt_update(cur, got.br_startoff,
5584 got.br_startblock,
5585 got.br_blockcount,
5586 got.br_state);
5587 if (error)
5588 goto del_cursor;
5589 }
5590
5591 (*current_ext)++;
5592 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); 5698 total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t);
5699 if (current_ext >= total_extents)
5700 break;
5701 gotp = xfs_iext_get_ext(ifp, current_ext);
5593 } 5702 }
5594 5703
5595 /* Check if we are done */ 5704 /* Check if we are done */
5596 if (*current_ext == total_extents) 5705 if (current_ext == total_extents) {
5597 *done = 1; 5706 *done = 1;
5707 } else if (next_fsb) {
5708 xfs_bmbt_get_all(gotp, &got);
5709 *next_fsb = got.br_startoff;
5710 }
5598 5711
5599del_cursor: 5712del_cursor:
5600 if (cur) 5713 if (cur)
5601 xfs_btree_del_cursor(cur, 5714 xfs_btree_del_cursor(cur,
5602 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); 5715 error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
5603 5716
5604 xfs_trans_log_inode(tp, ip, logflags); 5717 if (logflags)
5718 xfs_trans_log_inode(tp, ip, logflags);
5719
5605 return error; 5720 return error;
5606} 5721}
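
[editor's note] The refactor above splits the old monolithic shift loop into xfs_bmse_can_merge()/xfs_bmse_merge()/xfs_bmse_shift_one(). The merge test reduces to a contiguity-and-overflow check. A standalone sketch of that predicate, with struct irec as a simplified stand-in for struct xfs_bmbt_irec and the br_state comparison omitted:

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define MAXEXTLEN	((1ULL << 21) - 1)	/* 21-bit extent length */

	struct irec {
		uint64_t startoff;	/* file offset, in fs blocks */
		uint64_t startblock;	/* on-disk block */
		uint64_t blockcount;	/* extent length */
	};

	/*
	 * Mirror of the xfs_bmse_can_merge() test: after shifting 'got'
	 * left by 'shift' blocks it must butt up against 'left' both in
	 * the file and on disk, and the combined length must still fit
	 * in one extent record.
	 */
	static bool can_merge(const struct irec *left, const struct irec *got,
			      uint64_t shift)
	{
		uint64_t startoff = got->startoff - shift;

		if (left->startoff + left->blockcount != startoff)
			return false;	/* a hole would remain in the file */
		if (left->startblock + left->blockcount != got->startblock)
			return false;	/* not physically contiguous */
		if (left->blockcount + got->blockcount > MAXEXTLEN)
			return false;	/* merged record would overflow */
		return true;
	}

	int main(void)
	{
		struct irec left = { .startoff = 0,  .startblock = 100, .blockcount = 8 };
		struct irec got  = { .startoff = 16, .startblock = 108, .blockcount = 4 };

		printf("shift 8: %s\n", can_merge(&left, &got, 8) ? "merge" : "shift");
		printf("shift 4: %s\n", can_merge(&left, &got, 4) ? "merge" : "shift");
		return 0;
	}

With the demo values, a shift of 8 closes the hole exactly and reports a merge; a shift of 4 leaves a gap, so the extent is shifted instead.
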
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b879ca56a64c..44db6db86402 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -178,9 +178,8 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
178 xfs_extnum_t num); 178 xfs_extnum_t num);
179uint xfs_default_attroffset(struct xfs_inode *ip); 179uint xfs_default_attroffset(struct xfs_inode *ip);
180int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, 180int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
181 int *done, xfs_fileoff_t start_fsb, 181 xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb,
182 xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, 182 int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock,
183 xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, 183 struct xfs_bmap_free *flist, int num_exts);
184 int num_exts);
185 184
186#endif /* __XFS_BMAP_H__ */ 185#endif /* __XFS_BMAP_H__ */
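
[editor's note] The new prototype drops the caller-visible extent index in favour of *done/*next_fsb, so a caller can shift in bounded batches and resume by file offset. A toy model of that contract — no transactions, merging, or locking; shift_extents below only mimics the resume semantics, not the kernel code:

	#include <stdint.h>
	#include <stdio.h>

	#define NUM_EXTS	2	/* batch size per pass, cf. num_exts */

	static uint64_t offsets[] = { 16, 24, 40, 48, 56 };	/* fake startoffs */
	#define NEXTENTS (sizeof(offsets) / sizeof(offsets[0]))

	/*
	 * Toy stand-in: shift up to num_exts extents at or beyond
	 * start_fsb left by 'shift' blocks, report where to resume.
	 */
	static int shift_extents(uint64_t start_fsb, uint64_t shift,
				 int *done, uint64_t *next_fsb, int num_exts)
	{
		int shifted = 0;
		size_t i;

		for (i = 0; i < NEXTENTS; i++) {
			if (offsets[i] < start_fsb)
				continue;	/* handled by an earlier batch */
			if (shifted == num_exts) {
				*next_fsb = offsets[i];	/* resume here */
				*done = 0;
				return 0;
			}
			offsets[i] -= shift;	/* the actual shift */
			shifted++;
		}
		*done = 1;	/* ran off the end: everything shifted */
		return 0;
	}

	int main(void)
	{
		uint64_t next_fsb = 0;
		int done = 0;

		/* The caller's pattern: loop until *done, resuming at next_fsb. */
		while (!done)
			shift_extents(next_fsb, 8, &done, &next_fsb, NUM_EXTS);

		for (size_t i = 0; i < NEXTENTS; i++)
			printf("extent %zu now at %llu\n", i,
			       (unsigned long long)offsets[i]);
		return 0;
	}

In the kernel the batching exists so the collapse-range caller can roll its transaction between passes rather than holding one open across the whole shift.
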
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 948836c4fd90..fba753308f31 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -111,23 +111,8 @@ __xfs_bmbt_get_all(
111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN)); 111 ext_flag = (int)(l0 >> (64 - BMBT_EXNTFLAG_BITLEN));
112 s->br_startoff = ((xfs_fileoff_t)l0 & 112 s->br_startoff = ((xfs_fileoff_t)l0 &
113 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; 113 xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
114#if XFS_BIG_BLKNOS
115 s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) | 114 s->br_startblock = (((xfs_fsblock_t)l0 & xfs_mask64lo(9)) << 43) |
116 (((xfs_fsblock_t)l1) >> 21); 115 (((xfs_fsblock_t)l1) >> 21);
117#else
118#ifdef DEBUG
119 {
120 xfs_dfsbno_t b;
121
122 b = (((xfs_dfsbno_t)l0 & xfs_mask64lo(9)) << 43) |
123 (((xfs_dfsbno_t)l1) >> 21);
124 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
125 s->br_startblock = (xfs_fsblock_t)b;
126 }
127#else /* !DEBUG */
128 s->br_startblock = (xfs_fsblock_t)(((xfs_dfsbno_t)l1) >> 21);
129#endif /* DEBUG */
130#endif /* XFS_BIG_BLKNOS */
131 s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21)); 116 s->br_blockcount = (xfs_filblks_t)(l1 & xfs_mask64lo(21));
132 /* This is xfs_extent_state() in-line */ 117 /* This is xfs_extent_state() in-line */
133 if (ext_flag) { 118 if (ext_flag) {
@@ -163,21 +148,8 @@ xfs_fsblock_t
163xfs_bmbt_get_startblock( 148xfs_bmbt_get_startblock(
164 xfs_bmbt_rec_host_t *r) 149 xfs_bmbt_rec_host_t *r)
165{ 150{
166#if XFS_BIG_BLKNOS
167 return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) | 151 return (((xfs_fsblock_t)r->l0 & xfs_mask64lo(9)) << 43) |
168 (((xfs_fsblock_t)r->l1) >> 21); 152 (((xfs_fsblock_t)r->l1) >> 21);
169#else
170#ifdef DEBUG
171 xfs_dfsbno_t b;
172
173 b = (((xfs_dfsbno_t)r->l0 & xfs_mask64lo(9)) << 43) |
174 (((xfs_dfsbno_t)r->l1) >> 21);
175 ASSERT((b >> 32) == 0 || isnulldstartblock(b));
176 return (xfs_fsblock_t)b;
177#else /* !DEBUG */
178 return (xfs_fsblock_t)(((xfs_dfsbno_t)r->l1) >> 21);
179#endif /* DEBUG */
180#endif /* XFS_BIG_BLKNOS */
181} 153}
182 154
183/* 155/*
@@ -241,7 +213,6 @@ xfs_bmbt_set_allf(
241 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); 213 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
242 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); 214 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
243 215
244#if XFS_BIG_BLKNOS
245 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); 216 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
246 217
247 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) | 218 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
@@ -250,23 +221,6 @@ xfs_bmbt_set_allf(
250 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) | 221 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
251 ((xfs_bmbt_rec_base_t)blockcount & 222 ((xfs_bmbt_rec_base_t)blockcount &
252 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)); 223 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
253#else /* !XFS_BIG_BLKNOS */
254 if (isnullstartblock(startblock)) {
255 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
256 ((xfs_bmbt_rec_base_t)startoff << 9) |
257 (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
258 r->l1 = xfs_mask64hi(11) |
259 ((xfs_bmbt_rec_base_t)startblock << 21) |
260 ((xfs_bmbt_rec_base_t)blockcount &
261 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
262 } else {
263 r->l0 = ((xfs_bmbt_rec_base_t)extent_flag << 63) |
264 ((xfs_bmbt_rec_base_t)startoff << 9);
265 r->l1 = ((xfs_bmbt_rec_base_t)startblock << 21) |
266 ((xfs_bmbt_rec_base_t)blockcount &
267 (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
268 }
269#endif /* XFS_BIG_BLKNOS */
270} 224}
271 225
272/* 226/*
@@ -298,8 +252,6 @@ xfs_bmbt_disk_set_allf(
298 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN); 252 ASSERT(state == XFS_EXT_NORM || state == XFS_EXT_UNWRITTEN);
299 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0); 253 ASSERT((startoff & xfs_mask64hi(64-BMBT_STARTOFF_BITLEN)) == 0);
300 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0); 254 ASSERT((blockcount & xfs_mask64hi(64-BMBT_BLOCKCOUNT_BITLEN)) == 0);
301
302#if XFS_BIG_BLKNOS
303 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0); 255 ASSERT((startblock & xfs_mask64hi(64-BMBT_STARTBLOCK_BITLEN)) == 0);
304 256
305 r->l0 = cpu_to_be64( 257 r->l0 = cpu_to_be64(
@@ -310,26 +262,6 @@ xfs_bmbt_disk_set_allf(
310 ((xfs_bmbt_rec_base_t)startblock << 21) | 262 ((xfs_bmbt_rec_base_t)startblock << 21) |
311 ((xfs_bmbt_rec_base_t)blockcount & 263 ((xfs_bmbt_rec_base_t)blockcount &
312 (xfs_bmbt_rec_base_t)xfs_mask64lo(21))); 264 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
313#else /* !XFS_BIG_BLKNOS */
314 if (isnullstartblock(startblock)) {
315 r->l0 = cpu_to_be64(
316 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
317 ((xfs_bmbt_rec_base_t)startoff << 9) |
318 (xfs_bmbt_rec_base_t)xfs_mask64lo(9));
319 r->l1 = cpu_to_be64(xfs_mask64hi(11) |
320 ((xfs_bmbt_rec_base_t)startblock << 21) |
321 ((xfs_bmbt_rec_base_t)blockcount &
322 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
323 } else {
324 r->l0 = cpu_to_be64(
325 ((xfs_bmbt_rec_base_t)extent_flag << 63) |
326 ((xfs_bmbt_rec_base_t)startoff << 9));
327 r->l1 = cpu_to_be64(
328 ((xfs_bmbt_rec_base_t)startblock << 21) |
329 ((xfs_bmbt_rec_base_t)blockcount &
330 (xfs_bmbt_rec_base_t)xfs_mask64lo(21)));
331 }
332#endif /* XFS_BIG_BLKNOS */
333} 265}
334 266
335/* 267/*
@@ -365,24 +297,11 @@ xfs_bmbt_set_startblock(
365 xfs_bmbt_rec_host_t *r, 297 xfs_bmbt_rec_host_t *r,
366 xfs_fsblock_t v) 298 xfs_fsblock_t v)
367{ 299{
368#if XFS_BIG_BLKNOS
369 ASSERT((v & xfs_mask64hi(12)) == 0); 300 ASSERT((v & xfs_mask64hi(12)) == 0);
370 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) | 301 r->l0 = (r->l0 & (xfs_bmbt_rec_base_t)xfs_mask64hi(55)) |
371 (xfs_bmbt_rec_base_t)(v >> 43); 302 (xfs_bmbt_rec_base_t)(v >> 43);
372 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) | 303 r->l1 = (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21)) |
373 (xfs_bmbt_rec_base_t)(v << 21); 304 (xfs_bmbt_rec_base_t)(v << 21);
374#else /* !XFS_BIG_BLKNOS */
375 if (isnullstartblock(v)) {
376 r->l0 |= (xfs_bmbt_rec_base_t)xfs_mask64lo(9);
377 r->l1 = (xfs_bmbt_rec_base_t)xfs_mask64hi(11) |
378 ((xfs_bmbt_rec_base_t)v << 21) |
379 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
380 } else {
381 r->l0 &= ~(xfs_bmbt_rec_base_t)xfs_mask64lo(9);
382 r->l1 = ((xfs_bmbt_rec_base_t)v << 21) |
383 (r->l1 & (xfs_bmbt_rec_base_t)xfs_mask64lo(21));
384 }
385#endif /* XFS_BIG_BLKNOS */
386} 305}
387 306
388/* 307/*
@@ -438,8 +357,8 @@ xfs_bmbt_to_bmdr(
438 cpu_to_be64(XFS_BUF_DADDR_NULL)); 357 cpu_to_be64(XFS_BUF_DADDR_NULL));
439 } else 358 } else
440 ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC)); 359 ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC));
441 ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)); 360 ASSERT(rblock->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK));
442 ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)); 361 ASSERT(rblock->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK));
443 ASSERT(rblock->bb_level != 0); 362 ASSERT(rblock->bb_level != 0);
444 dblock->bb_level = rblock->bb_level; 363 dblock->bb_level = rblock->bb_level;
445 dblock->bb_numrecs = rblock->bb_numrecs; 364 dblock->bb_numrecs = rblock->bb_numrecs;
@@ -554,7 +473,7 @@ xfs_bmbt_alloc_block(
554 args.minlen = args.maxlen = args.prod = 1; 473 args.minlen = args.maxlen = args.prod = 1;
555 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL; 474 args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
556 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) { 475 if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
557 error = XFS_ERROR(ENOSPC); 476 error = -ENOSPC;
558 goto error0; 477 goto error0;
559 } 478 }
560 error = xfs_alloc_vextent(&args); 479 error = xfs_alloc_vextent(&args);
@@ -763,11 +682,11 @@ xfs_bmbt_verify(
763 682
764 /* sibling pointer verification */ 683 /* sibling pointer verification */
765 if (!block->bb_u.l.bb_leftsib || 684 if (!block->bb_u.l.bb_leftsib ||
766 (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLDFSBNO) && 685 (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK) &&
767 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib)))) 686 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_leftsib))))
768 return false; 687 return false;
769 if (!block->bb_u.l.bb_rightsib || 688 if (!block->bb_u.l.bb_rightsib ||
770 (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLDFSBNO) && 689 (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK) &&
771 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib)))) 690 !XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_u.l.bb_rightsib))))
772 return false; 691 return false;
773 692
@@ -779,9 +698,9 @@ xfs_bmbt_read_verify(
779 struct xfs_buf *bp) 698 struct xfs_buf *bp)
780{ 699{
781 if (!xfs_btree_lblock_verify_crc(bp)) 700 if (!xfs_btree_lblock_verify_crc(bp))
782 xfs_buf_ioerror(bp, EFSBADCRC); 701 xfs_buf_ioerror(bp, -EFSBADCRC);
783 else if (!xfs_bmbt_verify(bp)) 702 else if (!xfs_bmbt_verify(bp))
784 xfs_buf_ioerror(bp, EFSCORRUPTED); 703 xfs_buf_ioerror(bp, -EFSCORRUPTED);
785 704
786 if (bp->b_error) { 705 if (bp->b_error) {
787 trace_xfs_btree_corrupt(bp, _RET_IP_); 706 trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -795,7 +714,7 @@ xfs_bmbt_write_verify(
795{ 714{
796 if (!xfs_bmbt_verify(bp)) { 715 if (!xfs_bmbt_verify(bp)) {
797 trace_xfs_btree_corrupt(bp, _RET_IP_); 716 trace_xfs_btree_corrupt(bp, _RET_IP_);
798 xfs_buf_ioerror(bp, EFSCORRUPTED); 717 xfs_buf_ioerror(bp, -EFSCORRUPTED);
799 xfs_verifier_error(bp); 718 xfs_verifier_error(bp);
800 return; 719 return;
801 } 720 }
@@ -959,7 +878,7 @@ xfs_bmbt_change_owner(
959 878
960 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); 879 cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
961 if (!cur) 880 if (!cur)
962 return ENOMEM; 881 return -ENOMEM;
963 882
964 error = xfs_btree_change_owner(cur, new_owner, buffer_list); 883 error = xfs_btree_change_owner(cur, new_owner, buffer_list);
965 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); 884 xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
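
[editor's note] With XFS_BIG_BLKNOS gone, the 128-bit bmbt record is always unpacked one way: l0 carries the unwritten flag, a 54-bit startoff and the top 9 bits of the startblock; l1 carries the remaining 43 startblock bits and a 21-bit blockcount. A userspace sketch of that packing, with the unwritten flag and endianness handling omitted:

	#include <stdint.h>
	#include <stdio.h>

	/* Low 'n' bits set, like xfs_mask64lo(). */
	static uint64_t mask64lo(int n)
	{
		return (1ULL << n) - 1;
	}

	/*
	 * Record layout, per the code in these hunks:
	 *   l0: [63] unwritten flag | [62:9] startoff | [8:0] startblock[51:43]
	 *   l1: [63:21] startblock[42:0] | [20:0] blockcount
	 */
	static void unpack(uint64_t l0, uint64_t l1)
	{
		uint64_t startoff   = (l0 & mask64lo(63)) >> 9;
		uint64_t startblock = ((l0 & mask64lo(9)) << 43) | (l1 >> 21);
		uint64_t blockcount = l1 & mask64lo(21);

		printf("startoff=%llu startblock=%llu blockcount=%llu\n",
		       (unsigned long long)startoff,
		       (unsigned long long)startblock,
		       (unsigned long long)blockcount);
	}

	static void pack(uint64_t startoff, uint64_t startblock,
			 uint64_t blockcount, uint64_t *l0, uint64_t *l1)
	{
		*l0 = (startoff << 9) | (startblock >> 43);
		*l1 = (startblock << 21) | (blockcount & mask64lo(21));
	}

	int main(void)
	{
		uint64_t l0, l1;

		pack(12345, 678901, 42, &l0, &l1);
		unpack(l0, l1);	/* prints the same three values back */
		return 0;
	}

The removed #else branches existed only to squeeze startblock into 32 bits on small-block-number configs; dropping them makes the 52-bit form unconditional.
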
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..819a8a4dee95 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index cf893bc1e373..8fe6a93ff473 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -78,11 +78,11 @@ xfs_btree_check_lblock(
78 be16_to_cpu(block->bb_numrecs) <= 78 be16_to_cpu(block->bb_numrecs) <=
79 cur->bc_ops->get_maxrecs(cur, level) && 79 cur->bc_ops->get_maxrecs(cur, level) &&
80 block->bb_u.l.bb_leftsib && 80 block->bb_u.l.bb_leftsib &&
81 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || 81 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
82 XFS_FSB_SANITY_CHECK(mp, 82 XFS_FSB_SANITY_CHECK(mp,
83 be64_to_cpu(block->bb_u.l.bb_leftsib))) && 83 be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
84 block->bb_u.l.bb_rightsib && 84 block->bb_u.l.bb_rightsib &&
85 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || 85 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
86 XFS_FSB_SANITY_CHECK(mp, 86 XFS_FSB_SANITY_CHECK(mp,
87 be64_to_cpu(block->bb_u.l.bb_rightsib))); 87 be64_to_cpu(block->bb_u.l.bb_rightsib)));
88 88
@@ -92,7 +92,7 @@ xfs_btree_check_lblock(
92 if (bp) 92 if (bp)
93 trace_xfs_btree_corrupt(bp, _RET_IP_); 93 trace_xfs_btree_corrupt(bp, _RET_IP_);
94 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 94 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
95 return XFS_ERROR(EFSCORRUPTED); 95 return -EFSCORRUPTED;
96 } 96 }
97 return 0; 97 return 0;
98} 98}
@@ -140,7 +140,7 @@ xfs_btree_check_sblock(
140 if (bp) 140 if (bp)
141 trace_xfs_btree_corrupt(bp, _RET_IP_); 141 trace_xfs_btree_corrupt(bp, _RET_IP_);
142 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); 142 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
143 return XFS_ERROR(EFSCORRUPTED); 143 return -EFSCORRUPTED;
144 } 144 }
145 return 0; 145 return 0;
146} 146}
@@ -167,12 +167,12 @@ xfs_btree_check_block(
167int /* error (0 or EFSCORRUPTED) */ 167int /* error (0 or EFSCORRUPTED) */
168xfs_btree_check_lptr( 168xfs_btree_check_lptr(
169 struct xfs_btree_cur *cur, /* btree cursor */ 169 struct xfs_btree_cur *cur, /* btree cursor */
170 xfs_dfsbno_t bno, /* btree block disk address */ 170 xfs_fsblock_t bno, /* btree block disk address */
171 int level) /* btree block level */ 171 int level) /* btree block level */
172{ 172{
173 XFS_WANT_CORRUPTED_RETURN( 173 XFS_WANT_CORRUPTED_RETURN(
174 level > 0 && 174 level > 0 &&
175 bno != NULLDFSBNO && 175 bno != NULLFSBLOCK &&
176 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); 176 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
177 return 0; 177 return 0;
178} 178}
@@ -595,7 +595,7 @@ xfs_btree_islastblock(
595 block = xfs_btree_get_block(cur, level, &bp); 595 block = xfs_btree_get_block(cur, level, &bp);
596 xfs_btree_check_block(cur, block, level, bp); 596 xfs_btree_check_block(cur, block, level, bp);
597 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 597 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
598 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO); 598 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
599 else 599 else
600 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK); 600 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
601} 601}
@@ -771,16 +771,16 @@ xfs_btree_readahead_lblock(
771 struct xfs_btree_block *block) 771 struct xfs_btree_block *block)
772{ 772{
773 int rval = 0; 773 int rval = 0;
774 xfs_dfsbno_t left = be64_to_cpu(block->bb_u.l.bb_leftsib); 774 xfs_fsblock_t left = be64_to_cpu(block->bb_u.l.bb_leftsib);
775 xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); 775 xfs_fsblock_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
776 776
777 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { 777 if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
778 xfs_btree_reada_bufl(cur->bc_mp, left, 1, 778 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
779 cur->bc_ops->buf_ops); 779 cur->bc_ops->buf_ops);
780 rval++; 780 rval++;
781 } 781 }
782 782
783 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { 783 if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
784 xfs_btree_reada_bufl(cur->bc_mp, right, 1, 784 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
785 cur->bc_ops->buf_ops); 785 cur->bc_ops->buf_ops);
786 rval++; 786 rval++;
@@ -852,7 +852,7 @@ xfs_btree_ptr_to_daddr(
852 union xfs_btree_ptr *ptr) 852 union xfs_btree_ptr *ptr)
853{ 853{
854 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 854 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
855 ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO)); 855 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
856 856
857 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l)); 857 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
858 } else { 858 } else {
@@ -900,9 +900,9 @@ xfs_btree_setbuf(
900 900
901 b = XFS_BUF_TO_BLOCK(bp); 901 b = XFS_BUF_TO_BLOCK(bp);
902 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { 902 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
903 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO)) 903 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
904 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; 904 cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
905 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO)) 905 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
906 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; 906 cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
907 } else { 907 } else {
908 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) 908 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
@@ -918,7 +918,7 @@ xfs_btree_ptr_is_null(
918 union xfs_btree_ptr *ptr) 918 union xfs_btree_ptr *ptr)
919{ 919{
920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 920 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
921 return ptr->l == cpu_to_be64(NULLDFSBNO); 921 return ptr->l == cpu_to_be64(NULLFSBLOCK);
922 else 922 else
923 return ptr->s == cpu_to_be32(NULLAGBLOCK); 923 return ptr->s == cpu_to_be32(NULLAGBLOCK);
924} 924}
@@ -929,7 +929,7 @@ xfs_btree_set_ptr_null(
929 union xfs_btree_ptr *ptr) 929 union xfs_btree_ptr *ptr)
930{ 930{
931 if (cur->bc_flags & XFS_BTREE_LONG_PTRS) 931 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
932 ptr->l = cpu_to_be64(NULLDFSBNO); 932 ptr->l = cpu_to_be64(NULLFSBLOCK);
933 else 933 else
934 ptr->s = cpu_to_be32(NULLAGBLOCK); 934 ptr->s = cpu_to_be32(NULLAGBLOCK);
935} 935}
@@ -997,8 +997,8 @@ xfs_btree_init_block_int(
997 buf->bb_numrecs = cpu_to_be16(numrecs); 997 buf->bb_numrecs = cpu_to_be16(numrecs);
998 998
999 if (flags & XFS_BTREE_LONG_PTRS) { 999 if (flags & XFS_BTREE_LONG_PTRS) {
1000 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); 1000 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
1001 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); 1001 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1002 if (flags & XFS_BTREE_CRC_BLOCKS) { 1002 if (flags & XFS_BTREE_CRC_BLOCKS) {
1003 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno); 1003 buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1004 buf->bb_u.l.bb_owner = cpu_to_be64(owner); 1004 buf->bb_u.l.bb_owner = cpu_to_be64(owner);
@@ -1140,7 +1140,7 @@ xfs_btree_get_buf_block(
1140 mp->m_bsize, flags); 1140 mp->m_bsize, flags);
1141 1141
1142 if (!*bpp) 1142 if (!*bpp)
1143 return ENOMEM; 1143 return -ENOMEM;
1144 1144
1145 (*bpp)->b_ops = cur->bc_ops->buf_ops; 1145 (*bpp)->b_ops = cur->bc_ops->buf_ops;
1146 *block = XFS_BUF_TO_BLOCK(*bpp); 1146 *block = XFS_BUF_TO_BLOCK(*bpp);
@@ -1498,7 +1498,7 @@ xfs_btree_increment(
1498 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1498 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1499 goto out0; 1499 goto out0;
1500 ASSERT(0); 1500 ASSERT(0);
1501 error = EFSCORRUPTED; 1501 error = -EFSCORRUPTED;
1502 goto error0; 1502 goto error0;
1503 } 1503 }
1504 ASSERT(lev < cur->bc_nlevels); 1504 ASSERT(lev < cur->bc_nlevels);
@@ -1597,7 +1597,7 @@ xfs_btree_decrement(
1597 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) 1597 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1598 goto out0; 1598 goto out0;
1599 ASSERT(0); 1599 ASSERT(0);
1600 error = EFSCORRUPTED; 1600 error = -EFSCORRUPTED;
1601 goto error0; 1601 goto error0;
1602 } 1602 }
1603 ASSERT(lev < cur->bc_nlevels); 1603 ASSERT(lev < cur->bc_nlevels);
@@ -4018,7 +4018,7 @@ xfs_btree_block_change_owner(
4018 /* now read rh sibling block for next iteration */ 4018 /* now read rh sibling block for next iteration */
4019 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB); 4019 xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4020 if (xfs_btree_ptr_is_null(cur, &rptr)) 4020 if (xfs_btree_ptr_is_null(cur, &rptr))
4021 return ENOENT; 4021 return -ENOENT;
4022 4022
4023 return xfs_btree_lookup_get_block(cur, level, &rptr, &block); 4023 return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4024} 4024}
@@ -4061,7 +4061,7 @@ xfs_btree_change_owner(
4061 buffer_list); 4061 buffer_list);
4062 } while (!error); 4062 } while (!error);
4063 4063
4064 if (error != ENOENT) 4064 if (error != -ENOENT)
4065 return error; 4065 return error;
4066 } 4066 }
4067 4067
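
[editor's note] The rename above also makes the null-sibling sentinels explicit: long-pointer btrees compare against the 64-bit NULLFSBLOCK, short (per-AG) btrees against the 32-bit NULLAGBLOCK. A simplified mirror of xfs_btree_ptr_is_null() after this series, with endianness conversions omitted (the on-disk values are big-endian):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define NULLFSBLOCK	((uint64_t)-1)	/* 64-bit "no block" sentinel */
	#define NULLAGBLOCK	((uint32_t)-1)	/* 32-bit per-AG sentinel */

	/* Simplified union xfs_btree_ptr: long (full-fs) or short (per-AG). */
	union btree_ptr {
		uint64_t l;
		uint32_t s;
	};

	static bool ptr_is_null(const union btree_ptr *ptr, bool long_ptrs)
	{
		return long_ptrs ? ptr->l == NULLFSBLOCK
				 : ptr->s == NULLAGBLOCK;
	}

	int main(void)
	{
		union btree_ptr p = { .l = NULLFSBLOCK };

		printf("long null:  %d\n", ptr_is_null(&p, true));
		p.s = 1234;
		printf("short 1234: %d\n", ptr_is_null(&p, false));
		return 0;
	}

Since both sentinels are all-ones values of their respective widths, retiring NULLDFSBNO in favour of NULLFSBLOCK changes no on-disk bits, only the type the code reasons in.
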
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index a04b69422f67..8f18bab73ea5 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -258,7 +258,7 @@ xfs_btree_check_block(
258int /* error (0 or EFSCORRUPTED) */ 258int /* error (0 or EFSCORRUPTED) */
259xfs_btree_check_lptr( 259xfs_btree_check_lptr(
260 struct xfs_btree_cur *cur, /* btree cursor */ 260 struct xfs_btree_cur *cur, /* btree cursor */
261 xfs_dfsbno_t ptr, /* btree block disk address */ 261 xfs_fsblock_t ptr, /* btree block disk address */
262 int level); /* btree block level */ 262 int level); /* btree block level */
263 263
264/* 264/*
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h
index fad1676ad8cd..fad1676ad8cd 100644
--- a/fs/xfs/xfs_cksum.h
+++ b/fs/xfs/libxfs/xfs_cksum.h
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index a514ab616650..fd827530afec 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -185,7 +185,7 @@ xfs_da3_node_write_verify(
185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr; 185 struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
186 186
187 if (!xfs_da3_node_verify(bp)) { 187 if (!xfs_da3_node_verify(bp)) {
188 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, -EFSCORRUPTED);
189 xfs_verifier_error(bp); 189 xfs_verifier_error(bp);
190 return; 190 return;
191 } 191 }
@@ -214,13 +214,13 @@ xfs_da3_node_read_verify(
214 switch (be16_to_cpu(info->magic)) { 214 switch (be16_to_cpu(info->magic)) {
215 case XFS_DA3_NODE_MAGIC: 215 case XFS_DA3_NODE_MAGIC:
216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) { 216 if (!xfs_buf_verify_cksum(bp, XFS_DA3_NODE_CRC_OFF)) {
217 xfs_buf_ioerror(bp, EFSBADCRC); 217 xfs_buf_ioerror(bp, -EFSBADCRC);
218 break; 218 break;
219 } 219 }
220 /* fall through */ 220 /* fall through */
221 case XFS_DA_NODE_MAGIC: 221 case XFS_DA_NODE_MAGIC:
222 if (!xfs_da3_node_verify(bp)) { 222 if (!xfs_da3_node_verify(bp)) {
223 xfs_buf_ioerror(bp, EFSCORRUPTED); 223 xfs_buf_ioerror(bp, -EFSCORRUPTED);
224 break; 224 break;
225 } 225 }
226 return; 226 return;
@@ -315,7 +315,7 @@ xfs_da3_node_create(
315 315
316 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork); 316 error = xfs_da_get_buf(tp, dp, blkno, -1, &bp, whichfork);
317 if (error) 317 if (error)
318 return(error); 318 return error;
319 bp->b_ops = &xfs_da3_node_buf_ops; 319 bp->b_ops = &xfs_da3_node_buf_ops;
320 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); 320 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
321 node = bp->b_addr; 321 node = bp->b_addr;
@@ -337,7 +337,7 @@ xfs_da3_node_create(
337 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size)); 337 XFS_DA_LOGRANGE(node, &node->hdr, dp->d_ops->node_hdr_size));
338 338
339 *bpp = bp; 339 *bpp = bp;
340 return(0); 340 return 0;
341} 341}
342 342
343/* 343/*
@@ -385,8 +385,8 @@ xfs_da3_split(
385 switch (oldblk->magic) { 385 switch (oldblk->magic) {
386 case XFS_ATTR_LEAF_MAGIC: 386 case XFS_ATTR_LEAF_MAGIC:
387 error = xfs_attr3_leaf_split(state, oldblk, newblk); 387 error = xfs_attr3_leaf_split(state, oldblk, newblk);
388 if ((error != 0) && (error != ENOSPC)) { 388 if ((error != 0) && (error != -ENOSPC)) {
389 return(error); /* GROT: attr is inconsistent */ 389 return error; /* GROT: attr is inconsistent */
390 } 390 }
391 if (!error) { 391 if (!error) {
392 addblk = newblk; 392 addblk = newblk;
@@ -408,7 +408,7 @@ xfs_da3_split(
408 &state->extrablk); 408 &state->extrablk);
409 } 409 }
410 if (error) 410 if (error)
411 return(error); /* GROT: attr inconsistent */ 411 return error; /* GROT: attr inconsistent */
412 addblk = newblk; 412 addblk = newblk;
413 break; 413 break;
414 case XFS_DIR2_LEAFN_MAGIC: 414 case XFS_DIR2_LEAFN_MAGIC:
@@ -422,7 +422,7 @@ xfs_da3_split(
422 max - i, &action); 422 max - i, &action);
423 addblk->bp = NULL; 423 addblk->bp = NULL;
424 if (error) 424 if (error)
425 return(error); /* GROT: dir is inconsistent */ 425 return error; /* GROT: dir is inconsistent */
426 /* 426 /*
427 * Record the newly split block for the next time thru? 427 * Record the newly split block for the next time thru?
428 */ 428 */
@@ -439,7 +439,7 @@ xfs_da3_split(
439 xfs_da3_fixhashpath(state, &state->path); 439 xfs_da3_fixhashpath(state, &state->path);
440 } 440 }
441 if (!addblk) 441 if (!addblk)
442 return(0); 442 return 0;
443 443
444 /* 444 /*
445 * Split the root node. 445 * Split the root node.
@@ -449,7 +449,7 @@ xfs_da3_split(
449 error = xfs_da3_root_split(state, oldblk, addblk); 449 error = xfs_da3_root_split(state, oldblk, addblk);
450 if (error) { 450 if (error) {
451 addblk->bp = NULL; 451 addblk->bp = NULL;
452 return(error); /* GROT: dir is inconsistent */ 452 return error; /* GROT: dir is inconsistent */
453 } 453 }
454 454
455 /* 455 /*
@@ -492,7 +492,7 @@ xfs_da3_split(
492 sizeof(node->hdr.info))); 492 sizeof(node->hdr.info)));
493 } 493 }
494 addblk->bp = NULL; 494 addblk->bp = NULL;
495 return(0); 495 return 0;
496} 496}
497 497
498/* 498/*
@@ -670,18 +670,18 @@ xfs_da3_node_split(
670 */ 670 */
671 error = xfs_da_grow_inode(state->args, &blkno); 671 error = xfs_da_grow_inode(state->args, &blkno);
672 if (error) 672 if (error)
673 return(error); /* GROT: dir is inconsistent */ 673 return error; /* GROT: dir is inconsistent */
674 674
675 error = xfs_da3_node_create(state->args, blkno, treelevel, 675 error = xfs_da3_node_create(state->args, blkno, treelevel,
676 &newblk->bp, state->args->whichfork); 676 &newblk->bp, state->args->whichfork);
677 if (error) 677 if (error)
678 return(error); /* GROT: dir is inconsistent */ 678 return error; /* GROT: dir is inconsistent */
679 newblk->blkno = blkno; 679 newblk->blkno = blkno;
680 newblk->magic = XFS_DA_NODE_MAGIC; 680 newblk->magic = XFS_DA_NODE_MAGIC;
681 xfs_da3_node_rebalance(state, oldblk, newblk); 681 xfs_da3_node_rebalance(state, oldblk, newblk);
682 error = xfs_da3_blk_link(state, oldblk, newblk); 682 error = xfs_da3_blk_link(state, oldblk, newblk);
683 if (error) 683 if (error)
684 return(error); 684 return error;
685 *result = 1; 685 *result = 1;
686 } else { 686 } else {
687 *result = 0; 687 *result = 0;
@@ -721,7 +721,7 @@ xfs_da3_node_split(
721 } 721 }
722 } 722 }
723 723
724 return(0); 724 return 0;
725} 725}
726 726
727/* 727/*
@@ -963,9 +963,9 @@ xfs_da3_join(
963 case XFS_ATTR_LEAF_MAGIC: 963 case XFS_ATTR_LEAF_MAGIC:
964 error = xfs_attr3_leaf_toosmall(state, &action); 964 error = xfs_attr3_leaf_toosmall(state, &action);
965 if (error) 965 if (error)
966 return(error); 966 return error;
967 if (action == 0) 967 if (action == 0)
968 return(0); 968 return 0;
969 xfs_attr3_leaf_unbalance(state, drop_blk, save_blk); 969 xfs_attr3_leaf_unbalance(state, drop_blk, save_blk);
970 break; 970 break;
971 case XFS_DIR2_LEAFN_MAGIC: 971 case XFS_DIR2_LEAFN_MAGIC:
@@ -985,7 +985,7 @@ xfs_da3_join(
985 xfs_da3_fixhashpath(state, &state->path); 985 xfs_da3_fixhashpath(state, &state->path);
986 error = xfs_da3_node_toosmall(state, &action); 986 error = xfs_da3_node_toosmall(state, &action);
987 if (error) 987 if (error)
988 return(error); 988 return error;
989 if (action == 0) 989 if (action == 0)
990 return 0; 990 return 0;
991 xfs_da3_node_unbalance(state, drop_blk, save_blk); 991 xfs_da3_node_unbalance(state, drop_blk, save_blk);
@@ -995,12 +995,12 @@ xfs_da3_join(
995 error = xfs_da3_blk_unlink(state, drop_blk, save_blk); 995 error = xfs_da3_blk_unlink(state, drop_blk, save_blk);
996 xfs_da_state_kill_altpath(state); 996 xfs_da_state_kill_altpath(state);
997 if (error) 997 if (error)
998 return(error); 998 return error;
999 error = xfs_da_shrink_inode(state->args, drop_blk->blkno, 999 error = xfs_da_shrink_inode(state->args, drop_blk->blkno,
1000 drop_blk->bp); 1000 drop_blk->bp);
1001 drop_blk->bp = NULL; 1001 drop_blk->bp = NULL;
1002 if (error) 1002 if (error)
1003 return(error); 1003 return error;
1004 } 1004 }
1005 /* 1005 /*
1006 * We joined all the way to the top. If it turns out that 1006 * We joined all the way to the top. If it turns out that
@@ -1010,7 +1010,7 @@ xfs_da3_join(
1010 xfs_da3_node_remove(state, drop_blk); 1010 xfs_da3_node_remove(state, drop_blk);
1011 xfs_da3_fixhashpath(state, &state->path); 1011 xfs_da3_fixhashpath(state, &state->path);
1012 error = xfs_da3_root_join(state, &state->path.blk[0]); 1012 error = xfs_da3_root_join(state, &state->path.blk[0]);
1013 return(error); 1013 return error;
1014} 1014}
1015 1015
1016#ifdef DEBUG 1016#ifdef DEBUG
@@ -1099,7 +1099,7 @@ xfs_da3_root_join(
1099 xfs_trans_log_buf(args->trans, root_blk->bp, 0, 1099 xfs_trans_log_buf(args->trans, root_blk->bp, 0,
1100 args->geo->blksize - 1); 1100 args->geo->blksize - 1);
1101 error = xfs_da_shrink_inode(args, child, bp); 1101 error = xfs_da_shrink_inode(args, child, bp);
1102 return(error); 1102 return error;
1103} 1103}
1104 1104
1105/* 1105/*
@@ -1142,7 +1142,7 @@ xfs_da3_node_toosmall(
1142 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 1142 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
1143 if (nodehdr.count > (state->args->geo->node_ents >> 1)) { 1143 if (nodehdr.count > (state->args->geo->node_ents >> 1)) {
1144 *action = 0; /* blk over 50%, don't try to join */ 1144 *action = 0; /* blk over 50%, don't try to join */
1145 return(0); /* blk over 50%, don't try to join */ 1145 return 0; /* blk over 50%, don't try to join */
1146 } 1146 }
1147 1147
1148 /* 1148 /*
@@ -1161,13 +1161,13 @@ xfs_da3_node_toosmall(
1161 error = xfs_da3_path_shift(state, &state->altpath, forward, 1161 error = xfs_da3_path_shift(state, &state->altpath, forward,
1162 0, &retval); 1162 0, &retval);
1163 if (error) 1163 if (error)
1164 return(error); 1164 return error;
1165 if (retval) { 1165 if (retval) {
1166 *action = 0; 1166 *action = 0;
1167 } else { 1167 } else {
1168 *action = 2; 1168 *action = 2;
1169 } 1169 }
1170 return(0); 1170 return 0;
1171 } 1171 }
1172 1172
1173 /* 1173 /*
@@ -1194,7 +1194,7 @@ xfs_da3_node_toosmall(
1194 error = xfs_da3_node_read(state->args->trans, dp, 1194 error = xfs_da3_node_read(state->args->trans, dp,
1195 blkno, -1, &bp, state->args->whichfork); 1195 blkno, -1, &bp, state->args->whichfork);
1196 if (error) 1196 if (error)
1197 return(error); 1197 return error;
1198 1198
1199 node = bp->b_addr; 1199 node = bp->b_addr;
1200 dp->d_ops->node_hdr_from_disk(&thdr, node); 1200 dp->d_ops->node_hdr_from_disk(&thdr, node);
@@ -1486,7 +1486,7 @@ xfs_da3_node_lookup_int(
1486 if (error) { 1486 if (error) {
1487 blk->blkno = 0; 1487 blk->blkno = 0;
1488 state->path.active--; 1488 state->path.active--;
1489 return(error); 1489 return error;
1490 } 1490 }
1491 curr = blk->bp->b_addr; 1491 curr = blk->bp->b_addr;
1492 blk->magic = be16_to_cpu(curr->magic); 1492 blk->magic = be16_to_cpu(curr->magic);
@@ -1579,25 +1579,25 @@ xfs_da3_node_lookup_int(
1579 args->blkno = blk->blkno; 1579 args->blkno = blk->blkno;
1580 } else { 1580 } else {
1581 ASSERT(0); 1581 ASSERT(0);
1582 return XFS_ERROR(EFSCORRUPTED); 1582 return -EFSCORRUPTED;
1583 } 1583 }
1584 if (((retval == ENOENT) || (retval == ENOATTR)) && 1584 if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
1585 (blk->hashval == args->hashval)) { 1585 (blk->hashval == args->hashval)) {
1586 error = xfs_da3_path_shift(state, &state->path, 1, 1, 1586 error = xfs_da3_path_shift(state, &state->path, 1, 1,
1587 &retval); 1587 &retval);
1588 if (error) 1588 if (error)
1589 return(error); 1589 return error;
1590 if (retval == 0) { 1590 if (retval == 0) {
1591 continue; 1591 continue;
1592 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) { 1592 } else if (blk->magic == XFS_ATTR_LEAF_MAGIC) {
1593 /* path_shift() gives ENOENT */ 1593 /* path_shift() gives ENOENT */
1594 retval = XFS_ERROR(ENOATTR); 1594 retval = -ENOATTR;
1595 } 1595 }
1596 } 1596 }
1597 break; 1597 break;
1598 } 1598 }
1599 *result = retval; 1599 *result = retval;
1600 return(0); 1600 return 0;
1601} 1601}
1602 1602
1603/*======================================================================== 1603/*========================================================================
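
The recurring change in these hunks is the switch from positive, XFS_ERROR()-wrapped errnos to the kernel-wide convention of returning negative errno values directly, with all comparisons updated to match. A minimal userspace sketch of the convention (plain C; lookup_entry() is a hypothetical stand-in, not an XFS function):

	#include <errno.h>
	#include <stdio.h>

	/* Hypothetical helper following the kernel convention: 0 on success,
	 * a negative errno on failure.  Callers compare against -ENOENT etc.,
	 * never against bare positive ENOENT. */
	static int lookup_entry(int present)
	{
		if (!present)
			return -ENOENT;
		return 0;
	}

	int main(void)
	{
		if (lookup_entry(0) == -ENOENT)	/* mirrors the new "== -ENOENT" tests */
			printf("not found\n");
		return 0;
	}
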
@@ -1692,7 +1692,7 @@ xfs_da3_blk_link(
1692 be32_to_cpu(old_info->back), 1692 be32_to_cpu(old_info->back),
1693 -1, &bp, args->whichfork); 1693 -1, &bp, args->whichfork);
1694 if (error) 1694 if (error)
1695 return(error); 1695 return error;
1696 ASSERT(bp != NULL); 1696 ASSERT(bp != NULL);
1697 tmp_info = bp->b_addr; 1697 tmp_info = bp->b_addr;
1698 ASSERT(tmp_info->magic == old_info->magic); 1698 ASSERT(tmp_info->magic == old_info->magic);
@@ -1713,7 +1713,7 @@ xfs_da3_blk_link(
1713 be32_to_cpu(old_info->forw), 1713 be32_to_cpu(old_info->forw),
1714 -1, &bp, args->whichfork); 1714 -1, &bp, args->whichfork);
1715 if (error) 1715 if (error)
1716 return(error); 1716 return error;
1717 ASSERT(bp != NULL); 1717 ASSERT(bp != NULL);
1718 tmp_info = bp->b_addr; 1718 tmp_info = bp->b_addr;
1719 ASSERT(tmp_info->magic == old_info->magic); 1719 ASSERT(tmp_info->magic == old_info->magic);
@@ -1726,7 +1726,7 @@ xfs_da3_blk_link(
1726 1726
1727 xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1); 1727 xfs_trans_log_buf(args->trans, old_blk->bp, 0, sizeof(*tmp_info) - 1);
1728 xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1); 1728 xfs_trans_log_buf(args->trans, new_blk->bp, 0, sizeof(*tmp_info) - 1);
1729 return(0); 1729 return 0;
1730} 1730}
1731 1731
1732/* 1732/*
@@ -1772,7 +1772,7 @@ xfs_da3_blk_unlink(
1772 be32_to_cpu(drop_info->back), 1772 be32_to_cpu(drop_info->back),
1773 -1, &bp, args->whichfork); 1773 -1, &bp, args->whichfork);
1774 if (error) 1774 if (error)
1775 return(error); 1775 return error;
1776 ASSERT(bp != NULL); 1776 ASSERT(bp != NULL);
1777 tmp_info = bp->b_addr; 1777 tmp_info = bp->b_addr;
1778 ASSERT(tmp_info->magic == save_info->magic); 1778 ASSERT(tmp_info->magic == save_info->magic);
@@ -1789,7 +1789,7 @@ xfs_da3_blk_unlink(
1789 be32_to_cpu(drop_info->forw), 1789 be32_to_cpu(drop_info->forw),
1790 -1, &bp, args->whichfork); 1790 -1, &bp, args->whichfork);
1791 if (error) 1791 if (error)
1792 return(error); 1792 return error;
1793 ASSERT(bp != NULL); 1793 ASSERT(bp != NULL);
1794 tmp_info = bp->b_addr; 1794 tmp_info = bp->b_addr;
1795 ASSERT(tmp_info->magic == save_info->magic); 1795 ASSERT(tmp_info->magic == save_info->magic);
@@ -1801,7 +1801,7 @@ xfs_da3_blk_unlink(
1801 } 1801 }
1802 1802
1803 xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1); 1803 xfs_trans_log_buf(args->trans, save_blk->bp, 0, sizeof(*save_info) - 1);
1804 return(0); 1804 return 0;
1805} 1805}
1806 1806
1807/* 1807/*
@@ -1859,9 +1859,9 @@ xfs_da3_path_shift(
1859 } 1859 }
1860 } 1860 }
1861 if (level < 0) { 1861 if (level < 0) {
1862 *result = XFS_ERROR(ENOENT); /* we're out of our tree */ 1862 *result = -ENOENT; /* we're out of our tree */
1863 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 1863 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
1864 return(0); 1864 return 0;
1865 } 1865 }
1866 1866
1867 /* 1867 /*
@@ -1883,7 +1883,7 @@ xfs_da3_path_shift(
1883 error = xfs_da3_node_read(args->trans, dp, blkno, -1, 1883 error = xfs_da3_node_read(args->trans, dp, blkno, -1,
1884 &blk->bp, args->whichfork); 1884 &blk->bp, args->whichfork);
1885 if (error) 1885 if (error)
1886 return(error); 1886 return error;
1887 info = blk->bp->b_addr; 1887 info = blk->bp->b_addr;
1888 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || 1888 ASSERT(info->magic == cpu_to_be16(XFS_DA_NODE_MAGIC) ||
1889 info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) || 1889 info->magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
@@ -2004,7 +2004,7 @@ xfs_da_grow_inode_int(
2004 struct xfs_trans *tp = args->trans; 2004 struct xfs_trans *tp = args->trans;
2005 struct xfs_inode *dp = args->dp; 2005 struct xfs_inode *dp = args->dp;
2006 int w = args->whichfork; 2006 int w = args->whichfork;
2007 xfs_drfsbno_t nblks = dp->i_d.di_nblocks; 2007 xfs_rfsblock_t nblks = dp->i_d.di_nblocks;
2008 struct xfs_bmbt_irec map, *mapp; 2008 struct xfs_bmbt_irec map, *mapp;
2009 int nmap, error, got, i, mapi; 2009 int nmap, error, got, i, mapi;
2010 2010
@@ -2068,7 +2068,7 @@ xfs_da_grow_inode_int(
2068 if (got != count || mapp[0].br_startoff != *bno || 2068 if (got != count || mapp[0].br_startoff != *bno ||
2069 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount != 2069 mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
2070 *bno + count) { 2070 *bno + count) {
2071 error = XFS_ERROR(ENOSPC); 2071 error = -ENOSPC;
2072 goto out_free_map; 2072 goto out_free_map;
2073 } 2073 }
2074 2074
@@ -2158,7 +2158,7 @@ xfs_da3_swap_lastblock(
2158 if (unlikely(lastoff == 0)) { 2158 if (unlikely(lastoff == 0)) {
2159 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW, 2159 XFS_ERROR_REPORT("xfs_da_swap_lastblock(1)", XFS_ERRLEVEL_LOW,
2160 mp); 2160 mp);
2161 return XFS_ERROR(EFSCORRUPTED); 2161 return -EFSCORRUPTED;
2162 } 2162 }
2163 /* 2163 /*
2164 * Read the last block in the btree space. 2164 * Read the last block in the btree space.
@@ -2209,7 +2209,7 @@ xfs_da3_swap_lastblock(
2209 sib_info->magic != dead_info->magic)) { 2209 sib_info->magic != dead_info->magic)) {
2210 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)", 2210 XFS_ERROR_REPORT("xfs_da_swap_lastblock(2)",
2211 XFS_ERRLEVEL_LOW, mp); 2211 XFS_ERRLEVEL_LOW, mp);
2212 error = XFS_ERROR(EFSCORRUPTED); 2212 error = -EFSCORRUPTED;
2213 goto done; 2213 goto done;
2214 } 2214 }
2215 sib_info->forw = cpu_to_be32(dead_blkno); 2215 sib_info->forw = cpu_to_be32(dead_blkno);
@@ -2231,7 +2231,7 @@ xfs_da3_swap_lastblock(
2231 sib_info->magic != dead_info->magic)) { 2231 sib_info->magic != dead_info->magic)) {
2232 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)", 2232 XFS_ERROR_REPORT("xfs_da_swap_lastblock(3)",
2233 XFS_ERRLEVEL_LOW, mp); 2233 XFS_ERRLEVEL_LOW, mp);
2234 error = XFS_ERROR(EFSCORRUPTED); 2234 error = -EFSCORRUPTED;
2235 goto done; 2235 goto done;
2236 } 2236 }
2237 sib_info->back = cpu_to_be32(dead_blkno); 2237 sib_info->back = cpu_to_be32(dead_blkno);
@@ -2254,7 +2254,7 @@ xfs_da3_swap_lastblock(
2254 if (level >= 0 && level != par_hdr.level + 1) { 2254 if (level >= 0 && level != par_hdr.level + 1) {
2255 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)", 2255 XFS_ERROR_REPORT("xfs_da_swap_lastblock(4)",
2256 XFS_ERRLEVEL_LOW, mp); 2256 XFS_ERRLEVEL_LOW, mp);
2257 error = XFS_ERROR(EFSCORRUPTED); 2257 error = -EFSCORRUPTED;
2258 goto done; 2258 goto done;
2259 } 2259 }
2260 level = par_hdr.level; 2260 level = par_hdr.level;
@@ -2267,7 +2267,7 @@ xfs_da3_swap_lastblock(
2267 if (entno == par_hdr.count) { 2267 if (entno == par_hdr.count) {
2268 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)", 2268 XFS_ERROR_REPORT("xfs_da_swap_lastblock(5)",
2269 XFS_ERRLEVEL_LOW, mp); 2269 XFS_ERRLEVEL_LOW, mp);
2270 error = XFS_ERROR(EFSCORRUPTED); 2270 error = -EFSCORRUPTED;
2271 goto done; 2271 goto done;
2272 } 2272 }
2273 par_blkno = be32_to_cpu(btree[entno].before); 2273 par_blkno = be32_to_cpu(btree[entno].before);
@@ -2294,7 +2294,7 @@ xfs_da3_swap_lastblock(
2294 if (unlikely(par_blkno == 0)) { 2294 if (unlikely(par_blkno == 0)) {
2295 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)", 2295 XFS_ERROR_REPORT("xfs_da_swap_lastblock(6)",
2296 XFS_ERRLEVEL_LOW, mp); 2296 XFS_ERRLEVEL_LOW, mp);
2297 error = XFS_ERROR(EFSCORRUPTED); 2297 error = -EFSCORRUPTED;
2298 goto done; 2298 goto done;
2299 } 2299 }
2300 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w); 2300 error = xfs_da3_node_read(tp, dp, par_blkno, -1, &par_buf, w);
@@ -2305,7 +2305,7 @@ xfs_da3_swap_lastblock(
2305 if (par_hdr.level != level) { 2305 if (par_hdr.level != level) {
2306 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)", 2306 XFS_ERROR_REPORT("xfs_da_swap_lastblock(7)",
2307 XFS_ERRLEVEL_LOW, mp); 2307 XFS_ERRLEVEL_LOW, mp);
2308 error = XFS_ERROR(EFSCORRUPTED); 2308 error = -EFSCORRUPTED;
2309 goto done; 2309 goto done;
2310 } 2310 }
2311 btree = dp->d_ops->node_tree_p(par_node); 2311 btree = dp->d_ops->node_tree_p(par_node);
@@ -2359,7 +2359,7 @@ xfs_da_shrink_inode(
2359 error = xfs_bunmapi(tp, dp, dead_blkno, count, 2359 error = xfs_bunmapi(tp, dp, dead_blkno, count,
2360 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA, 2360 xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
2361 0, args->firstblock, args->flist, &done); 2361 0, args->firstblock, args->flist, &done);
2362 if (error == ENOSPC) { 2362 if (error == -ENOSPC) {
2363 if (w != XFS_DATA_FORK) 2363 if (w != XFS_DATA_FORK)
2364 break; 2364 break;
2365 error = xfs_da3_swap_lastblock(args, &dead_blkno, 2365 error = xfs_da3_swap_lastblock(args, &dead_blkno,
@@ -2427,7 +2427,7 @@ xfs_buf_map_from_irec(
2427 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), 2427 map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map),
2428 KM_SLEEP | KM_NOFS); 2428 KM_SLEEP | KM_NOFS);
2429 if (!map) 2429 if (!map)
2430 return ENOMEM; 2430 return -ENOMEM;
2431 *mapp = map; 2431 *mapp = map;
2432 } 2432 }
2433 2433
@@ -2500,8 +2500,8 @@ xfs_dabuf_map(
2500 } 2500 }
2501 2501
2502 if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) { 2502 if (!xfs_da_map_covers_blocks(nirecs, irecs, bno, nfsb)) {
2503 error = mappedbno == -2 ? -1 : XFS_ERROR(EFSCORRUPTED); 2503 error = mappedbno == -2 ? -1 : -EFSCORRUPTED;
2504 if (unlikely(error == EFSCORRUPTED)) { 2504 if (unlikely(error == -EFSCORRUPTED)) {
2505 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 2505 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
2506 int i; 2506 int i;
2507 xfs_alert(mp, "%s: bno %lld dir: inode %lld", 2507 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
@@ -2561,9 +2561,10 @@ xfs_da_get_buf(
2561 2561
2562 bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp, 2562 bp = xfs_trans_get_buf_map(trans, dp->i_mount->m_ddev_targp,
2563 mapp, nmap, 0); 2563 mapp, nmap, 0);
2564 error = bp ? bp->b_error : XFS_ERROR(EIO); 2564 error = bp ? bp->b_error : -EIO;
2565 if (error) { 2565 if (error) {
2566 xfs_trans_brelse(trans, bp); 2566 if (bp)
2567 xfs_trans_brelse(trans, bp);
2567 goto out_free; 2568 goto out_free;
2568 } 2569 }
2569 2570
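
Besides the errno conversion, the xfs_da_get_buf() hunk closes a NULL-pointer hazard: xfs_trans_get_buf_map() can return NULL, and the old error path handed that NULL straight to xfs_trans_brelse(). A self-contained sketch of the corrected shape (stand-in types; release() is a hypothetical analogue of xfs_trans_brelse()):

	#include <errno.h>
	#include <stddef.h>

	struct buf { int b_error; };

	static void release(struct buf *bp) { (void)bp; /* xfs_trans_brelse() stand-in */ }

	static int get_buf(struct buf *bp)
	{
		int error = bp ? bp->b_error : -EIO;

		if (error) {
			if (bp)		/* never release a buffer that was not returned */
				release(bp);
			return error;
		}
		/* ... use bp ... */
		return 0;
	}
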
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index 6e153e399a77..6e153e399a77 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
diff --git a/fs/xfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index c9aee52a37e2..7e42fdfd2f1d 100644
--- a/fs/xfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype(
270{ 270{
271 __uint8_t ftype = dep->name[dep->namelen]; 271 __uint8_t ftype = dep->name[dep->namelen];
272 272
273 ASSERT(ftype < XFS_DIR3_FT_MAX);
274 if (ftype >= XFS_DIR3_FT_MAX) 273 if (ftype >= XFS_DIR3_FT_MAX)
275 return XFS_DIR3_FT_UNKNOWN; 274 return XFS_DIR3_FT_UNKNOWN;
276 return ftype; 275 return ftype;
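
This hunk drops the DEBUG-only ASSERT and keeps just the runtime range check, so a corrupted on-disk ftype degrades to XFS_DIR3_FT_UNKNOWN on production kernels as well. A sketch of the resulting check (constant values assumed for illustration):

	#include <stdint.h>

	#define FT_UNKNOWN 0	/* assumed, mirroring XFS_DIR3_FT_UNKNOWN */
	#define FT_MAX     9	/* assumed, mirroring XFS_DIR3_FT_MAX */

	/* Clamp an untrusted on-disk value rather than only asserting in
	 * DEBUG builds. */
	static uint8_t get_ftype(uint8_t ondisk)
	{
		if (ondisk >= FT_MAX)
			return FT_UNKNOWN;
		return ondisk;
	}
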
diff --git a/fs/xfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 0a49b0286372..0a49b0286372 100644
--- a/fs/xfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/libxfs/xfs_dinode.h
index 623bbe8fd921..623bbe8fd921 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/libxfs/xfs_dinode.h
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 79670cda48ae..7075aaf131f4 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -108,7 +108,7 @@ xfs_da_mount(
108 if (!mp->m_dir_geo || !mp->m_attr_geo) { 108 if (!mp->m_dir_geo || !mp->m_attr_geo) {
109 kmem_free(mp->m_dir_geo); 109 kmem_free(mp->m_dir_geo);
110 kmem_free(mp->m_attr_geo); 110 kmem_free(mp->m_attr_geo);
111 return ENOMEM; 111 return -ENOMEM;
112 } 112 }
113 113
114 /* set up directory geometry */ 114 /* set up directory geometry */
@@ -202,7 +202,7 @@ xfs_dir_ino_validate(
202 xfs_warn(mp, "Invalid inode number 0x%Lx", 202 xfs_warn(mp, "Invalid inode number 0x%Lx",
203 (unsigned long long) ino); 203 (unsigned long long) ino);
204 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 204 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
205 return XFS_ERROR(EFSCORRUPTED); 205 return -EFSCORRUPTED;
206 } 206 }
207 return 0; 207 return 0;
208} 208}
@@ -226,7 +226,7 @@ xfs_dir_init(
226 226
227 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 227 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
228 if (!args) 228 if (!args)
229 return ENOMEM; 229 return -ENOMEM;
230 230
231 args->geo = dp->i_mount->m_dir_geo; 231 args->geo = dp->i_mount->m_dir_geo;
232 args->dp = dp; 232 args->dp = dp;
@@ -237,7 +237,8 @@ xfs_dir_init(
237} 237}
238 238
239/* 239/*
240 * Enter a name in a directory. 240 * Enter a name in a directory, or check for available space.

241 * If inum is 0, only the available space test is performed.
241 */ 242 */
242int 243int
243xfs_dir_createname( 244xfs_dir_createname(
@@ -254,14 +255,16 @@ xfs_dir_createname(
254 int v; /* type-checking value */ 255 int v; /* type-checking value */
255 256
256 ASSERT(S_ISDIR(dp->i_d.di_mode)); 257 ASSERT(S_ISDIR(dp->i_d.di_mode));
257 rval = xfs_dir_ino_validate(tp->t_mountp, inum); 258 if (inum) {
258 if (rval) 259 rval = xfs_dir_ino_validate(tp->t_mountp, inum);
259 return rval; 260 if (rval)
260 XFS_STATS_INC(xs_dir_create); 261 return rval;
262 XFS_STATS_INC(xs_dir_create);
263 }
261 264
262 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 265 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
263 if (!args) 266 if (!args)
264 return ENOMEM; 267 return -ENOMEM;
265 268
266 args->geo = dp->i_mount->m_dir_geo; 269 args->geo = dp->i_mount->m_dir_geo;
267 args->name = name->name; 270 args->name = name->name;
@@ -276,6 +279,8 @@ xfs_dir_createname(
276 args->whichfork = XFS_DATA_FORK; 279 args->whichfork = XFS_DATA_FORK;
277 args->trans = tp; 280 args->trans = tp;
278 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; 281 args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
282 if (!inum)
283 args->op_flags |= XFS_DA_OP_JUSTCHECK;
279 284
280 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { 285 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
281 rval = xfs_dir2_sf_addname(args); 286 rval = xfs_dir2_sf_addname(args);
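
With this change xfs_dir_createname() doubles as a space probe: passing inum == 0 skips inode validation and the create statistics and sets XFS_DA_OP_JUSTCHECK so nothing is actually inserted. A small sketch of the flag selection (flag values hypothetical; the real ones live in xfs_da_btree.h):

	#define OP_ADDNAME   0x1u
	#define OP_OKNOENT   0x2u
	#define OP_JUSTCHECK 0x4u

	/* inum == 0 turns a real insert into a pure "would it fit?" check. */
	static unsigned int createname_flags(unsigned long long inum)
	{
		unsigned int flags = OP_ADDNAME | OP_OKNOENT;

		if (!inum)
			flags |= OP_JUSTCHECK;
		return flags;
	}
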
@@ -314,18 +319,18 @@ xfs_dir_cilookup_result(
314 int len) 319 int len)
315{ 320{
316 if (args->cmpresult == XFS_CMP_DIFFERENT) 321 if (args->cmpresult == XFS_CMP_DIFFERENT)
317 return ENOENT; 322 return -ENOENT;
318 if (args->cmpresult != XFS_CMP_CASE || 323 if (args->cmpresult != XFS_CMP_CASE ||
319 !(args->op_flags & XFS_DA_OP_CILOOKUP)) 324 !(args->op_flags & XFS_DA_OP_CILOOKUP))
320 return EEXIST; 325 return -EEXIST;
321 326
322 args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); 327 args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
323 if (!args->value) 328 if (!args->value)
324 return ENOMEM; 329 return -ENOMEM;
325 330
326 memcpy(args->value, name, len); 331 memcpy(args->value, name, len);
327 args->valuelen = len; 332 args->valuelen = len;
328 return EEXIST; 333 return -EEXIST;
329} 334}
330 335
331/* 336/*
@@ -392,7 +397,7 @@ xfs_dir_lookup(
392 rval = xfs_dir2_node_lookup(args); 397 rval = xfs_dir2_node_lookup(args);
393 398
394out_check_rval: 399out_check_rval:
395 if (rval == EEXIST) 400 if (rval == -EEXIST)
396 rval = 0; 401 rval = 0;
397 if (!rval) { 402 if (!rval) {
398 *inum = args->inumber; 403 *inum = args->inumber;
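
Note the sign-aware normalisation here: the internal lookup paths report a match as -EEXIST (see xfs_dir_cilookup_result() above), and xfs_dir_lookup() translates that into 0 for its callers. Sketched in isolation (dir_lookup_int() is hypothetical):

	#include <errno.h>

	/* A hit bubbles up as -EEXIST from the internal lookup; the outer
	 * layer treats that as success. */
	static int dir_lookup_int(void) { return -EEXIST; }

	static int dir_lookup(void)
	{
		int rval = dir_lookup_int();

		if (rval == -EEXIST)
			rval = 0;
		return rval;
	}
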
@@ -428,7 +433,7 @@ xfs_dir_removename(
428 433
429 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 434 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
430 if (!args) 435 if (!args)
431 return ENOMEM; 436 return -ENOMEM;
432 437
433 args->geo = dp->i_mount->m_dir_geo; 438 args->geo = dp->i_mount->m_dir_geo;
434 args->name = name->name; 439 args->name = name->name;
@@ -493,7 +498,7 @@ xfs_dir_replace(
493 498
494 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); 499 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
495 if (!args) 500 if (!args)
496 return ENOMEM; 501 return -ENOMEM;
497 502
498 args->geo = dp->i_mount->m_dir_geo; 503 args->geo = dp->i_mount->m_dir_geo;
499 args->name = name->name; 504 args->name = name->name;
@@ -535,62 +540,14 @@ out_free:
535 540
536/* 541/*
537 * See if this entry can be added to the directory without allocating space. 542 * See if this entry can be added to the directory without allocating space.
538 * First checks that the caller couldn't reserve enough space (resblks = 0).
539 */ 543 */
540int 544int
541xfs_dir_canenter( 545xfs_dir_canenter(
542 xfs_trans_t *tp, 546 xfs_trans_t *tp,
543 xfs_inode_t *dp, 547 xfs_inode_t *dp,
544 struct xfs_name *name, /* name of entry to add */ 548 struct xfs_name *name) /* name of entry to add */
545 uint resblks)
546{ 549{
547 struct xfs_da_args *args; 550 return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0);
548 int rval;
549 int v; /* type-checking value */
550
551 if (resblks)
552 return 0;
553
554 ASSERT(S_ISDIR(dp->i_d.di_mode));
555
556 args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
557 if (!args)
558 return ENOMEM;
559
560 args->geo = dp->i_mount->m_dir_geo;
561 args->name = name->name;
562 args->namelen = name->len;
563 args->filetype = name->type;
564 args->hashval = dp->i_mount->m_dirnameops->hashname(name);
565 args->dp = dp;
566 args->whichfork = XFS_DATA_FORK;
567 args->trans = tp;
568 args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME |
569 XFS_DA_OP_OKNOENT;
570
571 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) {
572 rval = xfs_dir2_sf_addname(args);
573 goto out_free;
574 }
575
576 rval = xfs_dir2_isblock(args, &v);
577 if (rval)
578 goto out_free;
579 if (v) {
580 rval = xfs_dir2_block_addname(args);
581 goto out_free;
582 }
583
584 rval = xfs_dir2_isleaf(args, &v);
585 if (rval)
586 goto out_free;
587 if (v)
588 rval = xfs_dir2_leaf_addname(args);
589 else
590 rval = xfs_dir2_node_addname(args);
591out_free:
592 kmem_free(args);
593 return rval;
594} 551}
595 552
596/* 553/*
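
The rewrite collapses xfs_dir_canenter() into a single call to xfs_dir_createname() with inum == 0, deleting the duplicated format dispatch; the old resblks short-circuit is removed along with its comment, which moves that decision out to the call sites. The resulting shape, sketched with hypothetical signatures:

	static int dir_createname(unsigned long long inum)
	{
		/* inum == 0 selects the JUSTCHECK path, as sketched earlier */
		(void)inum;
		return 0;
	}

	static int dir_canenter(void)
	{
		return dir_createname(0);
	}
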
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index c8e86b0b5e99..4dff261e6ed5 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -136,7 +136,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
136 xfs_fsblock_t *first, 136 xfs_fsblock_t *first,
137 struct xfs_bmap_free *flist, xfs_extlen_t tot); 137 struct xfs_bmap_free *flist, xfs_extlen_t tot);
138extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, 138extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
139 struct xfs_name *name, uint resblks); 139 struct xfs_name *name);
140 140
141/* 141/*
142 * Direct call from the bmap code, bypassing the generic directory layer. 142 * Direct call from the bmap code, bypassing the generic directory layer.
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index c7cd3154026a..9628ceccfa02 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -91,9 +91,9 @@ xfs_dir3_block_read_verify(
91 91
92 if (xfs_sb_version_hascrc(&mp->m_sb) && 92 if (xfs_sb_version_hascrc(&mp->m_sb) &&
93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) 93 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
94 xfs_buf_ioerror(bp, EFSBADCRC); 94 xfs_buf_ioerror(bp, -EFSBADCRC);
95 else if (!xfs_dir3_block_verify(bp)) 95 else if (!xfs_dir3_block_verify(bp))
96 xfs_buf_ioerror(bp, EFSCORRUPTED); 96 xfs_buf_ioerror(bp, -EFSCORRUPTED);
97 97
98 if (bp->b_error) 98 if (bp->b_error)
99 xfs_verifier_error(bp); 99 xfs_verifier_error(bp);
@@ -108,7 +108,7 @@ xfs_dir3_block_write_verify(
108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 108 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
109 109
110 if (!xfs_dir3_block_verify(bp)) { 110 if (!xfs_dir3_block_verify(bp)) {
111 xfs_buf_ioerror(bp, EFSCORRUPTED); 111 xfs_buf_ioerror(bp, -EFSCORRUPTED);
112 xfs_verifier_error(bp); 112 xfs_verifier_error(bp);
113 return; 113 return;
114 } 114 }
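
The same verifier pattern repeats across the block, data, leaf, free-space and dquot buffers below: on read, a failed CRC yields -EFSBADCRC and a failed structure check yields -EFSCORRUPTED; writes re-verify before the buffer goes out. A condensed, self-contained sketch (errno values assumed; the helpers stand in for xfs_buf_verify_cksum() and the per-format verify routines):

	#include <stdbool.h>

	#define EFSBADCRC    74		/* assumed: XFS maps this to EBADMSG */
	#define EFSCORRUPTED 117	/* assumed: XFS maps this to EUCLEAN */

	struct buf { int b_error; };

	static bool cksum_ok(const struct buf *bp)  { (void)bp; return true; }
	static bool format_ok(const struct buf *bp) { (void)bp; return true; }

	static void read_verify(struct buf *bp)
	{
		if (!cksum_ok(bp))
			bp->b_error = -EFSBADCRC;	/* media/CRC damage */
		else if (!format_ok(bp))
			bp->b_error = -EFSCORRUPTED;	/* structural corruption */
	}
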
@@ -392,7 +392,7 @@ xfs_dir2_block_addname(
392 if (args->op_flags & XFS_DA_OP_JUSTCHECK) { 392 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
393 xfs_trans_brelse(tp, bp); 393 xfs_trans_brelse(tp, bp);
394 if (!dup) 394 if (!dup)
395 return XFS_ERROR(ENOSPC); 395 return -ENOSPC;
396 return 0; 396 return 0;
397 } 397 }
398 398
@@ -402,7 +402,7 @@ xfs_dir2_block_addname(
402 if (!dup) { 402 if (!dup) {
403 /* Don't have a space reservation: return no-space. */ 403 /* Don't have a space reservation: return no-space. */
404 if (args->total == 0) 404 if (args->total == 0)
405 return XFS_ERROR(ENOSPC); 405 return -ENOSPC;
406 /* 406 /*
407 * Convert to the next larger format. 407 * Convert to the next larger format.
408 * Then add the new entry in that format. 408 * Then add the new entry in that format.
@@ -647,7 +647,7 @@ xfs_dir2_block_lookup(
647 args->filetype = dp->d_ops->data_get_ftype(dep); 647 args->filetype = dp->d_ops->data_get_ftype(dep);
648 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 648 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
649 xfs_trans_brelse(args->trans, bp); 649 xfs_trans_brelse(args->trans, bp);
650 return XFS_ERROR(error); 650 return error;
651} 651}
652 652
653/* 653/*
@@ -703,7 +703,7 @@ xfs_dir2_block_lookup_int(
703 if (low > high) { 703 if (low > high) {
704 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 704 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
705 xfs_trans_brelse(tp, bp); 705 xfs_trans_brelse(tp, bp);
706 return XFS_ERROR(ENOENT); 706 return -ENOENT;
707 } 707 }
708 } 708 }
709 /* 709 /*
@@ -751,7 +751,7 @@ xfs_dir2_block_lookup_int(
751 * No match, release the buffer and return ENOENT. 751 * No match, release the buffer and return ENOENT.
752 */ 752 */
753 xfs_trans_brelse(tp, bp); 753 xfs_trans_brelse(tp, bp);
754 return XFS_ERROR(ENOENT); 754 return -ENOENT;
755} 755}
756 756
757/* 757/*
@@ -1091,7 +1091,7 @@ xfs_dir2_sf_to_block(
1091 */ 1091 */
1092 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 1092 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
1093 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1093 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1094 return XFS_ERROR(EIO); 1094 return -EIO;
1095 } 1095 }
1096 1096
1097 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data; 1097 oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 8c2f6422648e..fdd803fecb8e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -100,7 +100,7 @@ __xfs_dir3_data_check(
100 break; 100 break;
101 default: 101 default:
102 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); 102 XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
103 return EFSCORRUPTED; 103 return -EFSCORRUPTED;
104 } 104 }
105 105
106 /* 106 /*
@@ -256,7 +256,7 @@ xfs_dir3_data_reada_verify(
256 xfs_dir3_data_verify(bp); 256 xfs_dir3_data_verify(bp);
257 return; 257 return;
258 default: 258 default:
259 xfs_buf_ioerror(bp, EFSCORRUPTED); 259 xfs_buf_ioerror(bp, -EFSCORRUPTED);
260 xfs_verifier_error(bp); 260 xfs_verifier_error(bp);
261 break; 261 break;
262 } 262 }
@@ -270,9 +270,9 @@ xfs_dir3_data_read_verify(
270 270
271 if (xfs_sb_version_hascrc(&mp->m_sb) && 271 if (xfs_sb_version_hascrc(&mp->m_sb) &&
272 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) 272 !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF))
273 xfs_buf_ioerror(bp, EFSBADCRC); 273 xfs_buf_ioerror(bp, -EFSBADCRC);
274 else if (!xfs_dir3_data_verify(bp)) 274 else if (!xfs_dir3_data_verify(bp))
275 xfs_buf_ioerror(bp, EFSCORRUPTED); 275 xfs_buf_ioerror(bp, -EFSCORRUPTED);
276 276
277 if (bp->b_error) 277 if (bp->b_error)
278 xfs_verifier_error(bp); 278 xfs_verifier_error(bp);
@@ -287,7 +287,7 @@ xfs_dir3_data_write_verify(
287 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 287 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
288 288
289 if (!xfs_dir3_data_verify(bp)) { 289 if (!xfs_dir3_data_verify(bp)) {
290 xfs_buf_ioerror(bp, EFSCORRUPTED); 290 xfs_buf_ioerror(bp, -EFSCORRUPTED);
291 xfs_verifier_error(bp); 291 xfs_verifier_error(bp);
292 return; 292 return;
293 } 293 }
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index fb0aad4440c1..a19174eb3cb2 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -183,9 +183,9 @@ __read_verify(
183 183
184 if (xfs_sb_version_hascrc(&mp->m_sb) && 184 if (xfs_sb_version_hascrc(&mp->m_sb) &&
185 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) 185 !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF))
186 xfs_buf_ioerror(bp, EFSBADCRC); 186 xfs_buf_ioerror(bp, -EFSBADCRC);
187 else if (!xfs_dir3_leaf_verify(bp, magic)) 187 else if (!xfs_dir3_leaf_verify(bp, magic))
188 xfs_buf_ioerror(bp, EFSCORRUPTED); 188 xfs_buf_ioerror(bp, -EFSCORRUPTED);
189 189
190 if (bp->b_error) 190 if (bp->b_error)
191 xfs_verifier_error(bp); 191 xfs_verifier_error(bp);
@@ -201,7 +201,7 @@ __write_verify(
201 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; 201 struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr;
202 202
203 if (!xfs_dir3_leaf_verify(bp, magic)) { 203 if (!xfs_dir3_leaf_verify(bp, magic)) {
204 xfs_buf_ioerror(bp, EFSCORRUPTED); 204 xfs_buf_ioerror(bp, -EFSCORRUPTED);
205 xfs_verifier_error(bp); 205 xfs_verifier_error(bp);
206 return; 206 return;
207 } 207 }
@@ -731,7 +731,7 @@ xfs_dir2_leaf_addname(
731 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || 731 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) ||
732 args->total == 0) { 732 args->total == 0) {
733 xfs_trans_brelse(tp, lbp); 733 xfs_trans_brelse(tp, lbp);
734 return XFS_ERROR(ENOSPC); 734 return -ENOSPC;
735 } 735 }
736 /* 736 /*
737 * Convert to node form. 737 * Convert to node form.
@@ -755,7 +755,7 @@ xfs_dir2_leaf_addname(
755 */ 755 */
756 if (args->op_flags & XFS_DA_OP_JUSTCHECK) { 756 if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
757 xfs_trans_brelse(tp, lbp); 757 xfs_trans_brelse(tp, lbp);
758 return use_block == -1 ? XFS_ERROR(ENOSPC) : 0; 758 return use_block == -1 ? -ENOSPC : 0;
759 } 759 }
760 /* 760 /*
761 * If no allocations are allowed, return now before we've 761 * If no allocations are allowed, return now before we've
@@ -763,7 +763,7 @@ xfs_dir2_leaf_addname(
763 */ 763 */
764 if (args->total == 0 && use_block == -1) { 764 if (args->total == 0 && use_block == -1) {
765 xfs_trans_brelse(tp, lbp); 765 xfs_trans_brelse(tp, lbp);
766 return XFS_ERROR(ENOSPC); 766 return -ENOSPC;
767 } 767 }
768 /* 768 /*
769 * Need to compact the leaf entries, removing stale ones. 769 * Need to compact the leaf entries, removing stale ones.
@@ -1198,7 +1198,7 @@ xfs_dir2_leaf_lookup(
1198 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen); 1198 error = xfs_dir_cilookup_result(args, dep->name, dep->namelen);
1199 xfs_trans_brelse(tp, dbp); 1199 xfs_trans_brelse(tp, dbp);
1200 xfs_trans_brelse(tp, lbp); 1200 xfs_trans_brelse(tp, lbp);
1201 return XFS_ERROR(error); 1201 return error;
1202} 1202}
1203 1203
1204/* 1204/*
@@ -1327,13 +1327,13 @@ xfs_dir2_leaf_lookup_int(
1327 return 0; 1327 return 0;
1328 } 1328 }
1329 /* 1329 /*
1330 * No match found, return ENOENT. 1330 * No match found, return -ENOENT.
1331 */ 1331 */
1332 ASSERT(cidb == -1); 1332 ASSERT(cidb == -1);
1333 if (dbp) 1333 if (dbp)
1334 xfs_trans_brelse(tp, dbp); 1334 xfs_trans_brelse(tp, dbp);
1335 xfs_trans_brelse(tp, lbp); 1335 xfs_trans_brelse(tp, lbp);
1336 return XFS_ERROR(ENOENT); 1336 return -ENOENT;
1337} 1337}
1338 1338
1339/* 1339/*
@@ -1440,7 +1440,7 @@ xfs_dir2_leaf_removename(
1440 * Just go on, returning success, leaving the 1440 * Just go on, returning success, leaving the
1441 * empty block in place. 1441 * empty block in place.
1442 */ 1442 */
1443 if (error == ENOSPC && args->total == 0) 1443 if (error == -ENOSPC && args->total == 0)
1444 error = 0; 1444 error = 0;
1445 xfs_dir3_leaf_check(dp, lbp); 1445 xfs_dir3_leaf_check(dp, lbp);
1446 return error; 1446 return error;
@@ -1641,7 +1641,7 @@ xfs_dir2_leaf_trim_data(
1641 * Get rid of the data block. 1641 * Get rid of the data block.
1642 */ 1642 */
1643 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) { 1643 if ((error = xfs_dir2_shrink_inode(args, db, dbp))) {
1644 ASSERT(error != ENOSPC); 1644 ASSERT(error != -ENOSPC);
1645 xfs_trans_brelse(tp, dbp); 1645 xfs_trans_brelse(tp, dbp);
1646 return error; 1646 return error;
1647 } 1647 }
@@ -1815,7 +1815,7 @@ xfs_dir2_node_to_leaf(
1815 * punching out the middle of an extent, and this is an 1815 * punching out the middle of an extent, and this is an
1816 * isolated block. 1816 * isolated block.
1817 */ 1817 */
1818 ASSERT(error != ENOSPC); 1818 ASSERT(error != -ENOSPC);
1819 return error; 1819 return error;
1820 } 1820 }
1821 fbp = NULL; 1821 fbp = NULL;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index da43d304fca2..2ae6ac2c11ae 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -117,9 +117,9 @@ xfs_dir3_free_read_verify(
117 117
118 if (xfs_sb_version_hascrc(&mp->m_sb) && 118 if (xfs_sb_version_hascrc(&mp->m_sb) &&
119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) 119 !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF))
120 xfs_buf_ioerror(bp, EFSBADCRC); 120 xfs_buf_ioerror(bp, -EFSBADCRC);
121 else if (!xfs_dir3_free_verify(bp)) 121 else if (!xfs_dir3_free_verify(bp))
122 xfs_buf_ioerror(bp, EFSCORRUPTED); 122 xfs_buf_ioerror(bp, -EFSCORRUPTED);
123 123
124 if (bp->b_error) 124 if (bp->b_error)
125 xfs_verifier_error(bp); 125 xfs_verifier_error(bp);
@@ -134,7 +134,7 @@ xfs_dir3_free_write_verify(
134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; 134 struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
135 135
136 if (!xfs_dir3_free_verify(bp)) { 136 if (!xfs_dir3_free_verify(bp)) {
137 xfs_buf_ioerror(bp, EFSCORRUPTED); 137 xfs_buf_ioerror(bp, -EFSCORRUPTED);
138 xfs_verifier_error(bp); 138 xfs_verifier_error(bp);
139 return; 139 return;
140 } 140 }
@@ -406,7 +406,7 @@ xfs_dir2_leafn_add(
406 * into other peoples memory 406 * into other peoples memory
407 */ 407 */
408 if (index < 0) 408 if (index < 0)
409 return XFS_ERROR(EFSCORRUPTED); 409 return -EFSCORRUPTED;
410 410
411 /* 411 /*
412 * If there are already the maximum number of leaf entries in 412 * If there are already the maximum number of leaf entries in
@@ -417,7 +417,7 @@ xfs_dir2_leafn_add(
417 417
418 if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) { 418 if (leafhdr.count == dp->d_ops->leaf_max_ents(args->geo)) {
419 if (!leafhdr.stale) 419 if (!leafhdr.stale)
420 return XFS_ERROR(ENOSPC); 420 return -ENOSPC;
421 compact = leafhdr.stale > 1; 421 compact = leafhdr.stale > 1;
422 } else 422 } else
423 compact = 0; 423 compact = 0;
@@ -629,7 +629,7 @@ xfs_dir2_leafn_lookup_for_addname(
629 XFS_ERRLEVEL_LOW, mp); 629 XFS_ERRLEVEL_LOW, mp);
630 if (curfdb != newfdb) 630 if (curfdb != newfdb)
631 xfs_trans_brelse(tp, curbp); 631 xfs_trans_brelse(tp, curbp);
632 return XFS_ERROR(EFSCORRUPTED); 632 return -EFSCORRUPTED;
633 } 633 }
634 curfdb = newfdb; 634 curfdb = newfdb;
635 if (be16_to_cpu(bests[fi]) >= length) 635 if (be16_to_cpu(bests[fi]) >= length)
@@ -660,7 +660,7 @@ out:
660 * Return the index, that will be the insertion point. 660 * Return the index, that will be the insertion point.
661 */ 661 */
662 *indexp = index; 662 *indexp = index;
663 return XFS_ERROR(ENOENT); 663 return -ENOENT;
664} 664}
665 665
666/* 666/*
@@ -789,7 +789,7 @@ xfs_dir2_leafn_lookup_for_entry(
789 curbp->b_ops = &xfs_dir3_data_buf_ops; 789 curbp->b_ops = &xfs_dir3_data_buf_ops;
790 xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF); 790 xfs_trans_buf_set_type(tp, curbp, XFS_BLFT_DIR_DATA_BUF);
791 if (cmp == XFS_CMP_EXACT) 791 if (cmp == XFS_CMP_EXACT)
792 return XFS_ERROR(EEXIST); 792 return -EEXIST;
793 } 793 }
794 } 794 }
795 ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT)); 795 ASSERT(index == leafhdr.count || (args->op_flags & XFS_DA_OP_OKNOENT));
@@ -812,7 +812,7 @@ xfs_dir2_leafn_lookup_for_entry(
812 state->extravalid = 0; 812 state->extravalid = 0;
813 } 813 }
814 *indexp = index; 814 *indexp = index;
815 return XFS_ERROR(ENOENT); 815 return -ENOENT;
816} 816}
817 817
818/* 818/*
@@ -1133,7 +1133,7 @@ xfs_dir3_data_block_free(
1133 if (error == 0) { 1133 if (error == 0) {
1134 fbp = NULL; 1134 fbp = NULL;
1135 logfree = 0; 1135 logfree = 0;
1136 } else if (error != ENOSPC || args->total != 0) 1136 } else if (error != -ENOSPC || args->total != 0)
1137 return error; 1137 return error;
1138 /* 1138 /*
1139 * It's possible to get ENOSPC if there is no 1139 * It's possible to get ENOSPC if there is no
@@ -1287,7 +1287,7 @@ xfs_dir2_leafn_remove(
1287 * In this case just drop the buffer and some one else 1287 * In this case just drop the buffer and some one else
1288 * will eventually get rid of the empty block. 1288 * will eventually get rid of the empty block.
1289 */ 1289 */
1290 else if (!(error == ENOSPC && args->total == 0)) 1290 else if (!(error == -ENOSPC && args->total == 0))
1291 return error; 1291 return error;
1292 } 1292 }
1293 /* 1293 /*
@@ -1599,7 +1599,7 @@ xfs_dir2_node_addname(
1599 error = xfs_da3_node_lookup_int(state, &rval); 1599 error = xfs_da3_node_lookup_int(state, &rval);
1600 if (error) 1600 if (error)
1601 rval = error; 1601 rval = error;
1602 if (rval != ENOENT) { 1602 if (rval != -ENOENT) {
1603 goto done; 1603 goto done;
1604 } 1604 }
1605 /* 1605 /*
@@ -1628,7 +1628,7 @@ xfs_dir2_node_addname(
1628 * It didn't work, we need to split the leaf block. 1628 * It didn't work, we need to split the leaf block.
1629 */ 1629 */
1630 if (args->total == 0) { 1630 if (args->total == 0) {
1631 ASSERT(rval == ENOSPC); 1631 ASSERT(rval == -ENOSPC);
1632 goto done; 1632 goto done;
1633 } 1633 }
1634 /* 1634 /*
@@ -1815,7 +1815,7 @@ xfs_dir2_node_addname_int(
1815 * Not allowed to allocate, return failure. 1815 * Not allowed to allocate, return failure.
1816 */ 1816 */
1817 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) 1817 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
1818 return XFS_ERROR(ENOSPC); 1818 return -ENOSPC;
1819 1819
1820 /* 1820 /*
1821 * Allocate and initialize the new data block. 1821 * Allocate and initialize the new data block.
@@ -1876,7 +1876,7 @@ xfs_dir2_node_addname_int(
1876 } 1876 }
1877 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1877 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1878 XFS_ERRLEVEL_LOW, mp); 1878 XFS_ERRLEVEL_LOW, mp);
1879 return XFS_ERROR(EFSCORRUPTED); 1879 return -EFSCORRUPTED;
1880 } 1880 }
1881 1881
1882 /* 1882 /*
@@ -2042,8 +2042,8 @@ xfs_dir2_node_lookup(
2042 error = xfs_da3_node_lookup_int(state, &rval); 2042 error = xfs_da3_node_lookup_int(state, &rval);
2043 if (error) 2043 if (error)
2044 rval = error; 2044 rval = error;
2045 else if (rval == ENOENT && args->cmpresult == XFS_CMP_CASE) { 2045 else if (rval == -ENOENT && args->cmpresult == XFS_CMP_CASE) {
2046 /* If a CI match, dup the actual name and return EEXIST */ 2046 /* If a CI match, dup the actual name and return -EEXIST */
2047 xfs_dir2_data_entry_t *dep; 2047 xfs_dir2_data_entry_t *dep;
2048 2048
2049 dep = (xfs_dir2_data_entry_t *) 2049 dep = (xfs_dir2_data_entry_t *)
@@ -2096,7 +2096,7 @@ xfs_dir2_node_removename(
2096 goto out_free; 2096 goto out_free;
2097 2097
2098 /* Didn't find it, upper layer screwed up. */ 2098 /* Didn't find it, upper layer screwed up. */
2099 if (rval != EEXIST) { 2099 if (rval != -EEXIST) {
2100 error = rval; 2100 error = rval;
2101 goto out_free; 2101 goto out_free;
2102 } 2102 }
@@ -2169,7 +2169,7 @@ xfs_dir2_node_replace(
2169 * It should be found, since the vnodeops layer has looked it up 2169 * It should be found, since the vnodeops layer has looked it up
2170 * and locked it. But paranoia is good. 2170 * and locked it. But paranoia is good.
2171 */ 2171 */
2172 if (rval == EEXIST) { 2172 if (rval == -EEXIST) {
2173 struct xfs_dir2_leaf_entry *ents; 2173 struct xfs_dir2_leaf_entry *ents;
2174 /* 2174 /*
2175 * Find the leaf entry. 2175 * Find the leaf entry.
@@ -2272,7 +2272,7 @@ xfs_dir2_node_trim_free(
2272 * space reservation, when breaking up an extent into two 2272 * space reservation, when breaking up an extent into two
2273 * pieces. This is the last block of an extent. 2273 * pieces. This is the last block of an extent.
2274 */ 2274 */
2275 ASSERT(error != ENOSPC); 2275 ASSERT(error != -ENOSPC);
2276 xfs_trans_brelse(tp, bp); 2276 xfs_trans_brelse(tp, bp);
2277 return error; 2277 return error;
2278 } 2278 }
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 27ce0794d196..27ce0794d196 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
diff --git a/fs/xfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 53c3be619db5..5079e051ef08 100644
--- a/fs/xfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -51,10 +51,9 @@ static void xfs_dir2_sf_check(xfs_da_args_t *args);
51#else 51#else
52#define xfs_dir2_sf_check(args) 52#define xfs_dir2_sf_check(args)
53#endif /* DEBUG */ 53#endif /* DEBUG */
54#if XFS_BIG_INUMS 54
55static void xfs_dir2_sf_toino4(xfs_da_args_t *args); 55static void xfs_dir2_sf_toino4(xfs_da_args_t *args);
56static void xfs_dir2_sf_toino8(xfs_da_args_t *args); 56static void xfs_dir2_sf_toino8(xfs_da_args_t *args);
57#endif /* XFS_BIG_INUMS */
58 57
59/* 58/*
60 * Given a block directory (dp/block), calculate its size as a shortform (sf) 59 * Given a block directory (dp/block), calculate its size as a shortform (sf)
@@ -117,10 +116,10 @@ xfs_dir2_block_sfsize(
117 isdotdot = 116 isdotdot =
118 dep->namelen == 2 && 117 dep->namelen == 2 &&
119 dep->name[0] == '.' && dep->name[1] == '.'; 118 dep->name[0] == '.' && dep->name[1] == '.';
120#if XFS_BIG_INUMS 119
121 if (!isdot) 120 if (!isdot)
122 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM; 121 i8count += be64_to_cpu(dep->inumber) > XFS_DIR2_MAX_SHORT_INUM;
123#endif 122
124 /* take into account the file type field */ 123 /* take into account the file type field */
125 if (!isdot && !isdotdot) { 124 if (!isdot && !isdotdot) {
126 count++; 125 count++;
@@ -251,7 +250,7 @@ xfs_dir2_block_to_sf(
251 logflags = XFS_ILOG_CORE; 250 logflags = XFS_ILOG_CORE;
252 error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp); 251 error = xfs_dir2_shrink_inode(args, args->geo->datablk, bp);
253 if (error) { 252 if (error) {
254 ASSERT(error != ENOSPC); 253 ASSERT(error != -ENOSPC);
255 goto out; 254 goto out;
256 } 255 }
257 256
@@ -299,7 +298,7 @@ xfs_dir2_sf_addname(
299 298
300 trace_xfs_dir2_sf_addname(args); 299 trace_xfs_dir2_sf_addname(args);
301 300
302 ASSERT(xfs_dir2_sf_lookup(args) == ENOENT); 301 ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
303 dp = args->dp; 302 dp = args->dp;
304 ASSERT(dp->i_df.if_flags & XFS_IFINLINE); 303 ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
305 /* 304 /*
@@ -307,7 +306,7 @@ xfs_dir2_sf_addname(
307 */ 306 */
308 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 307 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
309 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 308 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
310 return XFS_ERROR(EIO); 309 return -EIO;
311 } 310 }
312 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 311 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
313 ASSERT(dp->i_df.if_u1.if_data != NULL); 312 ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -318,7 +317,7 @@ xfs_dir2_sf_addname(
318 */ 317 */
319 incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen); 318 incr_isize = dp->d_ops->sf_entsize(sfp, args->namelen);
320 objchange = 0; 319 objchange = 0;
321#if XFS_BIG_INUMS 320
322 /* 321 /*
323 * Do we have to change to 8 byte inodes? 322 * Do we have to change to 8 byte inodes?
324 */ 323 */
@@ -332,7 +331,7 @@ xfs_dir2_sf_addname(
332 (uint)sizeof(xfs_dir2_ino4_t)); 331 (uint)sizeof(xfs_dir2_ino4_t));
333 objchange = 1; 332 objchange = 1;
334 } 333 }
335#endif 334
336 new_isize = (int)dp->i_d.di_size + incr_isize; 335 new_isize = (int)dp->i_d.di_size + incr_isize;
337 /* 336 /*
338 * Won't fit as shortform any more (due to size), 337 * Won't fit as shortform any more (due to size),
@@ -345,7 +344,7 @@ xfs_dir2_sf_addname(
345 * Just checking or no space reservation, it doesn't fit. 344 * Just checking or no space reservation, it doesn't fit.
346 */ 345 */
347 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) 346 if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
348 return XFS_ERROR(ENOSPC); 347 return -ENOSPC;
349 /* 348 /*
350 * Convert to block form then add the name. 349 * Convert to block form then add the name.
351 */ 350 */
@@ -370,10 +369,8 @@ xfs_dir2_sf_addname(
370 */ 369 */
371 else { 370 else {
372 ASSERT(pick == 2); 371 ASSERT(pick == 2);
373#if XFS_BIG_INUMS
374 if (objchange) 372 if (objchange)
375 xfs_dir2_sf_toino8(args); 373 xfs_dir2_sf_toino8(args);
376#endif
377 xfs_dir2_sf_addname_hard(args, objchange, new_isize); 374 xfs_dir2_sf_addname_hard(args, objchange, new_isize);
378 } 375 }
379 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 376 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
@@ -425,10 +422,8 @@ xfs_dir2_sf_addname_easy(
425 * Update the header and inode. 422 * Update the header and inode.
426 */ 423 */
427 sfp->count++; 424 sfp->count++;
428#if XFS_BIG_INUMS
429 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM) 425 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM)
430 sfp->i8count++; 426 sfp->i8count++;
431#endif
432 dp->i_d.di_size = new_isize; 427 dp->i_d.di_size = new_isize;
433 xfs_dir2_sf_check(args); 428 xfs_dir2_sf_check(args);
434} 429}
@@ -516,10 +511,8 @@ xfs_dir2_sf_addname_hard(
516 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); 511 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
517 dp->d_ops->sf_put_ftype(sfep, args->filetype); 512 dp->d_ops->sf_put_ftype(sfep, args->filetype);
518 sfp->count++; 513 sfp->count++;
519#if XFS_BIG_INUMS
520 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange) 514 if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && !objchange)
521 sfp->i8count++; 515 sfp->i8count++;
522#endif
523 /* 516 /*
524 * If there's more left to copy, do that. 517 * If there's more left to copy, do that.
525 */ 518 */
@@ -593,13 +586,8 @@ xfs_dir2_sf_addname_pick(
593 /* 586 /*
594 * If changing the inode number size, do it the hard way. 587 * If changing the inode number size, do it the hard way.
595 */ 588 */
596#if XFS_BIG_INUMS 589 if (objchange)
597 if (objchange) {
598 return 2; 590 return 2;
599 }
600#else
601 ASSERT(objchange == 0);
602#endif
603 /* 591 /*
604 * If it won't fit at the end then do it the hard way (use the hole). 592 * If it won't fit at the end then do it the hard way (use the hole).
605 */ 593 */
@@ -650,7 +638,6 @@ xfs_dir2_sf_check(
650 ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX); 638 ASSERT(dp->d_ops->sf_get_ftype(sfep) < XFS_DIR3_FT_MAX);
651 } 639 }
652 ASSERT(i8count == sfp->i8count); 640 ASSERT(i8count == sfp->i8count);
653 ASSERT(XFS_BIG_INUMS || i8count == 0);
654 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size); 641 ASSERT((char *)sfep - (char *)sfp == dp->i_d.di_size);
655 ASSERT(offset + 642 ASSERT(offset +
656 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) + 643 (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
@@ -738,7 +725,7 @@ xfs_dir2_sf_lookup(
738 */ 725 */
739 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 726 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
740 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 727 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
741 return XFS_ERROR(EIO); 728 return -EIO;
742 } 729 }
743 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 730 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
744 ASSERT(dp->i_df.if_u1.if_data != NULL); 731 ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -751,7 +738,7 @@ xfs_dir2_sf_lookup(
751 args->inumber = dp->i_ino; 738 args->inumber = dp->i_ino;
752 args->cmpresult = XFS_CMP_EXACT; 739 args->cmpresult = XFS_CMP_EXACT;
753 args->filetype = XFS_DIR3_FT_DIR; 740 args->filetype = XFS_DIR3_FT_DIR;
754 return XFS_ERROR(EEXIST); 741 return -EEXIST;
755 } 742 }
756 /* 743 /*
757 * Special case for .. 744 * Special case for ..
@@ -761,7 +748,7 @@ xfs_dir2_sf_lookup(
761 args->inumber = dp->d_ops->sf_get_parent_ino(sfp); 748 args->inumber = dp->d_ops->sf_get_parent_ino(sfp);
762 args->cmpresult = XFS_CMP_EXACT; 749 args->cmpresult = XFS_CMP_EXACT;
763 args->filetype = XFS_DIR3_FT_DIR; 750 args->filetype = XFS_DIR3_FT_DIR;
764 return XFS_ERROR(EEXIST); 751 return -EEXIST;
765 } 752 }
766 /* 753 /*
767 * Loop over all the entries trying to match ours. 754 * Loop over all the entries trying to match ours.
@@ -781,20 +768,20 @@ xfs_dir2_sf_lookup(
781 args->inumber = dp->d_ops->sf_get_ino(sfp, sfep); 768 args->inumber = dp->d_ops->sf_get_ino(sfp, sfep);
782 args->filetype = dp->d_ops->sf_get_ftype(sfep); 769 args->filetype = dp->d_ops->sf_get_ftype(sfep);
783 if (cmp == XFS_CMP_EXACT) 770 if (cmp == XFS_CMP_EXACT)
784 return XFS_ERROR(EEXIST); 771 return -EEXIST;
785 ci_sfep = sfep; 772 ci_sfep = sfep;
786 } 773 }
787 } 774 }
788 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 775 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
789 /* 776 /*
790 * Here, we can only be doing a lookup (not a rename or replace). 777 * Here, we can only be doing a lookup (not a rename or replace).
791 * If a case-insensitive match was not found, return ENOENT. 778 * If a case-insensitive match was not found, return -ENOENT.
792 */ 779 */
793 if (!ci_sfep) 780 if (!ci_sfep)
794 return XFS_ERROR(ENOENT); 781 return -ENOENT;
795 /* otherwise process the CI match as required by the caller */ 782 /* otherwise process the CI match as required by the caller */
796 error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen); 783 error = xfs_dir_cilookup_result(args, ci_sfep->name, ci_sfep->namelen);
797 return XFS_ERROR(error); 784 return error;
798} 785}
799 786
800/* 787/*
@@ -824,7 +811,7 @@ xfs_dir2_sf_removename(
824 */ 811 */
825 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) { 812 if (oldsize < offsetof(xfs_dir2_sf_hdr_t, parent)) {
826 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 813 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
827 return XFS_ERROR(EIO); 814 return -EIO;
828 } 815 }
829 ASSERT(dp->i_df.if_bytes == oldsize); 816 ASSERT(dp->i_df.if_bytes == oldsize);
830 ASSERT(dp->i_df.if_u1.if_data != NULL); 817 ASSERT(dp->i_df.if_u1.if_data != NULL);
@@ -847,7 +834,7 @@ xfs_dir2_sf_removename(
847 * Didn't find it. 834 * Didn't find it.
848 */ 835 */
849 if (i == sfp->count) 836 if (i == sfp->count)
850 return XFS_ERROR(ENOENT); 837 return -ENOENT;
851 /* 838 /*
852 * Calculate sizes. 839 * Calculate sizes.
853 */ 840 */
@@ -870,7 +857,6 @@ xfs_dir2_sf_removename(
870 */ 857 */
871 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK); 858 xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
872 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 859 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
873#if XFS_BIG_INUMS
874 /* 860 /*
875 * Are we changing inode number size? 861 * Are we changing inode number size?
876 */ 862 */
@@ -880,7 +866,6 @@ xfs_dir2_sf_removename(
880 else 866 else
881 sfp->i8count--; 867 sfp->i8count--;
882 } 868 }
883#endif
884 xfs_dir2_sf_check(args); 869 xfs_dir2_sf_check(args);
885 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 870 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
886 return 0; 871 return 0;
@@ -895,12 +880,8 @@ xfs_dir2_sf_replace(
895{ 880{
896 xfs_inode_t *dp; /* incore directory inode */ 881 xfs_inode_t *dp; /* incore directory inode */
897 int i; /* entry index */ 882 int i; /* entry index */
898#if XFS_BIG_INUMS || defined(DEBUG)
899 xfs_ino_t ino=0; /* entry old inode number */ 883 xfs_ino_t ino=0; /* entry old inode number */
900#endif
901#if XFS_BIG_INUMS
902 int i8elevated; /* sf_toino8 set i8count=1 */ 884 int i8elevated; /* sf_toino8 set i8count=1 */
903#endif
904 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */ 885 xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
905 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */ 886 xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
906 887
@@ -914,13 +895,13 @@ xfs_dir2_sf_replace(
914 */ 895 */
915 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) { 896 if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
916 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount)); 897 ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
917 return XFS_ERROR(EIO); 898 return -EIO;
918 } 899 }
919 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size); 900 ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
920 ASSERT(dp->i_df.if_u1.if_data != NULL); 901 ASSERT(dp->i_df.if_u1.if_data != NULL);
921 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 902 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
922 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count)); 903 ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
923#if XFS_BIG_INUMS 904
924 /* 905 /*
925 * New inode number is large, and need to convert to 8-byte inodes. 906 * New inode number is large, and need to convert to 8-byte inodes.
926 */ 907 */
@@ -951,17 +932,15 @@ xfs_dir2_sf_replace(
951 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data; 932 sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
952 } else 933 } else
953 i8elevated = 0; 934 i8elevated = 0;
954#endif 935
955 ASSERT(args->namelen != 1 || args->name[0] != '.'); 936 ASSERT(args->namelen != 1 || args->name[0] != '.');
956 /* 937 /*
957 * Replace ..'s entry. 938 * Replace ..'s entry.
958 */ 939 */
959 if (args->namelen == 2 && 940 if (args->namelen == 2 &&
960 args->name[0] == '.' && args->name[1] == '.') { 941 args->name[0] == '.' && args->name[1] == '.') {
961#if XFS_BIG_INUMS || defined(DEBUG)
962 ino = dp->d_ops->sf_get_parent_ino(sfp); 942 ino = dp->d_ops->sf_get_parent_ino(sfp);
963 ASSERT(args->inumber != ino); 943 ASSERT(args->inumber != ino);
964#endif
965 dp->d_ops->sf_put_parent_ino(sfp, args->inumber); 944 dp->d_ops->sf_put_parent_ino(sfp, args->inumber);
966 } 945 }
967 /* 946 /*
@@ -972,10 +951,8 @@ xfs_dir2_sf_replace(
972 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) { 951 i++, sfep = dp->d_ops->sf_nextentry(sfp, sfep)) {
973 if (xfs_da_compname(args, sfep->name, sfep->namelen) == 952 if (xfs_da_compname(args, sfep->name, sfep->namelen) ==
974 XFS_CMP_EXACT) { 953 XFS_CMP_EXACT) {
975#if XFS_BIG_INUMS || defined(DEBUG)
976 ino = dp->d_ops->sf_get_ino(sfp, sfep); 954 ino = dp->d_ops->sf_get_ino(sfp, sfep);
977 ASSERT(args->inumber != ino); 955 ASSERT(args->inumber != ino);
978#endif
979 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber); 956 dp->d_ops->sf_put_ino(sfp, sfep, args->inumber);
980 dp->d_ops->sf_put_ftype(sfep, args->filetype); 957 dp->d_ops->sf_put_ftype(sfep, args->filetype);
981 break; 958 break;
@@ -986,14 +963,11 @@ xfs_dir2_sf_replace(
986 */ 963 */
987 if (i == sfp->count) { 964 if (i == sfp->count) {
988 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT); 965 ASSERT(args->op_flags & XFS_DA_OP_OKNOENT);
989#if XFS_BIG_INUMS
990 if (i8elevated) 966 if (i8elevated)
991 xfs_dir2_sf_toino4(args); 967 xfs_dir2_sf_toino4(args);
992#endif 968 return -ENOENT;
993 return XFS_ERROR(ENOENT);
994 } 969 }
995 } 970 }
996#if XFS_BIG_INUMS
997 /* 971 /*
998 * See if the old number was large, the new number is small. 972 * See if the old number was large, the new number is small.
999 */ 973 */
@@ -1020,13 +994,11 @@ xfs_dir2_sf_replace(
1020 if (!i8elevated) 994 if (!i8elevated)
1021 sfp->i8count++; 995 sfp->i8count++;
1022 } 996 }
1023#endif
1024 xfs_dir2_sf_check(args); 997 xfs_dir2_sf_check(args);
1025 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA); 998 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_DDATA);
1026 return 0; 999 return 0;
1027} 1000}
1028 1001
1029#if XFS_BIG_INUMS
1030/* 1002/*
1031 * Convert from 8-byte inode numbers to 4-byte inode numbers. 1003 * Convert from 8-byte inode numbers to 4-byte inode numbers.
1032 * The last 8-byte inode number is gone, but the count is still 1. 1004 * The last 8-byte inode number is gone, but the count is still 1.
@@ -1181,4 +1153,3 @@ xfs_dir2_sf_toino8(
1181 dp->i_d.di_size = newsize; 1153 dp->i_d.di_size = newsize;
1182 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA); 1154 xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
1183} 1155}
1184#endif /* XFS_BIG_INUMS */
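
All of the #if XFS_BIG_INUMS guards in the shortform code go away, so the i8count bookkeeping and the xfs_dir2_sf_toino4()/xfs_dir2_sf_toino8() converters are now compiled unconditionally. A sketch of the bookkeeping that is now always active (structure reduced to the two counters; the limit is a stand-in for XFS_DIR2_MAX_SHORT_INUM):

	#include <stdint.h>

	#define MAX_SHORT_INUM 0xffffffffULL

	struct sf_hdr {
		uint8_t count;		/* entries in the shortform directory */
		uint8_t i8count;	/* entries needing 8-byte inode numbers */
	};

	/* Adding an entry whose inumber exceeds 32 bits bumps i8count, which
	 * selects the 8-byte on-disk inode field width; when i8count drops
	 * back to zero, toino4() can shrink the format again. */
	static void sf_add(struct sf_hdr *sfp, uint64_t inumber)
	{
		sfp->count++;
		if (inumber > MAX_SHORT_INUM)
			sfp->i8count++;
	}
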
diff --git a/fs/xfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index c2ac0c611ad8..bb969337efc8 100644
--- a/fs/xfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -257,9 +257,9 @@ xfs_dquot_buf_read_verify(
257 struct xfs_mount *mp = bp->b_target->bt_mount; 257 struct xfs_mount *mp = bp->b_target->bt_mount;
258 258
259 if (!xfs_dquot_buf_verify_crc(mp, bp)) 259 if (!xfs_dquot_buf_verify_crc(mp, bp))
260 xfs_buf_ioerror(bp, EFSBADCRC); 260 xfs_buf_ioerror(bp, -EFSBADCRC);
261 else if (!xfs_dquot_buf_verify(mp, bp)) 261 else if (!xfs_dquot_buf_verify(mp, bp))
262 xfs_buf_ioerror(bp, EFSCORRUPTED); 262 xfs_buf_ioerror(bp, -EFSCORRUPTED);
263 263
264 if (bp->b_error) 264 if (bp->b_error)
265 xfs_verifier_error(bp); 265 xfs_verifier_error(bp);
@@ -277,7 +277,7 @@ xfs_dquot_buf_write_verify(
277 struct xfs_mount *mp = bp->b_target->bt_mount; 277 struct xfs_mount *mp = bp->b_target->bt_mount;
278 278
279 if (!xfs_dquot_buf_verify(mp, bp)) { 279 if (!xfs_dquot_buf_verify(mp, bp)) {
280 xfs_buf_ioerror(bp, EFSCORRUPTED); 280 xfs_buf_ioerror(bp, -EFSCORRUPTED);
281 xfs_verifier_error(bp); 281 xfs_verifier_error(bp);
282 return; 282 return;
283 } 283 }
diff --git a/fs/xfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 34d85aca3058..7e42bba9a420 100644
--- a/fs/xfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -68,11 +68,7 @@ struct xfs_ifork;
68#define XFS_RTLOBIT(w) xfs_lowbit32(w) 68#define XFS_RTLOBIT(w) xfs_lowbit32(w)
69#define XFS_RTHIBIT(w) xfs_highbit32(w) 69#define XFS_RTHIBIT(w) xfs_highbit32(w)
70 70
71#if XFS_BIG_BLKNOS
72#define XFS_RTBLOCKLOG(b) xfs_highbit64(b) 71#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
73#else
74#define XFS_RTBLOCKLOG(b) xfs_highbit32(b)
75#endif
76 72
77/* 73/*
78 * Dquot and dquot block format definitions 74 * Dquot and dquot block format definitions
@@ -304,23 +300,15 @@ typedef struct xfs_bmbt_rec_host {
304 * Values and macros for delayed-allocation startblock fields. 300 * Values and macros for delayed-allocation startblock fields.
305 */ 301 */
306#define STARTBLOCKVALBITS 17 302#define STARTBLOCKVALBITS 17
307#define STARTBLOCKMASKBITS (15 + XFS_BIG_BLKNOS * 20) 303#define STARTBLOCKMASKBITS (15 + 20)
308#define DSTARTBLOCKMASKBITS (15 + 20)
309#define STARTBLOCKMASK \ 304#define STARTBLOCKMASK \
310 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS) 305 (((((xfs_fsblock_t)1) << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
311#define DSTARTBLOCKMASK \
312 (((((xfs_dfsbno_t)1) << DSTARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)
313 306
314static inline int isnullstartblock(xfs_fsblock_t x) 307static inline int isnullstartblock(xfs_fsblock_t x)
315{ 308{
316 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK; 309 return ((x) & STARTBLOCKMASK) == STARTBLOCKMASK;
317} 310}
318 311
319static inline int isnulldstartblock(xfs_dfsbno_t x)
320{
321 return ((x) & DSTARTBLOCKMASK) == DSTARTBLOCKMASK;
322}
323
324static inline xfs_fsblock_t nullstartblock(int k) 312static inline xfs_fsblock_t nullstartblock(int k)
325{ 313{
326 ASSERT(k < (1 << STARTBLOCKVALBITS)); 314 ASSERT(k < (1 << STARTBLOCKVALBITS));
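
With XFS_BIG_BLKNOS gone, STARTBLOCKMASKBITS is unconditionally 15 + 20 = 35, and the separate on-disk DSTARTBLOCK variants disappear along with isnulldstartblock(). A self-contained round-trip sketch of the encoding, assuming (the hunk cuts nullstartblock() off after the ASSERT) that a null startblock is all 35 mask bits set above the 17-bit value field:

	#include <assert.h>
	#include <stdint.h>

	typedef uint64_t fsblock_t;

	#define STARTBLOCKVALBITS  17
	#define STARTBLOCKMASKBITS (15 + 20)		/* 35, now unconditional */
	#define STARTBLOCKMASK \
		((((fsblock_t)1 << STARTBLOCKMASKBITS) - 1) << STARTBLOCKVALBITS)

	static int isnullstartblock(fsblock_t x)
	{
		return (x & STARTBLOCKMASK) == STARTBLOCKMASK;
	}

	/* Reconstructed body: pack the 17-bit value k under the all-ones mask. */
	static fsblock_t nullstartblock(int k)
	{
		assert(k < (1 << STARTBLOCKVALBITS));
		return STARTBLOCKMASK | (fsblock_t)k;
	}

	int main(void)
	{
		assert(isnullstartblock(nullstartblock(5)));
		return 0;
	}
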
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 5960e5593fe0..23dcb72fc5e6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -292,7 +292,7 @@ xfs_ialloc_inode_init(
 					 mp->m_bsize * blks_per_cluster,
 					 XBF_UNMAPPED);
 		if (!fbuf)
-			return ENOMEM;
+			return -ENOMEM;
 
 		/* Initialize the inode buffers and log them appropriately. */
 		fbuf->b_ops = &xfs_inode_buf_ops;
@@ -380,7 +380,7 @@ xfs_ialloc_ag_alloc(
 	newlen = args.mp->m_ialloc_inos;
 	if (args.mp->m_maxicount &&
 	    args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
-		return XFS_ERROR(ENOSPC);
+		return -ENOSPC;
 	args.minlen = args.maxlen = args.mp->m_ialloc_blks;
 	/*
 	 * First try to allocate inodes contiguous with the last-allocated
@@ -1076,8 +1076,8 @@ xfs_dialloc_ag_finobt_newino(
 	int i;
 
 	if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
-		error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ,
-				&i);
+		error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+					 XFS_LOOKUP_EQ, &i);
 		if (error)
 			return error;
 		if (i == 1) {
@@ -1085,7 +1085,6 @@
 			if (error)
 				return error;
 			XFS_WANT_CORRUPTED_RETURN(i == 1);
-
 			return 0;
 		}
 	}
@@ -1385,7 +1384,7 @@ xfs_dialloc(
 		if (error) {
 			xfs_trans_brelse(tp, agbp);
 
-			if (error != ENOSPC)
+			if (error != -ENOSPC)
 				goto out_error;
 
 			xfs_perag_put(pag);
@@ -1416,7 +1415,7 @@ nextag:
 			agno = 0;
 		if (agno == start_agno) {
 			*inop = NULLFSINO;
-			return noroom ? ENOSPC : 0;
+			return noroom ? -ENOSPC : 0;
 		}
 	}
 
@@ -1425,7 +1424,7 @@ out_alloc:
 	return xfs_dialloc_ag(tp, agbp, parent, inop);
 out_error:
 	xfs_perag_put(pag);
-	return XFS_ERROR(error);
+	return error;
 }
 
 STATIC int
@@ -1682,7 +1681,7 @@ xfs_difree(
 		xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
 			__func__, agno, mp->m_sb.sb_agcount);
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	agino = XFS_INO_TO_AGINO(mp, inode);
 	if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
@@ -1690,14 +1689,14 @@
 			__func__, (unsigned long long)inode,
 			(unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	agbno = XFS_AGINO_TO_AGBNO(mp, agino);
 	if (agbno >= mp->m_sb.sb_agblocks) {
 		xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
 			__func__, agbno, mp->m_sb.sb_agblocks);
 		ASSERT(0);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	/*
 	 * Get the allocation group header.
@@ -1769,7 +1768,7 @@ xfs_imap_lookup(
 		if (i)
 			error = xfs_inobt_get_rec(cur, &rec, &i);
 		if (!error && i == 0)
-			error = EINVAL;
+			error = -EINVAL;
 	}
 
 	xfs_trans_brelse(tp, agbp);
@@ -1780,12 +1779,12 @@
 	/* check that the returned record contains the required inode */
 	if (rec.ir_startino > agino ||
 	    rec.ir_startino + mp->m_ialloc_inos <= agino)
-		return EINVAL;
+		return -EINVAL;
 
 	/* for untrusted inodes check it is allocated first */
 	if ((flags & XFS_IGET_UNTRUSTED) &&
 	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
-		return EINVAL;
+		return -EINVAL;
 
 	*chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
 	*offset_agbno = agbno - *chunk_agbno;
@@ -1829,7 +1828,7 @@ xfs_imap(
 	 * as they can be invalid without implying corruption.
 	 */
 	if (flags & XFS_IGET_UNTRUSTED)
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if (agno >= mp->m_sb.sb_agcount) {
 		xfs_alert(mp,
 			"%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
@@ -1849,7 +1848,7 @@
 		}
 		xfs_stack_trace();
 #endif /* DEBUG */
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	blks_per_cluster = xfs_icluster_size_fsb(mp);
@@ -1922,7 +1921,7 @@ out_map:
 			__func__, (unsigned long long) imap->im_blkno,
 			(unsigned long long) imap->im_len,
 			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	return 0;
 }
@@ -2051,6 +2050,8 @@ xfs_agi_verify(
 	if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
 		return false;
 
+	if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
+		return false;
 	/*
 	 * during growfs operations, the perag is not fully initialised,
 	 * so we can't use it for any useful checking. growfs ensures we can't
@@ -2072,11 +2073,11 @@ xfs_agi_read_verify(
 
 	if (xfs_sb_version_hascrc(&mp->m_sb) &&
 	    !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
 				XFS_ERRTAG_IALLOC_READ_AGI,
 				XFS_RANDOM_IALLOC_READ_AGI))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -2090,7 +2091,7 @@ xfs_agi_write_verify(
 	struct xfs_buf_log_item	*bip = bp->b_fspriv;
 
 	if (!xfs_agi_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
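The pattern repeated through this file, XFS_ERROR(E...) and bare positive errnos becoming -ENOSPC, -EINVAL and friends, is the series-wide switch to the kernel's negative-errno convention. A hedged sketch of what that means for a caller (the function names here are placeholders, not from this patch):

/* Sketch only: internal and VFS-facing code now share one error sign. */
int example_caller(struct xfs_trans *tp)	/* hypothetical helper */
{
	int error;

	error = xfs_some_internal_op(tp);	/* hypothetical callee */
	if (error == -ENOSPC)			/* compare negative, as above */
		error = xfs_try_harder(tp);	/* hypothetical fallback */
	return error;	/* already negative; passed to the VFS unchanged */
}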
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 95ad1c002d60..95ad1c002d60 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 726f83a681a5..c9b06f30fe86 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -272,9 +272,9 @@ xfs_inobt_read_verify(
 	struct xfs_buf	*bp)
 {
 	if (!xfs_btree_sblock_verify_crc(bp))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_inobt_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
@@ -288,7 +288,7 @@ xfs_inobt_write_verify(
 {
 	if (!xfs_inobt_verify(bp)) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index d7ebea72c2d0..d7ebea72c2d0 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index cb35ae41d4a1..f18fd2da49f7 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -101,7 +101,7 @@ xfs_inode_buf_verify(
 			return;
 		}
 
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 #ifdef DEBUG
 		xfs_alert(mp,
@@ -174,14 +174,14 @@ xfs_imap_to_bp(
 				   (int)imap->im_len, buf_flags, &bp,
 				   &xfs_inode_buf_ops);
 	if (error) {
-		if (error == EAGAIN) {
+		if (error == -EAGAIN) {
 			ASSERT(buf_flags & XBF_TRYLOCK);
 			return error;
 		}
 
-		if (error == EFSCORRUPTED &&
+		if (error == -EFSCORRUPTED &&
 		    (iget_flags & XFS_IGET_UNTRUSTED))
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 
 		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 			__func__, error);
@@ -390,7 +390,7 @@ xfs_iread(
 			__func__, ip->i_ino);
 
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		goto out_brelse;
 	}
 
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 9308c47f2a52..9308c47f2a52 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
diff --git a/fs/xfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index b031e8d0d928..6a00f7fed69d 100644
--- a/fs/xfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -102,7 +102,7 @@ xfs_iformat_fork(
 			be64_to_cpu(dip->di_nblocks));
 		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
@@ -111,7 +111,7 @@ xfs_iformat_fork(
 			dip->di_forkoff);
 		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
@@ -121,7 +121,7 @@ xfs_iformat_fork(
 			ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
 				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	switch (ip->i_d.di_mode & S_IFMT) {
@@ -132,7 +132,7 @@ xfs_iformat_fork(
 		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
 			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
 					     ip->i_mount, dip);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		ip->i_d.di_size = 0;
 		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
@@ -153,7 +153,7 @@ xfs_iformat_fork(
 				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
 						     XFS_ERRLEVEL_LOW,
 						     ip->i_mount, dip);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 
 			di_size = be64_to_cpu(dip->di_size);
@@ -166,7 +166,7 @@ xfs_iformat_fork(
 				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
 						     XFS_ERRLEVEL_LOW,
 						     ip->i_mount, dip);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 
 			size = (int)di_size;
@@ -181,13 +181,13 @@ xfs_iformat_fork(
 		default:
 			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
 					 ip->i_mount);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 		break;
 
 	default:
 		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	if (error) {
 		return error;
@@ -211,7 +211,7 @@ xfs_iformat_fork(
 			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 					     XFS_ERRLEVEL_LOW,
 					     ip->i_mount, dip);
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -223,7 +223,7 @@ xfs_iformat_fork(
 		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
 		break;
 	default:
-		error = XFS_ERROR(EFSCORRUPTED);
+		error = -EFSCORRUPTED;
 		break;
 	}
 	if (error) {
@@ -266,7 +266,7 @@ xfs_iformat_local(
 			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
 		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	real_size = 0;
@@ -322,7 +322,7 @@ xfs_iformat_extents(
 			(unsigned long long) ip->i_ino, nex);
 		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
 				     ip->i_mount, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	ifp->if_real_bytes = 0;
@@ -350,7 +350,7 @@ xfs_iformat_extents(
 				XFS_ERROR_REPORT("xfs_iformat_extents(2)",
 						 XFS_ERRLEVEL_LOW,
 						 ip->i_mount);
-				return XFS_ERROR(EFSCORRUPTED);
+				return -EFSCORRUPTED;
 			}
 		}
 	ifp->if_flags |= XFS_IFEXTENTS;
@@ -399,7 +399,7 @@ xfs_iformat_btree(
 			(unsigned long long) ip->i_ino);
 		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
 				     mp, dip);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	ifp->if_broot_bytes = size;
@@ -436,7 +436,7 @@ xfs_iread_extents(
 	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
 		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
 				 ip->i_mount);
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -528,7 +528,7 @@ xfs_iroot_realloc(
 		ifp->if_broot_bytes = (int)new_size;
 		ASSERT(XFS_BMAP_BMDR_SPACE(ifp->if_broot) <=
 			XFS_IFORK_SIZE(ip, whichfork));
-		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
+		memmove(np, op, cur_max * (uint)sizeof(xfs_fsblock_t));
 		return;
 	}
 
@@ -575,7 +575,7 @@ xfs_iroot_realloc(
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
 						     (int)new_size);
-		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
+		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
 	}
 	kmem_free(ifp->if_broot);
 	ifp->if_broot = new_broot;
@@ -1692,7 +1692,7 @@ xfs_iext_idx_to_irec(
 	}
 	*idxp = page_idx;
 	*erp_idxp = erp_idx;
-	return(erp);
+	return erp;
 }
 
 /*
diff --git a/fs/xfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7d3b1ed6dcbe..7d3b1ed6dcbe 100644
--- a/fs/xfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/libxfs/xfs_inum.h
index 90efdaf1706f..4ff2278e147a 100644
--- a/fs/xfs/xfs_inum.h
+++ b/fs/xfs/libxfs/xfs_inum.h
@@ -54,11 +54,7 @@ struct xfs_mount;
 #define	XFS_OFFBNO_TO_AGINO(mp,b,o)	\
 	((xfs_agino_t)(((b) << XFS_INO_OFFSET_BITS(mp)) | (o)))
 
-#if XFS_BIG_INUMS
 #define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 56) - 1ULL))
-#else
-#define	XFS_MAXINUMBER		((xfs_ino_t)((1ULL << 32) - 1ULL))
-#endif
 #define	XFS_MAXINUMBER_32	((xfs_ino_t)((1ULL << 32) - 1ULL))
 
 #endif	/* __XFS_INUM_H__ */
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index f0969c77bdbe..aff12f2d4428 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -380,7 +380,7 @@ typedef struct xfs_icdinode {
 	xfs_ictimestamp_t di_mtime;	/* time last modified */
 	xfs_ictimestamp_t di_ctime;	/* time created/inode modified */
 	xfs_fsize_t	di_size;	/* number of bytes in file */
-	xfs_drfsbno_t	di_nblocks;	/* # of direct & btree blocks used */
+	xfs_rfsblock_t	di_nblocks;	/* # of direct & btree blocks used */
 	xfs_extlen_t	di_extsize;	/* basic/minimum extent size for file */
 	xfs_extnum_t	di_nextents;	/* number of extents in data fork */
 	xfs_aextnum_t	di_anextents;	/* number of extents in attribute fork*/
@@ -516,7 +516,7 @@ xfs_blft_from_flags(struct xfs_buf_log_format *blf)
  * EFI/EFD log format definitions
  */
 typedef struct xfs_extent {
-	xfs_dfsbno_t	ext_start;
+	xfs_fsblock_t	ext_start;
 	xfs_extlen_t	ext_len;
 } xfs_extent_t;
 
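The di_nblocks and ext_start changes above are part of a wider on-disk type rename visible across these hunks. A comment-sketch of the mapping, inferred from the substitutions in this series rather than quoted from any one header:

/*
 * On-disk block number types renamed in these diffs:
 *
 *	xfs_dfsbno_t	->	xfs_fsblock_t	(filesystem block number)
 *	xfs_drfsbno_t	->	xfs_rfsblock_t	(raw filesystem block count)
 *	xfs_drtbno_t	->	xfs_rtblock_t	(realtime block/extent number)
 */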
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb379d..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
diff --git a/fs/xfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c
index ee7e0e80246b..ee7e0e80246b 100644
--- a/fs/xfs/xfs_log_rlimit.c
+++ b/fs/xfs/libxfs/xfs_log_rlimit.c
diff --git a/fs/xfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 137e20937077..1b0a08379759 100644
--- a/fs/xfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -98,8 +98,6 @@ typedef __uint16_t	xfs_qwarncnt_t;
 #define XFS_IS_QUOTA_ON(mp)	((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \
 						   XFS_GQUOTA_ACTIVE | \
 						   XFS_PQUOTA_ACTIVE))
-#define XFS_IS_OQUOTA_ON(mp)	((mp)->m_qflags & (XFS_GQUOTA_ACTIVE | \
-						   XFS_PQUOTA_ACTIVE))
 #define XFS_IS_UQUOTA_ON(mp)	((mp)->m_qflags & XFS_UQUOTA_ACTIVE)
 #define XFS_IS_GQUOTA_ON(mp)	((mp)->m_qflags & XFS_GQUOTA_ACTIVE)
 #define XFS_IS_PQUOTA_ON(mp)	((mp)->m_qflags & XFS_PQUOTA_ACTIVE)
diff --git a/fs/xfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index f4dd697cac08..7c818f1e4484 100644
--- a/fs/xfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -424,20 +424,24 @@ xfs_rtfind_forw(
 }
 
 /*
- * Read and modify the summary information for a given extent size,
+ * Read and/or modify the summary information for a given extent size,
  * bitmap block combination.
  * Keeps track of a current summary block, so we don't keep reading
  * it from the buffer cache.
+ *
+ * Summary information is returned in *sum if specified.
+ * If no delta is specified, returns summary only.
  */
 int
-xfs_rtmodify_summary(
-	xfs_mount_t	*mp,		/* file system mount point */
+xfs_rtmodify_summary_int(
+	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
 	int		log,		/* log2 of extent size */
 	xfs_rtblock_t	bbno,		/* bitmap block number */
 	int		delta,		/* change to make to summary info */
 	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
-	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
+	xfs_suminfo_t	*sum)		/* out: summary info for this block */
 {
 	xfs_buf_t	*bp;		/* buffer for the summary block */
 	int		error;		/* error value */
@@ -456,7 +460,7 @@ xfs_rtmodify_summary(
 	/*
 	 * If we have an old buffer, and the block number matches, use that.
 	 */
-	if (rbpp && *rbpp && *rsb == sb)
+	if (*rbpp && *rsb == sb)
 		bp = *rbpp;
 	/*
 	 * Otherwise we have to get the buffer.
@@ -465,7 +469,7 @@ xfs_rtmodify_summary(
 		/*
 		 * If there was an old one, get rid of it first.
 		 */
-		if (rbpp && *rbpp)
+		if (*rbpp)
 			xfs_trans_brelse(tp, *rbpp);
 		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
 		if (error) {
@@ -474,21 +478,38 @@ xfs_rtmodify_summary(
 		/*
 		 * Remember this buffer and block for the next call.
 		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
+		*rbpp = bp;
+		*rsb = sb;
 	}
 	/*
-	 * Point to the summary information, modify and log it.
+	 * Point to the summary information, modify/log it, and/or copy it out.
	 */
 	sp = XFS_SUMPTR(mp, bp, so);
-	*sp += delta;
-	xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr),
-		(uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1));
+	if (delta) {
+		uint first = (uint)((char *)sp - (char *)bp->b_addr);
+
+		*sp += delta;
+		xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1);
+	}
+	if (sum)
+		*sum = *sp;
 	return 0;
 }
 
+int
+xfs_rtmodify_summary(
+	xfs_mount_t	*mp,		/* file system mount structure */
+	xfs_trans_t	*tp,		/* transaction pointer */
+	int		log,		/* log2 of extent size */
+	xfs_rtblock_t	bbno,		/* bitmap block number */
+	int		delta,		/* change to make to summary info */
+	xfs_buf_t	**rbpp,		/* in/out: summary block buffer */
+	xfs_fsblock_t	*rsb)		/* in/out: summary block number */
+{
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno,
+					delta, rbpp, rsb, NULL);
+}
+
 /*
  * Set the given range of bitmap bits to the given value.
  * Do whatever I/O and logging is required.
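Factoring the core into xfs_rtmodify_summary_int() makes a read-only companion trivial: pass a zero delta so the modify/log branch is skipped and only *sum is filled in. A sketch of such a wrapper (the name xfs_rtget_summary is an assumption, not shown in this hunk):

/* Sketch only: read the summary count without modifying or logging it. */
int
xfs_rtget_summary(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	int		log,
	xfs_rtblock_t	bbno,
	xfs_buf_t	**rbpp,
	xfs_fsblock_t	*rsb,
	xfs_suminfo_t	*sum)
{
	/* delta == 0: the core routine only copies *sp out through sum */
	return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
}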
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 7703fa6770ff..5f902fa7913f 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -186,13 +186,13 @@ xfs_mount_validate_sb(
 	 */
 	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
 		xfs_warn(mp, "bad magic number");
-		return XFS_ERROR(EWRONGFS);
+		return -EWRONGFS;
 	}
 
 
 	if (!xfs_sb_good_version(sbp)) {
 		xfs_warn(mp, "bad version");
-		return XFS_ERROR(EWRONGFS);
+		return -EWRONGFS;
 	}
 
 	/*
@@ -220,7 +220,7 @@ xfs_mount_validate_sb(
 			xfs_warn(mp,
 "Attempted to mount read-only compatible filesystem read-write.\n"
 "Filesystem can only be safely mounted read only.");
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	}
 	if (xfs_sb_has_incompat_feature(sbp,
@@ -230,7 +230,7 @@ xfs_mount_validate_sb(
 "Filesystem can not be safely mounted by this kernel.",
 				(sbp->sb_features_incompat &
 					XFS_SB_FEAT_INCOMPAT_UNKNOWN));
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	}
 
@@ -238,13 +238,13 @@ xfs_mount_validate_sb(
 		if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) {
 			xfs_notice(mp,
 			   "Version 5 of Super block has XFS_OQUOTA bits.");
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 		}
 	} else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD |
 				XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) {
 			xfs_notice(mp,
 "Superblock earlier than Version 5 has XFS_[PQ]UOTA_{ENFD|CHKD} bits.");
-			return XFS_ERROR(EFSCORRUPTED);
+			return -EFSCORRUPTED;
 	}
 
 	if (unlikely(
@@ -252,7 +252,7 @@ xfs_mount_validate_sb(
 		xfs_warn(mp,
 		"filesystem is marked as having an external log; "
 		"specify logdev on the mount command line.");
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	if (unlikely(
@@ -260,7 +260,7 @@ xfs_mount_validate_sb(
 		xfs_warn(mp,
 		"filesystem is marked as having an internal log; "
 		"do not specify logdev on the mount command line.");
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -279,11 +279,13 @@ xfs_mount_validate_sb(
 	    sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG ||
 	    sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG ||
 	    sbp->sb_blocksize != (1 << sbp->sb_blocklog) ||
+	    sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG ||
 	    sbp->sb_inodesize < XFS_DINODE_MIN_SIZE ||
 	    sbp->sb_inodesize > XFS_DINODE_MAX_SIZE ||
 	    sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
 	    sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
 	    sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+	    sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE ||
 	    sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
 	    (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) ||
 	    (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
@@ -294,7 +296,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) ||
 	    sbp->sb_shared_vn != 0)) {
 		xfs_notice(mp, "SB sanity check failed");
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 
 	/*
@@ -305,7 +307,7 @@ xfs_mount_validate_sb(
 			"File system with blocksize %d bytes. "
 			"Only pagesize (%ld) or less will currently work.",
 			sbp->sb_blocksize, PAGE_SIZE);
-		return XFS_ERROR(ENOSYS);
+		return -ENOSYS;
 	}
 
 	/*
@@ -320,19 +322,19 @@ xfs_mount_validate_sb(
 	default:
 		xfs_warn(mp, "inode size of %d bytes not supported",
 			 sbp->sb_inodesize);
-		return XFS_ERROR(ENOSYS);
+		return -ENOSYS;
 	}
 
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_warn(mp,
 		"file system too large to be mounted on this system.");
-		return XFS_ERROR(EFBIG);
+		return -EFBIG;
 	}
 
 	if (check_inprogress && sbp->sb_inprogress) {
 		xfs_warn(mp, "Offline file system operation in progress!");
-		return XFS_ERROR(EFSCORRUPTED);
+		return -EFSCORRUPTED;
 	}
 	return 0;
 }
@@ -386,10 +388,11 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp)
 	}
 }
 
-void
-xfs_sb_from_disk(
+static void
+__xfs_sb_from_disk(
 	struct xfs_sb	*to,
-	xfs_dsb_t	*from)
+	xfs_dsb_t	*from,
+	bool		convert_xquota)
 {
 	to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
 	to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
@@ -442,9 +445,22 @@ xfs_sb_from_disk(
 	to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat);
 	to->sb_features_log_incompat =
 				be32_to_cpu(from->sb_features_log_incompat);
+	/* crc is only used on disk, not in memory; just init to 0 here. */
+	to->sb_crc = 0;
 	to->sb_pad = 0;
 	to->sb_pquotino = be64_to_cpu(from->sb_pquotino);
 	to->sb_lsn = be64_to_cpu(from->sb_lsn);
+	/* Convert on-disk flags to in-memory flags? */
+	if (convert_xquota)
+		xfs_sb_quota_from_disk(to);
+}
+
+void
+xfs_sb_from_disk(
+	struct xfs_sb	*to,
+	xfs_dsb_t	*from)
+{
+	__xfs_sb_from_disk(to, from, true);
 }
 
 static inline void
@@ -536,6 +552,9 @@ xfs_sb_to_disk(
 	if (!fields)
 		return;
 
+	/* We should never write the crc here, it's updated in the IO path */
+	fields &= ~XFS_SB_CRC;
+
 	xfs_sb_quota_to_disk(to, from, &fields);
 	while (fields) {
 		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
@@ -577,7 +596,11 @@ xfs_sb_verify(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 	struct xfs_sb	sb;
 
-	xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+	/*
+	 * Use call variant which doesn't convert quota flags from disk
+	 * format, because xfs_mount_validate_sb checks the on-disk flags.
+	 */
+	__xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp), false);
 
 	/*
 	 * Only check the in progress field for the primary superblock as
@@ -620,7 +643,7 @@ xfs_sb_read_verify(
 		/* Only fail bad secondaries on a known V5 filesystem */
 		if (bp->b_bn == XFS_SB_DADDR ||
 		    xfs_sb_version_hascrc(&mp->m_sb)) {
-			error = EFSBADCRC;
+			error = -EFSBADCRC;
 			goto out_error;
 		}
 	}
@@ -630,7 +653,7 @@ xfs_sb_read_verify(
 out_error:
 	if (error) {
 		xfs_buf_ioerror(bp, error);
-		if (error == EFSCORRUPTED || error == EFSBADCRC)
+		if (error == -EFSCORRUPTED || error == -EFSBADCRC)
 			xfs_verifier_error(bp);
 	}
 }
@@ -653,7 +676,7 @@ xfs_sb_quiet_read_verify(
 		return;
 	}
 	/* quietly fail */
-	xfs_buf_ioerror(bp, EWRONGFS);
+	xfs_buf_ioerror(bp, -EWRONGFS);
 }
 
 static void
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index c43c2d609a24..2e739708afd3 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -87,11 +87,11 @@ struct xfs_trans;
 typedef struct xfs_sb {
 	__uint32_t	sb_magicnum;	/* magic number == XFS_SB_MAGIC */
 	__uint32_t	sb_blocksize;	/* logical block size, bytes */
-	xfs_drfsbno_t	sb_dblocks;	/* number of data blocks */
-	xfs_drfsbno_t	sb_rblocks;	/* number of realtime blocks */
-	xfs_drtbno_t	sb_rextents;	/* number of realtime extents */
+	xfs_rfsblock_t	sb_dblocks;	/* number of data blocks */
+	xfs_rfsblock_t	sb_rblocks;	/* number of realtime blocks */
+	xfs_rtblock_t	sb_rextents;	/* number of realtime extents */
 	uuid_t		sb_uuid;	/* file system unique id */
-	xfs_dfsbno_t	sb_logstart;	/* starting block of log if internal */
+	xfs_fsblock_t	sb_logstart;	/* starting block of log if internal */
 	xfs_ino_t	sb_rootino;	/* root inode number */
 	xfs_ino_t	sb_rbmino;	/* bitmap inode for realtime extents */
 	xfs_ino_t	sb_rsumino;	/* summary inode for rt bitmap */
diff --git a/fs/xfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..82404da2ca67 100644
--- a/fs/xfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
diff --git a/fs/xfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 23c2f2577c8d..5782f037eab4 100644
--- a/fs/xfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -133,9 +133,9 @@ xfs_symlink_read_verify(
 		return;
 
 	if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF))
-		xfs_buf_ioerror(bp, EFSBADCRC);
+		xfs_buf_ioerror(bp, -EFSBADCRC);
 	else if (!xfs_symlink_verify(bp))
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
 		xfs_verifier_error(bp);
@@ -153,7 +153,7 @@ xfs_symlink_write_verify(
 		return;
 
 	if (!xfs_symlink_verify(bp)) {
-		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
 	}
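The symlink verifiers follow the same shape as the AGI and inobt ones earlier in this series: check the CRC on read, then the structure, record the first failure in b_error, and let xfs_verifier_error() report it. A generic sketch of that shape (the xfs_foo_* names and XFS_FOO_CRC_OFF are placeholders, not real XFS objects):

/* Sketch only: the common read-verifier pattern used above. */
static void
xfs_foo_read_verify(
	struct xfs_buf	*bp)
{
	if (!xfs_buf_verify_cksum(bp, XFS_FOO_CRC_OFF))	/* hypothetical offset */
		xfs_buf_ioerror(bp, -EFSBADCRC);	/* media/CRC damage */
	else if (!xfs_foo_verify(bp))			/* hypothetical check */
		xfs_buf_ioerror(bp, -EFSCORRUPTED);	/* inconsistent structure */

	if (bp->b_error)
		xfs_verifier_error(bp);			/* log and escalate */
}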
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f2bda7c76b8a..f2bda7c76b8a 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
diff --git a/fs/xfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..1097d14cd583 100644
--- a/fs/xfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
diff --git a/fs/xfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index bf9c4579334d..bf9c4579334d 100644
--- a/fs/xfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
diff --git a/fs/xfs/time.h b/fs/xfs/time.h
deleted file mode 100644
index 387e695a184c..000000000000
--- a/fs/xfs/time.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
-	schedule_timeout_uninterruptible(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
-	*tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6888ad886ff6..a65fa5dde6e9 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
-	error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+	error = xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
 							&len, ATTR_ROOT);
 	if (error) {
 		/*
@@ -210,7 +210,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		len -= sizeof(struct xfs_acl_entry) *
 			 (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count);
 
-		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+		error = xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 				len, ATTR_ROOT);
 
 		kmem_free(xfs_acl);
@@ -218,7 +218,7 @@ __xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		/*
 		 * A NULL ACL argument means we want to remove the ACL.
 		 */
-		error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+		error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
 
 		/*
 		 * If the attribute didn't exist to start with that's fine.
@@ -244,7 +244,7 @@ xfs_set_mode(struct inode *inode, umode_t mode)
 		iattr.ia_mode = mode;
 		iattr.ia_ctime = current_fs_time(inode->i_sb);
 
-		error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+		error = xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
 	}
 
 	return error;
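The dropped leading minus signs here are the flip side of the libxfs conversion: xfs_attr_get() and friends already return negative errnos, so negating them again would hand a positive value to the VFS. An illustrative before/after at a call site (sketch only, using the identifiers from the hunk above):

/* before: libxfs returned positive errnos, callers negated at the boundary */
/*	error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);	*/
/* after: the value is already negative and flows through untouched */
error = xfs_attr_remove(ip, ea_name, ATTR_ROOT);
if (error)
	return error;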
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index faaf716e2080..f5b2453a43b2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -240,7 +240,7 @@ xfs_end_io(
 
 done:
 	if (error)
-		ioend->io_error = -error;
+		ioend->io_error = error;
 	xfs_destroy_ioend(ioend);
 }
 
@@ -308,14 +308,14 @@ xfs_map_blocks(
 	int			nimaps = 1;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	if (type == XFS_IO_UNWRITTEN)
 		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
 		if (nonblocking)
-			return -XFS_ERROR(EAGAIN);
+			return -EAGAIN;
 		xfs_ilock(ip, XFS_ILOCK_SHARED);
 	}
 
@@ -332,14 +332,14 @@ xfs_map_blocks(
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (error)
-		return -XFS_ERROR(error);
+		return error;
 
 	if (type == XFS_IO_DELALLOC &&
 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
 		error = xfs_iomap_write_allocate(ip, offset, imap);
 		if (!error)
 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
-		return -XFS_ERROR(error);
+		return error;
 	}
 
 #ifdef DEBUG
@@ -434,10 +434,22 @@ xfs_start_page_writeback(
 {
 	ASSERT(PageLocked(page));
 	ASSERT(!PageWriteback(page));
-	if (clear_dirty)
+
+	/*
+	 * if the page was not fully cleaned, we need to ensure that the higher
+	 * layers come back to it correctly. That means we need to keep the page
+	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
+	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
+	 * write this page in this writeback sweep will be made.
+	 */
+	if (clear_dirty) {
 		clear_page_dirty_for_io(page);
 		set_page_writeback(page);
+	} else
+		set_page_writeback_keepwrite(page);
+
 	unlock_page(page);
+
 	/* If no buffers on the page are to be written, finish it here */
 	if (!buffers)
 		end_page_writeback(page);
@@ -502,7 +514,7 @@ xfs_submit_ioend(
 		 * time.
 		 */
 		if (fail) {
-			ioend->io_error = -fail;
+			ioend->io_error = fail;
 			xfs_finish_ioend(ioend);
 			continue;
 		}
@@ -548,6 +560,13 @@ xfs_cancel_ioend(
 		do {
 			next_bh = bh->b_private;
 			clear_buffer_async_write(bh);
+			/*
+			 * The unwritten flag is cleared when added to the
+			 * ioend. We're not submitting for I/O so mark the
+			 * buffer unwritten again for next time around.
+			 */
+			if (ioend->io_type == XFS_IO_UNWRITTEN)
+				set_buffer_unwritten(bh);
 			unlock_buffer(bh);
 		} while ((bh = next_bh) != NULL);
 
@@ -1253,7 +1272,7 @@ __xfs_get_blocks(
 	int			new = 0;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	offset = (xfs_off_t)iblock << inode->i_blkbits;
 	ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
@@ -1302,7 +1321,7 @@ __xfs_get_blocks(
 			error = xfs_iomap_write_direct(ip, offset, size,
 						       &imap, nimaps);
 			if (error)
-				return -error;
+				return error;
 			new = 1;
 		} else {
 			/*
@@ -1415,7 +1434,7 @@ __xfs_get_blocks(
 
 out_unlock:
 	xfs_iunlock(ip, lockmode);
-	return -error;
+	return error;
 }
 
 int
@@ -1753,11 +1772,72 @@ xfs_vm_readpages(
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
+/*
+ * This is basically a copy of __set_page_dirty_buffers() with one
+ * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
+ * dirty, we'll never be able to clean them because we don't write buffers
+ * beyond EOF, and that means we can't invalidate pages that span EOF
+ * that have been marked dirty. Further, the dirty state can leak into
+ * the file interior if the file is extended, resulting in all sorts of
+ * bad things happening as the state does not match the underlying data.
+ *
+ * XXX: this really indicates that bufferheads in XFS need to die. Warts like
+ * this only exist because of bufferheads and how the generic code manages them.
+ */
+STATIC int
+xfs_vm_set_page_dirty(
+	struct page		*page)
+{
+	struct address_space	*mapping = page->mapping;
+	struct inode		*inode = mapping->host;
+	loff_t			end_offset;
+	loff_t			offset;
+	int			newly_dirty;
+
+	if (unlikely(!mapping))
+		return !TestSetPageDirty(page);
+
+	end_offset = i_size_read(inode);
+	offset = page_offset(page);
+
+	spin_lock(&mapping->private_lock);
+	if (page_has_buffers(page)) {
+		struct buffer_head *head = page_buffers(page);
+		struct buffer_head *bh = head;
+
+		do {
+			if (offset < end_offset)
+				set_buffer_dirty(bh);
+			bh = bh->b_this_page;
+			offset += 1 << inode->i_blkbits;
+		} while (bh != head);
+	}
+	newly_dirty = !TestSetPageDirty(page);
+	spin_unlock(&mapping->private_lock);
+
+	if (newly_dirty) {
+		/* sigh - __set_page_dirty() is static, so copy it here, too */
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (page->mapping) {	/* Race with truncate? */
+			WARN_ON_ONCE(!PageUptodate(page));
+			account_page_dirtied(page, mapping);
+			radix_tree_tag_set(&mapping->page_tree,
+					page_index(page), PAGECACHE_TAG_DIRTY);
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	}
+	return newly_dirty;
+}
+
 const struct address_space_operations xfs_address_space_operations = {
 	.readpage		= xfs_vm_readpage,
 	.readpages		= xfs_vm_readpages,
 	.writepage		= xfs_vm_writepage,
 	.writepages		= xfs_vm_writepages,
+	.set_page_dirty		= xfs_vm_set_page_dirty,
 	.releasepage		= xfs_vm_releasepage,
 	.invalidatepage		= xfs_vm_invalidatepage,
 	.write_begin		= xfs_vm_write_begin,
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 09480c57f069..aa2a8b1838a2 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -76,7 +76,7 @@ xfs_attr3_leaf_freextent(
 		error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
 				       &map, &nmap, XFS_BMAPI_ATTRFORK);
 		if (error) {
-			return(error);
+			return error;
 		}
 		ASSERT(nmap == 1);
 		ASSERT(map.br_startblock != DELAYSTARTBLOCK);
@@ -95,21 +95,21 @@ xfs_attr3_leaf_freextent(
 					dp->i_mount->m_ddev_targp,
 					dblkno, dblkcnt, 0);
 			if (!bp)
-				return ENOMEM;
+				return -ENOMEM;
 			xfs_trans_binval(*trans, bp);
 			/*
 			 * Roll to next transaction.
 			 */
 			error = xfs_trans_roll(trans, dp);
 			if (error)
-				return (error);
+				return error;
 		}
 
 		tblkno += map.br_blockcount;
 		tblkcnt -= map.br_blockcount;
 	}
 
-	return(0);
+	return 0;
 }
 
 /*
@@ -227,7 +227,7 @@ xfs_attr3_node_inactive(
 	 */
 	if (level > XFS_DA_NODE_MAXDEPTH) {
 		xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	node = bp->b_addr;
@@ -256,7 +256,7 @@ xfs_attr3_node_inactive(
 		error = xfs_da3_node_read(*trans, dp, child_fsb, -2, &child_bp,
 						XFS_ATTR_FORK);
 		if (error)
-			return(error);
+			return error;
 		if (child_bp) {
 			/* save for re-read later */
 			child_blkno = XFS_BUF_ADDR(child_bp);
@@ -277,7 +277,7 @@ xfs_attr3_node_inactive(
 							child_bp);
 				break;
 			default:
-				error = XFS_ERROR(EIO);
+				error = -EIO;
 				xfs_trans_brelse(*trans, child_bp);
 				break;
 			}
@@ -360,7 +360,7 @@ xfs_attr3_root_inactive(
 		error = xfs_attr3_leaf_inactive(trans, dp, bp);
 		break;
 	default:
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		xfs_trans_brelse(*trans, bp);
 		break;
 	}
@@ -414,7 +414,7 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0);
 	if (error) {
 		xfs_trans_cancel(trans, 0);
-		return(error);
+		return error;
 	}
 	xfs_ilock(dp, XFS_ILOCK_EXCL);
 
@@ -443,10 +443,10 @@ xfs_attr_inactive(xfs_inode_t *dp)
 	error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
 
-	return(error);
+	return error;
 
 out:
 	xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
 	xfs_iunlock(dp, XFS_ILOCK_EXCL);
-	return(error);
+	return error;
 }
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 90e2eeb21207..62db83ab6cbc 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -50,11 +50,11 @@ xfs_attr_shortform_compare(const void *a, const void *b)
50 sa = (xfs_attr_sf_sort_t *)a; 50 sa = (xfs_attr_sf_sort_t *)a;
51 sb = (xfs_attr_sf_sort_t *)b; 51 sb = (xfs_attr_sf_sort_t *)b;
52 if (sa->hash < sb->hash) { 52 if (sa->hash < sb->hash) {
53 return(-1); 53 return -1;
54 } else if (sa->hash > sb->hash) { 54 } else if (sa->hash > sb->hash) {
55 return(1); 55 return 1;
56 } else { 56 } else {
57 return(sa->entno - sb->entno); 57 return sa->entno - sb->entno;
58 } 58 }
59} 59}
60 60
@@ -86,7 +86,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
86 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data; 86 sf = (xfs_attr_shortform_t *)dp->i_afp->if_u1.if_data;
87 ASSERT(sf != NULL); 87 ASSERT(sf != NULL);
88 if (!sf->hdr.count) 88 if (!sf->hdr.count)
89 return(0); 89 return 0;
90 cursor = context->cursor; 90 cursor = context->cursor;
91 ASSERT(cursor != NULL); 91 ASSERT(cursor != NULL);
92 92
@@ -124,7 +124,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
124 sfe = XFS_ATTR_SF_NEXTENTRY(sfe); 124 sfe = XFS_ATTR_SF_NEXTENTRY(sfe);
125 } 125 }
126 trace_xfs_attr_list_sf_all(context); 126 trace_xfs_attr_list_sf_all(context);
127 return(0); 127 return 0;
128 } 128 }
129 129
130 /* do no more for a search callback */ 130 /* do no more for a search callback */
@@ -150,7 +150,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
150 XFS_ERRLEVEL_LOW, 150 XFS_ERRLEVEL_LOW,
151 context->dp->i_mount, sfe); 151 context->dp->i_mount, sfe);
152 kmem_free(sbuf); 152 kmem_free(sbuf);
153 return XFS_ERROR(EFSCORRUPTED); 153 return -EFSCORRUPTED;
154 } 154 }
155 155
156 sbp->entno = i; 156 sbp->entno = i;
@@ -188,7 +188,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
188 } 188 }
189 if (i == nsbuf) { 189 if (i == nsbuf) {
190 kmem_free(sbuf); 190 kmem_free(sbuf);
191 return(0); 191 return 0;
192 } 192 }
193 193
194 /* 194 /*
@@ -213,7 +213,7 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context)
213 } 213 }
214 214
215 kmem_free(sbuf); 215 kmem_free(sbuf);
216 return(0); 216 return 0;
217} 217}
218 218
219STATIC int 219STATIC int
@@ -243,8 +243,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
243 if (cursor->blkno > 0) { 243 if (cursor->blkno > 0) {
244 error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1, 244 error = xfs_da3_node_read(NULL, dp, cursor->blkno, -1,
245 &bp, XFS_ATTR_FORK); 245 &bp, XFS_ATTR_FORK);
246 if ((error != 0) && (error != EFSCORRUPTED)) 246 if ((error != 0) && (error != -EFSCORRUPTED))
247 return(error); 247 return error;
248 if (bp) { 248 if (bp) {
249 struct xfs_attr_leaf_entry *entries; 249 struct xfs_attr_leaf_entry *entries;
250 250
@@ -295,7 +295,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
295 cursor->blkno, -1, &bp, 295 cursor->blkno, -1, &bp,
296 XFS_ATTR_FORK); 296 XFS_ATTR_FORK);
297 if (error) 297 if (error)
298 return(error); 298 return error;
299 node = bp->b_addr; 299 node = bp->b_addr;
300 magic = be16_to_cpu(node->hdr.info.magic); 300 magic = be16_to_cpu(node->hdr.info.magic);
301 if (magic == XFS_ATTR_LEAF_MAGIC || 301 if (magic == XFS_ATTR_LEAF_MAGIC ||
@@ -308,7 +308,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
308 context->dp->i_mount, 308 context->dp->i_mount,
309 node); 309 node);
310 xfs_trans_brelse(NULL, bp); 310 xfs_trans_brelse(NULL, bp);
311 return XFS_ERROR(EFSCORRUPTED); 311 return -EFSCORRUPTED;
312 } 312 }
313 313
314 dp->d_ops->node_hdr_from_disk(&nodehdr, node); 314 dp->d_ops->node_hdr_from_disk(&nodehdr, node);
@@ -496,11 +496,11 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
496 context->cursor->blkno = 0; 496 context->cursor->blkno = 0;
497 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp); 497 error = xfs_attr3_leaf_read(NULL, context->dp, 0, -1, &bp);
498 if (error) 498 if (error)
499 return XFS_ERROR(error); 499 return error;
500 500
501 error = xfs_attr3_leaf_list_int(bp, context); 501 error = xfs_attr3_leaf_list_int(bp, context);
502 xfs_trans_brelse(NULL, bp); 502 xfs_trans_brelse(NULL, bp);
503 return XFS_ERROR(error); 503 return error;
504} 504}
505 505
506int 506int
@@ -514,7 +514,7 @@ xfs_attr_list_int(
514 XFS_STATS_INC(xs_attr_list); 514 XFS_STATS_INC(xs_attr_list);
515 515
516 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 516 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
517 return EIO; 517 return -EIO;
518 518
519 /* 519 /*
520 * Decide on what work routines to call based on the inode size. 520 * Decide on what work routines to call based on the inode size.
@@ -616,16 +616,16 @@ xfs_attr_list(
616 * Validate the cursor. 616 * Validate the cursor.
617 */ 617 */
618 if (cursor->pad1 || cursor->pad2) 618 if (cursor->pad1 || cursor->pad2)
619 return(XFS_ERROR(EINVAL)); 619 return -EINVAL;
620 if ((cursor->initted == 0) && 620 if ((cursor->initted == 0) &&
621 (cursor->hashval || cursor->blkno || cursor->offset)) 621 (cursor->hashval || cursor->blkno || cursor->offset))
622 return XFS_ERROR(EINVAL); 622 return -EINVAL;
623 623
624 /* 624 /*
625 * Check for a properly aligned buffer. 625 * Check for a properly aligned buffer.
626 */ 626 */
627 if (((long)buffer) & (sizeof(int)-1)) 627 if (((long)buffer) & (sizeof(int)-1))
628 return XFS_ERROR(EFAULT); 628 return -EFAULT;
629 if (flags & ATTR_KERNOVAL) 629 if (flags & ATTR_KERNOVAL)
630 bufsize = 0; 630 bufsize = 0;
631 631
@@ -648,6 +648,6 @@ xfs_attr_list(
648 alist->al_offset[0] = context.bufsize; 648 alist->al_offset[0] = context.bufsize;
649 649
650 error = xfs_attr_list_int(&context); 650 error = xfs_attr_list_int(&context);
651 ASSERT(error >= 0); 651 ASSERT(error <= 0);
652 return error; 652 return error;
653} 653}
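The two files above are part of the series-wide switch to negative errno returns: the XFS_ERROR() wrapper goes away, return(x) becomes return x, and the closing assertion flips from error >= 0 to error <= 0, since success is now 0 and failure a negative errno. A minimal sketch of the before/after convention (illustrative only; EIO stands in for any errno):

	#include <errno.h>

	/* old convention: positive errno inside XFS, negated at the VFS boundary */
	static int lookup_old(int fail)
	{
		return fail ? EIO : 0;	/* some caller must do: return -error; */
	}

	/* new convention: negative errno throughout, passed up unchanged */
	static int lookup_new(int fail)
	{
		return fail ? -EIO : 0;	/* already what the VFS expects */
	}

Once every internal function follows the second form, boundary code no longer has to remember where to negate, which is exactly what the flipped ASSERT in xfs_attr_list() checks.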
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 64731ef3324d..92e8f99a5857 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -133,7 +133,7 @@ xfs_bmap_finish(
133 mp = ntp->t_mountp; 133 mp = ntp->t_mountp;
134 if (!XFS_FORCED_SHUTDOWN(mp)) 134 if (!XFS_FORCED_SHUTDOWN(mp))
135 xfs_force_shutdown(mp, 135 xfs_force_shutdown(mp,
136 (error == EFSCORRUPTED) ? 136 (error == -EFSCORRUPTED) ?
137 SHUTDOWN_CORRUPT_INCORE : 137 SHUTDOWN_CORRUPT_INCORE :
138 SHUTDOWN_META_IO_ERROR); 138 SHUTDOWN_META_IO_ERROR);
139 return error; 139 return error;
@@ -365,7 +365,7 @@ xfs_bmap_count_tree(
365 xfs_trans_brelse(tp, bp); 365 xfs_trans_brelse(tp, bp);
366 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)", 366 XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
367 XFS_ERRLEVEL_LOW, mp); 367 XFS_ERRLEVEL_LOW, mp);
368 return XFS_ERROR(EFSCORRUPTED); 368 return -EFSCORRUPTED;
369 } 369 }
370 xfs_trans_brelse(tp, bp); 370 xfs_trans_brelse(tp, bp);
371 } else { 371 } else {
@@ -425,14 +425,14 @@ xfs_bmap_count_blocks(
425 ASSERT(level > 0); 425 ASSERT(level > 0);
426 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes); 426 pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
427 bno = be64_to_cpu(*pp); 427 bno = be64_to_cpu(*pp);
428 ASSERT(bno != NULLDFSBNO); 428 ASSERT(bno != NULLFSBLOCK);
429 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount); 429 ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
430 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks); 430 ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
431 431
432 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) { 432 if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
433 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW, 433 XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
434 mp); 434 mp);
435 return XFS_ERROR(EFSCORRUPTED); 435 return -EFSCORRUPTED;
436 } 436 }
437 437
438 return 0; 438 return 0;
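NULLDFSBNO becomes NULLFSBLOCK in the assertion above; the same series folds the separate on-disk xfs_dfsbno_t type into xfs_fsblock_t, leaving a single "no block" sentinel. Assuming the definitions in the XFS type headers of this era, both are simply the all-ones value of a 64-bit block number:

	typedef __uint64_t	xfs_fsblock_t;		/* unified block number */
	#define NULLFSBLOCK	((xfs_fsblock_t)-1)	/* "no such block" */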
@@ -524,13 +524,13 @@ xfs_getbmap(
524 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS && 524 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
525 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE && 525 ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
526 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) 526 ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
527 return XFS_ERROR(EINVAL); 527 return -EINVAL;
528 } else if (unlikely( 528 } else if (unlikely(
529 ip->i_d.di_aformat != 0 && 529 ip->i_d.di_aformat != 0 &&
530 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) { 530 ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
531 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW, 531 XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
532 ip->i_mount); 532 ip->i_mount);
533 return XFS_ERROR(EFSCORRUPTED); 533 return -EFSCORRUPTED;
534 } 534 }
535 535
536 prealloced = 0; 536 prealloced = 0;
@@ -539,7 +539,7 @@ xfs_getbmap(
539 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS && 539 if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
540 ip->i_d.di_format != XFS_DINODE_FMT_BTREE && 540 ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
541 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL) 541 ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
542 return XFS_ERROR(EINVAL); 542 return -EINVAL;
543 543
544 if (xfs_get_extsz_hint(ip) || 544 if (xfs_get_extsz_hint(ip) ||
545 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){ 545 ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
@@ -559,26 +559,26 @@ xfs_getbmap(
559 bmv->bmv_entries = 0; 559 bmv->bmv_entries = 0;
560 return 0; 560 return 0;
561 } else if (bmv->bmv_length < 0) { 561 } else if (bmv->bmv_length < 0) {
562 return XFS_ERROR(EINVAL); 562 return -EINVAL;
563 } 563 }
564 564
565 nex = bmv->bmv_count - 1; 565 nex = bmv->bmv_count - 1;
566 if (nex <= 0) 566 if (nex <= 0)
567 return XFS_ERROR(EINVAL); 567 return -EINVAL;
568 bmvend = bmv->bmv_offset + bmv->bmv_length; 568 bmvend = bmv->bmv_offset + bmv->bmv_length;
569 569
570 570
571 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx)) 571 if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
572 return XFS_ERROR(ENOMEM); 572 return -ENOMEM;
573 out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0); 573 out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
574 if (!out) 574 if (!out)
575 return XFS_ERROR(ENOMEM); 575 return -ENOMEM;
576 576
577 xfs_ilock(ip, XFS_IOLOCK_SHARED); 577 xfs_ilock(ip, XFS_IOLOCK_SHARED);
578 if (whichfork == XFS_DATA_FORK) { 578 if (whichfork == XFS_DATA_FORK) {
579 if (!(iflags & BMV_IF_DELALLOC) && 579 if (!(iflags & BMV_IF_DELALLOC) &&
580 (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) { 580 (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
581 error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); 581 error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
582 if (error) 582 if (error)
583 goto out_unlock_iolock; 583 goto out_unlock_iolock;
584 584
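Core kernel helpers such as filemap_write_and_wait() already return negative errnos, so the leading minus that adapted them to XFS's old positive convention is dropped wherever they are called; the same shim removal shows up again around blkdev_issue_discard() in fs/xfs/xfs_discard.c near the end of this diff. The pattern, side by side:

	/* before: flip the kernel's negative errno to XFS-positive */
	error = -filemap_write_and_wait(mapping);

	/* after: both layers speak negative errnos, no adapter */
	error = filemap_write_and_wait(mapping);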
@@ -611,7 +611,7 @@ xfs_getbmap(
611 /* 611 /*
612 * Allocate enough space to handle "subnex" maps at a time. 612 * Allocate enough space to handle "subnex" maps at a time.
613 */ 613 */
614 error = ENOMEM; 614 error = -ENOMEM;
615 subnex = 16; 615 subnex = 16;
616 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); 616 map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
617 if (!map) 617 if (!map)
@@ -809,7 +809,7 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
809 * have speculative prealloc/delalloc blocks to remove. 809 * have speculative prealloc/delalloc blocks to remove.
810 */ 810 */
811 if (VFS_I(ip)->i_size == 0 && 811 if (VFS_I(ip)->i_size == 0 &&
812 VN_CACHED(VFS_I(ip)) == 0 && 812 VFS_I(ip)->i_mapping->nrpages == 0 &&
813 ip->i_delayed_blks == 0) 813 ip->i_delayed_blks == 0)
814 return false; 814 return false;
815 815
@@ -882,7 +882,7 @@ xfs_free_eofblocks(
882 if (need_iolock) { 882 if (need_iolock) {
883 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 883 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
884 xfs_trans_cancel(tp, 0); 884 xfs_trans_cancel(tp, 0);
885 return EAGAIN; 885 return -EAGAIN;
886 } 886 }
887 } 887 }
888 888
@@ -955,14 +955,14 @@ xfs_alloc_file_space(
955 trace_xfs_alloc_file_space(ip); 955 trace_xfs_alloc_file_space(ip);
956 956
957 if (XFS_FORCED_SHUTDOWN(mp)) 957 if (XFS_FORCED_SHUTDOWN(mp))
958 return XFS_ERROR(EIO); 958 return -EIO;
959 959
960 error = xfs_qm_dqattach(ip, 0); 960 error = xfs_qm_dqattach(ip, 0);
961 if (error) 961 if (error)
962 return error; 962 return error;
963 963
964 if (len <= 0) 964 if (len <= 0)
965 return XFS_ERROR(EINVAL); 965 return -EINVAL;
966 966
967 rt = XFS_IS_REALTIME_INODE(ip); 967 rt = XFS_IS_REALTIME_INODE(ip);
968 extsz = xfs_get_extsz_hint(ip); 968 extsz = xfs_get_extsz_hint(ip);
@@ -1028,7 +1028,7 @@ xfs_alloc_file_space(
1028 /* 1028 /*
1029 * Free the transaction structure. 1029 * Free the transaction structure.
1030 */ 1030 */
1031 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 1031 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1032 xfs_trans_cancel(tp, 0); 1032 xfs_trans_cancel(tp, 0);
1033 break; 1033 break;
1034 } 1034 }
@@ -1065,7 +1065,7 @@ xfs_alloc_file_space(
1065 allocated_fsb = imapp->br_blockcount; 1065 allocated_fsb = imapp->br_blockcount;
1066 1066
1067 if (nimaps == 0) { 1067 if (nimaps == 0) {
1068 error = XFS_ERROR(ENOSPC); 1068 error = -ENOSPC;
1069 break; 1069 break;
1070 } 1070 }
1071 1071
@@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes(
1122 if (endoff > XFS_ISIZE(ip)) 1122 if (endoff > XFS_ISIZE(ip))
1123 endoff = XFS_ISIZE(ip); 1123 endoff = XFS_ISIZE(ip);
1124 1124
1125 bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ?
1126 mp->m_rtdev_targp : mp->m_ddev_targp,
1127 BTOBB(mp->m_sb.sb_blocksize), 0);
1128 if (!bp)
1129 return XFS_ERROR(ENOMEM);
1130
1131 xfs_buf_unlock(bp);
1132
1133 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { 1125 for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1134 uint lock_mode; 1126 uint lock_mode;
1135 1127
@@ -1152,42 +1144,24 @@ xfs_zero_remaining_bytes(
1152 ASSERT(imap.br_startblock != DELAYSTARTBLOCK); 1144 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1153 if (imap.br_state == XFS_EXT_UNWRITTEN) 1145 if (imap.br_state == XFS_EXT_UNWRITTEN)
1154 continue; 1146 continue;
1155 XFS_BUF_UNDONE(bp);
1156 XFS_BUF_UNWRITE(bp);
1157 XFS_BUF_READ(bp);
1158 XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
1159 1147
1160 if (XFS_FORCED_SHUTDOWN(mp)) { 1148 error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
1161 error = XFS_ERROR(EIO); 1149 mp->m_rtdev_targp : mp->m_ddev_targp,
1162 break; 1150 xfs_fsb_to_db(ip, imap.br_startblock),
1163 } 1151 BTOBB(mp->m_sb.sb_blocksize),
1164 xfs_buf_iorequest(bp); 1152 0, &bp, NULL);
1165 error = xfs_buf_iowait(bp); 1153 if (error)
1166 if (error) { 1154 return error;
1167 xfs_buf_ioerror_alert(bp, 1155
1168 "xfs_zero_remaining_bytes(read)");
1169 break;
1170 }
1171 memset(bp->b_addr + 1156 memset(bp->b_addr +
1172 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), 1157 (offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1173 0, lastoffset - offset + 1); 1158 0, lastoffset - offset + 1);
1174 XFS_BUF_UNDONE(bp); 1159
1175 XFS_BUF_UNREAD(bp); 1160 error = xfs_bwrite(bp);
1176 XFS_BUF_WRITE(bp); 1161 xfs_buf_relse(bp);
1177 1162 if (error)
1178 if (XFS_FORCED_SHUTDOWN(mp)) { 1163 return error;
1179 error = XFS_ERROR(EIO);
1180 break;
1181 }
1182 xfs_buf_iorequest(bp);
1183 error = xfs_buf_iowait(bp);
1184 if (error) {
1185 xfs_buf_ioerror_alert(bp,
1186 "xfs_zero_remaining_bytes(write)");
1187 break;
1188 }
1189 } 1164 }
1190 xfs_buf_free(bp);
1191 return error; 1165 return error;
1192} 1166}
1193 1167
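The old xfs_zero_remaining_bytes() allocated one uncached buffer up front and re-aimed it at each block by hand, toggling XBF_READ/XBF_WRITE/XBF_DONE around xfs_buf_iorequest()/xfs_buf_iowait() pairs with shutdown checks in between. The rewrite does a clean read-modify-write per partial block through the new uncached-read API and xfs_bwrite(); roughly (targp, fsbno, boff and blen are stand-ins for the values computed in the loop):

	error = xfs_buf_read_uncached(targp, xfs_fsb_to_db(ip, fsbno),
				      BTOBB(mp->m_sb.sb_blocksize),
				      0, &bp, NULL);
	if (error)
		return error;

	memset(bp->b_addr + boff, 0, blen);	/* zero the partial block */

	error = xfs_bwrite(bp);			/* synchronous write-back */
	xfs_buf_relse(bp);
	if (error)
		return error;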
@@ -1205,6 +1179,7 @@ xfs_free_file_space(
1205 xfs_bmap_free_t free_list; 1179 xfs_bmap_free_t free_list;
1206 xfs_bmbt_irec_t imap; 1180 xfs_bmbt_irec_t imap;
1207 xfs_off_t ioffset; 1181 xfs_off_t ioffset;
1182 xfs_off_t iendoffset;
1208 xfs_extlen_t mod=0; 1183 xfs_extlen_t mod=0;
1209 xfs_mount_t *mp; 1184 xfs_mount_t *mp;
1210 int nimap; 1185 int nimap;
@@ -1233,12 +1208,13 @@ xfs_free_file_space(
1233 inode_dio_wait(VFS_I(ip)); 1208 inode_dio_wait(VFS_I(ip));
1234 1209
1235 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); 1210 rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1236 ioffset = offset & ~(rounding - 1); 1211 ioffset = round_down(offset, rounding);
1237 error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, 1212 iendoffset = round_up(offset + len, rounding) - 1;
1238 ioffset, -1); 1213 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
1214 iendoffset);
1239 if (error) 1215 if (error)
1240 goto out; 1216 goto out;
1241 truncate_pagecache_range(VFS_I(ip), ioffset, -1); 1217 truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
1242 1218
1243 /* 1219 /*
1244 * Need to zero the stuff we're not freeing, on disk. 1220 * Need to zero the stuff we're not freeing, on disk.
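Writeback and page-cache invalidation for the punch are now clamped to [ioffset, iendoffset] rather than running from ioffset to EOF (the old -1 end), so punching a hole no longer evicts unrelated cached pages beyond the affected range. For the power-of-two granularity used here, round_down()/round_up() reduce to mask arithmetic; a self-contained sketch:

	#include <stdint.h>

	/* equivalents of round_down()/round_up() for power-of-two r */
	static inline uint64_t rdown(uint64_t x, uint64_t r)
	{
		return x & ~(r - 1);
	}

	static inline uint64_t rup(uint64_t x, uint64_t r)
	{
		return (x + r - 1) & ~(r - 1);
	}

	/* e.g. offset = 5000, len = 3000, r = 4096:
	 * window is [rdown(5000,4096), rup(8000,4096) - 1] = [4096, 8191] */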
@@ -1315,7 +1291,7 @@ xfs_free_file_space(
1315 /* 1291 /*
1316 * Free the transaction structure. 1292 * Free the transaction structure.
1317 */ 1293 */
1318 ASSERT(error == ENOSPC || XFS_FORCED_SHUTDOWN(mp)); 1294 ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1319 xfs_trans_cancel(tp, 0); 1295 xfs_trans_cancel(tp, 0);
1320 break; 1296 break;
1321 } 1297 }
@@ -1392,14 +1368,14 @@ xfs_zero_file_space(
1392 1368
1393 if (start_boundary < end_boundary - 1) { 1369 if (start_boundary < end_boundary - 1) {
1394 /* 1370 /*
1395 * punch out delayed allocation blocks and the page cache over 1371 * Writeback the range to ensure any inode size updates due to
1396 * the conversion range 1372 * appending writes make it to disk (otherwise we could just
1373 * punch out the delalloc blocks).
1397 */ 1374 */
1398 xfs_ilock(ip, XFS_ILOCK_EXCL); 1375 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1399 error = xfs_bmap_punch_delalloc_range(ip, 1376 start_boundary, end_boundary - 1);
1400 XFS_B_TO_FSBT(mp, start_boundary), 1377 if (error)
1401 XFS_B_TO_FSB(mp, end_boundary - start_boundary)); 1378 goto out;
1402 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1403 truncate_pagecache_range(VFS_I(ip), start_boundary, 1379 truncate_pagecache_range(VFS_I(ip), start_boundary,
1404 end_boundary - 1); 1380 end_boundary - 1);
1405 1381
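As the replacement comment explains, punching only the delalloc blocks could lose an inode size update that an appending write left in memory, so the range is flushed to disk first and only then tossed from the page cache. Condensed, the required ordering before the punch (start/end stand in for start_boundary/end_boundary):

	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
					     start, end - 1);
	if (error)
		goto out;
	truncate_pagecache_range(VFS_I(ip), start, end - 1);
	/* only now is it safe to punch the underlying blocks */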
@@ -1456,24 +1432,50 @@ xfs_collapse_file_space(
1456 struct xfs_mount *mp = ip->i_mount; 1432 struct xfs_mount *mp = ip->i_mount;
1457 struct xfs_trans *tp; 1433 struct xfs_trans *tp;
1458 int error; 1434 int error;
1459 xfs_extnum_t current_ext = 0;
1460 struct xfs_bmap_free free_list; 1435 struct xfs_bmap_free free_list;
1461 xfs_fsblock_t first_block; 1436 xfs_fsblock_t first_block;
1462 int committed; 1437 int committed;
1463 xfs_fileoff_t start_fsb; 1438 xfs_fileoff_t start_fsb;
1439 xfs_fileoff_t next_fsb;
1464 xfs_fileoff_t shift_fsb; 1440 xfs_fileoff_t shift_fsb;
1465 1441
1466 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1442 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1467 1443
1468 trace_xfs_collapse_file_space(ip); 1444 trace_xfs_collapse_file_space(ip);
1469 1445
1470 start_fsb = XFS_B_TO_FSB(mp, offset + len); 1446 next_fsb = XFS_B_TO_FSB(mp, offset + len);
1471 shift_fsb = XFS_B_TO_FSB(mp, len); 1447 shift_fsb = XFS_B_TO_FSB(mp, len);
1472 1448
1473 error = xfs_free_file_space(ip, offset, len); 1449 error = xfs_free_file_space(ip, offset, len);
1474 if (error) 1450 if (error)
1475 return error; 1451 return error;
1476 1452
1453 /*
1454 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1455 * into the accessible region of the file.
1456 */
1457 if (xfs_can_free_eofblocks(ip, true)) {
1458 error = xfs_free_eofblocks(mp, ip, false);
1459 if (error)
1460 return error;
1461 }
1462
1463 /*
1464 * Writeback and invalidate cache for the remainder of the file as we're
1465 * about to shift down every extent from the collapse range to EOF. The
1466 * free of the collapse range above might have already done some of
1467 * this, but we shouldn't rely on it to do anything outside of the range
1468 * that was freed.
1469 */
1470 error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1471 offset + len, -1);
1472 if (error)
1473 return error;
1474 error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1475 (offset + len) >> PAGE_CACHE_SHIFT, -1);
1476 if (error)
1477 return error;
1478
1477 while (!error && !done) { 1479 while (!error && !done) {
1478 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); 1480 tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1479 /* 1481 /*
@@ -1505,10 +1507,10 @@ xfs_collapse_file_space(
1505 * We are using the write transaction in which max 2 bmbt 1507 * We are using the write transaction in which max 2 bmbt
1506 * updates are allowed 1508 * updates are allowed
1507 */ 1509 */
1508 error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, 1510 start_fsb = next_fsb;
1509 shift_fsb, &current_ext, 1511 error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
1510 &first_block, &free_list, 1512 &done, &next_fsb, &first_block, &free_list,
1511 XFS_BMAP_MAX_SHIFT_EXTENTS); 1513 XFS_BMAP_MAX_SHIFT_EXTENTS);
1512 if (error) 1514 if (error)
1513 goto out; 1515 goto out;
1514 1516
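Three collapse-range fixes land together here: post-EOF preallocation is trimmed first so uninitialized blocks cannot be shifted into the visible part of the file; the entire tail from the collapse point to EOF is written back and invalidated before any extent moves (the free above only covered the punched range itself); and the loop's cursor becomes a file offset, next_fsb, instead of an extent index, so it stays valid across transactions while the extent list is being reshuffled. Condensed from the hunk, the loop now resumes like this:

	next_fsb = XFS_B_TO_FSB(mp, offset + len);
	while (!error && !done) {
		/* allocate and reserve a transaction, join the inode... */
		start_fsb = next_fsb;
		error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb,
				&done, &next_fsb, &first_block, &free_list,
				XFS_BMAP_MAX_SHIFT_EXTENTS);
		if (error)
			goto out;
		/* commit; the next iteration picks up at next_fsb */
	}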
@@ -1557,14 +1559,14 @@ xfs_swap_extents_check_format(
1557 /* Should never get a local format */ 1559 /* Should never get a local format */
1558 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || 1560 if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1559 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) 1561 tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1560 return EINVAL; 1562 return -EINVAL;
1561 1563
1562 /* 1564 /*
1563 * if the target inode has less extents that then temporary inode then 1565 * if the target inode has less extents that then temporary inode then
1564 * why did userspace call us? 1566 * why did userspace call us?
1565 */ 1567 */
1566 if (ip->i_d.di_nextents < tip->i_d.di_nextents) 1568 if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1567 return EINVAL; 1569 return -EINVAL;
1568 1570
1569 /* 1571 /*
1570 * if the target inode is in extent form and the temp inode is in btree 1572 * if the target inode is in extent form and the temp inode is in btree
@@ -1573,19 +1575,19 @@ xfs_swap_extents_check_format(
1573 */ 1575 */
1574 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 1576 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1575 tip->i_d.di_format == XFS_DINODE_FMT_BTREE) 1577 tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1576 return EINVAL; 1578 return -EINVAL;
1577 1579
1578 /* Check temp in extent form to max in target */ 1580 /* Check temp in extent form to max in target */
1579 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 1581 if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1580 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > 1582 XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1581 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 1583 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1582 return EINVAL; 1584 return -EINVAL;
1583 1585
1584 /* Check target in extent form to max in temp */ 1586 /* Check target in extent form to max in temp */
1585 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && 1587 if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1586 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > 1588 XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1587 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 1589 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1588 return EINVAL; 1590 return -EINVAL;
1589 1591
1590 /* 1592 /*
1591 * If we are in a btree format, check that the temp root block will fit 1593 * If we are in a btree format, check that the temp root block will fit
@@ -1599,25 +1601,49 @@ xfs_swap_extents_check_format(
1599 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 1601 if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1600 if (XFS_IFORK_BOFF(ip) && 1602 if (XFS_IFORK_BOFF(ip) &&
1601 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip)) 1603 XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1602 return EINVAL; 1604 return -EINVAL;
1603 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= 1605 if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1604 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)) 1606 XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1605 return EINVAL; 1607 return -EINVAL;
1606 } 1608 }
1607 1609
1608 /* Reciprocal target->temp btree format checks */ 1610 /* Reciprocal target->temp btree format checks */
1609 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { 1611 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1610 if (XFS_IFORK_BOFF(tip) && 1612 if (XFS_IFORK_BOFF(tip) &&
1611 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip)) 1613 XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1612 return EINVAL; 1614 return -EINVAL;
1613 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= 1615 if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1614 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK)) 1616 XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1615 return EINVAL; 1617 return -EINVAL;
1616 } 1618 }
1617 1619
1618 return 0; 1620 return 0;
1619} 1621}
1620 1622
1623static int
1624xfs_swap_extent_flush(
1625 struct xfs_inode *ip)
1626{
1627 int error;
1628
1629 error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1630 if (error)
1631 return error;
1632 truncate_pagecache_range(VFS_I(ip), 0, -1);
1633
1634 /* Verify O_DIRECT for ftmp */
1635 if (VFS_I(ip)->i_mapping->nrpages)
1636 return -EINVAL;
1637
1638 /*
1639 * Don't try to swap extents on mmap()d files because we can't lock
1640 * out races against page faults safely.
1641 */
1642 if (mapping_mapped(VFS_I(ip)->i_mapping))
1643 return -EBUSY;
1644 return 0;
1645}
1646
1621int 1647int
1622xfs_swap_extents( 1648xfs_swap_extents(
1623 xfs_inode_t *ip, /* target inode */ 1649 xfs_inode_t *ip, /* target inode */
@@ -1633,51 +1659,57 @@ xfs_swap_extents(
1633 int aforkblks = 0; 1659 int aforkblks = 0;
1634 int taforkblks = 0; 1660 int taforkblks = 0;
1635 __uint64_t tmp; 1661 __uint64_t tmp;
1662 int lock_flags;
1636 1663
1637 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); 1664 tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1638 if (!tempifp) { 1665 if (!tempifp) {
1639 error = XFS_ERROR(ENOMEM); 1666 error = -ENOMEM;
1640 goto out; 1667 goto out;
1641 } 1668 }
1642 1669
1643 /* 1670 /*
1644 * we have to do two separate lock calls here to keep lockdep 1671 * Lock up the inodes against other IO and truncate to begin with.
1645 * happy. If we try to get all the locks in one call, lock will 1672 * Then we can ensure the inodes are flushed and have no page cache
1646 * report false positives when we drop the ILOCK and regain them 1673 * safely. Once we have done this we can take the ilocks and do the rest
1647 * below. 1674 * of the checks.
1648 */ 1675 */
1676 lock_flags = XFS_IOLOCK_EXCL;
1649 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); 1677 xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1650 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1651 1678
1652 /* Verify that both files have the same format */ 1679 /* Verify that both files have the same format */
1653 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { 1680 if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
1654 error = XFS_ERROR(EINVAL); 1681 error = -EINVAL;
1655 goto out_unlock; 1682 goto out_unlock;
1656 } 1683 }
1657 1684
1658 /* Verify both files are either real-time or non-realtime */ 1685 /* Verify both files are either real-time or non-realtime */
1659 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) { 1686 if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1660 error = XFS_ERROR(EINVAL); 1687 error = -EINVAL;
1661 goto out_unlock; 1688 goto out_unlock;
1662 } 1689 }
1663 1690
1664 error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); 1691 error = xfs_swap_extent_flush(ip);
1692 if (error)
1693 goto out_unlock;
1694 error = xfs_swap_extent_flush(tip);
1665 if (error) 1695 if (error)
1666 goto out_unlock; 1696 goto out_unlock;
1667 truncate_pagecache_range(VFS_I(tip), 0, -1);
1668 1697
1669 /* Verify O_DIRECT for ftmp */ 1698 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1670 if (VN_CACHED(VFS_I(tip)) != 0) { 1699 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1671 error = XFS_ERROR(EINVAL); 1700 if (error) {
1701 xfs_trans_cancel(tp, 0);
1672 goto out_unlock; 1702 goto out_unlock;
1673 } 1703 }
1704 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1705 lock_flags |= XFS_ILOCK_EXCL;
1674 1706
1675 /* Verify all data are being swapped */ 1707 /* Verify all data are being swapped */
1676 if (sxp->sx_offset != 0 || 1708 if (sxp->sx_offset != 0 ||
1677 sxp->sx_length != ip->i_d.di_size || 1709 sxp->sx_length != ip->i_d.di_size ||
1678 sxp->sx_length != tip->i_d.di_size) { 1710 sxp->sx_length != tip->i_d.di_size) {
1679 error = XFS_ERROR(EFAULT); 1711 error = -EFAULT;
1680 goto out_unlock; 1712 goto out_trans_cancel;
1681 } 1713 }
1682 1714
1683 trace_xfs_swap_extent_before(ip, 0); 1715 trace_xfs_swap_extent_before(ip, 0);
@@ -1689,7 +1721,7 @@ xfs_swap_extents(
1689 xfs_notice(mp, 1721 xfs_notice(mp,
1690 "%s: inode 0x%llx format is incompatible for exchanging.", 1722 "%s: inode 0x%llx format is incompatible for exchanging.",
1691 __func__, ip->i_ino); 1723 __func__, ip->i_ino);
1692 goto out_unlock; 1724 goto out_trans_cancel;
1693 } 1725 }
1694 1726
1695 /* 1727 /*
@@ -1703,43 +1735,9 @@ xfs_swap_extents(
1703 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || 1735 (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1704 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || 1736 (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1705 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { 1737 (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1706 error = XFS_ERROR(EBUSY); 1738 error = -EBUSY;
1707 goto out_unlock; 1739 goto out_trans_cancel;
1708 }
1709
1710 /* We need to fail if the file is memory mapped. Once we have tossed
1711 * all existing pages, the page fault will have no option
1712 * but to go to the filesystem for pages. By making the page fault call
1713 * vop_read (or write in the case of autogrow) they block on the iolock
1714 * until we have switched the extents.
1715 */
1716 if (VN_MAPPED(VFS_I(ip))) {
1717 error = XFS_ERROR(EBUSY);
1718 goto out_unlock;
1719 }
1720
1721 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1722 xfs_iunlock(tip, XFS_ILOCK_EXCL);
1723
1724 /*
1725 * There is a race condition here since we gave up the
1726 * ilock. However, the data fork will not change since
1727 * we have the iolock (locked for truncation too) so we
1728 * are safe. We don't really care if non-io related
1729 * fields change.
1730 */
1731 truncate_pagecache_range(VFS_I(ip), 0, -1);
1732
1733 tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1734 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1735 if (error) {
1736 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1737 xfs_iunlock(tip, XFS_IOLOCK_EXCL);
1738 xfs_trans_cancel(tp, 0);
1739 goto out;
1740 } 1740 }
1741 xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1742
1743 /* 1741 /*
1744 * Count the number of extended attribute blocks 1742 * Count the number of extended attribute blocks
1745 */ 1743 */
@@ -1757,8 +1755,8 @@ xfs_swap_extents(
1757 goto out_trans_cancel; 1755 goto out_trans_cancel;
1758 } 1756 }
1759 1757
1760 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1758 xfs_trans_ijoin(tp, ip, lock_flags);
1761 xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1759 xfs_trans_ijoin(tp, tip, lock_flags);
1762 1760
1763 /* 1761 /*
1764 * Before we've swapped the forks, lets set the owners of the forks 1762 * Before we've swapped the forks, lets set the owners of the forks
@@ -1887,8 +1885,8 @@ out:
1887 return error; 1885 return error;
1888 1886
1889out_unlock: 1887out_unlock:
1890 xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1888 xfs_iunlock(ip, lock_flags);
1891 xfs_iunlock(tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1889 xfs_iunlock(tip, lock_flags);
1892 goto out; 1890 goto out;
1893 1891
1894out_trans_cancel: 1892out_trans_cancel:
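The swap-extents rework replaces the old take-everything-then-drop-and-retake locking with a single forward progression: take the IOLOCKs, flush and invalidate both inodes through the new xfs_swap_extent_flush() helper (which also rejects mmap()ed files with -EBUSY before any work is done), reserve the transaction, and only then take the ILOCKs. lock_flags accumulates what is actually held so the shared out_unlock path always releases exactly that; condensed:

	lock_flags = XFS_IOLOCK_EXCL;
	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);

	/* flush both inodes; any failure goes to out_unlock (iolocks only) */

	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		goto out_unlock;
	}
	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
	lock_flags |= XFS_ILOCK_EXCL;

	/* ...checks under the ilock, then hand the locks to the transaction */
	xfs_trans_ijoin(tp, ip, lock_flags);
	xfs_trans_ijoin(tp, tip, lock_flags);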
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 7a34a1ae6552..24b4ebea0d4d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -130,7 +130,7 @@ xfs_buf_get_maps(
130 bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), 130 bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
131 KM_NOFS); 131 KM_NOFS);
132 if (!bp->b_maps) 132 if (!bp->b_maps)
133 return ENOMEM; 133 return -ENOMEM;
134 return 0; 134 return 0;
135} 135}
136 136
@@ -344,7 +344,7 @@ retry:
344 if (unlikely(page == NULL)) { 344 if (unlikely(page == NULL)) {
345 if (flags & XBF_READ_AHEAD) { 345 if (flags & XBF_READ_AHEAD) {
346 bp->b_page_count = i; 346 bp->b_page_count = i;
347 error = ENOMEM; 347 error = -ENOMEM;
348 goto out_free_pages; 348 goto out_free_pages;
349 } 349 }
350 350
@@ -465,7 +465,7 @@ _xfs_buf_find(
465 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 465 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
466 if (blkno >= eofs) { 466 if (blkno >= eofs) {
467 /* 467 /*
468 * XXX (dgc): we should really be returning EFSCORRUPTED here, 468 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
469 * but none of the higher level infrastructure supports 469 * but none of the higher level infrastructure supports
470 * returning a specific error on buffer lookup failures. 470 * returning a specific error on buffer lookup failures.
471 */ 471 */
@@ -623,10 +623,11 @@ _xfs_buf_read(
623 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); 623 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
624 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 624 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
625 625
626 xfs_buf_iorequest(bp); 626 if (flags & XBF_ASYNC) {
627 if (flags & XBF_ASYNC) 627 xfs_buf_submit(bp);
628 return 0; 628 return 0;
629 return xfs_buf_iowait(bp); 629 }
630 return xfs_buf_submit_wait(bp);
630} 631}
631 632
632xfs_buf_t * 633xfs_buf_t *
@@ -687,34 +688,39 @@ xfs_buf_readahead_map(
687 * Read an uncached buffer from disk. Allocates and returns a locked 688 * Read an uncached buffer from disk. Allocates and returns a locked
688 * buffer containing the disk contents or nothing. 689 * buffer containing the disk contents or nothing.
689 */ 690 */
690struct xfs_buf * 691int
691xfs_buf_read_uncached( 692xfs_buf_read_uncached(
692 struct xfs_buftarg *target, 693 struct xfs_buftarg *target,
693 xfs_daddr_t daddr, 694 xfs_daddr_t daddr,
694 size_t numblks, 695 size_t numblks,
695 int flags, 696 int flags,
697 struct xfs_buf **bpp,
696 const struct xfs_buf_ops *ops) 698 const struct xfs_buf_ops *ops)
697{ 699{
698 struct xfs_buf *bp; 700 struct xfs_buf *bp;
699 701
702 *bpp = NULL;
703
700 bp = xfs_buf_get_uncached(target, numblks, flags); 704 bp = xfs_buf_get_uncached(target, numblks, flags);
701 if (!bp) 705 if (!bp)
702 return NULL; 706 return -ENOMEM;
703 707
704 /* set up the buffer for a read IO */ 708 /* set up the buffer for a read IO */
705 ASSERT(bp->b_map_count == 1); 709 ASSERT(bp->b_map_count == 1);
706 bp->b_bn = daddr; 710 bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
707 bp->b_maps[0].bm_bn = daddr; 711 bp->b_maps[0].bm_bn = daddr;
708 bp->b_flags |= XBF_READ; 712 bp->b_flags |= XBF_READ;
709 bp->b_ops = ops; 713 bp->b_ops = ops;
710 714
711 if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { 715 xfs_buf_submit_wait(bp);
716 if (bp->b_error) {
717 int error = bp->b_error;
712 xfs_buf_relse(bp); 718 xfs_buf_relse(bp);
713 return NULL; 719 return error;
714 } 720 }
715 xfs_buf_iorequest(bp); 721
716 xfs_buf_iowait(bp); 722 *bpp = bp;
717 return bp; 723 return 0;
718} 724}
719 725
720/* 726/*
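xfs_buf_read_uncached() used to return the buffer or NULL, which conflated allocation failure with I/O error and left callers guessing at an errno. It now returns a negative errno and passes the buffer back through an out parameter; note too that b_bn is left at XFS_BUF_DADDR_NULL for uncached buffers, which the verifier-warning hunk added to _xfs_buf_ioapply() below relies on to leave them alone. Caller conversion pattern:

	/* before: NULL on any failure, cause unknown */
	bp = xfs_buf_read_uncached(target, daddr, numblks, flags, ops);
	if (!bp)
		return ENOMEM;		/* a guess */

	/* after: precise negative errno, buffer via out parameter */
	error = xfs_buf_read_uncached(target, daddr, numblks, flags,
				      &bp, ops);
	if (error)
		return error;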
@@ -998,53 +1004,56 @@ xfs_buf_wait_unpin(
998 * Buffer Utility Routines 1004 * Buffer Utility Routines
999 */ 1005 */
1000 1006
1001STATIC void 1007void
1002xfs_buf_iodone_work( 1008xfs_buf_ioend(
1003 struct work_struct *work) 1009 struct xfs_buf *bp)
1004{ 1010{
1005 struct xfs_buf *bp = 1011 bool read = bp->b_flags & XBF_READ;
1006 container_of(work, xfs_buf_t, b_iodone_work); 1012
1007 bool read = !!(bp->b_flags & XBF_READ); 1013 trace_xfs_buf_iodone(bp, _RET_IP_);
1008 1014
1009 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); 1015 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1010 1016
1011 /* only validate buffers that were read without errors */ 1017 /*
1012 if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) 1018 * Pull in IO completion errors now. We are guaranteed to be running
1019 * single threaded, so we don't need the lock to read b_io_error.
1020 */
1021 if (!bp->b_error && bp->b_io_error)
1022 xfs_buf_ioerror(bp, bp->b_io_error);
1023
1024 /* Only validate buffers that were read without errors */
1025 if (read && !bp->b_error && bp->b_ops) {
1026 ASSERT(!bp->b_iodone);
1013 bp->b_ops->verify_read(bp); 1027 bp->b_ops->verify_read(bp);
1028 }
1029
1030 if (!bp->b_error)
1031 bp->b_flags |= XBF_DONE;
1014 1032
1015 if (bp->b_iodone) 1033 if (bp->b_iodone)
1016 (*(bp->b_iodone))(bp); 1034 (*(bp->b_iodone))(bp);
1017 else if (bp->b_flags & XBF_ASYNC) 1035 else if (bp->b_flags & XBF_ASYNC)
1018 xfs_buf_relse(bp); 1036 xfs_buf_relse(bp);
1019 else { 1037 else
1020 ASSERT(read && bp->b_ops);
1021 complete(&bp->b_iowait); 1038 complete(&bp->b_iowait);
1022 }
1023} 1039}
1024 1040
1025void 1041static void
1026xfs_buf_ioend( 1042xfs_buf_ioend_work(
1027 struct xfs_buf *bp, 1043 struct work_struct *work)
1028 int schedule)
1029{ 1044{
1030 bool read = !!(bp->b_flags & XBF_READ); 1045 struct xfs_buf *bp =
1031 1046 container_of(work, xfs_buf_t, b_iodone_work);
1032 trace_xfs_buf_iodone(bp, _RET_IP_);
1033 1047
1034 if (bp->b_error == 0) 1048 xfs_buf_ioend(bp);
1035 bp->b_flags |= XBF_DONE; 1049}
1036 1050
1037 if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { 1051void
1038 if (schedule) { 1052xfs_buf_ioend_async(
1039 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1053 struct xfs_buf *bp)
1040 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1054{
1041 } else { 1055 INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work);
1042 xfs_buf_iodone_work(&bp->b_iodone_work); 1056 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1043 }
1044 } else {
1045 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1046 complete(&bp->b_iowait);
1047 }
1048} 1057}
1049 1058
1050void 1059void
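Completion is restructured around a single xfs_buf_ioend(), scheduled onto the workqueue through xfs_buf_ioend_async() when it cannot run in the current context, and per-bio errors are first parked in the new b_io_error field under b_lock so a multi-bio buffer keeps the first failure instead of whichever bio finished last. A self-contained toy of that first-error-wins aggregation (names are illustrative, not kernel API):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	struct buf {
		pthread_mutex_t	lock;
		int		io_error;	/* first bio error; 0 if none */
		atomic_int	io_remaining;	/* outstanding bios + 1 */
	};

	static void bio_end_io(struct buf *bp, int error)
	{
		if (error) {
			pthread_mutex_lock(&bp->lock);
			if (!bp->io_error)
				bp->io_error = error;	/* never overwrite */
			pthread_mutex_unlock(&bp->lock);
		}
		/* the last reference out of the gate triggers completion */
		if (atomic_fetch_sub(&bp->io_remaining, 1) == 1)
			printf("ioend: error=%d\n", bp->io_error);
	}

ioend itself then copies b_io_error into b_error with no locking, which is safe because by that point it is guaranteed to run single-threaded, as the hunk's comment notes.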
@@ -1052,8 +1061,8 @@ xfs_buf_ioerror(
1052 xfs_buf_t *bp, 1061 xfs_buf_t *bp,
1053 int error) 1062 int error)
1054{ 1063{
1055 ASSERT(error >= 0 && error <= 0xffff); 1064 ASSERT(error <= 0 && error >= -1000);
1056 bp->b_error = (unsigned short)error; 1065 bp->b_error = error;
1057 trace_xfs_buf_ioerror(bp, error, _RET_IP_); 1066 trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1058} 1067}
1059 1068
@@ -1064,97 +1073,7 @@ xfs_buf_ioerror_alert(
1064{ 1073{
1065 xfs_alert(bp->b_target->bt_mount, 1074 xfs_alert(bp->b_target->bt_mount,
1066"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d", 1075"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
1067 (__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length); 1076 (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
1068}
1069
1070/*
1071 * Called when we want to stop a buffer from getting written or read.
1072 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1073 * so that the proper iodone callbacks get called.
1074 */
1075STATIC int
1076xfs_bioerror(
1077 xfs_buf_t *bp)
1078{
1079#ifdef XFSERRORDEBUG
1080 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1081#endif
1082
1083 /*
1084 * No need to wait until the buffer is unpinned, we aren't flushing it.
1085 */
1086 xfs_buf_ioerror(bp, EIO);
1087
1088 /*
1089 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1090 */
1091 XFS_BUF_UNREAD(bp);
1092 XFS_BUF_UNDONE(bp);
1093 xfs_buf_stale(bp);
1094
1095 xfs_buf_ioend(bp, 0);
1096
1097 return EIO;
1098}
1099
1100/*
1101 * Same as xfs_bioerror, except that we are releasing the buffer
1102 * here ourselves, and avoiding the xfs_buf_ioend call.
1103 * This is meant for userdata errors; metadata bufs come with
1104 * iodone functions attached, so that we can track down errors.
1105 */
1106int
1107xfs_bioerror_relse(
1108 struct xfs_buf *bp)
1109{
1110 int64_t fl = bp->b_flags;
1111 /*
1112 * No need to wait until the buffer is unpinned.
1113 * We aren't flushing it.
1114 *
1115 * chunkhold expects B_DONE to be set, whether
1116 * we actually finish the I/O or not. We don't want to
1117 * change that interface.
1118 */
1119 XFS_BUF_UNREAD(bp);
1120 XFS_BUF_DONE(bp);
1121 xfs_buf_stale(bp);
1122 bp->b_iodone = NULL;
1123 if (!(fl & XBF_ASYNC)) {
1124 /*
1125 * Mark b_error and B_ERROR _both_.
1126 * Lot's of chunkcache code assumes that.
1127 * There's no reason to mark error for
1128 * ASYNC buffers.
1129 */
1130 xfs_buf_ioerror(bp, EIO);
1131 complete(&bp->b_iowait);
1132 } else {
1133 xfs_buf_relse(bp);
1134 }
1135
1136 return EIO;
1137}
1138
1139STATIC int
1140xfs_bdstrat_cb(
1141 struct xfs_buf *bp)
1142{
1143 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1144 trace_xfs_bdstrat_shut(bp, _RET_IP_);
1145 /*
1146 * Metadata write that didn't get logged but
1147 * written delayed anyway. These aren't associated
1148 * with a transaction, and can be ignored.
1149 */
1150 if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1151 return xfs_bioerror_relse(bp);
1152 else
1153 return xfs_bioerror(bp);
1154 }
1155
1156 xfs_buf_iorequest(bp);
1157 return 0;
1158} 1077}
1159 1078
1160int 1079int
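xfs_bioerror(), xfs_bioerror_relse() and the xfs_bdstrat_cb() wrapper are deleted outright: their real job was failing a buffer once the filesystem had shut down, and that check now sits at the top of the two new submission paths, which stale the buffer and fail it with -EIO before any bio is built. Sketched from the replacement code in xfs_buf_submit():

	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_ioend(bp);	/* async path completes in place */
		return;			/* the sync path returns -EIO here */
	}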
@@ -1166,11 +1085,10 @@ xfs_bwrite(
1166 ASSERT(xfs_buf_islocked(bp)); 1085 ASSERT(xfs_buf_islocked(bp));
1167 1086
1168 bp->b_flags |= XBF_WRITE; 1087 bp->b_flags |= XBF_WRITE;
1169 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); 1088 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
1170 1089 XBF_WRITE_FAIL | XBF_DONE);
1171 xfs_bdstrat_cb(bp);
1172 1090
1173 error = xfs_buf_iowait(bp); 1091 error = xfs_buf_submit_wait(bp);
1174 if (error) { 1092 if (error) {
1175 xfs_force_shutdown(bp->b_target->bt_mount, 1093 xfs_force_shutdown(bp->b_target->bt_mount,
1176 SHUTDOWN_META_IO_ERROR); 1094 SHUTDOWN_META_IO_ERROR);
@@ -1179,15 +1097,6 @@ xfs_bwrite(
1179} 1097}
1180 1098
1181STATIC void 1099STATIC void
1182_xfs_buf_ioend(
1183 xfs_buf_t *bp,
1184 int schedule)
1185{
1186 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1187 xfs_buf_ioend(bp, schedule);
1188}
1189
1190STATIC void
1191xfs_buf_bio_end_io( 1100xfs_buf_bio_end_io(
1192 struct bio *bio, 1101 struct bio *bio,
1193 int error) 1102 int error)
@@ -1198,13 +1107,18 @@ xfs_buf_bio_end_io(
1198 * don't overwrite existing errors - otherwise we can lose errors on 1107 * don't overwrite existing errors - otherwise we can lose errors on
1199 * buffers that require multiple bios to complete. 1108 * buffers that require multiple bios to complete.
1200 */ 1109 */
1201 if (!bp->b_error) 1110 if (error) {
1202 xfs_buf_ioerror(bp, -error); 1111 spin_lock(&bp->b_lock);
1112 if (!bp->b_io_error)
1113 bp->b_io_error = error;
1114 spin_unlock(&bp->b_lock);
1115 }
1203 1116
1204 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1117 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1205 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1118 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1206 1119
1207 _xfs_buf_ioend(bp, 1); 1120 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1121 xfs_buf_ioend_async(bp);
1208 bio_put(bio); 1122 bio_put(bio);
1209} 1123}
1210 1124
@@ -1283,10 +1197,10 @@ next_chunk:
1283 } else { 1197 } else {
1284 /* 1198 /*
1285 * This is guaranteed not to be the last io reference count 1199 * This is guaranteed not to be the last io reference count
1286 * because the caller (xfs_buf_iorequest) holds a count itself. 1200 * because the caller (xfs_buf_submit) holds a count itself.
1287 */ 1201 */
1288 atomic_dec(&bp->b_io_remaining); 1202 atomic_dec(&bp->b_io_remaining);
1289 xfs_buf_ioerror(bp, EIO); 1203 xfs_buf_ioerror(bp, -EIO);
1290 bio_put(bio); 1204 bio_put(bio);
1291 } 1205 }
1292 1206
@@ -1330,6 +1244,20 @@ _xfs_buf_ioapply(
1330 SHUTDOWN_CORRUPT_INCORE); 1244 SHUTDOWN_CORRUPT_INCORE);
1331 return; 1245 return;
1332 } 1246 }
1247 } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
1248 struct xfs_mount *mp = bp->b_target->bt_mount;
1249
1250 /*
1251 * non-crc filesystems don't attach verifiers during
1252 * log recovery, so don't warn for such filesystems.
1253 */
1254 if (xfs_sb_version_hascrc(&mp->m_sb)) {
1255 xfs_warn(mp,
1256 "%s: no ops on block 0x%llx/0x%x",
1257 __func__, bp->b_bn, bp->b_length);
1258 xfs_hex_dump(bp->b_addr, 64);
1259 dump_stack();
1260 }
1333 } 1261 }
1334 } else if (bp->b_flags & XBF_READ_AHEAD) { 1262 } else if (bp->b_flags & XBF_READ_AHEAD) {
1335 rw = READA; 1263 rw = READA;
@@ -1359,53 +1287,131 @@ _xfs_buf_ioapply(
1359 blk_finish_plug(&plug); 1287 blk_finish_plug(&plug);
1360} 1288}
1361 1289
1290/*
1291 * Asynchronous IO submission path. This transfers the buffer lock ownership and
1292 * the current reference to the IO. It is not safe to reference the buffer after
1293 * a call to this function unless the caller holds an additional reference
1294 * itself.
1295 */
1362void 1296void
1363xfs_buf_iorequest( 1297xfs_buf_submit(
1364 xfs_buf_t *bp) 1298 struct xfs_buf *bp)
1365{ 1299{
1366 trace_xfs_buf_iorequest(bp, _RET_IP_); 1300 trace_xfs_buf_submit(bp, _RET_IP_);
1367 1301
1368 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1302 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1303 ASSERT(bp->b_flags & XBF_ASYNC);
1304
1305 /* on shutdown we stale and complete the buffer immediately */
1306 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1307 xfs_buf_ioerror(bp, -EIO);
1308 bp->b_flags &= ~XBF_DONE;
1309 xfs_buf_stale(bp);
1310 xfs_buf_ioend(bp);
1311 return;
1312 }
1369 1313
1370 if (bp->b_flags & XBF_WRITE) 1314 if (bp->b_flags & XBF_WRITE)
1371 xfs_buf_wait_unpin(bp); 1315 xfs_buf_wait_unpin(bp);
1316
1317 /* clear the internal error state to avoid spurious errors */
1318 bp->b_io_error = 0;
1319
1320 /*
1321 * The caller's reference is released during I/O completion.
1322 * This occurs some time after the last b_io_remaining reference is
1323 * released, so after we drop our Io reference we have to have some
1324 * other reference to ensure the buffer doesn't go away from underneath
1325 * us. Take a direct reference to ensure we have safe access to the
1326 * buffer until we are finished with it.
1327 */
1372 xfs_buf_hold(bp); 1328 xfs_buf_hold(bp);
1373 1329
1374 /* 1330 /*
1375 * Set the count to 1 initially, this will stop an I/O 1331 * Set the count to 1 initially, this will stop an I/O completion
1376 * completion callout which happens before we have started 1332 * callout which happens before we have started all the I/O from calling
1377 * all the I/O from calling xfs_buf_ioend too early. 1333 * xfs_buf_ioend too early.
1378 */ 1334 */
1379 atomic_set(&bp->b_io_remaining, 1); 1335 atomic_set(&bp->b_io_remaining, 1);
1380 _xfs_buf_ioapply(bp); 1336 _xfs_buf_ioapply(bp);
1337
1381 /* 1338 /*
1382 * If _xfs_buf_ioapply failed, we'll get back here with 1339 * If _xfs_buf_ioapply failed, we can get back here with only the IO
1383 * only the reference we took above. _xfs_buf_ioend will 1340 * reference we took above. If we drop it to zero, run completion so
1384 * drop it to zero, so we'd better not queue it for later, 1341 * that we don't return to the caller with completion still pending.
1385 * or we'll free it before it's done.
1386 */ 1342 */
1387 _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); 1343 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1344 if (bp->b_error)
1345 xfs_buf_ioend(bp);
1346 else
1347 xfs_buf_ioend_async(bp);
1348 }
1388 1349
1389 xfs_buf_rele(bp); 1350 xfs_buf_rele(bp);
1351 /* Note: it is not safe to reference bp now we've dropped our ref */
1390} 1352}
1391 1353
1392/* 1354/*
1393 * Waits for I/O to complete on the buffer supplied. It returns immediately if 1355 * Synchronous buffer IO submission path, read or write.
1394 * no I/O is pending or there is already a pending error on the buffer, in which
1395 * case nothing will ever complete. It returns the I/O error code, if any, or
1396 * 0 if there was no error.
1397 */ 1356 */
1398int 1357int
1399xfs_buf_iowait( 1358xfs_buf_submit_wait(
1400 xfs_buf_t *bp) 1359 struct xfs_buf *bp)
1401{ 1360{
1402 trace_xfs_buf_iowait(bp, _RET_IP_); 1361 int error;
1403 1362
1404 if (!bp->b_error) 1363 trace_xfs_buf_submit_wait(bp, _RET_IP_);
1405 wait_for_completion(&bp->b_iowait); 1364
1365 ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
1366
1367 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1368 xfs_buf_ioerror(bp, -EIO);
1369 xfs_buf_stale(bp);
1370 bp->b_flags &= ~XBF_DONE;
1371 return -EIO;
1372 }
1373
1374 if (bp->b_flags & XBF_WRITE)
1375 xfs_buf_wait_unpin(bp);
1376
1377 /* clear the internal error state to avoid spurious errors */
1378 bp->b_io_error = 0;
1379
1380 /*
1381 * For synchronous IO, the IO does not inherit the submitters reference
1382 * count, nor the buffer lock. Hence we cannot release the reference we
1383 * are about to take until we've waited for all IO completion to occur,
1384 * including any xfs_buf_ioend_async() work that may be pending.
1385 */
1386 xfs_buf_hold(bp);
1387
1388 /*
1389 * Set the count to 1 initially, this will stop an I/O completion
1390 * callout which happens before we have started all the I/O from calling
1391 * xfs_buf_ioend too early.
1392 */
1393 atomic_set(&bp->b_io_remaining, 1);
1394 _xfs_buf_ioapply(bp);
1395
1396 /*
1397 * make sure we run completion synchronously if it raced with us and is
1398 * already complete.
1399 */
1400 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1401 xfs_buf_ioend(bp);
1406 1402
1403 /* wait for completion before gathering the error from the buffer */
1404 trace_xfs_buf_iowait(bp, _RET_IP_);
1405 wait_for_completion(&bp->b_iowait);
1407 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1406 trace_xfs_buf_iowait_done(bp, _RET_IP_);
1408 return bp->b_error; 1407 error = bp->b_error;
1408
1409 /*
1410 * all done now, we can release the hold that keeps the buffer
1411 * referenced for the entire IO.
1412 */
1413 xfs_buf_rele(bp);
1414 return error;
1409} 1415}
1410 1416
1411xfs_caddr_t 1417xfs_caddr_t
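The split into xfs_buf_submit() and xfs_buf_submit_wait() makes the ownership rules explicit: asynchronous submission hands the buffer lock and the caller's reference over to the I/O, so the buffer must not be touched afterwards without an extra hold, while the synchronous path keeps both with the caller and returns b_error directly. The delwri conversion later in this file shows the resulting async-then-wait idiom:

	xfs_buf_hold(bp);	/* keep bp valid across the async I/O */
	xfs_buf_submit(bp);	/* lock + original ref now belong to the I/O */

	xfs_buf_lock(bp);	/* completion unlocks, so this is the wait */
	error = bp->b_error;
	xfs_buf_relse(bp);	/* drops our extra hold and the lock */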
@@ -1628,7 +1634,7 @@ xfs_setsize_buftarg(
1628 xfs_warn(btp->bt_mount, 1634 xfs_warn(btp->bt_mount,
1629 "Cannot set_blocksize to %u on device %s", 1635 "Cannot set_blocksize to %u on device %s",
1630 sectorsize, name); 1636 sectorsize, name);
1631 return EINVAL; 1637 return -EINVAL;
1632 } 1638 }
1633 1639
1634 /* Set up device logical sector size mask */ 1640 /* Set up device logical sector size mask */
@@ -1664,8 +1670,6 @@ xfs_alloc_buftarg(
1664 btp->bt_dev = bdev->bd_dev; 1670 btp->bt_dev = bdev->bd_dev;
1665 btp->bt_bdev = bdev; 1671 btp->bt_bdev = bdev;
1666 btp->bt_bdi = blk_get_backing_dev_info(bdev); 1672 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1667 if (!btp->bt_bdi)
1668 goto error;
1669 1673
1670 if (xfs_setsize_buftarg_early(btp, bdev)) 1674 if (xfs_setsize_buftarg_early(btp, bdev))
1671 goto error; 1675 goto error;
@@ -1799,13 +1803,19 @@ __xfs_buf_delwri_submit(
1799 blk_start_plug(&plug); 1803 blk_start_plug(&plug);
1800 list_for_each_entry_safe(bp, n, io_list, b_list) { 1804 list_for_each_entry_safe(bp, n, io_list, b_list) {
1801 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); 1805 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
1802 bp->b_flags |= XBF_WRITE; 1806 bp->b_flags |= XBF_WRITE | XBF_ASYNC;
1803 1807
1804 if (!wait) { 1808 /*
1805 bp->b_flags |= XBF_ASYNC; 1809 * we do all Io submission async. This means if we need to wait
1810 * for IO completion we need to take an extra reference so the
1811 * buffer is still valid on the other side.
1812 */
1813 if (wait)
1814 xfs_buf_hold(bp);
1815 else
1806 list_del_init(&bp->b_list); 1816 list_del_init(&bp->b_list);
1807 } 1817
1808 xfs_bdstrat_cb(bp); 1818 xfs_buf_submit(bp);
1809 } 1819 }
1810 blk_finish_plug(&plug); 1820 blk_finish_plug(&plug);
1811 1821
@@ -1852,7 +1862,10 @@ xfs_buf_delwri_submit(
1852 bp = list_first_entry(&io_list, struct xfs_buf, b_list); 1862 bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1853 1863
1854 list_del_init(&bp->b_list); 1864 list_del_init(&bp->b_list);
1855 error2 = xfs_buf_iowait(bp); 1865
1866 /* locking the buffer will wait for async IO completion. */
1867 xfs_buf_lock(bp);
1868 error2 = bp->b_error;
1856 xfs_buf_relse(bp); 1869 xfs_buf_relse(bp);
1857 if (!error) 1870 if (!error)
1858 error = error2; 1871 error = error2;
@@ -1870,7 +1883,7 @@ xfs_buf_init(void)
1870 goto out; 1883 goto out;
1871 1884
1872 xfslogd_workqueue = alloc_workqueue("xfslogd", 1885 xfslogd_workqueue = alloc_workqueue("xfslogd",
1873 WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); 1886 WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1);
1874 if (!xfslogd_workqueue) 1887 if (!xfslogd_workqueue)
1875 goto out_free_buf_zone; 1888 goto out_free_buf_zone;
1876 1889
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3a7a5523d3dc..82002c00af90 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -158,6 +158,7 @@ typedef struct xfs_buf {
158 struct list_head b_lru; /* lru list */ 158 struct list_head b_lru; /* lru list */
159 spinlock_t b_lock; /* internal state lock */ 159 spinlock_t b_lock; /* internal state lock */
160 unsigned int b_state; /* internal state flags */ 160 unsigned int b_state; /* internal state flags */
161 int b_io_error; /* internal IO error state */
161 wait_queue_head_t b_waiters; /* unpin waiters */ 162 wait_queue_head_t b_waiters; /* unpin waiters */
162 struct list_head b_list; 163 struct list_head b_list;
163 struct xfs_perag *b_pag; /* contains rbtree root */ 164 struct xfs_perag *b_pag; /* contains rbtree root */
@@ -178,7 +179,7 @@ typedef struct xfs_buf {
178 atomic_t b_io_remaining; /* #outstanding I/O requests */ 179 atomic_t b_io_remaining; /* #outstanding I/O requests */
179 unsigned int b_page_count; /* size of page array */ 180 unsigned int b_page_count; /* size of page array */
180 unsigned int b_offset; /* page offset in first page */ 181 unsigned int b_offset; /* page offset in first page */
181 unsigned short b_error; /* error code on I/O */ 182 int b_error; /* error code on I/O */
182 const struct xfs_buf_ops *b_ops; 183 const struct xfs_buf_ops *b_ops;
183 184
184#ifdef XFS_BUF_LOCK_TRACKING 185#ifdef XFS_BUF_LOCK_TRACKING
@@ -268,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
268 269
269struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, 270struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
270 int flags); 271 int flags);
271struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, 272int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr,
272 xfs_daddr_t daddr, size_t numblks, int flags, 273 size_t numblks, int flags, struct xfs_buf **bpp,
273 const struct xfs_buf_ops *ops); 274 const struct xfs_buf_ops *ops);
274void xfs_buf_hold(struct xfs_buf *bp); 275void xfs_buf_hold(struct xfs_buf *bp);
275 276
276/* Releasing Buffers */ 277/* Releasing Buffers */
@@ -286,18 +287,16 @@ extern void xfs_buf_unlock(xfs_buf_t *);
286 287
287/* Buffer Read and Write Routines */ 288/* Buffer Read and Write Routines */
288extern int xfs_bwrite(struct xfs_buf *bp); 289extern int xfs_bwrite(struct xfs_buf *bp);
289extern void xfs_buf_ioend(xfs_buf_t *, int); 290extern void xfs_buf_ioend(struct xfs_buf *bp);
290extern void xfs_buf_ioerror(xfs_buf_t *, int); 291extern void xfs_buf_ioerror(xfs_buf_t *, int);
291extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); 292extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
292extern void xfs_buf_iorequest(xfs_buf_t *); 293extern void xfs_buf_submit(struct xfs_buf *bp);
293extern int xfs_buf_iowait(xfs_buf_t *); 294extern int xfs_buf_submit_wait(struct xfs_buf *bp);
294extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, 295extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
295 xfs_buf_rw_t); 296 xfs_buf_rw_t);
296#define xfs_buf_zero(bp, off, len) \ 297#define xfs_buf_zero(bp, off, len) \
297 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 298 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
298 299
299extern int xfs_bioerror_relse(struct xfs_buf *);
300
301/* Buffer Utility Routines */ 300/* Buffer Utility Routines */
302extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 301extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
303 302
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 4654338b03fc..f15969543326 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -488,10 +488,10 @@ xfs_buf_item_unpin(
488 xfs_buf_lock(bp); 488 xfs_buf_lock(bp);
489 xfs_buf_hold(bp); 489 xfs_buf_hold(bp);
490 bp->b_flags |= XBF_ASYNC; 490 bp->b_flags |= XBF_ASYNC;
491 xfs_buf_ioerror(bp, EIO); 491 xfs_buf_ioerror(bp, -EIO);
492 XFS_BUF_UNDONE(bp); 492 XFS_BUF_UNDONE(bp);
493 xfs_buf_stale(bp); 493 xfs_buf_stale(bp);
494 xfs_buf_ioend(bp, 0); 494 xfs_buf_ioend(bp);
495 } 495 }
496} 496}
497 497
@@ -501,7 +501,7 @@ xfs_buf_item_unpin(
501 * buffer being bad.. 501 * buffer being bad..
502 */ 502 */
503 503
504DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); 504static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
505 505
506STATIC uint 506STATIC uint
507xfs_buf_item_push( 507xfs_buf_item_push(
@@ -725,7 +725,7 @@ xfs_buf_item_get_format(
725 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format), 725 bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
726 KM_SLEEP); 726 KM_SLEEP);
727 if (!bip->bli_formats) 727 if (!bip->bli_formats)
728 return ENOMEM; 728 return -ENOMEM;
729 return 0; 729 return 0;
730} 730}
731 731
@@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks(
1081 * a way to shut the filesystem down if the writes keep failing. 1081 * a way to shut the filesystem down if the writes keep failing.
1082 * 1082 *
1083 * In practice we'll shut the filesystem down soon as non-transient 1083 * In practice we'll shut the filesystem down soon as non-transient
1084 * erorrs tend to affect the whole device and a failing log write 1084 * errors tend to affect the whole device and a failing log write
1085 * will make us give up. But we really ought to do better here. 1085 * will make us give up. But we really ought to do better here.
1086 */ 1086 */
1087 if (XFS_BUF_ISASYNC(bp)) { 1087 if (XFS_BUF_ISASYNC(bp)) {
@@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks(
1094 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { 1094 if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
1095 bp->b_flags |= XBF_WRITE | XBF_ASYNC | 1095 bp->b_flags |= XBF_WRITE | XBF_ASYNC |
1096 XBF_DONE | XBF_WRITE_FAIL; 1096 XBF_DONE | XBF_WRITE_FAIL;
1097 xfs_buf_iorequest(bp); 1097 xfs_buf_submit(bp);
1098 } else { 1098 } else {
1099 xfs_buf_relse(bp); 1099 xfs_buf_relse(bp);
1100 } 1100 }
@@ -1115,7 +1115,7 @@ do_callbacks:
1115 xfs_buf_do_callbacks(bp); 1115 xfs_buf_do_callbacks(bp);
1116 bp->b_fspriv = NULL; 1116 bp->b_fspriv = NULL;
1117 bp->b_iodone = NULL; 1117 bp->b_iodone = NULL;
1118 xfs_buf_ioend(bp, 0); 1118 xfs_buf_ioend(bp);
1119} 1119}
1120 1120
1121/* 1121/*
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 48e99afb9cb0..f1b69edcdf31 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -95,7 +95,7 @@ xfs_dir2_sf_getdents(
 	 */
 	if (dp->i_d.di_size < offsetof(xfs_dir2_sf_hdr_t, parent)) {
 		ASSERT(XFS_FORCED_SHUTDOWN(dp->i_mount));
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
@@ -677,7 +677,7 @@ xfs_readdir(
 	trace_xfs_readdir(dp);
 
 	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
 	ASSERT(S_ISDIR(dp->i_d.di_mode));
 	XFS_STATS_INC(xs_dir_getdents);
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 4f11ef011139..13d08a1b390e 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -124,7 +124,7 @@ xfs_trim_extents(
 		}
 
 		trace_xfs_discard_extent(mp, agno, fbno, flen);
-		error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
+		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
 		if (error)
 			goto out_del_cursor;
 		*blocks_trimmed += flen;
@@ -166,11 +166,11 @@ xfs_ioc_trim(
 	int			error, last_error = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (!blk_queue_discard(q))
-		return -XFS_ERROR(EOPNOTSUPP);
+		return -EOPNOTSUPP;
 	if (copy_from_user(&range, urange, sizeof(range)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	/*
 	 * Truncating down the len isn't actually quite correct, but using
@@ -182,7 +182,7 @@ xfs_ioc_trim(
 	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
 	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
 	    range.len < mp->m_sb.sb_blocksize)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	start = BTOBB(range.start);
 	end = start + BTOBBT(range.len) - 1;
@@ -195,7 +195,7 @@ xfs_ioc_trim(
 	end_agno = xfs_daddr_to_agno(mp, end);
 
 	for (agno = start_agno; agno <= end_agno; agno++) {
-		error = -xfs_trim_extents(mp, agno, start, end, minlen,
+		error = xfs_trim_extents(mp, agno, start, end, minlen,
 					  &blocks_trimmed);
 		if (error)
 			last_error = error;
@@ -206,7 +206,7 @@ xfs_ioc_trim(
 
 	range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
 	if (copy_to_user(urange, &range, sizeof(range)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -222,11 +222,11 @@ xfs_discard_extents(
 		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
 					 busyp->length);
 
-		error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+		error = blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
 				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
 				XFS_FSB_TO_BB(mp, busyp->length),
 				GFP_NOFS, 0);
-		if (error && error != EOPNOTSUPP) {
+		if (error && error != -EOPNOTSUPP) {
 			xfs_info(mp,
 	"discard failed for extent [0x%llu,%u], error %d",
 				(unsigned long long)busyp->bno,
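
These xfs_discard.c hunks show the pattern the whole series follows: the block layer already returns negative errnos, so the old code negated them into XFS's positive-errno convention and negated them back at the syscall boundary. With XFS using negative errnos throughout, the double negation simply disappears. A hedged before/after illustration (not part of the patch):

	/* before: flip the block layer's -EOPNOTSUPP into positive EOPNOTSUPP */
	error = -blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);

	/* after: keep the negative errno as-is, compare against -EOPNOTSUPP */
	error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS, 0);
	if (error && error != -EOPNOTSUPP)
		xfs_info(mp, "discard failed, error %d", error);
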
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3ee0cd43edc0..63c2de49f61d 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -327,7 +327,7 @@ xfs_qm_dqalloc(
 	 */
 	if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) {
 		xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-		return (ESRCH);
+		return -ESRCH;
 	}
 
 	xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL);
@@ -354,7 +354,7 @@ xfs_qm_dqalloc(
 			       mp->m_quotainfo->qi_dqchunklen,
 			       0);
 	if (!bp) {
-		error = ENOMEM;
+		error = -ENOMEM;
 		goto error1;
 	}
 	bp->b_ops = &xfs_dquot_buf_ops;
@@ -400,7 +400,7 @@ xfs_qm_dqalloc(
 error0:
 	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 
-	return (error);
+	return error;
 }
 
 STATIC int
@@ -426,7 +426,7 @@ xfs_qm_dqrepair(
 
 	if (error) {
 		ASSERT(*bpp == NULL);
-		return XFS_ERROR(error);
+		return error;
 	}
 	(*bpp)->b_ops = &xfs_dquot_buf_ops;
 
@@ -442,7 +442,7 @@ xfs_qm_dqrepair(
 		if (error) {
 			/* repair failed, we're screwed */
 			xfs_trans_brelse(tp, *bpp);
-			return XFS_ERROR(EIO);
+			return -EIO;
 		}
 	}
 
@@ -480,7 +480,7 @@ xfs_qm_dqtobp(
 		 * didn't have the quota inode lock.
 		 */
 		xfs_iunlock(quotip, lock_mode);
-		return ESRCH;
+		return -ESRCH;
 	}
 
 	/*
@@ -508,7 +508,7 @@ xfs_qm_dqtobp(
 		 * We don't allocate unless we're asked to
 		 */
 		if (!(flags & XFS_QMOPT_DQALLOC))
-			return ENOENT;
+			return -ENOENT;
 
 		ASSERT(tp);
 		error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
@@ -530,7 +530,7 @@ xfs_qm_dqtobp(
 					   mp->m_quotainfo->qi_dqchunklen,
 					   0, &bp, &xfs_dquot_buf_ops);
 
-		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+		if (error == -EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
 			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
 					mp->m_quotainfo->qi_dqperchunk;
 			ASSERT(bp == NULL);
@@ -539,7 +539,7 @@ xfs_qm_dqtobp(
 
 		if (error) {
 			ASSERT(bp == NULL);
-			return XFS_ERROR(error);
+			return error;
 		}
 	}
 
@@ -547,7 +547,7 @@ xfs_qm_dqtobp(
 	*O_bpp = bp;
 	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
-	return (0);
+	return 0;
 }
 
 
@@ -715,7 +715,7 @@ xfs_qm_dqget(
 	if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
 	    (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
 	    (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
-		return (ESRCH);
+		return -ESRCH;
 	}
 
 #ifdef DEBUG
@@ -723,7 +723,7 @@ xfs_qm_dqget(
 		if ((xfs_dqerror_target == mp->m_ddev_targp) &&
 		    (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
 			xfs_debug(mp, "Returning error in dqget");
-			return (EIO);
+			return -EIO;
 		}
 	}
 
@@ -796,14 +796,14 @@ restart:
 		} else {
 			/* inode stays locked on return */
 			xfs_qm_dqdestroy(dqp);
-			return XFS_ERROR(ESRCH);
+			return -ESRCH;
 		}
 	}
 
 	mutex_lock(&qi->qi_tree_lock);
-	error = -radix_tree_insert(tree, id, dqp);
+	error = radix_tree_insert(tree, id, dqp);
 	if (unlikely(error)) {
-		WARN_ON(error != EEXIST);
+		WARN_ON(error != -EEXIST);
 
 		/*
 		 * Duplicate found. Just throw away the new dquot and start
@@ -829,7 +829,7 @@ restart:
 	ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	trace_xfs_dqget_miss(dqp);
 	*O_dqpp = dqp;
-	return (0);
+	return 0;
 }
 
 /*
@@ -966,7 +966,7 @@ xfs_qm_dqflush(
 					    SHUTDOWN_CORRUPT_INCORE);
 		else
 			spin_unlock(&mp->m_ail->xa_lock);
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		goto out_unlock;
 	}
 
@@ -974,7 +974,8 @@ xfs_qm_dqflush(
 	 * Get the buffer containing the on-disk dquot
 	 */
 	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-				   mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+				   &xfs_dquot_buf_ops);
 	if (error)
 		goto out_unlock;
 
@@ -992,7 +993,7 @@ xfs_qm_dqflush(
 		xfs_buf_relse(bp);
 		xfs_dqfunlock(dqp);
 		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-		return XFS_ERROR(EIO);
+		return -EIO;
 	}
 
 	/* This is the only portion of data that needs to persist */
@@ -1045,7 +1046,7 @@ xfs_qm_dqflush(
 
 out_unlock:
 	xfs_dqfunlock(dqp);
-	return XFS_ERROR(EIO);
+	return -EIO;
 }
 
 /*
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 68a68f704837..c24c67e22a2a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -139,6 +139,21 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 	}
 }
 
+/*
+ * Check whether a dquot is under low free space conditions. We assume the
+ * quota is enabled and enforced.
+ */
+static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
+{
+	int64_t freesp;
+
+	freesp = be64_to_cpu(dqp->q_core.d_blk_hardlimit) - dqp->q_res_bcount;
+	if (freesp < dqp->q_low_space[XFS_QLOWSP_1_PCNT])
+		return true;
+
+	return false;
+}
+
 #define XFS_DQ_IS_LOCKED(dqp)	(mutex_is_locked(&((dqp)->q_qlock)))
 #define XFS_DQ_IS_DIRTY(dqp)	((dqp)->dq_flags & XFS_DQ_DIRTY)
 #define XFS_QM_ISUDQ(dqp)	((dqp)->dq_flags & XFS_DQ_USER)
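
The new xfs_dquot_lowsp() helper reports whether a dquot has dropped below its lowest precomputed free-space threshold (XFS_QLOWSP_1_PCNT, i.e. under 1% of the block hard limit remaining). A hedged sketch of the kind of caller it is written for, with the dquot lookup assumed:

	struct xfs_dquot *dq = xfs_inode_dquot(ip, XFS_DQ_USER);

	if (dq && xfs_dquot_lowsp(dq)) {
		/* nearly out of user-quota blocks: worth reclaiming
		 * speculative preallocations owned by this uid */
	}
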
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index edac5b057d28..b92fd7bc49e3 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -27,29 +27,6 @@
 
 #ifdef DEBUG
 
-int	xfs_etrap[XFS_ERROR_NTRAP] = {
-	0,
-};
-
-int
-xfs_error_trap(int e)
-{
-	int i;
-
-	if (!e)
-		return 0;
-	for (i = 0; i < XFS_ERROR_NTRAP; i++) {
-		if (xfs_etrap[i] == 0)
-			break;
-		if (e != xfs_etrap[i])
-			continue;
-		xfs_notice(NULL, "%s: error %d", __func__, e);
-		BUG();
-		break;
-	}
-	return e;
-}
-
 int	xfs_etest[XFS_NUM_INJECT_ERROR];
 int64_t	xfs_etest_fsid[XFS_NUM_INJECT_ERROR];
 char *	xfs_etest_fsname[XFS_NUM_INJECT_ERROR];
@@ -190,7 +167,7 @@ xfs_verifier_error(
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 
 	xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
-		  bp->b_error == EFSBADCRC ? "CRC error" : "corruption",
+		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
 		  __return_address, bp->b_bn);
 
 	xfs_alert(mp, "Unmount and run xfs_repair");
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index c1c57d4a4b5d..279a76e52791 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -18,15 +18,6 @@
 #ifndef	__XFS_ERROR_H__
 #define	__XFS_ERROR_H__
 
-#ifdef DEBUG
-#define	XFS_ERROR_NTRAP	10
-extern int	xfs_etrap[XFS_ERROR_NTRAP];
-extern int	xfs_error_trap(int);
-#define	XFS_ERROR(e)	xfs_error_trap(e)
-#else
-#define	XFS_ERROR(e)	(e)
-#endif
-
 struct xfs_mount;
 
 extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
@@ -56,7 +47,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 		if (unlikely(!fs_is_ok)) { \
 			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \
 					 XFS_ERRLEVEL_LOW, NULL); \
-			error = XFS_ERROR(EFSCORRUPTED); \
+			error = -EFSCORRUPTED; \
 			goto l; \
 		} \
 	}
@@ -68,7 +59,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 		if (unlikely(!fs_is_ok)) { \
 			XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \
 					 XFS_ERRLEVEL_LOW, NULL); \
-			return XFS_ERROR(EFSCORRUPTED); \
+			return -EFSCORRUPTED; \
 		} \
 	}
 
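
With XFS_ERROR() gone, the corruption-check macros assign or return the negative errno directly. A hypothetical caller, purely to show the shape (the GOTO form of the macro requires a local variable named error to be in scope):

	int
	example_verify(int numrecs)		/* illustrative only */
	{
		int error = 0;

		XFS_WANT_CORRUPTED_GOTO(numrecs > 0, bad);
		return 0;
	bad:
		return error;			/* now -EFSCORRUPTED */
	}
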
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 753e467aa1a5..5a6bd5d8779a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -147,9 +147,9 @@ xfs_nfs_get_inode(
 		 * We don't use ESTALE directly down the chain to not
 		 * confuse applications using bulkstat that expect EINVAL.
 		 */
-		if (error == EINVAL || error == ENOENT)
-			error = ESTALE;
-		return ERR_PTR(-error);
+		if (error == -EINVAL || error == -ENOENT)
+			error = -ESTALE;
+		return ERR_PTR(error);
 	}
 
 	if (ip->i_d.di_gen != generation) {
@@ -217,7 +217,7 @@ xfs_fs_get_parent(
 
 	error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
 	if (unlikely(error))
-		return ERR_PTR(-error);
+		return ERR_PTR(error);
 
 	return d_obtain_alias(VFS_I(cip));
 }
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
 
 	if (!lsn)
 		return 0;
-	return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index fb7a4c1ce1c5..c4327419dc5c 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -298,7 +298,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
 		}
 		return 0;
 	}
-	return EFSCORRUPTED;
+	return -EFSCORRUPTED;
 }
 
 /*
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1f66779d7a46..eb596b419942 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_dinode.h"
+#include "xfs_icache.h"
 
 #include <linux/aio.h>
 #include <linux/dcache.h>
@@ -155,7 +156,7 @@ xfs_dir_fsync(
 
 	if (!lsn)
 		return 0;
-	return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 STATIC int
@@ -179,7 +180,7 @@ xfs_file_fsync(
 		return error;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	xfs_iflags_clear(ip, XFS_ITRUNCATED);
 
@@ -225,7 +226,7 @@ xfs_file_fsync(
 	    !log_flushed)
 		xfs_blkdev_issue_flush(mp->m_ddev_targp);
 
-	return -error;
+	return error;
 }
 
 STATIC ssize_t
@@ -246,11 +247,11 @@ xfs_file_read_iter(
 	XFS_STATS_INC(xs_read_calls);
 
 	if (unlikely(file->f_flags & O_DIRECT))
-		ioflags |= IO_ISDIRECT;
+		ioflags |= XFS_IO_ISDIRECT;
 	if (file->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
-	if (unlikely(ioflags & IO_ISDIRECT)) {
+	if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -258,7 +259,7 @@ xfs_file_read_iter(
 		if ((pos | size) & target->bt_logical_sectormask) {
 			if (pos == i_size_read(inode))
 				return 0;
-			return -XFS_ERROR(EINVAL);
+			return -EINVAL;
 		}
 	}
 
@@ -283,19 +284,29 @@ xfs_file_read_iter(
 	 * proceeed concurrently without serialisation.
 	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if ((ioflags & IO_ISDIRECT) && inode->i_mapping->nrpages) {
+	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
 		if (inode->i_mapping->nrpages) {
 			ret = filemap_write_and_wait_range(
 							VFS_I(ip)->i_mapping,
-							pos, -1);
+							pos, pos + size - 1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
-			truncate_pagecache_range(VFS_I(ip), pos, -1);
+
+			/*
+			 * Invalidate whole pages. This can return an error if
+			 * we fail to invalidate a page, but this should never
+			 * happen on XFS. Warn if it does fail.
+			 */
+			ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + size - 1) >> PAGE_CACHE_SHIFT);
+			WARN_ON_ONCE(ret);
+			ret = 0;
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
@@ -325,7 +336,7 @@ xfs_file_splice_read(
 	XFS_STATS_INC(xs_read_calls);
 
 	if (infilp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
@@ -524,7 +535,7 @@ restart:
 			xfs_rw_ilock(ip, *iolock);
 			goto restart;
 		}
-		error = -xfs_zero_eof(ip, *pos, i_size_read(inode));
+		error = xfs_zero_eof(ip, *pos, i_size_read(inode));
 		if (error)
 			return error;
 	}
@@ -594,7 +605,7 @@ xfs_file_dio_aio_write(
 
 	/* DIO must be aligned to device logical sector size */
 	if ((pos | count) & target->bt_logical_sectormask)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/* "unaligned" here means not aligned to a filesystem block */
 	if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
@@ -631,10 +642,19 @@ xfs_file_dio_aio_write(
 
 	if (mapping->nrpages) {
 		ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
-						    pos, -1);
+						    pos, pos + count - 1);
 		if (ret)
 			goto out;
-		truncate_pagecache_range(VFS_I(ip), pos, -1);
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + count - 1) >> PAGE_CACHE_SHIFT);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 	}
 
 	/*
@@ -689,14 +709,28 @@ write_retry:
 	ret = generic_perform_write(file, from, pos);
 	if (likely(ret >= 0))
 		iocb->ki_pos = pos + ret;
+
 	/*
-	 * If we just got an ENOSPC, try to write back all dirty inodes to
-	 * convert delalloc space to free up some of the excess reserved
-	 * metadata space.
+	 * If we hit a space limit, try to free up some lingering preallocated
+	 * space before returning an error. In the case of ENOSPC, first try to
+	 * write back all dirty inodes to free up some of the excess reserved
+	 * metadata space. This reduces the chances that the eofblocks scan
+	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
+	 * also behaves as a filter to prevent too many eofblocks scans from
+	 * running at the same time.
 	 */
-	if (ret == -ENOSPC && !enospc) {
+	if (ret == -EDQUOT && !enospc) {
+		enospc = xfs_inode_free_quota_eofblocks(ip);
+		if (enospc)
+			goto write_retry;
+	} else if (ret == -ENOSPC && !enospc) {
+		struct xfs_eofblocks eofb = {0};
+
 		enospc = 1;
 		xfs_flush_inodes(ip->i_mount);
+		eofb.eof_scan_owner = ip->i_ino; /* for locking */
+		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
+		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
 		goto write_retry;
 	}
 
@@ -772,7 +806,7 @@ xfs_file_fallocate(
 		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
 
 		if (offset & blksize_mask || len & blksize_mask) {
-			error = EINVAL;
+			error = -EINVAL;
 			goto out_unlock;
 		}
 
@@ -781,7 +815,7 @@ xfs_file_fallocate(
 		 * in which case it is effectively a truncate operation
 		 */
 		if (offset + len >= i_size_read(inode)) {
-			error = EINVAL;
+			error = -EINVAL;
 			goto out_unlock;
 		}
 
@@ -794,7 +828,7 @@ xfs_file_fallocate(
 	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 	    offset + len > i_size_read(inode)) {
 		new_size = offset + len;
-		error = -inode_newsize_ok(inode, new_size);
+		error = inode_newsize_ok(inode, new_size);
 		if (error)
 			goto out_unlock;
 	}
@@ -844,7 +878,7 @@ xfs_file_fallocate(
 
 out_unlock:
 	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return -error;
+	return error;
 }
 
 
@@ -889,7 +923,7 @@ xfs_file_release(
 	struct inode	*inode,
 	struct file	*filp)
 {
-	return -xfs_release(XFS_I(inode));
+	return xfs_release(XFS_I(inode));
 }
 
 STATIC int
@@ -918,7 +952,7 @@ xfs_file_readdir(
 
 	error = xfs_readdir(ip, ctx, bufsize);
 	if (error)
-		return -error;
+		return error;
 	return 0;
 }
 
@@ -949,7 +983,7 @@ xfs_vm_page_mkwrite(
 
 /*
  * This type is designed to indicate the type of offset we would like
- * to search from page cache for either xfs_seek_data() or xfs_seek_hole().
+ * to search from page cache for xfs_seek_hole_data().
  */
 enum {
 	HOLE_OFF = 0,
@@ -1006,7 +1040,7 @@ xfs_lookup_buffer_offset(
 /*
  * This routine is called to find out and return a data or hole offset
  * from the page cache for unwritten extents according to the desired
- * type for xfs_seek_data() or xfs_seek_hole().
+ * type for xfs_seek_hole_data().
  *
  * The argument offset is used to tell where we start to search from the
  * page cache.  Map is used to figure out the end points of the range to
@@ -1166,9 +1200,10 @@ out:
 }
 
 STATIC loff_t
-xfs_seek_data(
+xfs_seek_hole_data(
 	struct file		*file,
-	loff_t			start)
+	loff_t			start,
+	int			whence)
 {
 	struct inode		*inode = file->f_mapping->host;
 	struct xfs_inode	*ip = XFS_I(inode);
@@ -1180,11 +1215,14 @@ xfs_seek_data(
 	uint			lock;
 	int			error;
 
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
 	lock = xfs_ilock_data_map_shared(ip);
 
 	isize = i_size_read(inode);
 	if (start >= isize) {
-		error = ENXIO;
+		error = -ENXIO;
 		goto out_unlock;
 	}
 
@@ -1194,6 +1232,7 @@ xfs_seek_data(
 	 */
 	fsbno = XFS_B_TO_FSBT(mp, start);
 	end = XFS_B_TO_FSB(mp, isize);
+
 	for (;;) {
 		struct xfs_bmbt_irec	map[2];
 		int			nmap = 2;
@@ -1206,7 +1245,7 @@ xfs_seek_data(
 
 		/* No extents at given offset, must be beyond EOF */
 		if (nmap == 0) {
-			error = ENXIO;
+			error = -ENXIO;
 			goto out_unlock;
 		}
 
@@ -1214,30 +1253,49 @@ xfs_seek_data(
 			offset = max_t(loff_t, start,
 				       XFS_FSB_TO_B(mp, map[i].br_startoff));
 
-			/* Landed in a data extent */
-			if (map[i].br_startblock == DELAYSTARTBLOCK ||
-			    (map[i].br_state == XFS_EXT_NORM &&
-			     !isnullstartblock(map[i].br_startblock)))
+			/* Landed in the hole we wanted? */
+			if (whence == SEEK_HOLE &&
+			    map[i].br_startblock == HOLESTARTBLOCK)
+				goto out;
+
+			/* Landed in the data extent we wanted? */
+			if (whence == SEEK_DATA &&
+			    (map[i].br_startblock == DELAYSTARTBLOCK ||
+			     (map[i].br_state == XFS_EXT_NORM &&
+			      !isnullstartblock(map[i].br_startblock))))
 				goto out;
 
 			/*
-			 * Landed in an unwritten extent, try to search data
-			 * from page cache.
+			 * Landed in an unwritten extent, try to search
+			 * for hole or data from page cache.
 			 */
 			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
 				if (xfs_find_get_desired_pgoff(inode, &map[i],
-							DATA_OFF, &offset))
+					whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF,
+					&offset))
 					goto out;
 			}
 		}
 
 		/*
-		 * map[0] is hole or its an unwritten extent but
-		 * without data in page cache.  Probably means that
-		 * we are reading after EOF if nothing in map[1].
+		 * We only received one extent out of the two requested. This
+		 * means we've hit EOF and didn't find what we are looking for.
 		 */
 		if (nmap == 1) {
-			error = ENXIO;
+			/*
+			 * If we were looking for a hole, set offset to
+			 * the end of the file (i.e., there is an implicit
+			 * hole at the end of any file).
+			 */
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
+			}
+			/*
+			 * If we were looking for data, it's nowhere to be found
+			 */
+			ASSERT(whence == SEEK_DATA);
+			error = -ENXIO;
 			goto out_unlock;
 		}
 
@@ -1245,132 +1303,37 @@ xfs_seek_data(
 
 		/*
 		 * Nothing was found, proceed to the next round of search
-		 * if reading offset not beyond or hit EOF.
+		 * if the next reading offset is not at or beyond EOF.
 		 */
 		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
 		start = XFS_FSB_TO_B(mp, fsbno);
 		if (start >= isize) {
-			error = ENXIO;
-			goto out_unlock;
-		}
-	}
-
-out:
-	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
-
-out_unlock:
-	xfs_iunlock(ip, lock);
-
-	if (error)
-		return -error;
-	return offset;
-}
-
-STATIC loff_t
-xfs_seek_hole(
-	struct file		*file,
-	loff_t			start)
-{
-	struct inode		*inode = file->f_mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	loff_t			uninitialized_var(offset);
-	xfs_fsize_t		isize;
-	xfs_fileoff_t		fsbno;
-	xfs_filblks_t		end;
-	uint			lock;
-	int			error;
-
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
-
-	lock = xfs_ilock_data_map_shared(ip);
-
-	isize = i_size_read(inode);
-	if (start >= isize) {
-		error = ENXIO;
-		goto out_unlock;
-	}
-
-	fsbno = XFS_B_TO_FSBT(mp, start);
-	end = XFS_B_TO_FSB(mp, isize);
-
-	for (;;) {
-		struct xfs_bmbt_irec	map[2];
-		int			nmap = 2;
-		unsigned int		i;
-
-		error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap,
-				       XFS_BMAPI_ENTIRE);
-		if (error)
-			goto out_unlock;
-
-		/* No extents at given offset, must be beyond EOF */
-		if (nmap == 0) {
-			error = ENXIO;
-			goto out_unlock;
-		}
-
-		for (i = 0; i < nmap; i++) {
-			offset = max_t(loff_t, start,
-				       XFS_FSB_TO_B(mp, map[i].br_startoff));
-
-			/* Landed in a hole */
-			if (map[i].br_startblock == HOLESTARTBLOCK)
-				goto out;
-
-			/*
-			 * Landed in an unwritten extent, try to search hole
-			 * from page cache.
-			 */
-			if (map[i].br_state == XFS_EXT_UNWRITTEN) {
-				if (xfs_find_get_desired_pgoff(inode, &map[i],
-							HOLE_OFF, &offset))
-					goto out;
+			if (whence == SEEK_HOLE) {
+				offset = isize;
+				break;
 			}
-		}
-
-		/*
-		 * map[0] contains data or its unwritten but contains
-		 * data in page cache, probably means that we are
-		 * reading after EOF.  We should fix offset to point
-		 * to the end of the file(i.e., there is an implicit
-		 * hole at the end of any file).
-		 */
-		if (nmap == 1) {
-			offset = isize;
-			break;
-		}
-
-		ASSERT(i > 1);
-
-		/*
-		 * Both mappings contains data, proceed to the next round of
-		 * search if the current reading offset not beyond or hit EOF.
-		 */
-		fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount;
-		start = XFS_FSB_TO_B(mp, fsbno);
-		if (start >= isize) {
-			offset = isize;
-			break;
+			ASSERT(whence == SEEK_DATA);
+			error = -ENXIO;
+			goto out_unlock;
 		}
 	}
 
 out:
 	/*
-	 * At this point, we must have found a hole.  However, the returned
+	 * If at this point we have found the hole we wanted, the returned
 	 * offset may be bigger than the file size as it may be aligned to
-	 * page boundary for unwritten extents, we need to deal with this
+	 * page boundary for unwritten extents.  We need to deal with this
 	 * situation in particular.
 	 */
-	offset = min_t(loff_t, offset, isize);
+	if (whence == SEEK_HOLE)
+		offset = min_t(loff_t, offset, isize);
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out_unlock:
 	xfs_iunlock(ip, lock);
 
 	if (error)
-		return -error;
+		return error;
 	return offset;
 }
 
@@ -1378,17 +1341,16 @@ STATIC loff_t
 xfs_file_llseek(
 	struct file	*file,
 	loff_t		offset,
-	int		origin)
+	int		whence)
 {
-	switch (origin) {
+	switch (whence) {
 	case SEEK_END:
 	case SEEK_CUR:
 	case SEEK_SET:
-		return generic_file_llseek(file, offset, origin);
-	case SEEK_DATA:
-		return xfs_seek_data(file, offset);
+		return generic_file_llseek(file, offset, whence);
 	case SEEK_HOLE:
-		return xfs_seek_hole(file, offset);
+	case SEEK_DATA:
+		return xfs_seek_hole_data(file, offset, whence);
 	default:
 		return -EINVAL;
 	}
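
Merging xfs_seek_data() and xfs_seek_hole() into xfs_seek_hole_data() leaves one extent-walking loop that branches on whence, and both cases now share the same up-front shutdown check. Userspace semantics are the standard lseek(2) ones; a minimal sketch for locating the first data extent of a sparse file (assumes a file path argument, basic error handling only):

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		int fd = open(argv[1], O_RDONLY);
		off_t data, hole;

		if (fd < 0)
			return 1;
		data = lseek(fd, 0, SEEK_DATA);		/* first data at/after 0 */
		if (data >= 0) {
			hole = lseek(fd, data, SEEK_HOLE); /* hole ending that extent */
			if (hole >= 0)
				printf("data [%lld, %lld)\n",
				       (long long)data, (long long)hole);
		}
		close(fd);
		return 0;
	}
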
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 8ec81bed7992..e92730c1d3ca 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -258,7 +258,7 @@ next_ag:
 	if (*agp == NULLAGNUMBER)
 		return 0;
 
-	err = ENOMEM;
+	err = -ENOMEM;
 	item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
 	if (!item)
 		goto out_put_ag;
@@ -268,7 +268,7 @@ next_ag:
 
 	err = xfs_mru_cache_insert(mp->m_filestream, ip->i_ino, &item->mru);
 	if (err) {
-		if (err == EEXIST)
+		if (err == -EEXIST)
 			err = 0;
 		goto out_free_item;
 	}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index d34703dbcb42..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -255,8 +255,8 @@ typedef struct xfs_fsop_resblks {
 	((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
 
 /* Used for sanity checks on superblock */
-#define XFS_MAX_DBLOCKS(s) ((xfs_drfsbno_t)(s)->sb_agcount * (s)->sb_agblocks)
-#define XFS_MIN_DBLOCKS(s) ((xfs_drfsbno_t)((s)->sb_agcount - 1) *	\
+#define XFS_MAX_DBLOCKS(s) ((xfs_rfsblock_t)(s)->sb_agcount * (s)->sb_agblocks)
+#define XFS_MIN_DBLOCKS(s) ((xfs_rfsblock_t)((s)->sb_agcount - 1) *	\
 			 (s)->sb_agblocks + XFS_MIN_AG_BLOCKS)
 
 /*
@@ -375,6 +375,9 @@ struct xfs_fs_eofblocks {
 #define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
 #define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
 #define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_UNION		(1 << 5) /* union filter algorithm;
+						  * kernel only, not included in
+						  * valid mask */
 #define XFS_EOF_FLAGS_VALID	\
 	(XFS_EOF_FLAGS_SYNC |	\
 	 XFS_EOF_FLAGS_UID |	\
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index d2295561570a..c05ac8b70fa9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -168,20 +168,15 @@ xfs_growfs_data_private(
 	nb = in->newblocks;
 	pct = in->imaxpct;
 	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
 		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
-	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
+	error = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0, NULL);
-	if (!bp)
-		return EIO;
-	if (bp->b_error) {
-		error = bp->b_error;
-		xfs_buf_relse(bp);
+				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
 		return error;
-	}
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
@@ -191,7 +186,7 @@ xfs_growfs_data_private(
 		nagcount--;
 		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
 		if (nb < mp->m_sb.sb_dblocks)
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
 	}
 	new = nb - mp->m_sb.sb_dblocks;
 	oagcount = mp->m_sb.sb_agcount;
@@ -229,7 +224,7 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0,
 				&xfs_agf_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -270,7 +265,7 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0,
 				&xfs_agfl_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -298,7 +293,7 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0,
 				&xfs_agi_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -336,7 +331,7 @@ xfs_growfs_data_private(
 				&xfs_allocbt_buf_ops);
 
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -365,7 +360,7 @@ xfs_growfs_data_private(
 				BTOBB(mp->m_sb.sb_blocksize), 0,
 				&xfs_allocbt_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -395,7 +390,7 @@ xfs_growfs_data_private(
 				BTOBB(mp->m_sb.sb_blocksize), 0,
 				&xfs_inobt_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -420,7 +415,7 @@ xfs_growfs_data_private(
 				BTOBB(mp->m_sb.sb_blocksize), 0,
 				&xfs_inobt_buf_ops);
 		if (!bp) {
-			error = ENOMEM;
+			error = -ENOMEM;
 			goto error0;
 		}
 
@@ -531,7 +526,7 @@ xfs_growfs_data_private(
 			bp->b_ops = &xfs_sb_buf_ops;
 			xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 		} else
-			error = ENOMEM;
+			error = -ENOMEM;
 	}
 
 	/*
@@ -576,17 +571,17 @@ xfs_growfs_log_private(
 
 	nb = in->newblocks;
 	if (nb < XFS_MIN_LOG_BLOCKS || nb < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if (nb == mp->m_sb.sb_logblocks &&
 	    in->isint == (mp->m_sb.sb_logstart != 0))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	/*
 	 * Moving the log is hard, need new interfaces to sync
 	 * the log first, hold off all activity while moving it.
 	 * Can have shorter or longer log in the same space,
 	 * or transform internal to external log or vice versa.
 	 */
-	return XFS_ERROR(ENOSYS);
+	return -ENOSYS;
 }
 
 /*
@@ -604,9 +599,9 @@ xfs_growfs_data(
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	if (!mutex_trylock(&mp->m_growlock))
-		return XFS_ERROR(EWOULDBLOCK);
+		return -EWOULDBLOCK;
 	error = xfs_growfs_data_private(mp, in);
 	mutex_unlock(&mp->m_growlock);
 	return error;
@@ -620,9 +615,9 @@ xfs_growfs_log(
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	if (!mutex_trylock(&mp->m_growlock))
-		return XFS_ERROR(EWOULDBLOCK);
+		return -EWOULDBLOCK;
 	error = xfs_growfs_log_private(mp, in);
 	mutex_unlock(&mp->m_growlock);
 	return error;
@@ -674,7 +669,7 @@ xfs_reserve_blocks(
 	/* If inval is null, report current values and return */
 	if (inval == (__uint64_t *)NULL) {
 		if (!outval)
-			return EINVAL;
+			return -EINVAL;
 		outval->resblks = mp->m_resblks;
 		outval->resblks_avail = mp->m_resblks_avail;
 		return 0;
@@ -757,7 +752,7 @@ out:
 		int error;
 		error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS,
 						 fdblks_delta, 0);
-		if (error == ENOSPC)
+		if (error == -ENOSPC)
 			goto retry;
 	}
 	return 0;
@@ -818,7 +813,7 @@ xfs_fs_goingdown(
 			SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
 		break;
 	default:
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	return 0;
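
The growfs entry points above are reached from userspace through the XFS growfs ioctls, so the EPERM/EWOULDBLOCK/EINVAL checks are the first thing such a call hits. A hedged userspace sketch (xfs_growfs(8) is the normal frontend; the imaxpct value below is an arbitrary example, not a recommendation):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>

	/* Grow the data section of a mounted XFS to newblocks filesystem blocks. */
	int grow_data(int fd, uint64_t newblocks)
	{
		xfs_growfs_data_t in = {
			.newblocks = newblocks,	/* must not shrink the fs */
			.imaxpct = 25,		/* example inode max percentage */
		};

		return ioctl(fd, XFS_IOC_FSGROWFSDATA, &in);
	}
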
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 5399ef222dd7..4d41b241298f 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -43,3 +43,7 @@ xfs_param_t xfs_params = {
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.eofb_timer	= {	1,		300,		3600*24},
 };
+
+struct xfs_globals xfs_globals = {
+	.log_recovery_delay	= 0,	/* no delay by default */
+};
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index c48df5f25b9f..b45f7b27b5df 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,8 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_bmap_util.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -158,7 +160,7 @@ xfs_iget_cache_hit(
 	if (ip->i_ino != ino) {
 		trace_xfs_iget_skip(ip);
 		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_error;
 	}
 
@@ -176,7 +178,7 @@ xfs_iget_cache_hit(
 	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
 		trace_xfs_iget_skip(ip);
 		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_error;
 	}
 
@@ -184,7 +186,7 @@ xfs_iget_cache_hit(
 	 * If lookup is racing with unlink return an error immediately.
 	 */
 	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
+		error = -ENOENT;
 		goto out_error;
 	}
 
@@ -206,7 +208,7 @@ xfs_iget_cache_hit(
 		spin_unlock(&ip->i_flags_lock);
 		rcu_read_unlock();
 
-		error = -inode_init_always(mp->m_super, inode);
+		error = inode_init_always(mp->m_super, inode);
 		if (error) {
 			/*
 			 * Re-initializing the inode failed, and we are in deep
@@ -243,7 +245,7 @@ xfs_iget_cache_hit(
 		/* If the VFS inode is being torn down, pause and try again. */
 		if (!igrab(inode)) {
 			trace_xfs_iget_skip(ip);
-			error = EAGAIN;
+			error = -EAGAIN;
 			goto out_error;
 		}
 
@@ -285,7 +287,7 @@ xfs_iget_cache_miss(
 
 	ip = xfs_inode_alloc(mp, ino);
 	if (!ip)
-		return ENOMEM;
+		return -ENOMEM;
 
 	error = xfs_iread(mp, tp, ip, flags);
 	if (error)
@@ -294,7 +296,7 @@ xfs_iget_cache_miss(
 	trace_xfs_iget_miss(ip);
 
 	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
+		error = -ENOENT;
 		goto out_destroy;
 	}
 
@@ -305,7 +307,7 @@ xfs_iget_cache_miss(
 	 * recurse into the file system.
 	 */
 	if (radix_tree_preload(GFP_NOFS)) {
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_destroy;
 	}
 
@@ -341,7 +343,7 @@ xfs_iget_cache_miss(
 	if (unlikely(error)) {
 		WARN_ON(error != -EEXIST);
 		XFS_STATS_INC(xs_ig_dup);
-		error = EAGAIN;
+		error = -EAGAIN;
 		goto out_preload_end;
 	}
 	spin_unlock(&pag->pag_ici_lock);
@@ -408,7 +410,7 @@ xfs_iget(
 
 	/* reject inode numbers outside existing AGs */
 	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-		return EINVAL;
+		return -EINVAL;
 
 	/* get the perag structure and ensure that it's inode capable */
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -445,7 +447,7 @@ again:
 	return 0;
 
 out_error_or_again:
-	if (error == EAGAIN) {
+	if (error == -EAGAIN) {
 		delay(1);
 		goto again;
 	}
@@ -489,18 +491,18 @@ xfs_inode_ag_walk_grab(
 
 	/* nothing to sync during shutdown */
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EFSCORRUPTED;
+		return -EFSCORRUPTED;
 
 	/* If we can't grab the inode, it must on it's way to reclaim. */
 	if (!igrab(inode))
-		return ENOENT;
+		return -ENOENT;
 
 	/* inode is valid */
 	return 0;
 
 out_unlock_noent:
 	spin_unlock(&ip->i_flags_lock);
-	return ENOENT;
+	return -ENOENT;
 }
 
 STATIC int
@@ -583,16 +585,16 @@ restart:
 				continue;
 			error = execute(batch[i], flags, args);
 			IRELE(batch[i]);
-			if (error == EAGAIN) {
+			if (error == -EAGAIN) {
 				skipped++;
 				continue;
 			}
-			if (error && last_error != EFSCORRUPTED)
+			if (error && last_error != -EFSCORRUPTED)
 				last_error = error;
 		}
 
 		/* bail out if the filesystem is corrupted. */
-		if (error == EFSCORRUPTED)
+		if (error == -EFSCORRUPTED)
 			break;
 
 		cond_resched();
@@ -652,11 +654,11 @@ xfs_inode_ag_iterator(
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
-			if (error == EFSCORRUPTED)
+			if (error == -EFSCORRUPTED)
 				break;
 		}
 	}
-	return XFS_ERROR(last_error);
+	return last_error;
 }
 
 int
@@ -680,11 +682,11 @@ xfs_inode_ag_iterator_tag(
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
-			if (error == EFSCORRUPTED)
+			if (error == -EFSCORRUPTED)
 				break;
 		}
 	}
-	return XFS_ERROR(last_error);
+	return last_error;
 }
 
 /*
@@ -944,7 +946,7 @@ restart:
 	 * see the stale flag set on the inode.
 	 */
 	error = xfs_iflush(ip, &bp);
-	if (error == EAGAIN) {
+	if (error == -EAGAIN) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		/* backoff longer than in xfs_ifree_cluster */
 		delay(2);
@@ -997,7 +999,7 @@ out:
 	xfs_iflags_clear(ip, XFS_IRECLAIM);
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	/*
-	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * We could return -EAGAIN here to make reclaim rescan the inode tree in
 	 * a short while. However, this just burns CPU time scanning the tree
 	 * waiting for IO to complete and the reclaim work never goes back to
 	 * the idle state. Instead, return 0 to let the next scheduled
@@ -1100,7 +1102,7 @@ restart:
 			if (!batch[i])
 				continue;
 			error = xfs_reclaim_inode(batch[i], pag, flags);
-			if (error && last_error != EFSCORRUPTED)
+			if (error && last_error != -EFSCORRUPTED)
 				last_error = error;
 		}
 
@@ -1129,7 +1131,7 @@ restart:
 		trylock = 0;
 		goto restart;
 	}
-	return XFS_ERROR(last_error);
+	return last_error;
 }
 
 int
@@ -1203,6 +1205,30 @@ xfs_inode_match_id(
1203 return 1; 1205 return 1;
1204} 1206}
1205 1207
1208/*
1209 * A union-based inode filtering algorithm. Process the inode if any of the
1210 * criteria match. This is for global/internal scans only.
1211 */
1212STATIC int
1213xfs_inode_match_id_union(
1214 struct xfs_inode *ip,
1215 struct xfs_eofblocks *eofb)
1216{
1217 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1218 uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
1219 return 1;
1220
1221 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1222 gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
1223 return 1;
1224
1225 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1226 xfs_get_projid(ip) == eofb->eof_prid)
1227 return 1;
1228
1229 return 0;
1230}
1231
1206STATIC int 1232STATIC int
1207xfs_inode_free_eofblocks( 1233xfs_inode_free_eofblocks(
1208 struct xfs_inode *ip, 1234 struct xfs_inode *ip,
@@ -1211,6 +1237,10 @@ xfs_inode_free_eofblocks(
1211{ 1237{
1212 int ret; 1238 int ret;
1213 struct xfs_eofblocks *eofb = args; 1239 struct xfs_eofblocks *eofb = args;
1240 bool need_iolock = true;
1241 int match;
1242
1243 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
1214 1244
1215 if (!xfs_can_free_eofblocks(ip, false)) { 1245 if (!xfs_can_free_eofblocks(ip, false)) {
1216 /* inode could be preallocated or append-only */ 1246 /* inode could be preallocated or append-only */
@@ -1228,19 +1258,31 @@ xfs_inode_free_eofblocks(
1228 return 0; 1258 return 0;
1229 1259
1230 if (eofb) { 1260 if (eofb) {
1231 if (!xfs_inode_match_id(ip, eofb)) 1261 if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1262 match = xfs_inode_match_id_union(ip, eofb);
1263 else
1264 match = xfs_inode_match_id(ip, eofb);
1265 if (!match)
1232 return 0; 1266 return 0;
1233 1267
1234 /* skip the inode if the file size is too small */ 1268 /* skip the inode if the file size is too small */
1235 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1269 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
1236 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1270 XFS_ISIZE(ip) < eofb->eof_min_file_size)
1237 return 0; 1271 return 0;
1272
1273 /*
1274 * A scan owner implies we already hold the iolock. Skip it in
1275 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
1276 * the possibility of EAGAIN being returned.
1277 */
1278 if (eofb->eof_scan_owner == ip->i_ino)
1279 need_iolock = false;
1238 } 1280 }
1239 1281
1240 ret = xfs_free_eofblocks(ip->i_mount, ip, true); 1282 ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock);
1241 1283
1242 /* don't revisit the inode if we're not waiting */ 1284 /* don't revisit the inode if we're not waiting */
1243 if (ret == EAGAIN && !(flags & SYNC_WAIT)) 1285 if (ret == -EAGAIN && !(flags & SYNC_WAIT))
1244 ret = 0; 1286 ret = 0;
1245 1287
1246 return ret; 1288 return ret;
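[Editor's note] xfs_inode_match_id_union() above accepts an inode when any one of the UID/GID/project-ID criteria matches, whereas the existing xfs_inode_match_id() requires every requested criterion to hold. A compact userspace model of the two filter modes (flag values and names are stand-ins, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

#define MATCH_UID 0x1
#define MATCH_GID 0x2

struct filter { unsigned flags, uid, gid; };
struct node { unsigned uid, gid; };

static bool match_all(const struct node *n, const struct filter *f)
{
    if ((f->flags & MATCH_UID) && n->uid != f->uid)
        return false;
    if ((f->flags & MATCH_GID) && n->gid != f->gid)
        return false;
    return true;        /* intersection: every requested criterion held */
}

static bool match_any(const struct node *n, const struct filter *f)
{
    if ((f->flags & MATCH_UID) && n->uid == f->uid)
        return true;
    if ((f->flags & MATCH_GID) && n->gid == f->gid)
        return true;
    return false;       /* union: no criterion matched */
}

int main(void)
{
    struct filter f = { MATCH_UID | MATCH_GID, 100, 200 };
    struct node n = { 100, 999 };   /* uid matches, gid does not */

    printf("all=%d any=%d\n", match_all(&n, &f), match_any(&n, &f));
    return 0;
}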
@@ -1260,6 +1302,55 @@ xfs_icache_free_eofblocks(
1260 eofb, XFS_ICI_EOFBLOCKS_TAG); 1302 eofb, XFS_ICI_EOFBLOCKS_TAG);
1261} 1303}
1262 1304
1305/*
1306 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
1307 * multiple quotas, we don't know exactly which quota caused an allocation
1308 * failure. We make a best effort by including each quota under low free space
1309 * conditions (less than 1% free space) in the scan.
1310 */
1311int
1312xfs_inode_free_quota_eofblocks(
1313 struct xfs_inode *ip)
1314{
1315 int scan = 0;
1316 struct xfs_eofblocks eofb = {0};
1317 struct xfs_dquot *dq;
1318
1319 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1320
1321 /*
1322 * Set the scan owner to avoid a potential livelock. Otherwise, the scan
1323 * can repeatedly trylock on the inode we're currently processing. We
1324 * run a sync scan to increase effectiveness and use the union filter to
1325 * cover all applicable quotas in a single scan.
1326 */
1327 eofb.eof_scan_owner = ip->i_ino;
1328 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;
1329
1330 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
1331 dq = xfs_inode_dquot(ip, XFS_DQ_USER);
1332 if (dq && xfs_dquot_lowsp(dq)) {
1333 eofb.eof_uid = VFS_I(ip)->i_uid;
1334 eofb.eof_flags |= XFS_EOF_FLAGS_UID;
1335 scan = 1;
1336 }
1337 }
1338
1339 if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
1340 dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
1341 if (dq && xfs_dquot_lowsp(dq)) {
1342 eofb.eof_gid = VFS_I(ip)->i_gid;
1343 eofb.eof_flags |= XFS_EOF_FLAGS_GID;
1344 scan = 1;
1345 }
1346 }
1347
1348 if (scan)
1349 xfs_icache_free_eofblocks(ip->i_mount, &eofb);
1350
1351 return scan;
1352}
1353
1263void 1354void
1264xfs_inode_set_eofblocks_tag( 1355xfs_inode_set_eofblocks_tag(
1265 xfs_inode_t *ip) 1356 xfs_inode_t *ip)
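[Editor's note] The sketch below models how xfs_inode_free_quota_eofblocks() above assembles a single union-mode scan request: each enforced quota that is low on space contributes its ID, and the scan only runs if at least one did. This is an illustrative userspace analogue with stub helpers, not the kernel code:

#include <stdbool.h>
#include <stdio.h>

#define F_UNION 0x1
#define F_SYNC  0x2
#define F_UID   0x4
#define F_GID   0x8

struct scan_req { unsigned flags, uid, gid; };

/* Stub standing in for xfs_dquot_lowsp(): pretend only the user
 * quota (type 0) is below the low-space threshold. */
static bool quota_low_on_space(int type)
{
    return type == 0;
}

static int build_quota_scan(unsigned uid, unsigned gid, struct scan_req *req)
{
    int scan = 0;

    req->flags = F_UNION | F_SYNC;  /* any-match, synchronous scan */

    if (quota_low_on_space(0)) {    /* user quota under the threshold */
        req->uid = uid;
        req->flags |= F_UID;
        scan = 1;
    }
    if (quota_low_on_space(1)) {    /* group quota under the threshold */
        req->gid = gid;
        req->flags |= F_GID;
        scan = 1;
    }
    return scan;                    /* caller only runs the scan if set */
}

int main(void)
{
    struct scan_req req;

    printf("scan=%d flags=%#x\n", build_quota_scan(100, 200, &req), req.flags);
    return 0;
}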
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 9cf017b899be..46748b86b12f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -27,6 +27,7 @@ struct xfs_eofblocks {
27 kgid_t eof_gid; 27 kgid_t eof_gid;
28 prid_t eof_prid; 28 prid_t eof_prid;
29 __u64 eof_min_file_size; 29 __u64 eof_min_file_size;
30 xfs_ino_t eof_scan_owner;
30}; 31};
31 32
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 33#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
@@ -57,6 +58,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
57void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); 58void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
58void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); 59void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
59int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); 60int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
61int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
60void xfs_eofblocks_worker(struct work_struct *); 62void xfs_eofblocks_worker(struct work_struct *);
61 63
62int xfs_inode_ag_iterator(struct xfs_mount *mp, 64int xfs_inode_ag_iterator(struct xfs_mount *mp,
@@ -72,31 +74,32 @@ xfs_fs_eofblocks_from_user(
72 struct xfs_eofblocks *dst) 74 struct xfs_eofblocks *dst)
73{ 75{
74 if (src->eof_version != XFS_EOFBLOCKS_VERSION) 76 if (src->eof_version != XFS_EOFBLOCKS_VERSION)
75 return EINVAL; 77 return -EINVAL;
76 78
77 if (src->eof_flags & ~XFS_EOF_FLAGS_VALID) 79 if (src->eof_flags & ~XFS_EOF_FLAGS_VALID)
78 return EINVAL; 80 return -EINVAL;
79 81
80 if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) || 82 if (memchr_inv(&src->pad32, 0, sizeof(src->pad32)) ||
81 memchr_inv(src->pad64, 0, sizeof(src->pad64))) 83 memchr_inv(src->pad64, 0, sizeof(src->pad64)))
82 return EINVAL; 84 return -EINVAL;
83 85
84 dst->eof_flags = src->eof_flags; 86 dst->eof_flags = src->eof_flags;
85 dst->eof_prid = src->eof_prid; 87 dst->eof_prid = src->eof_prid;
86 dst->eof_min_file_size = src->eof_min_file_size; 88 dst->eof_min_file_size = src->eof_min_file_size;
89 dst->eof_scan_owner = NULLFSINO;
87 90
88 dst->eof_uid = INVALID_UID; 91 dst->eof_uid = INVALID_UID;
89 if (src->eof_flags & XFS_EOF_FLAGS_UID) { 92 if (src->eof_flags & XFS_EOF_FLAGS_UID) {
90 dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid); 93 dst->eof_uid = make_kuid(current_user_ns(), src->eof_uid);
91 if (!uid_valid(dst->eof_uid)) 94 if (!uid_valid(dst->eof_uid))
92 return EINVAL; 95 return -EINVAL;
93 } 96 }
94 97
95 dst->eof_gid = INVALID_GID; 98 dst->eof_gid = INVALID_GID;
96 if (src->eof_flags & XFS_EOF_FLAGS_GID) { 99 if (src->eof_flags & XFS_EOF_FLAGS_GID) {
97 dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid); 100 dst->eof_gid = make_kgid(current_user_ns(), src->eof_gid);
98 if (!gid_valid(dst->eof_gid)) 101 if (!gid_valid(dst->eof_gid))
99 return EINVAL; 102 return -EINVAL;
100 } 103 }
101 return 0; 104 return 0;
102} 105}
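[Editor's note] xfs_fs_eofblocks_from_user() above follows a common pattern for validating user-supplied structures: reject unknown versions, whitelist the flags, insist that padding is zeroed so the space can later be reused for new fields, and map IDs through the user namespace before trusting them. A simplified userspace sketch of the first three checks (the struct layout and names are invented for illustration):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct req {
    uint32_t version;
    uint32_t flags;
    uint32_t pad[2];    /* must be zero so it can be reused later */
};

#define REQ_VERSION     1
#define REQ_FLAGS_VALID 0x3

static int validate_req(const struct req *src)
{
    static const uint32_t zero[2];

    if (src->version != REQ_VERSION)
        return -EINVAL;
    if (src->flags & ~REQ_FLAGS_VALID)          /* whitelist known flags */
        return -EINVAL;
    if (memcmp(src->pad, zero, sizeof(zero)))   /* kernel: memchr_inv() */
        return -EINVAL;
    return 0;
}

int main(void)
{
    struct req good = { REQ_VERSION, 0x1, { 0, 0 } };
    struct req bad = { REQ_VERSION, 0x8, { 0, 0 } };

    printf("%d %d\n", validate_req(&good), validate_req(&bad));
    return 0;
}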
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a6115fe1ac94..8ed049d1e332 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -583,7 +583,7 @@ xfs_lookup(
583 trace_xfs_lookup(dp, name); 583 trace_xfs_lookup(dp, name);
584 584
585 if (XFS_FORCED_SHUTDOWN(dp->i_mount)) 585 if (XFS_FORCED_SHUTDOWN(dp->i_mount))
586 return XFS_ERROR(EIO); 586 return -EIO;
587 587
588 lock_mode = xfs_ilock_data_map_shared(dp); 588 lock_mode = xfs_ilock_data_map_shared(dp);
589 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); 589 error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
@@ -654,7 +654,7 @@ xfs_ialloc(
654 xfs_inode_t *ip; 654 xfs_inode_t *ip;
655 uint flags; 655 uint flags;
656 int error; 656 int error;
657 timespec_t tv; 657 struct timespec tv;
658 658
659 /* 659 /*
660 * Call the space management code to pick 660 * Call the space management code to pick
@@ -720,7 +720,7 @@ xfs_ialloc(
720 ip->i_d.di_nextents = 0; 720 ip->i_d.di_nextents = 0;
721 ASSERT(ip->i_d.di_nblocks == 0); 721 ASSERT(ip->i_d.di_nblocks == 0);
722 722
723 nanotime(&tv); 723 tv = current_fs_time(mp->m_super);
724 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; 724 ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
725 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; 725 ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
726 ip->i_d.di_atime = ip->i_d.di_mtime; 726 ip->i_d.di_atime = ip->i_d.di_mtime;
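[Editor's note] Replacing nanotime() with current_fs_time() above makes a freshly allocated inode's timestamps respect the superblock's time granularity, so the initial stamps agree with what the VFS writes later. Roughly speaking, current_fs_time() truncates the current time to s_time_gran; a deterministic userspace sketch of that truncation (a simplified model, not the kernel helper):

#include <stdio.h>
#include <time.h>

static struct timespec clamp_granularity(struct timespec ts, long gran_ns)
{
    if (gran_ns > 1)
        ts.tv_nsec -= ts.tv_nsec % gran_ns;     /* round down */
    return ts;
}

int main(void)
{
    struct timespec ts = { 1700000000, 123456789 };
    struct timespec fs = clamp_granularity(ts, 1000000000L);    /* 1s gran */

    printf("%ld.%09ld -> %ld.%09ld\n",
           (long)ts.tv_sec, ts.tv_nsec, (long)fs.tv_sec, fs.tv_nsec);
    return 0;
}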
@@ -769,6 +769,8 @@ xfs_ialloc(
769 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 769 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
770 ip->i_d.di_extsize = pip->i_d.di_extsize; 770 ip->i_d.di_extsize = pip->i_d.di_extsize;
771 } 771 }
772 if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
773 di_flags |= XFS_DIFLAG_PROJINHERIT;
772 } else if (S_ISREG(mode)) { 774 } else if (S_ISREG(mode)) {
773 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) 775 if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
774 di_flags |= XFS_DIFLAG_REALTIME; 776 di_flags |= XFS_DIFLAG_REALTIME;
@@ -789,8 +791,6 @@ xfs_ialloc(
789 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && 791 if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
790 xfs_inherit_nosymlinks) 792 xfs_inherit_nosymlinks)
791 di_flags |= XFS_DIFLAG_NOSYMLINKS; 793 di_flags |= XFS_DIFLAG_NOSYMLINKS;
792 if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
793 di_flags |= XFS_DIFLAG_PROJINHERIT;
794 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && 794 if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
795 xfs_inherit_nodefrag) 795 xfs_inherit_nodefrag)
796 di_flags |= XFS_DIFLAG_NODEFRAG; 796 di_flags |= XFS_DIFLAG_NODEFRAG;
@@ -893,7 +893,7 @@ xfs_dir_ialloc(
893 } 893 }
894 if (!ialloc_context && !ip) { 894 if (!ialloc_context && !ip) {
895 *ipp = NULL; 895 *ipp = NULL;
896 return XFS_ERROR(ENOSPC); 896 return -ENOSPC;
897 } 897 }
898 898
899 /* 899 /*
@@ -1088,7 +1088,7 @@ xfs_create(
1088 trace_xfs_create(dp, name); 1088 trace_xfs_create(dp, name);
1089 1089
1090 if (XFS_FORCED_SHUTDOWN(mp)) 1090 if (XFS_FORCED_SHUTDOWN(mp))
1091 return XFS_ERROR(EIO); 1091 return -EIO;
1092 1092
1093 prid = xfs_get_initial_prid(dp); 1093 prid = xfs_get_initial_prid(dp);
1094 1094
@@ -1125,12 +1125,12 @@ xfs_create(
1125 */ 1125 */
1126 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; 1126 tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
1127 error = xfs_trans_reserve(tp, &tres, resblks, 0); 1127 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1128 if (error == ENOSPC) { 1128 if (error == -ENOSPC) {
1129 /* flush outstanding delalloc blocks and retry */ 1129 /* flush outstanding delalloc blocks and retry */
1130 xfs_flush_inodes(mp); 1130 xfs_flush_inodes(mp);
1131 error = xfs_trans_reserve(tp, &tres, resblks, 0); 1131 error = xfs_trans_reserve(tp, &tres, resblks, 0);
1132 } 1132 }
1133 if (error == ENOSPC) { 1133 if (error == -ENOSPC) {
1134 /* No space at all so try a "no-allocation" reservation */ 1134 /* No space at all so try a "no-allocation" reservation */
1135 resblks = 0; 1135 resblks = 0;
1136 error = xfs_trans_reserve(tp, &tres, 0, 0); 1136 error = xfs_trans_reserve(tp, &tres, 0, 0);
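[Editor's note] The reservation code above retries on -ENOSPC in two stages: first flush outstanding delayed allocations and repeat the full reservation, then fall back to a zero-block "no-allocation" reservation. A toy userspace model of that ladder, with stub helpers standing in for xfs_trans_reserve() and xfs_flush_inodes():

#include <errno.h>
#include <stdio.h>

static int free_blocks = 4;     /* toy pool, stands in for real free space */

static int reserve(int blocks)  /* stand-in for xfs_trans_reserve() */
{
    return blocks > free_blocks ? -ENOSPC : 0;
}

static void flush_delalloc(void)    /* stand-in for xfs_flush_inodes() */
{
    free_blocks += 2;           /* pretend the flush freed some space */
}

static int reserve_with_fallback(int wanted, int *granted)
{
    int error;

    *granted = wanted;
    error = reserve(wanted);
    if (error == -ENOSPC) {
        flush_delalloc();       /* convert delalloc, then retry in full */
        error = reserve(wanted);
    }
    if (error == -ENOSPC) {
        *granted = 0;           /* "no-allocation" reservation */
        error = reserve(0);
    }
    return error;
}

int main(void)
{
    int granted;
    int error = reserve_with_fallback(8, &granted);

    printf("error=%d granted=%d\n", error, granted);
    return 0;
}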
@@ -1153,9 +1153,11 @@ xfs_create(
1153 if (error) 1153 if (error)
1154 goto out_trans_cancel; 1154 goto out_trans_cancel;
1155 1155
1156 error = xfs_dir_canenter(tp, dp, name, resblks); 1156 if (!resblks) {
1157 if (error) 1157 error = xfs_dir_canenter(tp, dp, name);
1158 goto out_trans_cancel; 1158 if (error)
1159 goto out_trans_cancel;
1160 }
1159 1161
1160 /* 1162 /*
1161 * A newly created regular or special file just has one directory 1163 * A newly created regular or special file just has one directory
@@ -1165,7 +1167,7 @@ xfs_create(
1165 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, 1167 error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
1166 prid, resblks > 0, &ip, &committed); 1168 prid, resblks > 0, &ip, &committed);
1167 if (error) { 1169 if (error) {
1168 if (error == ENOSPC) 1170 if (error == -ENOSPC)
1169 goto out_trans_cancel; 1171 goto out_trans_cancel;
1170 goto out_trans_abort; 1172 goto out_trans_abort;
1171 } 1173 }
@@ -1184,7 +1186,7 @@ xfs_create(
1184 &first_block, &free_list, resblks ? 1186 &first_block, &free_list, resblks ?
1185 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 1187 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1186 if (error) { 1188 if (error) {
1187 ASSERT(error != ENOSPC); 1189 ASSERT(error != -ENOSPC);
1188 goto out_trans_abort; 1190 goto out_trans_abort;
1189 } 1191 }
1190 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 1192 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -1274,7 +1276,7 @@ xfs_create_tmpfile(
1274 uint resblks; 1276 uint resblks;
1275 1277
1276 if (XFS_FORCED_SHUTDOWN(mp)) 1278 if (XFS_FORCED_SHUTDOWN(mp))
1277 return XFS_ERROR(EIO); 1279 return -EIO;
1278 1280
1279 prid = xfs_get_initial_prid(dp); 1281 prid = xfs_get_initial_prid(dp);
1280 1282
@@ -1293,7 +1295,7 @@ xfs_create_tmpfile(
1293 1295
1294 tres = &M_RES(mp)->tr_create_tmpfile; 1296 tres = &M_RES(mp)->tr_create_tmpfile;
1295 error = xfs_trans_reserve(tp, tres, resblks, 0); 1297 error = xfs_trans_reserve(tp, tres, resblks, 0);
1296 if (error == ENOSPC) { 1298 if (error == -ENOSPC) {
1297 /* No space at all so try a "no-allocation" reservation */ 1299 /* No space at all so try a "no-allocation" reservation */
1298 resblks = 0; 1300 resblks = 0;
1299 error = xfs_trans_reserve(tp, tres, 0, 0); 1301 error = xfs_trans_reserve(tp, tres, 0, 0);
@@ -1311,7 +1313,7 @@ xfs_create_tmpfile(
1311 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0, 1313 error = xfs_dir_ialloc(&tp, dp, mode, 1, 0,
1312 prid, resblks > 0, &ip, NULL); 1314 prid, resblks > 0, &ip, NULL);
1313 if (error) { 1315 if (error) {
1314 if (error == ENOSPC) 1316 if (error == -ENOSPC)
1315 goto out_trans_cancel; 1317 goto out_trans_cancel;
1316 goto out_trans_abort; 1318 goto out_trans_abort;
1317 } 1319 }
@@ -1382,7 +1384,7 @@ xfs_link(
1382 ASSERT(!S_ISDIR(sip->i_d.di_mode)); 1384 ASSERT(!S_ISDIR(sip->i_d.di_mode));
1383 1385
1384 if (XFS_FORCED_SHUTDOWN(mp)) 1386 if (XFS_FORCED_SHUTDOWN(mp))
1385 return XFS_ERROR(EIO); 1387 return -EIO;
1386 1388
1387 error = xfs_qm_dqattach(sip, 0); 1389 error = xfs_qm_dqattach(sip, 0);
1388 if (error) 1390 if (error)
@@ -1396,7 +1398,7 @@ xfs_link(
1396 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 1398 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1397 resblks = XFS_LINK_SPACE_RES(mp, target_name->len); 1399 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
1398 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); 1400 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
1399 if (error == ENOSPC) { 1401 if (error == -ENOSPC) {
1400 resblks = 0; 1402 resblks = 0;
1401 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); 1403 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
1402 } 1404 }
@@ -1417,13 +1419,15 @@ xfs_link(
1417 */ 1419 */
1418 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 1420 if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
1419 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) { 1421 (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
1420 error = XFS_ERROR(EXDEV); 1422 error = -EXDEV;
1421 goto error_return; 1423 goto error_return;
1422 } 1424 }
1423 1425
1424 error = xfs_dir_canenter(tp, tdp, target_name, resblks); 1426 if (!resblks) {
1425 if (error) 1427 error = xfs_dir_canenter(tp, tdp, target_name);
1426 goto error_return; 1428 if (error)
1429 goto error_return;
1430 }
1427 1431
1428 xfs_bmap_init(&free_list, &first_block); 1432 xfs_bmap_init(&free_list, &first_block);
1429 1433
@@ -1635,8 +1639,8 @@ xfs_release(
1635 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); 1639 truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
1636 if (truncated) { 1640 if (truncated) {
1637 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); 1641 xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
1638 if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { 1642 if (ip->i_delayed_blks > 0) {
1639 error = -filemap_flush(VFS_I(ip)->i_mapping); 1643 error = filemap_flush(VFS_I(ip)->i_mapping);
1640 if (error) 1644 if (error)
1641 return error; 1645 return error;
1642 } 1646 }
@@ -1673,7 +1677,7 @@ xfs_release(
1673 return 0; 1677 return 0;
1674 1678
1675 error = xfs_free_eofblocks(mp, ip, true); 1679 error = xfs_free_eofblocks(mp, ip, true);
1676 if (error && error != EAGAIN) 1680 if (error && error != -EAGAIN)
1677 return error; 1681 return error;
1678 1682
1679 /* delalloc blocks after truncation means it really is dirty */ 1683 /* delalloc blocks after truncation means it really is dirty */
@@ -1772,7 +1776,7 @@ xfs_inactive_ifree(
1772 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 1776 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree,
1773 XFS_IFREE_SPACE_RES(mp), 0); 1777 XFS_IFREE_SPACE_RES(mp), 0);
1774 if (error) { 1778 if (error) {
1775 if (error == ENOSPC) { 1779 if (error == -ENOSPC) {
1776 xfs_warn_ratelimited(mp, 1780 xfs_warn_ratelimited(mp,
1777 "Failed to remove inode(s) from unlinked list. " 1781 "Failed to remove inode(s) from unlinked list. "
1778 "Please free space, unmount and run xfs_repair."); 1782 "Please free space, unmount and run xfs_repair.");
@@ -2219,7 +2223,7 @@ xfs_ifree_cluster(
2219 XBF_UNMAPPED); 2223 XBF_UNMAPPED);
2220 2224
2221 if (!bp) 2225 if (!bp)
2222 return ENOMEM; 2226 return -ENOMEM;
2223 2227
2224 /* 2228 /*
2225 * This buffer may not have been correctly initialised as we 2229 * This buffer may not have been correctly initialised as we
@@ -2491,7 +2495,7 @@ xfs_remove(
2491 trace_xfs_remove(dp, name); 2495 trace_xfs_remove(dp, name);
2492 2496
2493 if (XFS_FORCED_SHUTDOWN(mp)) 2497 if (XFS_FORCED_SHUTDOWN(mp))
2494 return XFS_ERROR(EIO); 2498 return -EIO;
2495 2499
2496 error = xfs_qm_dqattach(dp, 0); 2500 error = xfs_qm_dqattach(dp, 0);
2497 if (error) 2501 if (error)
@@ -2521,12 +2525,12 @@ xfs_remove(
2521 */ 2525 */
2522 resblks = XFS_REMOVE_SPACE_RES(mp); 2526 resblks = XFS_REMOVE_SPACE_RES(mp);
2523 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0); 2527 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
2524 if (error == ENOSPC) { 2528 if (error == -ENOSPC) {
2525 resblks = 0; 2529 resblks = 0;
2526 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0); 2530 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
2527 } 2531 }
2528 if (error) { 2532 if (error) {
2529 ASSERT(error != ENOSPC); 2533 ASSERT(error != -ENOSPC);
2530 cancel_flags = 0; 2534 cancel_flags = 0;
2531 goto out_trans_cancel; 2535 goto out_trans_cancel;
2532 } 2536 }
@@ -2543,11 +2547,11 @@ xfs_remove(
2543 if (is_dir) { 2547 if (is_dir) {
2544 ASSERT(ip->i_d.di_nlink >= 2); 2548 ASSERT(ip->i_d.di_nlink >= 2);
2545 if (ip->i_d.di_nlink != 2) { 2549 if (ip->i_d.di_nlink != 2) {
2546 error = XFS_ERROR(ENOTEMPTY); 2550 error = -ENOTEMPTY;
2547 goto out_trans_cancel; 2551 goto out_trans_cancel;
2548 } 2552 }
2549 if (!xfs_dir_isempty(ip)) { 2553 if (!xfs_dir_isempty(ip)) {
2550 error = XFS_ERROR(ENOTEMPTY); 2554 error = -ENOTEMPTY;
2551 goto out_trans_cancel; 2555 goto out_trans_cancel;
2552 } 2556 }
2553 2557
@@ -2582,7 +2586,7 @@ xfs_remove(
2582 error = xfs_dir_removename(tp, dp, name, ip->i_ino, 2586 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2583 &first_block, &free_list, resblks); 2587 &first_block, &free_list, resblks);
2584 if (error) { 2588 if (error) {
2585 ASSERT(error != ENOENT); 2589 ASSERT(error != -ENOENT);
2586 goto out_bmap_cancel; 2590 goto out_bmap_cancel;
2587 } 2591 }
2588 2592
@@ -2702,7 +2706,7 @@ xfs_rename(
2702 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2706 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2703 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 2707 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2704 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); 2708 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2705 if (error == ENOSPC) { 2709 if (error == -ENOSPC) {
2706 spaceres = 0; 2710 spaceres = 0;
2707 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); 2711 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2708 } 2712 }
@@ -2747,7 +2751,7 @@ xfs_rename(
2747 */ 2751 */
2748 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && 2752 if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2749 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { 2753 (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2750 error = XFS_ERROR(EXDEV); 2754 error = -EXDEV;
2751 goto error_return; 2755 goto error_return;
2752 } 2756 }
2753 2757
@@ -2759,9 +2763,11 @@ xfs_rename(
2759 * If there's no space reservation, check the entry will 2763 * If there's no space reservation, check the entry will
2760 * fit before actually inserting it. 2764 * fit before actually inserting it.
2761 */ 2765 */
2762 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); 2766 if (!spaceres) {
2763 if (error) 2767 error = xfs_dir_canenter(tp, target_dp, target_name);
2764 goto error_return; 2768 if (error)
2769 goto error_return;
2770 }
2765 /* 2771 /*
2766 * If target does not exist and the rename crosses 2772 * If target does not exist and the rename crosses
2767 * directories, adjust the target directory link count 2773 * directories, adjust the target directory link count
@@ -2770,7 +2776,7 @@ xfs_rename(
2770 error = xfs_dir_createname(tp, target_dp, target_name, 2776 error = xfs_dir_createname(tp, target_dp, target_name,
2771 src_ip->i_ino, &first_block, 2777 src_ip->i_ino, &first_block,
2772 &free_list, spaceres); 2778 &free_list, spaceres);
2773 if (error == ENOSPC) 2779 if (error == -ENOSPC)
2774 goto error_return; 2780 goto error_return;
2775 if (error) 2781 if (error)
2776 goto abort_return; 2782 goto abort_return;
@@ -2795,7 +2801,7 @@ xfs_rename(
2795 */ 2801 */
2796 if (!(xfs_dir_isempty(target_ip)) || 2802 if (!(xfs_dir_isempty(target_ip)) ||
2797 (target_ip->i_d.di_nlink > 2)) { 2803 (target_ip->i_d.di_nlink > 2)) {
2798 error = XFS_ERROR(EEXIST); 2804 error = -EEXIST;
2799 goto error_return; 2805 goto error_return;
2800 } 2806 }
2801 } 2807 }
@@ -2847,7 +2853,7 @@ xfs_rename(
2847 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 2853 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
2848 target_dp->i_ino, 2854 target_dp->i_ino,
2849 &first_block, &free_list, spaceres); 2855 &first_block, &free_list, spaceres);
2850 ASSERT(error != EEXIST); 2856 ASSERT(error != -EEXIST);
2851 if (error) 2857 if (error)
2852 goto abort_return; 2858 goto abort_return;
2853 } 2859 }
@@ -3055,8 +3061,8 @@ cluster_corrupt_out:
3055 if (bp->b_iodone) { 3061 if (bp->b_iodone) {
3056 XFS_BUF_UNDONE(bp); 3062 XFS_BUF_UNDONE(bp);
3057 xfs_buf_stale(bp); 3063 xfs_buf_stale(bp);
3058 xfs_buf_ioerror(bp, EIO); 3064 xfs_buf_ioerror(bp, -EIO);
3059 xfs_buf_ioend(bp, 0); 3065 xfs_buf_ioend(bp);
3060 } else { 3066 } else {
3061 xfs_buf_stale(bp); 3067 xfs_buf_stale(bp);
3062 xfs_buf_relse(bp); 3068 xfs_buf_relse(bp);
@@ -3069,7 +3075,7 @@ cluster_corrupt_out:
3069 xfs_iflush_abort(iq, false); 3075 xfs_iflush_abort(iq, false);
3070 kmem_free(ilist); 3076 kmem_free(ilist);
3071 xfs_perag_put(pag); 3077 xfs_perag_put(pag);
3072 return XFS_ERROR(EFSCORRUPTED); 3078 return -EFSCORRUPTED;
3073} 3079}
3074 3080
3075/* 3081/*
@@ -3124,7 +3130,7 @@ xfs_iflush(
3124 * as we wait for an empty AIL as part of the unmount process. 3130 * as we wait for an empty AIL as part of the unmount process.
3125 */ 3131 */
3126 if (XFS_FORCED_SHUTDOWN(mp)) { 3132 if (XFS_FORCED_SHUTDOWN(mp)) {
3127 error = XFS_ERROR(EIO); 3133 error = -EIO;
3128 goto abort_out; 3134 goto abort_out;
3129 } 3135 }
3130 3136
@@ -3167,7 +3173,7 @@ corrupt_out:
3167 xfs_buf_relse(bp); 3173 xfs_buf_relse(bp);
3168 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3174 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3169cluster_corrupt_out: 3175cluster_corrupt_out:
3170 error = XFS_ERROR(EFSCORRUPTED); 3176 error = -EFSCORRUPTED;
3171abort_out: 3177abort_out:
3172 /* 3178 /*
3173 * Unlocks the flush lock 3179 * Unlocks the flush lock
@@ -3331,5 +3337,5 @@ xfs_iflush_int(
3331 return 0; 3337 return 0;
3332 3338
3333corrupt_out: 3339corrupt_out:
3334 return XFS_ERROR(EFSCORRUPTED); 3340 return -EFSCORRUPTED;
3335} 3341}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index f72bffa67266..9af2882e1f4c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -102,7 +102,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size)
102{ 102{
103 xfs_fsize_t i_size = i_size_read(VFS_I(ip)); 103 xfs_fsize_t i_size = i_size_read(VFS_I(ip));
104 104
105 if (new_size > i_size) 105 if (new_size > i_size || new_size < 0)
106 new_size = i_size; 106 new_size = i_size;
107 return new_size > ip->i_d.di_size ? new_size : 0; 107 return new_size > ip->i_d.di_size ? new_size : 0;
108} 108}
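[Editor's note] The added new_size < 0 test above guards against a write whose offset plus count wraps past the maximum signed file size: a wrapped (negative) value must be treated as "no extension" rather than compared as if it were a huge size. A self-contained illustration of the clamped computation (simplified types, not the kernel's):

#include <stdio.h>

typedef long long fsize_t;

static fsize_t new_eof(fsize_t vfs_size, fsize_t disk_size, fsize_t new_size)
{
    if (new_size > vfs_size || new_size < 0)    /* clamp wrapped values */
        new_size = vfs_size;
    return new_size > disk_size ? new_size : 0;
}

int main(void)
{
    /* offset + count wrapped negative: must not extend the file */
    printf("%lld\n", new_eof(4096, 4096, -1));
    return 0;
}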
@@ -398,4 +398,14 @@ do { \
398 398
399extern struct kmem_zone *xfs_inode_zone; 399extern struct kmem_zone *xfs_inode_zone;
400 400
401/*
402 * Flags for read/write calls
403 */
404#define XFS_IO_ISDIRECT 0x00001 /* bypass page cache */
405#define XFS_IO_INVIS 0x00002 /* don't update inode timestamps */
406
407#define XFS_IO_FLAGS \
408 { XFS_IO_ISDIRECT, "DIRECT" }, \
 409 { XFS_IO_INVIS, "INVIS" }
410
401#endif /* __XFS_INODE_H__ */ 411#endif /* __XFS_INODE_H__ */
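[Editor's note] The XFS_IO_FLAGS table added above pairs each flag with a printable name, the form consumed by the kernel's tracepoint __print_flags() machinery. A userspace sketch of the same flag-to-string expansion (illustrative only):

#include <stdio.h>

#define IO_ISDIRECT 0x1
#define IO_INVIS    0x2

static const struct { unsigned flag; const char *name; } io_flag_names[] = {
    { IO_ISDIRECT, "DIRECT" },
    { IO_INVIS,    "INVIS" },
};

static void print_ioflags(unsigned flags)
{
    const char *sep = "";
    unsigned i;

    for (i = 0; i < sizeof(io_flag_names) / sizeof(io_flag_names[0]); i++) {
        if (flags & io_flag_names[i].flag) {
            printf("%s%s", sep, io_flag_names[i].name);
            sep = "|";
        }
    }
    printf("\n");
}

int main(void)
{
    print_ioflags(IO_ISDIRECT | IO_INVIS);  /* prints DIRECT|INVIS */
    return 0;
}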
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index a640137b3573..63de0b0acc32 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -615,7 +615,7 @@ xfs_iflush_done(
615 blip = bp->b_fspriv; 615 blip = bp->b_fspriv;
616 prev = NULL; 616 prev = NULL;
617 while (blip != NULL) { 617 while (blip != NULL) {
618 if (lip->li_cb != xfs_iflush_done) { 618 if (blip->li_cb != xfs_iflush_done) {
619 prev = blip; 619 prev = blip;
620 blip = blip->li_bio_list; 620 blip = blip->li_bio_list;
621 continue; 621 continue;
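[Editor's note] The lip -> blip fix above is a classic list-walk bug: the filter tested an outer variable instead of the node being visited, so the wrong entries were skipped. A minimal model of the corrected walk, removing unwanted nodes from a singly linked list while tracking the previous node:

#include <stdio.h>
#include <stddef.h>

struct item { int keep; struct item *next; };

/* Remove every node with keep == 0. */
static struct item *filter(struct item *head)
{
    struct item *cur = head, *prev = NULL;

    while (cur != NULL) {
        if (cur->keep) {            /* test the node being visited */
            prev = cur;
            cur = cur->next;
            continue;
        }
        if (prev)
            prev->next = cur->next; /* unlink cur */
        else
            head = cur->next;
        cur = prev ? prev->next : head;
    }
    return head;
}

int main(void)
{
    struct item c = { 1, NULL }, b = { 0, &c }, a = { 1, &b };

    for (struct item *p = filter(&a); p; p = p->next)
        printf("%d ", p->keep);     /* prints: 1 1 */
    printf("\n");
    return 0;
}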
@@ -788,5 +788,5 @@ xfs_inode_item_format_convert(
788 in_f->ilf_boffset = in_f64->ilf_boffset; 788 in_f->ilf_boffset = in_f64->ilf_boffset;
789 return 0; 789 return 0;
790 } 790 }
791 return EFSCORRUPTED; 791 return -EFSCORRUPTED;
792} 792}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8bc1bbce7451..24c926b6fe85 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -207,7 +207,7 @@ xfs_open_by_handle(
207 struct path path; 207 struct path path;
208 208
209 if (!capable(CAP_SYS_ADMIN)) 209 if (!capable(CAP_SYS_ADMIN))
210 return -XFS_ERROR(EPERM); 210 return -EPERM;
211 211
212 dentry = xfs_handlereq_to_dentry(parfilp, hreq); 212 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
213 if (IS_ERR(dentry)) 213 if (IS_ERR(dentry))
@@ -216,7 +216,7 @@ xfs_open_by_handle(
216 216
217 /* Restrict xfs_open_by_handle to directories & regular files. */ 217 /* Restrict xfs_open_by_handle to directories & regular files. */
218 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { 218 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
219 error = -XFS_ERROR(EPERM); 219 error = -EPERM;
220 goto out_dput; 220 goto out_dput;
221 } 221 }
222 222
@@ -228,18 +228,18 @@ xfs_open_by_handle(
228 fmode = OPEN_FMODE(permflag); 228 fmode = OPEN_FMODE(permflag);
229 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) && 229 if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
230 (fmode & FMODE_WRITE) && IS_APPEND(inode)) { 230 (fmode & FMODE_WRITE) && IS_APPEND(inode)) {
231 error = -XFS_ERROR(EPERM); 231 error = -EPERM;
232 goto out_dput; 232 goto out_dput;
233 } 233 }
234 234
235 if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) { 235 if ((fmode & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
236 error = -XFS_ERROR(EACCES); 236 error = -EACCES;
237 goto out_dput; 237 goto out_dput;
238 } 238 }
239 239
240 /* Can't write directories. */ 240 /* Can't write directories. */
241 if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) { 241 if (S_ISDIR(inode->i_mode) && (fmode & FMODE_WRITE)) {
242 error = -XFS_ERROR(EISDIR); 242 error = -EISDIR;
243 goto out_dput; 243 goto out_dput;
244 } 244 }
245 245
@@ -282,7 +282,7 @@ xfs_readlink_by_handle(
282 int error; 282 int error;
283 283
284 if (!capable(CAP_SYS_ADMIN)) 284 if (!capable(CAP_SYS_ADMIN))
285 return -XFS_ERROR(EPERM); 285 return -EPERM;
286 286
287 dentry = xfs_handlereq_to_dentry(parfilp, hreq); 287 dentry = xfs_handlereq_to_dentry(parfilp, hreq);
288 if (IS_ERR(dentry)) 288 if (IS_ERR(dentry))
@@ -290,22 +290,22 @@ xfs_readlink_by_handle(
290 290
291 /* Restrict this handle operation to symlinks only. */ 291 /* Restrict this handle operation to symlinks only. */
292 if (!S_ISLNK(dentry->d_inode->i_mode)) { 292 if (!S_ISLNK(dentry->d_inode->i_mode)) {
293 error = -XFS_ERROR(EINVAL); 293 error = -EINVAL;
294 goto out_dput; 294 goto out_dput;
295 } 295 }
296 296
297 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) { 297 if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
298 error = -XFS_ERROR(EFAULT); 298 error = -EFAULT;
299 goto out_dput; 299 goto out_dput;
300 } 300 }
301 301
302 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); 302 link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
303 if (!link) { 303 if (!link) {
304 error = -XFS_ERROR(ENOMEM); 304 error = -ENOMEM;
305 goto out_dput; 305 goto out_dput;
306 } 306 }
307 307
308 error = -xfs_readlink(XFS_I(dentry->d_inode), link); 308 error = xfs_readlink(XFS_I(dentry->d_inode), link);
309 if (error) 309 if (error)
310 goto out_kfree; 310 goto out_kfree;
311 error = readlink_copy(hreq->ohandle, olen, link); 311 error = readlink_copy(hreq->ohandle, olen, link);
@@ -330,10 +330,10 @@ xfs_set_dmattrs(
330 int error; 330 int error;
331 331
332 if (!capable(CAP_SYS_ADMIN)) 332 if (!capable(CAP_SYS_ADMIN))
333 return XFS_ERROR(EPERM); 333 return -EPERM;
334 334
335 if (XFS_FORCED_SHUTDOWN(mp)) 335 if (XFS_FORCED_SHUTDOWN(mp))
336 return XFS_ERROR(EIO); 336 return -EIO;
337 337
338 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); 338 tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS);
339 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); 339 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
@@ -364,9 +364,9 @@ xfs_fssetdm_by_handle(
364 struct dentry *dentry; 364 struct dentry *dentry;
365 365
366 if (!capable(CAP_MKNOD)) 366 if (!capable(CAP_MKNOD))
367 return -XFS_ERROR(EPERM); 367 return -EPERM;
368 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t))) 368 if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
369 return -XFS_ERROR(EFAULT); 369 return -EFAULT;
370 370
371 error = mnt_want_write_file(parfilp); 371 error = mnt_want_write_file(parfilp);
372 if (error) 372 if (error)
@@ -379,16 +379,16 @@ xfs_fssetdm_by_handle(
379 } 379 }
380 380
381 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { 381 if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
382 error = -XFS_ERROR(EPERM); 382 error = -EPERM;
383 goto out; 383 goto out;
384 } 384 }
385 385
386 if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) { 386 if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
387 error = -XFS_ERROR(EFAULT); 387 error = -EFAULT;
388 goto out; 388 goto out;
389 } 389 }
390 390
391 error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, 391 error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
392 fsd.fsd_dmstate); 392 fsd.fsd_dmstate);
393 393
394 out: 394 out:
@@ -409,18 +409,18 @@ xfs_attrlist_by_handle(
409 char *kbuf; 409 char *kbuf;
410 410
411 if (!capable(CAP_SYS_ADMIN)) 411 if (!capable(CAP_SYS_ADMIN))
412 return -XFS_ERROR(EPERM); 412 return -EPERM;
413 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) 413 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
414 return -XFS_ERROR(EFAULT); 414 return -EFAULT;
415 if (al_hreq.buflen < sizeof(struct attrlist) || 415 if (al_hreq.buflen < sizeof(struct attrlist) ||
416 al_hreq.buflen > XATTR_LIST_MAX) 416 al_hreq.buflen > XATTR_LIST_MAX)
417 return -XFS_ERROR(EINVAL); 417 return -EINVAL;
418 418
419 /* 419 /*
420 * Reject flags, only allow namespaces. 420 * Reject flags, only allow namespaces.
421 */ 421 */
422 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE)) 422 if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
423 return -XFS_ERROR(EINVAL); 423 return -EINVAL;
424 424
425 dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq); 425 dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
426 if (IS_ERR(dentry)) 426 if (IS_ERR(dentry))
@@ -431,7 +431,7 @@ xfs_attrlist_by_handle(
431 goto out_dput; 431 goto out_dput;
432 432
433 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; 433 cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
434 error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, 434 error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
435 al_hreq.flags, cursor); 435 al_hreq.flags, cursor);
436 if (error) 436 if (error)
437 goto out_kfree; 437 goto out_kfree;
@@ -455,20 +455,20 @@ xfs_attrmulti_attr_get(
455 __uint32_t flags) 455 __uint32_t flags)
456{ 456{
457 unsigned char *kbuf; 457 unsigned char *kbuf;
458 int error = EFAULT; 458 int error = -EFAULT;
459 459
460 if (*len > XATTR_SIZE_MAX) 460 if (*len > XATTR_SIZE_MAX)
461 return EINVAL; 461 return -EINVAL;
462 kbuf = kmem_zalloc_large(*len, KM_SLEEP); 462 kbuf = kmem_zalloc_large(*len, KM_SLEEP);
463 if (!kbuf) 463 if (!kbuf)
464 return ENOMEM; 464 return -ENOMEM;
465 465
466 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); 466 error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
467 if (error) 467 if (error)
468 goto out_kfree; 468 goto out_kfree;
469 469
470 if (copy_to_user(ubuf, kbuf, *len)) 470 if (copy_to_user(ubuf, kbuf, *len))
471 error = EFAULT; 471 error = -EFAULT;
472 472
473out_kfree: 473out_kfree:
474 kmem_free(kbuf); 474 kmem_free(kbuf);
@@ -484,20 +484,17 @@ xfs_attrmulti_attr_set(
484 __uint32_t flags) 484 __uint32_t flags)
485{ 485{
486 unsigned char *kbuf; 486 unsigned char *kbuf;
487 int error = EFAULT;
488 487
489 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 488 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
490 return EPERM; 489 return -EPERM;
491 if (len > XATTR_SIZE_MAX) 490 if (len > XATTR_SIZE_MAX)
492 return EINVAL; 491 return -EINVAL;
493 492
494 kbuf = memdup_user(ubuf, len); 493 kbuf = memdup_user(ubuf, len);
495 if (IS_ERR(kbuf)) 494 if (IS_ERR(kbuf))
496 return PTR_ERR(kbuf); 495 return PTR_ERR(kbuf);
497 496
498 error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); 497 return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
499
500 return error;
501} 498}
502 499
503int 500int
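[Editor's note] The simplification above leans on memdup_user(), which allocates and copies a user buffer in one call and returns an ERR_PTR on failure, letting xfs_attr_set()'s result be returned directly with no local error variable. A userspace analogue of the allocate-and-copy helper (malloc/memcpy stand in for the kernel's user-copy machinery):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Allocate and copy in one step, reporting failure through one code. */
static void *memdup_buf(const void *src, size_t len, int *err)
{
    void *p = malloc(len);

    if (!p) {
        *err = -ENOMEM;
        return NULL;
    }
    memcpy(p, src, len);    /* the kernel uses copy_from_user() here */
    *err = 0;
    return p;
}

int main(void)
{
    int err;
    char *copy = memdup_buf("value", 6, &err);

    if (!err) {
        printf("%s\n", copy);
        free(copy);
    }
    return 0;
}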
@@ -507,7 +504,7 @@ xfs_attrmulti_attr_remove(
507 __uint32_t flags) 504 __uint32_t flags)
508{ 505{
509 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 506 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
510 return EPERM; 507 return -EPERM;
511 return xfs_attr_remove(XFS_I(inode), name, flags); 508 return xfs_attr_remove(XFS_I(inode), name, flags);
512} 509}
513 510
@@ -524,9 +521,9 @@ xfs_attrmulti_by_handle(
524 unsigned char *attr_name; 521 unsigned char *attr_name;
525 522
526 if (!capable(CAP_SYS_ADMIN)) 523 if (!capable(CAP_SYS_ADMIN))
527 return -XFS_ERROR(EPERM); 524 return -EPERM;
528 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t))) 525 if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
529 return -XFS_ERROR(EFAULT); 526 return -EFAULT;
530 527
531 /* overflow check */ 528 /* overflow check */
532 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t)) 529 if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
@@ -536,18 +533,18 @@ xfs_attrmulti_by_handle(
536 if (IS_ERR(dentry)) 533 if (IS_ERR(dentry))
537 return PTR_ERR(dentry); 534 return PTR_ERR(dentry);
538 535
539 error = E2BIG; 536 error = -E2BIG;
540 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t); 537 size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
541 if (!size || size > 16 * PAGE_SIZE) 538 if (!size || size > 16 * PAGE_SIZE)
542 goto out_dput; 539 goto out_dput;
543 540
544 ops = memdup_user(am_hreq.ops, size); 541 ops = memdup_user(am_hreq.ops, size);
545 if (IS_ERR(ops)) { 542 if (IS_ERR(ops)) {
546 error = -PTR_ERR(ops); 543 error = PTR_ERR(ops);
547 goto out_dput; 544 goto out_dput;
548 } 545 }
549 546
550 error = ENOMEM; 547 error = -ENOMEM;
551 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL); 548 attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
552 if (!attr_name) 549 if (!attr_name)
553 goto out_kfree_ops; 550 goto out_kfree_ops;
@@ -557,7 +554,7 @@ xfs_attrmulti_by_handle(
557 ops[i].am_error = strncpy_from_user((char *)attr_name, 554 ops[i].am_error = strncpy_from_user((char *)attr_name,
558 ops[i].am_attrname, MAXNAMELEN); 555 ops[i].am_attrname, MAXNAMELEN);
559 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN) 556 if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
560 error = ERANGE; 557 error = -ERANGE;
561 if (ops[i].am_error < 0) 558 if (ops[i].am_error < 0)
562 break; 559 break;
563 560
@@ -588,19 +585,19 @@ xfs_attrmulti_by_handle(
588 mnt_drop_write_file(parfilp); 585 mnt_drop_write_file(parfilp);
589 break; 586 break;
590 default: 587 default:
591 ops[i].am_error = EINVAL; 588 ops[i].am_error = -EINVAL;
592 } 589 }
593 } 590 }
594 591
595 if (copy_to_user(am_hreq.ops, ops, size)) 592 if (copy_to_user(am_hreq.ops, ops, size))
596 error = XFS_ERROR(EFAULT); 593 error = -EFAULT;
597 594
598 kfree(attr_name); 595 kfree(attr_name);
599 out_kfree_ops: 596 out_kfree_ops:
600 kfree(ops); 597 kfree(ops);
601 out_dput: 598 out_dput:
602 dput(dentry); 599 dput(dentry);
603 return -error; 600 return error;
604} 601}
605 602
606int 603int
@@ -625,16 +622,16 @@ xfs_ioc_space(
625 */ 622 */
626 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) && 623 if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
627 !capable(CAP_SYS_ADMIN)) 624 !capable(CAP_SYS_ADMIN))
628 return -XFS_ERROR(EPERM); 625 return -EPERM;
629 626
630 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) 627 if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
631 return -XFS_ERROR(EPERM); 628 return -EPERM;
632 629
633 if (!(filp->f_mode & FMODE_WRITE)) 630 if (!(filp->f_mode & FMODE_WRITE))
634 return -XFS_ERROR(EBADF); 631 return -EBADF;
635 632
636 if (!S_ISREG(inode->i_mode)) 633 if (!S_ISREG(inode->i_mode))
637 return -XFS_ERROR(EINVAL); 634 return -EINVAL;
638 635
639 error = mnt_want_write_file(filp); 636 error = mnt_want_write_file(filp);
640 if (error) 637 if (error)
@@ -652,7 +649,7 @@ xfs_ioc_space(
652 bf->l_start += XFS_ISIZE(ip); 649 bf->l_start += XFS_ISIZE(ip);
653 break; 650 break;
654 default: 651 default:
655 error = XFS_ERROR(EINVAL); 652 error = -EINVAL;
656 goto out_unlock; 653 goto out_unlock;
657 } 654 }
658 655
@@ -669,7 +666,7 @@ xfs_ioc_space(
669 case XFS_IOC_UNRESVSP: 666 case XFS_IOC_UNRESVSP:
670 case XFS_IOC_UNRESVSP64: 667 case XFS_IOC_UNRESVSP64:
671 if (bf->l_len <= 0) { 668 if (bf->l_len <= 0) {
672 error = XFS_ERROR(EINVAL); 669 error = -EINVAL;
673 goto out_unlock; 670 goto out_unlock;
674 } 671 }
675 break; 672 break;
@@ -682,7 +679,7 @@ xfs_ioc_space(
682 bf->l_start > mp->m_super->s_maxbytes || 679 bf->l_start > mp->m_super->s_maxbytes ||
683 bf->l_start + bf->l_len < 0 || 680 bf->l_start + bf->l_len < 0 ||
684 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) { 681 bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
685 error = XFS_ERROR(EINVAL); 682 error = -EINVAL;
686 goto out_unlock; 683 goto out_unlock;
687 } 684 }
688 685
@@ -723,7 +720,7 @@ xfs_ioc_space(
723 break; 720 break;
724 default: 721 default:
725 ASSERT(0); 722 ASSERT(0);
726 error = XFS_ERROR(EINVAL); 723 error = -EINVAL;
727 } 724 }
728 725
729 if (error) 726 if (error)
@@ -739,7 +736,7 @@ xfs_ioc_space(
739 xfs_ilock(ip, XFS_ILOCK_EXCL); 736 xfs_ilock(ip, XFS_ILOCK_EXCL);
740 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 737 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
741 738
742 if (!(ioflags & IO_INVIS)) { 739 if (!(ioflags & XFS_IO_INVIS)) {
743 ip->i_d.di_mode &= ~S_ISUID; 740 ip->i_d.di_mode &= ~S_ISUID;
744 if (ip->i_d.di_mode & S_IXGRP) 741 if (ip->i_d.di_mode & S_IXGRP)
745 ip->i_d.di_mode &= ~S_ISGID; 742 ip->i_d.di_mode &= ~S_ISGID;
@@ -759,7 +756,7 @@ xfs_ioc_space(
759out_unlock: 756out_unlock:
760 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 757 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
761 mnt_drop_write_file(filp); 758 mnt_drop_write_file(filp);
762 return -error; 759 return error;
763} 760}
764 761
765STATIC int 762STATIC int
@@ -781,41 +778,41 @@ xfs_ioc_bulkstat(
781 return -EPERM; 778 return -EPERM;
782 779
783 if (XFS_FORCED_SHUTDOWN(mp)) 780 if (XFS_FORCED_SHUTDOWN(mp))
784 return -XFS_ERROR(EIO); 781 return -EIO;
785 782
786 if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t))) 783 if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
787 return -XFS_ERROR(EFAULT); 784 return -EFAULT;
788 785
789 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64))) 786 if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
790 return -XFS_ERROR(EFAULT); 787 return -EFAULT;
791 788
792 if ((count = bulkreq.icount) <= 0) 789 if ((count = bulkreq.icount) <= 0)
793 return -XFS_ERROR(EINVAL); 790 return -EINVAL;
794 791
795 if (bulkreq.ubuffer == NULL) 792 if (bulkreq.ubuffer == NULL)
796 return -XFS_ERROR(EINVAL); 793 return -EINVAL;
797 794
798 if (cmd == XFS_IOC_FSINUMBERS) 795 if (cmd == XFS_IOC_FSINUMBERS)
799 error = xfs_inumbers(mp, &inlast, &count, 796 error = xfs_inumbers(mp, &inlast, &count,
800 bulkreq.ubuffer, xfs_inumbers_fmt); 797 bulkreq.ubuffer, xfs_inumbers_fmt);
801 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE) 798 else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
802 error = xfs_bulkstat_single(mp, &inlast, 799 error = xfs_bulkstat_one(mp, inlast, bulkreq.ubuffer,
803 bulkreq.ubuffer, &done); 800 sizeof(xfs_bstat_t), NULL, &done);
804 else /* XFS_IOC_FSBULKSTAT */ 801 else /* XFS_IOC_FSBULKSTAT */
805 error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one, 802 error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
806 sizeof(xfs_bstat_t), bulkreq.ubuffer, 803 sizeof(xfs_bstat_t), bulkreq.ubuffer,
807 &done); 804 &done);
808 805
809 if (error) 806 if (error)
810 return -error; 807 return error;
811 808
812 if (bulkreq.ocount != NULL) { 809 if (bulkreq.ocount != NULL) {
813 if (copy_to_user(bulkreq.lastip, &inlast, 810 if (copy_to_user(bulkreq.lastip, &inlast,
814 sizeof(xfs_ino_t))) 811 sizeof(xfs_ino_t)))
815 return -XFS_ERROR(EFAULT); 812 return -EFAULT;
816 813
817 if (copy_to_user(bulkreq.ocount, &count, sizeof(count))) 814 if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
818 return -XFS_ERROR(EFAULT); 815 return -EFAULT;
819 } 816 }
820 817
821 return 0; 818 return 0;
@@ -831,7 +828,7 @@ xfs_ioc_fsgeometry_v1(
831 828
832 error = xfs_fs_geometry(mp, &fsgeo, 3); 829 error = xfs_fs_geometry(mp, &fsgeo, 3);
833 if (error) 830 if (error)
834 return -error; 831 return error;
835 832
836 /* 833 /*
837 * Caller should have passed an argument of type 834 * Caller should have passed an argument of type
@@ -839,7 +836,7 @@ xfs_ioc_fsgeometry_v1(
839 * xfs_fsop_geom_t that xfs_fs_geometry() fills in. 836 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
840 */ 837 */
841 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t))) 838 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
842 return -XFS_ERROR(EFAULT); 839 return -EFAULT;
843 return 0; 840 return 0;
844} 841}
845 842
@@ -853,10 +850,10 @@ xfs_ioc_fsgeometry(
853 850
854 error = xfs_fs_geometry(mp, &fsgeo, 4); 851 error = xfs_fs_geometry(mp, &fsgeo, 4);
855 if (error) 852 if (error)
856 return -error; 853 return error;
857 854
858 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 855 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
859 return -XFS_ERROR(EFAULT); 856 return -EFAULT;
860 return 0; 857 return 0;
861} 858}
862 859
@@ -971,8 +968,6 @@ xfs_set_diflags(
971 di_flags |= XFS_DIFLAG_NOATIME; 968 di_flags |= XFS_DIFLAG_NOATIME;
972 if (xflags & XFS_XFLAG_NODUMP) 969 if (xflags & XFS_XFLAG_NODUMP)
973 di_flags |= XFS_DIFLAG_NODUMP; 970 di_flags |= XFS_DIFLAG_NODUMP;
974 if (xflags & XFS_XFLAG_PROJINHERIT)
975 di_flags |= XFS_DIFLAG_PROJINHERIT;
976 if (xflags & XFS_XFLAG_NODEFRAG) 971 if (xflags & XFS_XFLAG_NODEFRAG)
977 di_flags |= XFS_DIFLAG_NODEFRAG; 972 di_flags |= XFS_DIFLAG_NODEFRAG;
978 if (xflags & XFS_XFLAG_FILESTREAM) 973 if (xflags & XFS_XFLAG_FILESTREAM)
@@ -984,6 +979,8 @@ xfs_set_diflags(
984 di_flags |= XFS_DIFLAG_NOSYMLINKS; 979 di_flags |= XFS_DIFLAG_NOSYMLINKS;
985 if (xflags & XFS_XFLAG_EXTSZINHERIT) 980 if (xflags & XFS_XFLAG_EXTSZINHERIT)
986 di_flags |= XFS_DIFLAG_EXTSZINHERIT; 981 di_flags |= XFS_DIFLAG_EXTSZINHERIT;
982 if (xflags & XFS_XFLAG_PROJINHERIT)
983 di_flags |= XFS_DIFLAG_PROJINHERIT;
987 } else if (S_ISREG(ip->i_d.di_mode)) { 984 } else if (S_ISREG(ip->i_d.di_mode)) {
988 if (xflags & XFS_XFLAG_REALTIME) 985 if (xflags & XFS_XFLAG_REALTIME)
989 di_flags |= XFS_DIFLAG_REALTIME; 986 di_flags |= XFS_DIFLAG_REALTIME;
@@ -1041,16 +1038,16 @@ xfs_ioctl_setattr(
1041 trace_xfs_ioctl_setattr(ip); 1038 trace_xfs_ioctl_setattr(ip);
1042 1039
1043 if (mp->m_flags & XFS_MOUNT_RDONLY) 1040 if (mp->m_flags & XFS_MOUNT_RDONLY)
1044 return XFS_ERROR(EROFS); 1041 return -EROFS;
1045 if (XFS_FORCED_SHUTDOWN(mp)) 1042 if (XFS_FORCED_SHUTDOWN(mp))
1046 return XFS_ERROR(EIO); 1043 return -EIO;
1047 1044
1048 /* 1045 /*
1049 * Disallow 32bit project ids when projid32bit feature is not enabled. 1046 * Disallow 32bit project ids when projid32bit feature is not enabled.
1050 */ 1047 */
1051 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && 1048 if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
1052 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) 1049 !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
1053 return XFS_ERROR(EINVAL); 1050 return -EINVAL;
1054 1051
1055 /* 1052 /*
1056 * If disk quotas is on, we make sure that the dquots do exist on disk, 1053 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1088,7 +1085,7 @@ xfs_ioctl_setattr(
1088 * CAP_FSETID capability is applicable. 1085 * CAP_FSETID capability is applicable.
1089 */ 1086 */
1090 if (!inode_owner_or_capable(VFS_I(ip))) { 1087 if (!inode_owner_or_capable(VFS_I(ip))) {
1091 code = XFS_ERROR(EPERM); 1088 code = -EPERM;
1092 goto error_return; 1089 goto error_return;
1093 } 1090 }
1094 1091
@@ -1099,7 +1096,7 @@ xfs_ioctl_setattr(
1099 */ 1096 */
1100 if (mask & FSX_PROJID) { 1097 if (mask & FSX_PROJID) {
1101 if (current_user_ns() != &init_user_ns) { 1098 if (current_user_ns() != &init_user_ns) {
1102 code = XFS_ERROR(EINVAL); 1099 code = -EINVAL;
1103 goto error_return; 1100 goto error_return;
1104 } 1101 }
1105 1102
@@ -1122,7 +1119,7 @@ xfs_ioctl_setattr(
1122 if (ip->i_d.di_nextents && 1119 if (ip->i_d.di_nextents &&
1123 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != 1120 ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
1124 fa->fsx_extsize)) { 1121 fa->fsx_extsize)) {
1125 code = XFS_ERROR(EINVAL); /* EFBIG? */ 1122 code = -EINVAL; /* EFBIG? */
1126 goto error_return; 1123 goto error_return;
1127 } 1124 }
1128 1125
@@ -1141,7 +1138,7 @@ xfs_ioctl_setattr(
1141 1138
1142 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize); 1139 extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
1143 if (extsize_fsb > MAXEXTLEN) { 1140 if (extsize_fsb > MAXEXTLEN) {
1144 code = XFS_ERROR(EINVAL); 1141 code = -EINVAL;
1145 goto error_return; 1142 goto error_return;
1146 } 1143 }
1147 1144
@@ -1153,13 +1150,13 @@ xfs_ioctl_setattr(
1153 } else { 1150 } else {
1154 size = mp->m_sb.sb_blocksize; 1151 size = mp->m_sb.sb_blocksize;
1155 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) { 1152 if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
1156 code = XFS_ERROR(EINVAL); 1153 code = -EINVAL;
1157 goto error_return; 1154 goto error_return;
1158 } 1155 }
1159 } 1156 }
1160 1157
1161 if (fa->fsx_extsize % size) { 1158 if (fa->fsx_extsize % size) {
1162 code = XFS_ERROR(EINVAL); 1159 code = -EINVAL;
1163 goto error_return; 1160 goto error_return;
1164 } 1161 }
1165 } 1162 }
@@ -1173,7 +1170,7 @@ xfs_ioctl_setattr(
1173 if ((ip->i_d.di_nextents || ip->i_delayed_blks) && 1170 if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
1174 (XFS_IS_REALTIME_INODE(ip)) != 1171 (XFS_IS_REALTIME_INODE(ip)) !=
1175 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { 1172 (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
1176 code = XFS_ERROR(EINVAL); /* EFBIG? */ 1173 code = -EINVAL; /* EFBIG? */
1177 goto error_return; 1174 goto error_return;
1178 } 1175 }
1179 1176
@@ -1184,7 +1181,7 @@ xfs_ioctl_setattr(
1184 if ((mp->m_sb.sb_rblocks == 0) || 1181 if ((mp->m_sb.sb_rblocks == 0) ||
1185 (mp->m_sb.sb_rextsize == 0) || 1182 (mp->m_sb.sb_rextsize == 0) ||
1186 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) { 1183 (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
1187 code = XFS_ERROR(EINVAL); 1184 code = -EINVAL;
1188 goto error_return; 1185 goto error_return;
1189 } 1186 }
1190 } 1187 }
@@ -1198,7 +1195,7 @@ xfs_ioctl_setattr(
1198 (fa->fsx_xflags & 1195 (fa->fsx_xflags &
1199 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && 1196 (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
1200 !capable(CAP_LINUX_IMMUTABLE)) { 1197 !capable(CAP_LINUX_IMMUTABLE)) {
1201 code = XFS_ERROR(EPERM); 1198 code = -EPERM;
1202 goto error_return; 1199 goto error_return;
1203 } 1200 }
1204 } 1201 }
@@ -1234,13 +1231,25 @@ xfs_ioctl_setattr(
1234 1231
1235 } 1232 }
1236 1233
1237 if (mask & FSX_EXTSIZE)
1238 ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1239 if (mask & FSX_XFLAGS) { 1234 if (mask & FSX_XFLAGS) {
1240 xfs_set_diflags(ip, fa->fsx_xflags); 1235 xfs_set_diflags(ip, fa->fsx_xflags);
1241 xfs_diflags_to_linux(ip); 1236 xfs_diflags_to_linux(ip);
1242 } 1237 }
1243 1238
1239 /*
1240 * Only set the extent size hint if we've already determined that the
1241 * extent size hint should be set on the inode. If no extent size flags
1242 * are set on the inode then unconditionally clear the extent size hint.
1243 */
1244 if (mask & FSX_EXTSIZE) {
1245 int extsize = 0;
1246
1247 if (ip->i_d.di_flags &
1248 (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
1249 extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
1250 ip->i_d.di_extsize = extsize;
1251 }
1252
1244 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1253 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
1245 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1254 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1246 1255
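[Editor's note] Per the comment above, the extent size hint is now stored only when one of the extent-size flags survived xfs_set_diflags(); otherwise it is unconditionally zeroed so a cleared flag cannot leave a stale hint behind. A small model of that rule (flag names invented for illustration):

#include <stdio.h>

#define FLAG_EXTSIZE      0x1
#define FLAG_EXTSZINHERIT 0x2

struct ino { unsigned flags, extsize; };

static void set_extsize_hint(struct ino *ip, unsigned requested_fsb)
{
    unsigned extsize = 0;

    if (ip->flags & (FLAG_EXTSIZE | FLAG_EXTSZINHERIT))
        extsize = requested_fsb;
    ip->extsize = extsize;  /* stale hints are cleared, never kept */
}

int main(void)
{
    struct ino with = { FLAG_EXTSIZE, 0 }, without = { 0, 99 };

    set_extsize_hint(&with, 16);
    set_extsize_hint(&without, 16);
    printf("%u %u\n", with.extsize, without.extsize);   /* 16 0 */
    return 0;
}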
@@ -1301,7 +1310,7 @@ xfs_ioc_fssetxattr(
1301 return error; 1310 return error;
1302 error = xfs_ioctl_setattr(ip, &fa, mask); 1311 error = xfs_ioctl_setattr(ip, &fa, mask);
1303 mnt_drop_write_file(filp); 1312 mnt_drop_write_file(filp);
1304 return -error; 1313 return error;
1305} 1314}
1306 1315
1307STATIC int 1316STATIC int
@@ -1346,17 +1355,17 @@ xfs_ioc_setxflags(
1346 return error; 1355 return error;
1347 error = xfs_ioctl_setattr(ip, &fa, mask); 1356 error = xfs_ioctl_setattr(ip, &fa, mask);
1348 mnt_drop_write_file(filp); 1357 mnt_drop_write_file(filp);
1349 return -error; 1358 return error;
1350} 1359}
1351 1360
1352STATIC int 1361STATIC int
1353xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) 1362xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
1354{ 1363{
1355 struct getbmap __user *base = *ap; 1364 struct getbmap __user *base = (struct getbmap __user *)*ap;
1356 1365
1357 /* copy only getbmap portion (not getbmapx) */ 1366 /* copy only getbmap portion (not getbmapx) */
1358 if (copy_to_user(base, bmv, sizeof(struct getbmap))) 1367 if (copy_to_user(base, bmv, sizeof(struct getbmap)))
1359 return XFS_ERROR(EFAULT); 1368 return -EFAULT;
1360 1369
1361 *ap += sizeof(struct getbmap); 1370 *ap += sizeof(struct getbmap);
1362 return 0; 1371 return 0;
@@ -1373,33 +1382,33 @@ xfs_ioc_getbmap(
1373 int error; 1382 int error;
1374 1383
1375 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) 1384 if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
1376 return -XFS_ERROR(EFAULT); 1385 return -EFAULT;
1377 1386
1378 if (bmx.bmv_count < 2) 1387 if (bmx.bmv_count < 2)
1379 return -XFS_ERROR(EINVAL); 1388 return -EINVAL;
1380 1389
1381 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0); 1390 bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
1382 if (ioflags & IO_INVIS) 1391 if (ioflags & XFS_IO_INVIS)
1383 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; 1392 bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
1384 1393
1385 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, 1394 error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
1386 (struct getbmap *)arg+1); 1395 (__force struct getbmap *)arg+1);
1387 if (error) 1396 if (error)
1388 return -error; 1397 return error;
1389 1398
1390 /* copy back header - only size of getbmap */ 1399 /* copy back header - only size of getbmap */
1391 if (copy_to_user(arg, &bmx, sizeof(struct getbmap))) 1400 if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
1392 return -XFS_ERROR(EFAULT); 1401 return -EFAULT;
1393 return 0; 1402 return 0;
1394} 1403}
1395 1404
1396STATIC int 1405STATIC int
1397xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) 1406xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
1398{ 1407{
1399 struct getbmapx __user *base = *ap; 1408 struct getbmapx __user *base = (struct getbmapx __user *)*ap;
1400 1409
1401 if (copy_to_user(base, bmv, sizeof(struct getbmapx))) 1410 if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
1402 return XFS_ERROR(EFAULT); 1411 return -EFAULT;
1403 1412
1404 *ap += sizeof(struct getbmapx); 1413 *ap += sizeof(struct getbmapx);
1405 return 0; 1414 return 0;
@@ -1414,22 +1423,22 @@ xfs_ioc_getbmapx(
1414 int error; 1423 int error;
1415 1424
1416 if (copy_from_user(&bmx, arg, sizeof(bmx))) 1425 if (copy_from_user(&bmx, arg, sizeof(bmx)))
1417 return -XFS_ERROR(EFAULT); 1426 return -EFAULT;
1418 1427
1419 if (bmx.bmv_count < 2) 1428 if (bmx.bmv_count < 2)
1420 return -XFS_ERROR(EINVAL); 1429 return -EINVAL;
1421 1430
1422 if (bmx.bmv_iflags & (~BMV_IF_VALID)) 1431 if (bmx.bmv_iflags & (~BMV_IF_VALID))
1423 return -XFS_ERROR(EINVAL); 1432 return -EINVAL;
1424 1433
1425 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, 1434 error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
1426 (struct getbmapx *)arg+1); 1435 (__force struct getbmapx *)arg+1);
1427 if (error) 1436 if (error)
1428 return -error; 1437 return error;
1429 1438
1430 /* copy back header */ 1439 /* copy back header */
1431 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx))) 1440 if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
1432 return -XFS_ERROR(EFAULT); 1441 return -EFAULT;
1433 1442
1434 return 0; 1443 return 0;
1435} 1444}
@@ -1445,33 +1454,33 @@ xfs_ioc_swapext(
1445 /* Pull information for the target fd */ 1454 /* Pull information for the target fd */
1446 f = fdget((int)sxp->sx_fdtarget); 1455 f = fdget((int)sxp->sx_fdtarget);
1447 if (!f.file) { 1456 if (!f.file) {
1448 error = XFS_ERROR(EINVAL); 1457 error = -EINVAL;
1449 goto out; 1458 goto out;
1450 } 1459 }
1451 1460
1452 if (!(f.file->f_mode & FMODE_WRITE) || 1461 if (!(f.file->f_mode & FMODE_WRITE) ||
1453 !(f.file->f_mode & FMODE_READ) || 1462 !(f.file->f_mode & FMODE_READ) ||
1454 (f.file->f_flags & O_APPEND)) { 1463 (f.file->f_flags & O_APPEND)) {
1455 error = XFS_ERROR(EBADF); 1464 error = -EBADF;
1456 goto out_put_file; 1465 goto out_put_file;
1457 } 1466 }
1458 1467
1459 tmp = fdget((int)sxp->sx_fdtmp); 1468 tmp = fdget((int)sxp->sx_fdtmp);
1460 if (!tmp.file) { 1469 if (!tmp.file) {
1461 error = XFS_ERROR(EINVAL); 1470 error = -EINVAL;
1462 goto out_put_file; 1471 goto out_put_file;
1463 } 1472 }
 
 	if (!(tmp.file->f_mode & FMODE_WRITE) ||
 	    !(tmp.file->f_mode & FMODE_READ) ||
 	    (tmp.file->f_flags & O_APPEND)) {
-		error = XFS_ERROR(EBADF);
+		error = -EBADF;
 		goto out_put_tmp_file;
 	}
 
 	if (IS_SWAPFILE(file_inode(f.file)) ||
 	    IS_SWAPFILE(file_inode(tmp.file))) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
@@ -1479,17 +1488,17 @@ xfs_ioc_swapext(
 	tip = XFS_I(file_inode(tmp.file));
 
 	if (ip->i_mount != tip->i_mount) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
 	if (ip->i_ino == tip->i_ino) {
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 		goto out_put_tmp_file;
 	}
 
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		error = XFS_ERROR(EIO);
+		error = -EIO;
 		goto out_put_tmp_file;
 	}
 
@@ -1523,7 +1532,7 @@ xfs_file_ioctl(
 	int			error;
 
 	if (filp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
 	trace_xfs_file_ioctl(ip);
 
@@ -1542,7 +1551,7 @@ xfs_file_ioctl(
 		xfs_flock64_t		bf;
 
 		if (copy_from_user(&bf, arg, sizeof(bf)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
 	}
 	case XFS_IOC_DIOINFO: {
@@ -1555,7 +1564,7 @@ xfs_file_ioctl(
 		da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
 
 		if (copy_to_user(arg, &da, sizeof(da)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return 0;
 	}
 
@@ -1588,7 +1597,7 @@ xfs_file_ioctl(
 		struct fsdmidata	dmi;
 
 		if (copy_from_user(&dmi, arg, sizeof(dmi)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
@@ -1597,7 +1606,7 @@ xfs_file_ioctl(
 		error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
 				dmi.fsd_dmstate);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_GETBMAP:
@@ -1613,14 +1622,14 @@ xfs_file_ioctl(
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(hreq)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_find_handle(cmd, &hreq);
 	}
 	case XFS_IOC_OPEN_BY_HANDLE: {
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -1630,7 +1639,7 @@ xfs_file_ioctl(
 		xfs_fsop_handlereq_t	hreq;
 
 		if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE:
@@ -1643,13 +1652,13 @@ xfs_file_ioctl(
 		struct xfs_swapext	sxp;
 
 		if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_ioc_swapext(&sxp);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_FSCOUNTS: {
@@ -1657,10 +1666,10 @@ xfs_file_ioctl(
 
 		error = xfs_fs_counts(mp, &out);
 		if (error)
-			return -error;
+			return error;
 
 		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return 0;
 	}
 
@@ -1672,10 +1681,10 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return -XFS_ERROR(EROFS);
+			return -EROFS;
 
 		if (copy_from_user(&inout, arg, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
@@ -1686,10 +1695,10 @@ xfs_file_ioctl(
 		error = xfs_reserve_blocks(mp, &in, &inout);
 		mnt_drop_write_file(filp);
 		if (error)
-			return -error;
+			return error;
 
 		if (copy_to_user(arg, &inout, sizeof(inout)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return 0;
 	}
 
@@ -1701,10 +1710,10 @@ xfs_file_ioctl(
 
 		error = xfs_reserve_blocks(mp, NULL, &out);
 		if (error)
-			return -error;
+			return error;
 
 		if (copy_to_user(arg, &out, sizeof(out)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		return 0;
 	}
@@ -1713,42 +1722,42 @@ xfs_file_ioctl(
 		xfs_growfs_data_t in;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_data(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_FSGROWFSLOG: {
 		xfs_growfs_log_t in;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_log(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_FSGROWFSRT: {
 		xfs_growfs_rt_t in;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_rt(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 
 	case XFS_IOC_GOINGDOWN: {
@@ -1758,10 +1767,9 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (get_user(in, (__uint32_t __user *)arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
-		error = xfs_fs_goingdown(mp, in);
-		return -error;
+		return xfs_fs_goingdown(mp, in);
 	}
 
 	case XFS_IOC_ERROR_INJECTION: {
@@ -1771,18 +1779,16 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (copy_from_user(&in, arg, sizeof(in)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
-		error = xfs_errortag_add(in.errtag, mp);
-		return -error;
+		return xfs_errortag_add(in.errtag, mp);
 	}
 
 	case XFS_IOC_ERROR_CLEARALL:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
-		error = xfs_errortag_clearall(mp, 1);
-		return -error;
+		return xfs_errortag_clearall(mp, 1);
 
 	case XFS_IOC_FREE_EOFBLOCKS: {
 		struct xfs_fs_eofblocks eofb;
@@ -1792,16 +1798,16 @@ xfs_file_ioctl(
 			return -EPERM;
 
 		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return -XFS_ERROR(EROFS);
+			return -EROFS;
 
 		if (copy_from_user(&eofb, arg, sizeof(eofb)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		error = xfs_fs_eofblocks_from_user(&eofb, &keofb);
 		if (error)
-			return -error;
+			return error;
 
-		return -xfs_icache_free_eofblocks(mp, &keofb);
+		return xfs_icache_free_eofblocks(mp, &keofb);
 	}
 
 	default:
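
Every hunk above is the same conversion: XFS used to keep errno values positive inside the filesystem (wrapped in XFS_ERROR() so a debug kernel could trap them) and negate them once at the VFS boundary with `return -error`; the new code carries negative errnos everywhere, so the boundary negations and forms like `-XFS_ERROR(EFAULT)` disappear. A minimal standalone sketch of the two conventions — xfs_op_old() and xfs_op_new() are made-up helpers for illustration, not kernel functions:

	#include <errno.h>
	#include <stdio.h>

	/* Old convention: internal functions returned positive errnos. */
	static int xfs_op_old(int fail)
	{
		return fail ? EIO : 0;	/* would have been XFS_ERROR(EIO) */
	}

	/* New convention: negative errnos, exactly what the VFS expects. */
	static int xfs_op_new(int fail)
	{
		return fail ? -EIO : 0;
	}

	/* Old ioctl boundary: one explicit negation, easy to miss or double up. */
	static long old_ioctl(int fail)
	{
		int error = xfs_op_old(fail);
		return -error;
	}

	/* New ioctl boundary: errors pass straight through. */
	static long new_ioctl(int fail)
	{
		return xfs_op_new(fail);
	}

	int main(void)
	{
		/* Both conventions hand the caller -EIO on failure. */
		printf("%ld %ld\n", old_ioctl(1), new_ioctl(1));
		return 0;
	}
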
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 944d5baa710a..94ce027e28e3 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -28,7 +28,6 @@
 #include "xfs_sb.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
-#include "xfs_vnode.h"
 #include "xfs_inode.h"
 #include "xfs_itable.h"
 #include "xfs_error.h"
@@ -56,7 +55,7 @@ xfs_compat_flock64_copyin(
 	    get_user(bf->l_sysid,	&arg32->l_sysid) ||
 	    get_user(bf->l_pid,		&arg32->l_pid) ||
 	    copy_from_user(bf->l_pad,	&arg32->l_pad, 4*sizeof(u32)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -70,10 +69,10 @@ xfs_compat_ioc_fsgeometry_v1(
 
 	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
-		return -error;
+		return error;
 	/* The 32-bit variant simply has some padding at the end */
 	if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -84,7 +83,7 @@ xfs_compat_growfs_data_copyin(
 {
 	if (get_user(in->newblocks, &arg32->newblocks) ||
 	    get_user(in->imaxpct,   &arg32->imaxpct))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -95,14 +94,14 @@ xfs_compat_growfs_rt_copyin(
 {
 	if (get_user(in->newblocks, &arg32->newblocks) ||
 	    get_user(in->extsize,   &arg32->extsize))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
 STATIC int
 xfs_inumbers_fmt_compat(
 	void			__user *ubuffer,
-	const xfs_inogrp_t	*buffer,
+	const struct xfs_inogrp	*buffer,
 	long			count,
 	long			*written)
 {
@@ -113,7 +112,7 @@ xfs_inumbers_fmt_compat(
 		if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
 		    put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
 		    put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 	}
 	*written = count * sizeof(*p32);
 	return 0;
@@ -132,7 +131,7 @@ xfs_ioctl32_bstime_copyin(
 
 	if (get_user(sec32,		&bstime32->tv_sec)	||
 	    get_user(bstime->tv_nsec,	&bstime32->tv_nsec))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bstime->tv_sec = sec32;
 	return 0;
 }
@@ -161,10 +160,11 @@ xfs_ioctl32_bstat_copyin(
 	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
 	    get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
 	    get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+	    get_user(bstat->bs_forkoff,	&bstat32->bs_forkoff)	||
 	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
 	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
 	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -180,7 +180,7 @@ xfs_bstime_store_compat(
 	sec32 = p->tv_sec;
 	if (put_user(sec32, &p32->tv_sec) ||
 	    put_user(p->tv_nsec, &p32->tv_nsec))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	return 0;
 }
 
@@ -195,7 +195,7 @@ xfs_bulkstat_one_fmt_compat(
 	compat_xfs_bstat_t	__user *p32 = ubuffer;
 
 	if (ubsize < sizeof(*p32))
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	if (put_user(buffer->bs_ino,	  &p32->bs_ino)		||
 	    put_user(buffer->bs_mode,	  &p32->bs_mode)	||
@@ -215,10 +215,11 @@ xfs_bulkstat_one_fmt_compat(
 	    put_user(buffer->bs_gen,	  &p32->bs_gen)		||
 	    put_user(buffer->bs_projid,	  &p32->bs_projid)	||
 	    put_user(buffer->bs_projid_hi, &p32->bs_projid_hi)	||
+	    put_user(buffer->bs_forkoff,  &p32->bs_forkoff)	||
 	    put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)	||
 	    put_user(buffer->bs_dmstate,  &p32->bs_dmstate)	||
 	    put_user(buffer->bs_aextents, &p32->bs_aextents))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (ubused)
 		*ubused = sizeof(*p32);
 	return 0;
@@ -256,30 +257,30 @@ xfs_compat_ioc_bulkstat(
 	/* should be called again (unused here, but used in dmapi) */
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	if (get_user(addr, &p32->lastip))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bulkreq.lastip = compat_ptr(addr);
 	if (get_user(bulkreq.icount, &p32->icount) ||
 	    get_user(addr, &p32->ubuffer))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bulkreq.ubuffer = compat_ptr(addr);
 	if (get_user(addr, &p32->ocount))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	bulkreq.ocount = compat_ptr(addr);
 
 	if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	if ((count = bulkreq.icount) <= 0)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (bulkreq.ubuffer == NULL)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	if (cmd == XFS_IOC_FSINUMBERS_32) {
 		error = xfs_inumbers(mp, &inlast, &count,
@@ -294,17 +295,17 @@ xfs_compat_ioc_bulkstat(
 			xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
 			bulkreq.ubuffer, &done);
 	} else
-		error = XFS_ERROR(EINVAL);
+		error = -EINVAL;
 	if (error)
-		return -error;
+		return error;
 
 	if (bulkreq.ocount != NULL) {
 		if (copy_to_user(bulkreq.lastip, &inlast,
				sizeof(xfs_ino_t)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 
 		if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 	}
 
 	return 0;
@@ -318,7 +319,7 @@ xfs_compat_handlereq_copyin(
 	compat_xfs_fsop_handlereq_t	hreq32;
 
 	if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	hreq->fd = hreq32.fd;
 	hreq->path = compat_ptr(hreq32.path);
@@ -352,19 +353,19 @@ xfs_compat_attrlist_by_handle(
 	char			*kbuf;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&al_hreq, arg,
			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (al_hreq.buflen < sizeof(struct attrlist) ||
 	    al_hreq.buflen > XATTR_LIST_MAX)
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	/*
 	 * Reject flags, only allow namespaces.
 	 */
 	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-		return -XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
 	if (IS_ERR(dentry))
@@ -376,7 +377,7 @@ xfs_compat_attrlist_by_handle(
 		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+	error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
					al_hreq.flags, cursor);
 	if (error)
 		goto out_kfree;
@@ -404,10 +405,10 @@ xfs_compat_attrmulti_by_handle(
 	unsigned char		*attr_name;
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&am_hreq, arg,
			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	/* overflow check */
 	if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
@@ -417,7 +418,7 @@ xfs_compat_attrmulti_by_handle(
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	error = E2BIG;
+	error = -E2BIG;
 	size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
 	if (!size || size > 16 * PAGE_SIZE)
 		goto out_dput;
@@ -428,7 +429,7 @@ xfs_compat_attrmulti_by_handle(
 		goto out_dput;
 	}
 
-	error = ENOMEM;
+	error = -ENOMEM;
 	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
 	if (!attr_name)
 		goto out_kfree_ops;
@@ -439,7 +440,7 @@ xfs_compat_attrmulti_by_handle(
 				compat_ptr(ops[i].am_attrname),
 				MAXNAMELEN);
 		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-			error = ERANGE;
+			error = -ERANGE;
 		if (ops[i].am_error < 0)
 			break;
 
@@ -470,19 +471,19 @@ xfs_compat_attrmulti_by_handle(
 			mnt_drop_write_file(parfilp);
 			break;
 		default:
-			ops[i].am_error = EINVAL;
+			ops[i].am_error = -EINVAL;
 		}
 	}
 
 	if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
-		error = XFS_ERROR(EFAULT);
+		error = -EFAULT;
 
 	kfree(attr_name);
 out_kfree_ops:
 	kfree(ops);
out_dput:
 	dput(dentry);
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -496,26 +497,26 @@ xfs_compat_fssetdm_by_handle(
 	struct dentry		*dentry;
 
 	if (!capable(CAP_MKNOD))
-		return -XFS_ERROR(EPERM);
+		return -EPERM;
 	if (copy_from_user(&dmhreq, arg,
			   sizeof(compat_xfs_fsop_setdm_handlereq_t)))
-		return -XFS_ERROR(EFAULT);
+		return -EFAULT;
 
 	dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
 	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-		error = -XFS_ERROR(EPERM);
+		error = -EPERM;
 		goto out;
 	}
 
 	if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
-		error = -XFS_ERROR(EFAULT);
+		error = -EFAULT;
 		goto out;
 	}
 
-	error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+	error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
				 fsd.fsd_dmstate);
 
out:
@@ -537,7 +538,7 @@ xfs_file_compat_ioctl(
 	int			error;
 
 	if (filp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
+		ioflags |= XFS_IO_INVIS;
 
 	trace_xfs_file_compat_ioctl(ip);
 
@@ -588,7 +589,7 @@ xfs_file_compat_ioctl(
 		struct xfs_flock64	bf;
 
 		if (xfs_compat_flock64_copyin(&bf, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
 		return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
 	}
@@ -598,25 +599,25 @@ xfs_file_compat_ioctl(
 		struct xfs_growfs_data	in;
 
 		if (xfs_compat_growfs_data_copyin(&in, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_data(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 	case XFS_IOC_FSGROWFSRT_32: {
 		struct xfs_growfs_rt	in;
 
 		if (xfs_compat_growfs_rt_copyin(&in, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_growfs_rt(mp, &in);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 #endif
 	/* long changes size, but xfs only copies out 32 bits */
@@ -633,13 +634,13 @@ xfs_file_compat_ioctl(
 		if (copy_from_user(&sxp, sxu,
				   offsetof(struct xfs_swapext, sx_stat)) ||
 		    xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		error = mnt_want_write_file(filp);
 		if (error)
 			return error;
 		error = xfs_ioc_swapext(&sxp);
 		mnt_drop_write_file(filp);
-		return -error;
+		return error;
 	}
 	case XFS_IOC_FSBULKSTAT_32:
 	case XFS_IOC_FSBULKSTAT_SINGLE_32:
@@ -651,7 +652,7 @@ xfs_file_compat_ioctl(
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
 		return xfs_find_handle(cmd, &hreq);
 	}
@@ -659,14 +660,14 @@ xfs_file_compat_ioctl(
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_open_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_READLINK_BY_HANDLE_32: {
 		struct xfs_fsop_handlereq	hreq;
 
 		if (xfs_compat_handlereq_copyin(&hreq, arg))
-			return -XFS_ERROR(EFAULT);
+			return -EFAULT;
 		return xfs_readlink_by_handle(filp, &hreq);
 	}
 	case XFS_IOC_ATTRLIST_BY_HANDLE_32:
@@ -676,6 +677,6 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_FSSETDM_BY_HANDLE_32:
 		return xfs_compat_fssetdm_by_handle(filp, arg);
 	default:
-		return -XFS_ERROR(ENOIOCTLCMD);
+		return -ENOIOCTLCMD;
 	}
 }
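
One pattern runs through the whole compat path above: nothing from a 32-bit user structure is used directly. Scalar fields are fetched one at a time with get_user(), and pointer fields arrive as 32-bit integers that only become usable addresses after compat_ptr() widens them (see the bulkreq fields earlier in this file). A rough standalone model of that widening — compat_ptr_model() is illustrative, not the kernel helper:

	#include <stdint.h>

	/*
	 * A 32-bit process hands the kernel a 32-bit user address; it is
	 * carried as a plain integer and becomes a pointer only after
	 * explicit zero-extension. Using the raw u32 as a pointer on a
	 * 64-bit kernel would be meaningless.
	 */
	static void *compat_ptr_model(uint32_t uaddr)
	{
		return (void *)(uintptr_t)uaddr;
	}
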
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
index 80f4060e8970..b1bb45444df8 100644
--- a/fs/xfs/xfs_ioctl32.h
+++ b/fs/xfs/xfs_ioctl32.h
@@ -67,8 +67,9 @@ typedef struct compat_xfs_bstat {
 	__u32		bs_gen;		/* generation count		*/
 	__u16		bs_projid_lo;	/* lower part of project id	*/
#define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
+	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
 	__u16		bs_projid_hi;	/* high part of project id	*/
-	unsigned char	bs_pad[12];	/* pad space, unused		*/
+	unsigned char	bs_pad[10];	/* pad space, unused		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
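
The new bs_forkoff field is carved out of the existing padding, so the compat structure keeps its old size and the 32-bit ioctl ABI is unchanged: two bytes move from bs_pad[12] into the new __u16, leaving bs_pad[10]. A compile-time check of that invariant — the structs below model only the affected tail of the layout, not the full compat_xfs_bstat:

	#include <stdint.h>

	/* Tail of the layout before and after the change (illustrative). */
	struct tail_old {
		uint16_t	bs_projid_lo;
		uint16_t	bs_projid_hi;
		unsigned char	bs_pad[12];
	};

	struct tail_new {
		uint16_t	bs_projid_lo;
		uint16_t	bs_forkoff;	/* new field, taken from the pad */
		uint16_t	bs_projid_hi;
		unsigned char	bs_pad[10];
	};

	/* The ABI only stays compatible if the overall size is unchanged. */
	_Static_assert(sizeof(struct tail_old) == sizeof(struct tail_new),
		       "compat_xfs_bstat must not change size");
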
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6d3ec2b6ee29..afcf3c926565 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -110,7 +110,7 @@ xfs_alert_fsblock_zero(
 			(unsigned long long)imap->br_startoff,
 			(unsigned long long)imap->br_blockcount,
 			imap->br_state);
-	return EFSCORRUPTED;
+	return -EFSCORRUPTED;
 }
 
 int
@@ -138,7 +138,7 @@ xfs_iomap_write_direct(
 
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	rt = XFS_IS_REALTIME_INODE(ip);
 	extsz = xfs_get_extsz_hint(ip);
@@ -148,7 +148,7 @@ xfs_iomap_write_direct(
 	if ((offset + count) > XFS_ISIZE(ip)) {
 		error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	} else {
 		if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
 			last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -188,7 +188,7 @@ xfs_iomap_write_direct(
 	 */
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return XFS_ERROR(error);
+		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -225,7 +225,7 @@ xfs_iomap_write_direct(
 	 * Copy any maps to caller's array and return any error.
 	 */
 	if (nimaps == 0) {
-		error = XFS_ERROR(ENOSPC);
+		error = -ENOSPC;
 		goto out_unlock;
 	}
 
@@ -397,15 +397,17 @@ xfs_quota_calc_throttle(
 	struct xfs_inode	*ip,
 	int			type,
 	xfs_fsblock_t		*qblocks,
-	int			*qshift)
+	int			*qshift,
+	int64_t			*qfreesp)
 {
 	int64_t			freesp;
 	int			shift = 0;
 	struct xfs_dquot	*dq = xfs_inode_dquot(ip, type);
 
-	/* over hi wmark, squash the prealloc completely */
-	if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
+	/* no dq, or over hi wmark, squash the prealloc completely */
+	if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) {
 		*qblocks = 0;
+		*qfreesp = 0;
 		return;
 	}
 
@@ -418,6 +420,9 @@ xfs_quota_calc_throttle(
 			shift += 2;
 	}
 
+	if (freesp < *qfreesp)
+		*qfreesp = freesp;
+
 	/* only overwrite the throttle values if we are more aggressive */
 	if ((freesp >> shift) < (*qblocks >> *qshift)) {
 		*qblocks = freesp;
@@ -476,15 +481,18 @@ xfs_iomap_prealloc_size(
 	}
 
 	/*
-	 * Check each quota to cap the prealloc size and provide a shift
-	 * value to throttle with.
+	 * Check each quota to cap the prealloc size, provide a shift value to
+	 * throttle with and adjust amount of available space.
 	 */
 	if (xfs_quota_need_throttle(ip, XFS_DQ_USER, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift);
+		xfs_quota_calc_throttle(ip, XFS_DQ_USER, &qblocks, &qshift,
					&freesp);
 	if (xfs_quota_need_throttle(ip, XFS_DQ_GROUP, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift);
+		xfs_quota_calc_throttle(ip, XFS_DQ_GROUP, &qblocks, &qshift,
					&freesp);
 	if (xfs_quota_need_throttle(ip, XFS_DQ_PROJ, alloc_blocks))
-		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift);
+		xfs_quota_calc_throttle(ip, XFS_DQ_PROJ, &qblocks, &qshift,
					&freesp);
 
 	/*
	 * The final prealloc size is set to the minimum of free space available
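
xfs_quota_calc_throttle() now reports each quota's remaining space back through *qfreesp as well as the shift, so the caller can clamp the speculative preallocation to the tightest of the filesystem's free space and all three quota limits. The combination logic reduces to: grow the shift as a quota's free space falls through its watermarks, then keep the minimum remaining space. A standalone sketch of that min-of-limits calculation — the watermark fractions below are made up for illustration, not the kernel's actual thresholds:

	#include <stdint.h>

	struct throttle {
		int64_t	qblocks;	/* blocks still available under a limit */
		int	qshift;		/* extra right-shift applied to prealloc */
	};

	static void calc_throttle(int64_t freesp, int64_t hi_wmark,
				  struct throttle *t, int64_t *qfreesp)
	{
		int shift = 0;

		if (freesp <= 0) {		/* over the high watermark */
			t->qblocks = 0;		/* squash the prealloc entirely */
			*qfreesp = 0;
			return;
		}
		if (freesp < hi_wmark / 2)	/* tighter space: throttle harder */
			shift = 2;
		if (freesp < hi_wmark / 4)
			shift += 2;

		/* remember the tightest quota's free space for the caller */
		if (freesp < *qfreesp)
			*qfreesp = freesp;

		/* only overwrite if this quota is more aggressive */
		if ((freesp >> shift) < (t->qblocks >> t->qshift)) {
			t->qblocks = freesp;
			t->qshift = shift;
		}
	}
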
@@ -552,7 +560,7 @@ xfs_iomap_write_delay(
 	 */
 	error = xfs_qm_dqattach_locked(ip, 0);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	extsz = xfs_get_extsz_hint(ip);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -596,11 +604,11 @@ retry:
 				imap, &nimaps, XFS_BMAPI_ENTIRE);
 	switch (error) {
 	case 0:
-	case ENOSPC:
-	case EDQUOT:
+	case -ENOSPC:
+	case -EDQUOT:
 		break;
 	default:
-		return XFS_ERROR(error);
+		return error;
 	}
 
 	/*
@@ -614,7 +622,7 @@ retry:
 			error = 0;
 			goto retry;
 		}
-		return XFS_ERROR(error ? error : ENOSPC);
+		return error ? error : -ENOSPC;
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
@@ -663,7 +671,7 @@ xfs_iomap_write_allocate(
 	 */
 	error = xfs_qm_dqattach(ip, 0);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	count_fsb = imap->br_blockcount;
@@ -690,7 +698,7 @@ xfs_iomap_write_allocate(
 						  nres, 0);
 			if (error) {
 				xfs_trans_cancel(tp, 0);
-				return XFS_ERROR(error);
+				return error;
 			}
 			xfs_ilock(ip, XFS_ILOCK_EXCL);
 			xfs_trans_ijoin(tp, ip, 0);
@@ -739,7 +747,7 @@ xfs_iomap_write_allocate(
 			if ((map_start_fsb + count_fsb) > last_block) {
 				count_fsb = last_block - map_start_fsb;
 				if (count_fsb == 0) {
-					error = EAGAIN;
+					error = -EAGAIN;
 					goto trans_cancel;
 				}
 			}
@@ -793,7 +801,7 @@ trans_cancel:
 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
error0:
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return XFS_ERROR(error);
+	return error;
 }
 
 int
@@ -853,7 +861,7 @@ xfs_iomap_write_unwritten(
 					  resblks, 0);
 		if (error) {
 			xfs_trans_cancel(tp, 0);
-			return XFS_ERROR(error);
+			return error;
 		}
 
 		xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -892,7 +900,7 @@ xfs_iomap_write_unwritten(
 		error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 
 		if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
 			return xfs_alert_fsblock_zero(ip, &imap);
@@ -915,5 +923,5 @@ error_on_bmapi_transaction:
 	xfs_bmap_cancel(&free_list);
 	xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT));
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return XFS_ERROR(error);
+	return error;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 205613a06068..ec6dcdc181ee 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -72,7 +72,7 @@ xfs_initxattrs(
 	int			error = 0;
 
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-		error = -xfs_attr_set(ip, xattr->name, xattr->value,
+		error = xfs_attr_set(ip, xattr->name, xattr->value,
				      xattr->value_len, ATTR_SECURE);
 		if (error < 0)
 			break;
@@ -93,7 +93,7 @@ xfs_init_security(
 	struct inode	*dir,
 	const struct qstr *qstr)
{
-	return -security_inode_init_security(inode, dir, qstr,
+	return security_inode_init_security(inode, dir, qstr,
					     &xfs_initxattrs, NULL);
 }
 
@@ -173,12 +173,12 @@ xfs_generic_create(
 
#ifdef CONFIG_XFS_POSIX_ACL
 	if (default_acl) {
-		error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
 		if (error)
 			goto out_cleanup_inode;
 	}
 	if (acl) {
-		error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
 		if (error)
 			goto out_cleanup_inode;
 	}
@@ -194,7 +194,7 @@ xfs_generic_create(
 		posix_acl_release(default_acl);
 	if (acl)
 		posix_acl_release(acl);
-	return -error;
+	return error;
 
 out_cleanup_inode:
 	if (!tmpfile)
@@ -248,8 +248,8 @@ xfs_vn_lookup(
 	xfs_dentry_to_name(&name, dentry, 0);
 	error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
 	if (unlikely(error)) {
-		if (unlikely(error != ENOENT))
-			return ERR_PTR(-error);
+		if (unlikely(error != -ENOENT))
+			return ERR_PTR(error);
 		d_add(dentry, NULL);
 		return NULL;
 	}
@@ -275,8 +275,8 @@ xfs_vn_ci_lookup(
 	xfs_dentry_to_name(&xname, dentry, 0);
 	error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
 	if (unlikely(error)) {
-		if (unlikely(error != ENOENT))
-			return ERR_PTR(-error);
+		if (unlikely(error != -ENOENT))
+			return ERR_PTR(error);
 		/*
 		 * call d_add(dentry, NULL) here when d_drop_negative_children
 		 * is called in xfs_vn_mknod (ie. allow negative dentries
@@ -311,7 +311,7 @@ xfs_vn_link(
 
 	error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
 	if (unlikely(error))
-		return -error;
+		return error;
 
 	ihold(inode);
 	d_instantiate(dentry, inode);
@@ -328,7 +328,7 @@ xfs_vn_unlink(
 
 	xfs_dentry_to_name(&name, dentry, 0);
 
-	error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+	error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
 	if (error)
 		return error;
 
@@ -375,7 +375,7 @@ xfs_vn_symlink(
 	xfs_cleanup_inode(dir, inode, dentry);
 	iput(inode);
 out:
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -392,8 +392,8 @@ xfs_vn_rename(
 	xfs_dentry_to_name(&oname, odentry, 0);
 	xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);
 
-	return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+	return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
			   XFS_I(ndir), &nname, new_inode ?
						XFS_I(new_inode) : NULL);
 }
 
@@ -414,7 +414,7 @@ xfs_vn_follow_link(
 	if (!link)
 		goto out_err;
 
-	error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+	error = xfs_readlink(XFS_I(dentry->d_inode), link);
 	if (unlikely(error))
 		goto out_kfree;
 
@@ -441,7 +441,7 @@ xfs_vn_getattr(
 	trace_xfs_getattr(ip);
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return -XFS_ERROR(EIO);
+		return -EIO;
 
 	stat->size = XFS_ISIZE(ip);
 	stat->dev = inode->i_sb->s_dev;
@@ -546,14 +546,14 @@ xfs_setattr_nonsize(
 	/* If acls are being inherited, we already have this checked */
 	if (!(flags & XFS_ATTR_NOACL)) {
 		if (mp->m_flags & XFS_MOUNT_RDONLY)
-			return XFS_ERROR(EROFS);
+			return -EROFS;
 
 		if (XFS_FORCED_SHUTDOWN(mp))
-			return XFS_ERROR(EIO);
+			return -EIO;
 
-		error = -inode_change_ok(inode, iattr);
+		error = inode_change_ok(inode, iattr);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	}
 
 	ASSERT((mask & ATTR_SIZE) == 0);
@@ -703,7 +703,7 @@ xfs_setattr_nonsize(
 	xfs_qm_dqrele(gdqp);
 
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	/*
 	 * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
@@ -713,9 +713,9 @@ xfs_setattr_nonsize(
 	 * Posix ACL code seems to care about this issue either.
 	 */
 	if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
-		error = -posix_acl_chmod(inode, inode->i_mode);
+		error = posix_acl_chmod(inode, inode->i_mode);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	}
 
 	return 0;
@@ -748,14 +748,14 @@ xfs_setattr_size(
 	trace_xfs_setattr(ip);
 
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return XFS_ERROR(EROFS);
+		return -EROFS;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
-		return XFS_ERROR(EIO);
+		return -EIO;
 
-	error = -inode_change_ok(inode, iattr);
+	error = inode_change_ok(inode, iattr);
 	if (error)
-		return XFS_ERROR(error);
+		return error;
 
 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
 	ASSERT(S_ISREG(ip->i_d.di_mode));
@@ -818,7 +818,7 @@ xfs_setattr_size(
 	 * care about here.
 	 */
 	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
-		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+		error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
						      ip->i_d.di_size, newsize);
 		if (error)
 			return error;
@@ -844,11 +844,41 @@ xfs_setattr_size(
 	 * much we can do about this, except to hope that the caller sees ENOMEM
 	 * and retries the truncate operation.
 	 */
-	error = -block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
+	error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks);
 	if (error)
 		return error;
 	truncate_setsize(inode, newsize);
 
+	/*
+	 * The "we can't serialise against page faults" pain gets worse.
+	 *
+	 * If the file is mapped then we have to clean the page at the old EOF
+	 * when extending the file. Extending the file can expose changes the
+	 * underlying page mapping (e.g. from beyond EOF to a hole or
+	 * unwritten), and so on the next attempt to write to that page we need
+	 * to remap it for write. i.e. we need .page_mkwrite() to be called.
+	 * Hence we need to clean the page to clean the pte and so a new write
+	 * fault will be triggered appropriately.
+	 *
+	 * If we do it before we change the inode size, then we can race with a
+	 * page fault that maps the page with exactly the same problem. If we do
+	 * it after we change the file size, then a new page fault can come in
+	 * and allocate space before we've run the rest of the truncate
+	 * transaction. That's kinda grotesque, but it's better than have data
+	 * over a hole, and so that's the lesser evil that has been chosen here.
+	 *
+	 * The real solution, however, is to have some mechanism for locking out
+	 * page faults while a truncate is in progress.
+	 */
+	if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) {
+		error = filemap_write_and_wait_range(
+				VFS_I(ip)->i_mapping,
+				round_down(oldsize, PAGE_CACHE_SIZE),
+				round_up(oldsize, PAGE_CACHE_SIZE) - 1);
+		if (error)
+			return error;
+	}
+
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
 	if (error)
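
The writeback range in the new hunk brackets exactly the page that straddled the old EOF: oldsize rounded down to the start of its page and up to the last byte of that same page. A quick standalone check of the arithmetic, assuming the generic power-of-two round_down/round_up definitions:

	#include <stdio.h>

	#define PAGE_CACHE_SIZE	4096UL
	#define round_down(x, y)	((x) & ~((y) - 1))	/* y: power of two */
	#define round_up(x, y)		((((x) - 1) | ((y) - 1)) + 1)

	int main(void)
	{
		unsigned long oldsize = 10000;	/* old EOF, mid-page */

		/* prints [8192, 12287]: the one page containing the old EOF */
		printf("flush [%lu, %lu]\n",
		       round_down(oldsize, PAGE_CACHE_SIZE),
		       round_up(oldsize, PAGE_CACHE_SIZE) - 1);
		return 0;
	}
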
@@ -950,7 +980,7 @@ xfs_vn_setattr(
 		error = xfs_setattr_nonsize(ip, iattr, 0);
 	}
 
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -970,7 +1000,7 @@ xfs_vn_update_time(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
-		return -error;
+		return error;
 	}
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
@@ -991,7 +1021,7 @@ xfs_vn_update_time(
 	}
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP);
-	return -xfs_trans_commit(tp, 0);
+	return xfs_trans_commit(tp, 0);
 }
 
#define XFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@ -1036,7 +1066,7 @@ xfs_fiemap_format(
 		*full = 1;	/* user array now full */
 	}
 
-	return -error;
+	return error;
 }
 
 STATIC int
@@ -1055,12 +1085,12 @@ xfs_vn_fiemap(
 		return error;
 
 	/* Set up bmap header for xfs internal routine */
-	bm.bmv_offset = BTOBB(start);
+	bm.bmv_offset = BTOBBT(start);
 	/* Special case for whole file */
 	if (length == FIEMAP_MAX_OFFSET)
 		bm.bmv_length = -1LL;
 	else
-		bm.bmv_length = BTOBB(length);
+		bm.bmv_length = BTOBB(start + length) - bm.bmv_offset;
 
 	/* We add one because in getbmap world count includes the header */
 	bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
@@ -1075,7 +1105,7 @@ xfs_vn_fiemap(
 
 	error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
 	if (error)
-		return -error;
+		return error;
 
 	return 0;
 }
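
The fiemap fix above is about rounding at 512-byte basic-block granularity: BTOBB() rounds a byte count up to whole basic blocks, BTOBBT() truncates. The offset must truncate down while the length must reach the rounded-up end of the byte range, so the new code computes the length as BTOBB(start + length) minus the truncated offset instead of rounding the two values independently. A worked check of why the old expression could skip the leading bytes — the macros are reimplemented locally for illustration:

	#include <stdio.h>

	#define BBSHIFT	9	/* 512-byte basic blocks */
	#define BTOBB(bytes)	(((bytes) + (1UL << BBSHIFT) - 1) >> BBSHIFT)
	#define BTOBBT(bytes)	((bytes) >> BBSHIFT)

	int main(void)
	{
		unsigned long start = 100, length = 1000;	/* bytes [100, 1100) */

		/* old: offset rounded up, length rounded independently */
		unsigned long old_off = BTOBB(start);	/* 1: bytes 100..511 skipped */
		unsigned long old_len = BTOBB(length);	/* 2 blocks */

		/* new: offset truncates, length measured from that offset */
		unsigned long new_off = BTOBBT(start);			/* 0 */
		unsigned long new_len = BTOBB(start + length) - new_off; /* 3 blocks */

		printf("old [%lu,+%lu) new [%lu,+%lu)\n",
		       old_off, old_len, new_off, new_len);
		return 0;
	}
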
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cb64f222d607..f1deb961a296 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -67,19 +67,17 @@ xfs_bulkstat_one_int(
 	*stat = BULKSTAT_RV_NOTHING;
 
 	if (!buffer || xfs_internal_inum(mp, ino))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 
 	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
 	if (!buf)
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 
 	error = xfs_iget(mp, NULL, ino,
			 (XFS_IGET_DONTCACHE | XFS_IGET_UNTRUSTED),
			 XFS_ILOCK_SHARED, &ip);
-	if (error) {
-		*stat = BULKSTAT_RV_NOTHING;
+	if (error)
 		goto out_free;
-	}
 
 	ASSERT(ip != NULL);
 	ASSERT(ip->i_imap.im_blkno != 0);
@@ -136,7 +134,6 @@ xfs_bulkstat_one_int(
 	IRELE(ip);
 
 	error = formatter(buffer, ubsize, ubused, buf);
-
 	if (!error)
 		*stat = BULKSTAT_RV_DIDONE;
 
@@ -154,9 +151,9 @@ xfs_bulkstat_one_fmt(
 	const xfs_bstat_t	*buffer)
{
 	if (ubsize < sizeof(*buffer))
-		return XFS_ERROR(ENOMEM);
+		return -ENOMEM;
 	if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
-		return XFS_ERROR(EFAULT);
+		return -EFAULT;
 	if (ubused)
 		*ubused = sizeof(*buffer);
 	return 0;
@@ -175,9 +172,170 @@ xfs_bulkstat_one(
 			    xfs_bulkstat_one_fmt, ubused, stat);
 }
 
+/*
+ * Loop over all clusters in a chunk for a given incore inode allocation btree
+ * record. Do a readahead if there are any allocated inodes in that cluster.
+ */
+STATIC void
+xfs_bulkstat_ichunk_ra(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irec)
+{
+	xfs_agblock_t			agbno;
+	struct blk_plug			plug;
+	int				blks_per_cluster;
+	int				inodes_per_cluster;
+	int				i;	/* inode chunk index */
+
+	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);
+	blks_per_cluster = xfs_icluster_size_fsb(mp);
+	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
+
+	blk_start_plug(&plug);
+	for (i = 0; i < XFS_INODES_PER_CHUNK;
+	     i += inodes_per_cluster, agbno += blks_per_cluster) {
+		if (xfs_inobt_maskn(i, inodes_per_cluster) & ~irec->ir_free) {
+			xfs_btree_reada_bufs(mp, agno, agbno, blks_per_cluster,
+					     &xfs_inode_buf_ops);
+		}
+	}
+	blk_finish_plug(&plug);
+}
+
+/*
+ * Lookup the inode chunk that the given inode lives in and then get the record
+ * if we found the chunk. If the inode was not the last in the chunk and there
+ * are some left allocated, update the data for the pointed-to record as well as
+ * return the count of grabbed inodes.
+ */
+STATIC int
+xfs_bulkstat_grab_ichunk(
+	struct xfs_btree_cur		*cur,	/* btree cursor */
+	xfs_agino_t			agino,	/* starting inode of chunk */
+	int				*icount,/* return # of inodes grabbed */
+	struct xfs_inobt_rec_incore	*irec)	/* btree record */
+{
+	int				idx;	/* index into inode chunk */
+	int				stat;
+	int				error = 0;
+
+	/* Lookup the inode chunk that this inode lives in */
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &stat);
+	if (error)
+		return error;
+	if (!stat) {
+		*icount = 0;
+		return error;
+	}
+
+	/* Get the record, should always work */
+	error = xfs_inobt_get_rec(cur, irec, &stat);
+	if (error)
+		return error;
+	XFS_WANT_CORRUPTED_RETURN(stat == 1);
+
+	/* Check if the record contains the inode in request */
+	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
+		return -EINVAL;
+
+	idx = agino - irec->ir_startino + 1;
+	if (idx < XFS_INODES_PER_CHUNK &&
+	    (xfs_inobt_maskn(idx, XFS_INODES_PER_CHUNK - idx) & ~irec->ir_free)) {
+		int	i;
+
+		/* We got a right chunk with some left inodes allocated at it.
+		 * Grab the chunk record. Mark all the uninteresting inodes
+		 * free -- because they're before our start point.
+		 */
+		for (i = 0; i < idx; i++) {
+			if (XFS_INOBT_MASK(i) & ~irec->ir_free)
+				irec->ir_freecount++;
+		}
+
+		irec->ir_free |= xfs_inobt_maskn(0, idx);
+		*icount = XFS_INODES_PER_CHUNK - irec->ir_freecount;
+	}
+
+	return 0;
+}
+
#define XFS_BULKSTAT_UBLEFT(ubleft)	((ubleft) >= statstruct_size)
 
 /*
+ * Process inodes in chunk with a pointer to a formatter function
+ * that will iget the inode and fill in the appropriate structure.
+ */
+int
+xfs_bulkstat_ag_ichunk(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno,
+	struct xfs_inobt_rec_incore	*irbp,
+	bulkstat_one_pf			formatter,
+	size_t				statstruct_size,
+	struct xfs_bulkstat_agichunk	*acp)
+{
+	xfs_ino_t			lastino = acp->ac_lastino;
+	char				__user **ubufp = acp->ac_ubuffer;
+	int				ubleft = acp->ac_ubleft;
+	int				ubelem = acp->ac_ubelem;
+	int				chunkidx, clustidx;
+	int				error = 0;
+	xfs_agino_t			agino;
+
+	for (agino = irbp->ir_startino, chunkidx = clustidx = 0;
+	     XFS_BULKSTAT_UBLEFT(ubleft) &&
+	     irbp->ir_freecount < XFS_INODES_PER_CHUNK;
+	     chunkidx++, clustidx++, agino++) {
+		int		fmterror;	/* bulkstat formatter result */
+		int		ubused;
+		xfs_ino_t	ino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+		ASSERT(chunkidx < XFS_INODES_PER_CHUNK);
+
+		/* Skip if this inode is free */
+		if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) {
+			lastino = ino;
+			continue;
+		}
+
+		/*
+		 * Count used inodes as free so we can tell when the
+		 * chunk is used up.
+		 */
+		irbp->ir_freecount++;
+
+		/* Get the inode and fill in a single buffer */
+		ubused = statstruct_size;
+		error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror);
+		if (fmterror == BULKSTAT_RV_NOTHING) {
+			if (error && error != -ENOENT && error != -EINVAL) {
+				ubleft = 0;
+				break;
+			}
+			lastino = ino;
+			continue;
+		}
+		if (fmterror == BULKSTAT_RV_GIVEUP) {
+			ubleft = 0;
+			ASSERT(error);
+			break;
+		}
+		if (*ubufp)
+			*ubufp += ubused;
+		ubleft -= ubused;
+		ubelem++;
+		lastino = ino;
+	}
+
+	acp->ac_lastino = lastino;
+	acp->ac_ubleft = ubleft;
+	acp->ac_ubelem = ubelem;
+
+	return error;
+}
+
+/*
  * Return stat information in bulk (by-inode) for the filesystem.
  */
 int					/* error status */
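
All three new helpers lean on the same bitmask idiom: an inode chunk is 64 inodes, ir_free has a bit set for each free slot, and xfs_inobt_maskn(i, n) builds a mask of n bits starting at bit i, so `maskn(i, n) & ~ir_free` is non-zero exactly when some inode in that sub-range is allocated — the condition both the readahead and the partial-chunk grab test. A self-contained model of that test, with the mask helper reimplemented here rather than taken from the kernel:

	#include <stdint.h>
	#include <stdio.h>

	#define INODES_PER_CHUNK 64

	/* n bits set starting at bit i; n == 64 would overflow the shift,
	 * so handle it separately. */
	static uint64_t maskn(int i, int n)
	{
		uint64_t m = (n >= 64) ? ~0ULL : ((1ULL << n) - 1);
		return m << i;
	}

	int main(void)
	{
		/* bit set = inode free; here inodes 0..31 free, 32..63 allocated */
		uint64_t ir_free = 0x00000000FFFFFFFFULL;
		int inodes_per_cluster = 16;

		for (int i = 0; i < INODES_PER_CHUNK; i += inodes_per_cluster) {
			/* any allocated inode in the cluster => read it ahead */
			if (maskn(i, inodes_per_cluster) & ~ir_free)
				printf("cluster at inode %d: readahead\n", i);
		}
		return 0;
	}
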
@@ -190,13 +348,10 @@ xfs_bulkstat(
 	char			__user *ubuffer, /* buffer with inode stats */
 	int			*done)	/* 1 if there are more stats to get */
{
-	xfs_agblock_t		agbno=0;/* allocation group block number */
 	xfs_buf_t		*agbp;	/* agi header buffer */
 	xfs_agi_t		*agi;	/* agi header data */
 	xfs_agino_t		agino;	/* inode # in allocation group */
 	xfs_agnumber_t		agno;	/* allocation group number */
-	int			chunkidx; /* current index into inode chunk */
-	int			clustidx; /* current index into inode cluster */
 	xfs_btree_cur_t		*cur;	/* btree cursor for ialloc btree */
 	int			end_of_ag; /* set if we've seen the ag end */
 	int			error;	/* error code */
@@ -209,8 +364,6 @@ xfs_bulkstat(
 	xfs_inobt_rec_incore_t	*irbuf;	/* start of irec buffer */
 	xfs_inobt_rec_incore_t	*irbufend; /* end of good irec buffer entries */
 	xfs_ino_t		lastino; /* last inode number returned */
-	int			blks_per_cluster; /* # of blocks per cluster */
-	int			inodes_per_cluster;/* # of inodes per cluster */
 	int			nirbuf;	/* size of irbuf */
 	int			rval;	/* return value error code */
 	int			tmp;	/* result value from btree calls */
@@ -218,7 +371,6 @@ xfs_bulkstat(
 	int			ubleft;	/* bytes left in user's buffer */
 	char			__user *ubufp;	/* pointer into user's buffer */
 	int			ubelem;	/* spaces used in user's buffer */
-	int			ubused;	/* bytes used by formatter */
 
 	/*
 	 * Get the last inode value, see if there's nothing to do.
@@ -233,20 +385,16 @@ xfs_bulkstat(
 		*ubcountp = 0;
 		return 0;
 	}
-	if (!ubcountp || *ubcountp <= 0) {
-		return EINVAL;
-	}
+
 	ubcount = *ubcountp; /* statstruct's */
 	ubleft = ubcount * statstruct_size; /* bytes */
 	*ubcountp = ubelem = 0;
 	*done = 0;
 	fmterror = 0;
 	ubufp = ubuffer;
-	blks_per_cluster = xfs_icluster_size_fsb(mp);
-	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
 	irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
 	if (!irbuf)
-		return ENOMEM;
+		return -ENOMEM;
 
 	nirbuf = irbsize / sizeof(*irbuf);
 
@@ -258,14 +406,8 @@ xfs_bulkstat(
 	while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) {
 		cond_resched();
 		error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
-		if (error) {
-			/*
-			 * Skip this allocation group and go to the next one.
-			 */
-			agno++;
-			agino = 0;
-			continue;
-		}
+		if (error)
+			break;
 		agi = XFS_BUF_TO_AGI(agbp);
 		/*
 		 * Allocate and initialize a btree cursor for ialloc btree.
@@ -275,96 +417,39 @@ xfs_bulkstat(
 		irbp = irbuf;
 		irbufend = irbuf + nirbuf;
 		end_of_ag = 0;
278 /* 420 icount = 0;
279 * If we're returning in the middle of an allocation group,
280 * we need to get the remainder of the chunk we're in.
281 */
282 if (agino > 0) { 421 if (agino > 0) {
283 xfs_inobt_rec_incore_t r;
284
285 /* 422 /*
286 * Lookup the inode chunk that this inode lives in. 423 * In the middle of an allocation group, we need to get
424 * the remainder of the chunk we're in.
287 */ 425 */
288 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, 426 struct xfs_inobt_rec_incore r;
289 &tmp); 427
290 if (!error && /* no I/O error */ 428 error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
291 tmp && /* lookup succeeded */ 429 if (error)
292 /* got the record, should always work */ 430 break;
293 !(error = xfs_inobt_get_rec(cur, &r, &i)) && 431 if (icount) {
294 i == 1 &&
295 /* this is the right chunk */
296 agino < r.ir_startino + XFS_INODES_PER_CHUNK &&
297 /* lastino was not last in chunk */
298 (chunkidx = agino - r.ir_startino + 1) <
299 XFS_INODES_PER_CHUNK &&
300 /* there are some left allocated */
301 xfs_inobt_maskn(chunkidx,
302 XFS_INODES_PER_CHUNK - chunkidx) &
303 ~r.ir_free) {
304 /*
305 * Grab the chunk record. Mark all the
306 * uninteresting inodes (because they're
307 * before our start point) free.
308 */
309 for (i = 0; i < chunkidx; i++) {
310 if (XFS_INOBT_MASK(i) & ~r.ir_free)
311 r.ir_freecount++;
312 }
313 r.ir_free |= xfs_inobt_maskn(0, chunkidx);
314 irbp->ir_startino = r.ir_startino; 432 irbp->ir_startino = r.ir_startino;
315 irbp->ir_freecount = r.ir_freecount; 433 irbp->ir_freecount = r.ir_freecount;
316 irbp->ir_free = r.ir_free; 434 irbp->ir_free = r.ir_free;
317 irbp++; 435 irbp++;
318 agino = r.ir_startino + XFS_INODES_PER_CHUNK; 436 agino = r.ir_startino + XFS_INODES_PER_CHUNK;
319 icount = XFS_INODES_PER_CHUNK - r.ir_freecount;
320 } else {
321 /*
322 * If any of those tests failed, bump the
323 * inode number (just in case).
324 */
325 agino++;
326 icount = 0;
327 } 437 }
328 /* 438 /* Increment to the next record */
329 * In any case, increment to the next record. 439 error = xfs_btree_increment(cur, 0, &tmp);
330 */
331 if (!error)
332 error = xfs_btree_increment(cur, 0, &tmp);
333 } else { 440 } else {
334 /* 441 /* Start of ag. Lookup the first inode chunk */
335 * Start of ag. Lookup the first inode chunk.
336 */
337 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); 442 error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp);
338 icount = 0;
339 } 443 }
444 if (error)
445 break;
446
340 /* 447 /*
341 * Loop through inode btree records in this ag, 448 * Loop through inode btree records in this ag,
342 * until we run out of inodes or space in the buffer. 449 * until we run out of inodes or space in the buffer.
343 */ 450 */
344 while (irbp < irbufend && icount < ubcount) { 451 while (irbp < irbufend && icount < ubcount) {
345 xfs_inobt_rec_incore_t r; 452 struct xfs_inobt_rec_incore r;
346
347 /*
348 * Loop as long as we're unable to read the
349 * inode btree.
350 */
351 while (error) {
352 agino += XFS_INODES_PER_CHUNK;
353 if (XFS_AGINO_TO_AGBNO(mp, agino) >=
354 be32_to_cpu(agi->agi_length))
355 break;
356 error = xfs_inobt_lookup(cur, agino,
357 XFS_LOOKUP_GE, &tmp);
358 cond_resched();
359 }
360 /*
361 * If ran off the end of the ag either with an error,
362 * or the normal way, set end and stop collecting.
363 */
364 if (error) {
365 end_of_ag = 1;
366 break;
367 }
368 453
369 error = xfs_inobt_get_rec(cur, &r, &i); 454 error = xfs_inobt_get_rec(cur, &r, &i);
370 if (error || i == 0) { 455 if (error || i == 0) {
@@ -377,25 +462,7 @@ xfs_bulkstat(
377 * Also start read-ahead now for this chunk. 462 * Also start read-ahead now for this chunk.
378 */ 463 */
379 if (r.ir_freecount < XFS_INODES_PER_CHUNK) { 464 if (r.ir_freecount < XFS_INODES_PER_CHUNK) {
380 struct blk_plug plug; 465 xfs_bulkstat_ichunk_ra(mp, agno, &r);
381 /*
382 * Loop over all clusters in the next chunk.
383 * Do a readahead if there are any allocated
384 * inodes in that cluster.
385 */
386 blk_start_plug(&plug);
387 agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino);
388 for (chunkidx = 0;
389 chunkidx < XFS_INODES_PER_CHUNK;
390 chunkidx += inodes_per_cluster,
391 agbno += blks_per_cluster) {
392 if (xfs_inobt_maskn(chunkidx,
393 inodes_per_cluster) & ~r.ir_free)
394 xfs_btree_reada_bufs(mp, agno,
395 agbno, blks_per_cluster,
396 &xfs_inode_buf_ops);
397 }
398 blk_finish_plug(&plug);
399 irbp->ir_startino = r.ir_startino; 466 irbp->ir_startino = r.ir_startino;
400 irbp->ir_freecount = r.ir_freecount; 467 irbp->ir_freecount = r.ir_freecount;
401 irbp->ir_free = r.ir_free; 468 irbp->ir_free = r.ir_free;
@@ -422,57 +489,20 @@ xfs_bulkstat(
422 irbufend = irbp; 489 irbufend = irbp;
423 for (irbp = irbuf; 490 for (irbp = irbuf;
424 irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { 491 irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) {
425 /* 492 struct xfs_bulkstat_agichunk ac;
426 * Now process this chunk of inodes. 493
427 */ 494 ac.ac_lastino = lastino;
428 for (agino = irbp->ir_startino, chunkidx = clustidx = 0; 495 ac.ac_ubuffer = &ubuffer;
429 XFS_BULKSTAT_UBLEFT(ubleft) && 496 ac.ac_ubleft = ubleft;
430 irbp->ir_freecount < XFS_INODES_PER_CHUNK; 497 ac.ac_ubelem = ubelem;
431 chunkidx++, clustidx++, agino++) { 498 error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
432 ASSERT(chunkidx < XFS_INODES_PER_CHUNK); 499 formatter, statstruct_size, &ac);
433 500 if (error)
434 ino = XFS_AGINO_TO_INO(mp, agno, agino); 501 rval = error;
435 /* 502
436 * Skip if this inode is free. 503 lastino = ac.ac_lastino;
437 */ 504 ubleft = ac.ac_ubleft;
438 if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { 505 ubelem = ac.ac_ubelem;
439 lastino = ino;
440 continue;
441 }
442 /*
443 * Count used inodes as free so we can tell
444 * when the chunk is used up.
445 */
446 irbp->ir_freecount++;
447
448 /*
449 * Get the inode and fill in a single buffer.
450 */
451 ubused = statstruct_size;
452 error = formatter(mp, ino, ubufp, ubleft,
453 &ubused, &fmterror);
454 if (fmterror == BULKSTAT_RV_NOTHING) {
455 if (error && error != ENOENT &&
456 error != EINVAL) {
457 ubleft = 0;
458 rval = error;
459 break;
460 }
461 lastino = ino;
462 continue;
463 }
464 if (fmterror == BULKSTAT_RV_GIVEUP) {
465 ubleft = 0;
466 ASSERT(error);
467 rval = error;
468 break;
469 }
470 if (ubufp)
471 ubufp += ubused;
472 ubleft -= ubused;
473 ubelem++;
474 lastino = ino;
475 }
476 506
477 cond_resched(); 507 cond_resched();
478 } 508 }
@@ -512,58 +542,10 @@ xfs_bulkstat(
512 return rval; 542 return rval;
513} 543}
514 544
515/*
516 * Return stat information in bulk (by-inode) for the filesystem.
517 * Special case for non-sequential one inode bulkstat.
518 */
519int /* error status */
520xfs_bulkstat_single(
521 xfs_mount_t *mp, /* mount point for filesystem */
522 xfs_ino_t *lastinop, /* inode to return */
523 char __user *buffer, /* buffer with inode stats */
524 int *done) /* 1 if there are more stats to get */
525{
526 int count; /* count value for bulkstat call */
527 int error; /* return value */
528 xfs_ino_t ino; /* filesystem inode number */
529 int res; /* result from bs1 */
530
531 /*
532 * note that requesting valid inode numbers which are not allocated
533 * to inodes will most likely cause xfs_imap_to_bp to generate warning
534 * messages about bad magic numbers. This is ok. The fact that
535 * the inode isn't actually an inode is handled by the
536 * error check below. Done this way to make the usual case faster
537 * at the expense of the error case.
538 */
539
540 ino = *lastinop;
541 error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
542 NULL, &res);
543 if (error) {
544 /*
545 * Special case way failed, do it the "long" way
546 * to see if that works.
547 */
548 (*lastinop)--;
549 count = 1;
550 if (xfs_bulkstat(mp, lastinop, &count, xfs_bulkstat_one,
551 sizeof(xfs_bstat_t), buffer, done))
552 return error;
553 if (count == 0 || (xfs_ino_t)*lastinop != ino)
554 return error == EFSCORRUPTED ?
555 XFS_ERROR(EINVAL) : error;
556 else
557 return 0;
558 }
559 *done = 0;
560 return 0;
561}
562
563int 545int
564xfs_inumbers_fmt( 546xfs_inumbers_fmt(
565 void __user *ubuffer, /* buffer to write to */ 547 void __user *ubuffer, /* buffer to write to */
566 const xfs_inogrp_t *buffer, /* buffer to read from */ 548 const struct xfs_inogrp *buffer, /* buffer to read from */
567 long count, /* # of elements to read */ 549 long count, /* # of elements to read */
568 long *written) /* # of bytes written */ 550 long *written) /* # of bytes written */
569{ 551{
@@ -578,127 +560,105 @@ xfs_inumbers_fmt(
578 */ 560 */
579int /* error status */ 561int /* error status */
580xfs_inumbers( 562xfs_inumbers(
581 xfs_mount_t *mp, /* mount point for filesystem */ 563 struct xfs_mount *mp,/* mount point for filesystem */
582 xfs_ino_t *lastino, /* last inode returned */ 564 xfs_ino_t *lastino,/* last inode returned */
583 int *count, /* size of buffer/count returned */ 565 int *count,/* size of buffer/count returned */
584 void __user *ubuffer,/* buffer with inode descriptions */ 566 void __user *ubuffer,/* buffer with inode descriptions */
585 inumbers_fmt_pf formatter) 567 inumbers_fmt_pf formatter)
586{ 568{
587 xfs_buf_t *agbp; 569 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, *lastino);
588 xfs_agino_t agino; 570 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, *lastino);
589 xfs_agnumber_t agno; 571 struct xfs_btree_cur *cur = NULL;
590 int bcount; 572 struct xfs_buf *agbp = NULL;
591 xfs_inogrp_t *buffer; 573 struct xfs_inogrp *buffer;
592 int bufidx; 574 int bcount;
593 xfs_btree_cur_t *cur; 575 int left = *count;
594 int error; 576 int bufidx = 0;
595 xfs_inobt_rec_incore_t r; 577 int error = 0;
596 int i; 578
597 xfs_ino_t ino;
598 int left;
599 int tmp;
600
601 ino = (xfs_ino_t)*lastino;
602 agno = XFS_INO_TO_AGNO(mp, ino);
603 agino = XFS_INO_TO_AGINO(mp, ino);
604 left = *count;
605 *count = 0; 579 *count = 0;
580 if (agno >= mp->m_sb.sb_agcount ||
581 *lastino != XFS_AGINO_TO_INO(mp, agno, agino))
582 return error;
583
606 bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer))); 584 bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
607 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP); 585 buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
608 error = bufidx = 0; 586 do {
609 cur = NULL; 587 struct xfs_inobt_rec_incore r;
610 agbp = NULL; 588 int stat;
611 while (left > 0 && agno < mp->m_sb.sb_agcount) { 589
612 if (agbp == NULL) { 590 if (!agbp) {
613 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); 591 error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
614 if (error) { 592 if (error)
615 /* 593 break;
616 * If we can't read the AGI of this ag, 594
617 * then just skip to the next one.
618 */
619 ASSERT(cur == NULL);
620 agbp = NULL;
621 agno++;
622 agino = 0;
623 continue;
624 }
625 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, 595 cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
626 XFS_BTNUM_INO); 596 XFS_BTNUM_INO);
627 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, 597 error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE,
628 &tmp); 598 &stat);
629 if (error) { 599 if (error)
630 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 600 break;
631 cur = NULL; 601 if (!stat)
632 xfs_buf_relse(agbp); 602 goto next_ag;
633 agbp = NULL;
634 /*
635 * Move up the last inode in the current
636 * chunk. The lookup_ge will always get
637 * us the first inode in the next chunk.
638 */
639 agino += XFS_INODES_PER_CHUNK - 1;
640 continue;
641 }
642 }
643 error = xfs_inobt_get_rec(cur, &r, &i);
644 if (error || i == 0) {
645 xfs_buf_relse(agbp);
646 agbp = NULL;
647 xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
648 cur = NULL;
649 agno++;
650 agino = 0;
651 continue;
652 } 603 }
604
605 error = xfs_inobt_get_rec(cur, &r, &stat);
606 if (error)
607 break;
608 if (!stat)
609 goto next_ag;
610
653 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; 611 agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1;
654 buffer[bufidx].xi_startino = 612 buffer[bufidx].xi_startino =
655 XFS_AGINO_TO_INO(mp, agno, r.ir_startino); 613 XFS_AGINO_TO_INO(mp, agno, r.ir_startino);
656 buffer[bufidx].xi_alloccount = 614 buffer[bufidx].xi_alloccount =
657 XFS_INODES_PER_CHUNK - r.ir_freecount; 615 XFS_INODES_PER_CHUNK - r.ir_freecount;
658 buffer[bufidx].xi_allocmask = ~r.ir_free; 616 buffer[bufidx].xi_allocmask = ~r.ir_free;
659 bufidx++; 617 if (++bufidx == bcount) {
660 left--; 618 long written;
661 if (bufidx == bcount) { 619
662 long written; 620 error = formatter(ubuffer, buffer, bufidx, &written);
663 if (formatter(ubuffer, buffer, bufidx, &written)) { 621 if (error)
664 error = XFS_ERROR(EFAULT);
665 break; 622 break;
666 }
667 ubuffer += written; 623 ubuffer += written;
668 *count += bufidx; 624 *count += bufidx;
669 bufidx = 0; 625 bufidx = 0;
670 } 626 }
671 if (left) { 627 if (!--left)
672 error = xfs_btree_increment(cur, 0, &tmp); 628 break;
673 if (error) { 629
674 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); 630 error = xfs_btree_increment(cur, 0, &stat);
675 cur = NULL; 631 if (error)
676 xfs_buf_relse(agbp); 632 break;
677 agbp = NULL; 633 if (stat)
678 /* 634 continue;
679 * The agino value has already been bumped. 635
680 * Just try to skip up to it. 636next_ag:
681 */ 637 xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
682 agino += XFS_INODES_PER_CHUNK; 638 cur = NULL;
683 continue; 639 xfs_buf_relse(agbp);
684 } 640 agbp = NULL;
685 } 641 agino = 0;
686 } 642 agno++;
643 } while (agno < mp->m_sb.sb_agcount);
644
687 if (!error) { 645 if (!error) {
688 if (bufidx) { 646 if (bufidx) {
689 long written; 647 long written;
690 if (formatter(ubuffer, buffer, bufidx, &written)) 648
691 error = XFS_ERROR(EFAULT); 649 error = formatter(ubuffer, buffer, bufidx, &written);
692 else 650 if (!error)
693 *count += bufidx; 651 *count += bufidx;
694 } 652 }
695 *lastino = XFS_AGINO_TO_INO(mp, agno, agino); 653 *lastino = XFS_AGINO_TO_INO(mp, agno, agino);
696 } 654 }
655
697 kmem_free(buffer); 656 kmem_free(buffer);
698 if (cur) 657 if (cur)
699 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR : 658 xfs_btree_del_cursor(cur, (error ? XFS_BTREE_ERROR :
700 XFS_BTREE_NOERROR)); 659 XFS_BTREE_NOERROR));
701 if (agbp) 660 if (agbp)
702 xfs_buf_relse(agbp); 661 xfs_buf_relse(agbp);
662
703 return error; 663 return error;
704} 664}
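The early bail-out at the top of the rewritten xfs_inumbers() rejects resume cookies that do not round-trip through the agno/agino split. A self-contained sketch of that check (the 24-bit split and the cookie value are illustrative only; XFS derives the real split from superblock geometry):

#include <stdint.h>
#include <stdio.h>

#define AGINO_BITS 24	/* toy split; not the real XFS layout */

static uint64_t agino_to_ino(uint32_t agno, uint32_t agino)
{
	return ((uint64_t)agno << AGINO_BITS) | agino;
}

int main(void)
{
	uint64_t lastino = 0xdead000000000123ULL;	/* garbage cookie */
	uint32_t agcount = 4;
	uint32_t agno = (uint32_t)(lastino >> AGINO_BITS);
	uint32_t agino = lastino & ((1u << AGINO_BITS) - 1);

	/* the rewritten xfs_inumbers() bails early on a bogus cookie */
	if (agno >= agcount || lastino != agino_to_ino(agno, agino))
		printf("invalid resume cookie: no records returned\n");
	else
		printf("resume at agno=%u agino=%u\n", agno, agino);
	return 0;
}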
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index 97295d91d170..aaed08022eb9 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -30,6 +30,22 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
30 int *ubused, 30 int *ubused,
31 int *stat); 31 int *stat);
32 32
33struct xfs_bulkstat_agichunk {
34 xfs_ino_t ac_lastino; /* last inode returned */
35 char __user **ac_ubuffer;/* pointer into user's buffer */
36 int ac_ubleft; /* bytes left in user's buffer */
37 int ac_ubelem; /* spaces used in user's buffer */
38};
39
40int
41xfs_bulkstat_ag_ichunk(
42 struct xfs_mount *mp,
43 xfs_agnumber_t agno,
44 struct xfs_inobt_rec_incore *irbp,
45 bulkstat_one_pf formatter,
46 size_t statstruct_size,
47 struct xfs_bulkstat_agichunk *acp);
48
33/* 49/*
34 * Values for stat return value. 50 * Values for stat return value.
35 */ 51 */
@@ -50,13 +66,6 @@ xfs_bulkstat(
50 char __user *ubuffer,/* buffer with inode stats */ 66 char __user *ubuffer,/* buffer with inode stats */
51 int *done); /* 1 if there are more stats to get */ 67 int *done); /* 1 if there are more stats to get */
52 68
53int
54xfs_bulkstat_single(
55 xfs_mount_t *mp,
56 xfs_ino_t *lastinop,
57 char __user *buffer,
58 int *done);
59
60typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */ 69typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
61 void __user *ubuffer, /* buffer to write to */ 70 void __user *ubuffer, /* buffer to write to */
62 int ubsize, /* remaining user buffer sz */ 71 int ubsize, /* remaining user buffer sz */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 825249d2dfc1..6a51619d8690 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,18 +21,6 @@
21#include <linux/types.h> 21#include <linux/types.h>
22 22
23/* 23/*
24 * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
25 * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
26 */
27#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
28# define XFS_BIG_BLKNOS 1
29# define XFS_BIG_INUMS 1
30#else
31# define XFS_BIG_BLKNOS 0
32# define XFS_BIG_INUMS 0
33#endif
34
35/*
36 * Kernel specific type declarations for XFS 24 * Kernel specific type declarations for XFS
37 */ 25 */
38typedef signed char __int8_t; 26typedef signed char __int8_t;
@@ -68,7 +56,6 @@ typedef __uint64_t __psunsigned_t;
68 56
69#include "kmem.h" 57#include "kmem.h"
70#include "mrlock.h" 58#include "mrlock.h"
71#include "time.h"
72#include "uuid.h" 59#include "uuid.h"
73 60
74#include <linux/semaphore.h> 61#include <linux/semaphore.h>
@@ -113,7 +100,7 @@ typedef __uint64_t __psunsigned_t;
113#include <asm/byteorder.h> 100#include <asm/byteorder.h>
114#include <asm/unaligned.h> 101#include <asm/unaligned.h>
115 102
116#include "xfs_vnode.h" 103#include "xfs_fs.h"
117#include "xfs_stats.h" 104#include "xfs_stats.h"
118#include "xfs_sysctl.h" 105#include "xfs_sysctl.h"
119#include "xfs_iops.h" 106#include "xfs_iops.h"
@@ -191,6 +178,22 @@ typedef __uint64_t __psunsigned_t;
191#define MAX(a,b) (max(a,b)) 178#define MAX(a,b) (max(a,b))
192#define howmany(x, y) (((x)+((y)-1))/(y)) 179#define howmany(x, y) (((x)+((y)-1))/(y))
193 180
181static inline void delay(long ticks)
182{
183 schedule_timeout_uninterruptible(ticks);
184}
185
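A brief usage note on the helper above: the argument is in jiffies, matching schedule_timeout_uninterruptible(), so a caller wanting a wall-clock pause converts first, e.g. at a hypothetical call site:

	delay(msecs_to_jiffies(100));	/* back off for ~100ms */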
186/*
187 * XFS wrapper structure for sysfs support. It depends on external data
188 * structures and is embedded in various internal data structures to implement
189 * the XFS sysfs object hierarchy. Define it here for broad access throughout
190 * the codebase.
191 */
192struct xfs_kobj {
193 struct kobject kobject;
194 struct completion complete;
195};
196
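The embedded completion is presumably what makes teardown synchronous: the ktype's release callback signals it once the kobject refcount drops to zero, and the deleter waits for that. A plausible sketch of the pairing (xfs_sysfs.{c,h} are not part of this hunk, so treat these names and bodies as assumptions):

static void
xfs_sysfs_release(struct kobject *kobject)
{
	struct xfs_kobj	*kobj = container_of(kobject,
					     struct xfs_kobj, kobject);
	complete(&kobj->complete);	/* unblocks xfs_sysfs_del() */
}

static inline void
xfs_sysfs_del(struct xfs_kobj *kobj)
{
	kobject_del(&kobj->kobject);
	kobject_put(&kobj->kobject);
	wait_for_completion(&kobj->complete);
}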
194/* Kernel uid/gid conversion. These are used to convert to/from the on disk 197/* Kernel uid/gid conversion. These are used to convert to/from the on disk
195 * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. 198 * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
196 * The conversion here is type only, the value will remain the same since we 199 * The conversion here is type only, the value will remain the same since we
@@ -331,7 +334,7 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
331{ 334{
332 x += y - 1; 335 x += y - 1;
333 do_div(x, y); 336 do_div(x, y);
334 return(x * y); 337 return x * y;
335} 338}
336 339
337static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) 340static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 292308dede6d..fe88ef67f93a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
34#include "xfs_trace.h" 34#include "xfs_trace.h"
35#include "xfs_fsops.h" 35#include "xfs_fsops.h"
36#include "xfs_cksum.h" 36#include "xfs_cksum.h"
37#include "xfs_sysfs.h"
37 38
38kmem_zone_t *xfs_log_ticket_zone; 39kmem_zone_t *xfs_log_ticket_zone;
39 40
@@ -283,7 +284,7 @@ xlog_grant_head_wait(
283 return 0; 284 return 0;
284shutdown: 285shutdown:
285 list_del_init(&tic->t_queue); 286 list_del_init(&tic->t_queue);
286 return XFS_ERROR(EIO); 287 return -EIO;
287} 288}
288 289
289/* 290/*
@@ -377,7 +378,7 @@ xfs_log_regrant(
377 int error = 0; 378 int error = 0;
378 379
379 if (XLOG_FORCED_SHUTDOWN(log)) 380 if (XLOG_FORCED_SHUTDOWN(log))
380 return XFS_ERROR(EIO); 381 return -EIO;
381 382
382 XFS_STATS_INC(xs_try_logspace); 383 XFS_STATS_INC(xs_try_logspace);
383 384
@@ -446,7 +447,7 @@ xfs_log_reserve(
446 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); 447 ASSERT(client == XFS_TRANSACTION || client == XFS_LOG);
447 448
448 if (XLOG_FORCED_SHUTDOWN(log)) 449 if (XLOG_FORCED_SHUTDOWN(log))
449 return XFS_ERROR(EIO); 450 return -EIO;
450 451
451 XFS_STATS_INC(xs_try_logspace); 452 XFS_STATS_INC(xs_try_logspace);
452 453
@@ -454,7 +455,7 @@ xfs_log_reserve(
454 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, 455 tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
455 KM_SLEEP | KM_MAYFAIL); 456 KM_SLEEP | KM_MAYFAIL);
456 if (!tic) 457 if (!tic)
457 return XFS_ERROR(ENOMEM); 458 return -ENOMEM;
458 459
459 tic->t_trans_type = t_type; 460 tic->t_trans_type = t_type;
460 *ticp = tic; 461 *ticp = tic;
@@ -590,7 +591,7 @@ xfs_log_release_iclog(
590{ 591{
591 if (xlog_state_release_iclog(mp->m_log, iclog)) { 592 if (xlog_state_release_iclog(mp->m_log, iclog)) {
592 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 593 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
593 return EIO; 594 return -EIO;
594 } 595 }
595 596
596 return 0; 597 return 0;
@@ -628,7 +629,7 @@ xfs_log_mount(
628 629
629 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); 630 mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks);
630 if (IS_ERR(mp->m_log)) { 631 if (IS_ERR(mp->m_log)) {
631 error = -PTR_ERR(mp->m_log); 632 error = PTR_ERR(mp->m_log);
632 goto out; 633 goto out;
633 } 634 }
634 635
@@ -652,18 +653,18 @@ xfs_log_mount(
652 xfs_warn(mp, 653 xfs_warn(mp,
653 "Log size %d blocks too small, minimum size is %d blocks", 654 "Log size %d blocks too small, minimum size is %d blocks",
654 mp->m_sb.sb_logblocks, min_logfsbs); 655 mp->m_sb.sb_logblocks, min_logfsbs);
655 error = EINVAL; 656 error = -EINVAL;
656 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { 657 } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
657 xfs_warn(mp, 658 xfs_warn(mp,
658 "Log size %d blocks too large, maximum size is %lld blocks", 659 "Log size %d blocks too large, maximum size is %lld blocks",
659 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); 660 mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
660 error = EINVAL; 661 error = -EINVAL;
661 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { 662 } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
662 xfs_warn(mp, 663 xfs_warn(mp,
663 "log size %lld bytes too large, maximum size is %lld bytes", 664 "log size %lld bytes too large, maximum size is %lld bytes",
664 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), 665 XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
665 XFS_MAX_LOG_BYTES); 666 XFS_MAX_LOG_BYTES);
666 error = EINVAL; 667 error = -EINVAL;
667 } 668 }
668 if (error) { 669 if (error) {
669 if (xfs_sb_version_hascrc(&mp->m_sb)) { 670 if (xfs_sb_version_hascrc(&mp->m_sb)) {
@@ -707,6 +708,11 @@ xfs_log_mount(
707 } 708 }
708 } 709 }
709 710
711 error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
712 "log");
713 if (error)
714 goto out_destroy_ail;
715
710 /* Normal transactions can now occur */ 716 /* Normal transactions can now occur */
711 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 717 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
712 718
@@ -947,6 +953,9 @@ xfs_log_unmount(
947 xfs_log_quiesce(mp); 953 xfs_log_quiesce(mp);
948 954
949 xfs_trans_ail_destroy(mp); 955 xfs_trans_ail_destroy(mp);
956
957 xfs_sysfs_del(&mp->m_log->l_kobj);
958
950 xlog_dealloc_log(mp->m_log); 959 xlog_dealloc_log(mp->m_log);
951} 960}
952 961
@@ -1313,7 +1322,7 @@ xlog_alloc_log(
1313 xlog_in_core_t *iclog, *prev_iclog=NULL; 1322 xlog_in_core_t *iclog, *prev_iclog=NULL;
1314 xfs_buf_t *bp; 1323 xfs_buf_t *bp;
1315 int i; 1324 int i;
1316 int error = ENOMEM; 1325 int error = -ENOMEM;
1317 uint log2_size = 0; 1326 uint log2_size = 0;
1318 1327
1319 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL); 1328 log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
@@ -1340,7 +1349,7 @@ xlog_alloc_log(
1340 xlog_grant_head_init(&log->l_reserve_head); 1349 xlog_grant_head_init(&log->l_reserve_head);
1341 xlog_grant_head_init(&log->l_write_head); 1350 xlog_grant_head_init(&log->l_write_head);
1342 1351
1343 error = EFSCORRUPTED; 1352 error = -EFSCORRUPTED;
1344 if (xfs_sb_version_hassector(&mp->m_sb)) { 1353 if (xfs_sb_version_hassector(&mp->m_sb)) {
1345 log2_size = mp->m_sb.sb_logsectlog; 1354 log2_size = mp->m_sb.sb_logsectlog;
1346 if (log2_size < BBSHIFT) { 1355 if (log2_size < BBSHIFT) {
@@ -1369,8 +1378,14 @@ xlog_alloc_log(
1369 1378
1370 xlog_get_iclog_buffer_size(mp, log); 1379 xlog_get_iclog_buffer_size(mp, log);
1371 1380
1372 error = ENOMEM; 1381 /*
1373 bp = xfs_buf_alloc(mp->m_logdev_targp, 0, BTOBB(log->l_iclog_size), 0); 1382 * Use a NULL block for the extra log buffer used during splits so that
1383 * it will trigger errors if we ever try to do IO on it without first
1384 * having set it up properly.
1385 */
1386 error = -ENOMEM;
1387 bp = xfs_buf_alloc(mp->m_logdev_targp, XFS_BUF_DADDR_NULL,
1388 BTOBB(log->l_iclog_size), 0);
1374 if (!bp) 1389 if (!bp)
1375 goto out_free_log; 1390 goto out_free_log;
1376 1391
@@ -1463,7 +1478,7 @@ out_free_iclog:
1463out_free_log: 1478out_free_log:
1464 kmem_free(log); 1479 kmem_free(log);
1465out: 1480out:
1466 return ERR_PTR(-error); 1481 return ERR_PTR(error);
1467} /* xlog_alloc_log */ 1482} /* xlog_alloc_log */
1468 1483
1469 1484
@@ -1661,9 +1676,9 @@ xlog_bdstrat(
1661 1676
1662 xfs_buf_lock(bp); 1677 xfs_buf_lock(bp);
1663 if (iclog->ic_state & XLOG_STATE_IOERROR) { 1678 if (iclog->ic_state & XLOG_STATE_IOERROR) {
1664 xfs_buf_ioerror(bp, EIO); 1679 xfs_buf_ioerror(bp, -EIO);
1665 xfs_buf_stale(bp); 1680 xfs_buf_stale(bp);
1666 xfs_buf_ioend(bp, 0); 1681 xfs_buf_ioend(bp);
1667 /* 1682 /*
1668 * It would seem logical to return EIO here, but we rely on 1683 * It would seem logical to return EIO here, but we rely on
1669 * the log state machine to propagate I/O errors instead of 1684 * the log state machine to propagate I/O errors instead of
@@ -1673,7 +1688,7 @@ xlog_bdstrat(
1673 return 0; 1688 return 0;
1674 } 1689 }
1675 1690
1676 xfs_buf_iorequest(bp); 1691 xfs_buf_submit(bp);
1677 return 0; 1692 return 0;
1678} 1693}
1679 1694
@@ -2360,7 +2375,7 @@ xlog_write(
2360 2375
2361 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags); 2376 ophdr = xlog_write_setup_ophdr(log, ptr, ticket, flags);
2362 if (!ophdr) 2377 if (!ophdr)
2363 return XFS_ERROR(EIO); 2378 return -EIO;
2364 2379
2365 xlog_write_adv_cnt(&ptr, &len, &log_offset, 2380 xlog_write_adv_cnt(&ptr, &len, &log_offset,
2366 sizeof(struct xlog_op_header)); 2381 sizeof(struct xlog_op_header));
@@ -2859,7 +2874,7 @@ restart:
2859 spin_lock(&log->l_icloglock); 2874 spin_lock(&log->l_icloglock);
2860 if (XLOG_FORCED_SHUTDOWN(log)) { 2875 if (XLOG_FORCED_SHUTDOWN(log)) {
2861 spin_unlock(&log->l_icloglock); 2876 spin_unlock(&log->l_icloglock);
2862 return XFS_ERROR(EIO); 2877 return -EIO;
2863 } 2878 }
2864 2879
2865 iclog = log->l_iclog; 2880 iclog = log->l_iclog;
@@ -3047,7 +3062,7 @@ xlog_state_release_iclog(
3047 int sync = 0; /* do we sync? */ 3062 int sync = 0; /* do we sync? */
3048 3063
3049 if (iclog->ic_state & XLOG_STATE_IOERROR) 3064 if (iclog->ic_state & XLOG_STATE_IOERROR)
3050 return XFS_ERROR(EIO); 3065 return -EIO;
3051 3066
3052 ASSERT(atomic_read(&iclog->ic_refcnt) > 0); 3067 ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
3053 if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock)) 3068 if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
@@ -3055,7 +3070,7 @@ xlog_state_release_iclog(
3055 3070
3056 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3071 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3057 spin_unlock(&log->l_icloglock); 3072 spin_unlock(&log->l_icloglock);
3058 return XFS_ERROR(EIO); 3073 return -EIO;
3059 } 3074 }
3060 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || 3075 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
3061 iclog->ic_state == XLOG_STATE_WANT_SYNC); 3076 iclog->ic_state == XLOG_STATE_WANT_SYNC);
@@ -3172,7 +3187,7 @@ _xfs_log_force(
3172 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
3173 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3188 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3174 spin_unlock(&log->l_icloglock); 3189 spin_unlock(&log->l_icloglock);
3175 return XFS_ERROR(EIO); 3190 return -EIO;
3176 } 3191 }
3177 3192
3178 /* If the head iclog is not active nor dirty, we just attach 3193 /* If the head iclog is not active nor dirty, we just attach
@@ -3210,7 +3225,7 @@ _xfs_log_force(
3210 spin_unlock(&log->l_icloglock); 3225 spin_unlock(&log->l_icloglock);
3211 3226
3212 if (xlog_state_release_iclog(log, iclog)) 3227 if (xlog_state_release_iclog(log, iclog))
3213 return XFS_ERROR(EIO); 3228 return -EIO;
3214 3229
3215 if (log_flushed) 3230 if (log_flushed)
3216 *log_flushed = 1; 3231 *log_flushed = 1;
@@ -3246,7 +3261,7 @@ maybe_sleep:
3246 */ 3261 */
3247 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3262 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3248 spin_unlock(&log->l_icloglock); 3263 spin_unlock(&log->l_icloglock);
3249 return XFS_ERROR(EIO); 3264 return -EIO;
3250 } 3265 }
3251 XFS_STATS_INC(xs_log_force_sleep); 3266 XFS_STATS_INC(xs_log_force_sleep);
3252 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3267 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3256,7 +3271,7 @@ maybe_sleep:
3256 * and the memory read should be atomic. 3271 * and the memory read should be atomic.
3257 */ 3272 */
3258 if (iclog->ic_state & XLOG_STATE_IOERROR) 3273 if (iclog->ic_state & XLOG_STATE_IOERROR)
3259 return XFS_ERROR(EIO); 3274 return -EIO;
3260 if (log_flushed) 3275 if (log_flushed)
3261 *log_flushed = 1; 3276 *log_flushed = 1;
3262 } else { 3277 } else {
@@ -3324,7 +3339,7 @@ try_again:
3324 iclog = log->l_iclog; 3339 iclog = log->l_iclog;
3325 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3340 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3326 spin_unlock(&log->l_icloglock); 3341 spin_unlock(&log->l_icloglock);
3327 return XFS_ERROR(EIO); 3342 return -EIO;
3328 } 3343 }
3329 3344
3330 do { 3345 do {
@@ -3375,7 +3390,7 @@ try_again:
3375 xlog_state_switch_iclogs(log, iclog, 0); 3390 xlog_state_switch_iclogs(log, iclog, 0);
3376 spin_unlock(&log->l_icloglock); 3391 spin_unlock(&log->l_icloglock);
3377 if (xlog_state_release_iclog(log, iclog)) 3392 if (xlog_state_release_iclog(log, iclog))
3378 return XFS_ERROR(EIO); 3393 return -EIO;
3379 if (log_flushed) 3394 if (log_flushed)
3380 *log_flushed = 1; 3395 *log_flushed = 1;
3381 spin_lock(&log->l_icloglock); 3396 spin_lock(&log->l_icloglock);
@@ -3390,7 +3405,7 @@ try_again:
3390 */ 3405 */
3391 if (iclog->ic_state & XLOG_STATE_IOERROR) { 3406 if (iclog->ic_state & XLOG_STATE_IOERROR) {
3392 spin_unlock(&log->l_icloglock); 3407 spin_unlock(&log->l_icloglock);
3393 return XFS_ERROR(EIO); 3408 return -EIO;
3394 } 3409 }
3395 XFS_STATS_INC(xs_log_force_sleep); 3410 XFS_STATS_INC(xs_log_force_sleep);
3396 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); 3411 xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
@@ -3400,7 +3415,7 @@ try_again:
3400 * and the memory read should be atomic. 3415 * and the memory read should be atomic.
3401 */ 3416 */
3402 if (iclog->ic_state & XLOG_STATE_IOERROR) 3417 if (iclog->ic_state & XLOG_STATE_IOERROR)
3403 return XFS_ERROR(EIO); 3418 return -EIO;
3404 3419
3405 if (log_flushed) 3420 if (log_flushed)
3406 *log_flushed = 1; 3421 *log_flushed = 1;
@@ -3852,18 +3867,17 @@ xlog_state_ioerror(
3852 * This is called from xfs_force_shutdown, when we're forcibly 3867 * This is called from xfs_force_shutdown, when we're forcibly
3853 * shutting down the filesystem, typically because of an IO error. 3868 * shutting down the filesystem, typically because of an IO error.
3854 * Our main objectives here are to make sure that: 3869 * Our main objectives here are to make sure that:
3855 * a. the filesystem gets marked 'SHUTDOWN' for all interested 3870 * a. if !logerror, flush the logs to disk. Anything modified
3871 * after this is ignored.
3872 * b. the filesystem gets marked 'SHUTDOWN' for all interested
3856 * parties to find out, 'atomically'. 3873 * parties to find out, 'atomically'.
3857 * b. those who're sleeping on log reservations, pinned objects and 3874 * c. those who're sleeping on log reservations, pinned objects and
3858 * other resources get woken up, and be told the bad news. 3875 * other resources get woken up, and be told the bad news.
3859 * c. nothing new gets queued up after (a) and (b) are done. 3876 * d. nothing new gets queued up after (b) and (c) are done.
3860 * d. if !logerror, flush the iclogs to disk, then seal them off
3861 * for business.
3862 * 3877 *
3863 * Note: for delayed logging the !logerror case needs to flush the regions 3878 * Note: for the !logerror case we need to flush the regions held in memory out
3864 * held in memory out to the iclogs before flushing them to disk. This needs 3879 * to disk first. This needs to be done before the log is marked as shutdown,
3865 * to be done before the log is marked as shutdown, otherwise the flush to the 3880 * otherwise the iclog writes will fail.
3866 * iclogs will fail.
3867 */ 3881 */
3868int 3882int
3869xfs_log_force_umount( 3883xfs_log_force_umount(
@@ -3895,16 +3909,16 @@ xfs_log_force_umount(
3895 ASSERT(XLOG_FORCED_SHUTDOWN(log)); 3909 ASSERT(XLOG_FORCED_SHUTDOWN(log));
3896 return 1; 3910 return 1;
3897 } 3911 }
3898 retval = 0;
3899 3912
3900 /* 3913 /*
3901 * Flush the in memory commit item list before marking the log as 3914 * Flush all the completed transactions to disk before marking the log as
3902 * being shut down. We need to do it in this order to ensure all the 3915 * being shut down. We need to do it in this order to ensure that
3903 * completed transactions are flushed to disk with the xfs_log_force() 3916 * completed operations are safely on disk before we shut down, and that
3904 * call below. 3917 * we don't have to issue any buffer IO after the shutdown flags are set
3918 * to guarantee this.
3905 */ 3919 */
3906 if (!logerror) 3920 if (!logerror)
3907 xlog_cil_force(log); 3921 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3908 3922
3909 /* 3923 /*
3910 * mark the filesystem and the log as in a shutdown state and wake 3924 * mark the filesystem and the log as in a shutdown state and wake
@@ -3916,18 +3930,11 @@ xfs_log_force_umount(
3916 XFS_BUF_DONE(mp->m_sb_bp); 3930 XFS_BUF_DONE(mp->m_sb_bp);
3917 3931
3918 /* 3932 /*
3919 * This flag is sort of redundant because of the mount flag, but 3933 * Mark the log and the iclogs with IO error flags to prevent any
3920 * it's good to maintain the separation between the log and the rest 3934 * further log IO from being issued or completed.
3921 * of XFS.
3922 */ 3935 */
3923 log->l_flags |= XLOG_IO_ERROR; 3936 log->l_flags |= XLOG_IO_ERROR;
3924 3937 retval = xlog_state_ioerror(log);
3925 /*
3926 * If we hit a log error, we want to mark all the iclogs IOERROR
3927 * while we're still holding the loglock.
3928 */
3929 if (logerror)
3930 retval = xlog_state_ioerror(log);
3931 spin_unlock(&log->l_icloglock); 3938 spin_unlock(&log->l_icloglock);
3932 3939
3933 /* 3940 /*
@@ -3940,19 +3947,6 @@ xfs_log_force_umount(
3940 xlog_grant_head_wake_all(&log->l_reserve_head); 3947 xlog_grant_head_wake_all(&log->l_reserve_head);
3941 xlog_grant_head_wake_all(&log->l_write_head); 3948 xlog_grant_head_wake_all(&log->l_write_head);
3942 3949
3943 if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {
3944 ASSERT(!logerror);
3945 /*
3946 * Force the incore logs to disk before shutting the
3947 * log down completely.
3948 */
3949 _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
3950
3951 spin_lock(&log->l_icloglock);
3952 retval = xlog_state_ioerror(log);
3953 spin_unlock(&log->l_icloglock);
3954 }
3955
3956 /* 3950 /*
3957 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first 3951 * Wake up everybody waiting on xfs_log_force. Wake the CIL push first
3958 * as if the log writes were completed. The abort handling in the log 3952 * as if the log writes were completed. The abort handling in the log
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b3425b34e3d5..f506c457011e 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -78,8 +78,6 @@ xlog_cil_init_post_recovery(
78{ 78{
79 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); 79 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
80 log->l_cilp->xc_ctx->sequence = 1; 80 log->l_cilp->xc_ctx->sequence = 1;
81 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
82 log->l_curr_block);
83} 81}
84 82
85/* 83/*
@@ -465,12 +463,40 @@ xlog_cil_push(
465 spin_unlock(&cil->xc_push_lock); 463 spin_unlock(&cil->xc_push_lock);
466 goto out_skip; 464 goto out_skip;
467 } 465 }
468 spin_unlock(&cil->xc_push_lock);
469 466
470 467
471 /* check for a previously pushed sequence */ 468 /* check for a previously pushed sequence */
472 if (push_seq < cil->xc_ctx->sequence) 469 if (push_seq < cil->xc_ctx->sequence) {
470 spin_unlock(&cil->xc_push_lock);
473 goto out_skip; 471 goto out_skip;
472 }
473
474 /*
475 * We are now going to push this context, so add it to the committing
476 * list before we do anything else. This ensures that anyone waiting on
477 * this push can easily detect the difference between a "push in
478 * progress" and "CIL is empty, nothing to do".
479 *
480 * IOWs, a wait loop can now check for:
481 * the current sequence not being found on the committing list;
482 * an empty CIL; and
483 * an unchanged sequence number
484 * to detect a push that had nothing to do and therefore does not need
485 * waiting on. If the CIL is not empty, we get put on the committing
486 * list before emptying the CIL and bumping the sequence number. Hence
487 * an empty CIL and an unchanged sequence number means we jumped out
488 * above after doing nothing.
489 *
490 * Hence the waiter will either find the commit sequence on the
491 * committing list or the sequence number will be unchanged and the CIL
492 * still dirty. In that latter case, the push has not yet started, and
493 * so the waiter will have to continue trying to check the CIL
494 * committing list until it is found. In extreme cases of delay, the
495 * sequence may fully commit between the attempts the waiter makes to wait
496 * on the commit sequence.
497 */
498 list_add(&ctx->committing, &cil->xc_committing);
499 spin_unlock(&cil->xc_push_lock);
474 500
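For contrast with the push side, a hedged sketch of the waiter-side logic this comment describes (the real loop lives in xlog_cil_force_lsn(), whose updated comment appears in a later hunk; the labels here are illustrative):

restart:
	spin_lock(&cil->xc_push_lock);
	list_for_each_entry(ctx, &cil->xc_committing, committing) {
		if (ctx->sequence == sequence)
			goto found;	/* push in progress: wait on it */
	}
	if (sequence == cil->xc_current_sequence &&
	    !list_empty(&cil->xc_cil)) {
		/* CIL dirty, sequence unchanged: push not started yet */
		spin_unlock(&cil->xc_push_lock);
		goto restart;
	}
	spin_unlock(&cil->xc_push_lock);	/* nothing to wait for */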
475 /* 501 /*
476 * pull all the log vectors off the items in the CIL, and 502 * pull all the log vectors off the items in the CIL, and
@@ -534,7 +560,6 @@ xlog_cil_push(
534 */ 560 */
535 spin_lock(&cil->xc_push_lock); 561 spin_lock(&cil->xc_push_lock);
536 cil->xc_current_sequence = new_ctx->sequence; 562 cil->xc_current_sequence = new_ctx->sequence;
537 list_add(&ctx->committing, &cil->xc_committing);
538 spin_unlock(&cil->xc_push_lock); 563 spin_unlock(&cil->xc_push_lock);
539 up_write(&cil->xc_ctx_lock); 564 up_write(&cil->xc_ctx_lock);
540 565
@@ -634,7 +659,7 @@ out_abort_free_ticket:
634 xfs_log_ticket_put(tic); 659 xfs_log_ticket_put(tic);
635out_abort: 660out_abort:
636 xlog_cil_committed(ctx, XFS_LI_ABORTED); 661 xlog_cil_committed(ctx, XFS_LI_ABORTED);
637 return XFS_ERROR(EIO); 662 return -EIO;
638} 663}
639 664
640static void 665static void
@@ -857,13 +882,15 @@ restart:
857 * Hence by the time we have got here our sequence may not have been 882 * Hence by the time we have got here our sequence may not have been
858 * pushed yet. This is true if the current sequence still matches the 883 * pushed yet. This is true if the current sequence still matches the
859 * push sequence after the above wait loop and the CIL still contains 884 * push sequence after the above wait loop and the CIL still contains
860 * dirty objects. 885 * dirty objects. This is guaranteed by the push code first adding the
886 * context to the committing list before emptying the CIL.
861 * 887 *
862 * When the push occurs, it will empty the CIL and atomically increment 888 * Hence if we don't find the context in the committing list and the
863 * the currect sequence past the push sequence and move it into the 889 * current sequence number is unchanged then the CIL contents are
864 * committing list. Of course, if the CIL is clean at the time of the 890 * significant. If the CIL is empty, it means there was nothing to push
865 * push, it won't have pushed the CIL at all, so in that case we should 891 * and that means there is nothing to wait for. If the CIL is not empty,
866 * try the push for this sequence again from the start just in case. 892 * it means we haven't yet started the push, because if it had started
893 * we would have found the context on the committing list.
867 */ 894 */
868 if (sequence == cil->xc_current_sequence && 895 if (sequence == cil->xc_current_sequence &&
869 !list_empty(&cil->xc_cil)) { 896 !list_empty(&cil->xc_cil)) {
@@ -928,12 +955,12 @@ xlog_cil_init(
928 955
929 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL); 956 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
930 if (!cil) 957 if (!cil)
931 return ENOMEM; 958 return -ENOMEM;
932 959
933 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL); 960 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
934 if (!ctx) { 961 if (!ctx) {
935 kmem_free(cil); 962 kmem_free(cil);
936 return ENOMEM; 963 return -ENOMEM;
937 } 964 }
938 965
939 INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); 966 INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9bc403a9e54f..db7cbdeb2b42 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -405,6 +405,8 @@ struct xlog {
405 struct xlog_grant_head l_reserve_head; 405 struct xlog_grant_head l_reserve_head;
406 struct xlog_grant_head l_write_head; 406 struct xlog_grant_head l_write_head;
407 407
408 struct xfs_kobj l_kobj;
409
408 /* The following field is used for debugging; need to hold icloglock */ 410 /* The following field is used for debugging; need to hold icloglock */
409#ifdef DEBUG 411#ifdef DEBUG
410 char *l_iclog_bak[XLOG_MAX_ICLOGS]; 412 char *l_iclog_bak[XLOG_MAX_ICLOGS];
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 981af0f6504b..00cd7f3a8f59 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -179,7 +179,7 @@ xlog_bread_noalign(
179 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 179 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
180 nbblks); 180 nbblks);
181 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 181 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
182 return EFSCORRUPTED; 182 return -EFSCORRUPTED;
183 } 183 }
184 184
185 blk_no = round_down(blk_no, log->l_sectBBsize); 185 blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -193,12 +193,8 @@ xlog_bread_noalign(
193 bp->b_io_length = nbblks; 193 bp->b_io_length = nbblks;
194 bp->b_error = 0; 194 bp->b_error = 0;
195 195
196 if (XFS_FORCED_SHUTDOWN(log->l_mp)) 196 error = xfs_buf_submit_wait(bp);
197 return XFS_ERROR(EIO); 197 if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
198
199 xfs_buf_iorequest(bp);
200 error = xfs_buf_iowait(bp);
201 if (error)
202 xfs_buf_ioerror_alert(bp, __func__); 198 xfs_buf_ioerror_alert(bp, __func__);
203 return error; 199 return error;
204} 200}
@@ -268,7 +264,7 @@ xlog_bwrite(
268 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", 264 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
269 nbblks); 265 nbblks);
270 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 266 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
271 return EFSCORRUPTED; 267 return -EFSCORRUPTED;
272 } 268 }
273 269
274 blk_no = round_down(blk_no, log->l_sectBBsize); 270 blk_no = round_down(blk_no, log->l_sectBBsize);
@@ -330,14 +326,14 @@ xlog_header_check_recover(
330 xlog_header_check_dump(mp, head); 326 xlog_header_check_dump(mp, head);
331 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 327 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
332 XFS_ERRLEVEL_HIGH, mp); 328 XFS_ERRLEVEL_HIGH, mp);
333 return XFS_ERROR(EFSCORRUPTED); 329 return -EFSCORRUPTED;
334 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 330 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
335 xfs_warn(mp, 331 xfs_warn(mp,
336 "dirty log entry has mismatched uuid - can't recover"); 332 "dirty log entry has mismatched uuid - can't recover");
337 xlog_header_check_dump(mp, head); 333 xlog_header_check_dump(mp, head);
338 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 334 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
339 XFS_ERRLEVEL_HIGH, mp); 335 XFS_ERRLEVEL_HIGH, mp);
340 return XFS_ERROR(EFSCORRUPTED); 336 return -EFSCORRUPTED;
341 } 337 }
342 return 0; 338 return 0;
343} 339}
@@ -364,7 +360,7 @@ xlog_header_check_mount(
364 xlog_header_check_dump(mp, head); 360 xlog_header_check_dump(mp, head);
365 XFS_ERROR_REPORT("xlog_header_check_mount", 361 XFS_ERROR_REPORT("xlog_header_check_mount",
366 XFS_ERRLEVEL_HIGH, mp); 362 XFS_ERRLEVEL_HIGH, mp);
367 return XFS_ERROR(EFSCORRUPTED); 363 return -EFSCORRUPTED;
368 } 364 }
369 return 0; 365 return 0;
370} 366}
@@ -378,12 +374,14 @@ xlog_recover_iodone(
378 * We're not going to bother about retrying 374 * We're not going to bother about retrying
379 * this during recovery. One strike! 375 * this during recovery. One strike!
380 */ 376 */
381 xfs_buf_ioerror_alert(bp, __func__); 377 if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
382 xfs_force_shutdown(bp->b_target->bt_mount, 378 xfs_buf_ioerror_alert(bp, __func__);
383 SHUTDOWN_META_IO_ERROR); 379 xfs_force_shutdown(bp->b_target->bt_mount,
380 SHUTDOWN_META_IO_ERROR);
381 }
384 } 382 }
385 bp->b_iodone = NULL; 383 bp->b_iodone = NULL;
386 xfs_buf_ioend(bp, 0); 384 xfs_buf_ioend(bp);
387} 385}
388 386
389/* 387/*
@@ -462,7 +460,7 @@ xlog_find_verify_cycle(
462 while (!(bp = xlog_get_bp(log, bufblks))) { 460 while (!(bp = xlog_get_bp(log, bufblks))) {
463 bufblks >>= 1; 461 bufblks >>= 1;
464 if (bufblks < log->l_sectBBsize) 462 if (bufblks < log->l_sectBBsize)
465 return ENOMEM; 463 return -ENOMEM;
466 } 464 }
467 465
468 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 466 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
@@ -524,7 +522,7 @@ xlog_find_verify_log_record(
524 522
525 if (!(bp = xlog_get_bp(log, num_blks))) { 523 if (!(bp = xlog_get_bp(log, num_blks))) {
526 if (!(bp = xlog_get_bp(log, 1))) 524 if (!(bp = xlog_get_bp(log, 1)))
527 return ENOMEM; 525 return -ENOMEM;
528 smallmem = 1; 526 smallmem = 1;
529 } else { 527 } else {
530 error = xlog_bread(log, start_blk, num_blks, bp, &offset); 528 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
@@ -539,7 +537,7 @@ xlog_find_verify_log_record(
539 xfs_warn(log->l_mp, 537 xfs_warn(log->l_mp,
540 "Log inconsistent (didn't find previous header)"); 538 "Log inconsistent (didn't find previous header)");
541 ASSERT(0); 539 ASSERT(0);
542 error = XFS_ERROR(EIO); 540 error = -EIO;
543 goto out; 541 goto out;
544 } 542 }
545 543
@@ -564,7 +562,7 @@ xlog_find_verify_log_record(
564 * will be called again for the end of the physical log. 562 * will be called again for the end of the physical log.
565 */ 563 */
566 if (i == -1) { 564 if (i == -1) {
567 error = -1; 565 error = 1;
568 goto out; 566 goto out;
569 } 567 }
570 568
@@ -628,7 +626,12 @@ xlog_find_head(
628 int error, log_bbnum = log->l_logBBsize; 626 int error, log_bbnum = log->l_logBBsize;
629 627
630 /* Is the end of the log device zeroed? */ 628 /* Is the end of the log device zeroed? */
631 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { 629 error = xlog_find_zeroed(log, &first_blk);
630 if (error < 0) {
631 xfs_warn(log->l_mp, "empty log check failed");
632 return error;
633 }
634 if (error == 1) {
632 *return_head_blk = first_blk; 635 *return_head_blk = first_blk;
633 636
634 /* Is the whole lot zeroed? */ 637 /* Is the whole lot zeroed? */
@@ -641,15 +644,12 @@ xlog_find_head(
641 } 644 }
642 645
643 return 0; 646 return 0;
644 } else if (error) {
645 xfs_warn(log->l_mp, "empty log check failed");
646 return error;
647 } 647 }
648 648
649 first_blk = 0; /* get cycle # of 1st block */ 649 first_blk = 0; /* get cycle # of 1st block */
650 bp = xlog_get_bp(log, 1); 650 bp = xlog_get_bp(log, 1);
651 if (!bp) 651 if (!bp)
652 return ENOMEM; 652 return -ENOMEM;
653 653
654 error = xlog_bread(log, 0, 1, bp, &offset); 654 error = xlog_bread(log, 0, 1, bp, &offset);
655 if (error) 655 if (error)
@@ -818,29 +818,29 @@ validate_head:
818 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 818 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
819 819
820 /* start ptr at last block ptr before head_blk */ 820 /* start ptr at last block ptr before head_blk */
821 if ((error = xlog_find_verify_log_record(log, start_blk, 821 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
822 &head_blk, 0)) == -1) { 822 if (error == 1)
823 error = XFS_ERROR(EIO); 823 error = -EIO;
824 goto bp_err; 824 if (error)
825 } else if (error)
826 goto bp_err; 825 goto bp_err;
827 } else { 826 } else {
828 start_blk = 0; 827 start_blk = 0;
829 ASSERT(head_blk <= INT_MAX); 828 ASSERT(head_blk <= INT_MAX);
830 if ((error = xlog_find_verify_log_record(log, start_blk, 829 error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
831 &head_blk, 0)) == -1) { 830 if (error < 0)
831 goto bp_err;
832 if (error == 1) {
832 /* We hit the beginning of the log during our search */ 833 /* We hit the beginning of the log during our search */
833 start_blk = log_bbnum - (num_scan_bblks - head_blk); 834 start_blk = log_bbnum - (num_scan_bblks - head_blk);
834 new_blk = log_bbnum; 835 new_blk = log_bbnum;
835 ASSERT(start_blk <= INT_MAX && 836 ASSERT(start_blk <= INT_MAX &&
836 (xfs_daddr_t) log_bbnum-start_blk >= 0); 837 (xfs_daddr_t) log_bbnum-start_blk >= 0);
837 ASSERT(head_blk <= INT_MAX); 838 ASSERT(head_blk <= INT_MAX);
838 if ((error = xlog_find_verify_log_record(log, 839 error = xlog_find_verify_log_record(log, start_blk,
839 start_blk, &new_blk, 840 &new_blk, (int)head_blk);
840 (int)head_blk)) == -1) { 841 if (error == 1)
841 error = XFS_ERROR(EIO); 842 error = -EIO;
842 goto bp_err; 843 if (error)
843 } else if (error)
844 goto bp_err; 844 goto bp_err;
845 if (new_blk != log_bbnum) 845 if (new_blk != log_bbnum)
846 head_blk = new_blk; 846 head_blk = new_blk;
@@ -911,7 +911,7 @@ xlog_find_tail(
911 911
912 bp = xlog_get_bp(log, 1); 912 bp = xlog_get_bp(log, 1);
913 if (!bp) 913 if (!bp)
914 return ENOMEM; 914 return -ENOMEM;
915 if (*head_blk == 0) { /* special case */ 915 if (*head_blk == 0) { /* special case */
916 error = xlog_bread(log, 0, 1, bp, &offset); 916 error = xlog_bread(log, 0, 1, bp, &offset);
917 if (error) 917 if (error)
@@ -961,7 +961,7 @@ xlog_find_tail(
961 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); 961 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
962 xlog_put_bp(bp); 962 xlog_put_bp(bp);
963 ASSERT(0); 963 ASSERT(0);
964 return XFS_ERROR(EIO); 964 return -EIO;
965 } 965 }
966 966
967 /* find blk_no of tail of log */ 967 /* find blk_no of tail of log */
@@ -1092,8 +1092,8 @@ done:
1092 * 1092 *
1093 * Return: 1093 * Return:
1094 * 0 => the log is completely written to 1094 * 0 => the log is completely written to
1095 * -1 => use *blk_no as the first block of the log 1095 * 1 => use *blk_no as the first block of the log
1096 * >0 => error has occurred 1096 * <0 => error has occurred
1097 */ 1097 */
1098STATIC int 1098STATIC int
1099xlog_find_zeroed( 1099xlog_find_zeroed(
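This series moves the find/verify helpers from the old overloaded "-1 means found" convention to the tri-state "0 / 1 / negative errno" split that the reworked callers in xlog_find_head() test for. A standalone sketch of the new calling pattern (helper name and values are illustrative):

#include <stdio.h>

/* tri-state helper in the new convention: 0 = log fully written,
 * 1 = found (out-parameter valid), <0 = negative errno */
static int find_zeroed(int *blk_no)
{
	*blk_no = 42;	/* pretend the tail of the log is zeroed */
	return 1;
}

int main(void)
{
	int blk, error;

	error = find_zeroed(&blk);
	if (error < 0) {
		fprintf(stderr, "empty log check failed: %d\n", error);
		return 1;
	}
	if (error == 1)
		printf("use block %d as the first block of the log\n", blk);
	else
		printf("log is completely written\n");
	return 0;
}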
@@ -1112,7 +1112,7 @@ xlog_find_zeroed(
1112 /* check totally zeroed log */ 1112 /* check totally zeroed log */
1113 bp = xlog_get_bp(log, 1); 1113 bp = xlog_get_bp(log, 1);
1114 if (!bp) 1114 if (!bp)
1115 return ENOMEM; 1115 return -ENOMEM;
1116 error = xlog_bread(log, 0, 1, bp, &offset); 1116 error = xlog_bread(log, 0, 1, bp, &offset);
1117 if (error) 1117 if (error)
1118 goto bp_err; 1118 goto bp_err;
@@ -1121,7 +1121,7 @@ xlog_find_zeroed(
1121 if (first_cycle == 0) { /* completely zeroed log */ 1121 if (first_cycle == 0) { /* completely zeroed log */
1122 *blk_no = 0; 1122 *blk_no = 0;
1123 xlog_put_bp(bp); 1123 xlog_put_bp(bp);
1124 return -1; 1124 return 1;
1125 } 1125 }
1126 1126
1127 /* check partially zeroed log */ 1127 /* check partially zeroed log */
@@ -1141,7 +1141,7 @@ xlog_find_zeroed(
1141 */ 1141 */
1142 xfs_warn(log->l_mp, 1142 xfs_warn(log->l_mp,
1143 "Log inconsistent or not a log (last==0, first!=1)"); 1143 "Log inconsistent or not a log (last==0, first!=1)");
1144 error = XFS_ERROR(EINVAL); 1144 error = -EINVAL;
1145 goto bp_err; 1145 goto bp_err;
1146 } 1146 }
1147 1147
@@ -1179,19 +1179,18 @@ xlog_find_zeroed(
1179 * Potentially backup over partial log record write. We don't need 1179 * Potentially backup over partial log record write. We don't need
1180 * to search the end of the log because we know it is zero. 1180 * to search the end of the log because we know it is zero.
1181 */ 1181 */
1182 if ((error = xlog_find_verify_log_record(log, start_blk, 1182 error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1183 &last_blk, 0)) == -1) { 1183 if (error == 1)
1184 error = XFS_ERROR(EIO); 1184 error = -EIO;
1185 goto bp_err; 1185 if (error)
1186 } else if (error) 1186 goto bp_err;
1187 goto bp_err;
1188 1187
1189 *blk_no = last_blk; 1188 *blk_no = last_blk;
1190bp_err: 1189bp_err:
1191 xlog_put_bp(bp); 1190 xlog_put_bp(bp);
1192 if (error) 1191 if (error)
1193 return error; 1192 return error;
1194 return -1; 1193 return 1;
1195} 1194}
1196 1195
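
The sentinel flip in xlog_find_zeroed() matters because of the convention change: with errors now negative, a positive 1 can unambiguously mean "use *blk_no as the first block of the log", whereas the old -1 would collide with the new errno space. A hedged sketch of a caller dispatching on the tri-state result (stub names only):

    #include <stdio.h>

    /*
     * Stub modelling the new xlog_find_zeroed() contract:
     * 0 => log completely written, 1 => use *blk_no, < 0 => error.
     */
    static int find_zeroed(int *blk_no)
    {
        *blk_no = 42;
        return 1;
    }

    int main(void)
    {
        int blk_no = 0;
        int ret = find_zeroed(&blk_no);

        if (ret < 0) {
            fprintf(stderr, "I/O error %d\n", ret);
            return 1;
        }
        if (ret == 1)
            printf("log starts at block %d\n", blk_no);
        else
            printf("log is completely written\n");
        return 0;
    }
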
1197/* 1196/*
@@ -1251,7 +1250,7 @@ xlog_write_log_records(
1251 while (!(bp = xlog_get_bp(log, bufblks))) { 1250 while (!(bp = xlog_get_bp(log, bufblks))) {
1252 bufblks >>= 1; 1251 bufblks >>= 1;
1253 if (bufblks < sectbb) 1252 if (bufblks < sectbb)
1254 return ENOMEM; 1253 return -ENOMEM;
1255 } 1254 }
1256 1255
1257 /* We may need to do a read at the start to fill in part of 1256 /* We may need to do a read at the start to fill in part of
@@ -1354,7 +1353,7 @@ xlog_clear_stale_blocks(
1354 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1353 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1355 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1354 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1356 XFS_ERRLEVEL_LOW, log->l_mp); 1355 XFS_ERRLEVEL_LOW, log->l_mp);
1357 return XFS_ERROR(EFSCORRUPTED); 1356 return -EFSCORRUPTED;
1358 } 1357 }
1359 tail_distance = tail_block + (log->l_logBBsize - head_block); 1358 tail_distance = tail_block + (log->l_logBBsize - head_block);
1360 } else { 1359 } else {
@@ -1366,7 +1365,7 @@ xlog_clear_stale_blocks(
1366 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1365 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1367 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1366 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1368 XFS_ERRLEVEL_LOW, log->l_mp); 1367 XFS_ERRLEVEL_LOW, log->l_mp);
1369 return XFS_ERROR(EFSCORRUPTED); 1368 return -EFSCORRUPTED;
1370 } 1369 }
1371 tail_distance = tail_block - head_block; 1370 tail_distance = tail_block - head_block;
1372 } 1371 }
@@ -1444,160 +1443,6 @@ xlog_clear_stale_blocks(
1444 ****************************************************************************** 1443 ******************************************************************************
1445 */ 1444 */
1446 1445
1447STATIC xlog_recover_t *
1448xlog_recover_find_tid(
1449 struct hlist_head *head,
1450 xlog_tid_t tid)
1451{
1452 xlog_recover_t *trans;
1453
1454 hlist_for_each_entry(trans, head, r_list) {
1455 if (trans->r_log_tid == tid)
1456 return trans;
1457 }
1458 return NULL;
1459}
1460
1461STATIC void
1462xlog_recover_new_tid(
1463 struct hlist_head *head,
1464 xlog_tid_t tid,
1465 xfs_lsn_t lsn)
1466{
1467 xlog_recover_t *trans;
1468
1469 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1470 trans->r_log_tid = tid;
1471 trans->r_lsn = lsn;
1472 INIT_LIST_HEAD(&trans->r_itemq);
1473
1474 INIT_HLIST_NODE(&trans->r_list);
1475 hlist_add_head(&trans->r_list, head);
1476}
1477
1478STATIC void
1479xlog_recover_add_item(
1480 struct list_head *head)
1481{
1482 xlog_recover_item_t *item;
1483
1484 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1485 INIT_LIST_HEAD(&item->ri_list);
1486 list_add_tail(&item->ri_list, head);
1487}
1488
1489STATIC int
1490xlog_recover_add_to_cont_trans(
1491 struct xlog *log,
1492 struct xlog_recover *trans,
1493 xfs_caddr_t dp,
1494 int len)
1495{
1496 xlog_recover_item_t *item;
1497 xfs_caddr_t ptr, old_ptr;
1498 int old_len;
1499
1500 if (list_empty(&trans->r_itemq)) {
1501 /* finish copying rest of trans header */
1502 xlog_recover_add_item(&trans->r_itemq);
1503 ptr = (xfs_caddr_t) &trans->r_theader +
1504 sizeof(xfs_trans_header_t) - len;
1505 memcpy(ptr, dp, len); /* d, s, l */
1506 return 0;
1507 }
1508 /* take the tail entry */
1509 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1510
1511 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1512 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1513
1514 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1515 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1516 item->ri_buf[item->ri_cnt-1].i_len += len;
1517 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1518 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1519 return 0;
1520}
1521
1522/*
1523 * The next region to add is the start of a new region. It could be
1524 * a whole region or it could be the first part of a new region. Because
1525 * of this, the assumption here is that the type and size fields of all
1526 * format structures fit into the first 32 bits of the structure.
1527 *
1528 * This works because all regions must be 32 bit aligned. Therefore, we
1529 * either have both fields or we have neither field. In the case we have
1530 * neither field, the data part of the region is zero length. We only have
1531 * a log_op_header and can throw away the header since a new one will appear
1532 * later. If we have at least 4 bytes, then we can determine how many regions
1533 * will appear in the current log item.
1534 */
1535STATIC int
1536xlog_recover_add_to_trans(
1537 struct xlog *log,
1538 struct xlog_recover *trans,
1539 xfs_caddr_t dp,
1540 int len)
1541{
1542 xfs_inode_log_format_t *in_f; /* any will do */
1543 xlog_recover_item_t *item;
1544 xfs_caddr_t ptr;
1545
1546 if (!len)
1547 return 0;
1548 if (list_empty(&trans->r_itemq)) {
1549 /* we need to catch log corruptions here */
1550 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1551 xfs_warn(log->l_mp, "%s: bad header magic number",
1552 __func__);
1553 ASSERT(0);
1554 return XFS_ERROR(EIO);
1555 }
1556 if (len == sizeof(xfs_trans_header_t))
1557 xlog_recover_add_item(&trans->r_itemq);
1558 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1559 return 0;
1560 }
1561
1562 ptr = kmem_alloc(len, KM_SLEEP);
1563 memcpy(ptr, dp, len);
1564 in_f = (xfs_inode_log_format_t *)ptr;
1565
1566 /* take the tail entry */
1567 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1568 if (item->ri_total != 0 &&
1569 item->ri_total == item->ri_cnt) {
1570 /* tail item is in use, get a new one */
1571 xlog_recover_add_item(&trans->r_itemq);
1572 item = list_entry(trans->r_itemq.prev,
1573 xlog_recover_item_t, ri_list);
1574 }
1575
1576 if (item->ri_total == 0) { /* first region to be added */
1577 if (in_f->ilf_size == 0 ||
1578 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1579 xfs_warn(log->l_mp,
1580 "bad number of regions (%d) in inode log format",
1581 in_f->ilf_size);
1582 ASSERT(0);
1583 kmem_free(ptr);
1584 return XFS_ERROR(EIO);
1585 }
1586
1587 item->ri_total = in_f->ilf_size;
1588 item->ri_buf =
1589 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1590 KM_SLEEP);
1591 }
1592 ASSERT(item->ri_total > item->ri_cnt);
1593 /* Description region is ri_buf[0] */
1594 item->ri_buf[item->ri_cnt].i_addr = ptr;
1595 item->ri_buf[item->ri_cnt].i_len = len;
1596 item->ri_cnt++;
1597 trace_xfs_log_recover_item_add(log, trans, item, 0);
1598 return 0;
1599}
1600
1601/* 1446/*
1602 * Sort the log items in the transaction. 1447 * Sort the log items in the transaction.
1603 * 1448 *
@@ -1702,7 +1547,7 @@ xlog_recover_reorder_trans(
1702 */ 1547 */
1703 if (!list_empty(&sort_list)) 1548 if (!list_empty(&sort_list))
1704 list_splice_init(&sort_list, &trans->r_itemq); 1549 list_splice_init(&sort_list, &trans->r_itemq);
1705 error = XFS_ERROR(EIO); 1550 error = -EIO;
1706 goto out; 1551 goto out;
1707 } 1552 }
1708 } 1553 }
@@ -1943,7 +1788,7 @@ xlog_recover_do_inode_buffer(
1943 item, bp); 1788 item, bp);
1944 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1789 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1945 XFS_ERRLEVEL_LOW, mp); 1790 XFS_ERRLEVEL_LOW, mp);
1946 return XFS_ERROR(EFSCORRUPTED); 1791 return -EFSCORRUPTED;
1947 } 1792 }
1948 1793
1949 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1794 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
@@ -2125,6 +1970,17 @@ xlog_recover_validate_buf_type(
2125 __uint16_t magic16; 1970 __uint16_t magic16;
2126 __uint16_t magicda; 1971 __uint16_t magicda;
2127 1972
1973 /*
1974 * We can only do post recovery validation on items on CRC enabled
 1975 * filesystems as we need to know when the buffer was written to be able
1976 * to determine if we should have replayed the item. If we replay old
1977 * metadata over a newer buffer, then it will enter a temporarily
1978 * inconsistent state resulting in verification failures. Hence for now
1979 * just avoid the verification stage for non-crc filesystems
1980 */
1981 if (!xfs_sb_version_hascrc(&mp->m_sb))
1982 return;
1983
2128 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); 1984 magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2129 magic16 = be16_to_cpu(*(__be16*)bp->b_addr); 1985 magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2130 magicda = be16_to_cpu(info->magic); 1986 magicda = be16_to_cpu(info->magic);
@@ -2162,8 +2018,6 @@ xlog_recover_validate_buf_type(
2162 bp->b_ops = &xfs_agf_buf_ops; 2018 bp->b_ops = &xfs_agf_buf_ops;
2163 break; 2019 break;
2164 case XFS_BLFT_AGFL_BUF: 2020 case XFS_BLFT_AGFL_BUF:
2165 if (!xfs_sb_version_hascrc(&mp->m_sb))
2166 break;
2167 if (magic32 != XFS_AGFL_MAGIC) { 2021 if (magic32 != XFS_AGFL_MAGIC) {
2168 xfs_warn(mp, "Bad AGFL block magic!"); 2022 xfs_warn(mp, "Bad AGFL block magic!");
2169 ASSERT(0); 2023 ASSERT(0);
@@ -2196,10 +2050,6 @@ xlog_recover_validate_buf_type(
2196#endif 2050#endif
2197 break; 2051 break;
2198 case XFS_BLFT_DINO_BUF: 2052 case XFS_BLFT_DINO_BUF:
2199 /*
2200 * we get here with inode allocation buffers, not buffers that
2201 * track unlinked list changes.
2202 */
2203 if (magic16 != XFS_DINODE_MAGIC) { 2053 if (magic16 != XFS_DINODE_MAGIC) {
2204 xfs_warn(mp, "Bad INODE block magic!"); 2054 xfs_warn(mp, "Bad INODE block magic!");
2205 ASSERT(0); 2055 ASSERT(0);
@@ -2279,8 +2129,6 @@ xlog_recover_validate_buf_type(
2279 bp->b_ops = &xfs_attr3_leaf_buf_ops; 2129 bp->b_ops = &xfs_attr3_leaf_buf_ops;
2280 break; 2130 break;
2281 case XFS_BLFT_ATTR_RMT_BUF: 2131 case XFS_BLFT_ATTR_RMT_BUF:
2282 if (!xfs_sb_version_hascrc(&mp->m_sb))
2283 break;
2284 if (magic32 != XFS_ATTR3_RMT_MAGIC) { 2132 if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2285 xfs_warn(mp, "Bad attr remote magic!"); 2133 xfs_warn(mp, "Bad attr remote magic!");
2286 ASSERT(0); 2134 ASSERT(0);
@@ -2387,16 +2235,7 @@ xlog_recover_do_reg_buffer(
2387 /* Shouldn't be any more regions */ 2235 /* Shouldn't be any more regions */
2388 ASSERT(i == item->ri_total); 2236 ASSERT(i == item->ri_total);
2389 2237
2390 /* 2238 xlog_recover_validate_buf_type(mp, bp, buf_f);
2391 * We can only do post recovery validation on items on CRC enabled
2392 * fielsystems as we need to know when the buffer was written to be able
2393 * to determine if we should have replayed the item. If we replay old
2394 * metadata over a newer buffer, then it will enter a temporarily
2395 * inconsistent state resulting in verification failures. Hence for now
2396 * just avoid the verification stage for non-crc filesystems
2397 */
2398 if (xfs_sb_version_hascrc(&mp->m_sb))
2399 xlog_recover_validate_buf_type(mp, bp, buf_f);
2400} 2239}
2401 2240
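
With the feature check hoisted to the top of xlog_recover_validate_buf_type() (earlier hunk), the per-case xfs_sb_version_hascrc() tests and the guard at this call site all become redundant, which is what the surrounding deletions implement. A sketch of the hoist with invented names:

    #include <stdbool.h>

    struct sb { bool has_crc; };

    /* One early return replaces a test at the call site and in
     * several switch cases. */
    static void validate_buf_type(const struct sb *sb, unsigned int type)
    {
        if (!sb->has_crc)
            return;     /* validation needs buffer write LSNs */

        switch (type) {
        /* ... per-type magic number checks, no feature tests ... */
        default:
            break;
        }
    }
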
2402/* 2241/*
@@ -2404,8 +2243,11 @@ xlog_recover_do_reg_buffer(
2404 * Simple algorithm: if we have found a QUOTAOFF log item of the same type 2243 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2405 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2244 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2406 * Else, treat it as a regular buffer and do recovery. 2245 * Else, treat it as a regular buffer and do recovery.
2246 *
2247 * Return false if the buffer was tossed and true if we recovered the buffer to
2248 * indicate to the caller if the buffer needs writing.
2407 */ 2249 */
2408STATIC void 2250STATIC bool
2409xlog_recover_do_dquot_buffer( 2251xlog_recover_do_dquot_buffer(
2410 struct xfs_mount *mp, 2252 struct xfs_mount *mp,
2411 struct xlog *log, 2253 struct xlog *log,
@@ -2420,9 +2262,8 @@ xlog_recover_do_dquot_buffer(
2420 /* 2262 /*
2421 * Filesystems are required to send in quota flags at mount time. 2263 * Filesystems are required to send in quota flags at mount time.
2422 */ 2264 */
2423 if (mp->m_qflags == 0) { 2265 if (!mp->m_qflags)
2424 return; 2266 return false;
2425 }
2426 2267
2427 type = 0; 2268 type = 0;
2428 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) 2269 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
@@ -2435,9 +2276,10 @@ xlog_recover_do_dquot_buffer(
2435 * This type of quotas was turned off, so ignore this buffer 2276 * This type of quotas was turned off, so ignore this buffer
2436 */ 2277 */
2437 if (log->l_quotaoffs_flag & type) 2278 if (log->l_quotaoffs_flag & type)
2438 return; 2279 return false;
2439 2280
2440 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2281 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2282 return true;
2441} 2283}
2442 2284
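
The new bool return closes an information gap: the caller could not previously tell whether the dquot buffer was recovered or tossed, so it always queued a write. A small standalone sketch of the shape (names invented):

    #include <stdbool.h>
    #include <stdio.h>

    /* True only when the buffer contents were actually replayed. */
    static bool recover_dquot_buffer(unsigned int qflags,
                                     unsigned int quotaoffs,
                                     unsigned int type)
    {
        if (!qflags)            /* quotas never enabled */
            return false;
        if (quotaoffs & type)   /* this quota type was turned off */
            return false;
        /* ... replay the logged regions into the buffer ... */
        return true;
    }

    int main(void)
    {
        if (recover_dquot_buffer(0x1, 0x0, 0x1))
            printf("buffer dirtied: queue delayed write\n");
        else
            printf("buffer tossed: skip the write\n");
        return 0;
    }
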
2443/* 2285/*
@@ -2496,7 +2338,7 @@ xlog_recover_buffer_pass2(
2496 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, 2338 bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2497 buf_flags, NULL); 2339 buf_flags, NULL);
2498 if (!bp) 2340 if (!bp)
2499 return XFS_ERROR(ENOMEM); 2341 return -ENOMEM;
2500 error = bp->b_error; 2342 error = bp->b_error;
2501 if (error) { 2343 if (error) {
2502 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); 2344 xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
@@ -2504,23 +2346,44 @@ xlog_recover_buffer_pass2(
2504 } 2346 }
2505 2347
2506 /* 2348 /*
2507 * recover the buffer only if we get an LSN from it and it's less than 2349 * Recover the buffer only if we get an LSN from it and it's less than
2508 * the lsn of the transaction we are replaying. 2350 * the lsn of the transaction we are replaying.
2351 *
2352 * Note that we have to be extremely careful of readahead here.
 2353 * Readahead does not attach verifiers to the buffers, so if we skip
 2354 * replay after readahead because the LSN found in the buffer is more
 2355 * recent than the current transaction, then we need to attach the
 2356 * verifier directly. Failure to do so means that future recovery
 2357 * actions (e.g. EFI and unlinked list recovery) can operate on the
 2358 * buffers without the verifier attached. This
2359 * can lead to blocks on disk having the correct content but a stale
2360 * CRC.
2361 *
2362 * It is safe to assume these clean buffers are currently up to date.
2363 * If the buffer is dirtied by a later transaction being replayed, then
2364 * the verifier will be reset to match whatever recover turns that
2365 * buffer into.
2509 */ 2366 */
2510 lsn = xlog_recover_get_buf_lsn(mp, bp); 2367 lsn = xlog_recover_get_buf_lsn(mp, bp);
2511 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) 2368 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2369 xlog_recover_validate_buf_type(mp, bp, buf_f);
2512 goto out_release; 2370 goto out_release;
2371 }
2513 2372
2514 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { 2373 if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2515 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2374 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2375 if (error)
2376 goto out_release;
2516 } else if (buf_f->blf_flags & 2377 } else if (buf_f->blf_flags &
2517 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 2378 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2518 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2379 bool dirty;
2380
2381 dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2382 if (!dirty)
2383 goto out_release;
2519 } else { 2384 } else {
2520 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2385 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2521 } 2386 }
2522 if (error)
2523 goto out_release;
2524 2387
2525 /* 2388 /*
2526 * Perform delayed write on the buffer. Asynchronous writes will be 2389 * Perform delayed write on the buffer. Asynchronous writes will be
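
The readahead caveat in the comment above is why the LSN-skip branch now attaches the verifier before releasing the buffer instead of just jumping to out_release. A hedged sketch of that decision; the struct and ops here are invented for illustration:

    #include <stdbool.h>
    #include <stdint.h>

    struct buf_ops { const char *name; };
    struct buf { const struct buf_ops *ops; };

    static const struct buf_ops by_type_ops = { "by-type verifier" };

    /*
     * Skip replay when the on-disk LSN is at or beyond the transaction
     * being replayed, but attach the verifier first: readahead brought
     * the buffer in without one, and later recovery stages reuse it.
     */
    static bool skip_replay(struct buf *bp, int64_t buf_lsn, int64_t tx_lsn)
    {
        if (buf_lsn >= tx_lsn) {
            bp->ops = &by_type_ops;
            return true;    /* caller just releases the buffer */
        }
        return false;       /* caller goes on to replay into it */
    }
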
@@ -2598,7 +2461,7 @@ xfs_recover_inode_owner_change(
2598 2461
2599 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 2462 ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2600 if (!ip) 2463 if (!ip)
2601 return ENOMEM; 2464 return -ENOMEM;
2602 2465
2603 /* instantiate the inode */ 2466 /* instantiate the inode */
2604 xfs_dinode_from_disk(&ip->i_d, dip); 2467 xfs_dinode_from_disk(&ip->i_d, dip);
@@ -2676,7 +2539,7 @@ xlog_recover_inode_pass2(
2676 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, 2539 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2677 &xfs_inode_buf_ops); 2540 &xfs_inode_buf_ops);
2678 if (!bp) { 2541 if (!bp) {
2679 error = ENOMEM; 2542 error = -ENOMEM;
2680 goto error; 2543 goto error;
2681 } 2544 }
2682 error = bp->b_error; 2545 error = bp->b_error;
@@ -2697,7 +2560,7 @@ xlog_recover_inode_pass2(
2697 __func__, dip, bp, in_f->ilf_ino); 2560 __func__, dip, bp, in_f->ilf_ino);
2698 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2561 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2699 XFS_ERRLEVEL_LOW, mp); 2562 XFS_ERRLEVEL_LOW, mp);
2700 error = EFSCORRUPTED; 2563 error = -EFSCORRUPTED;
2701 goto out_release; 2564 goto out_release;
2702 } 2565 }
2703 dicp = item->ri_buf[1].i_addr; 2566 dicp = item->ri_buf[1].i_addr;
@@ -2707,7 +2570,7 @@ xlog_recover_inode_pass2(
2707 __func__, item, in_f->ilf_ino); 2570 __func__, item, in_f->ilf_ino);
2708 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2571 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2709 XFS_ERRLEVEL_LOW, mp); 2572 XFS_ERRLEVEL_LOW, mp);
2710 error = EFSCORRUPTED; 2573 error = -EFSCORRUPTED;
2711 goto out_release; 2574 goto out_release;
2712 } 2575 }
2713 2576
@@ -2764,7 +2627,7 @@ xlog_recover_inode_pass2(
2764 "%s: Bad regular inode log record, rec ptr 0x%p, " 2627 "%s: Bad regular inode log record, rec ptr 0x%p, "
2765 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2628 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2766 __func__, item, dip, bp, in_f->ilf_ino); 2629 __func__, item, dip, bp, in_f->ilf_ino);
2767 error = EFSCORRUPTED; 2630 error = -EFSCORRUPTED;
2768 goto out_release; 2631 goto out_release;
2769 } 2632 }
2770 } else if (unlikely(S_ISDIR(dicp->di_mode))) { 2633 } else if (unlikely(S_ISDIR(dicp->di_mode))) {
@@ -2777,7 +2640,7 @@ xlog_recover_inode_pass2(
2777 "%s: Bad dir inode log record, rec ptr 0x%p, " 2640 "%s: Bad dir inode log record, rec ptr 0x%p, "
2778 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2641 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2779 __func__, item, dip, bp, in_f->ilf_ino); 2642 __func__, item, dip, bp, in_f->ilf_ino);
2780 error = EFSCORRUPTED; 2643 error = -EFSCORRUPTED;
2781 goto out_release; 2644 goto out_release;
2782 } 2645 }
2783 } 2646 }
@@ -2790,7 +2653,7 @@ xlog_recover_inode_pass2(
2790 __func__, item, dip, bp, in_f->ilf_ino, 2653 __func__, item, dip, bp, in_f->ilf_ino,
2791 dicp->di_nextents + dicp->di_anextents, 2654 dicp->di_nextents + dicp->di_anextents,
2792 dicp->di_nblocks); 2655 dicp->di_nblocks);
2793 error = EFSCORRUPTED; 2656 error = -EFSCORRUPTED;
2794 goto out_release; 2657 goto out_release;
2795 } 2658 }
2796 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2659 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
@@ -2800,7 +2663,7 @@ xlog_recover_inode_pass2(
2800 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " 2663 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2801 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, 2664 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2802 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2665 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2803 error = EFSCORRUPTED; 2666 error = -EFSCORRUPTED;
2804 goto out_release; 2667 goto out_release;
2805 } 2668 }
2806 isize = xfs_icdinode_size(dicp->di_version); 2669 isize = xfs_icdinode_size(dicp->di_version);
@@ -2810,7 +2673,7 @@ xlog_recover_inode_pass2(
2810 xfs_alert(mp, 2673 xfs_alert(mp,
2811 "%s: Bad inode log record length %d, rec ptr 0x%p", 2674 "%s: Bad inode log record length %d, rec ptr 0x%p",
2812 __func__, item->ri_buf[1].i_len, item); 2675 __func__, item->ri_buf[1].i_len, item);
2813 error = EFSCORRUPTED; 2676 error = -EFSCORRUPTED;
2814 goto out_release; 2677 goto out_release;
2815 } 2678 }
2816 2679
@@ -2898,7 +2761,7 @@ xlog_recover_inode_pass2(
2898 default: 2761 default:
2899 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 2762 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2900 ASSERT(0); 2763 ASSERT(0);
2901 error = EIO; 2764 error = -EIO;
2902 goto out_release; 2765 goto out_release;
2903 } 2766 }
2904 } 2767 }
@@ -2919,7 +2782,7 @@ out_release:
2919error: 2782error:
2920 if (need_free) 2783 if (need_free)
2921 kmem_free(in_f); 2784 kmem_free(in_f);
2922 return XFS_ERROR(error); 2785 return error;
2923} 2786}
2924 2787
2925/* 2788/*
@@ -2946,7 +2809,7 @@ xlog_recover_quotaoff_pass1(
2946 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2809 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2947 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2810 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2948 2811
2949 return (0); 2812 return 0;
2950} 2813}
2951 2814
2952/* 2815/*
@@ -2971,17 +2834,17 @@ xlog_recover_dquot_pass2(
2971 * Filesystems are required to send in quota flags at mount time. 2834 * Filesystems are required to send in quota flags at mount time.
2972 */ 2835 */
2973 if (mp->m_qflags == 0) 2836 if (mp->m_qflags == 0)
2974 return (0); 2837 return 0;
2975 2838
2976 recddq = item->ri_buf[1].i_addr; 2839 recddq = item->ri_buf[1].i_addr;
2977 if (recddq == NULL) { 2840 if (recddq == NULL) {
2978 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); 2841 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2979 return XFS_ERROR(EIO); 2842 return -EIO;
2980 } 2843 }
2981 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2844 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2982 xfs_alert(log->l_mp, "dquot too small (%d) in %s.", 2845 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2983 item->ri_buf[1].i_len, __func__); 2846 item->ri_buf[1].i_len, __func__);
2984 return XFS_ERROR(EIO); 2847 return -EIO;
2985 } 2848 }
2986 2849
2987 /* 2850 /*
@@ -2990,7 +2853,7 @@ xlog_recover_dquot_pass2(
2990 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 2853 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2991 ASSERT(type); 2854 ASSERT(type);
2992 if (log->l_quotaoffs_flag & type) 2855 if (log->l_quotaoffs_flag & type)
2993 return (0); 2856 return 0;
2994 2857
2995 /* 2858 /*
2996 * At this point we know that quota was _not_ turned off. 2859 * At this point we know that quota was _not_ turned off.
@@ -3007,12 +2870,19 @@ xlog_recover_dquot_pass2(
3007 error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2870 error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3008 "xlog_recover_dquot_pass2 (log copy)"); 2871 "xlog_recover_dquot_pass2 (log copy)");
3009 if (error) 2872 if (error)
3010 return XFS_ERROR(EIO); 2873 return -EIO;
3011 ASSERT(dq_f->qlf_len == 1); 2874 ASSERT(dq_f->qlf_len == 1);
3012 2875
2876 /*
2877 * At this point we are assuming that the dquots have been allocated
2878 * and hence the buffer has valid dquots stamped in it. It should,
 2879 * therefore, pass verifier validation. If the dquot is bad, then
 2880 * we'll return an error here, so we don't need to specifically check
2881 * the dquot in the buffer after the verifier has run.
2882 */
3013 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, 2883 error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3014 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, 2884 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3015 NULL); 2885 &xfs_dquot_buf_ops);
3016 if (error) 2886 if (error)
3017 return error; 2887 return error;
3018 2888
@@ -3020,18 +2890,6 @@ xlog_recover_dquot_pass2(
3020 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2890 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
3021 2891
3022 /* 2892 /*
3023 * At least the magic num portion should be on disk because this
3024 * was among a chunk of dquots created earlier, and we did some
3025 * minimal initialization then.
3026 */
3027 error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3028 "xlog_recover_dquot_pass2");
3029 if (error) {
3030 xfs_buf_relse(bp);
3031 return XFS_ERROR(EIO);
3032 }
3033
3034 /*
3035 * If the dquot has an LSN in it, recover the dquot only if it's less 2893 * If the dquot has an LSN in it, recover the dquot only if it's less
3036 * than the lsn of the transaction we are replaying. 2894 * than the lsn of the transaction we are replaying.
3037 */ 2895 */
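
The two dquot hunks above are one change: reading with &xfs_dquot_buf_ops validates every dquot as the buffer comes off disk, which is what lets the hand-rolled xfs_dqcheck() pass be deleted. A sketch of the moved check with invented helpers (-EIO stands in for the kernel's -EFSCORRUPTED):

    #include <errno.h>
    #include <stddef.h>

    struct buf_ops { int (*verify_read)(const void *data); };
    struct buf { const void *data; };

    /* Invented read helper: runs the verifier, if any, after the I/O. */
    static int read_buf(struct buf *bp, const struct buf_ops *ops)
    {
        /* ... submit I/O and fill bp->data here ... */
        if (ops && ops->verify_read)
            return ops->verify_read(bp->data);
        return 0;
    }

    static int dquot_verify(const void *data)
    {
        return data ? 0 : -EIO;     /* kernel: -EFSCORRUPTED */
    }

    static const struct buf_ops dquot_buf_ops = { dquot_verify };

    /* New-style caller: no separate dqcheck() pass after the read. */
    static int recover_dquot(struct buf *bp)
    {
        return read_buf(bp, &dquot_buf_ops);
    }
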
@@ -3178,38 +3036,38 @@ xlog_recover_do_icreate_pass2(
3178 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; 3036 icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3179 if (icl->icl_type != XFS_LI_ICREATE) { 3037 if (icl->icl_type != XFS_LI_ICREATE) {
3180 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); 3038 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3181 return EINVAL; 3039 return -EINVAL;
3182 } 3040 }
3183 3041
3184 if (icl->icl_size != 1) { 3042 if (icl->icl_size != 1) {
3185 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); 3043 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3186 return EINVAL; 3044 return -EINVAL;
3187 } 3045 }
3188 3046
3189 agno = be32_to_cpu(icl->icl_ag); 3047 agno = be32_to_cpu(icl->icl_ag);
3190 if (agno >= mp->m_sb.sb_agcount) { 3048 if (agno >= mp->m_sb.sb_agcount) {
3191 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); 3049 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3192 return EINVAL; 3050 return -EINVAL;
3193 } 3051 }
3194 agbno = be32_to_cpu(icl->icl_agbno); 3052 agbno = be32_to_cpu(icl->icl_agbno);
3195 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { 3053 if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3196 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); 3054 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3197 return EINVAL; 3055 return -EINVAL;
3198 } 3056 }
3199 isize = be32_to_cpu(icl->icl_isize); 3057 isize = be32_to_cpu(icl->icl_isize);
3200 if (isize != mp->m_sb.sb_inodesize) { 3058 if (isize != mp->m_sb.sb_inodesize) {
3201 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); 3059 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3202 return EINVAL; 3060 return -EINVAL;
3203 } 3061 }
3204 count = be32_to_cpu(icl->icl_count); 3062 count = be32_to_cpu(icl->icl_count);
3205 if (!count) { 3063 if (!count) {
3206 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); 3064 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3207 return EINVAL; 3065 return -EINVAL;
3208 } 3066 }
3209 length = be32_to_cpu(icl->icl_length); 3067 length = be32_to_cpu(icl->icl_length);
3210 if (!length || length >= mp->m_sb.sb_agblocks) { 3068 if (!length || length >= mp->m_sb.sb_agblocks) {
3211 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); 3069 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3212 return EINVAL; 3070 return -EINVAL;
3213 } 3071 }
3214 3072
3215 /* existing allocation is fixed value */ 3073 /* existing allocation is fixed value */
@@ -3218,7 +3076,7 @@ xlog_recover_do_icreate_pass2(
3218 if (count != mp->m_ialloc_inos || 3076 if (count != mp->m_ialloc_inos ||
3219 length != mp->m_ialloc_blks) { 3077 length != mp->m_ialloc_blks) {
3220 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); 3078 xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
3221 return EINVAL; 3079 return -EINVAL;
3222 } 3080 }
3223 3081
3224 /* 3082 /*
@@ -3240,31 +3098,6 @@ xlog_recover_do_icreate_pass2(
3240 return 0; 3098 return 0;
3241} 3099}
3242 3100
3243/*
3244 * Free up any resources allocated by the transaction
3245 *
3246 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3247 */
3248STATIC void
3249xlog_recover_free_trans(
3250 struct xlog_recover *trans)
3251{
3252 xlog_recover_item_t *item, *n;
3253 int i;
3254
3255 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3256 /* Free the regions in the item. */
3257 list_del(&item->ri_list);
3258 for (i = 0; i < item->ri_cnt; i++)
3259 kmem_free(item->ri_buf[i].i_addr);
3260 /* Free the item itself */
3261 kmem_free(item->ri_buf);
3262 kmem_free(item);
3263 }
3264 /* Free the transaction recover structure */
3265 kmem_free(trans);
3266}
3267
3268STATIC void 3101STATIC void
3269xlog_recover_buffer_ra_pass2( 3102xlog_recover_buffer_ra_pass2(
3270 struct xlog *log, 3103 struct xlog *log,
@@ -3389,7 +3222,7 @@ xlog_recover_commit_pass1(
3389 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3222 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3390 __func__, ITEM_TYPE(item)); 3223 __func__, ITEM_TYPE(item));
3391 ASSERT(0); 3224 ASSERT(0);
3392 return XFS_ERROR(EIO); 3225 return -EIO;
3393 } 3226 }
3394} 3227}
3395 3228
@@ -3425,7 +3258,7 @@ xlog_recover_commit_pass2(
3425 xfs_warn(log->l_mp, "%s: invalid item type (%d)", 3258 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3426 __func__, ITEM_TYPE(item)); 3259 __func__, ITEM_TYPE(item));
3427 ASSERT(0); 3260 ASSERT(0);
3428 return XFS_ERROR(EIO); 3261 return -EIO;
3429 } 3262 }
3430} 3263}
3431 3264
@@ -3514,22 +3347,309 @@ out:
3514 if (!list_empty(&done_list)) 3347 if (!list_empty(&done_list))
3515 list_splice_init(&done_list, &trans->r_itemq); 3348 list_splice_init(&done_list, &trans->r_itemq);
3516 3349
3517 xlog_recover_free_trans(trans);
3518
3519 error2 = xfs_buf_delwri_submit(&buffer_list); 3350 error2 = xfs_buf_delwri_submit(&buffer_list);
3520 return error ? error : error2; 3351 return error ? error : error2;
3521} 3352}
3522 3353
3354STATIC void
3355xlog_recover_add_item(
3356 struct list_head *head)
3357{
3358 xlog_recover_item_t *item;
3359
3360 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
3361 INIT_LIST_HEAD(&item->ri_list);
3362 list_add_tail(&item->ri_list, head);
3363}
3364
3523STATIC int 3365STATIC int
3524xlog_recover_unmount_trans( 3366xlog_recover_add_to_cont_trans(
3525 struct xlog *log) 3367 struct xlog *log,
3368 struct xlog_recover *trans,
3369 xfs_caddr_t dp,
3370 int len)
3526{ 3371{
3527 /* Do nothing now */ 3372 xlog_recover_item_t *item;
3528 xfs_warn(log->l_mp, "%s: Unmount LR", __func__); 3373 xfs_caddr_t ptr, old_ptr;
3374 int old_len;
3375
3376 if (list_empty(&trans->r_itemq)) {
3377 /* finish copying rest of trans header */
3378 xlog_recover_add_item(&trans->r_itemq);
3379 ptr = (xfs_caddr_t) &trans->r_theader +
3380 sizeof(xfs_trans_header_t) - len;
3381 memcpy(ptr, dp, len);
3382 return 0;
3383 }
3384 /* take the tail entry */
3385 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3386
3387 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
3388 old_len = item->ri_buf[item->ri_cnt-1].i_len;
3389
3390 ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
3391 memcpy(&ptr[old_len], dp, len);
3392 item->ri_buf[item->ri_cnt-1].i_len += len;
3393 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
3394 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
3395 return 0;
3396}
3397
3398/*
3399 * The next region to add is the start of a new region. It could be
3400 * a whole region or it could be the first part of a new region. Because
3401 * of this, the assumption here is that the type and size fields of all
3402 * format structures fit into the first 32 bits of the structure.
3403 *
3404 * This works because all regions must be 32 bit aligned. Therefore, we
3405 * either have both fields or we have neither field. In the case we have
3406 * neither field, the data part of the region is zero length. We only have
3407 * a log_op_header and can throw away the header since a new one will appear
3408 * later. If we have at least 4 bytes, then we can determine how many regions
3409 * will appear in the current log item.
3410 */
3411STATIC int
3412xlog_recover_add_to_trans(
3413 struct xlog *log,
3414 struct xlog_recover *trans,
3415 xfs_caddr_t dp,
3416 int len)
3417{
3418 xfs_inode_log_format_t *in_f; /* any will do */
3419 xlog_recover_item_t *item;
3420 xfs_caddr_t ptr;
3421
3422 if (!len)
3423 return 0;
3424 if (list_empty(&trans->r_itemq)) {
3425 /* we need to catch log corruptions here */
3426 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
3427 xfs_warn(log->l_mp, "%s: bad header magic number",
3428 __func__);
3429 ASSERT(0);
3430 return -EIO;
3431 }
3432 if (len == sizeof(xfs_trans_header_t))
3433 xlog_recover_add_item(&trans->r_itemq);
3434 memcpy(&trans->r_theader, dp, len);
3435 return 0;
3436 }
3437
3438 ptr = kmem_alloc(len, KM_SLEEP);
3439 memcpy(ptr, dp, len);
3440 in_f = (xfs_inode_log_format_t *)ptr;
3441
3442 /* take the tail entry */
3443 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3444 if (item->ri_total != 0 &&
3445 item->ri_total == item->ri_cnt) {
3446 /* tail item is in use, get a new one */
3447 xlog_recover_add_item(&trans->r_itemq);
3448 item = list_entry(trans->r_itemq.prev,
3449 xlog_recover_item_t, ri_list);
3450 }
3451
3452 if (item->ri_total == 0) { /* first region to be added */
3453 if (in_f->ilf_size == 0 ||
3454 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
3455 xfs_warn(log->l_mp,
3456 "bad number of regions (%d) in inode log format",
3457 in_f->ilf_size);
3458 ASSERT(0);
3459 kmem_free(ptr);
3460 return -EIO;
3461 }
3462
3463 item->ri_total = in_f->ilf_size;
3464 item->ri_buf =
3465 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
3466 KM_SLEEP);
3467 }
3468 ASSERT(item->ri_total > item->ri_cnt);
3469 /* Description region is ri_buf[0] */
3470 item->ri_buf[item->ri_cnt].i_addr = ptr;
3471 item->ri_buf[item->ri_cnt].i_len = len;
3472 item->ri_cnt++;
3473 trace_xfs_log_recover_item_add(log, trans, item, 0);
3529 return 0; 3474 return 0;
3530} 3475}
3531 3476
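
The comment above xlog_recover_add_to_trans() leans on a layout invariant: every log item format structure starts with 16-bit type and size fields, so any region of at least four bytes can be sized before its concrete type is known. A sketch of that assumption (the struct is invented; the kernel casts to xfs_inode_log_format_t because "any will do"):

    #include <stdint.h>

    /* Invented header matching the first 32 bits of every log format. */
    struct log_format_head {
        uint16_t type;
        uint16_t size;      /* number of regions in this item */
    };

    static unsigned int peek_region_count(const void *dp, unsigned int len)
    {
        const struct log_format_head *head = dp;

        if (len < sizeof(*head))    /* header-only region: no payload */
            return 0;
        return head->size;
    }
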
3532/* 3477/*
3478 * Free up any resources allocated by the transaction
3479 *
3480 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3481 */
3482STATIC void
3483xlog_recover_free_trans(
3484 struct xlog_recover *trans)
3485{
3486 xlog_recover_item_t *item, *n;
3487 int i;
3488
3489 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3490 /* Free the regions in the item. */
3491 list_del(&item->ri_list);
3492 for (i = 0; i < item->ri_cnt; i++)
3493 kmem_free(item->ri_buf[i].i_addr);
3494 /* Free the item itself */
3495 kmem_free(item->ri_buf);
3496 kmem_free(item);
3497 }
3498 /* Free the transaction recover structure */
3499 kmem_free(trans);
3500}
3501
3502/*
3503 * On error or completion, trans is freed.
3504 */
3505STATIC int
3506xlog_recovery_process_trans(
3507 struct xlog *log,
3508 struct xlog_recover *trans,
3509 xfs_caddr_t dp,
3510 unsigned int len,
3511 unsigned int flags,
3512 int pass)
3513{
3514 int error = 0;
3515 bool freeit = false;
3516
3517 /* mask off ophdr transaction container flags */
3518 flags &= ~XLOG_END_TRANS;
3519 if (flags & XLOG_WAS_CONT_TRANS)
3520 flags &= ~XLOG_CONTINUE_TRANS;
3521
3522 /*
3523 * Callees must not free the trans structure. We'll decide if we need to
 3524 * free it or not based on the operation being done and its result.
3525 */
3526 switch (flags) {
3527 /* expected flag values */
3528 case 0:
3529 case XLOG_CONTINUE_TRANS:
3530 error = xlog_recover_add_to_trans(log, trans, dp, len);
3531 break;
3532 case XLOG_WAS_CONT_TRANS:
3533 error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
3534 break;
3535 case XLOG_COMMIT_TRANS:
3536 error = xlog_recover_commit_trans(log, trans, pass);
3537 /* success or fail, we are now done with this transaction. */
3538 freeit = true;
3539 break;
3540
3541 /* unexpected flag values */
3542 case XLOG_UNMOUNT_TRANS:
3543 /* just skip trans */
3544 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
3545 freeit = true;
3546 break;
3547 case XLOG_START_TRANS:
3548 default:
3549 xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
3550 ASSERT(0);
3551 error = -EIO;
3552 break;
3553 }
3554 if (error || freeit)
3555 xlog_recover_free_trans(trans);
3556 return error;
3557}
3558
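
The lifetime rule in the comment above ("on error or completion, trans is freed") is an ownership transfer: once xlog_recovery_process_trans() returns, the caller must not dereference trans, because commit, unmount and every error path have already freed it. A standalone sketch of the same rule:

    #include <errno.h>
    #include <stdlib.h>

    struct rtrans { int tid; };

    /*
     * The callee frees the transaction on completion and on any error,
     * so the caller treats the pointer as consumed after the call.
     */
    static int process_trans(struct rtrans *t, int commit)
    {
        int error = 0;
        int freeit = 0;

        if (commit)
            freeit = 1;     /* done with this transaction */
        else if (t->tid < 0)
            error = -EIO;   /* bad input aborts recovery */

        if (error || freeit)
            free(t);        /* t is dead to the caller from here on */
        return error;
    }
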
3559/*
3560 * Lookup the transaction recovery structure associated with the ID in the
3561 * current ophdr. If the transaction doesn't exist and the start flag is set in
3562 * the ophdr, then allocate a new transaction for future ID matches to find.
3563 * Either way, return what we found during the lookup - an existing transaction
3564 * or nothing.
3565 */
3566STATIC struct xlog_recover *
3567xlog_recover_ophdr_to_trans(
3568 struct hlist_head rhash[],
3569 struct xlog_rec_header *rhead,
3570 struct xlog_op_header *ohead)
3571{
3572 struct xlog_recover *trans;
3573 xlog_tid_t tid;
3574 struct hlist_head *rhp;
3575
3576 tid = be32_to_cpu(ohead->oh_tid);
3577 rhp = &rhash[XLOG_RHASH(tid)];
3578 hlist_for_each_entry(trans, rhp, r_list) {
3579 if (trans->r_log_tid == tid)
3580 return trans;
3581 }
3582
3583 /*
3584 * skip over non-start transaction headers - we could be
3585 * processing slack space before the next transaction starts
3586 */
3587 if (!(ohead->oh_flags & XLOG_START_TRANS))
3588 return NULL;
3589
3590 ASSERT(be32_to_cpu(ohead->oh_len) == 0);
3591
3592 /*
3593 * This is a new transaction so allocate a new recovery container to
3594 * hold the recovery ops that will follow.
3595 */
3596 trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
3597 trans->r_log_tid = tid;
3598 trans->r_lsn = be64_to_cpu(rhead->h_lsn);
3599 INIT_LIST_HEAD(&trans->r_itemq);
3600 INIT_HLIST_NODE(&trans->r_list);
3601 hlist_add_head(&trans->r_list, rhp);
3602
3603 /*
3604 * Nothing more to do for this ophdr. Items to be added to this new
3605 * transaction will be in subsequent ophdr containers.
3606 */
3607 return NULL;
3608}
3609
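
xlog_recover_ophdr_to_trans() above folds the old find_tid/new_tid pair into one lookup-or-allocate helper, with the twist that a start record allocates a container but still returns NULL, since it carries no payload of its own. A sketch of the pattern on a plain singly linked bucket (all names invented):

    #include <stdlib.h>

    #define START_FLAG 0x01u

    struct trans { unsigned int tid; struct trans *next; };

    static struct trans *ophdr_to_trans(struct trans **bucket,
                                        unsigned int tid, unsigned int flags)
    {
        struct trans *t;

        for (t = *bucket; t; t = t->next)
            if (t->tid == tid)
                return t;           /* ops for a known transaction */

        if (!(flags & START_FLAG))
            return NULL;            /* slack space between transactions */

        t = calloc(1, sizeof(*t));  /* kernel: kmem_zalloc(KM_SLEEP) */
        if (!t)
            return NULL;
        t->tid = tid;
        t->next = *bucket;
        *bucket = t;
        return NULL;                /* the start record itself is empty */
    }
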
3610STATIC int
3611xlog_recover_process_ophdr(
3612 struct xlog *log,
3613 struct hlist_head rhash[],
3614 struct xlog_rec_header *rhead,
3615 struct xlog_op_header *ohead,
3616 xfs_caddr_t dp,
3617 xfs_caddr_t end,
3618 int pass)
3619{
3620 struct xlog_recover *trans;
3621 unsigned int len;
3622
3623 /* Do we understand who wrote this op? */
3624 if (ohead->oh_clientid != XFS_TRANSACTION &&
3625 ohead->oh_clientid != XFS_LOG) {
3626 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
3627 __func__, ohead->oh_clientid);
3628 ASSERT(0);
3629 return -EIO;
3630 }
3631
3632 /*
 3633 * Check that the ophdr contains all the data it is supposed to.
3634 */
3635 len = be32_to_cpu(ohead->oh_len);
3636 if (dp + len > end) {
3637 xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
3638 WARN_ON(1);
3639 return -EIO;
3640 }
3641
3642 trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
3643 if (!trans) {
3644 /* nothing to do, so skip over this ophdr */
3645 return 0;
3646 }
3647
3648 return xlog_recovery_process_trans(log, trans, dp, len,
3649 ohead->oh_flags, pass);
3650}
3651
3652/*
3533 * There are two valid states of the r_state field. 0 indicates that the 3653 * There are two valid states of the r_state field. 0 indicates that the
3534 * transaction structure is in a normal state. We have either seen the 3654 * transaction structure is in a normal state. We have either seen the
3535 * start of the transaction or the last operation we added was not a partial 3655 * start of the transaction or the last operation we added was not a partial
@@ -3546,86 +3666,30 @@ xlog_recover_process_data(
3546 xfs_caddr_t dp, 3666 xfs_caddr_t dp,
3547 int pass) 3667 int pass)
3548{ 3668{
3549 xfs_caddr_t lp; 3669 struct xlog_op_header *ohead;
3670 xfs_caddr_t end;
3550 int num_logops; 3671 int num_logops;
3551 xlog_op_header_t *ohead;
3552 xlog_recover_t *trans;
3553 xlog_tid_t tid;
3554 int error; 3672 int error;
3555 unsigned long hash;
3556 uint flags;
3557 3673
3558 lp = dp + be32_to_cpu(rhead->h_len); 3674 end = dp + be32_to_cpu(rhead->h_len);
3559 num_logops = be32_to_cpu(rhead->h_num_logops); 3675 num_logops = be32_to_cpu(rhead->h_num_logops);
3560 3676
3561 /* check the log format matches our own - else we can't recover */ 3677 /* check the log format matches our own - else we can't recover */
3562 if (xlog_header_check_recover(log->l_mp, rhead)) 3678 if (xlog_header_check_recover(log->l_mp, rhead))
3563 return (XFS_ERROR(EIO)); 3679 return -EIO;
3564 3680
3565 while ((dp < lp) && num_logops) { 3681 while ((dp < end) && num_logops) {
3566 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 3682
3567 ohead = (xlog_op_header_t *)dp; 3683 ohead = (struct xlog_op_header *)dp;
3568 dp += sizeof(xlog_op_header_t); 3684 dp += sizeof(*ohead);
3569 if (ohead->oh_clientid != XFS_TRANSACTION && 3685 ASSERT(dp <= end);
3570 ohead->oh_clientid != XFS_LOG) { 3686
3571 xfs_warn(log->l_mp, "%s: bad clientid 0x%x", 3687 /* errors will abort recovery */
3572 __func__, ohead->oh_clientid); 3688 error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
3573 ASSERT(0); 3689 dp, end, pass);
3574 return (XFS_ERROR(EIO)); 3690 if (error)
3575 } 3691 return error;
3576 tid = be32_to_cpu(ohead->oh_tid); 3692
3577 hash = XLOG_RHASH(tid);
3578 trans = xlog_recover_find_tid(&rhash[hash], tid);
3579 if (trans == NULL) { /* not found; add new tid */
3580 if (ohead->oh_flags & XLOG_START_TRANS)
3581 xlog_recover_new_tid(&rhash[hash], tid,
3582 be64_to_cpu(rhead->h_lsn));
3583 } else {
3584 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
3585 xfs_warn(log->l_mp, "%s: bad length 0x%x",
3586 __func__, be32_to_cpu(ohead->oh_len));
3587 WARN_ON(1);
3588 return (XFS_ERROR(EIO));
3589 }
3590 flags = ohead->oh_flags & ~XLOG_END_TRANS;
3591 if (flags & XLOG_WAS_CONT_TRANS)
3592 flags &= ~XLOG_CONTINUE_TRANS;
3593 switch (flags) {
3594 case XLOG_COMMIT_TRANS:
3595 error = xlog_recover_commit_trans(log,
3596 trans, pass);
3597 break;
3598 case XLOG_UNMOUNT_TRANS:
3599 error = xlog_recover_unmount_trans(log);
3600 break;
3601 case XLOG_WAS_CONT_TRANS:
3602 error = xlog_recover_add_to_cont_trans(log,
3603 trans, dp,
3604 be32_to_cpu(ohead->oh_len));
3605 break;
3606 case XLOG_START_TRANS:
3607 xfs_warn(log->l_mp, "%s: bad transaction",
3608 __func__);
3609 ASSERT(0);
3610 error = XFS_ERROR(EIO);
3611 break;
3612 case 0:
3613 case XLOG_CONTINUE_TRANS:
3614 error = xlog_recover_add_to_trans(log, trans,
3615 dp, be32_to_cpu(ohead->oh_len));
3616 break;
3617 default:
3618 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
3619 __func__, flags);
3620 ASSERT(0);
3621 error = XFS_ERROR(EIO);
3622 break;
3623 }
3624 if (error) {
3625 xlog_recover_free_trans(trans);
3626 return error;
3627 }
3628 }
3629 dp += be32_to_cpu(ohead->oh_len); 3693 dp += be32_to_cpu(ohead->oh_len);
3630 num_logops--; 3694 num_logops--;
3631 } 3695 }
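
After the refactor, xlog_recover_process_data() is reduced to the walker shown above: parse an ophdr, delegate, advance by the op's payload length, abort on the first error. A compact sketch of that loop shape (types invented; the kernel validates the length against end inside the helper):

    #include <stdint.h>

    struct ophdr { uint32_t len; };

    /* Stand-in for xlog_recover_process_ophdr(). */
    static int process_one(const struct ophdr *oh, const char *dp,
                           const char *end);

    static int process_data(const char *dp, const char *end, int num_logops)
    {
        while (dp < end && num_logops--) {
            const struct ophdr *oh = (const struct ophdr *)dp;
            int error;

            dp += sizeof(*oh);
            error = process_one(oh, dp, end);
            if (error)
                return error;   /* errors abort recovery */
            dp += oh->len;      /* skip this op's payload */
        }
        return 0;
    }
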
@@ -3669,7 +3733,7 @@ xlog_recover_process_efi(
3669 */ 3733 */
3670 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); 3734 set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
3671 xfs_efi_release(efip, efip->efi_format.efi_nextents); 3735 xfs_efi_release(efip, efip->efi_format.efi_nextents);
3672 return XFS_ERROR(EIO); 3736 return -EIO;
3673 } 3737 }
3674 } 3738 }
3675 3739
@@ -3969,7 +4033,7 @@ xlog_unpack_data_crc(
3969 * CRC protection by punting an error back up the stack. 4033 * CRC protection by punting an error back up the stack.
3970 */ 4034 */
3971 if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) 4035 if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
3972 return EFSCORRUPTED; 4036 return -EFSCORRUPTED;
3973 } 4037 }
3974 4038
3975 return 0; 4039 return 0;
@@ -4018,14 +4082,14 @@ xlog_valid_rec_header(
4018 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { 4082 if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
4019 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 4083 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
4020 XFS_ERRLEVEL_LOW, log->l_mp); 4084 XFS_ERRLEVEL_LOW, log->l_mp);
4021 return XFS_ERROR(EFSCORRUPTED); 4085 return -EFSCORRUPTED;
4022 } 4086 }
4023 if (unlikely( 4087 if (unlikely(
4024 (!rhead->h_version || 4088 (!rhead->h_version ||
4025 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 4089 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
4026 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", 4090 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
4027 __func__, be32_to_cpu(rhead->h_version)); 4091 __func__, be32_to_cpu(rhead->h_version));
4028 return XFS_ERROR(EIO); 4092 return -EIO;
4029 } 4093 }
4030 4094
4031 /* LR body must have data or it wouldn't have been written */ 4095 /* LR body must have data or it wouldn't have been written */
@@ -4033,12 +4097,12 @@ xlog_valid_rec_header(
4033 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 4097 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
4034 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 4098 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
4035 XFS_ERRLEVEL_LOW, log->l_mp); 4099 XFS_ERRLEVEL_LOW, log->l_mp);
4036 return XFS_ERROR(EFSCORRUPTED); 4100 return -EFSCORRUPTED;
4037 } 4101 }
4038 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 4102 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
4039 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 4103 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
4040 XFS_ERRLEVEL_LOW, log->l_mp); 4104 XFS_ERRLEVEL_LOW, log->l_mp);
4041 return XFS_ERROR(EFSCORRUPTED); 4105 return -EFSCORRUPTED;
4042 } 4106 }
4043 return 0; 4107 return 0;
4044} 4108}
@@ -4081,7 +4145,7 @@ xlog_do_recovery_pass(
4081 */ 4145 */
4082 hbp = xlog_get_bp(log, 1); 4146 hbp = xlog_get_bp(log, 1);
4083 if (!hbp) 4147 if (!hbp)
4084 return ENOMEM; 4148 return -ENOMEM;
4085 4149
4086 error = xlog_bread(log, tail_blk, 1, hbp, &offset); 4150 error = xlog_bread(log, tail_blk, 1, hbp, &offset);
4087 if (error) 4151 if (error)
@@ -4110,49 +4174,21 @@ xlog_do_recovery_pass(
4110 } 4174 }
4111 4175
4112 if (!hbp) 4176 if (!hbp)
4113 return ENOMEM; 4177 return -ENOMEM;
4114 dbp = xlog_get_bp(log, BTOBB(h_size)); 4178 dbp = xlog_get_bp(log, BTOBB(h_size));
4115 if (!dbp) { 4179 if (!dbp) {
4116 xlog_put_bp(hbp); 4180 xlog_put_bp(hbp);
4117 return ENOMEM; 4181 return -ENOMEM;
4118 } 4182 }
4119 4183
4120 memset(rhash, 0, sizeof(rhash)); 4184 memset(rhash, 0, sizeof(rhash));
4121 if (tail_blk <= head_blk) { 4185 blk_no = tail_blk;
4122 for (blk_no = tail_blk; blk_no < head_blk; ) { 4186 if (tail_blk > head_blk) {
4123 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4124 if (error)
4125 goto bread_err2;
4126
4127 rhead = (xlog_rec_header_t *)offset;
4128 error = xlog_valid_rec_header(log, rhead, blk_no);
4129 if (error)
4130 goto bread_err2;
4131
4132 /* blocks in data section */
4133 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4134 error = xlog_bread(log, blk_no + hblks, bblks, dbp,
4135 &offset);
4136 if (error)
4137 goto bread_err2;
4138
4139 error = xlog_unpack_data(rhead, offset, log);
4140 if (error)
4141 goto bread_err2;
4142
4143 error = xlog_recover_process_data(log,
4144 rhash, rhead, offset, pass);
4145 if (error)
4146 goto bread_err2;
4147 blk_no += bblks + hblks;
4148 }
4149 } else {
4150 /* 4187 /*
4151 * Perform recovery around the end of the physical log. 4188 * Perform recovery around the end of the physical log.
4152 * When the head is not on the same cycle number as the tail, 4189 * When the head is not on the same cycle number as the tail,
4153 * we can't do a sequential recovery as above. 4190 * we can't do a sequential recovery.
4154 */ 4191 */
4155 blk_no = tail_blk;
4156 while (blk_no < log->l_logBBsize) { 4192 while (blk_no < log->l_logBBsize) {
4157 /* 4193 /*
4158 * Check for header wrapping around physical end-of-log 4194 * Check for header wrapping around physical end-of-log
@@ -4266,34 +4302,35 @@ xlog_do_recovery_pass(
4266 4302
4267 ASSERT(blk_no >= log->l_logBBsize); 4303 ASSERT(blk_no >= log->l_logBBsize);
4268 blk_no -= log->l_logBBsize; 4304 blk_no -= log->l_logBBsize;
4305 }
4269 4306
4270 /* read first part of physical log */ 4307 /* read first part of physical log */
4271 while (blk_no < head_blk) { 4308 while (blk_no < head_blk) {
4272 error = xlog_bread(log, blk_no, hblks, hbp, &offset); 4309 error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4273 if (error) 4310 if (error)
4274 goto bread_err2; 4311 goto bread_err2;
4275 4312
4276 rhead = (xlog_rec_header_t *)offset; 4313 rhead = (xlog_rec_header_t *)offset;
4277 error = xlog_valid_rec_header(log, rhead, blk_no); 4314 error = xlog_valid_rec_header(log, rhead, blk_no);
4278 if (error) 4315 if (error)
4279 goto bread_err2; 4316 goto bread_err2;
4280 4317
4281 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 4318 /* blocks in data section */
4282 error = xlog_bread(log, blk_no+hblks, bblks, dbp, 4319 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4283 &offset); 4320 error = xlog_bread(log, blk_no+hblks, bblks, dbp,
4284 if (error) 4321 &offset);
4285 goto bread_err2; 4322 if (error)
4323 goto bread_err2;
4286 4324
4287 error = xlog_unpack_data(rhead, offset, log); 4325 error = xlog_unpack_data(rhead, offset, log);
4288 if (error) 4326 if (error)
4289 goto bread_err2; 4327 goto bread_err2;
4290 4328
4291 error = xlog_recover_process_data(log, rhash, 4329 error = xlog_recover_process_data(log, rhash,
4292 rhead, offset, pass); 4330 rhead, offset, pass);
4293 if (error) 4331 if (error)
4294 goto bread_err2; 4332 goto bread_err2;
4295 blk_no += bblks + hblks; 4333 blk_no += bblks + hblks;
4296 }
4297 } 4334 }
4298 4335
4299 bread_err2: 4336 bread_err2:
@@ -4388,7 +4425,7 @@ xlog_do_recover(
4388 * If IO errors happened during recovery, bail out. 4425 * If IO errors happened during recovery, bail out.
4389 */ 4426 */
4390 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 4427 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
4391 return (EIO); 4428 return -EIO;
4392 } 4429 }
4393 4430
4394 /* 4431 /*
@@ -4413,16 +4450,12 @@ xlog_do_recover(
4413 XFS_BUF_UNASYNC(bp); 4450 XFS_BUF_UNASYNC(bp);
4414 bp->b_ops = &xfs_sb_buf_ops; 4451 bp->b_ops = &xfs_sb_buf_ops;
4415 4452
4416 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 4453 error = xfs_buf_submit_wait(bp);
4417 xfs_buf_relse(bp);
4418 return XFS_ERROR(EIO);
4419 }
4420
4421 xfs_buf_iorequest(bp);
4422 error = xfs_buf_iowait(bp);
4423 if (error) { 4454 if (error) {
4424 xfs_buf_ioerror_alert(bp, __func__); 4455 if (!XFS_FORCED_SHUTDOWN(log->l_mp)) {
4425 ASSERT(0); 4456 xfs_buf_ioerror_alert(bp, __func__);
4457 ASSERT(0);
4458 }
4426 xfs_buf_relse(bp); 4459 xfs_buf_relse(bp);
4427 return error; 4460 return error;
4428 } 4461 }
@@ -4492,7 +4525,19 @@ xlog_recover(
4492"Please recover the log on a kernel that supports the unknown features.", 4525"Please recover the log on a kernel that supports the unknown features.",
4493 (log->l_mp->m_sb.sb_features_log_incompat & 4526 (log->l_mp->m_sb.sb_features_log_incompat &
4494 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); 4527 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
4495 return EINVAL; 4528 return -EINVAL;
4529 }
4530
4531 /*
4532 * Delay log recovery if the debug hook is set. This is debug
 4533 * instrumentation to coordinate simulation of I/O failures with
4534 * log recovery.
4535 */
4536 if (xfs_globals.log_recovery_delay) {
4537 xfs_notice(log->l_mp,
4538 "Delaying log recovery for %d seconds.",
4539 xfs_globals.log_recovery_delay);
4540 msleep(xfs_globals.log_recovery_delay * 1000);
4496 } 4541 }
4497 4542
4498 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", 4543 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 3507cd0ec400..51435dbce9c4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43#include "xfs_icache.h" 43#include "xfs_icache.h"
44#include "xfs_dinode.h" 44#include "xfs_dinode.h"
45#include "xfs_sysfs.h"
45 46
46 47
47#ifdef HAVE_PERCPU_SB 48#ifdef HAVE_PERCPU_SB
@@ -76,7 +77,7 @@ xfs_uuid_mount(
76 77
77 if (uuid_is_nil(uuid)) { 78 if (uuid_is_nil(uuid)) {
78 xfs_warn(mp, "Filesystem has nil UUID - can't mount"); 79 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
79 return XFS_ERROR(EINVAL); 80 return -EINVAL;
80 } 81 }
81 82
82 mutex_lock(&xfs_uuid_table_mutex); 83 mutex_lock(&xfs_uuid_table_mutex);
@@ -104,7 +105,7 @@ xfs_uuid_mount(
104 out_duplicate: 105 out_duplicate:
105 mutex_unlock(&xfs_uuid_table_mutex); 106 mutex_unlock(&xfs_uuid_table_mutex);
106 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid); 107 xfs_warn(mp, "Filesystem has duplicate UUID %pU - can't mount", uuid);
107 return XFS_ERROR(EINVAL); 108 return -EINVAL;
108} 109}
109 110
110STATIC void 111STATIC void
@@ -173,13 +174,9 @@ xfs_sb_validate_fsb_count(
173 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); 174 ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
174 ASSERT(sbp->sb_blocklog >= BBSHIFT); 175 ASSERT(sbp->sb_blocklog >= BBSHIFT);
175 176
176#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 177 /* Limited by ULONG_MAX of page cache index */
177 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 178 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
178 return EFBIG; 179 return -EFBIG;
179#else /* Limited by UINT_MAX of sectors */
180 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
181 return EFBIG;
182#endif
183 return 0; 180 return 0;
184} 181}
185 182
@@ -250,9 +247,9 @@ xfs_initialize_perag(
250 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 247 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
251 248
252 if (mp->m_flags & XFS_MOUNT_32BITINODES) 249 if (mp->m_flags & XFS_MOUNT_32BITINODES)
253 index = xfs_set_inode32(mp); 250 index = xfs_set_inode32(mp, agcount);
254 else 251 else
255 index = xfs_set_inode64(mp); 252 index = xfs_set_inode64(mp, agcount);
256 253
257 if (maxagi) 254 if (maxagi)
258 *maxagi = index; 255 *maxagi = index;
@@ -303,28 +300,21 @@ xfs_readsb(
303 * access to the superblock. 300 * access to the superblock.
304 */ 301 */
305reread: 302reread:
306 bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, 303 error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
307 BTOBB(sector_size), 0, buf_ops); 304 BTOBB(sector_size), 0, &bp, buf_ops);
308 if (!bp) { 305 if (error) {
309 if (loud)
310 xfs_warn(mp, "SB buffer read failed");
311 return EIO;
312 }
313 if (bp->b_error) {
314 error = bp->b_error;
315 if (loud) 306 if (loud)
316 xfs_warn(mp, "SB validate failed with error %d.", error); 307 xfs_warn(mp, "SB validate failed with error %d.", error);
317 /* bad CRC means corrupted metadata */ 308 /* bad CRC means corrupted metadata */
318 if (error == EFSBADCRC) 309 if (error == -EFSBADCRC)
319 error = EFSCORRUPTED; 310 error = -EFSCORRUPTED;
320 goto release_buf; 311 return error;
321 } 312 }
322 313
323 /* 314 /*
324 * Initialize the mount structure from the superblock. 315 * Initialize the mount structure from the superblock.
325 */ 316 */
326 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 317 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
327 xfs_sb_quota_from_disk(sbp);
328 318
329 /* 319 /*
330 * If we haven't validated the superblock, do so now before we try 320 * If we haven't validated the superblock, do so now before we try
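
xfs_buf_read_uncached() now reports failure through its return value and hands the buffer back via an out parameter, so the old two-step dance (NULL check, then bp->b_error) collapses into a single error test, here and in xfs_check_sizes() below. A sketch of the new calling convention with invented names:

    #include <stdio.h>

    struct buf { int b_error; };

    /* Invented helper modelling the new signature: errno-style return,
     * buffer via out parameter, never half-valid on failure. */
    static int read_uncached(struct buf **bpp)
    {
        static struct buf b;

        *bpp = NULL;
        /* ... allocate, submit I/O, wait ... */
        if (b.b_error)
            return b.b_error;
        *bpp = &b;
        return 0;
    }

    int main(void)
    {
        struct buf *bp;
        int error = read_uncached(&bp);

        if (error) {    /* one test replaces !bp plus bp->b_error */
            fprintf(stderr, "read failed: %d\n", error);
            return 1;
        }
        /* ... use bp, then release it ... */
        return 0;
    }
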
@@ -333,7 +323,7 @@ reread:
333 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 323 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
334 if (loud) 324 if (loud)
335 xfs_warn(mp, "Invalid superblock magic number"); 325 xfs_warn(mp, "Invalid superblock magic number");
336 error = EINVAL; 326 error = -EINVAL;
337 goto release_buf; 327 goto release_buf;
338 } 328 }
339 329
@@ -344,7 +334,7 @@ reread:
344 if (loud) 334 if (loud)
345 xfs_warn(mp, "device supports %u byte sectors (not %u)", 335 xfs_warn(mp, "device supports %u byte sectors (not %u)",
346 sector_size, sbp->sb_sectsize); 336 sector_size, sbp->sb_sectsize);
347 error = ENOSYS; 337 error = -ENOSYS;
348 goto release_buf; 338 goto release_buf;
349 } 339 }
350 340
@@ -392,7 +382,7 @@ xfs_update_alignment(xfs_mount_t *mp)
392 xfs_warn(mp, 382 xfs_warn(mp,
393 "alignment check failed: sunit/swidth vs. blocksize(%d)", 383 "alignment check failed: sunit/swidth vs. blocksize(%d)",
394 sbp->sb_blocksize); 384 sbp->sb_blocksize);
395 return XFS_ERROR(EINVAL); 385 return -EINVAL;
396 } else { 386 } else {
397 /* 387 /*
398 * Convert the stripe unit and width to FSBs. 388 * Convert the stripe unit and width to FSBs.
@@ -402,14 +392,14 @@ xfs_update_alignment(xfs_mount_t *mp)
402 xfs_warn(mp, 392 xfs_warn(mp,
403 "alignment check failed: sunit/swidth vs. agsize(%d)", 393 "alignment check failed: sunit/swidth vs. agsize(%d)",
404 sbp->sb_agblocks); 394 sbp->sb_agblocks);
405 return XFS_ERROR(EINVAL); 395 return -EINVAL;
406 } else if (mp->m_dalign) { 396 } else if (mp->m_dalign) {
407 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 397 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
408 } else { 398 } else {
409 xfs_warn(mp, 399 xfs_warn(mp,
410 "alignment check failed: sunit(%d) less than bsize(%d)", 400 "alignment check failed: sunit(%d) less than bsize(%d)",
411 mp->m_dalign, sbp->sb_blocksize); 401 mp->m_dalign, sbp->sb_blocksize);
412 return XFS_ERROR(EINVAL); 402 return -EINVAL;
413 } 403 }
414 } 404 }
415 405
@@ -429,7 +419,7 @@ xfs_update_alignment(xfs_mount_t *mp)
429 } else { 419 } else {
430 xfs_warn(mp, 420 xfs_warn(mp,
431 "cannot change alignment: superblock does not support data alignment"); 421 "cannot change alignment: superblock does not support data alignment");
432 return XFS_ERROR(EINVAL); 422 return -EINVAL;
433 } 423 }
434 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && 424 } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
435 xfs_sb_version_hasdalign(&mp->m_sb)) { 425 xfs_sb_version_hasdalign(&mp->m_sb)) {
@@ -548,40 +538,43 @@ xfs_set_inoalignment(xfs_mount_t *mp)
548 * Check that the data (and log if separate) is an ok size. 538 * Check that the data (and log if separate) is an ok size.
549 */ 539 */
550STATIC int 540STATIC int
551xfs_check_sizes(xfs_mount_t *mp) 541xfs_check_sizes(
542 struct xfs_mount *mp)
552{ 543{
553 xfs_buf_t *bp; 544 struct xfs_buf *bp;
554 xfs_daddr_t d; 545 xfs_daddr_t d;
546 int error;
555 547
556 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 548 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
557 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 549 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
558 xfs_warn(mp, "filesystem size mismatch detected"); 550 xfs_warn(mp, "filesystem size mismatch detected");
559 return XFS_ERROR(EFBIG); 551 return -EFBIG;
560 } 552 }
561 bp = xfs_buf_read_uncached(mp->m_ddev_targp, 553 error = xfs_buf_read_uncached(mp->m_ddev_targp,
562 d - XFS_FSS_TO_BB(mp, 1), 554 d - XFS_FSS_TO_BB(mp, 1),
563 XFS_FSS_TO_BB(mp, 1), 0, NULL); 555 XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
564 if (!bp) { 556 if (error) {
565 xfs_warn(mp, "last sector read failed"); 557 xfs_warn(mp, "last sector read failed");
566 return EIO; 558 return error;
567 } 559 }
568 xfs_buf_relse(bp); 560 xfs_buf_relse(bp);
569 561
570 if (mp->m_logdev_targp != mp->m_ddev_targp) { 562 if (mp->m_logdev_targp == mp->m_ddev_targp)
571 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 563 return 0;
572 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 564
573 xfs_warn(mp, "log size mismatch detected"); 565 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
574 return XFS_ERROR(EFBIG); 566 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
575 } 567 xfs_warn(mp, "log size mismatch detected");
576 bp = xfs_buf_read_uncached(mp->m_logdev_targp, 568 return -EFBIG;
569 }
570 error = xfs_buf_read_uncached(mp->m_logdev_targp,
577 d - XFS_FSB_TO_BB(mp, 1), 571 d - XFS_FSB_TO_BB(mp, 1),
578 XFS_FSB_TO_BB(mp, 1), 0, NULL); 572 XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
579 if (!bp) { 573 if (error) {
580 xfs_warn(mp, "log device read failed"); 574 xfs_warn(mp, "log device read failed");
581 return EIO; 575 return error;
582 }
583 xfs_buf_relse(bp);
584 } 576 }
577 xfs_buf_relse(bp);
585 return 0; 578 return 0;
586} 579}
587 580
@@ -731,10 +724,14 @@ xfs_mountfs(
731 724
732 xfs_set_maxicount(mp); 725 xfs_set_maxicount(mp);
733 726
734 error = xfs_uuid_mount(mp); 727 error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);
735 if (error) 728 if (error)
736 goto out; 729 goto out;
737 730
731 error = xfs_uuid_mount(mp);
732 if (error)
733 goto out_remove_sysfs;
734
738 /* 735 /*
739 * Set the minimum read and write sizes 736 * Set the minimum read and write sizes
740 */ 737 */
@@ -816,7 +813,7 @@ xfs_mountfs(
816 if (!sbp->sb_logblocks) { 813 if (!sbp->sb_logblocks) {
817 xfs_warn(mp, "no log defined"); 814 xfs_warn(mp, "no log defined");
818 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 815 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
819 error = XFS_ERROR(EFSCORRUPTED); 816 error = -EFSCORRUPTED;
820 goto out_free_perag; 817 goto out_free_perag;
821 } 818 }
822 819
@@ -855,7 +852,7 @@ xfs_mountfs(
855 !mp->m_sb.sb_inprogress) { 852 !mp->m_sb.sb_inprogress) {
856 error = xfs_initialize_perag_data(mp, sbp->sb_agcount); 853 error = xfs_initialize_perag_data(mp, sbp->sb_agcount);
857 if (error) 854 if (error)
858 goto out_fail_wait; 855 goto out_log_dealloc;
859 } 856 }
860 857
861 /* 858 /*
@@ -876,7 +873,7 @@ xfs_mountfs(
876 xfs_iunlock(rip, XFS_ILOCK_EXCL); 873 xfs_iunlock(rip, XFS_ILOCK_EXCL);
877 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 874 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
878 mp); 875 mp);
879 error = XFS_ERROR(EFSCORRUPTED); 876 error = -EFSCORRUPTED;
880 goto out_rele_rip; 877 goto out_rele_rip;
881 } 878 }
882 mp->m_rootip = rip; /* save it */ 879 mp->m_rootip = rip; /* save it */
@@ -927,7 +924,7 @@ xfs_mountfs(
927 xfs_notice(mp, "resetting quota flags"); 924 xfs_notice(mp, "resetting quota flags");
928 error = xfs_mount_reset_sbqflags(mp); 925 error = xfs_mount_reset_sbqflags(mp);
929 if (error) 926 if (error)
930 return error; 927 goto out_rtunmount;
931 } 928 }
932 } 929 }
933 930
@@ -989,6 +986,8 @@ xfs_mountfs(
989 xfs_da_unmount(mp); 986 xfs_da_unmount(mp);
990 out_remove_uuid: 987 out_remove_uuid:
991 xfs_uuid_unmount(mp); 988 xfs_uuid_unmount(mp);
989 out_remove_sysfs:
990 xfs_sysfs_del(&mp->m_kobj);
992 out: 991 out:
993 return error; 992 return error;
994} 993}
@@ -1071,6 +1070,8 @@ xfs_unmountfs(
1071 xfs_errortag_clearall(mp, 0); 1070 xfs_errortag_clearall(mp, 0);
1072#endif 1071#endif
1073 xfs_free_perag(mp); 1072 xfs_free_perag(mp);
1073
1074 xfs_sysfs_del(&mp->m_kobj);
1074} 1075}
1075 1076
1076int 1077int
@@ -1152,7 +1153,7 @@ xfs_mod_incore_sb_unlocked(
1152 lcounter += delta; 1153 lcounter += delta;
1153 if (lcounter < 0) { 1154 if (lcounter < 0) {
1154 ASSERT(0); 1155 ASSERT(0);
1155 return XFS_ERROR(EINVAL); 1156 return -EINVAL;
1156 } 1157 }
1157 mp->m_sb.sb_icount = lcounter; 1158 mp->m_sb.sb_icount = lcounter;
1158 return 0; 1159 return 0;
@@ -1161,7 +1162,7 @@ xfs_mod_incore_sb_unlocked(
1161 lcounter += delta; 1162 lcounter += delta;
1162 if (lcounter < 0) { 1163 if (lcounter < 0) {
1163 ASSERT(0); 1164 ASSERT(0);
1164 return XFS_ERROR(EINVAL); 1165 return -EINVAL;
1165 } 1166 }
1166 mp->m_sb.sb_ifree = lcounter; 1167 mp->m_sb.sb_ifree = lcounter;
1167 return 0; 1168 return 0;
@@ -1191,7 +1192,7 @@ xfs_mod_incore_sb_unlocked(
1191 * blocks if were allowed to. 1192 * blocks if were allowed to.
1192 */ 1193 */
1193 if (!rsvd) 1194 if (!rsvd)
1194 return XFS_ERROR(ENOSPC); 1195 return -ENOSPC;
1195 1196
1196 lcounter = (long long)mp->m_resblks_avail + delta; 1197 lcounter = (long long)mp->m_resblks_avail + delta;
1197 if (lcounter >= 0) { 1198 if (lcounter >= 0) {
@@ -1202,7 +1203,7 @@ xfs_mod_incore_sb_unlocked(
1202 "Filesystem \"%s\": reserve blocks depleted! " 1203 "Filesystem \"%s\": reserve blocks depleted! "
1203 "Consider increasing reserve pool size.", 1204 "Consider increasing reserve pool size.",
1204 mp->m_fsname); 1205 mp->m_fsname);
1205 return XFS_ERROR(ENOSPC); 1206 return -ENOSPC;
1206 } 1207 }
1207 1208
1208 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); 1209 mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp);
@@ -1211,7 +1212,7 @@ xfs_mod_incore_sb_unlocked(
1211 lcounter = (long long)mp->m_sb.sb_frextents; 1212 lcounter = (long long)mp->m_sb.sb_frextents;
1212 lcounter += delta; 1213 lcounter += delta;
1213 if (lcounter < 0) { 1214 if (lcounter < 0) {
1214 return XFS_ERROR(ENOSPC); 1215 return -ENOSPC;
1215 } 1216 }
1216 mp->m_sb.sb_frextents = lcounter; 1217 mp->m_sb.sb_frextents = lcounter;
1217 return 0; 1218 return 0;
@@ -1220,7 +1221,7 @@ xfs_mod_incore_sb_unlocked(
1220 lcounter += delta; 1221 lcounter += delta;
1221 if (lcounter < 0) { 1222 if (lcounter < 0) {
1222 ASSERT(0); 1223 ASSERT(0);
1223 return XFS_ERROR(EINVAL); 1224 return -EINVAL;
1224 } 1225 }
1225 mp->m_sb.sb_dblocks = lcounter; 1226 mp->m_sb.sb_dblocks = lcounter;
1226 return 0; 1227 return 0;
@@ -1229,7 +1230,7 @@ xfs_mod_incore_sb_unlocked(
1229 scounter += delta; 1230 scounter += delta;
1230 if (scounter < 0) { 1231 if (scounter < 0) {
1231 ASSERT(0); 1232 ASSERT(0);
1232 return XFS_ERROR(EINVAL); 1233 return -EINVAL;
1233 } 1234 }
1234 mp->m_sb.sb_agcount = scounter; 1235 mp->m_sb.sb_agcount = scounter;
1235 return 0; 1236 return 0;
@@ -1238,7 +1239,7 @@ xfs_mod_incore_sb_unlocked(
1238 scounter += delta; 1239 scounter += delta;
1239 if (scounter < 0) { 1240 if (scounter < 0) {
1240 ASSERT(0); 1241 ASSERT(0);
1241 return XFS_ERROR(EINVAL); 1242 return -EINVAL;
1242 } 1243 }
1243 mp->m_sb.sb_imax_pct = scounter; 1244 mp->m_sb.sb_imax_pct = scounter;
1244 return 0; 1245 return 0;
@@ -1247,7 +1248,7 @@ xfs_mod_incore_sb_unlocked(
1247 scounter += delta; 1248 scounter += delta;
1248 if (scounter < 0) { 1249 if (scounter < 0) {
1249 ASSERT(0); 1250 ASSERT(0);
1250 return XFS_ERROR(EINVAL); 1251 return -EINVAL;
1251 } 1252 }
1252 mp->m_sb.sb_rextsize = scounter; 1253 mp->m_sb.sb_rextsize = scounter;
1253 return 0; 1254 return 0;
@@ -1256,7 +1257,7 @@ xfs_mod_incore_sb_unlocked(
1256 scounter += delta; 1257 scounter += delta;
1257 if (scounter < 0) { 1258 if (scounter < 0) {
1258 ASSERT(0); 1259 ASSERT(0);
1259 return XFS_ERROR(EINVAL); 1260 return -EINVAL;
1260 } 1261 }
1261 mp->m_sb.sb_rbmblocks = scounter; 1262 mp->m_sb.sb_rbmblocks = scounter;
1262 return 0; 1263 return 0;
@@ -1265,7 +1266,7 @@ xfs_mod_incore_sb_unlocked(
1265 lcounter += delta; 1266 lcounter += delta;
1266 if (lcounter < 0) { 1267 if (lcounter < 0) {
1267 ASSERT(0); 1268 ASSERT(0);
1268 return XFS_ERROR(EINVAL); 1269 return -EINVAL;
1269 } 1270 }
1270 mp->m_sb.sb_rblocks = lcounter; 1271 mp->m_sb.sb_rblocks = lcounter;
1271 return 0; 1272 return 0;
@@ -1274,7 +1275,7 @@ xfs_mod_incore_sb_unlocked(
1274 lcounter += delta; 1275 lcounter += delta;
1275 if (lcounter < 0) { 1276 if (lcounter < 0) {
1276 ASSERT(0); 1277 ASSERT(0);
1277 return XFS_ERROR(EINVAL); 1278 return -EINVAL;
1278 } 1279 }
1279 mp->m_sb.sb_rextents = lcounter; 1280 mp->m_sb.sb_rextents = lcounter;
1280 return 0; 1281 return 0;
@@ -1283,13 +1284,13 @@ xfs_mod_incore_sb_unlocked(
1283 scounter += delta; 1284 scounter += delta;
1284 if (scounter < 0) { 1285 if (scounter < 0) {
1285 ASSERT(0); 1286 ASSERT(0);
1286 return XFS_ERROR(EINVAL); 1287 return -EINVAL;
1287 } 1288 }
1288 mp->m_sb.sb_rextslog = scounter; 1289 mp->m_sb.sb_rextslog = scounter;
1289 return 0; 1290 return 0;
1290 default: 1291 default:
1291 ASSERT(0); 1292 ASSERT(0);
1292 return XFS_ERROR(EINVAL); 1293 return -EINVAL;
1293 } 1294 }
1294} 1295}
1295 1296
@@ -1452,7 +1453,7 @@ xfs_dev_is_read_only(
1452 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 1453 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
1453 xfs_notice(mp, "%s required on read-only device.", message); 1454 xfs_notice(mp, "%s required on read-only device.", message);
1454 xfs_notice(mp, "write access unavailable, cannot proceed."); 1455 xfs_notice(mp, "write access unavailable, cannot proceed.");
1455 return EROFS; 1456 return -EROFS;
1456 } 1457 }
1457 return 0; 1458 return 0;
1458} 1459}
@@ -1995,7 +1996,7 @@ slow_path:
1995 * (e.g. lots of space just got freed). After that 1996 * (e.g. lots of space just got freed). After that
1996 * we are done. 1997 * we are done.
1997 */ 1998 */
1998 if (ret != ENOSPC) 1999 if (ret != -ENOSPC)
1999 xfs_icsb_balance_counter(mp, field, 0); 2000 xfs_icsb_balance_counter(mp, field, 0);
2000 xfs_icsb_unlock(mp); 2001 xfs_icsb_unlock(mp);
2001 return ret; 2002 return ret;
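The common theme of the hunks above is a single convention change: XFS historically returned positive errno values tagged through the XFS_ERROR() macro and negated them at the VFS boundary, while the rest of the kernel uses negative errnos end to end. Note that comparisons flip along with return values (error == EFSBADCRC becomes error == -EFSBADCRC). A minimal before/after sketch of the shape, illustrative only -- old_style_check/new_style_check are hypothetical names, not patch code:

/* Old convention: positive errno, flagged through XFS_ERROR(). */
static int old_style_check(int bad)
{
	if (bad)
		return XFS_ERROR(EINVAL);	/* caller received +EINVAL */
	return 0;
}

/* New convention: negative errno, passed straight to the VFS. */
static int new_style_check(int bad)
{
	if (bad)
		return -EINVAL;			/* caller receives -EINVAL */
	return 0;
}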
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7295a0b7c343..b0447c86e7e2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -166,6 +166,7 @@ typedef struct xfs_mount {
 						   on the next remount,rw */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
+	struct xfs_kobj		m_kobj;
 
 	struct workqueue_struct	*m_data_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
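The new m_kobj field anchors a per-mount sysfs directory. A rough sketch of the lifecycle implied by the xfs_mountfs()/xfs_unmountfs() hunks above -- example_mount_path is a hypothetical wrapper; the real registration happens inside xfs_mountfs() itself:

/* Sketch only: register the kobject first, tear it down on any failure. */
static int example_mount_path(struct xfs_mount *mp)
{
	int error;

	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL,
			       mp->m_fsname);
	if (error)
		return error;

	error = xfs_uuid_mount(mp);
	if (error) {
		/* mirrors the new out_remove_sysfs unwind label */
		xfs_sysfs_del(&mp->m_kobj);
		return error;
	}
	return 0;	/* xfs_unmountfs() calls xfs_sysfs_del() last */
}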
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f99b4933dc22..30ecca3037e3 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -304,7 +304,8 @@ _xfs_mru_cache_reap(
 int
 xfs_mru_cache_init(void)
 {
-	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
+	xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache",
+				WQ_MEM_RECLAIM|WQ_FREEZABLE, 1);
 	if (!xfs_mru_reap_wq)
 		return -ENOMEM;
 	return 0;
@@ -337,20 +338,20 @@ xfs_mru_cache_create(
 	*mrup = NULL;
 
 	if (!mrup || !grp_count || !lifetime_ms || !free_func)
-		return EINVAL;
+		return -EINVAL;
 
 	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
-		return EINVAL;
+		return -EINVAL;
 
 	if (!(mru = kmem_zalloc(sizeof(*mru), KM_SLEEP)))
-		return ENOMEM;
+		return -ENOMEM;
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
 	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), KM_SLEEP);
 
 	if (!mru->lists) {
-		err = ENOMEM;
+		err = -ENOMEM;
 		goto exit;
 	}
 
@@ -434,16 +435,16 @@ xfs_mru_cache_insert(
 
 	ASSERT(mru && mru->lists);
 	if (!mru || !mru->lists)
-		return EINVAL;
+		return -EINVAL;
 
 	if (radix_tree_preload(GFP_KERNEL))
-		return ENOMEM;
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&elem->list_node);
 	elem->key = key;
 
 	spin_lock(&mru->lock);
-	error = -radix_tree_insert(&mru->store, key, elem);
+	error = radix_tree_insert(&mru->store, key, elem);
 	radix_tree_preload_end();
 	if (!error)
 		_xfs_mru_cache_list_insert(mru, elem);
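Two independent changes sit in this file. The radix_tree_insert() one is the errno conversion again: the library call already returns a negative errno, so the leading minus that used to flip it positive is simply dropped. The workqueue one adds WQ_FREEZABLE, which parks the reaper's work items while the system suspends so MRU reaping cannot touch a frozen filesystem; WQ_MEM_RECLAIM still guarantees forward progress under memory pressure. A minimal self-contained sketch of that allocation, with example_wq/example_mru_wq_init as hypothetical names:

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static int example_mru_wq_init(void)
{
	/* max_active == 1 keeps reap work serialized on one context. */
	example_wq = alloc_workqueue("xfs_mru_cache",
				     WQ_MEM_RECLAIM | WQ_FREEZABLE, 1);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}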
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6d26759c779a..d68f23021af3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -98,18 +98,18 @@ restart:
 		next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
 
 		error = execute(batch[i], data);
-		if (error == EAGAIN) {
+		if (error == -EAGAIN) {
 			skipped++;
 			continue;
 		}
-		if (error && last_error != EFSCORRUPTED)
+		if (error && last_error != -EFSCORRUPTED)
 			last_error = error;
 	}
 
 	mutex_unlock(&qi->qi_tree_lock);
 
 	/* bail out if the filesystem is corrupted. */
-	if (last_error == EFSCORRUPTED) {
+	if (last_error == -EFSCORRUPTED) {
 		skipped = 0;
 		break;
 	}
@@ -138,7 +138,7 @@ xfs_qm_dqpurge(
 	xfs_dqlock(dqp);
 	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
 		xfs_dqunlock(dqp);
-		return EAGAIN;
+		return -EAGAIN;
 	}
 
 	dqp->dq_flags |= XFS_DQ_FREEING;
@@ -221,100 +221,6 @@ xfs_qm_unmount(
 	}
 }
 
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo. This is also responsible for
- * running a quotacheck as necessary. We are guaranteed that the superblock
- * is consistently read in at this point.
- *
- * If we fail here, the mount will continue with quota turned off. We don't
- * need to inidicate success or failure at all.
- */
-void
-xfs_qm_mount_quotas(
-	xfs_mount_t	*mp)
-{
-	int		error = 0;
-	uint		sbf;
-
-	/*
-	 * If quotas on realtime volumes is not supported, we disable
-	 * quotas immediately.
-	 */
-	if (mp->m_sb.sb_rextents) {
-		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
-
-	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-	/*
-	 * Allocate the quotainfo structure inside the mount struct, and
-	 * create quotainode(s), and change/rev superblock if necessary.
-	 */
-	error = xfs_qm_init_quotainfo(mp);
-	if (error) {
-		/*
-		 * We must turn off quotas.
-		 */
-		ASSERT(mp->m_quotainfo == NULL);
-		mp->m_qflags = 0;
-		goto write_changes;
-	}
-	/*
-	 * If any of the quotas are not consistent, do a quotacheck.
-	 */
-	if (XFS_QM_NEED_QUOTACHECK(mp)) {
-		error = xfs_qm_quotacheck(mp);
-		if (error) {
-			/* Quotacheck failed and disabled quotas. */
-			return;
-		}
-	}
-	/*
-	 * If one type of quotas is off, then it will lose its
-	 * quotachecked status, since we won't be doing accounting for
-	 * that type anymore.
-	 */
-	if (!XFS_IS_UQUOTA_ON(mp))
-		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-	if (!XFS_IS_GQUOTA_ON(mp))
-		mp->m_qflags &= ~XFS_GQUOTA_CHKD;
-	if (!XFS_IS_PQUOTA_ON(mp))
-		mp->m_qflags &= ~XFS_PQUOTA_CHKD;
-
- write_changes:
-	/*
-	 * We actually don't have to acquire the m_sb_lock at all.
-	 * This can only be called from mount, and that's single threaded. XXX
-	 */
-	spin_lock(&mp->m_sb_lock);
-	sbf = mp->m_sb.sb_qflags;
-	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
-	spin_unlock(&mp->m_sb_lock);
-
-	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
-		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
-			/*
-			 * We could only have been turning quotas off.
-			 * We aren't in very good shape actually because
-			 * the incore structures are convinced that quotas are
-			 * off, but the on disk superblock doesn't know that !
-			 */
-			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
-			xfs_alert(mp, "%s: Superblock update failed!",
-				__func__);
-		}
-	}
-
-	if (error) {
-		xfs_warn(mp, "Failed to initialize disk quotas.");
-		return;
-	}
-}
-
 /*
  * Called from the vfsops layer.
 */
@@ -528,6 +434,7 @@ xfs_qm_dquot_isolate(
 	struct list_head	*item,
 	spinlock_t		*lru_lock,
 	void			*arg)
+		__releases(lru_lock) __acquires(lru_lock)
 {
 	struct xfs_dquot	*dqp = container_of(item,
 						struct xfs_dquot, q_lru);
@@ -671,7 +578,7 @@ xfs_qm_init_quotainfo(
 
 	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
-	error = -list_lru_init(&qinf->qi_lru);
+	error = list_lru_init(&qinf->qi_lru);
 	if (error)
 		goto out_free_qinf;
 
@@ -995,7 +902,7 @@ xfs_qm_dqiter_bufs(
 		 * will leave a trace in the log indicating corruption has
 		 * been detected.
		 */
-		if (error == EFSCORRUPTED) {
+		if (error == -EFSCORRUPTED) {
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				      XFS_FSB_TO_DADDR(mp, bno),
 				      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
@@ -1005,6 +912,12 @@ xfs_qm_dqiter_bufs(
 		if (error)
 			break;
 
+		/*
+		 * A corrupt buffer might not have a verifier attached, so
+		 * make sure we have the correct one attached before writeback
+		 * occurs.
+		 */
+		bp->b_ops = &xfs_dquot_buf_ops;
 		xfs_qm_reset_dqcounts(mp, bp, firstid, type);
 		xfs_buf_delwri_queue(bp, buffer_list);
 		xfs_buf_relse(bp);
@@ -1090,7 +1003,7 @@ xfs_qm_dqiterate(
 			xfs_buf_readahead(mp->m_ddev_targp,
 			       XFS_FSB_TO_DADDR(mp, rablkno),
 			       mp->m_quotainfo->qi_dqchunklen,
-			       NULL);
+			       &xfs_dquot_buf_ops);
 			rablkno++;
 		}
 	}
@@ -1138,8 +1051,8 @@ xfs_qm_quotacheck_dqadjust(
 		/*
 		 * Shouldn't be able to turn off quotas here.
		 */
-		ASSERT(error != ESRCH);
-		ASSERT(error != ENOENT);
+		ASSERT(error != -ESRCH);
+		ASSERT(error != -ENOENT);
 		return error;
 	}
 
@@ -1226,7 +1139,7 @@ xfs_qm_dqusage_adjust(
	 */
 	if (xfs_is_quota_inode(&mp->m_sb, ino)) {
 		*res = BULKSTAT_RV_NOTHING;
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	/*
@@ -1330,7 +1243,7 @@ out_unlock:
 * Walk thru all the filesystem inodes and construct a consistent view
 * of the disk quota world. If the quotacheck fails, disable quotas.
 */
-int
+STATIC int
 xfs_qm_quotacheck(
 	xfs_mount_t	*mp)
 {
@@ -1463,7 +1376,100 @@ xfs_qm_quotacheck(
 		}
 	} else
 		xfs_notice(mp, "Quotacheck: Done.");
-	return (error);
+	return error;
+}
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo. This is also responsible for
+ * running a quotacheck as necessary. We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+	struct xfs_mount	*mp)
+{
+	int			error = 0;
+	uint			sbf;
+
+	/*
+	 * If quotas on realtime volumes is not supported, we disable
+	 * quotas immediately.
+	 */
+	if (mp->m_sb.sb_rextents) {
+		xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+
+	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+	/*
+	 * Allocate the quotainfo structure inside the mount struct, and
+	 * create quotainode(s), and change/rev superblock if necessary.
+	 */
+	error = xfs_qm_init_quotainfo(mp);
+	if (error) {
+		/*
+		 * We must turn off quotas.
+		 */
+		ASSERT(mp->m_quotainfo == NULL);
+		mp->m_qflags = 0;
+		goto write_changes;
+	}
+	/*
+	 * If any of the quotas are not consistent, do a quotacheck.
+	 */
+	if (XFS_QM_NEED_QUOTACHECK(mp)) {
+		error = xfs_qm_quotacheck(mp);
+		if (error) {
+			/* Quotacheck failed and disabled quotas. */
+			return;
+		}
+	}
+	/*
+	 * If one type of quotas is off, then it will lose its
+	 * quotachecked status, since we won't be doing accounting for
+	 * that type anymore.
+	 */
+	if (!XFS_IS_UQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+	if (!XFS_IS_GQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_GQUOTA_CHKD;
+	if (!XFS_IS_PQUOTA_ON(mp))
+		mp->m_qflags &= ~XFS_PQUOTA_CHKD;
+
+ write_changes:
+	/*
+	 * We actually don't have to acquire the m_sb_lock at all.
+	 * This can only be called from mount, and that's single threaded. XXX
+	 */
+	spin_lock(&mp->m_sb_lock);
+	sbf = mp->m_sb.sb_qflags;
+	mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+	spin_unlock(&mp->m_sb_lock);
+
+	if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+		if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+			/*
+			 * We could only have been turning quotas off.
+			 * We aren't in very good shape actually because
+			 * the incore structures are convinced that quotas are
+			 * off, but the on disk superblock doesn't know that !
+			 */
+			ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+			xfs_alert(mp, "%s: Superblock update failed!",
+				__func__);
+		}
+	}
+
+	if (error) {
+		xfs_warn(mp, "Failed to initialize disk quotas.");
+		return;
+	}
 }
 
 /*
@@ -1493,7 +1499,7 @@ xfs_qm_init_quotainos(
 		error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
					     0, 0, &uip);
 		if (error)
-			return XFS_ERROR(error);
+			return error;
 	}
 	if (XFS_IS_GQUOTA_ON(mp) &&
 	    mp->m_sb.sb_gquotino != NULLFSINO) {
@@ -1563,7 +1569,7 @@ error_rele:
 		IRELE(gip);
 	if (pip)
 		IRELE(pip);
-	return XFS_ERROR(error);
+	return error;
 }
 
 STATIC void
@@ -1679,7 +1685,7 @@ xfs_qm_vop_dqalloc(
						 XFS_QMOPT_DOWARN,
						 &uq);
 			if (error) {
-				ASSERT(error != ENOENT);
+				ASSERT(error != -ENOENT);
 				return error;
 			}
 			/*
@@ -1706,7 +1712,7 @@ xfs_qm_vop_dqalloc(
						 XFS_QMOPT_DOWARN,
						 &gq);
 			if (error) {
-				ASSERT(error != ENOENT);
+				ASSERT(error != -ENOENT);
 				goto error_rele;
 			}
 			xfs_dqunlock(gq);
@@ -1726,7 +1732,7 @@ xfs_qm_vop_dqalloc(
						 XFS_QMOPT_DOWARN,
						 &pq);
 			if (error) {
-				ASSERT(error != ENOENT);
+				ASSERT(error != -ENOENT);
 				goto error_rele;
 			}
 			xfs_dqunlock(pq);
@@ -1895,7 +1901,7 @@ xfs_qm_vop_chown_reserve(
				-((xfs_qcnt_t)delblks), 0, blkflags);
 	}
 
-	return (0);
+	return 0;
 }
 
 int
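One subtlety worth calling out from the xfs_qm_dqiter_bufs() hunk: a buffer that failed CRC verification is re-read with a NULL verifier, so it reaches the reset path with no b_ops attached; assigning xfs_dquot_buf_ops before the buffer is queued for delayed write ensures the repaired block gets re-CRC'd at writeback. A simplified sketch of the pattern -- illustrative only, with daddr/len standing in for the real block arguments:

	struct xfs_buf	*bp;
	int		error;

	/* Read with no verifier so a corrupt block can still be brought in. */
	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, daddr, len, 0,
				   &bp, NULL);
	if (error)
		return error;

	bp->b_ops = &xfs_dquot_buf_ops;	/* reattach before the buffer dirties */
	xfs_qm_reset_dqcounts(mp, bp, firstid, type);
	xfs_buf_delwri_queue(bp, buffer_list);
	xfs_buf_relse(bp);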
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 797fd4636273..3a07a937e232 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
 #define XFS_QM_RTBWARNLIMIT	5
 
 extern void		xfs_qm_destroy_quotainfo(struct xfs_mount *);
-extern int		xfs_qm_quotacheck(struct xfs_mount *);
 extern int		xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
 
 /* dquot stuff */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index e9be63abd8d2..2c61e61b0205 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -117,7 +117,7 @@ xfs_qm_newmount(
			(uquotaondisk ? " usrquota" : ""),
			(gquotaondisk ? " grpquota" : ""),
			(pquotaondisk ? " prjquota" : ""));
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	}
 
 	if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index bbc813caba4c..80f2d77d929a 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -64,10 +64,10 @@ xfs_qm_scall_quotaoff(
	/*
	 * No file system can have quotas enabled on disk but not in core.
	 * Note that quota utilities (like quotaoff) _expect_
-	 * errno == EEXIST here.
+	 * errno == -EEXIST here.
	 */
 	if ((mp->m_qflags & flags) == 0)
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 	error = 0;
 
 	flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
@@ -94,7 +94,7 @@ xfs_qm_scall_quotaoff(
 
		/* XXX what to do if error ? Revert back to old vals incore ? */
		error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-		return (error);
+		return error;
 	}
 
 	dqtype = 0;
@@ -198,7 +198,7 @@ xfs_qm_scall_quotaoff(
 	if (mp->m_qflags == 0) {
		mutex_unlock(&q->qi_quotaofflock);
		xfs_qm_destroy_quotainfo(mp);
-		return (0);
+		return 0;
 	}
 
	/*
@@ -278,13 +278,13 @@ xfs_qm_scall_trunc_qfiles(
 	xfs_mount_t	*mp,
 	uint		flags)
 {
-	int		error = EINVAL;
+	int		error = -EINVAL;
 
 	if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 ||
	    (flags & ~XFS_DQ_ALLTYPES)) {
		xfs_debug(mp, "%s: flags=%x m_qflags=%x",
			__func__, flags, mp->m_qflags);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	if (flags & XFS_DQ_USER) {
@@ -328,7 +328,7 @@ xfs_qm_scall_quotaon(
 	if (flags == 0) {
		xfs_debug(mp, "%s: zero flags, m_qflags=%x",
			__func__, mp->m_qflags);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
	/* No fs can turn on quotas with a delayed effect */
@@ -351,13 +351,13 @@ xfs_qm_scall_quotaon(
		xfs_debug(mp,
			"%s: Can't enforce without acct, flags=%x sbflags=%x",
			__func__, flags, mp->m_sb.sb_qflags);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
	/*
	 * If everything's up to-date incore, then don't waste time.
	 */
 	if ((mp->m_qflags & flags) == flags)
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 
	/*
	 * Change sb_qflags on disk but not incore mp->qflags
@@ -372,11 +372,11 @@ xfs_qm_scall_quotaon(
	 * There's nothing to change if it's the same.
	 */
 	if ((qf & flags) == flags && sbflags == 0)
-		return XFS_ERROR(EEXIST);
+		return -EEXIST;
 	sbflags |= XFS_SB_QFLAGS;
 
 	if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
-		return (error);
+		return error;
	/*
	 * If we aren't trying to switch on quota enforcement, we are done.
	 */
@@ -387,10 +387,10 @@ xfs_qm_scall_quotaon(
	    ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
	     (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
	    (flags & XFS_ALL_QUOTA_ENFD) == 0)
-		return (0);
+		return 0;
 
 	if (! XFS_IS_QUOTA_RUNNING(mp))
-		return XFS_ERROR(ESRCH);
+		return -ESRCH;
 
	/*
	 * Switch on quota enforcement in core.
@@ -399,7 +399,7 @@ xfs_qm_scall_quotaon(
 	mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
 	mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
 
-	return (0);
+	return 0;
 }
 
 
@@ -426,7 +426,7 @@ xfs_qm_scall_getqstat(
 	if (!xfs_sb_version_hasquota(&mp->m_sb)) {
		out->qs_uquota.qfs_ino = NULLFSINO;
		out->qs_gquota.qfs_ino = NULLFSINO;
-		return (0);
+		return 0;
 	}
 
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -514,7 +514,7 @@ xfs_qm_scall_getqstatv(
		out->qs_uquota.qfs_ino = NULLFSINO;
		out->qs_gquota.qfs_ino = NULLFSINO;
		out->qs_pquota.qfs_ino = NULLFSINO;
-		return (0);
+		return 0;
 	}
 
 	out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
@@ -595,7 +595,7 @@ xfs_qm_scall_setqlim(
 	xfs_qcnt_t		hard, soft;
 
 	if (newlim->d_fieldmask & ~XFS_DQ_MASK)
-		return EINVAL;
+		return -EINVAL;
 	if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
 		return 0;
 
@@ -615,7 +615,7 @@ xfs_qm_scall_setqlim(
	 */
 	error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp);
 	if (error) {
-		ASSERT(error != ENOENT);
+		ASSERT(error != -ENOENT);
 		goto out_unlock;
 	}
 	xfs_dqunlock(dqp);
@@ -758,7 +758,7 @@ xfs_qm_log_quotaoff_end(
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0);
 	if (error) {
		xfs_trans_cancel(tp, 0);
-		return (error);
+		return error;
 	}
 
 	qoffi = xfs_trans_get_qoff_item(tp, startqoff,
@@ -772,7 +772,7 @@ xfs_qm_log_quotaoff_end(
	 */
 	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp, 0);
-	return (error);
+	return error;
 }
 
 
@@ -822,7 +822,7 @@ error0:
		spin_unlock(&mp->m_sb_lock);
 	}
 	*qoffstartp = qoffi;
-	return (error);
+	return error;
 }
 
 
@@ -850,7 +850,7 @@ xfs_qm_scall_getquota(
	 * our utility programs are concerned.
	 */
 	if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-		error = XFS_ERROR(ENOENT);
+		error = -ENOENT;
		goto out_put;
 	}
 
@@ -953,7 +953,7 @@ xfs_qm_export_flags(
		uflags |= FS_QUOTA_GDQ_ENFD;
 	if (flags & XFS_PQUOTA_ENFD)
		uflags |= FS_QUOTA_PDQ_ENFD;
-	return (uflags);
+	return uflags;
 }
 
 
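The comment touched in the first hunk above encodes a userspace contract: quota tools treat -EEXIST from quotaoff as "quotas were never on", not as a failure. With the scall helpers now returning negative errnos directly, a caller-side sketch of that contract looks like this (example_quotaoff is a hypothetical wrapper, not patch code):

static int example_quotaoff(struct xfs_mount *mp, uint flags)
{
	int error = xfs_qm_scall_quotaoff(mp, flags);

	if (error == -EEXIST)	/* already off: utilities expect this */
		return 0;
	return error;
}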
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 2ad1b9822e92..b238027df987 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -51,7 +51,7 @@ xfs_fs_get_xstate(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
-	return -xfs_qm_scall_getqstat(mp, fqs);
+	return xfs_qm_scall_getqstat(mp, fqs);
 }
 
 STATIC int
@@ -63,7 +63,7 @@ xfs_fs_get_xstatev(
 
 	if (!XFS_IS_QUOTA_RUNNING(mp))
 		return -ENOSYS;
-	return -xfs_qm_scall_getqstatv(mp, fqs);
+	return xfs_qm_scall_getqstatv(mp, fqs);
 }
 
 STATIC int
@@ -95,11 +95,11 @@ xfs_fs_set_xstate(
 
 	switch (op) {
 	case Q_XQUOTAON:
-		return -xfs_qm_scall_quotaon(mp, flags);
+		return xfs_qm_scall_quotaon(mp, flags);
 	case Q_XQUOTAOFF:
		if (!XFS_IS_QUOTA_ON(mp))
			return -EINVAL;
-		return -xfs_qm_scall_quotaoff(mp, flags);
+		return xfs_qm_scall_quotaoff(mp, flags);
 	}
 
 	return -EINVAL;
@@ -112,7 +112,7 @@ xfs_fs_rm_xquota(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 	unsigned int		flags = 0;
-	
+
 	if (sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
@@ -123,11 +123,11 @@ xfs_fs_rm_xquota(
		flags |= XFS_DQ_USER;
 	if (uflags & FS_GROUP_QUOTA)
		flags |= XFS_DQ_GROUP;
-	if (uflags & FS_USER_QUOTA)
+	if (uflags & FS_PROJ_QUOTA)
		flags |= XFS_DQ_PROJ;
 
-	return -xfs_qm_scall_trunc_qfiles(mp, flags);
+	return xfs_qm_scall_trunc_qfiles(mp, flags);
 }
 
 STATIC int
 xfs_fs_get_dqblk(
@@ -142,7 +142,7 @@ xfs_fs_get_dqblk(
 	if (!XFS_IS_QUOTA_ON(mp))
 		return -ESRCH;
 
-	return -xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
+	return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
				      xfs_quota_type(qid.type), fdq);
 }
 
@@ -161,7 +161,7 @@ xfs_fs_set_dqblk(
 	if (!XFS_IS_QUOTA_ON(mp))
 		return -ESRCH;
 
-	return -xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
+	return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
				     xfs_quota_type(qid.type), fdq);
 }
 
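This file also fixes a real bug in passing: xfs_fs_rm_xquota() tested FS_USER_QUOTA a second time where it meant FS_PROJ_QUOTA, so project quota files could never be truncated through this path. The negation removals follow the same errno pattern; before the conversion these quotactl wrappers had to flip XFS's positive errno on the way out to the VFS, and now the value passes straight through. A minimal sketch of the resulting wrapper shape (example_get_xstate is a hypothetical name):

STATIC int
example_get_xstate(struct super_block *sb, struct fs_quota_stat *fqs)
{
	struct xfs_mount	*mp = XFS_M(sb);

	if (!XFS_IS_QUOTA_RUNNING(mp))
		return -ENOSYS;
	return xfs_qm_scall_getqstat(mp, fqs);	/* was: -xfs_qm_scall_...() */
}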
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ec5ca65c6211..e1175ea9b551 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -46,7 +46,7 @@
 * Keeps track of a current summary block, so we don't keep reading
 * it from the buffer cache.
 */
-STATIC int				/* error */
+static int
 xfs_rtget_summary(
 	xfs_mount_t	*mp,		/* file system mount structure */
 	xfs_trans_t	*tp,		/* transaction pointer */
@@ -56,60 +56,9 @@ xfs_rtget_summary(
 	xfs_fsblock_t	*rsb,		/* in/out: summary block number */
 	xfs_suminfo_t	*sum)		/* out: summary info for this block */
 {
-	xfs_buf_t	*bp;		/* buffer for summary block */
-	int		error;		/* error value */
-	xfs_fsblock_t	sb;		/* summary fsblock */
-	int		so;		/* index into the summary file */
-	xfs_suminfo_t	*sp;		/* pointer to returned data */
-
-	/*
-	 * Compute entry number in the summary file.
-	 */
-	so = XFS_SUMOFFS(mp, log, bbno);
-	/*
-	 * Compute the block number in the summary file.
-	 */
-	sb = XFS_SUMOFFSTOBLOCK(mp, so);
-	/*
-	 * If we have an old buffer, and the block number matches, use that.
-	 */
-	if (rbpp && *rbpp && *rsb == sb)
-		bp = *rbpp;
-	/*
-	 * Otherwise we have to get the buffer.
-	 */
-	else {
-		/*
-		 * If there was an old one, get rid of it first.
-		 */
-		if (rbpp && *rbpp)
-			xfs_trans_brelse(tp, *rbpp);
-		error = xfs_rtbuf_get(mp, tp, sb, 1, &bp);
-		if (error) {
-			return error;
-		}
-		/*
-		 * Remember this buffer and block for the next call.
-		 */
-		if (rbpp) {
-			*rbpp = bp;
-			*rsb = sb;
-		}
-	}
-	/*
-	 * Point to the summary information & copy it out.
-	 */
-	sp = XFS_SUMPTR(mp, bp, so);
-	*sum = *sp;
-	/*
-	 * Drop the buffer if we're not asked to remember it.
-	 */
-	if (!rbpp)
-		xfs_trans_brelse(tp, bp);
-	return 0;
+	return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum);
 }
 
-
 /*
 * Return whether there are any free extents in the size range given
 * by low and high, for the bitmap block bbno.
@@ -863,7 +812,7 @@ xfs_growfs_rt_alloc(
					XFS_BMAPI_METADATA, &firstblock,
					resblks, &map, &nmap, &flist);
 		if (!error && nmap < 1)
-			error = XFS_ERROR(ENOSPC);
+			error = -ENOSPC;
 		if (error)
			goto error_cancel;
		/*
@@ -903,7 +852,7 @@ xfs_growfs_rt_alloc(
			bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
					mp->m_bsize, 0);
			if (bp == NULL) {
-				error = XFS_ERROR(EIO);
+				error = -EIO;
 error_cancel:
				xfs_trans_cancel(tp, cancelflags);
				goto error;
@@ -944,9 +893,9 @@ xfs_growfs_rt(
 	xfs_buf_t	*bp;		/* temporary buffer */
 	int		error;		/* error return value */
 	xfs_mount_t	*nmp;		/* new (fake) mount structure */
-	xfs_drfsbno_t	nrblocks;	/* new number of realtime blocks */
+	xfs_rfsblock_t	nrblocks;	/* new number of realtime blocks */
 	xfs_extlen_t	nrbmblocks;	/* new number of rt bitmap blocks */
-	xfs_drtbno_t	nrextents;	/* new number of realtime extents */
+	xfs_rtblock_t	nrextents;	/* new number of realtime extents */
 	uint8_t		nrextslog;	/* new log2 of sb_rextents */
 	xfs_extlen_t	nrsumblocks;	/* new number of summary blocks */
 	uint		nrsumlevels;	/* new rt summary levels */
@@ -962,26 +911,21 @@ xfs_growfs_rt(
	 * Initial error checking.
	 */
 	if (!capable(CAP_SYS_ADMIN))
-		return XFS_ERROR(EPERM);
+		return -EPERM;
 	if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
		return error;
	/*
	 * Read in the last block of the device, make sure it exists.
	 */
-	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
				XFS_FSB_TO_BB(mp, nrblocks - 1),
-				XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp)
-		return EIO;
-	if (bp->b_error) {
-		error = bp->b_error;
-		xfs_buf_relse(bp);
+				XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error)
		return error;
-	}
 	xfs_buf_relse(bp);
 
	/*
@@ -1001,7 +945,7 @@ xfs_growfs_rt(
	 * since we'll log basically the whole summary file at once.
	 */
 	if (nrsumblocks > (mp->m_sb.sb_logblocks >> 1))
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
	/*
	 * Get the old block counts for bitmap and summary inodes.
	 * These can't change since other growfs callers are locked out.
@@ -1208,7 +1152,7 @@ xfs_rtallocate_extent(
				len, &sumbp, &sb, prod, &r);
		break;
	default:
-		error = EIO;
+		error = -EIO;
		ASSERT(0);
	}
 	if (error)
@@ -1235,11 +1179,12 @@ xfs_rtallocate_extent(
 */
 int				/* error */
 xfs_rtmount_init(
-	xfs_mount_t	*mp)	/* file system mount structure */
+	struct xfs_mount	*mp)	/* file system mount structure */
 {
-	xfs_buf_t	*bp;	/* buffer for last block of subvolume */
-	xfs_daddr_t	d;	/* address of last block of subvolume */
-	xfs_sb_t	*sbp;	/* filesystem superblock copy in mount */
+	struct xfs_buf		*bp;	/* buffer for last block of subvolume */
+	struct xfs_sb		*sbp;	/* filesystem superblock copy in mount */
+	xfs_daddr_t		d;	/* address of last block of subvolume */
+	int			error;
 
 	sbp = &mp->m_sb;
 	if (sbp->sb_rblocks == 0)
@@ -1247,7 +1192,7 @@ xfs_rtmount_init(
 	if (mp->m_rtdev_targp == NULL) {
		xfs_warn(mp,
	"Filesystem has a realtime volume, use rtdev=device option");
-		return XFS_ERROR(ENODEV);
+		return -ENODEV;
 	}
 	mp->m_rsumlevels = sbp->sb_rextslog + 1;
 	mp->m_rsumsize =
@@ -1263,16 +1208,14 @@ xfs_rtmount_init(
		xfs_warn(mp, "realtime mount -- %llu != %llu",
			(unsigned long long) XFS_BB_TO_FSB(mp, d),
			(unsigned long long) mp->m_sb.sb_rblocks);
-		return XFS_ERROR(EFBIG);
+		return -EFBIG;
 	}
-	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
+	error = xfs_buf_read_uncached(mp->m_rtdev_targp,
					d - XFS_FSB_TO_BB(mp, 1),
-					XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp || bp->b_error) {
+					XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL);
+	if (error) {
		xfs_warn(mp, "realtime device size check failed");
-		if (bp)
-			xfs_buf_relse(bp);
-		return EIO;
+		return error;
 	}
 	xfs_buf_relse(bp);
 	return 0;
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 752b63d10300..76c0a4a9bb17 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -111,6 +111,10 @@ int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp,
		     xfs_rtblock_t *rtblock);
 int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp,
		       xfs_rtblock_t start, xfs_extlen_t len, int val);
+int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp,
+			     int log, xfs_rtblock_t bbno, int delta,
+			     xfs_buf_t **rbpp, xfs_fsblock_t *rsb,
+			     xfs_suminfo_t *sum);
 int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
			 xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp,
			 xfs_fsblock_t *rsb);
@@ -132,7 +136,7 @@ xfs_rtmount_init(
	return 0;
 
	xfs_warn(mp, "Not built with CONFIG_XFS_RT");
-	return ENOSYS;
+	return -ENOSYS;
 }
 # define xfs_rtmount_inodes(m)  (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
 # define xfs_rtunmount_inodes(m)
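Exporting xfs_rtmodify_summary_int() is what lets xfs_rtget_summary() collapse to a one-liner above: a read is expressed as a modification with delta == 0, so the summary-buffer caching and lookup logic lives in exactly one place. The idea in miniature, with toy types and hypothetical names:

/* Illustrative only: read-as-modify-by-zero, the shape used above. */
static int summary_modify(int *cell, int delta, int *out)
{
	*cell += delta;		/* delta == 0 leaves the value untouched */
	if (out)
		*out = *cell;	/* a caller that wanted a read gets the value */
	return 0;
}

static int summary_read(int *cell, int *out)
{
	return summary_modify(cell, 0, out);	/* like xfs_rtget_summary() */
}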
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8f0333b3f7a0..9f622feda6a4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,7 @@
 #include "xfs_dinode.h"
 #include "xfs_filestream.h"
 #include "xfs_quota.h"
+#include "xfs_sysfs.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -62,6 +63,11 @@ static const struct super_operations xfs_super_operations;
 static kmem_zone_t *xfs_ioend_zone;
 mempool_t *xfs_ioend_pool;
 
+static struct kset *xfs_kset;		/* top-level xfs sysfs dir */
+#ifdef DEBUG
+static struct xfs_kobj xfs_dbg_kobj;	/* global debug sysfs attrs */
+#endif
+
 #define MNTOPT_LOGBUFS	"logbufs"	/* number of XFS log buffers */
 #define MNTOPT_LOGBSIZE	"logbsize"	/* size of XFS log buffers */
 #define MNTOPT_LOGDEV	"logdev"	/* log device */
@@ -185,7 +191,7 @@ xfs_parseargs(
	 */
 	mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
 	if (!mp->m_fsname)
-		return ENOMEM;
+		return -ENOMEM;
 	mp->m_fsname_len = strlen(mp->m_fsname) + 1;
 
	/*
@@ -204,9 +210,6 @@ xfs_parseargs(
	 */
 	mp->m_flags |= XFS_MOUNT_BARRIER;
 	mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-#if !XFS_BIG_INUMS
-	mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-#endif
 
	/*
	 * These can be overridden by the mount option parsing.
@@ -227,57 +230,57 @@ xfs_parseargs(
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			if (kstrtoint(value, 10, &mp->m_logbufs))
-				return EINVAL;
+				return -EINVAL;
		} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			if (suffix_kstrtoint(value, 10, &mp->m_logbsize))
-				return EINVAL;
+				return -EINVAL;
		} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
			if (!mp->m_logname)
-				return ENOMEM;
+				return -ENOMEM;
		} else if (!strcmp(this_char, MNTOPT_MTPT)) {
			xfs_warn(mp, "%s option not allowed on this system",
				this_char);
-			return EINVAL;
+			return -EINVAL;
		} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
			if (!mp->m_rtname)
-				return ENOMEM;
+				return -ENOMEM;
		} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			if (kstrtoint(value, 10, &iosize))
-				return EINVAL;
+				return -EINVAL;
			iosizelog = ffs(iosize) - 1;
		} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			if (suffix_kstrtoint(value, 10, &iosize))
-				return EINVAL;
+				return -EINVAL;
			iosizelog = ffs(iosize) - 1;
		} else if (!strcmp(this_char, MNTOPT_GRPID) ||
			   !strcmp(this_char, MNTOPT_BSDGROUPS)) {
@@ -297,27 +300,22 @@ xfs_parseargs(
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			if (kstrtoint(value, 10, &dsunit))
-				return EINVAL;
+				return -EINVAL;
		} else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
			if (!value || !*value) {
				xfs_warn(mp, "%s option requires an argument",
					this_char);
-				return EINVAL;
+				return -EINVAL;
			}
			if (kstrtoint(value, 10, &dswidth))
-				return EINVAL;
+				return -EINVAL;
		} else if (!strcmp(this_char, MNTOPT_32BITINODE)) {
			mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
		} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
			mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
-#if !XFS_BIG_INUMS
-			xfs_warn(mp, "%s option not allowed on this system",
-				this_char);
-			return EINVAL;
-#endif
		} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
			mp->m_flags |= XFS_MOUNT_NOUUID;
		} else if (!strcmp(this_char, MNTOPT_BARRIER)) {
@@ -390,7 +388,7 @@ xfs_parseargs(
	"irixsgid is now a sysctl(2) variable, option is deprecated.");
		} else {
			xfs_warn(mp, "unknown mount option [%s].", this_char);
-			return EINVAL;
+			return -EINVAL;
		}
	}
 
@@ -400,32 +398,32 @@ xfs_parseargs(
 	if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
		xfs_warn(mp, "no-recovery mounts must be read-only.");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
		xfs_warn(mp,
	"sunit and swidth options incompatible with the noalign option");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 #ifndef CONFIG_XFS_QUOTA
 	if (XFS_IS_QUOTA_RUNNING(mp)) {
		xfs_warn(mp, "quota support not available in this kernel.");
-		return EINVAL;
+		return -EINVAL;
 	}
 #endif
 
 	if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
		xfs_warn(mp, "sunit and swidth must be specified together");
-		return EINVAL;
+		return -EINVAL;
 	}
 
 	if (dsunit && (dswidth % dsunit != 0)) {
		xfs_warn(mp,
	"stripe width (%d) must be a multiple of the stripe unit (%d)",
			dswidth, dsunit);
-		return EINVAL;
+		return -EINVAL;
 	}
 
 done:
@@ -446,7 +444,7 @@ done:
	    mp->m_logbufs > XLOG_MAX_ICLOGS)) {
		xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
			mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 	if (mp->m_logbsize != -1 &&
	    mp->m_logbsize !=  0 &&
@@ -456,7 +454,7 @@ done:
		xfs_warn(mp,
			"invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
			mp->m_logbsize);
-		return XFS_ERROR(EINVAL);
+		return -EINVAL;
 	}
 
 	if (iosizelog) {
@@ -465,7 +463,7 @@ done:
			xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
				iosizelog, XFS_MIN_IO_LOG,
				XFS_MAX_IO_LOG);
-			return XFS_ERROR(EINVAL);
+			return -EINVAL;
		}
 
		mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
@@ -597,15 +595,20 @@ xfs_max_file_offset(
 	return (((__uint64_t)pagefactor) << bitshift) - 1;
 }
 
+/*
+ * xfs_set_inode32() and xfs_set_inode64() are passed an agcount
+ * because in the growfs case, mp->m_sb.sb_agcount is not updated
+ * yet to the potentially higher ag count.
+ */
 xfs_agnumber_t
-xfs_set_inode32(struct xfs_mount *mp)
+xfs_set_inode32(struct xfs_mount *mp, xfs_agnumber_t agcount)
 {
 	xfs_agnumber_t	index = 0;
 	xfs_agnumber_t	maxagi = 0;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_agnumber_t	max_metadata;
607 xfs_agino_t agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks -1, 0); 610 xfs_agino_t agino;
608 xfs_ino_t ino = XFS_AGINO_TO_INO(mp, sbp->sb_agcount -1, agino); 611 xfs_ino_t ino;
609 xfs_perag_t *pag; 612 xfs_perag_t *pag;
610 613
611 /* Calculate how much should be reserved for inodes to meet 614 /* Calculate how much should be reserved for inodes to meet
@@ -620,10 +623,12 @@ xfs_set_inode32(struct xfs_mount *mp)
620 do_div(icount, sbp->sb_agblocks); 623 do_div(icount, sbp->sb_agblocks);
621 max_metadata = icount; 624 max_metadata = icount;
622 } else { 625 } else {
623 max_metadata = sbp->sb_agcount; 626 max_metadata = agcount;
624 } 627 }
625 628
626 for (index = 0; index < sbp->sb_agcount; index++) { 629 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
630
631 for (index = 0; index < agcount; index++) {
627 ino = XFS_AGINO_TO_INO(mp, index, agino); 632 ino = XFS_AGINO_TO_INO(mp, index, agino);
628 633
629 if (ino > XFS_MAXINUMBER_32) { 634 if (ino > XFS_MAXINUMBER_32) {
@@ -648,11 +653,11 @@ xfs_set_inode32(struct xfs_mount *mp)
648} 653}
649 654
650xfs_agnumber_t 655xfs_agnumber_t
651xfs_set_inode64(struct xfs_mount *mp) 656xfs_set_inode64(struct xfs_mount *mp, xfs_agnumber_t agcount)
652{ 657{
653 xfs_agnumber_t index = 0; 658 xfs_agnumber_t index = 0;
654 659
655 for (index = 0; index < mp->m_sb.sb_agcount; index++) { 660 for (index = 0; index < agcount; index++) {
656 struct xfs_perag *pag; 661 struct xfs_perag *pag;
657 662
658 pag = xfs_perag_get(mp, index); 663 pag = xfs_perag_get(mp, index);
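
[editor note] The comment added above explains the new agcount argument: during growfs the superblock's sb_agcount still holds the old AG count while the inode32/inode64 policy has to be recomputed over the new, larger range. A caller-side sketch of how the growfs path would use the new signature; new_agcount is an illustrative name, not something visible in this hunk:

        /* recompute m_maxagi against the grown AG count, per inode32/inode64 */
        if (mp->m_flags & XFS_MOUNT_SMALL_INUMS)
                mp->m_maxagi = xfs_set_inode32(mp, new_agcount);
        else
                mp->m_maxagi = xfs_set_inode64(mp, new_agcount);
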
@@ -686,7 +691,7 @@ xfs_blkdev_get(
686 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); 691 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
687 } 692 }
688 693
689 return -error; 694 return error;
690} 695}
691 696
692STATIC void 697STATIC void
@@ -756,7 +761,7 @@ xfs_open_devices(
756 if (rtdev == ddev || rtdev == logdev) { 761 if (rtdev == ddev || rtdev == logdev) {
757 xfs_warn(mp, 762 xfs_warn(mp,
758 "Cannot mount filesystem with identical rtdev and ddev/logdev."); 763 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
759 error = EINVAL; 764 error = -EINVAL;
760 goto out_close_rtdev; 765 goto out_close_rtdev;
761 } 766 }
762 } 767 }
@@ -764,7 +769,7 @@ xfs_open_devices(
764 /* 769 /*
765 * Setup xfs_mount buffer target pointers 770 * Setup xfs_mount buffer target pointers
766 */ 771 */
767 error = ENOMEM; 772 error = -ENOMEM;
768 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); 773 mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev);
769 if (!mp->m_ddev_targp) 774 if (!mp->m_ddev_targp)
770 goto out_close_rtdev; 775 goto out_close_rtdev;
@@ -838,32 +843,32 @@ xfs_init_mount_workqueues(
838 struct xfs_mount *mp) 843 struct xfs_mount *mp)
839{ 844{
840 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", 845 mp->m_data_workqueue = alloc_workqueue("xfs-data/%s",
841 WQ_MEM_RECLAIM, 0, mp->m_fsname); 846 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
842 if (!mp->m_data_workqueue) 847 if (!mp->m_data_workqueue)
843 goto out; 848 goto out;
844 849
845 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", 850 mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s",
846 WQ_MEM_RECLAIM, 0, mp->m_fsname); 851 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
847 if (!mp->m_unwritten_workqueue) 852 if (!mp->m_unwritten_workqueue)
848 goto out_destroy_data_iodone_queue; 853 goto out_destroy_data_iodone_queue;
849 854
850 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", 855 mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
851 WQ_MEM_RECLAIM, 0, mp->m_fsname); 856 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
852 if (!mp->m_cil_workqueue) 857 if (!mp->m_cil_workqueue)
853 goto out_destroy_unwritten; 858 goto out_destroy_unwritten;
854 859
855 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", 860 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
856 0, 0, mp->m_fsname); 861 WQ_FREEZABLE, 0, mp->m_fsname);
857 if (!mp->m_reclaim_workqueue) 862 if (!mp->m_reclaim_workqueue)
858 goto out_destroy_cil; 863 goto out_destroy_cil;
859 864
860 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", 865 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
861 0, 0, mp->m_fsname); 866 WQ_FREEZABLE, 0, mp->m_fsname);
862 if (!mp->m_log_workqueue) 867 if (!mp->m_log_workqueue)
863 goto out_destroy_reclaim; 868 goto out_destroy_reclaim;
864 869
865 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", 870 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
866 0, 0, mp->m_fsname); 871 WQ_FREEZABLE, 0, mp->m_fsname);
867 if (!mp->m_eofblocks_workqueue) 872 if (!mp->m_eofblocks_workqueue)
868 goto out_destroy_log; 873 goto out_destroy_log;
869 874
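
[editor note] Tagging every per-mount workqueue WQ_FREEZABLE makes the freezer flush and park these workers across suspend, so no XFS I/O is in flight while the system image is written. WQ_FREEZABLE is a standard <linux/workqueue.h> flag and composes with WQ_MEM_RECLAIM exactly as shown above; a minimal standalone sketch using only the generic API (the queue name is made up):

        #include <linux/workqueue.h>

        static struct workqueue_struct *example_wq;     /* hypothetical queue */

        static int example_init(void)
        {
                /* rescuer thread for memory reclaim, frozen at suspend time */
                example_wq = alloc_workqueue("example/%s",
                                WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, "demo");
                if (!example_wq)
                        return -ENOMEM;
                return 0;
        }
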
@@ -1188,6 +1193,7 @@ xfs_fs_remount(
1188 char *options) 1193 char *options)
1189{ 1194{
1190 struct xfs_mount *mp = XFS_M(sb); 1195 struct xfs_mount *mp = XFS_M(sb);
1196 xfs_sb_t *sbp = &mp->m_sb;
1191 substring_t args[MAX_OPT_ARGS]; 1197 substring_t args[MAX_OPT_ARGS];
1192 char *p; 1198 char *p;
1193 int error; 1199 int error;
@@ -1208,10 +1214,10 @@ xfs_fs_remount(
1208 mp->m_flags &= ~XFS_MOUNT_BARRIER; 1214 mp->m_flags &= ~XFS_MOUNT_BARRIER;
1209 break; 1215 break;
1210 case Opt_inode64: 1216 case Opt_inode64:
1211 mp->m_maxagi = xfs_set_inode64(mp); 1217 mp->m_maxagi = xfs_set_inode64(mp, sbp->sb_agcount);
1212 break; 1218 break;
1213 case Opt_inode32: 1219 case Opt_inode32:
1214 mp->m_maxagi = xfs_set_inode32(mp); 1220 mp->m_maxagi = xfs_set_inode32(mp, sbp->sb_agcount);
1215 break; 1221 break;
1216 default: 1222 default:
1217 /* 1223 /*
@@ -1295,7 +1301,7 @@ xfs_fs_freeze(
1295 1301
1296 xfs_save_resvblks(mp); 1302 xfs_save_resvblks(mp);
1297 xfs_quiesce_attr(mp); 1303 xfs_quiesce_attr(mp);
1298 return -xfs_fs_log_dummy(mp); 1304 return xfs_fs_log_dummy(mp);
1299} 1305}
1300 1306
1301STATIC int 1307STATIC int
@@ -1314,7 +1320,7 @@ xfs_fs_show_options(
1314 struct seq_file *m, 1320 struct seq_file *m,
1315 struct dentry *root) 1321 struct dentry *root)
1316{ 1322{
1317 return -xfs_showargs(XFS_M(root->d_sb), m); 1323 return xfs_showargs(XFS_M(root->d_sb), m);
1318} 1324}
1319 1325
1320/* 1326/*
@@ -1336,14 +1342,14 @@ xfs_finish_flags(
1336 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1342 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1337 xfs_warn(mp, 1343 xfs_warn(mp,
1338 "logbuf size must be greater than or equal to log stripe size"); 1344 "logbuf size must be greater than or equal to log stripe size");
1339 return XFS_ERROR(EINVAL); 1345 return -EINVAL;
1340 } 1346 }
1341 } else { 1347 } else {
1342 /* Fail a mount if the logbuf is larger than 32K */ 1348 /* Fail a mount if the logbuf is larger than 32K */
1343 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1349 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1344 xfs_warn(mp, 1350 xfs_warn(mp,
1345 "logbuf size for version 1 logs must be 16K or 32K"); 1351 "logbuf size for version 1 logs must be 16K or 32K");
1346 return XFS_ERROR(EINVAL); 1352 return -EINVAL;
1347 } 1353 }
1348 } 1354 }
1349 1355
@@ -1355,7 +1361,7 @@ xfs_finish_flags(
1355 xfs_warn(mp, 1361 xfs_warn(mp,
1356"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", 1362"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.",
1357 MNTOPT_NOATTR2, MNTOPT_ATTR2); 1363 MNTOPT_NOATTR2, MNTOPT_ATTR2);
1358 return XFS_ERROR(EINVAL); 1364 return -EINVAL;
1359 } 1365 }
1360 1366
1361 /* 1367 /*
@@ -1372,7 +1378,7 @@ xfs_finish_flags(
1372 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1378 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1373 xfs_warn(mp, 1379 xfs_warn(mp,
1374 "cannot mount a read-only filesystem as read-write"); 1380 "cannot mount a read-only filesystem as read-write");
1375 return XFS_ERROR(EROFS); 1381 return -EROFS;
1376 } 1382 }
1377 1383
1378 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 1384 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
@@ -1380,7 +1386,7 @@ xfs_finish_flags(
1380 !xfs_sb_version_has_pquotino(&mp->m_sb)) { 1386 !xfs_sb_version_has_pquotino(&mp->m_sb)) {
1381 xfs_warn(mp, 1387 xfs_warn(mp,
1382 "Super block does not support project and group quota together"); 1388 "Super block does not support project and group quota together");
1383 return XFS_ERROR(EINVAL); 1389 return -EINVAL;
1384 } 1390 }
1385 1391
1386 return 0; 1392 return 0;
@@ -1394,7 +1400,7 @@ xfs_fs_fill_super(
1394{ 1400{
1395 struct inode *root; 1401 struct inode *root;
1396 struct xfs_mount *mp = NULL; 1402 struct xfs_mount *mp = NULL;
1397 int flags = 0, error = ENOMEM; 1403 int flags = 0, error = -ENOMEM;
1398 1404
1399 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL); 1405 mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
1400 if (!mp) 1406 if (!mp)
@@ -1405,6 +1411,7 @@ xfs_fs_fill_super(
1405 atomic_set(&mp->m_active_trans, 0); 1411 atomic_set(&mp->m_active_trans, 0);
1406 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); 1412 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
1407 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); 1413 INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
1414 mp->m_kobj.kobject.kset = xfs_kset;
1408 1415
1409 mp->m_super = sb; 1416 mp->m_super = sb;
1410 sb->s_fs_info = mp; 1417 sb->s_fs_info = mp;
@@ -1428,11 +1435,11 @@ xfs_fs_fill_super(
1428 if (error) 1435 if (error)
1429 goto out_free_fsname; 1436 goto out_free_fsname;
1430 1437
1431 error = -xfs_init_mount_workqueues(mp); 1438 error = xfs_init_mount_workqueues(mp);
1432 if (error) 1439 if (error)
1433 goto out_close_devices; 1440 goto out_close_devices;
1434 1441
1435 error = -xfs_icsb_init_counters(mp); 1442 error = xfs_icsb_init_counters(mp);
1436 if (error) 1443 if (error)
1437 goto out_destroy_workqueues; 1444 goto out_destroy_workqueues;
1438 1445
@@ -1474,12 +1481,12 @@ xfs_fs_fill_super(
1474 1481
1475 root = igrab(VFS_I(mp->m_rootip)); 1482 root = igrab(VFS_I(mp->m_rootip));
1476 if (!root) { 1483 if (!root) {
1477 error = ENOENT; 1484 error = -ENOENT;
1478 goto out_unmount; 1485 goto out_unmount;
1479 } 1486 }
1480 sb->s_root = d_make_root(root); 1487 sb->s_root = d_make_root(root);
1481 if (!sb->s_root) { 1488 if (!sb->s_root) {
1482 error = ENOMEM; 1489 error = -ENOMEM;
1483 goto out_unmount; 1490 goto out_unmount;
1484 } 1491 }
1485 1492
@@ -1499,7 +1506,7 @@ out_destroy_workqueues:
1499 xfs_free_fsname(mp); 1506 xfs_free_fsname(mp);
1500 kfree(mp); 1507 kfree(mp);
1501 out: 1508 out:
1502 return -error; 1509 return error;
1503 1510
1504 out_unmount: 1511 out_unmount:
1505 xfs_filestream_unmount(mp); 1512 xfs_filestream_unmount(mp);
@@ -1714,7 +1721,8 @@ xfs_init_workqueues(void)
1714 * AGs in all the filesystems mounted. Hence use the default large 1721 * AGs in all the filesystems mounted. Hence use the default large
1715 * max_active value for this workqueue. 1722 * max_active value for this workqueue.
1716 */ 1723 */
1717 xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); 1724 xfs_alloc_wq = alloc_workqueue("xfsalloc",
1725 WQ_MEM_RECLAIM|WQ_FREEZABLE, 0);
1718 if (!xfs_alloc_wq) 1726 if (!xfs_alloc_wq)
1719 return -ENOMEM; 1727 return -ENOMEM;
1720 1728
@@ -1761,9 +1769,22 @@ init_xfs_fs(void)
1761 if (error) 1769 if (error)
1762 goto out_cleanup_procfs; 1770 goto out_cleanup_procfs;
1763 1771
1772 xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
1773 if (!xfs_kset) {
1774 error = -ENOMEM;
1775 goto out_sysctl_unregister;

1776 }
1777
1778#ifdef DEBUG
1779 xfs_dbg_kobj.kobject.kset = xfs_kset;
1780 error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
1781 if (error)
1782 goto out_kset_unregister;
1783#endif
1784
1764 error = xfs_qm_init(); 1785 error = xfs_qm_init();
1765 if (error) 1786 if (error)
1766 goto out_sysctl_unregister; 1787 goto out_remove_kobj;
1767 1788
1768 error = register_filesystem(&xfs_fs_type); 1789 error = register_filesystem(&xfs_fs_type);
1769 if (error) 1790 if (error)
@@ -1772,6 +1793,12 @@ init_xfs_fs(void)
1772 1793
1773 out_qm_exit: 1794 out_qm_exit:
1774 xfs_qm_exit(); 1795 xfs_qm_exit();
1796 out_remove_kobj:
1797#ifdef DEBUG
1798 xfs_sysfs_del(&xfs_dbg_kobj);
1799 out_kset_unregister:
1800#endif
1801 kset_unregister(xfs_kset);
1775 out_sysctl_unregister: 1802 out_sysctl_unregister:
1776 xfs_sysctl_unregister(); 1803 xfs_sysctl_unregister();
1777 out_cleanup_procfs: 1804 out_cleanup_procfs:
@@ -1793,6 +1820,10 @@ exit_xfs_fs(void)
1793{ 1820{
1794 xfs_qm_exit(); 1821 xfs_qm_exit();
1795 unregister_filesystem(&xfs_fs_type); 1822 unregister_filesystem(&xfs_fs_type);
1823#ifdef DEBUG
1824 xfs_sysfs_del(&xfs_dbg_kobj);
1825#endif
1826 kset_unregister(xfs_kset);
1796 xfs_sysctl_unregister(); 1827 xfs_sysctl_unregister();
1797 xfs_cleanup_procfs(); 1828 xfs_cleanup_procfs();
1798 xfs_buf_terminate(); 1829 xfs_buf_terminate();
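
[editor note] The new xfs_kset is created once at module init under fs_kobj (so it appears as /sys/fs/xfs), torn down after all children in exit_xfs_fs(), and the xfs_mount kobject above sets .kset so each mount gets a /sys/fs/xfs/<fsname> directory. A lifecycle sketch using only the API visible in this patch; the per-mount registration call itself lands elsewhere in the series, so its exact shape here is an assumption:

        static struct kset *xfs_kset;   /* top-level xfs sysfs directory */

        /* module init: /sys/fs/xfs */
        xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
        if (!xfs_kset)
                return -ENOMEM;

        /* per mount: /sys/fs/xfs/<fsname> */
        mp->m_kobj.kobject.kset = xfs_kset;
        error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname);

        /* teardown order: children first, then the kset */
        xfs_sysfs_del(&mp->m_kobj);
        kset_unregister(xfs_kset);
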
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index bbe3d15a7904..2b830c2f322e 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -44,16 +44,6 @@ extern void xfs_qm_exit(void);
44# define XFS_REALTIME_STRING 44# define XFS_REALTIME_STRING
45#endif 45#endif
46 46
47#if XFS_BIG_BLKNOS
48# if XFS_BIG_INUMS
49# define XFS_BIGFS_STRING "large block/inode numbers, "
50# else
51# define XFS_BIGFS_STRING "large block numbers, "
52# endif
53#else
54# define XFS_BIGFS_STRING
55#endif
56
57#ifdef DEBUG 47#ifdef DEBUG
58# define XFS_DBG_STRING "debug" 48# define XFS_DBG_STRING "debug"
59#else 49#else
@@ -64,7 +54,6 @@ extern void xfs_qm_exit(void);
64#define XFS_BUILD_OPTIONS XFS_ACL_STRING \ 54#define XFS_BUILD_OPTIONS XFS_ACL_STRING \
65 XFS_SECURITY_STRING \ 55 XFS_SECURITY_STRING \
66 XFS_REALTIME_STRING \ 56 XFS_REALTIME_STRING \
67 XFS_BIGFS_STRING \
68 XFS_DBG_STRING /* DBG must be last */ 57 XFS_DBG_STRING /* DBG must be last */
69 58
70struct xfs_inode; 59struct xfs_inode;
@@ -76,8 +65,8 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
76 65
77extern void xfs_flush_inodes(struct xfs_mount *mp); 66extern void xfs_flush_inodes(struct xfs_mount *mp);
78extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 67extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
79extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); 68extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *, xfs_agnumber_t agcount);
80extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); 69extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *, xfs_agnumber_t agcount);
81 70
82extern const struct export_operations xfs_export_operations; 71extern const struct export_operations xfs_export_operations;
83extern const struct xattr_handler *xfs_xattr_handlers[]; 72extern const struct xattr_handler *xfs_xattr_handlers[];
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index d69363c833e1..02ae62a998e0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -76,15 +76,15 @@ xfs_readlink_bmap(
76 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, 76 bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
77 &xfs_symlink_buf_ops); 77 &xfs_symlink_buf_ops);
78 if (!bp) 78 if (!bp)
79 return XFS_ERROR(ENOMEM); 79 return -ENOMEM;
80 error = bp->b_error; 80 error = bp->b_error;
81 if (error) { 81 if (error) {
82 xfs_buf_ioerror_alert(bp, __func__); 82 xfs_buf_ioerror_alert(bp, __func__);
83 xfs_buf_relse(bp); 83 xfs_buf_relse(bp);
84 84
85 /* bad CRC means corrupted metadata */ 85 /* bad CRC means corrupted metadata */
86 if (error == EFSBADCRC) 86 if (error == -EFSBADCRC)
87 error = EFSCORRUPTED; 87 error = -EFSCORRUPTED;
88 goto out; 88 goto out;
89 } 89 }
90 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); 90 byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
@@ -95,7 +95,7 @@ xfs_readlink_bmap(
95 if (xfs_sb_version_hascrc(&mp->m_sb)) { 95 if (xfs_sb_version_hascrc(&mp->m_sb)) {
96 if (!xfs_symlink_hdr_ok(ip->i_ino, offset, 96 if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
97 byte_cnt, bp)) { 97 byte_cnt, bp)) {
98 error = EFSCORRUPTED; 98 error = -EFSCORRUPTED;
99 xfs_alert(mp, 99 xfs_alert(mp,
100"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", 100"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
101 offset, byte_cnt, ip->i_ino); 101 offset, byte_cnt, ip->i_ino);
@@ -135,7 +135,7 @@ xfs_readlink(
135 trace_xfs_readlink(ip); 135 trace_xfs_readlink(ip);
136 136
137 if (XFS_FORCED_SHUTDOWN(mp)) 137 if (XFS_FORCED_SHUTDOWN(mp))
138 return XFS_ERROR(EIO); 138 return -EIO;
139 139
140 xfs_ilock(ip, XFS_ILOCK_SHARED); 140 xfs_ilock(ip, XFS_ILOCK_SHARED);
141 141
@@ -148,7 +148,7 @@ xfs_readlink(
148 __func__, (unsigned long long) ip->i_ino, 148 __func__, (unsigned long long) ip->i_ino,
149 (long long) pathlen); 149 (long long) pathlen);
150 ASSERT(0); 150 ASSERT(0);
151 error = XFS_ERROR(EFSCORRUPTED); 151 error = -EFSCORRUPTED;
152 goto out; 152 goto out;
153 } 153 }
154 154
@@ -203,14 +203,14 @@ xfs_symlink(
203 trace_xfs_symlink(dp, link_name); 203 trace_xfs_symlink(dp, link_name);
204 204
205 if (XFS_FORCED_SHUTDOWN(mp)) 205 if (XFS_FORCED_SHUTDOWN(mp))
206 return XFS_ERROR(EIO); 206 return -EIO;
207 207
208 /* 208 /*
209 * Check component lengths of the target path name. 209 * Check component lengths of the target path name.
210 */ 210 */
211 pathlen = strlen(target_path); 211 pathlen = strlen(target_path);
212 if (pathlen >= MAXPATHLEN) /* total string too long */ 212 if (pathlen >= MAXPATHLEN) /* total string too long */
213 return XFS_ERROR(ENAMETOOLONG); 213 return -ENAMETOOLONG;
214 214
215 udqp = gdqp = NULL; 215 udqp = gdqp = NULL;
216 prid = xfs_get_initial_prid(dp); 216 prid = xfs_get_initial_prid(dp);
@@ -238,7 +238,7 @@ xfs_symlink(
238 fs_blocks = xfs_symlink_blocks(mp, pathlen); 238 fs_blocks = xfs_symlink_blocks(mp, pathlen);
239 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); 239 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0); 240 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, resblks, 0);
241 if (error == ENOSPC && fs_blocks == 0) { 241 if (error == -ENOSPC && fs_blocks == 0) {
242 resblks = 0; 242 resblks = 0;
243 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); 243 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0);
244 } 244 }
@@ -254,7 +254,7 @@ xfs_symlink(
254 * Check whether the directory allows new symlinks or not. 254 * Check whether the directory allows new symlinks or not.
255 */ 255 */
256 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { 256 if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) {
257 error = XFS_ERROR(EPERM); 257 error = -EPERM;
258 goto error_return; 258 goto error_return;
259 } 259 }
260 260
@@ -269,9 +269,11 @@ xfs_symlink(
269 /* 269 /*
270 * Check for ability to enter directory entry, if no space reserved. 270 * Check for ability to enter directory entry, if no space reserved.
271 */ 271 */
272 error = xfs_dir_canenter(tp, dp, link_name, resblks); 272 if (!resblks) {
273 if (error) 273 error = xfs_dir_canenter(tp, dp, link_name);
274 goto error_return; 274 if (error)
275 goto error_return;
276 }
275 /* 277 /*
276 * Initialize the bmap freelist prior to calling either 278 * Initialize the bmap freelist prior to calling either
277 * bmapi or the directory create code. 279 * bmapi or the directory create code.
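
[editor note] xfs_dir_canenter() loses its resblks argument here: instead of the helper deciding internally whether the check applies, callers now skip the probe entirely when they hold a block reservation. The same pattern presumably applies at the other call sites (create, link, rename); the caller-side idiom is simply:

        /* only probe for directory space when nothing was reserved up front */
        if (!resblks) {
                error = xfs_dir_canenter(tp, dp, name);
                if (error)
                        goto error_return;
        }
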
@@ -284,7 +286,7 @@ xfs_symlink(
284 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, 286 error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0,
285 prid, resblks > 0, &ip, NULL); 287 prid, resblks > 0, &ip, NULL);
286 if (error) { 288 if (error) {
287 if (error == ENOSPC) 289 if (error == -ENOSPC)
288 goto error_return; 290 goto error_return;
289 goto error1; 291 goto error1;
290 } 292 }
@@ -348,7 +350,7 @@ xfs_symlink(
348 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, 350 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
349 BTOBB(byte_cnt), 0); 351 BTOBB(byte_cnt), 0);
350 if (!bp) { 352 if (!bp) {
351 error = ENOMEM; 353 error = -ENOMEM;
352 goto error2; 354 goto error2;
353 } 355 }
354 bp->b_ops = &xfs_symlink_buf_ops; 356 bp->b_ops = &xfs_symlink_buf_ops;
@@ -489,7 +491,7 @@ xfs_inactive_symlink_rmt(
489 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock), 491 XFS_FSB_TO_DADDR(mp, mval[i].br_startblock),
490 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0); 492 XFS_FSB_TO_BB(mp, mval[i].br_blockcount), 0);
491 if (!bp) { 493 if (!bp) {
492 error = ENOMEM; 494 error = -ENOMEM;
493 goto error_bmap_cancel; 495 goto error_bmap_cancel;
494 } 496 }
495 xfs_trans_binval(tp, bp); 497 xfs_trans_binval(tp, bp);
@@ -562,7 +564,7 @@ xfs_inactive_symlink(
562 trace_xfs_inactive_symlink(ip); 564 trace_xfs_inactive_symlink(ip);
563 565
564 if (XFS_FORCED_SHUTDOWN(mp)) 566 if (XFS_FORCED_SHUTDOWN(mp))
565 return XFS_ERROR(EIO); 567 return -EIO;
566 568
567 xfs_ilock(ip, XFS_ILOCK_EXCL); 569 xfs_ilock(ip, XFS_ILOCK_EXCL);
568 570
@@ -580,7 +582,7 @@ xfs_inactive_symlink(
580 __func__, (unsigned long long)ip->i_ino, pathlen); 582 __func__, (unsigned long long)ip->i_ino, pathlen);
581 xfs_iunlock(ip, XFS_ILOCK_EXCL); 583 xfs_iunlock(ip, XFS_ILOCK_EXCL);
582 ASSERT(0); 584 ASSERT(0);
583 return XFS_ERROR(EFSCORRUPTED); 585 return -EFSCORRUPTED;
584 } 586 }
585 587
586 if (ip->i_df.if_flags & XFS_IFINLINE) { 588 if (ip->i_df.if_flags & XFS_IFINLINE) {
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index bd8e157c20ef..ffef45375754 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -92,6 +92,11 @@ enum {
92 92
93extern xfs_param_t xfs_params; 93extern xfs_param_t xfs_params;
94 94
95struct xfs_globals {
96 int log_recovery_delay; /* log recovery delay (secs) */
97};
98extern struct xfs_globals xfs_globals;
99
95#ifdef CONFIG_SYSCTL 100#ifdef CONFIG_SYSCTL
96extern int xfs_sysctl_register(void); 101extern int xfs_sysctl_register(void);
97extern void xfs_sysctl_unregister(void); 102extern void xfs_sysctl_unregister(void);
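
[editor note] struct xfs_globals collects tunables exposed through the new debug sysfs directory rather than through sysctl. Elsewhere in this series log_recovery_delay is consumed at mount time so testcases can inject a pause before log recovery runs; a hedged sketch of that consumer (its placement in the recovery path is an assumption, not shown in this diff):

        #include <linux/delay.h>

        /* delay log recovery by up to 60 seconds, for test instrumentation */
        if (xfs_globals.log_recovery_delay) {
                xfs_notice(log->l_mp, "Delaying log recovery for %d seconds.",
                           xfs_globals.log_recovery_delay);
                msleep(xfs_globals.log_recovery_delay * 1000);
        }
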
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
new file mode 100644
index 000000000000..aa03670851d8
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.c
@@ -0,0 +1,239 @@
1/*
2 * Copyright (c) 2014 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19#include "xfs.h"
20#include "xfs_sysfs.h"
21#include "xfs_log_format.h"
22#include "xfs_log.h"
23#include "xfs_log_priv.h"
24
25struct xfs_sysfs_attr {
26 struct attribute attr;
27 ssize_t (*show)(char *buf, void *data);
28 ssize_t (*store)(const char *buf, size_t count, void *data);
29};
30
31static inline struct xfs_sysfs_attr *
32to_attr(struct attribute *attr)
33{
34 return container_of(attr, struct xfs_sysfs_attr, attr);
35}
36
37#define XFS_SYSFS_ATTR_RW(name) \
38 static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
39#define XFS_SYSFS_ATTR_RO(name) \
40 static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
41
42#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
43
44/*
45 * xfs_mount kobject. This currently has no attributes and thus no need for show
46 * and store helpers. The mp kobject serves as the per-mount parent object that
47 * is identified by the fsname under sysfs.
48 */
49
50struct kobj_type xfs_mp_ktype = {
51 .release = xfs_sysfs_release,
52};
53
54#ifdef DEBUG
55/* debug */
56
57STATIC ssize_t
58log_recovery_delay_store(
59 const char *buf,
60 size_t count,
61 void *data)
62{
63 int ret;
64 int val;
65
66 ret = kstrtoint(buf, 0, &val);
67 if (ret)
68 return ret;
69
70 if (val < 0 || val > 60)
71 return -EINVAL;
72
73 xfs_globals.log_recovery_delay = val;
74
75 return count;
76}
77
78STATIC ssize_t
79log_recovery_delay_show(
80 char *buf,
81 void *data)
82{
83 return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
84}
85XFS_SYSFS_ATTR_RW(log_recovery_delay);
86
87static struct attribute *xfs_dbg_attrs[] = {
88 ATTR_LIST(log_recovery_delay),
89 NULL,
90};
91
92STATIC ssize_t
93xfs_dbg_show(
94 struct kobject *kobject,
95 struct attribute *attr,
96 char *buf)
97{
98 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
99
100 return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
101}
102
103STATIC ssize_t
104xfs_dbg_store(
105 struct kobject *kobject,
106 struct attribute *attr,
107 const char *buf,
108 size_t count)
109{
110 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
111
112 return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
113}
114
115static struct sysfs_ops xfs_dbg_ops = {
116 .show = xfs_dbg_show,
117 .store = xfs_dbg_store,
118};
119
120struct kobj_type xfs_dbg_ktype = {
121 .release = xfs_sysfs_release,
122 .sysfs_ops = &xfs_dbg_ops,
123 .default_attrs = xfs_dbg_attrs,
124};
125
126#endif /* DEBUG */
127
128/* xlog */
129
130STATIC ssize_t
131log_head_lsn_show(
132 char *buf,
133 void *data)
134{
135 struct xlog *log = data;
136 int cycle;
137 int block;
138
139 spin_lock(&log->l_icloglock);
140 cycle = log->l_curr_cycle;
141 block = log->l_curr_block;
142 spin_unlock(&log->l_icloglock);
143
144 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
145}
146XFS_SYSFS_ATTR_RO(log_head_lsn);
147
148STATIC ssize_t
149log_tail_lsn_show(
150 char *buf,
151 void *data)
152{
153 struct xlog *log = data;
154 int cycle;
155 int block;
156
157 xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
158 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
159}
160XFS_SYSFS_ATTR_RO(log_tail_lsn);
161
162STATIC ssize_t
163reserve_grant_head_show(
164 char *buf,
165 void *data)
166{
167 struct xlog *log = data;
168 int cycle;
169 int bytes;
170
171 xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
172 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
173}
174XFS_SYSFS_ATTR_RO(reserve_grant_head);
175
176STATIC ssize_t
177write_grant_head_show(
178 char *buf,
179 void *data)
180{
181 struct xlog *log = data;
182 int cycle;
183 int bytes;
184
185 xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
186 return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
187}
188XFS_SYSFS_ATTR_RO(write_grant_head);
189
190static struct attribute *xfs_log_attrs[] = {
191 ATTR_LIST(log_head_lsn),
192 ATTR_LIST(log_tail_lsn),
193 ATTR_LIST(reserve_grant_head),
194 ATTR_LIST(write_grant_head),
195 NULL,
196};
197
198static inline struct xlog *
199to_xlog(struct kobject *kobject)
200{
201 struct xfs_kobj *kobj = to_kobj(kobject);
202 return container_of(kobj, struct xlog, l_kobj);
203}
204
205STATIC ssize_t
206xfs_log_show(
207 struct kobject *kobject,
208 struct attribute *attr,
209 char *buf)
210{
211 struct xlog *log = to_xlog(kobject);
212 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
213
214 return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
215}
216
217STATIC ssize_t
218xfs_log_store(
219 struct kobject *kobject,
220 struct attribute *attr,
221 const char *buf,
222 size_t count)
223{
224 struct xlog *log = to_xlog(kobject);
225 struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
226
227 return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
228}
229
230static struct sysfs_ops xfs_log_ops = {
231 .show = xfs_log_show,
232 .store = xfs_log_store,
233};
234
235struct kobj_type xfs_log_ktype = {
236 .release = xfs_sysfs_release,
237 .sysfs_ops = &xfs_log_ops,
238 .default_attrs = xfs_log_attrs,
239};
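
[editor note] The attribute plumbing above makes new entries cheap: define a show (and optionally store) helper, wrap it with XFS_SYSFS_ATTR_RO/RW, and list it in the ktype's attribute array. Purely as an illustration, a hypothetical read-only attribute exposing the current log cycle would look like this (it is not part of the patch):

        STATIC ssize_t
        log_curr_cycle_show(
                char    *buf,
                void    *data)
        {
                struct xlog     *log = data;
                int             cycle;

                spin_lock(&log->l_icloglock);
                cycle = log->l_curr_cycle;
                spin_unlock(&log->l_icloglock);

                return snprintf(buf, PAGE_SIZE, "%d\n", cycle);
        }
        XFS_SYSFS_ATTR_RO(log_curr_cycle);

        /* and add ATTR_LIST(log_curr_cycle) to xfs_log_attrs[] */
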
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
new file mode 100644
index 000000000000..240eee35f342
--- /dev/null
+++ b/fs/xfs/xfs_sysfs.h
@@ -0,0 +1,60 @@
1/*
2 * Copyright (c) 2014 Red Hat, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19#ifndef __XFS_SYSFS_H__
20#define __XFS_SYSFS_H__
21
22extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
23extern struct kobj_type xfs_dbg_ktype; /* debug */
24extern struct kobj_type xfs_log_ktype; /* xlog */
25
26static inline struct xfs_kobj *
27to_kobj(struct kobject *kobject)
28{
29 return container_of(kobject, struct xfs_kobj, kobject);
30}
31
32static inline void
33xfs_sysfs_release(struct kobject *kobject)
34{
35 struct xfs_kobj *kobj = to_kobj(kobject);
36 complete(&kobj->complete);
37}
38
39static inline int
40xfs_sysfs_init(
41 struct xfs_kobj *kobj,
42 struct kobj_type *ktype,
43 struct xfs_kobj *parent_kobj,
44 const char *name)
45{
46 init_completion(&kobj->complete);
47 return kobject_init_and_add(&kobj->kobject, ktype,
48 &parent_kobj->kobject, "%s", name);
49}
50
51static inline void
52xfs_sysfs_del(
53 struct xfs_kobj *kobj)
54{
55 kobject_del(&kobj->kobject);
56 kobject_put(&kobj->kobject);
57 wait_for_completion(&kobj->complete);
58}
59
60#endif /* __XFS_SYSFS_H__ */
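
[editor note] The init/del pair wraps the standard kobject lifetime plus a completion, so teardown can block until the kobject's release callback has actually run before the containing structure is freed. A likely consumer, attaching the log kobject (l_kobj, referenced by to_xlog() above) under the per-mount kobject; the exact call site is an assumption:

        /* at log setup, e.g. in the log allocation path */
        error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype,
                               &mp->m_kobj, "log");
        if (error)
                goto out_free_log;      /* hypothetical error label */

        /* at log teardown: delete, drop the ref, wait for release */
        xfs_sysfs_del(&log->l_kobj);
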
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 152f82782630..51372e34d988 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free);
349DEFINE_BUF_EVENT(xfs_buf_hold); 349DEFINE_BUF_EVENT(xfs_buf_hold);
350DEFINE_BUF_EVENT(xfs_buf_rele); 350DEFINE_BUF_EVENT(xfs_buf_rele);
351DEFINE_BUF_EVENT(xfs_buf_iodone); 351DEFINE_BUF_EVENT(xfs_buf_iodone);
352DEFINE_BUF_EVENT(xfs_buf_iorequest); 352DEFINE_BUF_EVENT(xfs_buf_submit);
353DEFINE_BUF_EVENT(xfs_buf_submit_wait);
353DEFINE_BUF_EVENT(xfs_buf_bawrite); 354DEFINE_BUF_EVENT(xfs_buf_bawrite);
354DEFINE_BUF_EVENT(xfs_buf_lock); 355DEFINE_BUF_EVENT(xfs_buf_lock);
355DEFINE_BUF_EVENT(xfs_buf_lock_done); 356DEFINE_BUF_EVENT(xfs_buf_lock_done);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index d03932564ccb..30e8e3410955 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -190,7 +190,7 @@ xfs_trans_reserve(
190 -((int64_t)blocks), rsvd); 190 -((int64_t)blocks), rsvd);
191 if (error != 0) { 191 if (error != 0) {
192 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 192 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
193 return (XFS_ERROR(ENOSPC)); 193 return -ENOSPC;
194 } 194 }
195 tp->t_blk_res += blocks; 195 tp->t_blk_res += blocks;
196 } 196 }
@@ -241,7 +241,7 @@ xfs_trans_reserve(
241 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, 241 error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS,
242 -((int64_t)rtextents), rsvd); 242 -((int64_t)rtextents), rsvd);
243 if (error) { 243 if (error) {
244 error = XFS_ERROR(ENOSPC); 244 error = -ENOSPC;
245 goto undo_log; 245 goto undo_log;
246 } 246 }
247 tp->t_rtx_res += rtextents; 247 tp->t_rtx_res += rtextents;
@@ -874,7 +874,7 @@ xfs_trans_commit(
874 goto out_unreserve; 874 goto out_unreserve;
875 875
876 if (XFS_FORCED_SHUTDOWN(mp)) { 876 if (XFS_FORCED_SHUTDOWN(mp)) {
877 error = XFS_ERROR(EIO); 877 error = -EIO;
878 goto out_unreserve; 878 goto out_unreserve;
879 } 879 }
880 880
@@ -917,7 +917,7 @@ out_unreserve:
917 if (tp->t_ticket) { 917 if (tp->t_ticket) {
918 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); 918 commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
919 if (commit_lsn == -1 && !error) 919 if (commit_lsn == -1 && !error)
920 error = XFS_ERROR(EIO); 920 error = -EIO;
921 } 921 }
922 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 922 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
923 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); 923 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
@@ -1024,7 +1024,7 @@ xfs_trans_roll(
1024 */ 1024 */
1025 error = xfs_trans_commit(trans, 0); 1025 error = xfs_trans_commit(trans, 0);
1026 if (error) 1026 if (error)
1027 return (error); 1027 return error;
1028 1028
1029 trans = *tpp; 1029 trans = *tpp;
1030 1030
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index cb0f3a84cc68..859482f53b5a 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -762,7 +762,7 @@ xfs_trans_ail_init(
762 762
763 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 763 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
764 if (!ailp) 764 if (!ailp)
765 return ENOMEM; 765 return -ENOMEM;
766 766
767 ailp->xa_mount = mp; 767 ailp->xa_mount = mp;
768 INIT_LIST_HEAD(&ailp->xa_ail); 768 INIT_LIST_HEAD(&ailp->xa_ail);
@@ -781,7 +781,7 @@ xfs_trans_ail_init(
781 781
782out_free_ailp: 782out_free_ailp:
783 kmem_free(ailp); 783 kmem_free(ailp);
784 return ENOMEM; 784 return -ENOMEM;
785} 785}
786 786
787void 787void
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index b8eef0549f3f..e2b2216b1635 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -166,7 +166,7 @@ xfs_trans_get_buf_map(
166 ASSERT(atomic_read(&bip->bli_refcount) > 0); 166 ASSERT(atomic_read(&bip->bli_refcount) > 0);
167 bip->bli_recur++; 167 bip->bli_recur++;
168 trace_xfs_trans_get_buf_recur(bip); 168 trace_xfs_trans_get_buf_recur(bip);
169 return (bp); 169 return bp;
170 } 170 }
171 171
172 bp = xfs_buf_get_map(target, map, nmaps, flags); 172 bp = xfs_buf_get_map(target, map, nmaps, flags);
@@ -178,7 +178,7 @@ xfs_trans_get_buf_map(
178 178
179 _xfs_trans_bjoin(tp, bp, 1); 179 _xfs_trans_bjoin(tp, bp, 1);
180 trace_xfs_trans_get_buf(bp->b_fspriv); 180 trace_xfs_trans_get_buf(bp->b_fspriv);
181 return (bp); 181 return bp;
182} 182}
183 183
184/* 184/*
@@ -201,9 +201,8 @@ xfs_trans_getsb(xfs_trans_t *tp,
201 * Default to just trying to lock the superblock buffer 201 * Default to just trying to lock the superblock buffer
202 * if tp is NULL. 202 * if tp is NULL.
203 */ 203 */
204 if (tp == NULL) { 204 if (tp == NULL)
205 return (xfs_getsb(mp, flags)); 205 return xfs_getsb(mp, flags);
206 }
207 206
208 /* 207 /*
209 * If the superblock buffer already has this transaction 208 * If the superblock buffer already has this transaction
@@ -218,7 +217,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
218 ASSERT(atomic_read(&bip->bli_refcount) > 0); 217 ASSERT(atomic_read(&bip->bli_refcount) > 0);
219 bip->bli_recur++; 218 bip->bli_recur++;
220 trace_xfs_trans_getsb_recur(bip); 219 trace_xfs_trans_getsb_recur(bip);
221 return (bp); 220 return bp;
222 } 221 }
223 222
224 bp = xfs_getsb(mp, flags); 223 bp = xfs_getsb(mp, flags);
@@ -227,7 +226,7 @@ xfs_trans_getsb(xfs_trans_t *tp,
227 226
228 _xfs_trans_bjoin(tp, bp, 1); 227 _xfs_trans_bjoin(tp, bp, 1);
229 trace_xfs_trans_getsb(bp->b_fspriv); 228 trace_xfs_trans_getsb(bp->b_fspriv);
230 return (bp); 229 return bp;
231} 230}
232 231
233#ifdef DEBUG 232#ifdef DEBUG
@@ -267,7 +266,7 @@ xfs_trans_read_buf_map(
267 bp = xfs_buf_read_map(target, map, nmaps, flags, ops); 266 bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
268 if (!bp) 267 if (!bp)
269 return (flags & XBF_TRYLOCK) ? 268 return (flags & XBF_TRYLOCK) ?
270 EAGAIN : XFS_ERROR(ENOMEM); 269 -EAGAIN : -ENOMEM;
271 270
272 if (bp->b_error) { 271 if (bp->b_error) {
273 error = bp->b_error; 272 error = bp->b_error;
@@ -277,8 +276,8 @@ xfs_trans_read_buf_map(
277 xfs_buf_relse(bp); 276 xfs_buf_relse(bp);
278 277
279 /* bad CRC means corrupted metadata */ 278 /* bad CRC means corrupted metadata */
280 if (error == EFSBADCRC) 279 if (error == -EFSBADCRC)
281 error = EFSCORRUPTED; 280 error = -EFSCORRUPTED;
282 return error; 281 return error;
283 } 282 }
284#ifdef DEBUG 283#ifdef DEBUG
@@ -287,7 +286,7 @@ xfs_trans_read_buf_map(
287 if (((xfs_req_num++) % xfs_error_mod) == 0) { 286 if (((xfs_req_num++) % xfs_error_mod) == 0) {
288 xfs_buf_relse(bp); 287 xfs_buf_relse(bp);
289 xfs_debug(mp, "Returning error!"); 288 xfs_debug(mp, "Returning error!");
290 return XFS_ERROR(EIO); 289 return -EIO;
291 } 290 }
292 } 291 }
293 } 292 }
@@ -319,20 +318,10 @@ xfs_trans_read_buf_map(
319 XFS_BUF_READ(bp); 318 XFS_BUF_READ(bp);
320 bp->b_ops = ops; 319 bp->b_ops = ops;
321 320
322 /* 321 error = xfs_buf_submit_wait(bp);
323 * XXX(hch): clean up the error handling here to be less
324 * of a mess..
325 */
326 if (XFS_FORCED_SHUTDOWN(mp)) {
327 trace_xfs_bdstrat_shut(bp, _RET_IP_);
328 xfs_bioerror_relse(bp);
329 } else {
330 xfs_buf_iorequest(bp);
331 }
332
333 error = xfs_buf_iowait(bp);
334 if (error) { 322 if (error) {
335 xfs_buf_ioerror_alert(bp, __func__); 323 if (!XFS_FORCED_SHUTDOWN(mp))
324 xfs_buf_ioerror_alert(bp, __func__);
336 xfs_buf_relse(bp); 325 xfs_buf_relse(bp);
337 /* 326 /*
338 * We can gracefully recover from most read 327 * We can gracefully recover from most read
@@ -343,8 +332,8 @@ xfs_trans_read_buf_map(
343 xfs_force_shutdown(tp->t_mountp, 332 xfs_force_shutdown(tp->t_mountp,
344 SHUTDOWN_META_IO_ERROR); 333 SHUTDOWN_META_IO_ERROR);
345 /* bad CRC means corrupted metadata */ 334 /* bad CRC means corrupted metadata */
346 if (error == EFSBADCRC) 335 if (error == -EFSBADCRC)
347 error = EFSCORRUPTED; 336 error = -EFSCORRUPTED;
348 return error; 337 return error;
349 } 338 }
350 } 339 }
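
[editor note] With the forced-shutdown check folded into the buffer layer, the open-coded shutdown test, xfs_buf_iorequest() and xfs_buf_iowait() sequence collapses into one xfs_buf_submit_wait() call returning a negative errno. Consolidated for readability, the post-patch read path reads roughly as follows; the dirty-transaction condition is recalled from the surrounding source, not visible in this hunk:

        XFS_BUF_READ(bp);
        bp->b_ops = ops;

        error = xfs_buf_submit_wait(bp);        /* checks forced shutdown itself */
        if (error) {
                if (!XFS_FORCED_SHUTDOWN(mp))
                        xfs_buf_ioerror_alert(bp, __func__);
                xfs_buf_relse(bp);
                /* a read error after the transaction is dirty is fatal */
                if (tp->t_flags & XFS_TRANS_DIRTY)
                        xfs_force_shutdown(tp->t_mountp,
                                           SHUTDOWN_META_IO_ERROR);
                /* bad CRC means corrupted metadata */
                if (error == -EFSBADCRC)
                        error = -EFSCORRUPTED;
                return error;
        }
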
@@ -355,7 +344,7 @@ xfs_trans_read_buf_map(
355 if (XFS_FORCED_SHUTDOWN(mp)) { 344 if (XFS_FORCED_SHUTDOWN(mp)) {
356 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 345 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
357 *bpp = NULL; 346 *bpp = NULL;
358 return XFS_ERROR(EIO); 347 return -EIO;
359 } 348 }
360 349
361 350
@@ -372,7 +361,7 @@ xfs_trans_read_buf_map(
372 if (bp == NULL) { 361 if (bp == NULL) {
373 *bpp = NULL; 362 *bpp = NULL;
374 return (flags & XBF_TRYLOCK) ? 363 return (flags & XBF_TRYLOCK) ?
375 0 : XFS_ERROR(ENOMEM); 364 0 : -ENOMEM;
376 } 365 }
377 if (bp->b_error) { 366 if (bp->b_error) {
378 error = bp->b_error; 367 error = bp->b_error;
@@ -384,8 +373,8 @@ xfs_trans_read_buf_map(
384 xfs_buf_relse(bp); 373 xfs_buf_relse(bp);
385 374
386 /* bad CRC means corrupted metadata */ 375 /* bad CRC means corrupted metadata */
387 if (error == EFSBADCRC) 376 if (error == -EFSBADCRC)
388 error = EFSCORRUPTED; 377 error = -EFSCORRUPTED;
389 return error; 378 return error;
390 } 379 }
391#ifdef DEBUG 380#ifdef DEBUG
@@ -396,7 +385,7 @@ xfs_trans_read_buf_map(
396 SHUTDOWN_META_IO_ERROR); 385 SHUTDOWN_META_IO_ERROR);
397 xfs_buf_relse(bp); 386 xfs_buf_relse(bp);
398 xfs_debug(mp, "Returning trans error!"); 387 xfs_debug(mp, "Returning trans error!");
399 return XFS_ERROR(EIO); 388 return -EIO;
400 } 389 }
401 } 390 }
402 } 391 }
@@ -414,7 +403,7 @@ shutdown_abort:
414 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 403 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
415 xfs_buf_relse(bp); 404 xfs_buf_relse(bp);
416 *bpp = NULL; 405 *bpp = NULL;
417 return XFS_ERROR(EIO); 406 return -EIO;
418} 407}
419 408
420/* 409/*
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 41172861e857..846e061c2e98 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -722,8 +722,8 @@ xfs_trans_dqresv(
722error_return: 722error_return:
723 xfs_dqunlock(dqp); 723 xfs_dqunlock(dqp);
724 if (flags & XFS_QMOPT_ENOSPC) 724 if (flags & XFS_QMOPT_ENOSPC)
725 return ENOSPC; 725 return -ENOSPC;
726 return EDQUOT; 726 return -EDQUOT;
727} 727}
728 728
729 729
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 50c3f5614288..cdb4d86520e1 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -70,7 +70,7 @@ xfs_trans_ichgtime(
70 int flags) 70 int flags)
71{ 71{
72 struct inode *inode = VFS_I(ip); 72 struct inode *inode = VFS_I(ip);
73 timespec_t tv; 73 struct timespec tv;
74 74
75 ASSERT(tp); 75 ASSERT(tp);
76 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 76 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 65c6e6650b1a..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -38,43 +38,18 @@ typedef __int32_t xfs_tid_t; /* transaction identifier */
38typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */ 38typedef __uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
39typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */ 39typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
40 40
41/*
42 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
43 * Disk based types:
44 */
45typedef __uint64_t xfs_dfsbno_t; /* blockno in filesystem (agno|agbno) */
46typedef __uint64_t xfs_drfsbno_t; /* blockno in filesystem (raw) */
47typedef __uint64_t xfs_drtbno_t; /* extent (block) in realtime area */
48typedef __uint64_t xfs_dfiloff_t; /* block number in a file */
49typedef __uint64_t xfs_dfilblks_t; /* number of blocks in a file */
50
51/*
52 * Memory based types are conditional.
53 */
54#if XFS_BIG_BLKNOS
55typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */ 41typedef __uint64_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
56typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */ 42typedef __uint64_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
57typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */ 43typedef __uint64_t xfs_rtblock_t; /* extent (block) in realtime area */
58typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
59#else
60typedef __uint32_t xfs_fsblock_t; /* blockno in filesystem (agno|agbno) */
61typedef __uint32_t xfs_rfsblock_t; /* blockno in filesystem (raw) */
62typedef __uint32_t xfs_rtblock_t; /* extent (block) in realtime area */
63typedef __int32_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
64#endif
65typedef __uint64_t xfs_fileoff_t; /* block number in a file */ 44typedef __uint64_t xfs_fileoff_t; /* block number in a file */
66typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
67typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */ 45typedef __uint64_t xfs_filblks_t; /* number of blocks in a file */
68 46
47typedef __int64_t xfs_srtblock_t; /* signed version of xfs_rtblock_t */
48typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */
69 49
70/* 50/*
71 * Null values for the types. 51 * Null values for the types.
72 */ 52 */
73#define NULLDFSBNO ((xfs_dfsbno_t)-1)
74#define NULLDRFSBNO ((xfs_drfsbno_t)-1)
75#define NULLDRTBNO ((xfs_drtbno_t)-1)
76#define NULLDFILOFF ((xfs_dfiloff_t)-1)
77
78#define NULLFSBLOCK ((xfs_fsblock_t)-1) 53#define NULLFSBLOCK ((xfs_fsblock_t)-1)
79#define NULLRFSBLOCK ((xfs_rfsblock_t)-1) 54#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
80#define NULLRTBLOCK ((xfs_rtblock_t)-1) 55#define NULLRTBLOCK ((xfs_rtblock_t)-1)
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
deleted file mode 100644
index e8a77383c0d5..000000000000
--- a/fs/xfs/xfs_vnode.h
+++ /dev/null
@@ -1,46 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_VNODE_H__
19#define __XFS_VNODE_H__
20
21#include "xfs_fs.h"
22
23struct file;
24struct xfs_inode;
25struct attrlist_cursor_kern;
26
27/*
28 * Flags for read/write calls - same values as IRIX
29 */
30#define IO_ISDIRECT 0x00004 /* bypass page cache */
31#define IO_INVIS 0x00020 /* don't update inode timestamps */
32
33#define XFS_IO_FLAGS \
34 { IO_ISDIRECT, "DIRECT" }, \
35 { IO_INVIS, "INVIS"}
36
37/*
38 * Some useful predicates.
39 */
40#define VN_MAPPED(vp) mapping_mapped(vp->i_mapping)
41#define VN_CACHED(vp) (vp->i_mapping->nrpages)
42#define VN_DIRTY(vp) mapping_tagged(vp->i_mapping, \
43 PAGECACHE_TAG_DIRTY)
44
45
46#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 78ed92a46fdd..93455b998041 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -49,7 +49,7 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
49 value = NULL; 49 value = NULL;
50 } 50 }
51 51
52 error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags); 52 error = xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
53 if (error) 53 if (error)
54 return error; 54 return error;
55 return asize; 55 return asize;
@@ -71,8 +71,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
71 xflags |= ATTR_REPLACE; 71 xflags |= ATTR_REPLACE;
72 72
73 if (!value) 73 if (!value)
74 return -xfs_attr_remove(ip, (unsigned char *)name, xflags); 74 return xfs_attr_remove(ip, (unsigned char *)name, xflags);
75 return -xfs_attr_set(ip, (unsigned char *)name, 75 return xfs_attr_set(ip, (unsigned char *)name,
76 (void *)value, size, xflags); 76 (void *)value, size, xflags);
77} 77}
78 78