aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDmitry Torokhov <dmitry.torokhov@gmail.com>2011-05-24 03:06:26 -0400
committerDmitry Torokhov <dmitry.torokhov@gmail.com>2011-05-24 03:06:26 -0400
commitb73077eb03f510a84b102fb97640e595a958403c (patch)
tree8b639000418e2756bf6baece4e00e07d2534bccc /fs
parent28350e330cfab46b60a1dbf763b678d859f9f3d9 (diff)
parent9d2e173644bb5c42ff1b280fbdda3f195a7cf1f7 (diff)
Merge branch 'next' into for-linus
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c34
-rw-r--r--fs/9p/cache.c204
-rw-r--r--fs/9p/cache.h64
-rw-r--r--fs/9p/fid.c116
-rw-r--r--fs/9p/fid.h5
-rw-r--r--fs/9p/v9fs.c108
-rw-r--r--fs/9p/v9fs.h58
-rw-r--r--fs/9p/v9fs_vfs.h26
-rw-r--r--fs/9p/vfs_addr.c194
-rw-r--r--fs/9p/vfs_dentry.c49
-rw-r--r--fs/9p/vfs_dir.c1
-rw-r--r--fs/9p/vfs_file.c323
-rw-r--r--fs/9p/vfs_inode.c322
-rw-r--r--fs/9p/vfs_inode_dotl.c208
-rw-r--r--fs/9p/vfs_super.c81
-rw-r--r--fs/Kconfig3
-rw-r--r--fs/Makefile3
-rw-r--r--fs/adfs/Kconfig1
-rw-r--r--fs/adfs/adfs.h25
-rw-r--r--fs/adfs/dir.c6
-rw-r--r--fs/adfs/dir_f.c23
-rw-r--r--fs/adfs/dir_fplus.c119
-rw-r--r--fs/adfs/inode.c69
-rw-r--r--fs/adfs/map.c2
-rw-r--r--fs/adfs/super.c36
-rw-r--r--fs/affs/Makefile2
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/cache.c12
-rw-r--r--fs/afs/cell.c2
-rw-r--r--fs/afs/write.c1
-rw-r--r--fs/aio.c137
-rw-r--r--fs/attr.c6
-rw-r--r--fs/autofs4/autofs_i.h2
-rw-r--r--fs/autofs4/dev-ioctl.c4
-rw-r--r--fs/autofs4/expire.c84
-rw-r--r--fs/autofs4/root.c70
-rw-r--r--fs/autofs4/waitq.c6
-rw-r--r--fs/befs/ChangeLog10
-rw-r--r--fs/befs/befs_fs_types.h2
-rw-r--r--fs/befs/btree.c2
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/dir.c2
-rw-r--r--fs/bfs/file.c1
-rw-r--r--fs/binfmt_elf.c10
-rw-r--r--fs/binfmt_flat.c2
-rw-r--r--fs/bio-integrity.c3
-rw-r--r--fs/bio.c14
-rw-r--r--fs/block_dev.c65
-rw-r--r--fs/btrfs/acl.c17
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/compression.c44
-rw-r--r--fs/btrfs/ctree.c159
-rw-r--r--fs/btrfs/ctree.h44
-rw-r--r--fs/btrfs/delayed-ref.c6
-rw-r--r--fs/btrfs/dir-item.c45
-rw-r--r--fs/btrfs/disk-io.c234
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c480
-rw-r--r--fs/btrfs/extent_io.c306
-rw-r--r--fs/btrfs/extent_io.h5
-rw-r--r--fs/btrfs/extent_map.c6
-rw-r--r--fs/btrfs/file-item.c10
-rw-r--r--fs/btrfs/file.c500
-rw-r--r--fs/btrfs/free-space-cache.c863
-rw-r--r--fs/btrfs/free-space-cache.h2
-rw-r--r--fs/btrfs/inode-map.c3
-rw-r--r--fs/btrfs/inode.c739
-rw-r--r--fs/btrfs/ioctl.c146
-rw-r--r--fs/btrfs/lzo.c21
-rw-r--r--fs/btrfs/ordered-data.c10
-rw-r--r--fs/btrfs/print-tree.c1
-rw-r--r--fs/btrfs/relocation.c53
-rw-r--r--fs/btrfs/root-tree.c24
-rw-r--r--fs/btrfs/super.c80
-rw-r--r--fs/btrfs/transaction.c69
-rw-r--r--fs/btrfs/transaction.h4
-rw-r--r--fs/btrfs/tree-log.c92
-rw-r--r--fs/btrfs/volumes.c269
-rw-r--r--fs/btrfs/volumes.h12
-rw-r--r--fs/btrfs/xattr.c41
-rw-r--r--fs/btrfs/xattr.h3
-rw-r--r--fs/btrfs/zlib.c3
-rw-r--r--fs/buffer.c53
-rw-r--r--fs/cachefiles/interface.c2
-rw-r--r--fs/cachefiles/namei.c52
-rw-r--r--fs/ceph/addr.c4
-rw-r--r--fs/ceph/caps.c2
-rw-r--r--fs/ceph/debugfs.c6
-rw-r--r--fs/ceph/dir.c49
-rw-r--r--fs/ceph/file.c10
-rw-r--r--fs/ceph/inode.c27
-rw-r--r--fs/ceph/mds_client.c6
-rw-r--r--fs/ceph/snap.c20
-rw-r--r--fs/ceph/super.c11
-rw-r--r--fs/ceph/super.h66
-rw-r--r--fs/cifs/AUTHORS2
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/README16
-rw-r--r--fs/cifs/cache.c2
-rw-r--r--fs/cifs/cifs_debug.c43
-rw-r--r--fs/cifs/cifs_dfs_ref.c11
-rw-r--r--fs/cifs/cifs_spnego.c4
-rw-r--r--fs/cifs/cifs_unicode.c35
-rw-r--r--fs/cifs/cifs_unicode.h2
-rw-r--r--fs/cifs/cifsacl.c4
-rw-r--r--fs/cifs/cifsencrypt.c26
-rw-r--r--fs/cifs/cifsfs.c6
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h17
-rw-r--r--fs/cifs/cifssmb.c24
-rw-r--r--fs/cifs/connect.c119
-rw-r--r--fs/cifs/dir.c2
-rw-r--r--fs/cifs/file.c115
-rw-r--r--fs/cifs/link.c7
-rw-r--r--fs/cifs/misc.c119
-rw-r--r--fs/cifs/netmisc.c8
-rw-r--r--fs/cifs/readdir.c3
-rw-r--r--fs/cifs/sess.c31
-rw-r--r--fs/cifs/smbencrypt.c3
-rw-r--r--fs/cifs/transport.c69
-rw-r--r--fs/coda/Makefile2
-rw-r--r--fs/coda/sysctl.c17
-rw-r--r--fs/compat.c72
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/dcache.c130
-rw-r--r--fs/debugfs/inode.c26
-rw-r--r--fs/devpts/inode.c21
-rw-r--r--fs/direct-io.c13
-rw-r--r--fs/dlm/ast.c257
-rw-r--r--fs/dlm/ast.h7
-rw-r--r--fs/dlm/config.c4
-rw-r--r--fs/dlm/debug_fs.c4
-rw-r--r--fs/dlm/dlm_internal.h35
-rw-r--r--fs/dlm/lock.c40
-rw-r--r--fs/dlm/lowcomms.c10
-rw-r--r--fs/dlm/rcom.c4
-rw-r--r--fs/dlm/recover.c2
-rw-r--r--fs/dlm/user.c185
-rw-r--r--fs/dlm/user.h3
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/crypto.c2
-rw-r--r--fs/ecryptfs/dentry.c22
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h33
-rw-r--r--fs/ecryptfs/file.c10
-rw-r--r--fs/ecryptfs/inode.c162
-rw-r--r--fs/ecryptfs/keystore.c272
-rw-r--r--fs/ecryptfs/main.c14
-rw-r--r--fs/ecryptfs/mmap.c61
-rw-r--r--fs/ecryptfs/read_write.c12
-rw-r--r--fs/ecryptfs/super.c3
-rw-r--r--fs/efs/inode.c1
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c183
-rw-r--r--fs/exec.c20
-rw-r--r--fs/exofs/common.h22
-rw-r--r--fs/exofs/dir.c33
-rw-r--r--fs/exofs/exofs.h6
-rw-r--r--fs/exofs/file.c16
-rw-r--r--fs/exofs/inode.c50
-rw-r--r--fs/exofs/namei.c8
-rw-r--r--fs/exofs/super.c190
-rw-r--r--fs/exportfs/expfs.c11
-rw-r--r--fs/ext2/acl.c2
-rw-r--r--fs/ext2/balloc.c6
-rw-r--r--fs/ext2/ext2.h8
-rw-r--r--fs/ext2/ialloc.c5
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext2/ioctl.c6
-rw-r--r--fs/ext2/namei.c17
-rw-r--r--fs/ext2/super.c2
-rw-r--r--fs/ext2/xattr.c2
-rw-r--r--fs/ext2/xattr.h6
-rw-r--r--fs/ext2/xattr_security.c5
-rw-r--r--fs/ext3/acl.c2
-rw-r--r--fs/ext3/balloc.c31
-rw-r--r--fs/ext3/ialloc.c5
-rw-r--r--fs/ext3/inode.c11
-rw-r--r--fs/ext3/ioctl.c6
-rw-r--r--fs/ext3/namei.c17
-rw-r--r--fs/ext3/resize.c2
-rw-r--r--fs/ext3/super.c10
-rw-r--r--fs/ext3/xattr.h4
-rw-r--r--fs/ext3/xattr_security.c5
-rw-r--r--fs/ext4/acl.c2
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/ext4_jbd2.h11
-rw-r--r--fs/ext4/extents.c237
-rw-r--r--fs/ext4/file.c60
-rw-r--r--fs/ext4/fsync.c33
-rw-r--r--fs/ext4/ialloc.c10
-rw-r--r--fs/ext4/inode.c467
-rw-r--r--fs/ext4/ioctl.c15
-rw-r--r--fs/ext4/mballoc.c136
-rw-r--r--fs/ext4/mballoc.h2
-rw-r--r--fs/ext4/migrate.c12
-rw-r--r--fs/ext4/namei.c20
-rw-r--r--fs/ext4/page-io.c52
-rw-r--r--fs/ext4/resize.c12
-rw-r--r--fs/ext4/super.c199
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--fs/ext4/xattr.h4
-rw-r--r--fs/ext4/xattr_security.c5
-rw-r--r--fs/fat/inode.c5
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/fcntl.c39
-rw-r--r--fs/fhandle.c266
-rw-r--r--fs/fifo.c3
-rw-r--r--fs/file_table.c66
-rw-r--r--fs/filesystems.c3
-rw-r--r--fs/freevxfs/vxfs_fshead.c2
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/freevxfs/vxfs_olt.h2
-rw-r--r--fs/freevxfs/vxfs_subr.c1
-rw-r--r--fs/fs-writeback.c143
-rw-r--r--fs/fuse/cuse.c14
-rw-r--r--fs/fuse/dev.c27
-rw-r--r--fs/fuse/dir.c45
-rw-r--r--fs/fuse/file.c56
-rw-r--r--fs/fuse/fuse_i.h7
-rw-r--r--fs/fuse/inode.c5
-rw-r--r--fs/generic_acl.c2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c7
-rw-r--r--fs/gfs2/aops.c4
-rw-r--r--fs/gfs2/bmap.c22
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/export.c8
-rw-r--r--fs/gfs2/file.c79
-rw-r--r--fs/gfs2/glock.c416
-rw-r--r--fs/gfs2/glock.h39
-rw-r--r--fs/gfs2/glops.c33
-rw-r--r--fs/gfs2/incore.h7
-rw-r--r--fs/gfs2/inode.c7
-rw-r--r--fs/gfs2/lock_dlm.c14
-rw-r--r--fs/gfs2/log.c36
-rw-r--r--fs/gfs2/lops.c22
-rw-r--r--fs/gfs2/main.c17
-rw-r--r--fs/gfs2/meta_io.c5
-rw-r--r--fs/gfs2/ops_fstype.c11
-rw-r--r--fs/gfs2/ops_inode.c10
-rw-r--r--fs/gfs2/quota.c14
-rw-r--r--fs/gfs2/rgrp.c34
-rw-r--r--fs/gfs2/rgrp.h2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hfs/dir.c50
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfsplus/extents.c4
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c2
-rw-r--r--fs/hfsplus/part_tbl.c4
-rw-r--r--fs/hfsplus/super.c106
-rw-r--r--fs/hfsplus/wrapper.c4
-rw-r--r--fs/hpfs/Kconfig2
-rw-r--r--fs/hpfs/dir.c23
-rw-r--r--fs/hpfs/file.c10
-rw-r--r--fs/hpfs/hpfs_fn.h22
-rw-r--r--fs/hpfs/inode.c9
-rw-r--r--fs/hpfs/namei.c49
-rw-r--r--fs/hpfs/super.c23
-rw-r--r--fs/hugetlbfs/inode.c3
-rw-r--r--fs/inode.c740
-rw-r--r--fs/internal.h28
-rw-r--r--fs/ioctl.c28
-rw-r--r--fs/isofs/export.c8
-rw-r--r--fs/isofs/inode.c1
-rw-r--r--fs/jbd/commit.c24
-rw-r--r--fs/jbd/journal.c6
-rw-r--r--fs/jbd/revoke.c2
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/jbd2/commit.c28
-rw-r--r--fs/jbd2/journal.c18
-rw-r--r--fs/jbd2/revoke.c2
-rw-r--r--fs/jbd2/transaction.c23
-rw-r--r--fs/jffs2/TODO2
-rw-r--r--fs/jffs2/acl.c2
-rw-r--r--fs/jffs2/compr_zlib.c7
-rw-r--r--fs/jffs2/dir.c9
-rw-r--r--fs/jffs2/nodelist.h2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/security.c5
-rw-r--r--fs/jffs2/summary.c4
-rw-r--r--fs/jffs2/wbuf.c2
-rw-r--r--fs/jffs2/write.c18
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jffs2/xattr.h5
-rw-r--r--fs/jfs/Makefile2
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/ioctl.c2
-rw-r--r--fs/jfs/jfs_dmap.c4
-rw-r--r--fs/jfs/jfs_extent.c6
-rw-r--r--fs/jfs/jfs_imap.c14
-rw-r--r--fs/jfs/jfs_logmgr.h2
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_metapage.h2
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xattr.h5
-rw-r--r--fs/jfs/namei.c13
-rw-r--r--fs/jfs/resize.c4
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/jfs/xattr.c8
-rw-r--r--fs/locks.c13
-rw-r--r--fs/logfs/compr.c2
-rw-r--r--fs/logfs/dev_bdev.c2
-rw-r--r--fs/logfs/dev_mtd.c2
-rw-r--r--fs/logfs/dir.c2
-rw-r--r--fs/logfs/file.c2
-rw-r--r--fs/logfs/inode.c2
-rw-r--r--fs/logfs/readwrite.c2
-rw-r--r--fs/mbcache.c2
-rw-r--r--fs/minix/Kconfig8
-rw-r--r--fs/minix/inode.c1
-rw-r--r--fs/minix/minix.h74
-rw-r--r--fs/minix/namei.c8
-rw-r--r--fs/mpage.c8
-rw-r--r--fs/namei.c1601
-rw-r--r--fs/namespace.c339
-rw-r--r--fs/ncpfs/Makefile2
-rw-r--r--fs/ncpfs/inode.c2
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c2
-rw-r--r--fs/nfs/client.c131
-rw-r--r--fs/nfs/dir.c102
-rw-r--r--fs/nfs/direct.c8
-rw-r--r--fs/nfs/file.c9
-rw-r--r--fs/nfs/getroot.c46
-rw-r--r--fs/nfs/idmap.c90
-rw-r--r--fs/nfs/inode.c19
-rw-r--r--fs/nfs/internal.h70
-rw-r--r--fs/nfs/namespace.c173
-rw-r--r--fs/nfs/nfs3proc.c3
-rw-r--r--fs/nfs/nfs4_fs.h43
-rw-r--r--fs/nfs/nfs4filelayout.c695
-rw-r--r--fs/nfs/nfs4filelayout.h23
-rw-r--r--fs/nfs/nfs4filelayoutdev.c434
-rw-r--r--fs/nfs/nfs4namespace.c41
-rw-r--r--fs/nfs/nfs4proc.c554
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c38
-rw-r--r--fs/nfs/nfs4xdr.c355
-rw-r--r--fs/nfs/nfsroot.c29
-rw-r--r--fs/nfs/pagelist.c34
-rw-r--r--fs/nfs/pnfs.c462
-rw-r--r--fs/nfs/pnfs.h201
-rw-r--r--fs/nfs/proc.c3
-rw-r--r--fs/nfs/read.c127
-rw-r--r--fs/nfs/super.c478
-rw-r--r--fs/nfs/unlink.c22
-rw-r--r--fs/nfs/write.c384
-rw-r--r--fs/nfs_common/nfsacl.c3
-rw-r--r--fs/nfsctl.c21
-rw-r--r--fs/nfsd/export.c1
-rw-r--r--fs/nfsd/lockd.c1
-rw-r--r--fs/nfsd/nfs3xdr.c2
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4idmap.c1
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c356
-rw-r--r--fs/nfsd/nfs4xdr.c11
-rw-r--r--fs/nfsd/nfsctl.c35
-rw-r--r--fs/nfsd/nfsxdr.c2
-rw-r--r--fs/nfsd/state.h17
-rw-r--r--fs/nfsd/vfs.c21
-rw-r--r--fs/nilfs2/alloc.c12
-rw-r--r--fs/nilfs2/alloc.h2
-rw-r--r--fs/nilfs2/bmap.c12
-rw-r--r--fs/nilfs2/bmap.h3
-rw-r--r--fs/nilfs2/btnode.c12
-rw-r--r--fs/nilfs2/btnode.h1
-rw-r--r--fs/nilfs2/btree.c6
-rw-r--r--fs/nilfs2/dir.c5
-rw-r--r--fs/nilfs2/direct.c4
-rw-r--r--fs/nilfs2/file.c15
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/inode.c84
-rw-r--r--fs/nilfs2/ioctl.c115
-rw-r--r--fs/nilfs2/mdt.c13
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c10
-rw-r--r--fs/nilfs2/nilfs.h47
-rw-r--r--fs/nilfs2/page.c18
-rw-r--r--fs/nilfs2/page.h4
-rw-r--r--fs/nilfs2/recovery.c32
-rw-r--r--fs/nilfs2/sb.h85
-rw-r--r--fs/nilfs2/segbuf.c2
-rw-r--r--fs/nilfs2/segment.c261
-rw-r--r--fs/nilfs2/segment.h14
-rw-r--r--fs/nilfs2/super.c216
-rw-r--r--fs/nilfs2/the_nilfs.c44
-rw-r--r--fs/nilfs2/the_nilfs.h51
-rw-r--r--fs/notify/fanotify/fanotify_user.c4
-rw-r--r--fs/notify/inode_mark.c42
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c3
-rw-r--r--fs/notify/inotify/inotify_user.c41
-rw-r--r--fs/notify/mark.c3
-rw-r--r--fs/notify/vfsmount_mark.c1
-rw-r--r--fs/ntfs/Makefile19
-rw-r--r--fs/ntfs/aops.c4
-rw-r--r--fs/ntfs/attrib.c4
-rw-r--r--fs/ntfs/compress.c5
-rw-r--r--fs/ntfs/inode.c8
-rw-r--r--fs/ntfs/layout.h12
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ntfs/logfile.h2
-rw-r--r--fs/ntfs/mft.c8
-rw-r--r--fs/ntfs/runlist.c2
-rw-r--r--fs/ntfs/super.c14
-rw-r--r--fs/ocfs2/Makefile4
-rw-r--r--fs/ocfs2/acl.c3
-rw-r--r--fs/ocfs2/alloc.c216
-rw-r--r--fs/ocfs2/aops.c83
-rw-r--r--fs/ocfs2/aops.h2
-rw-r--r--fs/ocfs2/buffer_head_io.c49
-rw-r--r--fs/ocfs2/cluster/heartbeat.c9
-rw-r--r--fs/ocfs2/cluster/masklog.c20
-rw-r--r--fs/ocfs2/cluster/masklog.h105
-rw-r--r--fs/ocfs2/cluster/quorum.c4
-rw-r--r--fs/ocfs2/cluster/tcp.c12
-rw-r--r--fs/ocfs2/dcache.c47
-rw-r--r--fs/ocfs2/dir.c123
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c6
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c36
-rw-r--r--fs/ocfs2/dlm/dlmlock.c10
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c10
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c9
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/dlmfs/Makefile2
-rw-r--r--fs/ocfs2/dlmglue.c246
-rw-r--r--fs/ocfs2/export.c55
-rw-r--r--fs/ocfs2/extent_map.c10
-rw-r--r--fs/ocfs2/file.c220
-rw-r--r--fs/ocfs2/heartbeat.c4
-rw-r--r--fs/ocfs2/inode.c138
-rw-r--r--fs/ocfs2/ioctl.c43
-rw-r--r--fs/ocfs2/journal.c170
-rw-r--r--fs/ocfs2/journal.h8
-rw-r--r--fs/ocfs2/localalloc.c109
-rw-r--r--fs/ocfs2/locks.c1
-rw-r--r--fs/ocfs2/mmap.c7
-rw-r--r--fs/ocfs2/namei.c181
-rw-r--r--fs/ocfs2/ocfs2.h33
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/ocfs2_trace.h2739
-rw-r--r--fs/ocfs2/quota.h3
-rw-r--r--fs/ocfs2/quota_global.c74
-rw-r--r--fs/ocfs2/quota_local.c16
-rw-r--r--fs/ocfs2/refcounttree.c170
-rw-r--r--fs/ocfs2/reservations.c57
-rw-r--r--fs/ocfs2/reservations.h2
-rw-r--r--fs/ocfs2/resize.c23
-rw-r--r--fs/ocfs2/slot_map.c16
-rw-r--r--fs/ocfs2/stackglue.h2
-rw-r--r--fs/ocfs2/suballoc.c193
-rw-r--r--fs/ocfs2/super.c126
-rw-r--r--fs/ocfs2/symlink.c14
-rw-r--r--fs/ocfs2/sysfile.c1
-rw-r--r--fs/ocfs2/uptodate.c73
-rw-r--r--fs/ocfs2/xattr.c169
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/omfs/dir.c66
-rw-r--r--fs/omfs/file.c1
-rw-r--r--fs/open.c152
-rw-r--r--fs/partitions/check.c7
-rw-r--r--fs/partitions/ldm.c21
-rw-r--r--fs/partitions/mac.c17
-rw-r--r--fs/partitions/osf.c12
-rw-r--r--fs/proc/array.c7
-rw-r--r--fs/proc/base.c219
-rw-r--r--fs/proc/generic.c8
-rw-r--r--fs/proc/inode.c10
-rw-r--r--fs/proc/internal.h1
-rw-r--r--fs/proc/proc_devtree.c2
-rw-r--r--fs/proc/proc_sysctl.c8
-rw-r--r--fs/proc/root.c32
-rw-r--r--fs/proc/task_mmu.c138
-rw-r--r--fs/proc/task_nommu.c6
-rw-r--r--fs/pstore/Kconfig13
-rw-r--r--fs/pstore/Makefile7
-rw-r--r--fs/pstore/inode.c311
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c201
-rw-r--r--fs/qnx4/inode.c1
-rw-r--r--fs/quota/dquot.c56
-rw-r--r--fs/quota/quota_v2.c2
-rw-r--r--fs/ramfs/file-nommu.c1
-rw-r--r--fs/reiserfs/Makefile4
-rw-r--r--fs/reiserfs/inode.c8
-rw-r--r--fs/reiserfs/ioctl.c4
-rw-r--r--fs/reiserfs/journal.c6
-rw-r--r--fs/reiserfs/lock.c2
-rw-r--r--fs/reiserfs/namei.c15
-rw-r--r--fs/reiserfs/super.c4
-rw-r--r--fs/reiserfs/xattr.c4
-rw-r--r--fs/reiserfs/xattr_acl.c2
-rw-r--r--fs/reiserfs/xattr_security.c3
-rw-r--r--fs/select.c3
-rw-r--r--fs/squashfs/Kconfig12
-rw-r--r--fs/squashfs/cache.c4
-rw-r--r--fs/squashfs/decompressor.c34
-rw-r--r--fs/squashfs/decompressor.h7
-rw-r--r--fs/squashfs/dir.c9
-rw-r--r--fs/squashfs/lzo_wrapper.c4
-rw-r--r--fs/squashfs/namei.c12
-rw-r--r--fs/squashfs/squashfs.h1
-rw-r--r--fs/squashfs/squashfs_fs.h4
-rw-r--r--fs/squashfs/super.c15
-rw-r--r--fs/squashfs/xz_wrapper.c53
-rw-r--r--fs/squashfs/zlib_wrapper.c10
-rw-r--r--fs/stat.c7
-rw-r--r--fs/statfs.c176
-rw-r--r--fs/super.c166
-rw-r--r--fs/sync.c28
-rw-r--r--fs/sysv/itree.c1
-rw-r--r--fs/sysv/namei.c8
-rw-r--r--fs/ubifs/Kconfig36
-rw-r--r--fs/ubifs/budget.c2
-rw-r--r--fs/ubifs/commit.c60
-rw-r--r--fs/ubifs/debug.c97
-rw-r--r--fs/ubifs/debug.h182
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c14
-rw-r--r--fs/ubifs/io.c201
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c28
-rw-r--r--fs/ubifs/lprops.c26
-rw-r--r--fs/ubifs/lpt.c7
-rw-r--r--fs/ubifs/lpt_commit.c56
-rw-r--r--fs/ubifs/orphan.c10
-rw-r--r--fs/ubifs/recovery.c44
-rw-r--r--fs/ubifs/scan.c2
-rw-r--r--fs/ubifs/super.c58
-rw-r--r--fs/ubifs/tnc.c10
-rw-r--r--fs/ubifs/ubifs.h45
-rw-r--r--fs/ubifs/xattr.c4
-rw-r--r--fs/udf/balloc.c11
-rw-r--r--fs/udf/file.c8
-rw-r--r--fs/udf/inode.c240
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/udf/truncate.c146
-rw-r--r--fs/udf/udfdecl.h12
-rw-r--r--fs/ufs/Kconfig1
-rw-r--r--fs/ufs/inode.c81
-rw-r--r--fs/ufs/namei.c44
-rw-r--r--fs/ufs/super.c70
-rw-r--r--fs/ufs/truncate.c6
-rw-r--r--fs/ufs/ufs.h6
-rw-r--r--fs/ufs/util.c2
-rw-r--r--fs/ufs/util.h2
-rw-r--r--fs/utimes.c2
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/Makefile12
-rw-r--r--fs/xfs/linux-2.6/kmem.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c397
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h40
-rw-r--r--fs/xfs/linux-2.6/xfs_discard.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h23
-rw-r--r--fs/xfs/linux-2.6/xfs_message.c124
-rw-r--r--fs/xfs/linux-2.6/xfs_message.h40
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c293
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c265
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c2
-rw-r--r--fs/xfs/quota/xfs_dquot.c50
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c5
-rw-r--r--fs/xfs/quota/xfs_qm.c56
-rw-r--r--fs/xfs/quota/xfs_qm.h5
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c5
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c91
-rw-r--r--fs/xfs/quota/xfs_trans_dquot.c5
-rw-r--r--fs/xfs/support/debug.c107
-rw-r--r--fs/xfs/support/debug.h61
-rw-r--r--fs/xfs/xfs_alloc.c188
-rw-r--r--fs/xfs/xfs_bmap.c24
-rw-r--r--fs/xfs/xfs_buf_item.c17
-rw-r--r--fs/xfs/xfs_da_btree.c9
-rw-r--r--fs/xfs/xfs_dfrag.c4
-rw-r--r--fs/xfs/xfs_dir2.c2
-rw-r--r--fs/xfs/xfs_dir2_node.c25
-rw-r--r--fs/xfs/xfs_error.c22
-rw-r--r--fs/xfs/xfs_error.h19
-rw-r--r--fs/xfs/xfs_fsops.c9
-rw-r--r--fs/xfs/xfs_ialloc.c82
-rw-r--r--fs/xfs/xfs_inode.c133
-rw-r--r--fs/xfs/xfs_inode.h27
-rw-r--r--fs/xfs/xfs_inode_item.c73
-rw-r--r--fs/xfs/xfs_iomap.c12
-rw-r--r--fs/xfs/xfs_itable.c2
-rw-r--r--fs/xfs/xfs_log.c162
-rw-r--r--fs/xfs/xfs_log_priv.h7
-rw-r--r--fs/xfs/xfs_log_recover.c227
-rw-r--r--fs/xfs/xfs_mount.c148
-rw-r--r--fs/xfs/xfs_mount.h9
-rw-r--r--fs/xfs/xfs_mru_cache.c2
-rw-r--r--fs/xfs/xfs_quota.h3
-rw-r--r--fs/xfs/xfs_rtalloc.c92
-rw-r--r--fs/xfs/xfs_rtalloc.h2
-rw-r--r--fs/xfs/xfs_rw.c58
-rw-r--r--fs/xfs/xfs_trans.h2
-rw-r--r--fs/xfs/xfs_trans_ail.c423
-rw-r--r--fs/xfs/xfs_trans_buf.c9
-rw-r--r--fs/xfs/xfs_trans_inode.c24
-rw-r--r--fs/xfs/xfs_trans_priv.h22
-rw-r--r--fs/xfs/xfs_vnodeops.c81
-rw-r--r--fs/xfs/xfs_vnodeops.h1
610 files changed, 21991 insertions, 13202 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 02a2cf616318..535ab6eccb1a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -21,8 +21,8 @@
21#include <linux/posix_acl_xattr.h> 21#include <linux/posix_acl_xattr.h>
22#include "xattr.h" 22#include "xattr.h"
23#include "acl.h" 23#include "acl.h"
24#include "v9fs_vfs.h"
25#include "v9fs.h" 24#include "v9fs.h"
25#include "v9fs_vfs.h"
26 26
27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name) 27static struct posix_acl *__v9fs_get_acl(struct p9_fid *fid, char *name)
28{ 28{
@@ -59,7 +59,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
59 struct v9fs_session_info *v9ses; 59 struct v9fs_session_info *v9ses;
60 60
61 v9ses = v9fs_inode2v9ses(inode); 61 v9ses = v9fs_inode2v9ses(inode);
62 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 62 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
63 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
63 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL); 64 set_cached_acl(inode, ACL_TYPE_DEFAULT, NULL);
64 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL); 65 set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
65 return 0; 66 return 0;
@@ -71,11 +72,15 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
71 if (!IS_ERR(dacl) && !IS_ERR(pacl)) { 72 if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
72 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); 73 set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
73 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl); 74 set_cached_acl(inode, ACL_TYPE_ACCESS, pacl);
74 posix_acl_release(dacl);
75 posix_acl_release(pacl);
76 } else 75 } else
77 retval = -EIO; 76 retval = -EIO;
78 77
78 if (!IS_ERR(dacl))
79 posix_acl_release(dacl);
80
81 if (!IS_ERR(pacl))
82 posix_acl_release(pacl);
83
79 return retval; 84 return retval;
80} 85}
81 86
@@ -100,9 +105,10 @@ int v9fs_check_acl(struct inode *inode, int mask, unsigned int flags)
100 return -ECHILD; 105 return -ECHILD;
101 106
102 v9ses = v9fs_inode2v9ses(inode); 107 v9ses = v9fs_inode2v9ses(inode);
103 if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) { 108 if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) ||
109 ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) {
104 /* 110 /*
105 * On access = client mode get the acl 111 * On access = client and acl = on mode get the acl
106 * values from the server 112 * values from the server
107 */ 113 */
108 return 0; 114 return 0;
@@ -128,6 +134,10 @@ static int v9fs_set_acl(struct dentry *dentry, int type, struct posix_acl *acl)
128 struct inode *inode = dentry->d_inode; 134 struct inode *inode = dentry->d_inode;
129 135
130 set_cached_acl(inode, type, acl); 136 set_cached_acl(inode, type, acl);
137
138 if (!acl)
139 return 0;
140
131 /* Set a setxattr request to server */ 141 /* Set a setxattr request to server */
132 size = posix_acl_xattr_size(acl->a_count); 142 size = posix_acl_xattr_size(acl->a_count);
133 buffer = kmalloc(size, GFP_KERNEL); 143 buffer = kmalloc(size, GFP_KERNEL);
@@ -177,10 +187,8 @@ int v9fs_acl_chmod(struct dentry *dentry)
177int v9fs_set_create_acl(struct dentry *dentry, 187int v9fs_set_create_acl(struct dentry *dentry,
178 struct posix_acl *dpacl, struct posix_acl *pacl) 188 struct posix_acl *dpacl, struct posix_acl *pacl)
179{ 189{
180 if (dpacl) 190 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl);
181 v9fs_set_acl(dentry, ACL_TYPE_DEFAULT, dpacl); 191 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
182 if (pacl)
183 v9fs_set_acl(dentry, ACL_TYPE_ACCESS, pacl);
184 posix_acl_release(dpacl); 192 posix_acl_release(dpacl);
185 posix_acl_release(pacl); 193 posix_acl_release(pacl);
186 return 0; 194 return 0;
@@ -254,7 +262,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
254 if (strcmp(name, "") != 0) 262 if (strcmp(name, "") != 0)
255 return -EINVAL; 263 return -EINVAL;
256 264
257 v9ses = v9fs_inode2v9ses(dentry->d_inode); 265 v9ses = v9fs_dentry2v9ses(dentry);
258 /* 266 /*
259 * We allow set/get/list of acl when access=client is not specified 267 * We allow set/get/list of acl when access=client is not specified
260 */ 268 */
@@ -304,7 +312,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
304 if (strcmp(name, "") != 0) 312 if (strcmp(name, "") != 0)
305 return -EINVAL; 313 return -EINVAL;
306 314
307 v9ses = v9fs_inode2v9ses(dentry->d_inode); 315 v9ses = v9fs_dentry2v9ses(dentry);
308 /* 316 /*
309 * set the attribute on the remote. Without even looking at the 317 * set the attribute on the remote. Without even looking at the
310 * xattr value. We leave it to the server to validate 318 * xattr value. We leave it to the server to validate
@@ -315,7 +323,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
315 323
316 if (S_ISLNK(inode->i_mode)) 324 if (S_ISLNK(inode->i_mode))
317 return -EOPNOTSUPP; 325 return -EOPNOTSUPP;
318 if (!is_owner_or_cap(inode)) 326 if (!inode_owner_or_capable(inode))
319 return -EPERM; 327 return -EPERM;
320 if (value) { 328 if (value) {
321 /* update the cached acl value */ 329 /* update the cached acl value */
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index 0dbe0d139ac2..5b335c5086a1 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -33,67 +33,11 @@
33 33
34#define CACHETAG_LEN 11 34#define CACHETAG_LEN 11
35 35
36struct kmem_cache *vcookie_cache;
37
38struct fscache_netfs v9fs_cache_netfs = { 36struct fscache_netfs v9fs_cache_netfs = {
39 .name = "9p", 37 .name = "9p",
40 .version = 0, 38 .version = 0,
41}; 39};
42 40
43static void init_once(void *foo)
44{
45 struct v9fs_cookie *vcookie = (struct v9fs_cookie *) foo;
46 vcookie->fscache = NULL;
47 vcookie->qid = NULL;
48 inode_init_once(&vcookie->inode);
49}
50
51/**
52 * v9fs_init_vcookiecache - initialize a cache for vcookies to maintain
53 * vcookie to inode mapping
54 *
55 * Returns 0 on success.
56 */
57
58static int v9fs_init_vcookiecache(void)
59{
60 vcookie_cache = kmem_cache_create("vcookie_cache",
61 sizeof(struct v9fs_cookie),
62 0, (SLAB_RECLAIM_ACCOUNT|
63 SLAB_MEM_SPREAD),
64 init_once);
65 if (!vcookie_cache)
66 return -ENOMEM;
67
68 return 0;
69}
70
71/**
72 * v9fs_destroy_vcookiecache - destroy the cache of vcookies
73 *
74 */
75
76static void v9fs_destroy_vcookiecache(void)
77{
78 kmem_cache_destroy(vcookie_cache);
79}
80
81int __v9fs_cache_register(void)
82{
83 int ret;
84 ret = v9fs_init_vcookiecache();
85 if (ret < 0)
86 return ret;
87
88 return fscache_register_netfs(&v9fs_cache_netfs);
89}
90
91void __v9fs_cache_unregister(void)
92{
93 v9fs_destroy_vcookiecache();
94 fscache_unregister_netfs(&v9fs_cache_netfs);
95}
96
97/** 41/**
98 * v9fs_random_cachetag - Generate a random tag to be associated 42 * v9fs_random_cachetag - Generate a random tag to be associated
99 * with a new cache session. 43 * with a new cache session.
@@ -133,9 +77,9 @@ static uint16_t v9fs_cache_session_get_key(const void *cookie_netfs_data,
133} 77}
134 78
135const struct fscache_cookie_def v9fs_cache_session_index_def = { 79const struct fscache_cookie_def v9fs_cache_session_index_def = {
136 .name = "9P.session", 80 .name = "9P.session",
137 .type = FSCACHE_COOKIE_TYPE_INDEX, 81 .type = FSCACHE_COOKIE_TYPE_INDEX,
138 .get_key = v9fs_cache_session_get_key, 82 .get_key = v9fs_cache_session_get_key,
139}; 83};
140 84
141void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) 85void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses)
@@ -163,33 +107,33 @@ void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses)
163static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data, 107static uint16_t v9fs_cache_inode_get_key(const void *cookie_netfs_data,
164 void *buffer, uint16_t bufmax) 108 void *buffer, uint16_t bufmax)
165{ 109{
166 const struct v9fs_cookie *vcookie = cookie_netfs_data; 110 const struct v9fs_inode *v9inode = cookie_netfs_data;
167 memcpy(buffer, &vcookie->qid->path, sizeof(vcookie->qid->path)); 111 memcpy(buffer, &v9inode->fscache_key->path,
168 112 sizeof(v9inode->fscache_key->path));
169 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &vcookie->inode, 113 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get key %llu", &v9inode->vfs_inode,
170 vcookie->qid->path); 114 v9inode->fscache_key->path);
171 return sizeof(vcookie->qid->path); 115 return sizeof(v9inode->fscache_key->path);
172} 116}
173 117
174static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data, 118static void v9fs_cache_inode_get_attr(const void *cookie_netfs_data,
175 uint64_t *size) 119 uint64_t *size)
176{ 120{
177 const struct v9fs_cookie *vcookie = cookie_netfs_data; 121 const struct v9fs_inode *v9inode = cookie_netfs_data;
178 *size = i_size_read(&vcookie->inode); 122 *size = i_size_read(&v9inode->vfs_inode);
179 123
180 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &vcookie->inode, 124 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get attr %llu", &v9inode->vfs_inode,
181 *size); 125 *size);
182} 126}
183 127
184static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data, 128static uint16_t v9fs_cache_inode_get_aux(const void *cookie_netfs_data,
185 void *buffer, uint16_t buflen) 129 void *buffer, uint16_t buflen)
186{ 130{
187 const struct v9fs_cookie *vcookie = cookie_netfs_data; 131 const struct v9fs_inode *v9inode = cookie_netfs_data;
188 memcpy(buffer, &vcookie->qid->version, sizeof(vcookie->qid->version)); 132 memcpy(buffer, &v9inode->fscache_key->version,
189 133 sizeof(v9inode->fscache_key->version));
190 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &vcookie->inode, 134 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get aux %u", &v9inode->vfs_inode,
191 vcookie->qid->version); 135 v9inode->fscache_key->version);
192 return sizeof(vcookie->qid->version); 136 return sizeof(v9inode->fscache_key->version);
193} 137}
194 138
195static enum 139static enum
@@ -197,13 +141,13 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
197 const void *buffer, 141 const void *buffer,
198 uint16_t buflen) 142 uint16_t buflen)
199{ 143{
200 const struct v9fs_cookie *vcookie = cookie_netfs_data; 144 const struct v9fs_inode *v9inode = cookie_netfs_data;
201 145
202 if (buflen != sizeof(vcookie->qid->version)) 146 if (buflen != sizeof(v9inode->fscache_key->version))
203 return FSCACHE_CHECKAUX_OBSOLETE; 147 return FSCACHE_CHECKAUX_OBSOLETE;
204 148
205 if (memcmp(buffer, &vcookie->qid->version, 149 if (memcmp(buffer, &v9inode->fscache_key->version,
206 sizeof(vcookie->qid->version))) 150 sizeof(v9inode->fscache_key->version)))
207 return FSCACHE_CHECKAUX_OBSOLETE; 151 return FSCACHE_CHECKAUX_OBSOLETE;
208 152
209 return FSCACHE_CHECKAUX_OKAY; 153 return FSCACHE_CHECKAUX_OKAY;
@@ -211,7 +155,7 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data,
211 155
212static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) 156static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
213{ 157{
214 struct v9fs_cookie *vcookie = cookie_netfs_data; 158 struct v9fs_inode *v9inode = cookie_netfs_data;
215 struct pagevec pvec; 159 struct pagevec pvec;
216 pgoff_t first; 160 pgoff_t first;
217 int loop, nr_pages; 161 int loop, nr_pages;
@@ -220,7 +164,7 @@ static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data)
220 first = 0; 164 first = 0;
221 165
222 for (;;) { 166 for (;;) {
223 nr_pages = pagevec_lookup(&pvec, vcookie->inode.i_mapping, 167 nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping,
224 first, 168 first,
225 PAGEVEC_SIZE - pagevec_count(&pvec)); 169 PAGEVEC_SIZE - pagevec_count(&pvec));
226 if (!nr_pages) 170 if (!nr_pages)
@@ -249,115 +193,114 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = {
249 193
250void v9fs_cache_inode_get_cookie(struct inode *inode) 194void v9fs_cache_inode_get_cookie(struct inode *inode)
251{ 195{
252 struct v9fs_cookie *vcookie; 196 struct v9fs_inode *v9inode;
253 struct v9fs_session_info *v9ses; 197 struct v9fs_session_info *v9ses;
254 198
255 if (!S_ISREG(inode->i_mode)) 199 if (!S_ISREG(inode->i_mode))
256 return; 200 return;
257 201
258 vcookie = v9fs_inode2cookie(inode); 202 v9inode = V9FS_I(inode);
259 if (vcookie->fscache) 203 if (v9inode->fscache)
260 return; 204 return;
261 205
262 v9ses = v9fs_inode2v9ses(inode); 206 v9ses = v9fs_inode2v9ses(inode);
263 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 207 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
264 &v9fs_cache_inode_index_def, 208 &v9fs_cache_inode_index_def,
265 vcookie); 209 v9inode);
266 210
267 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode, 211 P9_DPRINTK(P9_DEBUG_FSC, "inode %p get cookie %p", inode,
268 vcookie->fscache); 212 v9inode->fscache);
269} 213}
270 214
271void v9fs_cache_inode_put_cookie(struct inode *inode) 215void v9fs_cache_inode_put_cookie(struct inode *inode)
272{ 216{
273 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 217 struct v9fs_inode *v9inode = V9FS_I(inode);
274 218
275 if (!vcookie->fscache) 219 if (!v9inode->fscache)
276 return; 220 return;
277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode, 221 P9_DPRINTK(P9_DEBUG_FSC, "inode %p put cookie %p", inode,
278 vcookie->fscache); 222 v9inode->fscache);
279 223
280 fscache_relinquish_cookie(vcookie->fscache, 0); 224 fscache_relinquish_cookie(v9inode->fscache, 0);
281 vcookie->fscache = NULL; 225 v9inode->fscache = NULL;
282} 226}
283 227
284void v9fs_cache_inode_flush_cookie(struct inode *inode) 228void v9fs_cache_inode_flush_cookie(struct inode *inode)
285{ 229{
286 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 230 struct v9fs_inode *v9inode = V9FS_I(inode);
287 231
288 if (!vcookie->fscache) 232 if (!v9inode->fscache)
289 return; 233 return;
290 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode, 234 P9_DPRINTK(P9_DEBUG_FSC, "inode %p flush cookie %p", inode,
291 vcookie->fscache); 235 v9inode->fscache);
292 236
293 fscache_relinquish_cookie(vcookie->fscache, 1); 237 fscache_relinquish_cookie(v9inode->fscache, 1);
294 vcookie->fscache = NULL; 238 v9inode->fscache = NULL;
295} 239}
296 240
297void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) 241void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
298{ 242{
299 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 243 struct v9fs_inode *v9inode = V9FS_I(inode);
300 struct p9_fid *fid; 244 struct p9_fid *fid;
301 245
302 if (!vcookie->fscache) 246 if (!v9inode->fscache)
303 return; 247 return;
304 248
305 spin_lock(&vcookie->lock); 249 spin_lock(&v9inode->fscache_lock);
306 fid = filp->private_data; 250 fid = filp->private_data;
307 if ((filp->f_flags & O_ACCMODE) != O_RDONLY) 251 if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
308 v9fs_cache_inode_flush_cookie(inode); 252 v9fs_cache_inode_flush_cookie(inode);
309 else 253 else
310 v9fs_cache_inode_get_cookie(inode); 254 v9fs_cache_inode_get_cookie(inode);
311 255
312 spin_unlock(&vcookie->lock); 256 spin_unlock(&v9inode->fscache_lock);
313} 257}
314 258
315void v9fs_cache_inode_reset_cookie(struct inode *inode) 259void v9fs_cache_inode_reset_cookie(struct inode *inode)
316{ 260{
317 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 261 struct v9fs_inode *v9inode = V9FS_I(inode);
318 struct v9fs_session_info *v9ses; 262 struct v9fs_session_info *v9ses;
319 struct fscache_cookie *old; 263 struct fscache_cookie *old;
320 264
321 if (!vcookie->fscache) 265 if (!v9inode->fscache)
322 return; 266 return;
323 267
324 old = vcookie->fscache; 268 old = v9inode->fscache;
325 269
326 spin_lock(&vcookie->lock); 270 spin_lock(&v9inode->fscache_lock);
327 fscache_relinquish_cookie(vcookie->fscache, 1); 271 fscache_relinquish_cookie(v9inode->fscache, 1);
328 272
329 v9ses = v9fs_inode2v9ses(inode); 273 v9ses = v9fs_inode2v9ses(inode);
330 vcookie->fscache = fscache_acquire_cookie(v9ses->fscache, 274 v9inode->fscache = fscache_acquire_cookie(v9ses->fscache,
331 &v9fs_cache_inode_index_def, 275 &v9fs_cache_inode_index_def,
332 vcookie); 276 v9inode);
333
334 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p", 277 P9_DPRINTK(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p",
335 inode, old, vcookie->fscache); 278 inode, old, v9inode->fscache);
336 279
337 spin_unlock(&vcookie->lock); 280 spin_unlock(&v9inode->fscache_lock);
338} 281}
339 282
340int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) 283int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
341{ 284{
342 struct inode *inode = page->mapping->host; 285 struct inode *inode = page->mapping->host;
343 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 286 struct v9fs_inode *v9inode = V9FS_I(inode);
344 287
345 BUG_ON(!vcookie->fscache); 288 BUG_ON(!v9inode->fscache);
346 289
347 return fscache_maybe_release_page(vcookie->fscache, page, gfp); 290 return fscache_maybe_release_page(v9inode->fscache, page, gfp);
348} 291}
349 292
350void __v9fs_fscache_invalidate_page(struct page *page) 293void __v9fs_fscache_invalidate_page(struct page *page)
351{ 294{
352 struct inode *inode = page->mapping->host; 295 struct inode *inode = page->mapping->host;
353 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 296 struct v9fs_inode *v9inode = V9FS_I(inode);
354 297
355 BUG_ON(!vcookie->fscache); 298 BUG_ON(!v9inode->fscache);
356 299
357 if (PageFsCache(page)) { 300 if (PageFsCache(page)) {
358 fscache_wait_on_page_write(vcookie->fscache, page); 301 fscache_wait_on_page_write(v9inode->fscache, page);
359 BUG_ON(!PageLocked(page)); 302 BUG_ON(!PageLocked(page));
360 fscache_uncache_page(vcookie->fscache, page); 303 fscache_uncache_page(v9inode->fscache, page);
361 } 304 }
362} 305}
363 306
@@ -380,13 +323,13 @@ static void v9fs_vfs_readpage_complete(struct page *page, void *data,
380int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) 323int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page)
381{ 324{
382 int ret; 325 int ret;
383 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 326 const struct v9fs_inode *v9inode = V9FS_I(inode);
384 327
385 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 328 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
386 if (!vcookie->fscache) 329 if (!v9inode->fscache)
387 return -ENOBUFS; 330 return -ENOBUFS;
388 331
389 ret = fscache_read_or_alloc_page(vcookie->fscache, 332 ret = fscache_read_or_alloc_page(v9inode->fscache,
390 page, 333 page,
391 v9fs_vfs_readpage_complete, 334 v9fs_vfs_readpage_complete,
392 NULL, 335 NULL,
@@ -418,13 +361,13 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
418 unsigned *nr_pages) 361 unsigned *nr_pages)
419{ 362{
420 int ret; 363 int ret;
421 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 364 const struct v9fs_inode *v9inode = V9FS_I(inode);
422 365
423 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages); 366 P9_DPRINTK(P9_DEBUG_FSC, "inode %p pages %u", inode, *nr_pages);
424 if (!vcookie->fscache) 367 if (!v9inode->fscache)
425 return -ENOBUFS; 368 return -ENOBUFS;
426 369
427 ret = fscache_read_or_alloc_pages(vcookie->fscache, 370 ret = fscache_read_or_alloc_pages(v9inode->fscache,
428 mapping, pages, nr_pages, 371 mapping, pages, nr_pages,
429 v9fs_vfs_readpage_complete, 372 v9fs_vfs_readpage_complete,
430 NULL, 373 NULL,
@@ -453,11 +396,22 @@ int __v9fs_readpages_from_fscache(struct inode *inode,
453void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) 396void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page)
454{ 397{
455 int ret; 398 int ret;
456 const struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 399 const struct v9fs_inode *v9inode = V9FS_I(inode);
457 400
458 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page); 401 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
459 ret = fscache_write_page(vcookie->fscache, page, GFP_KERNEL); 402 ret = fscache_write_page(v9inode->fscache, page, GFP_KERNEL);
460 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret); 403 P9_DPRINTK(P9_DEBUG_FSC, "ret = %d", ret);
461 if (ret != 0) 404 if (ret != 0)
462 v9fs_uncache_page(inode, page); 405 v9fs_uncache_page(inode, page);
463} 406}
407
408/*
409 * wait for a page to complete writing to the cache
410 */
411void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page)
412{
413 const struct v9fs_inode *v9inode = V9FS_I(inode);
414 P9_DPRINTK(P9_DEBUG_FSC, "inode %p page %p", inode, page);
415 if (PageFsCache(page))
416 fscache_wait_on_page_write(v9inode->fscache, page);
417}
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index a94192bfaee8..049507a5b01c 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -25,20 +25,6 @@
25#include <linux/fscache.h> 25#include <linux/fscache.h>
26#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27 27
28extern struct kmem_cache *vcookie_cache;
29
30struct v9fs_cookie {
31 spinlock_t lock;
32 struct inode inode;
33 struct fscache_cookie *fscache;
34 struct p9_qid *qid;
35};
36
37static inline struct v9fs_cookie *v9fs_inode2cookie(const struct inode *inode)
38{
39 return container_of(inode, struct v9fs_cookie, inode);
40}
41
42extern struct fscache_netfs v9fs_cache_netfs; 28extern struct fscache_netfs v9fs_cache_netfs;
43extern const struct fscache_cookie_def v9fs_cache_session_index_def; 29extern const struct fscache_cookie_def v9fs_cache_session_index_def;
44extern const struct fscache_cookie_def v9fs_cache_inode_index_def; 30extern const struct fscache_cookie_def v9fs_cache_inode_index_def;
@@ -64,23 +50,8 @@ extern int __v9fs_readpages_from_fscache(struct inode *inode,
64 struct list_head *pages, 50 struct list_head *pages,
65 unsigned *nr_pages); 51 unsigned *nr_pages);
66extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); 52extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page);
67 53extern void __v9fs_fscache_wait_on_page_write(struct inode *inode,
68 54 struct page *page);
69/**
70 * v9fs_cache_register - Register v9fs file system with the cache
71 */
72static inline int v9fs_cache_register(void)
73{
74 return __v9fs_cache_register();
75}
76
77/**
78 * v9fs_cache_unregister - Unregister v9fs from the cache
79 */
80static inline void v9fs_cache_unregister(void)
81{
82 __v9fs_cache_unregister();
83}
84 55
85static inline int v9fs_fscache_release_page(struct page *page, 56static inline int v9fs_fscache_release_page(struct page *page,
86 gfp_t gfp) 57 gfp_t gfp)
@@ -117,28 +88,27 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
117 88
118static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 89static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
119{ 90{
120 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 91 struct v9fs_inode *v9inode = V9FS_I(inode);
121 fscache_uncache_page(vcookie->fscache, page); 92 fscache_uncache_page(v9inode->fscache, page);
122 BUG_ON(PageFsCache(page)); 93 BUG_ON(PageFsCache(page));
123} 94}
124 95
125static inline void v9fs_vcookie_set_qid(struct inode *inode, 96static inline void v9fs_fscache_set_key(struct inode *inode,
126 struct p9_qid *qid) 97 struct p9_qid *qid)
127{ 98{
128 struct v9fs_cookie *vcookie = v9fs_inode2cookie(inode); 99 struct v9fs_inode *v9inode = V9FS_I(inode);
129 spin_lock(&vcookie->lock); 100 spin_lock(&v9inode->fscache_lock);
130 vcookie->qid = qid; 101 v9inode->fscache_key = qid;
131 spin_unlock(&vcookie->lock); 102 spin_unlock(&v9inode->fscache_lock);
132} 103}
133 104
134#else /* CONFIG_9P_FSCACHE */ 105static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
135 106 struct page *page)
136static inline int v9fs_cache_register(void)
137{ 107{
138 return 1; 108 return __v9fs_fscache_wait_on_page_write(inode, page);
139} 109}
140 110
141static inline void v9fs_cache_unregister(void) {} 111#else /* CONFIG_9P_FSCACHE */
142 112
143static inline int v9fs_fscache_release_page(struct page *page, 113static inline int v9fs_fscache_release_page(struct page *page,
144 gfp_t gfp) { 114 gfp_t gfp) {
@@ -168,9 +138,11 @@ static inline void v9fs_readpage_to_fscache(struct inode *inode,
168static inline void v9fs_uncache_page(struct inode *inode, struct page *page) 138static inline void v9fs_uncache_page(struct inode *inode, struct page *page)
169{} 139{}
170 140
171static inline void v9fs_vcookie_set_qid(struct inode *inode, 141static inline void v9fs_fscache_wait_on_page_write(struct inode *inode,
172 struct p9_qid *qid) 142 struct page *page)
173{} 143{
144 return;
145}
174 146
175#endif /* CONFIG_9P_FSCACHE */ 147#endif /* CONFIG_9P_FSCACHE */
176#endif /* _9P_CACHE_H */ 148#endif /* _9P_CACHE_H */
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index b00223c99d70..85b67ffa2a43 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -125,46 +125,17 @@ err_out:
125 return -ENOMEM; 125 return -ENOMEM;
126} 126}
127 127
128/** 128static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry,
129 * v9fs_fid_lookup - lookup for a fid, try to walk if not found 129 uid_t uid, int any)
130 * @dentry: dentry to look for fid in
131 *
132 * Look for a fid in the specified dentry for the current user.
133 * If no fid is found, try to create one walking from a fid from the parent
134 * dentry (if it has one), or the root dentry. If the user haven't accessed
135 * the fs yet, attach now and walk from the root.
136 */
137
138struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
139{ 130{
140 int i, n, l, clone, any, access;
141 u32 uid;
142 struct p9_fid *fid, *old_fid = NULL;
143 struct dentry *ds; 131 struct dentry *ds;
144 struct v9fs_session_info *v9ses;
145 char **wnames, *uname; 132 char **wnames, *uname;
133 int i, n, l, clone, access;
134 struct v9fs_session_info *v9ses;
135 struct p9_fid *fid, *old_fid = NULL;
146 136
147 v9ses = v9fs_inode2v9ses(dentry->d_inode); 137 v9ses = v9fs_dentry2v9ses(dentry);
148 access = v9ses->flags & V9FS_ACCESS_MASK; 138 access = v9ses->flags & V9FS_ACCESS_MASK;
149 switch (access) {
150 case V9FS_ACCESS_SINGLE:
151 case V9FS_ACCESS_USER:
152 case V9FS_ACCESS_CLIENT:
153 uid = current_fsuid();
154 any = 0;
155 break;
156
157 case V9FS_ACCESS_ANY:
158 uid = v9ses->uid;
159 any = 1;
160 break;
161
162 default:
163 uid = ~0;
164 any = 0;
165 break;
166 }
167
168 fid = v9fs_fid_find(dentry, uid, any); 139 fid = v9fs_fid_find(dentry, uid, any);
169 if (fid) 140 if (fid)
170 return fid; 141 return fid;
@@ -250,6 +221,45 @@ err_out:
250 return fid; 221 return fid;
251} 222}
252 223
224/**
225 * v9fs_fid_lookup - lookup for a fid, try to walk if not found
226 * @dentry: dentry to look for fid in
227 *
228 * Look for a fid in the specified dentry for the current user.
229 * If no fid is found, try to create one walking from a fid from the parent
230 * dentry (if it has one), or the root dentry. If the user haven't accessed
231 * the fs yet, attach now and walk from the root.
232 */
233
234struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
235{
236 uid_t uid;
237 int any, access;
238 struct v9fs_session_info *v9ses;
239
240 v9ses = v9fs_dentry2v9ses(dentry);
241 access = v9ses->flags & V9FS_ACCESS_MASK;
242 switch (access) {
243 case V9FS_ACCESS_SINGLE:
244 case V9FS_ACCESS_USER:
245 case V9FS_ACCESS_CLIENT:
246 uid = current_fsuid();
247 any = 0;
248 break;
249
250 case V9FS_ACCESS_ANY:
251 uid = v9ses->uid;
252 any = 1;
253 break;
254
255 default:
256 uid = ~0;
257 any = 0;
258 break;
259 }
260 return v9fs_fid_lookup_with_uid(dentry, uid, any);
261}
262
253struct p9_fid *v9fs_fid_clone(struct dentry *dentry) 263struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
254{ 264{
255 struct p9_fid *fid, *ret; 265 struct p9_fid *fid, *ret;
@@ -261,3 +271,39 @@ struct p9_fid *v9fs_fid_clone(struct dentry *dentry)
261 ret = p9_client_walk(fid, 0, NULL, 1); 271 ret = p9_client_walk(fid, 0, NULL, 1);
262 return ret; 272 return ret;
263} 273}
274
275static struct p9_fid *v9fs_fid_clone_with_uid(struct dentry *dentry, uid_t uid)
276{
277 struct p9_fid *fid, *ret;
278
279 fid = v9fs_fid_lookup_with_uid(dentry, uid, 0);
280 if (IS_ERR(fid))
281 return fid;
282
283 ret = p9_client_walk(fid, 0, NULL, 1);
284 return ret;
285}
286
287struct p9_fid *v9fs_writeback_fid(struct dentry *dentry)
288{
289 int err;
290 struct p9_fid *fid;
291
292 fid = v9fs_fid_clone_with_uid(dentry, 0);
293 if (IS_ERR(fid))
294 goto error_out;
295 /*
296 * writeback fid will only be used to write back the
297 * dirty pages. We always request for the open fid in read-write
298 * mode so that a partial page write which result in page
299 * read can work.
300 */
301 err = p9_client_open(fid, O_RDWR);
302 if (err < 0) {
303 p9_client_clunk(fid);
304 fid = ERR_PTR(err);
305 goto error_out;
306 }
307error_out:
308 return fid;
309}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index c3bbd6af996d..bb0b6e7f58fc 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -19,7 +19,8 @@
19 * Boston, MA 02111-1301 USA 19 * Boston, MA 02111-1301 USA
20 * 20 *
21 */ 21 */
22 22#ifndef FS_9P_FID_H
23#define FS_9P_FID_H
23#include <linux/list.h> 24#include <linux/list.h>
24 25
25/** 26/**
@@ -45,3 +46,5 @@ struct v9fs_dentry {
45struct p9_fid *v9fs_fid_lookup(struct dentry *dentry); 46struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
46struct p9_fid *v9fs_fid_clone(struct dentry *dentry); 47struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
47int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid); 48int v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
49struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
50#endif
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 2f77cd33ba83..c82b017f51f3 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -39,6 +39,7 @@
39 39
40static DEFINE_SPINLOCK(v9fs_sessionlist_lock); 40static DEFINE_SPINLOCK(v9fs_sessionlist_lock);
41static LIST_HEAD(v9fs_sessionlist); 41static LIST_HEAD(v9fs_sessionlist);
42struct kmem_cache *v9fs_inode_cache;
42 43
43/* 44/*
44 * Option Parsing (code inspired by NFS code) 45 * Option Parsing (code inspired by NFS code)
@@ -55,7 +56,7 @@ enum {
55 /* Cache options */ 56 /* Cache options */
56 Opt_cache_loose, Opt_fscache, 57 Opt_cache_loose, Opt_fscache,
57 /* Access options */ 58 /* Access options */
58 Opt_access, 59 Opt_access, Opt_posixacl,
59 /* Error token */ 60 /* Error token */
60 Opt_err 61 Opt_err
61}; 62};
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
73 {Opt_fscache, "fscache"}, 74 {Opt_fscache, "fscache"},
74 {Opt_cachetag, "cachetag=%s"}, 75 {Opt_cachetag, "cachetag=%s"},
75 {Opt_access, "access=%s"}, 76 {Opt_access, "access=%s"},
77 {Opt_posixacl, "posixacl"},
76 {Opt_err, NULL} 78 {Opt_err, NULL}
77}; 79};
78 80
@@ -194,15 +196,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
194 else if (strcmp(s, "any") == 0) 196 else if (strcmp(s, "any") == 0)
195 v9ses->flags |= V9FS_ACCESS_ANY; 197 v9ses->flags |= V9FS_ACCESS_ANY;
196 else if (strcmp(s, "client") == 0) { 198 else if (strcmp(s, "client") == 0) {
197#ifdef CONFIG_9P_FS_POSIX_ACL
198 v9ses->flags |= V9FS_ACCESS_CLIENT; 199 v9ses->flags |= V9FS_ACCESS_CLIENT;
199#else
200 P9_DPRINTK(P9_DEBUG_ERROR,
201 "access=client option not supported\n");
202 kfree(s);
203 ret = -EINVAL;
204 goto free_and_return;
205#endif
206 } else { 200 } else {
207 v9ses->flags |= V9FS_ACCESS_SINGLE; 201 v9ses->flags |= V9FS_ACCESS_SINGLE;
208 v9ses->uid = simple_strtoul(s, &e, 10); 202 v9ses->uid = simple_strtoul(s, &e, 10);
@@ -212,6 +206,16 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
212 kfree(s); 206 kfree(s);
213 break; 207 break;
214 208
209 case Opt_posixacl:
210#ifdef CONFIG_9P_FS_POSIX_ACL
211 v9ses->flags |= V9FS_POSIX_ACL;
212#else
213 P9_DPRINTK(P9_DEBUG_ERROR,
214 "Not defined CONFIG_9P_FS_POSIX_ACL. "
215 "Ignoring posixacl option\n");
216#endif
217 break;
218
215 default: 219 default:
216 continue; 220 continue;
217 } 221 }
@@ -260,19 +264,12 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
260 list_add(&v9ses->slist, &v9fs_sessionlist); 264 list_add(&v9ses->slist, &v9fs_sessionlist);
261 spin_unlock(&v9fs_sessionlist_lock); 265 spin_unlock(&v9fs_sessionlist_lock);
262 266
263 v9ses->flags = V9FS_ACCESS_USER;
264 strcpy(v9ses->uname, V9FS_DEFUSER); 267 strcpy(v9ses->uname, V9FS_DEFUSER);
265 strcpy(v9ses->aname, V9FS_DEFANAME); 268 strcpy(v9ses->aname, V9FS_DEFANAME);
266 v9ses->uid = ~0; 269 v9ses->uid = ~0;
267 v9ses->dfltuid = V9FS_DEFUID; 270 v9ses->dfltuid = V9FS_DEFUID;
268 v9ses->dfltgid = V9FS_DEFGID; 271 v9ses->dfltgid = V9FS_DEFGID;
269 272
270 rc = v9fs_parse_options(v9ses, data);
271 if (rc < 0) {
272 retval = rc;
273 goto error;
274 }
275
276 v9ses->clnt = p9_client_create(dev_name, data); 273 v9ses->clnt = p9_client_create(dev_name, data);
277 if (IS_ERR(v9ses->clnt)) { 274 if (IS_ERR(v9ses->clnt)) {
278 retval = PTR_ERR(v9ses->clnt); 275 retval = PTR_ERR(v9ses->clnt);
@@ -281,10 +278,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
281 goto error; 278 goto error;
282 } 279 }
283 280
284 if (p9_is_proto_dotl(v9ses->clnt)) 281 v9ses->flags = V9FS_ACCESS_USER;
282
283 if (p9_is_proto_dotl(v9ses->clnt)) {
284 v9ses->flags = V9FS_ACCESS_CLIENT;
285 v9ses->flags |= V9FS_PROTO_2000L; 285 v9ses->flags |= V9FS_PROTO_2000L;
286 else if (p9_is_proto_dotu(v9ses->clnt)) 286 } else if (p9_is_proto_dotu(v9ses->clnt)) {
287 v9ses->flags |= V9FS_PROTO_2000U; 287 v9ses->flags |= V9FS_PROTO_2000U;
288 }
289
290 rc = v9fs_parse_options(v9ses, data);
291 if (rc < 0) {
292 retval = rc;
293 goto error;
294 }
288 295
289 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; 296 v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
290 297
@@ -306,6 +313,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
306 v9ses->flags |= V9FS_ACCESS_ANY; 313 v9ses->flags |= V9FS_ACCESS_ANY;
307 v9ses->uid = ~0; 314 v9ses->uid = ~0;
308 } 315 }
316 if (!v9fs_proto_dotl(v9ses) ||
317 !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) {
318 /*
319 * We support ACL checks on clinet only if the protocol is
320 * 9P2000.L and access is V9FS_ACCESS_CLIENT.
321 */
322 v9ses->flags &= ~V9FS_ACL_MASK;
323 }
309 324
310 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0, 325 fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, ~0,
311 v9ses->aname); 326 v9ses->aname);
@@ -467,6 +482,63 @@ static void v9fs_sysfs_cleanup(void)
467 kobject_put(v9fs_kobj); 482 kobject_put(v9fs_kobj);
468} 483}
469 484
485static void v9fs_inode_init_once(void *foo)
486{
487 struct v9fs_inode *v9inode = (struct v9fs_inode *)foo;
488#ifdef CONFIG_9P_FSCACHE
489 v9inode->fscache = NULL;
490 v9inode->fscache_key = NULL;
491#endif
492 inode_init_once(&v9inode->vfs_inode);
493}
494
495/**
496 * v9fs_init_inode_cache - initialize a cache for 9P
497 * Returns 0 on success.
498 */
499static int v9fs_init_inode_cache(void)
500{
501 v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
502 sizeof(struct v9fs_inode),
503 0, (SLAB_RECLAIM_ACCOUNT|
504 SLAB_MEM_SPREAD),
505 v9fs_inode_init_once);
506 if (!v9fs_inode_cache)
507 return -ENOMEM;
508
509 return 0;
510}
511
512/**
513 * v9fs_destroy_inode_cache - destroy the cache of 9P inode
514 *
515 */
516static void v9fs_destroy_inode_cache(void)
517{
518 kmem_cache_destroy(v9fs_inode_cache);
519}
520
521static int v9fs_cache_register(void)
522{
523 int ret;
524 ret = v9fs_init_inode_cache();
525 if (ret < 0)
526 return ret;
527#ifdef CONFIG_9P_FSCACHE
528 return fscache_register_netfs(&v9fs_cache_netfs);
529#else
530 return ret;
531#endif
532}
533
534static void v9fs_cache_unregister(void)
535{
536 v9fs_destroy_inode_cache();
537#ifdef CONFIG_9P_FSCACHE
538 fscache_unregister_netfs(&v9fs_cache_netfs);
539#endif
540}
541
470/** 542/**
471 * init_v9fs - Initialize module 543 * init_v9fs - Initialize module
472 * 544 *
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index c4b5d8864f0d..e5ebedfc5ed8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -20,6 +20,9 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_H
24#define FS_9P_V9FS_H
25
23#include <linux/backing-dev.h> 26#include <linux/backing-dev.h>
24 27
25/** 28/**
@@ -28,8 +31,10 @@
28 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions 31 * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions
29 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy 32 * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy
30 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) 33 * @V9FS_ACCESS_USER: a new attach will be issued for every user (default)
34 * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client.
31 * @V9FS_ACCESS_ANY: use a single attach for all users 35 * @V9FS_ACCESS_ANY: use a single attach for all users
32 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options 36 * @V9FS_ACCESS_MASK: bit mask of different ACCESS options
37 * @V9FS_POSIX_ACL: POSIX ACLs are enforced
33 * 38 *
34 * Session flags reflect options selected by users at mount time 39 * Session flags reflect options selected by users at mount time
35 */ 40 */
@@ -37,13 +42,15 @@
37 V9FS_ACCESS_USER | \ 42 V9FS_ACCESS_USER | \
38 V9FS_ACCESS_CLIENT) 43 V9FS_ACCESS_CLIENT)
39#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY 44#define V9FS_ACCESS_MASK V9FS_ACCESS_ANY
45#define V9FS_ACL_MASK V9FS_POSIX_ACL
40 46
41enum p9_session_flags { 47enum p9_session_flags {
42 V9FS_PROTO_2000U = 0x01, 48 V9FS_PROTO_2000U = 0x01,
43 V9FS_PROTO_2000L = 0x02, 49 V9FS_PROTO_2000L = 0x02,
44 V9FS_ACCESS_SINGLE = 0x04, 50 V9FS_ACCESS_SINGLE = 0x04,
45 V9FS_ACCESS_USER = 0x08, 51 V9FS_ACCESS_USER = 0x08,
46 V9FS_ACCESS_CLIENT = 0x10 52 V9FS_ACCESS_CLIENT = 0x10,
53 V9FS_POSIX_ACL = 0x20
47}; 54};
48 55
49/* possible values of ->cache */ 56/* possible values of ->cache */
@@ -111,6 +118,26 @@ struct v9fs_session_info {
111 struct rw_semaphore rename_sem; 118 struct rw_semaphore rename_sem;
112}; 119};
113 120
121/* cache_validity flags */
122#define V9FS_INO_INVALID_ATTR 0x01
123
124struct v9fs_inode {
125#ifdef CONFIG_9P_FSCACHE
126 spinlock_t fscache_lock;
127 struct fscache_cookie *fscache;
128 struct p9_qid *fscache_key;
129#endif
130 unsigned int cache_validity;
131 struct p9_fid *writeback_fid;
132 struct mutex v_mutex;
133 struct inode vfs_inode;
134};
135
136static inline struct v9fs_inode *V9FS_I(const struct inode *inode)
137{
138 return container_of(inode, struct v9fs_inode, vfs_inode);
139}
140
114struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, 141struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *,
115 char *); 142 char *);
116extern void v9fs_session_close(struct v9fs_session_info *v9ses); 143extern void v9fs_session_close(struct v9fs_session_info *v9ses);
@@ -124,16 +151,15 @@ extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
124 struct inode *new_dir, struct dentry *new_dentry); 151 struct inode *new_dir, struct dentry *new_dentry);
125extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, 152extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd,
126 void *p); 153 void *p);
127extern struct inode *v9fs_inode(struct v9fs_session_info *v9ses, 154extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
128 struct p9_fid *fid, 155 struct p9_fid *fid,
129 struct super_block *sb); 156 struct super_block *sb);
130
131extern const struct inode_operations v9fs_dir_inode_operations_dotl; 157extern const struct inode_operations v9fs_dir_inode_operations_dotl;
132extern const struct inode_operations v9fs_file_inode_operations_dotl; 158extern const struct inode_operations v9fs_file_inode_operations_dotl;
133extern const struct inode_operations v9fs_symlink_inode_operations_dotl; 159extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
134extern struct inode *v9fs_inode_dotl(struct v9fs_session_info *v9ses, 160extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
135 struct p9_fid *fid, 161 struct p9_fid *fid,
136 struct super_block *sb); 162 struct super_block *sb);
137 163
138/* other default globals */ 164/* other default globals */
139#define V9FS_PORT 564 165#define V9FS_PORT 564
@@ -147,6 +173,11 @@ static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode)
147 return (inode->i_sb->s_fs_info); 173 return (inode->i_sb->s_fs_info);
148} 174}
149 175
176static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry)
177{
178 return dentry->d_sb->s_fs_info;
179}
180
150static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) 181static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses)
151{ 182{
152 return v9ses->flags & V9FS_PROTO_2000U; 183 return v9ses->flags & V9FS_PROTO_2000U;
@@ -158,7 +189,7 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
158} 189}
159 190
160/** 191/**
161 * v9fs_inode_from_fid - Helper routine to populate an inode by 192 * v9fs_get_inode_from_fid - Helper routine to populate an inode by
162 * issuing a attribute request 193 * issuing a attribute request
163 * @v9ses: session information 194 * @v9ses: session information
164 * @fid: fid to issue attribute request for 195 * @fid: fid to issue attribute request for
@@ -166,11 +197,12 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
166 * 197 *
167 */ 198 */
168static inline struct inode * 199static inline struct inode *
169v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, 200v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
170 struct super_block *sb) 201 struct super_block *sb)
171{ 202{
172 if (v9fs_proto_dotl(v9ses)) 203 if (v9fs_proto_dotl(v9ses))
173 return v9fs_inode_dotl(v9ses, fid, sb); 204 return v9fs_inode_from_fid_dotl(v9ses, fid, sb);
174 else 205 else
175 return v9fs_inode(v9ses, fid, sb); 206 return v9fs_inode_from_fid(v9ses, fid, sb);
176} 207}
208#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index b789f8e597ec..4014160903a9 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -20,6 +20,8 @@
20 * Boston, MA 02111-1301 USA 20 * Boston, MA 02111-1301 USA
21 * 21 *
22 */ 22 */
23#ifndef FS_9P_V9FS_VFS_H
24#define FS_9P_V9FS_VFS_H
23 25
24/* plan9 semantics are that created files are implicitly opened. 26/* plan9 semantics are that created files are implicitly opened.
25 * But linux semantics are that you call create, then open. 27 * But linux semantics are that you call create, then open.
@@ -36,6 +38,7 @@
36 * unlink calls remove, which is an implicit clunk. So we have to track 38 * unlink calls remove, which is an implicit clunk. So we have to track
37 * that kind of thing so that we don't try to clunk a dead fid. 39 * that kind of thing so that we don't try to clunk a dead fid.
38 */ 40 */
41#define P9_LOCK_TIMEOUT (30*HZ)
39 42
40extern struct file_system_type v9fs_fs_type; 43extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 44extern const struct address_space_operations v9fs_addr_operations;
@@ -45,13 +48,15 @@ extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl; 48extern const struct file_operations v9fs_dir_operations_dotl;
46extern const struct dentry_operations v9fs_dentry_operations; 49extern const struct dentry_operations v9fs_dentry_operations;
47extern const struct dentry_operations v9fs_cached_dentry_operations; 50extern const struct dentry_operations v9fs_cached_dentry_operations;
51extern const struct file_operations v9fs_cached_file_operations;
52extern const struct file_operations v9fs_cached_file_operations_dotl;
53extern struct kmem_cache *v9fs_inode_cache;
48 54
49#ifdef CONFIG_9P_FSCACHE
50struct inode *v9fs_alloc_inode(struct super_block *sb); 55struct inode *v9fs_alloc_inode(struct super_block *sb);
51void v9fs_destroy_inode(struct inode *inode); 56void v9fs_destroy_inode(struct inode *inode);
52#endif
53
54struct inode *v9fs_get_inode(struct super_block *sb, int mode); 57struct inode *v9fs_get_inode(struct super_block *sb, int mode);
58int v9fs_init_inode(struct v9fs_session_info *v9ses,
59 struct inode *inode, int mode);
55void v9fs_evict_inode(struct inode *inode); 60void v9fs_evict_inode(struct inode *inode);
56ino_t v9fs_qid2ino(struct p9_qid *qid); 61ino_t v9fs_qid2ino(struct p9_qid *qid);
57void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *); 62void v9fs_stat2inode(struct p9_wstat *, struct inode *, struct super_block *);
@@ -62,8 +67,19 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat);
62int v9fs_uflags2omode(int uflags, int extended); 67int v9fs_uflags2omode(int uflags, int extended);
63 68
64ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); 69ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64);
70ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64);
65void v9fs_blank_wstat(struct p9_wstat *wstat); 71void v9fs_blank_wstat(struct p9_wstat *wstat);
66int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); 72int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *);
67int v9fs_file_fsync_dotl(struct file *filp, int datasync); 73int v9fs_file_fsync_dotl(struct file *filp, int datasync);
68 74ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *,
69#define P9_LOCK_TIMEOUT (30*HZ) 75 const char __user *, size_t, loff_t *, int);
76int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode);
77int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode);
78static inline void v9fs_invalidate_inode_attr(struct inode *inode)
79{
80 struct v9fs_inode *v9inode;
81 v9inode = V9FS_I(inode);
82 v9inode->cache_validity |= V9FS_INO_INVALID_ATTR;
83 return;
84}
85#endif
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index b7f2a8e3863e..2524e4cbb8ea 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -39,16 +39,16 @@
39#include "v9fs.h" 39#include "v9fs.h"
40#include "v9fs_vfs.h" 40#include "v9fs_vfs.h"
41#include "cache.h" 41#include "cache.h"
42#include "fid.h"
42 43
43/** 44/**
44 * v9fs_vfs_readpage - read an entire page in from 9P 45 * v9fs_fid_readpage - read an entire page in from 9P
45 * 46 *
46 * @filp: file being read 47 * @fid: fid being read
47 * @page: structure to page 48 * @page: structure to page
48 * 49 *
49 */ 50 */
50 51static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page)
51static int v9fs_vfs_readpage(struct file *filp, struct page *page)
52{ 52{
53 int retval; 53 int retval;
54 loff_t offset; 54 loff_t offset;
@@ -67,7 +67,7 @@ static int v9fs_vfs_readpage(struct file *filp, struct page *page)
67 buffer = kmap(page); 67 buffer = kmap(page);
68 offset = page_offset(page); 68 offset = page_offset(page);
69 69
70 retval = v9fs_file_readn(filp, buffer, NULL, PAGE_CACHE_SIZE, offset); 70 retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset);
71 if (retval < 0) { 71 if (retval < 0) {
72 v9fs_uncache_page(inode, page); 72 v9fs_uncache_page(inode, page);
73 goto done; 73 goto done;
@@ -87,6 +87,19 @@ done:
87} 87}
88 88
89/** 89/**
90 * v9fs_vfs_readpage - read an entire page in from 9P
91 *
92 * @filp: file being read
93 * @page: structure to page
94 *
95 */
96
97static int v9fs_vfs_readpage(struct file *filp, struct page *page)
98{
99 return v9fs_fid_readpage(filp->private_data, page);
100}
101
102/**
90 * v9fs_vfs_readpages - read a set of pages from 9P 103 * v9fs_vfs_readpages - read a set of pages from 9P
91 * 104 *
92 * @filp: file being read 105 * @filp: file being read
@@ -124,7 +137,6 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
124{ 137{
125 if (PagePrivate(page)) 138 if (PagePrivate(page))
126 return 0; 139 return 0;
127
128 return v9fs_fscache_release_page(page, gfp); 140 return v9fs_fscache_release_page(page, gfp);
129} 141}
130 142
@@ -137,20 +149,89 @@ static int v9fs_release_page(struct page *page, gfp_t gfp)
137 149
138static void v9fs_invalidate_page(struct page *page, unsigned long offset) 150static void v9fs_invalidate_page(struct page *page, unsigned long offset)
139{ 151{
152 /*
153 * If called with zero offset, we should release
154 * the private state assocated with the page
155 */
140 if (offset == 0) 156 if (offset == 0)
141 v9fs_fscache_invalidate_page(page); 157 v9fs_fscache_invalidate_page(page);
142} 158}
143 159
160static int v9fs_vfs_writepage_locked(struct page *page)
161{
162 char *buffer;
163 int retval, len;
164 loff_t offset, size;
165 mm_segment_t old_fs;
166 struct v9fs_inode *v9inode;
167 struct inode *inode = page->mapping->host;
168
169 v9inode = V9FS_I(inode);
170 size = i_size_read(inode);
171 if (page->index == size >> PAGE_CACHE_SHIFT)
172 len = size & ~PAGE_CACHE_MASK;
173 else
174 len = PAGE_CACHE_SIZE;
175
176 set_page_writeback(page);
177
178 buffer = kmap(page);
179 offset = page_offset(page);
180
181 old_fs = get_fs();
182 set_fs(get_ds());
183 /* We should have writeback_fid always set */
184 BUG_ON(!v9inode->writeback_fid);
185
186 retval = v9fs_file_write_internal(inode,
187 v9inode->writeback_fid,
188 (__force const char __user *)buffer,
189 len, &offset, 0);
190 if (retval > 0)
191 retval = 0;
192
193 set_fs(old_fs);
194 kunmap(page);
195 end_page_writeback(page);
196 return retval;
197}
198
199static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
200{
201 int retval;
202
203 retval = v9fs_vfs_writepage_locked(page);
204 if (retval < 0) {
205 if (retval == -EAGAIN) {
206 redirty_page_for_writepage(wbc, page);
207 retval = 0;
208 } else {
209 SetPageError(page);
210 mapping_set_error(page->mapping, retval);
211 }
212 } else
213 retval = 0;
214
215 unlock_page(page);
216 return retval;
217}
218
144/** 219/**
145 * v9fs_launder_page - Writeback a dirty page 220 * v9fs_launder_page - Writeback a dirty page
146 * Since the writes go directly to the server, we simply return a 0
147 * here to indicate success.
148 *
149 * Returns 0 on success. 221 * Returns 0 on success.
150 */ 222 */
151 223
152static int v9fs_launder_page(struct page *page) 224static int v9fs_launder_page(struct page *page)
153{ 225{
226 int retval;
227 struct inode *inode = page->mapping->host;
228
229 v9fs_fscache_wait_on_page_write(inode, page);
230 if (clear_page_dirty_for_io(page)) {
231 retval = v9fs_vfs_writepage_locked(page);
232 if (retval)
233 return retval;
234 }
154 return 0; 235 return 0;
155} 236}
156 237
@@ -173,9 +254,15 @@ static int v9fs_launder_page(struct page *page)
173 * with an error. 254 * with an error.
174 * 255 *
175 */ 256 */
176ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, 257static ssize_t
177 loff_t pos, unsigned long nr_segs) 258v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
259 loff_t pos, unsigned long nr_segs)
178{ 260{
261 /*
262 * FIXME
263 * Now that we do caching with cache mode enabled, We need
264 * to support direct IO
265 */
179 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) " 266 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) "
180 "off/no(%lld/%lu) EINVAL\n", 267 "off/no(%lld/%lu) EINVAL\n",
181 iocb->ki_filp->f_path.dentry->d_name.name, 268 iocb->ki_filp->f_path.dentry->d_name.name,
@@ -183,11 +270,84 @@ ssize_t v9fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
183 270
184 return -EINVAL; 271 return -EINVAL;
185} 272}
273
274static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
275 loff_t pos, unsigned len, unsigned flags,
276 struct page **pagep, void **fsdata)
277{
278 int retval = 0;
279 struct page *page;
280 struct v9fs_inode *v9inode;
281 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
282 struct inode *inode = mapping->host;
283
284 v9inode = V9FS_I(inode);
285start:
286 page = grab_cache_page_write_begin(mapping, index, flags);
287 if (!page) {
288 retval = -ENOMEM;
289 goto out;
290 }
291 BUG_ON(!v9inode->writeback_fid);
292 if (PageUptodate(page))
293 goto out;
294
295 if (len == PAGE_CACHE_SIZE)
296 goto out;
297
298 retval = v9fs_fid_readpage(v9inode->writeback_fid, page);
299 page_cache_release(page);
300 if (!retval)
301 goto start;
302out:
303 *pagep = page;
304 return retval;
305}
306
307static int v9fs_write_end(struct file *filp, struct address_space *mapping,
308 loff_t pos, unsigned len, unsigned copied,
309 struct page *page, void *fsdata)
310{
311 loff_t last_pos = pos + copied;
312 struct inode *inode = page->mapping->host;
313
314 if (unlikely(copied < len)) {
315 /*
316 * zero out the rest of the area
317 */
318 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
319
320 zero_user(page, from + copied, len - copied);
321 flush_dcache_page(page);
322 }
323
324 if (!PageUptodate(page))
325 SetPageUptodate(page);
326 /*
327 * No need to use i_size_read() here, the i_size
328 * cannot change under us because we hold the i_mutex.
329 */
330 if (last_pos > inode->i_size) {
331 inode_add_bytes(inode, last_pos - inode->i_size);
332 i_size_write(inode, last_pos);
333 }
334 set_page_dirty(page);
335 unlock_page(page);
336 page_cache_release(page);
337
338 return copied;
339}
340
341
186const struct address_space_operations v9fs_addr_operations = { 342const struct address_space_operations v9fs_addr_operations = {
187 .readpage = v9fs_vfs_readpage, 343 .readpage = v9fs_vfs_readpage,
188 .readpages = v9fs_vfs_readpages, 344 .readpages = v9fs_vfs_readpages,
189 .releasepage = v9fs_release_page, 345 .set_page_dirty = __set_page_dirty_nobuffers,
190 .invalidatepage = v9fs_invalidate_page, 346 .writepage = v9fs_vfs_writepage,
191 .launder_page = v9fs_launder_page, 347 .write_begin = v9fs_write_begin,
192 .direct_IO = v9fs_direct_IO, 348 .write_end = v9fs_write_end,
349 .releasepage = v9fs_release_page,
350 .invalidatepage = v9fs_invalidate_page,
351 .launder_page = v9fs_launder_page,
352 .direct_IO = v9fs_direct_IO,
193}; 353};
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index 233b7d4ffe5e..e022890c6f40 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -63,20 +63,15 @@ static int v9fs_dentry_delete(const struct dentry *dentry)
63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0 63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
64 * @dentry: dentry in question 64 * @dentry: dentry in question
65 * 65 *
66 * Only return 1 if our inode is invalid. Only non-synthetic files
67 * (ones without mtime == 0) should be calling this function.
68 *
69 */ 66 */
70
71static int v9fs_cached_dentry_delete(const struct dentry *dentry) 67static int v9fs_cached_dentry_delete(const struct dentry *dentry)
72{ 68{
73 struct inode *inode = dentry->d_inode; 69 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n",
74 P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, 70 dentry->d_name.name, dentry);
75 dentry);
76 71
77 if(!inode) 72 /* Don't cache negative dentries */
73 if (!dentry->d_inode)
78 return 1; 74 return 1;
79
80 return 0; 75 return 0;
81} 76}
82 77
@@ -105,7 +100,43 @@ static void v9fs_dentry_release(struct dentry *dentry)
105 } 100 }
106} 101}
107 102
103static int v9fs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
104{
105 struct p9_fid *fid;
106 struct inode *inode;
107 struct v9fs_inode *v9inode;
108
109 if (nd->flags & LOOKUP_RCU)
110 return -ECHILD;
111
112 inode = dentry->d_inode;
113 if (!inode)
114 goto out_valid;
115
116 v9inode = V9FS_I(inode);
117 if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) {
118 int retval;
119 struct v9fs_session_info *v9ses;
120 fid = v9fs_fid_lookup(dentry);
121 if (IS_ERR(fid))
122 return PTR_ERR(fid);
123
124 v9ses = v9fs_inode2v9ses(inode);
125 if (v9fs_proto_dotl(v9ses))
126 retval = v9fs_refresh_inode_dotl(fid, inode);
127 else
128 retval = v9fs_refresh_inode(fid, inode);
129 if (retval == -ENOENT)
130 return 0;
131 if (retval < 0)
132 return retval;
133 }
134out_valid:
135 return 1;
136}
137
108const struct dentry_operations v9fs_cached_dentry_operations = { 138const struct dentry_operations v9fs_cached_dentry_operations = {
139 .d_revalidate = v9fs_lookup_revalidate,
109 .d_delete = v9fs_cached_dentry_delete, 140 .d_delete = v9fs_cached_dentry_delete,
110 .d_release = v9fs_dentry_release, 141 .d_release = v9fs_dentry_release,
111}; 142};
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index b84ebe8cefed..9c2bdda5cd9d 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -295,7 +295,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp)
295 P9_DPRINTK(P9_DEBUG_VFS, 295 P9_DPRINTK(P9_DEBUG_VFS,
296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n", 296 "v9fs_dir_release: inode: %p filp: %p fid: %d\n",
297 inode, filp, fid ? fid->fid : -1); 297 inode, filp, fid ? fid->fid : -1);
298 filemap_write_and_wait(inode->i_mapping);
299 if (fid) 298 if (fid)
300 p9_client_clunk(fid); 299 p9_client_clunk(fid);
301 return 0; 300 return 0;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 240c30674396..ffed55817f0c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -44,8 +44,7 @@
44#include "fid.h" 44#include "fid.h"
45#include "cache.h" 45#include "cache.h"
46 46
47static const struct file_operations v9fs_cached_file_operations; 47static const struct vm_operations_struct v9fs_file_vm_ops;
48static const struct file_operations v9fs_cached_file_operations_dotl;
49 48
50/** 49/**
51 * v9fs_file_open - open a file (or directory) 50 * v9fs_file_open - open a file (or directory)
@@ -57,11 +56,13 @@ static const struct file_operations v9fs_cached_file_operations_dotl;
57int v9fs_file_open(struct inode *inode, struct file *file) 56int v9fs_file_open(struct inode *inode, struct file *file)
58{ 57{
59 int err; 58 int err;
59 struct v9fs_inode *v9inode;
60 struct v9fs_session_info *v9ses; 60 struct v9fs_session_info *v9ses;
61 struct p9_fid *fid; 61 struct p9_fid *fid;
62 int omode; 62 int omode;
63 63
64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file); 64 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, file);
65 v9inode = V9FS_I(inode);
65 v9ses = v9fs_inode2v9ses(inode); 66 v9ses = v9fs_inode2v9ses(inode);
66 if (v9fs_proto_dotl(v9ses)) 67 if (v9fs_proto_dotl(v9ses))
67 omode = file->f_flags; 68 omode = file->f_flags;
@@ -89,20 +90,34 @@ int v9fs_file_open(struct inode *inode, struct file *file)
89 } 90 }
90 91
91 file->private_data = fid; 92 file->private_data = fid;
92 if ((fid->qid.version) && (v9ses->cache)) { 93 mutex_lock(&v9inode->v_mutex);
93 P9_DPRINTK(P9_DEBUG_VFS, "cached"); 94 if (v9ses->cache && !v9inode->writeback_fid &&
94 /* enable cached file options */ 95 ((file->f_flags & O_ACCMODE) != O_RDONLY)) {
95 if(file->f_op == &v9fs_file_operations) 96 /*
96 file->f_op = &v9fs_cached_file_operations; 97 * clone a fid and add it to writeback_fid
97 else if (file->f_op == &v9fs_file_operations_dotl) 98 * we do it during open time instead of
98 file->f_op = &v9fs_cached_file_operations_dotl; 99 * page dirty time via write_begin/page_mkwrite
99 100 * because we want write after unlink usecase
101 * to work.
102 */
103 fid = v9fs_writeback_fid(file->f_path.dentry);
104 if (IS_ERR(fid)) {
105 err = PTR_ERR(fid);
106 mutex_unlock(&v9inode->v_mutex);
107 goto out_error;
108 }
109 v9inode->writeback_fid = (void *) fid;
110 }
111 mutex_unlock(&v9inode->v_mutex);
100#ifdef CONFIG_9P_FSCACHE 112#ifdef CONFIG_9P_FSCACHE
113 if (v9ses->cache)
101 v9fs_cache_inode_set_cookie(inode, file); 114 v9fs_cache_inode_set_cookie(inode, file);
102#endif 115#endif
103 }
104
105 return 0; 116 return 0;
117out_error:
118 p9_client_clunk(file->private_data);
119 file->private_data = NULL;
120 return err;
106} 121}
107 122
108/** 123/**
@@ -335,25 +350,22 @@ out_err:
335} 350}
336 351
337/** 352/**
338 * v9fs_file_readn - read from a file 353 * v9fs_fid_readn - read from a fid
339 * @filp: file pointer to read 354 * @fid: fid to read
340 * @data: data buffer to read data into 355 * @data: data buffer to read data into
341 * @udata: user data buffer to read data into 356 * @udata: user data buffer to read data into
342 * @count: size of buffer 357 * @count: size of buffer
343 * @offset: offset at which to read data 358 * @offset: offset at which to read data
344 * 359 *
345 */ 360 */
346
347ssize_t 361ssize_t
348v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, 362v9fs_fid_readn(struct p9_fid *fid, char *data, char __user *udata, u32 count,
349 u64 offset) 363 u64 offset)
350{ 364{
351 int n, total, size; 365 int n, total, size;
352 struct p9_fid *fid = filp->private_data;
353 366
354 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid, 367 P9_DPRINTK(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", fid->fid,
355 (long long unsigned) offset, count); 368 (long long unsigned) offset, count);
356
357 n = 0; 369 n = 0;
358 total = 0; 370 total = 0;
359 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; 371 size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ;
@@ -379,6 +391,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
379} 391}
380 392
381/** 393/**
394 * v9fs_file_readn - read from a file
395 * @filp: file pointer to read
396 * @data: data buffer to read data into
397 * @udata: user data buffer to read data into
398 * @count: size of buffer
399 * @offset: offset at which to read data
400 *
401 */
402ssize_t
403v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count,
404 u64 offset)
405{
406 return v9fs_fid_readn(filp->private_data, data, udata, count, offset);
407}
408
409/**
382 * v9fs_file_read - read from a file 410 * v9fs_file_read - read from a file
383 * @filp: file pointer to read 411 * @filp: file pointer to read
384 * @udata: user data buffer to read data into 412 * @udata: user data buffer to read data into
@@ -410,45 +438,22 @@ v9fs_file_read(struct file *filp, char __user *udata, size_t count,
410 return ret; 438 return ret;
411} 439}
412 440
413/** 441ssize_t
414 * v9fs_file_write - write to a file 442v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
415 * @filp: file pointer to write 443 const char __user *data, size_t count,
416 * @data: data buffer to write data from 444 loff_t *offset, int invalidate)
417 * @count: size of buffer
418 * @offset: offset at which to write data
419 *
420 */
421
422static ssize_t
423v9fs_file_write(struct file *filp, const char __user * data,
424 size_t count, loff_t * offset)
425{ 445{
426 ssize_t retval;
427 size_t total = 0;
428 int n; 446 int n;
429 struct p9_fid *fid; 447 loff_t i_size;
448 size_t total = 0;
430 struct p9_client *clnt; 449 struct p9_client *clnt;
431 struct inode *inode = filp->f_path.dentry->d_inode;
432 loff_t origin = *offset; 450 loff_t origin = *offset;
433 unsigned long pg_start, pg_end; 451 unsigned long pg_start, pg_end;
434 452
435 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data, 453 P9_DPRINTK(P9_DEBUG_VFS, "data %p count %d offset %x\n", data,
436 (int)count, (int)*offset); 454 (int)count, (int)*offset);
437 455
438 fid = filp->private_data;
439 clnt = fid->clnt; 456 clnt = fid->clnt;
440
441 retval = generic_write_checks(filp, &origin, &count, 0);
442 if (retval)
443 goto out;
444
445 retval = -EINVAL;
446 if ((ssize_t) count < 0)
447 goto out;
448 retval = 0;
449 if (!count)
450 goto out;
451
452 do { 457 do {
453 n = p9_client_write(fid, NULL, data+total, origin+total, count); 458 n = p9_client_write(fid, NULL, data+total, origin+total, count);
454 if (n <= 0) 459 if (n <= 0)
@@ -457,25 +462,63 @@ v9fs_file_write(struct file *filp, const char __user * data,
457 total += n; 462 total += n;
458 } while (count > 0); 463 } while (count > 0);
459 464
460 if (total > 0) { 465 if (invalidate && (total > 0)) {
461 pg_start = origin >> PAGE_CACHE_SHIFT; 466 pg_start = origin >> PAGE_CACHE_SHIFT;
462 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; 467 pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
463 if (inode->i_mapping && inode->i_mapping->nrpages) 468 if (inode->i_mapping && inode->i_mapping->nrpages)
464 invalidate_inode_pages2_range(inode->i_mapping, 469 invalidate_inode_pages2_range(inode->i_mapping,
465 pg_start, pg_end); 470 pg_start, pg_end);
466 *offset += total; 471 *offset += total;
467 i_size_write(inode, i_size_read(inode) + total); 472 i_size = i_size_read(inode);
468 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 473 if (*offset > i_size) {
474 inode_add_bytes(inode, *offset - i_size);
475 i_size_write(inode, *offset);
476 }
469 } 477 }
470
471 if (n < 0) 478 if (n < 0)
472 retval = n; 479 return n;
473 else 480
474 retval = total; 481 return total;
482}
483
484/**
485 * v9fs_file_write - write to a file
486 * @filp: file pointer to write
487 * @data: data buffer to write data from
488 * @count: size of buffer
489 * @offset: offset at which to write data
490 *
491 */
492static ssize_t
493v9fs_file_write(struct file *filp, const char __user * data,
494 size_t count, loff_t *offset)
495{
496 ssize_t retval = 0;
497 loff_t origin = *offset;
498
499
500 retval = generic_write_checks(filp, &origin, &count, 0);
501 if (retval)
502 goto out;
503
504 retval = -EINVAL;
505 if ((ssize_t) count < 0)
506 goto out;
507 retval = 0;
508 if (!count)
509 goto out;
510
511 retval = v9fs_file_write_internal(filp->f_path.dentry->d_inode,
512 filp->private_data,
513 data, count, &origin, 1);
514 /* update offset on successful write */
515 if (retval > 0)
516 *offset = origin;
475out: 517out:
476 return retval; 518 return retval;
477} 519}
478 520
521
479static int v9fs_file_fsync(struct file *filp, int datasync) 522static int v9fs_file_fsync(struct file *filp, int datasync)
480{ 523{
481 struct p9_fid *fid; 524 struct p9_fid *fid;
@@ -505,28 +548,182 @@ int v9fs_file_fsync_dotl(struct file *filp, int datasync)
505 return retval; 548 return retval;
506} 549}
507 550
508static const struct file_operations v9fs_cached_file_operations = { 551static int
552v9fs_file_mmap(struct file *file, struct vm_area_struct *vma)
553{
554 int retval;
555
556 retval = generic_file_mmap(file, vma);
557 if (!retval)
558 vma->vm_ops = &v9fs_file_vm_ops;
559
560 return retval;
561}
562
563static int
564v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
565{
566 struct v9fs_inode *v9inode;
567 struct page *page = vmf->page;
568 struct file *filp = vma->vm_file;
569 struct inode *inode = filp->f_path.dentry->d_inode;
570
571
572 P9_DPRINTK(P9_DEBUG_VFS, "page %p fid %lx\n",
573 page, (unsigned long)filp->private_data);
574
575 v9inode = V9FS_I(inode);
576 /* make sure the cache has finished storing the page */
577 v9fs_fscache_wait_on_page_write(inode, page);
578 BUG_ON(!v9inode->writeback_fid);
579 lock_page(page);
580 if (page->mapping != inode->i_mapping)
581 goto out_unlock;
582
583 return VM_FAULT_LOCKED;
584out_unlock:
585 unlock_page(page);
586 return VM_FAULT_NOPAGE;
587}
588
589static ssize_t
590v9fs_direct_read(struct file *filp, char __user *udata, size_t count,
591 loff_t *offsetp)
592{
593 loff_t size, offset;
594 struct inode *inode;
595 struct address_space *mapping;
596
597 offset = *offsetp;
598 mapping = filp->f_mapping;
599 inode = mapping->host;
600 if (!count)
601 return 0;
602 size = i_size_read(inode);
603 if (offset < size)
604 filemap_write_and_wait_range(mapping, offset,
605 offset + count - 1);
606
607 return v9fs_file_read(filp, udata, count, offsetp);
608}
609
610/**
611 * v9fs_cached_file_read - read from a file
612 * @filp: file pointer to read
613 * @udata: user data buffer to read data into
614 * @count: size of buffer
615 * @offset: offset at which to read data
616 *
617 */
618static ssize_t
619v9fs_cached_file_read(struct file *filp, char __user *data, size_t count,
620 loff_t *offset)
621{
622 if (filp->f_flags & O_DIRECT)
623 return v9fs_direct_read(filp, data, count, offset);
624 return do_sync_read(filp, data, count, offset);
625}
626
627static ssize_t
628v9fs_direct_write(struct file *filp, const char __user * data,
629 size_t count, loff_t *offsetp)
630{
631 loff_t offset;
632 ssize_t retval;
633 struct inode *inode;
634 struct address_space *mapping;
635
636 offset = *offsetp;
637 mapping = filp->f_mapping;
638 inode = mapping->host;
639 if (!count)
640 return 0;
641
642 mutex_lock(&inode->i_mutex);
643 retval = filemap_write_and_wait_range(mapping, offset,
644 offset + count - 1);
645 if (retval)
646 goto err_out;
647 /*
648 * After a write we want buffered reads to be sure to go to disk to get
649 * the new data. We invalidate clean cached page from the region we're
650 * about to write. We do this *before* the write so that if we fail
651 * here we fall back to buffered write
652 */
653 if (mapping->nrpages) {
654 pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
655 pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
656
657 retval = invalidate_inode_pages2_range(mapping,
658 pg_start, pg_end);
659 /*
660 * If a page can not be invalidated, fall back
661 * to buffered write.
662 */
663 if (retval) {
664 if (retval == -EBUSY)
665 goto buff_write;
666 goto err_out;
667 }
668 }
669 retval = v9fs_file_write(filp, data, count, offsetp);
670err_out:
671 mutex_unlock(&inode->i_mutex);
672 return retval;
673
674buff_write:
675 mutex_unlock(&inode->i_mutex);
676 return do_sync_write(filp, data, count, offsetp);
677}
678
679/**
680 * v9fs_cached_file_write - write to a file
681 * @filp: file pointer to write
682 * @data: data buffer to write data from
683 * @count: size of buffer
684 * @offset: offset at which to write data
685 *
686 */
687static ssize_t
688v9fs_cached_file_write(struct file *filp, const char __user * data,
689 size_t count, loff_t *offset)
690{
691
692 if (filp->f_flags & O_DIRECT)
693 return v9fs_direct_write(filp, data, count, offset);
694 return do_sync_write(filp, data, count, offset);
695}
696
697static const struct vm_operations_struct v9fs_file_vm_ops = {
698 .fault = filemap_fault,
699 .page_mkwrite = v9fs_vm_page_mkwrite,
700};
701
702
703const struct file_operations v9fs_cached_file_operations = {
509 .llseek = generic_file_llseek, 704 .llseek = generic_file_llseek,
510 .read = do_sync_read, 705 .read = v9fs_cached_file_read,
706 .write = v9fs_cached_file_write,
511 .aio_read = generic_file_aio_read, 707 .aio_read = generic_file_aio_read,
512 .write = v9fs_file_write, 708 .aio_write = generic_file_aio_write,
513 .open = v9fs_file_open, 709 .open = v9fs_file_open,
514 .release = v9fs_dir_release, 710 .release = v9fs_dir_release,
515 .lock = v9fs_file_lock, 711 .lock = v9fs_file_lock,
516 .mmap = generic_file_readonly_mmap, 712 .mmap = v9fs_file_mmap,
517 .fsync = v9fs_file_fsync, 713 .fsync = v9fs_file_fsync,
518}; 714};
519 715
520static const struct file_operations v9fs_cached_file_operations_dotl = { 716const struct file_operations v9fs_cached_file_operations_dotl = {
521 .llseek = generic_file_llseek, 717 .llseek = generic_file_llseek,
522 .read = do_sync_read, 718 .read = v9fs_cached_file_read,
719 .write = v9fs_cached_file_write,
523 .aio_read = generic_file_aio_read, 720 .aio_read = generic_file_aio_read,
524 .write = v9fs_file_write, 721 .aio_write = generic_file_aio_write,
525 .open = v9fs_file_open, 722 .open = v9fs_file_open,
526 .release = v9fs_dir_release, 723 .release = v9fs_dir_release,
527 .lock = v9fs_file_lock_dotl, 724 .lock = v9fs_file_lock_dotl,
528 .flock = v9fs_file_flock_dotl, 725 .flock = v9fs_file_flock_dotl,
529 .mmap = generic_file_readonly_mmap, 726 .mmap = v9fs_file_mmap,
530 .fsync = v9fs_file_fsync_dotl, 727 .fsync = v9fs_file_fsync_dotl,
531}; 728};
532 729
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b76a40bdf4c2..7f6c67703195 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -203,26 +203,26 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
203 wstat->extension = NULL; 203 wstat->extension = NULL;
204} 204}
205 205
206#ifdef CONFIG_9P_FSCACHE
207/** 206/**
208 * v9fs_alloc_inode - helper function to allocate an inode 207 * v9fs_alloc_inode - helper function to allocate an inode
209 * This callback is executed before setting up the inode so that we
210 * can associate a vcookie with each inode.
211 * 208 *
212 */ 209 */
213
214struct inode *v9fs_alloc_inode(struct super_block *sb) 210struct inode *v9fs_alloc_inode(struct super_block *sb)
215{ 211{
216 struct v9fs_cookie *vcookie; 212 struct v9fs_inode *v9inode;
217 vcookie = (struct v9fs_cookie *)kmem_cache_alloc(vcookie_cache, 213 v9inode = (struct v9fs_inode *)kmem_cache_alloc(v9fs_inode_cache,
218 GFP_KERNEL); 214 GFP_KERNEL);
219 if (!vcookie) 215 if (!v9inode)
220 return NULL; 216 return NULL;
221 217#ifdef CONFIG_9P_FSCACHE
222 vcookie->fscache = NULL; 218 v9inode->fscache = NULL;
223 vcookie->qid = NULL; 219 v9inode->fscache_key = NULL;
224 spin_lock_init(&vcookie->lock); 220 spin_lock_init(&v9inode->fscache_lock);
225 return &vcookie->inode; 221#endif
222 v9inode->writeback_fid = NULL;
223 v9inode->cache_validity = 0;
224 mutex_init(&v9inode->v_mutex);
225 return &v9inode->vfs_inode;
226} 226}
227 227
228/** 228/**
@@ -234,35 +234,18 @@ static void v9fs_i_callback(struct rcu_head *head)
234{ 234{
235 struct inode *inode = container_of(head, struct inode, i_rcu); 235 struct inode *inode = container_of(head, struct inode, i_rcu);
236 INIT_LIST_HEAD(&inode->i_dentry); 236 INIT_LIST_HEAD(&inode->i_dentry);
237 kmem_cache_free(vcookie_cache, v9fs_inode2cookie(inode)); 237 kmem_cache_free(v9fs_inode_cache, V9FS_I(inode));
238} 238}
239 239
240void v9fs_destroy_inode(struct inode *inode) 240void v9fs_destroy_inode(struct inode *inode)
241{ 241{
242 call_rcu(&inode->i_rcu, v9fs_i_callback); 242 call_rcu(&inode->i_rcu, v9fs_i_callback);
243} 243}
244#endif
245
246/**
247 * v9fs_get_inode - helper function to setup an inode
248 * @sb: superblock
249 * @mode: mode to setup inode with
250 *
251 */
252 244
253struct inode *v9fs_get_inode(struct super_block *sb, int mode) 245int v9fs_init_inode(struct v9fs_session_info *v9ses,
246 struct inode *inode, int mode)
254{ 247{
255 int err; 248 int err = 0;
256 struct inode *inode;
257 struct v9fs_session_info *v9ses = sb->s_fs_info;
258
259 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
260
261 inode = new_inode(sb);
262 if (!inode) {
263 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
264 return ERR_PTR(-ENOMEM);
265 }
266 249
267 inode_init_owner(inode, NULL, mode); 250 inode_init_owner(inode, NULL, mode);
268 inode->i_blocks = 0; 251 inode->i_blocks = 0;
@@ -292,14 +275,20 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
292 case S_IFREG: 275 case S_IFREG:
293 if (v9fs_proto_dotl(v9ses)) { 276 if (v9fs_proto_dotl(v9ses)) {
294 inode->i_op = &v9fs_file_inode_operations_dotl; 277 inode->i_op = &v9fs_file_inode_operations_dotl;
295 inode->i_fop = &v9fs_file_operations_dotl; 278 if (v9ses->cache)
279 inode->i_fop =
280 &v9fs_cached_file_operations_dotl;
281 else
282 inode->i_fop = &v9fs_file_operations_dotl;
296 } else { 283 } else {
297 inode->i_op = &v9fs_file_inode_operations; 284 inode->i_op = &v9fs_file_inode_operations;
298 inode->i_fop = &v9fs_file_operations; 285 if (v9ses->cache)
286 inode->i_fop = &v9fs_cached_file_operations;
287 else
288 inode->i_fop = &v9fs_file_operations;
299 } 289 }
300 290
301 break; 291 break;
302
303 case S_IFLNK: 292 case S_IFLNK:
304 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) { 293 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
305 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with " 294 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
@@ -335,12 +324,37 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
335 err = -EINVAL; 324 err = -EINVAL;
336 goto error; 325 goto error;
337 } 326 }
327error:
328 return err;
338 329
339 return inode; 330}
340 331
341error: 332/**
342 iput(inode); 333 * v9fs_get_inode - helper function to setup an inode
343 return ERR_PTR(err); 334 * @sb: superblock
335 * @mode: mode to setup inode with
336 *
337 */
338
339struct inode *v9fs_get_inode(struct super_block *sb, int mode)
340{
341 int err;
342 struct inode *inode;
343 struct v9fs_session_info *v9ses = sb->s_fs_info;
344
345 P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
346
347 inode = new_inode(sb);
348 if (!inode) {
349 P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
350 return ERR_PTR(-ENOMEM);
351 }
352 err = v9fs_init_inode(v9ses, inode, mode);
353 if (err) {
354 iput(inode);
355 return ERR_PTR(err);
356 }
357 return inode;
344} 358}
345 359
346/* 360/*
@@ -403,6 +417,8 @@ error:
403 */ 417 */
404void v9fs_evict_inode(struct inode *inode) 418void v9fs_evict_inode(struct inode *inode)
405{ 419{
420 struct v9fs_inode *v9inode = V9FS_I(inode);
421
406 truncate_inode_pages(inode->i_mapping, 0); 422 truncate_inode_pages(inode->i_mapping, 0);
407 end_writeback(inode); 423 end_writeback(inode);
408 filemap_fdatawrite(inode->i_mapping); 424 filemap_fdatawrite(inode->i_mapping);
@@ -410,41 +426,67 @@ void v9fs_evict_inode(struct inode *inode)
410#ifdef CONFIG_9P_FSCACHE 426#ifdef CONFIG_9P_FSCACHE
411 v9fs_cache_inode_put_cookie(inode); 427 v9fs_cache_inode_put_cookie(inode);
412#endif 428#endif
429 /* clunk the fid stashed in writeback_fid */
430 if (v9inode->writeback_fid) {
431 p9_client_clunk(v9inode->writeback_fid);
432 v9inode->writeback_fid = NULL;
433 }
413} 434}
414 435
415struct inode * 436static struct inode *v9fs_qid_iget(struct super_block *sb,
416v9fs_inode(struct v9fs_session_info *v9ses, struct p9_fid *fid, 437 struct p9_qid *qid,
417 struct super_block *sb) 438 struct p9_wstat *st)
418{ 439{
419 int err, umode; 440 int retval, umode;
420 struct inode *ret = NULL; 441 unsigned long i_ino;
421 struct p9_wstat *st; 442 struct inode *inode;
422 443 struct v9fs_session_info *v9ses = sb->s_fs_info;
423 st = p9_client_stat(fid);
424 if (IS_ERR(st))
425 return ERR_CAST(st);
426 444
445 i_ino = v9fs_qid2ino(qid);
446 inode = iget_locked(sb, i_ino);
447 if (!inode)
448 return ERR_PTR(-ENOMEM);
449 if (!(inode->i_state & I_NEW))
450 return inode;
451 /*
452 * initialize the inode with the stat info
453 * FIXME!! we may need support for stale inodes
454 * later.
455 */
427 umode = p9mode2unixmode(v9ses, st->mode); 456 umode = p9mode2unixmode(v9ses, st->mode);
428 ret = v9fs_get_inode(sb, umode); 457 retval = v9fs_init_inode(v9ses, inode, umode);
429 if (IS_ERR(ret)) { 458 if (retval)
430 err = PTR_ERR(ret);
431 goto error; 459 goto error;
432 }
433
434 v9fs_stat2inode(st, ret, sb);
435 ret->i_ino = v9fs_qid2ino(&st->qid);
436 460
461 v9fs_stat2inode(st, inode, sb);
437#ifdef CONFIG_9P_FSCACHE 462#ifdef CONFIG_9P_FSCACHE
438 v9fs_vcookie_set_qid(ret, &st->qid); 463 v9fs_fscache_set_key(inode, &st->qid);
439 v9fs_cache_inode_get_cookie(ret); 464 v9fs_cache_inode_get_cookie(inode);
440#endif 465#endif
441 p9stat_free(st); 466 unlock_new_inode(inode);
442 kfree(st); 467 return inode;
443 return ret;
444error: 468error:
469 unlock_new_inode(inode);
470 iput(inode);
471 return ERR_PTR(retval);
472
473}
474
475struct inode *
476v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
477 struct super_block *sb)
478{
479 struct p9_wstat *st;
480 struct inode *inode = NULL;
481
482 st = p9_client_stat(fid);
483 if (IS_ERR(st))
484 return ERR_CAST(st);
485
486 inode = v9fs_qid_iget(sb, &st->qid, st);
445 p9stat_free(st); 487 p9stat_free(st);
446 kfree(st); 488 kfree(st);
447 return ERR_PTR(err); 489 return inode;
448} 490}
449 491
450/** 492/**
@@ -458,8 +500,8 @@ error:
458static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) 500static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
459{ 501{
460 int retval; 502 int retval;
461 struct inode *file_inode;
462 struct p9_fid *v9fid; 503 struct p9_fid *v9fid;
504 struct inode *file_inode;
463 505
464 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 506 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
465 rmdir); 507 rmdir);
@@ -470,8 +512,20 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
470 return PTR_ERR(v9fid); 512 return PTR_ERR(v9fid);
471 513
472 retval = p9_client_remove(v9fid); 514 retval = p9_client_remove(v9fid);
473 if (!retval) 515 if (!retval) {
474 drop_nlink(file_inode); 516 /*
517 * directories on unlink should have zero
518 * link count
519 */
520 if (rmdir) {
521 clear_nlink(file_inode);
522 drop_nlink(dir);
523 } else
524 drop_nlink(file_inode);
525
526 v9fs_invalidate_inode_attr(file_inode);
527 v9fs_invalidate_inode_attr(dir);
528 }
475 return retval; 529 return retval;
476} 530}
477 531
@@ -531,7 +585,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
531 } 585 }
532 586
533 /* instantiate inode and assign the unopened fid to the dentry */ 587 /* instantiate inode and assign the unopened fid to the dentry */
534 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 588 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
535 if (IS_ERR(inode)) { 589 if (IS_ERR(inode)) {
536 err = PTR_ERR(inode); 590 err = PTR_ERR(inode);
537 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 591 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -570,9 +624,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
570 int err; 624 int err;
571 u32 perm; 625 u32 perm;
572 int flags; 626 int flags;
573 struct v9fs_session_info *v9ses;
574 struct p9_fid *fid;
575 struct file *filp; 627 struct file *filp;
628 struct v9fs_inode *v9inode;
629 struct v9fs_session_info *v9ses;
630 struct p9_fid *fid, *inode_fid;
576 631
577 err = 0; 632 err = 0;
578 fid = NULL; 633 fid = NULL;
@@ -592,8 +647,29 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
592 goto error; 647 goto error;
593 } 648 }
594 649
650 v9fs_invalidate_inode_attr(dir);
595 /* if we are opening a file, assign the open fid to the file */ 651 /* if we are opening a file, assign the open fid to the file */
596 if (nd && nd->flags & LOOKUP_OPEN) { 652 if (nd && nd->flags & LOOKUP_OPEN) {
653 v9inode = V9FS_I(dentry->d_inode);
654 mutex_lock(&v9inode->v_mutex);
655 if (v9ses->cache && !v9inode->writeback_fid &&
656 ((flags & O_ACCMODE) != O_RDONLY)) {
657 /*
658 * clone a fid and add it to writeback_fid
659 * we do it during open time instead of
660 * page dirty time via write_begin/page_mkwrite
661 * because we want write after unlink usecase
662 * to work.
663 */
664 inode_fid = v9fs_writeback_fid(dentry);
665 if (IS_ERR(inode_fid)) {
666 err = PTR_ERR(inode_fid);
667 mutex_unlock(&v9inode->v_mutex);
668 goto error;
669 }
670 v9inode->writeback_fid = (void *) inode_fid;
671 }
672 mutex_unlock(&v9inode->v_mutex);
597 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 673 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
598 if (IS_ERR(filp)) { 674 if (IS_ERR(filp)) {
599 err = PTR_ERR(filp); 675 err = PTR_ERR(filp);
@@ -601,6 +677,10 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode,
601 } 677 }
602 678
603 filp->private_data = fid; 679 filp->private_data = fid;
680#ifdef CONFIG_9P_FSCACHE
681 if (v9ses->cache)
682 v9fs_cache_inode_set_cookie(dentry->d_inode, filp);
683#endif
604 } else 684 } else
605 p9_client_clunk(fid); 685 p9_client_clunk(fid);
606 686
@@ -625,8 +705,8 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
625{ 705{
626 int err; 706 int err;
627 u32 perm; 707 u32 perm;
628 struct v9fs_session_info *v9ses;
629 struct p9_fid *fid; 708 struct p9_fid *fid;
709 struct v9fs_session_info *v9ses;
630 710
631 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); 711 P9_DPRINTK(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name);
632 err = 0; 712 err = 0;
@@ -636,6 +716,9 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
636 if (IS_ERR(fid)) { 716 if (IS_ERR(fid)) {
637 err = PTR_ERR(fid); 717 err = PTR_ERR(fid);
638 fid = NULL; 718 fid = NULL;
719 } else {
720 inc_nlink(dir);
721 v9fs_invalidate_inode_attr(dir);
639 } 722 }
640 723
641 if (fid) 724 if (fid)
@@ -687,7 +770,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
687 return ERR_PTR(result); 770 return ERR_PTR(result);
688 } 771 }
689 772
690 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 773 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
691 if (IS_ERR(inode)) { 774 if (IS_ERR(inode)) {
692 result = PTR_ERR(inode); 775 result = PTR_ERR(inode);
693 inode = NULL; 776 inode = NULL;
@@ -747,17 +830,19 @@ int
747v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, 830v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
748 struct inode *new_dir, struct dentry *new_dentry) 831 struct inode *new_dir, struct dentry *new_dentry)
749{ 832{
833 int retval;
750 struct inode *old_inode; 834 struct inode *old_inode;
835 struct inode *new_inode;
751 struct v9fs_session_info *v9ses; 836 struct v9fs_session_info *v9ses;
752 struct p9_fid *oldfid; 837 struct p9_fid *oldfid;
753 struct p9_fid *olddirfid; 838 struct p9_fid *olddirfid;
754 struct p9_fid *newdirfid; 839 struct p9_fid *newdirfid;
755 struct p9_wstat wstat; 840 struct p9_wstat wstat;
756 int retval;
757 841
758 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 842 P9_DPRINTK(P9_DEBUG_VFS, "\n");
759 retval = 0; 843 retval = 0;
760 old_inode = old_dentry->d_inode; 844 old_inode = old_dentry->d_inode;
845 new_inode = new_dentry->d_inode;
761 v9ses = v9fs_inode2v9ses(old_inode); 846 v9ses = v9fs_inode2v9ses(old_inode);
762 oldfid = v9fs_fid_lookup(old_dentry); 847 oldfid = v9fs_fid_lookup(old_dentry);
763 if (IS_ERR(oldfid)) 848 if (IS_ERR(oldfid))
@@ -798,9 +883,30 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
798 retval = p9_client_wstat(oldfid, &wstat); 883 retval = p9_client_wstat(oldfid, &wstat);
799 884
800clunk_newdir: 885clunk_newdir:
801 if (!retval) 886 if (!retval) {
887 if (new_inode) {
888 if (S_ISDIR(new_inode->i_mode))
889 clear_nlink(new_inode);
890 else
891 drop_nlink(new_inode);
892 /*
893 * Work around vfs rename rehash bug with
894 * FS_RENAME_DOES_D_MOVE
895 */
896 v9fs_invalidate_inode_attr(new_inode);
897 }
898 if (S_ISDIR(old_inode->i_mode)) {
899 if (!new_inode)
900 inc_nlink(new_dir);
901 drop_nlink(old_dir);
902 }
903 v9fs_invalidate_inode_attr(old_inode);
904 v9fs_invalidate_inode_attr(old_dir);
905 v9fs_invalidate_inode_attr(new_dir);
906
802 /* successful rename */ 907 /* successful rename */
803 d_move(old_dentry, new_dentry); 908 d_move(old_dentry, new_dentry);
909 }
804 up_write(&v9ses->rename_sem); 910 up_write(&v9ses->rename_sem);
805 p9_client_clunk(newdirfid); 911 p9_client_clunk(newdirfid);
806 912
@@ -830,10 +936,11 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
830 936
831 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 937 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
832 err = -EPERM; 938 err = -EPERM;
833 v9ses = v9fs_inode2v9ses(dentry->d_inode); 939 v9ses = v9fs_dentry2v9ses(dentry);
834 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 940 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
835 return simple_getattr(mnt, dentry, stat); 941 generic_fillattr(dentry->d_inode, stat);
836 942 return 0;
943 }
837 fid = v9fs_fid_lookup(dentry); 944 fid = v9fs_fid_lookup(dentry);
838 if (IS_ERR(fid)) 945 if (IS_ERR(fid))
839 return PTR_ERR(fid); 946 return PTR_ERR(fid);
@@ -865,8 +972,12 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
865 struct p9_wstat wstat; 972 struct p9_wstat wstat;
866 973
867 P9_DPRINTK(P9_DEBUG_VFS, "\n"); 974 P9_DPRINTK(P9_DEBUG_VFS, "\n");
975 retval = inode_change_ok(dentry->d_inode, iattr);
976 if (retval)
977 return retval;
978
868 retval = -EPERM; 979 retval = -EPERM;
869 v9ses = v9fs_inode2v9ses(dentry->d_inode); 980 v9ses = v9fs_dentry2v9ses(dentry);
870 fid = v9fs_fid_lookup(dentry); 981 fid = v9fs_fid_lookup(dentry);
871 if(IS_ERR(fid)) 982 if(IS_ERR(fid))
872 return PTR_ERR(fid); 983 return PTR_ERR(fid);
@@ -892,16 +1003,19 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr)
892 wstat.n_gid = iattr->ia_gid; 1003 wstat.n_gid = iattr->ia_gid;
893 } 1004 }
894 1005
1006 /* Write all dirty data */
1007 if (S_ISREG(dentry->d_inode->i_mode))
1008 filemap_write_and_wait(dentry->d_inode->i_mapping);
1009
895 retval = p9_client_wstat(fid, &wstat); 1010 retval = p9_client_wstat(fid, &wstat);
896 if (retval < 0) 1011 if (retval < 0)
897 return retval; 1012 return retval;
898 1013
899 if ((iattr->ia_valid & ATTR_SIZE) && 1014 if ((iattr->ia_valid & ATTR_SIZE) &&
900 iattr->ia_size != i_size_read(dentry->d_inode)) { 1015 iattr->ia_size != i_size_read(dentry->d_inode))
901 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 1016 truncate_setsize(dentry->d_inode, iattr->ia_size);
902 if (retval) 1017
903 return retval; 1018 v9fs_invalidate_inode_attr(dentry->d_inode);
904 }
905 1019
906 setattr_copy(dentry->d_inode, iattr); 1020 setattr_copy(dentry->d_inode, iattr);
907 mark_inode_dirty(dentry->d_inode); 1021 mark_inode_dirty(dentry->d_inode);
@@ -924,6 +1038,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
924 char tag_name[14]; 1038 char tag_name[14];
925 unsigned int i_nlink; 1039 unsigned int i_nlink;
926 struct v9fs_session_info *v9ses = sb->s_fs_info; 1040 struct v9fs_session_info *v9ses = sb->s_fs_info;
1041 struct v9fs_inode *v9inode = V9FS_I(inode);
927 1042
928 inode->i_nlink = 1; 1043 inode->i_nlink = 1;
929 1044
@@ -983,6 +1098,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
983 1098
984 /* not real number of blocks, but 512 byte ones ... */ 1099 /* not real number of blocks, but 512 byte ones ... */
985 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9; 1100 inode->i_blocks = (i_size_read(inode) + 512 - 1) >> 9;
1101 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
986} 1102}
987 1103
988/** 1104/**
@@ -1023,7 +1139,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen)
1023 1139
1024 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); 1140 P9_DPRINTK(P9_DEBUG_VFS, " %s\n", dentry->d_name.name);
1025 retval = -EPERM; 1141 retval = -EPERM;
1026 v9ses = v9fs_inode2v9ses(dentry->d_inode); 1142 v9ses = v9fs_dentry2v9ses(dentry);
1027 fid = v9fs_fid_lookup(dentry); 1143 fid = v9fs_fid_lookup(dentry);
1028 if (IS_ERR(fid)) 1144 if (IS_ERR(fid))
1029 return PTR_ERR(fid); 1145 return PTR_ERR(fid);
@@ -1115,8 +1231,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1115 int mode, const char *extension) 1231 int mode, const char *extension)
1116{ 1232{
1117 u32 perm; 1233 u32 perm;
1118 struct v9fs_session_info *v9ses;
1119 struct p9_fid *fid; 1234 struct p9_fid *fid;
1235 struct v9fs_session_info *v9ses;
1120 1236
1121 v9ses = v9fs_inode2v9ses(dir); 1237 v9ses = v9fs_inode2v9ses(dir);
1122 if (!v9fs_proto_dotu(v9ses)) { 1238 if (!v9fs_proto_dotu(v9ses)) {
@@ -1130,6 +1246,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry,
1130 if (IS_ERR(fid)) 1246 if (IS_ERR(fid))
1131 return PTR_ERR(fid); 1247 return PTR_ERR(fid);
1132 1248
1249 v9fs_invalidate_inode_attr(dir);
1133 p9_client_clunk(fid); 1250 p9_client_clunk(fid);
1134 return 0; 1251 return 0;
1135} 1252}
@@ -1166,8 +1283,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1166 struct dentry *dentry) 1283 struct dentry *dentry)
1167{ 1284{
1168 int retval; 1285 int retval;
1169 struct p9_fid *oldfid;
1170 char *name; 1286 char *name;
1287 struct p9_fid *oldfid;
1171 1288
1172 P9_DPRINTK(P9_DEBUG_VFS, 1289 P9_DPRINTK(P9_DEBUG_VFS,
1173 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name, 1290 " %lu,%s,%s\n", dir->i_ino, dentry->d_name.name,
@@ -1186,7 +1303,10 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir,
1186 sprintf(name, "%d\n", oldfid->fid); 1303 sprintf(name, "%d\n", oldfid->fid);
1187 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); 1304 retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name);
1188 __putname(name); 1305 __putname(name);
1189 1306 if (!retval) {
1307 v9fs_refresh_inode(oldfid, old_dentry->d_inode);
1308 v9fs_invalidate_inode_attr(dir);
1309 }
1190clunk_fid: 1310clunk_fid:
1191 p9_client_clunk(oldfid); 1311 p9_client_clunk(oldfid);
1192 return retval; 1312 return retval;
@@ -1237,6 +1357,32 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1237 return retval; 1357 return retval;
1238} 1358}
1239 1359
1360int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
1361{
1362 loff_t i_size;
1363 struct p9_wstat *st;
1364 struct v9fs_session_info *v9ses;
1365
1366 v9ses = v9fs_inode2v9ses(inode);
1367 st = p9_client_stat(fid);
1368 if (IS_ERR(st))
1369 return PTR_ERR(st);
1370
1371 spin_lock(&inode->i_lock);
1372 /*
1373 * We don't want to refresh inode->i_size,
1374 * because we may have cached data
1375 */
1376 i_size = inode->i_size;
1377 v9fs_stat2inode(st, inode, inode->i_sb);
1378 if (v9ses->cache)
1379 inode->i_size = i_size;
1380 spin_unlock(&inode->i_lock);
1381 p9stat_free(st);
1382 kfree(st);
1383 return 0;
1384}
1385
1240static const struct inode_operations v9fs_dir_inode_operations_dotu = { 1386static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1241 .create = v9fs_vfs_create, 1387 .create = v9fs_vfs_create,
1242 .lookup = v9fs_vfs_lookup, 1388 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index fe3ffa9aace4..82a7c38ddad0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -86,40 +86,63 @@ static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
86 return dentry; 86 return dentry;
87} 87}
88 88
89static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
90 struct p9_qid *qid,
91 struct p9_fid *fid,
92 struct p9_stat_dotl *st)
93{
94 int retval;
95 unsigned long i_ino;
96 struct inode *inode;
97 struct v9fs_session_info *v9ses = sb->s_fs_info;
98
99 i_ino = v9fs_qid2ino(qid);
100 inode = iget_locked(sb, i_ino);
101 if (!inode)
102 return ERR_PTR(-ENOMEM);
103 if (!(inode->i_state & I_NEW))
104 return inode;
105 /*
106 * initialize the inode with the stat info
107 * FIXME!! we may need support for stale inodes
108 * later.
109 */
110 retval = v9fs_init_inode(v9ses, inode, st->st_mode);
111 if (retval)
112 goto error;
113
114 v9fs_stat2inode_dotl(st, inode);
115#ifdef CONFIG_9P_FSCACHE
116 v9fs_fscache_set_key(inode, &st->qid);
117 v9fs_cache_inode_get_cookie(inode);
118#endif
119 retval = v9fs_get_acl(inode, fid);
120 if (retval)
121 goto error;
122
123 unlock_new_inode(inode);
124 return inode;
125error:
126 unlock_new_inode(inode);
127 iput(inode);
128 return ERR_PTR(retval);
129
130}
131
89struct inode * 132struct inode *
90v9fs_inode_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, 133v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
91 struct super_block *sb) 134 struct super_block *sb)
92{ 135{
93 struct inode *ret = NULL;
94 int err;
95 struct p9_stat_dotl *st; 136 struct p9_stat_dotl *st;
137 struct inode *inode = NULL;
96 138
97 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 139 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
98 if (IS_ERR(st)) 140 if (IS_ERR(st))
99 return ERR_CAST(st); 141 return ERR_CAST(st);
100 142
101 ret = v9fs_get_inode(sb, st->st_mode); 143 inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st);
102 if (IS_ERR(ret)) {
103 err = PTR_ERR(ret);
104 goto error;
105 }
106
107 v9fs_stat2inode_dotl(st, ret);
108 ret->i_ino = v9fs_qid2ino(&st->qid);
109#ifdef CONFIG_9P_FSCACHE
110 v9fs_vcookie_set_qid(ret, &st->qid);
111 v9fs_cache_inode_get_cookie(ret);
112#endif
113 err = v9fs_get_acl(ret, fid);
114 if (err) {
115 iput(ret);
116 goto error;
117 }
118 kfree(st); 144 kfree(st);
119 return ret; 145 return inode;
120error:
121 kfree(st);
122 return ERR_PTR(err);
123} 146}
124 147
125/** 148/**
@@ -136,16 +159,17 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
136 struct nameidata *nd) 159 struct nameidata *nd)
137{ 160{
138 int err = 0; 161 int err = 0;
139 char *name = NULL;
140 gid_t gid; 162 gid_t gid;
141 int flags; 163 int flags;
142 mode_t mode; 164 mode_t mode;
143 struct v9fs_session_info *v9ses; 165 char *name = NULL;
144 struct p9_fid *fid = NULL;
145 struct p9_fid *dfid, *ofid;
146 struct file *filp; 166 struct file *filp;
147 struct p9_qid qid; 167 struct p9_qid qid;
148 struct inode *inode; 168 struct inode *inode;
169 struct p9_fid *fid = NULL;
170 struct v9fs_inode *v9inode;
171 struct p9_fid *dfid, *ofid, *inode_fid;
172 struct v9fs_session_info *v9ses;
149 struct posix_acl *pacl = NULL, *dacl = NULL; 173 struct posix_acl *pacl = NULL, *dacl = NULL;
150 174
151 v9ses = v9fs_inode2v9ses(dir); 175 v9ses = v9fs_inode2v9ses(dir);
@@ -196,6 +220,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
196 err); 220 err);
197 goto error; 221 goto error;
198 } 222 }
223 v9fs_invalidate_inode_attr(dir);
199 224
200 /* instantiate inode and assign the unopened fid to the dentry */ 225 /* instantiate inode and assign the unopened fid to the dentry */
201 fid = p9_client_walk(dfid, 1, &name, 1); 226 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -205,7 +230,7 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
205 fid = NULL; 230 fid = NULL;
206 goto error; 231 goto error;
207 } 232 }
208 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 233 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
209 if (IS_ERR(inode)) { 234 if (IS_ERR(inode)) {
210 err = PTR_ERR(inode); 235 err = PTR_ERR(inode);
211 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err); 236 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -219,6 +244,26 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
219 /* Now set the ACL based on the default value */ 244 /* Now set the ACL based on the default value */
220 v9fs_set_create_acl(dentry, dacl, pacl); 245 v9fs_set_create_acl(dentry, dacl, pacl);
221 246
247 v9inode = V9FS_I(inode);
248 mutex_lock(&v9inode->v_mutex);
249 if (v9ses->cache && !v9inode->writeback_fid &&
250 ((flags & O_ACCMODE) != O_RDONLY)) {
251 /*
252 * clone a fid and add it to writeback_fid
253 * we do it during open time instead of
254 * page dirty time via write_begin/page_mkwrite
255 * because we want write after unlink usecase
256 * to work.
257 */
258 inode_fid = v9fs_writeback_fid(dentry);
259 if (IS_ERR(inode_fid)) {
260 err = PTR_ERR(inode_fid);
261 mutex_unlock(&v9inode->v_mutex);
262 goto error;
263 }
264 v9inode->writeback_fid = (void *) inode_fid;
265 }
266 mutex_unlock(&v9inode->v_mutex);
222 /* Since we are opening a file, assign the open fid to the file */ 267 /* Since we are opening a file, assign the open fid to the file */
223 filp = lookup_instantiate_filp(nd, dentry, generic_file_open); 268 filp = lookup_instantiate_filp(nd, dentry, generic_file_open);
224 if (IS_ERR(filp)) { 269 if (IS_ERR(filp)) {
@@ -226,6 +271,10 @@ v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, int omode,
226 return PTR_ERR(filp); 271 return PTR_ERR(filp);
227 } 272 }
228 filp->private_data = ofid; 273 filp->private_data = ofid;
274#ifdef CONFIG_9P_FSCACHE
275 if (v9ses->cache)
276 v9fs_cache_inode_set_cookie(inode, filp);
277#endif
229 return 0; 278 return 0;
230 279
231error: 280error:
@@ -300,7 +349,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
300 goto error; 349 goto error;
301 } 350 }
302 351
303 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 352 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
304 if (IS_ERR(inode)) { 353 if (IS_ERR(inode)) {
305 err = PTR_ERR(inode); 354 err = PTR_ERR(inode);
306 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 355 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -327,7 +376,8 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
327 } 376 }
328 /* Now set the ACL based on the default value */ 377 /* Now set the ACL based on the default value */
329 v9fs_set_create_acl(dentry, dacl, pacl); 378 v9fs_set_create_acl(dentry, dacl, pacl);
330 379 inc_nlink(dir);
380 v9fs_invalidate_inode_attr(dir);
331error: 381error:
332 if (fid) 382 if (fid)
333 p9_client_clunk(fid); 383 p9_client_clunk(fid);
@@ -345,10 +395,11 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry,
345 395
346 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry); 396 P9_DPRINTK(P9_DEBUG_VFS, "dentry: %p\n", dentry);
347 err = -EPERM; 397 err = -EPERM;
348 v9ses = v9fs_inode2v9ses(dentry->d_inode); 398 v9ses = v9fs_dentry2v9ses(dentry);
349 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) 399 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
350 return simple_getattr(mnt, dentry, stat); 400 generic_fillattr(dentry->d_inode, stat);
351 401 return 0;
402 }
352 fid = v9fs_fid_lookup(dentry); 403 fid = v9fs_fid_lookup(dentry);
353 if (IS_ERR(fid)) 404 if (IS_ERR(fid))
354 return PTR_ERR(fid); 405 return PTR_ERR(fid);
@@ -401,22 +452,24 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
401 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; 452 p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec;
402 453
403 retval = -EPERM; 454 retval = -EPERM;
404 v9ses = v9fs_inode2v9ses(dentry->d_inode); 455 v9ses = v9fs_dentry2v9ses(dentry);
405 fid = v9fs_fid_lookup(dentry); 456 fid = v9fs_fid_lookup(dentry);
406 if (IS_ERR(fid)) 457 if (IS_ERR(fid))
407 return PTR_ERR(fid); 458 return PTR_ERR(fid);
408 459
460 /* Write all dirty data */
461 if (S_ISREG(dentry->d_inode->i_mode))
462 filemap_write_and_wait(dentry->d_inode->i_mapping);
463
409 retval = p9_client_setattr(fid, &p9attr); 464 retval = p9_client_setattr(fid, &p9attr);
410 if (retval < 0) 465 if (retval < 0)
411 return retval; 466 return retval;
412 467
413 if ((iattr->ia_valid & ATTR_SIZE) && 468 if ((iattr->ia_valid & ATTR_SIZE) &&
414 iattr->ia_size != i_size_read(dentry->d_inode)) { 469 iattr->ia_size != i_size_read(dentry->d_inode))
415 retval = vmtruncate(dentry->d_inode, iattr->ia_size); 470 truncate_setsize(dentry->d_inode, iattr->ia_size);
416 if (retval)
417 return retval;
418 }
419 471
472 v9fs_invalidate_inode_attr(dentry->d_inode);
420 setattr_copy(dentry->d_inode, iattr); 473 setattr_copy(dentry->d_inode, iattr);
421 mark_inode_dirty(dentry->d_inode); 474 mark_inode_dirty(dentry->d_inode);
422 if (iattr->ia_valid & ATTR_MODE) { 475 if (iattr->ia_valid & ATTR_MODE) {
@@ -439,6 +492,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr)
439void 492void
440v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode) 493v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
441{ 494{
495 struct v9fs_inode *v9inode = V9FS_I(inode);
442 496
443 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { 497 if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) {
444 inode->i_atime.tv_sec = stat->st_atime_sec; 498 inode->i_atime.tv_sec = stat->st_atime_sec;
@@ -497,20 +551,21 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode)
497 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION 551 /* Currently we don't support P9_STATS_BTIME and P9_STATS_DATA_VERSION
498 * because the inode structure does not have fields for them. 552 * because the inode structure does not have fields for them.
499 */ 553 */
554 v9inode->cache_validity &= ~V9FS_INO_INVALID_ATTR;
500} 555}
501 556
502static int 557static int
503v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, 558v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
504 const char *symname) 559 const char *symname)
505{ 560{
506 struct v9fs_session_info *v9ses;
507 struct p9_fid *dfid;
508 struct p9_fid *fid = NULL;
509 struct inode *inode;
510 struct p9_qid qid;
511 char *name;
512 int err; 561 int err;
513 gid_t gid; 562 gid_t gid;
563 char *name;
564 struct p9_qid qid;
565 struct inode *inode;
566 struct p9_fid *dfid;
567 struct p9_fid *fid = NULL;
568 struct v9fs_session_info *v9ses;
514 569
515 name = (char *) dentry->d_name.name; 570 name = (char *) dentry->d_name.name;
516 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n", 571 P9_DPRINTK(P9_DEBUG_VFS, "v9fs_vfs_symlink_dotl : %lu,%s,%s\n",
@@ -534,6 +589,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
534 goto error; 589 goto error;
535 } 590 }
536 591
592 v9fs_invalidate_inode_attr(dir);
537 if (v9ses->cache) { 593 if (v9ses->cache) {
538 /* Now walk from the parent so we can get an unopened fid. */ 594 /* Now walk from the parent so we can get an unopened fid. */
539 fid = p9_client_walk(dfid, 1, &name, 1); 595 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -546,7 +602,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
546 } 602 }
547 603
548 /* instantiate inode and assign the unopened fid to dentry */ 604 /* instantiate inode and assign the unopened fid to dentry */
549 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 605 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
550 if (IS_ERR(inode)) { 606 if (IS_ERR(inode)) {
551 err = PTR_ERR(inode); 607 err = PTR_ERR(inode);
552 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 608 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -588,10 +644,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
588 struct dentry *dentry) 644 struct dentry *dentry)
589{ 645{
590 int err; 646 int err;
591 struct p9_fid *dfid, *oldfid;
592 char *name; 647 char *name;
593 struct v9fs_session_info *v9ses;
594 struct dentry *dir_dentry; 648 struct dentry *dir_dentry;
649 struct p9_fid *dfid, *oldfid;
650 struct v9fs_session_info *v9ses;
595 651
596 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", 652 P9_DPRINTK(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n",
597 dir->i_ino, old_dentry->d_name.name, 653 dir->i_ino, old_dentry->d_name.name,
@@ -616,29 +672,17 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
616 return err; 672 return err;
617 } 673 }
618 674
675 v9fs_invalidate_inode_attr(dir);
619 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 676 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
620 /* Get the latest stat info from server. */ 677 /* Get the latest stat info from server. */
621 struct p9_fid *fid; 678 struct p9_fid *fid;
622 struct p9_stat_dotl *st;
623
624 fid = v9fs_fid_lookup(old_dentry); 679 fid = v9fs_fid_lookup(old_dentry);
625 if (IS_ERR(fid)) 680 if (IS_ERR(fid))
626 return PTR_ERR(fid); 681 return PTR_ERR(fid);
627 682
628 st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); 683 v9fs_refresh_inode_dotl(fid, old_dentry->d_inode);
629 if (IS_ERR(st))
630 return PTR_ERR(st);
631
632 v9fs_stat2inode_dotl(st, old_dentry->d_inode);
633
634 kfree(st);
635 } else {
636 /* Caching disabled. No need to get upto date stat info.
637 * This dentry will be released immediately. So, just hold the
638 * inode
639 */
640 ihold(old_dentry->d_inode);
641 } 684 }
685 ihold(old_dentry->d_inode);
642 d_instantiate(dentry, old_dentry->d_inode); 686 d_instantiate(dentry, old_dentry->d_inode);
643 687
644 return err; 688 return err;
@@ -657,12 +701,12 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
657 dev_t rdev) 701 dev_t rdev)
658{ 702{
659 int err; 703 int err;
704 gid_t gid;
660 char *name; 705 char *name;
661 mode_t mode; 706 mode_t mode;
662 struct v9fs_session_info *v9ses; 707 struct v9fs_session_info *v9ses;
663 struct p9_fid *fid = NULL, *dfid = NULL; 708 struct p9_fid *fid = NULL, *dfid = NULL;
664 struct inode *inode; 709 struct inode *inode;
665 gid_t gid;
666 struct p9_qid qid; 710 struct p9_qid qid;
667 struct dentry *dir_dentry; 711 struct dentry *dir_dentry;
668 struct posix_acl *dacl = NULL, *pacl = NULL; 712 struct posix_acl *dacl = NULL, *pacl = NULL;
@@ -699,6 +743,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
699 if (err < 0) 743 if (err < 0)
700 goto error; 744 goto error;
701 745
746 v9fs_invalidate_inode_attr(dir);
702 /* instantiate inode and assign the unopened fid to the dentry */ 747 /* instantiate inode and assign the unopened fid to the dentry */
703 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { 748 if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) {
704 fid = p9_client_walk(dfid, 1, &name, 1); 749 fid = p9_client_walk(dfid, 1, &name, 1);
@@ -710,7 +755,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, int omode,
710 goto error; 755 goto error;
711 } 756 }
712 757
713 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 758 inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
714 if (IS_ERR(inode)) { 759 if (IS_ERR(inode)) {
715 err = PTR_ERR(inode); 760 err = PTR_ERR(inode);
716 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n", 761 P9_DPRINTK(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -766,7 +811,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd)
766 fid = v9fs_fid_lookup(dentry); 811 fid = v9fs_fid_lookup(dentry);
767 if (IS_ERR(fid)) { 812 if (IS_ERR(fid)) {
768 __putname(link); 813 __putname(link);
769 link = ERR_PTR(PTR_ERR(fid)); 814 link = ERR_CAST(fid);
770 goto ndset; 815 goto ndset;
771 } 816 }
772 retval = p9_client_readlink(fid, &target); 817 retval = p9_client_readlink(fid, &target);
@@ -782,6 +827,31 @@ ndset:
782 return NULL; 827 return NULL;
783} 828}
784 829
830int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
831{
832 loff_t i_size;
833 struct p9_stat_dotl *st;
834 struct v9fs_session_info *v9ses;
835
836 v9ses = v9fs_inode2v9ses(inode);
837 st = p9_client_getattr_dotl(fid, P9_STATS_ALL);
838 if (IS_ERR(st))
839 return PTR_ERR(st);
840
841 spin_lock(&inode->i_lock);
842 /*
843 * We don't want to refresh inode->i_size,
844 * because we may have cached data
845 */
846 i_size = inode->i_size;
847 v9fs_stat2inode_dotl(st, inode);
848 if (v9ses->cache)
849 inode->i_size = i_size;
850 spin_unlock(&inode->i_lock);
851 kfree(st);
852 return 0;
853}
854
785const struct inode_operations v9fs_dir_inode_operations_dotl = { 855const struct inode_operations v9fs_dir_inode_operations_dotl = {
786 .create = v9fs_vfs_create_dotl, 856 .create = v9fs_vfs_create_dotl,
787 .lookup = v9fs_vfs_lookup, 857 .lookup = v9fs_vfs_lookup,
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index dbaabe3b8131..feef6cdc1fd2 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -86,12 +86,15 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
86 } else 86 } else
87 sb->s_op = &v9fs_super_ops; 87 sb->s_op = &v9fs_super_ops;
88 sb->s_bdi = &v9ses->bdi; 88 sb->s_bdi = &v9ses->bdi;
89 if (v9ses->cache)
90 sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_CACHE_SIZE;
89 91
90 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 92 sb->s_flags = flags | MS_ACTIVE | MS_DIRSYNC | MS_NOATIME;
91 MS_NOATIME; 93 if (!v9ses->cache)
94 sb->s_flags |= MS_SYNCHRONOUS;
92 95
93#ifdef CONFIG_9P_FS_POSIX_ACL 96#ifdef CONFIG_9P_FS_POSIX_ACL
94 if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT) 97 if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL)
95 sb->s_flags |= MS_POSIXACL; 98 sb->s_flags |= MS_POSIXACL;
96#endif 99#endif
97 100
@@ -166,7 +169,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
166 retval = PTR_ERR(st); 169 retval = PTR_ERR(st);
167 goto release_sb; 170 goto release_sb;
168 } 171 }
169 172 root->d_inode->i_ino = v9fs_qid2ino(&st->qid);
170 v9fs_stat2inode_dotl(st, root->d_inode); 173 v9fs_stat2inode_dotl(st, root->d_inode);
171 kfree(st); 174 kfree(st);
172 } else { 175 } else {
@@ -253,7 +256,7 @@ static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
253 goto done; 256 goto done;
254 } 257 }
255 258
256 v9ses = v9fs_inode2v9ses(dentry->d_inode); 259 v9ses = v9fs_dentry2v9ses(dentry);
257 if (v9fs_proto_dotl(v9ses)) { 260 if (v9fs_proto_dotl(v9ses)) {
258 res = p9_client_statfs(fid, &rs); 261 res = p9_client_statfs(fid, &rs);
259 if (res == 0) { 262 if (res == 0) {
@@ -276,26 +279,84 @@ done:
276 return res; 279 return res;
277} 280}
278 281
282static int v9fs_drop_inode(struct inode *inode)
283{
284 struct v9fs_session_info *v9ses;
285 v9ses = v9fs_inode2v9ses(inode);
286 if (v9ses->cache)
287 return generic_drop_inode(inode);
288 /*
289 * in case of non cached mode always drop the
290 * the inode because we want the inode attribute
291 * to always match that on the server.
292 */
293 return 1;
294}
295
296static int v9fs_write_inode(struct inode *inode,
297 struct writeback_control *wbc)
298{
299 int ret;
300 struct p9_wstat wstat;
301 struct v9fs_inode *v9inode;
302 /*
303 * send an fsync request to server irrespective of
304 * wbc->sync_mode.
305 */
306 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
307 v9inode = V9FS_I(inode);
308 if (!v9inode->writeback_fid)
309 return 0;
310 v9fs_blank_wstat(&wstat);
311
312 ret = p9_client_wstat(v9inode->writeback_fid, &wstat);
313 if (ret < 0) {
314 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
315 return ret;
316 }
317 return 0;
318}
319
320static int v9fs_write_inode_dotl(struct inode *inode,
321 struct writeback_control *wbc)
322{
323 int ret;
324 struct v9fs_inode *v9inode;
325 /*
326 * send an fsync request to server irrespective of
327 * wbc->sync_mode.
328 */
329 P9_DPRINTK(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
330 v9inode = V9FS_I(inode);
331 if (!v9inode->writeback_fid)
332 return 0;
333 ret = p9_client_fsync(v9inode->writeback_fid, 0);
334 if (ret < 0) {
335 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
336 return ret;
337 }
338 return 0;
339}
340
279static const struct super_operations v9fs_super_ops = { 341static const struct super_operations v9fs_super_ops = {
280#ifdef CONFIG_9P_FSCACHE
281 .alloc_inode = v9fs_alloc_inode, 342 .alloc_inode = v9fs_alloc_inode,
282 .destroy_inode = v9fs_destroy_inode, 343 .destroy_inode = v9fs_destroy_inode,
283#endif
284 .statfs = simple_statfs, 344 .statfs = simple_statfs,
285 .evict_inode = v9fs_evict_inode, 345 .evict_inode = v9fs_evict_inode,
286 .show_options = generic_show_options, 346 .show_options = generic_show_options,
287 .umount_begin = v9fs_umount_begin, 347 .umount_begin = v9fs_umount_begin,
348 .write_inode = v9fs_write_inode,
288}; 349};
289 350
290static const struct super_operations v9fs_super_ops_dotl = { 351static const struct super_operations v9fs_super_ops_dotl = {
291#ifdef CONFIG_9P_FSCACHE
292 .alloc_inode = v9fs_alloc_inode, 352 .alloc_inode = v9fs_alloc_inode,
293 .destroy_inode = v9fs_destroy_inode, 353 .destroy_inode = v9fs_destroy_inode,
294#endif
295 .statfs = v9fs_statfs, 354 .statfs = v9fs_statfs,
355 .drop_inode = v9fs_drop_inode,
296 .evict_inode = v9fs_evict_inode, 356 .evict_inode = v9fs_evict_inode,
297 .show_options = generic_show_options, 357 .show_options = generic_show_options,
298 .umount_begin = v9fs_umount_begin, 358 .umount_begin = v9fs_umount_begin,
359 .write_inode = v9fs_write_inode_dotl,
299}; 360};
300 361
301struct file_system_type v9fs_fs_type = { 362struct file_system_type v9fs_fs_type = {
@@ -303,5 +364,5 @@ struct file_system_type v9fs_fs_type = {
303 .mount = v9fs_mount, 364 .mount = v9fs_mount,
304 .kill_sb = v9fs_kill_super, 365 .kill_sb = v9fs_kill_super,
305 .owner = THIS_MODULE, 366 .owner = THIS_MODULE,
306 .fs_flags = FS_RENAME_DOES_D_MOVE, 367 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT,
307}; 368};
diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57edc..f3aa9b08b228 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
47 def_bool n 47 def_bool n
48 48
49config EXPORTFS 49config EXPORTFS
50 tristate 50 bool
51 51
52config FILE_LOCKING 52config FILE_LOCKING
53 bool "Enable POSIX file locking API" if EXPERT 53 bool "Enable POSIX file locking API" if EXPERT
@@ -187,6 +187,7 @@ source "fs/omfs/Kconfig"
187source "fs/hpfs/Kconfig" 187source "fs/hpfs/Kconfig"
188source "fs/qnx4/Kconfig" 188source "fs/qnx4/Kconfig"
189source "fs/romfs/Kconfig" 189source "fs/romfs/Kconfig"
190source "fs/pstore/Kconfig"
190source "fs/sysv/Kconfig" 191source "fs/sysv/Kconfig"
191source "fs/ufs/Kconfig" 192source "fs/ufs/Kconfig"
192source "fs/exofs/Kconfig" 193source "fs/exofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c8..fb68c2b8cf8a 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o
48obj-$(CONFIG_NFS_COMMON) += nfs_common/ 48obj-$(CONFIG_NFS_COMMON) += nfs_common/
49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o 49obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
50 50
51obj-$(CONFIG_FHANDLE) += fhandle.o
52
51obj-y += quota/ 53obj-y += quota/
52 54
53obj-$(CONFIG_PROC_FS) += proc/ 55obj-$(CONFIG_PROC_FS) += proc/
@@ -121,3 +123,4 @@ obj-$(CONFIG_BTRFS_FS) += btrfs/
121obj-$(CONFIG_GFS2_FS) += gfs2/ 123obj-$(CONFIG_GFS2_FS) += gfs2/
122obj-$(CONFIG_EXOFS_FS) += exofs/ 124obj-$(CONFIG_EXOFS_FS) += exofs/
123obj-$(CONFIG_CEPH_FS) += ceph/ 125obj-$(CONFIG_CEPH_FS) += ceph/
126obj-$(CONFIG_PSTORE) += pstore/
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index 1dd5f34b3cf2..e55182a74605 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,7 +1,6 @@
1config ADFS_FS 1config ADFS_FS
2 tristate "ADFS file system support (EXPERIMENTAL)" 2 tristate "ADFS file system support (EXPERIMENTAL)"
3 depends on BLOCK && EXPERIMENTAL 3 depends on BLOCK && EXPERIMENTAL
4 depends on BKL # need to fix
5 help 4 help
6 The Acorn Disc Filing System is the standard file system of the 5 The Acorn Disc Filing System is the standard file system of the
7 RiscOS operating system which runs on Acorn's ARM-based Risc PC 6 RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 2ff622f6f547..718ac1f440c6 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -50,6 +50,7 @@ struct adfs_sb_info {
50 gid_t s_gid; /* owner gid */ 50 gid_t s_gid; /* owner gid */
51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ 51 umode_t s_owner_mask; /* ADFS owner perm -> unix perm */
52 umode_t s_other_mask; /* ADFS other perm -> unix perm */ 52 umode_t s_other_mask; /* ADFS other perm -> unix perm */
53 int s_ftsuffix; /* ,xyz hex filetype suffix option */
53 54
54 __u32 s_ids_per_zone; /* max. no ids in one zone */ 55 __u32 s_ids_per_zone; /* max. no ids in one zone */
55 __u32 s_idlen; /* length of ID in map */ 56 __u32 s_idlen; /* length of ID in map */
@@ -79,6 +80,10 @@ struct adfs_dir {
79 80
80 int nr_buffers; 81 int nr_buffers;
81 struct buffer_head *bh[4]; 82 struct buffer_head *bh[4];
83
84 /* big directories need allocated buffers */
85 struct buffer_head **bh_fplus;
86
82 unsigned int pos; 87 unsigned int pos;
83 unsigned int parent_id; 88 unsigned int parent_id;
84 89
@@ -89,7 +94,7 @@ struct adfs_dir {
89/* 94/*
90 * This is the overall maximum name length 95 * This is the overall maximum name length
91 */ 96 */
92#define ADFS_MAX_NAME_LEN 256 97#define ADFS_MAX_NAME_LEN (256 + 4) /* +4 for ,xyz hex filetype suffix */
93struct object_info { 98struct object_info {
94 __u32 parent_id; /* parent object id */ 99 __u32 parent_id; /* parent object id */
95 __u32 file_id; /* object id */ 100 __u32 file_id; /* object id */
@@ -97,10 +102,26 @@ struct object_info {
97 __u32 execaddr; /* execution address */ 102 __u32 execaddr; /* execution address */
98 __u32 size; /* size */ 103 __u32 size; /* size */
99 __u8 attr; /* RISC OS attributes */ 104 __u8 attr; /* RISC OS attributes */
100 unsigned char name_len; /* name length */ 105 unsigned int name_len; /* name length */
101 char name[ADFS_MAX_NAME_LEN];/* file name */ 106 char name[ADFS_MAX_NAME_LEN];/* file name */
107
108 /* RISC OS file type (12-bit: derived from loadaddr) */
109 __u16 filetype;
102}; 110};
103 111
112/* RISC OS 12-bit filetype converts to ,xyz hex filename suffix */
113static inline int append_filetype_suffix(char *buf, __u16 filetype)
114{
115 if (filetype == 0xffff) /* no explicit 12-bit file type was set */
116 return 0;
117
118 *buf++ = ',';
119 *buf++ = hex_asc_lo(filetype >> 8);
120 *buf++ = hex_asc_lo(filetype >> 4);
121 *buf++ = hex_asc_lo(filetype >> 0);
122 return 4;
123}
124
104struct adfs_dir_ops { 125struct adfs_dir_ops {
105 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir); 126 int (*read)(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir);
106 int (*setpos)(struct adfs_dir *dir, unsigned int fpos); 127 int (*setpos)(struct adfs_dir *dir, unsigned int fpos);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 3b4a764ed780..3d83075aaa2e 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -9,7 +9,6 @@
9 * 9 *
10 * Common directory handling for ADFS 10 * Common directory handling for ADFS
11 */ 11 */
12#include <linux/smp_lock.h>
13#include "adfs.h" 12#include "adfs.h"
14 13
15/* 14/*
@@ -27,8 +26,6 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
27 struct adfs_dir dir; 26 struct adfs_dir dir;
28 int ret = 0; 27 int ret = 0;
29 28
30 lock_kernel();
31
32 if (filp->f_pos >> 32) 29 if (filp->f_pos >> 32)
33 goto out; 30 goto out;
34 31
@@ -70,7 +67,6 @@ free_out:
70 ops->free(&dir); 67 ops->free(&dir);
71 68
72out: 69out:
73 unlock_kernel();
74 return ret; 70 return ret;
75} 71}
76 72
@@ -276,7 +272,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
276 struct object_info obj; 272 struct object_info obj;
277 int error; 273 int error;
278 274
279 lock_kernel();
280 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj); 275 error = adfs_dir_lookup_byname(dir, &dentry->d_name, &obj);
281 if (error == 0) { 276 if (error == 0) {
282 error = -EACCES; 277 error = -EACCES;
@@ -288,7 +283,6 @@ adfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
288 if (inode) 283 if (inode)
289 error = 0; 284 error = 0;
290 } 285 }
291 unlock_kernel();
292 d_add(dentry, inode); 286 d_add(dentry, inode);
293 return ERR_PTR(error); 287 return ERR_PTR(error);
294} 288}
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index bafc71222e25..4bbe853ee50a 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -52,7 +52,6 @@ static inline int adfs_readname(char *buf, char *ptr, int maxlen)
52 *buf++ = *ptr; 52 *buf++ = *ptr;
53 ptr++; 53 ptr++;
54 } 54 }
55 *buf = '\0';
56 55
57 return buf - old_buf; 56 return buf - old_buf;
58} 57}
@@ -208,7 +207,8 @@ release_buffers:
208 * convert a disk-based directory entry to a Linux ADFS directory entry 207 * convert a disk-based directory entry to a Linux ADFS directory entry
209 */ 208 */
210static inline void 209static inline void
211adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de) 210adfs_dir2obj(struct adfs_dir *dir, struct object_info *obj,
211 struct adfs_direntry *de)
212{ 212{
213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN); 213 obj->name_len = adfs_readname(obj->name, de->dirobname, ADFS_F_NAME_LEN);
214 obj->file_id = adfs_readval(de->dirinddiscadd, 3); 214 obj->file_id = adfs_readval(de->dirinddiscadd, 3);
@@ -216,6 +216,23 @@ adfs_dir2obj(struct object_info *obj, struct adfs_direntry *de)
216 obj->execaddr = adfs_readval(de->direxec, 4); 216 obj->execaddr = adfs_readval(de->direxec, 4);
217 obj->size = adfs_readval(de->dirlen, 4); 217 obj->size = adfs_readval(de->dirlen, 4);
218 obj->attr = de->newdiratts; 218 obj->attr = de->newdiratts;
219 obj->filetype = -1;
220
221 /*
222 * object is a file and is filetyped and timestamped?
223 * RISC OS 12-bit filetype is stored in load_address[19:8]
224 */
225 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
226 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
227 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
228
229 /* optionally append the ,xyz hex filetype suffix */
230 if (ADFS_SB(dir->sb)->s_ftsuffix)
231 obj->name_len +=
232 append_filetype_suffix(
233 &obj->name[obj->name_len],
234 obj->filetype);
235 }
219} 236}
220 237
221/* 238/*
@@ -260,7 +277,7 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj)
260 if (!de.dirobname[0]) 277 if (!de.dirobname[0])
261 return -ENOENT; 278 return -ENOENT;
262 279
263 adfs_dir2obj(obj, &de); 280 adfs_dir2obj(dir, obj, &de);
264 281
265 return 0; 282 return 0;
266} 283}
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1796bb352d05..d9e3bee4e653 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
11#include <linux/slab.h>
11#include "adfs.h" 12#include "adfs.h"
12#include "dir_fplus.h" 13#include "dir_fplus.h"
13 14
@@ -22,30 +23,53 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
22 23
23 dir->nr_buffers = 0; 24 dir->nr_buffers = 0;
24 25
26 /* start off using fixed bh set - only alloc for big dirs */
27 dir->bh_fplus = &dir->bh[0];
28
25 block = __adfs_block_map(sb, id, 0); 29 block = __adfs_block_map(sb, id, 0);
26 if (!block) { 30 if (!block) {
27 adfs_error(sb, "dir object %X has a hole at offset 0", id); 31 adfs_error(sb, "dir object %X has a hole at offset 0", id);
28 goto out; 32 goto out;
29 } 33 }
30 34
31 dir->bh[0] = sb_bread(sb, block); 35 dir->bh_fplus[0] = sb_bread(sb, block);
32 if (!dir->bh[0]) 36 if (!dir->bh_fplus[0])
33 goto out; 37 goto out;
34 dir->nr_buffers += 1; 38 dir->nr_buffers += 1;
35 39
36 h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 40 h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data;
37 size = le32_to_cpu(h->bigdirsize); 41 size = le32_to_cpu(h->bigdirsize);
38 if (size != sz) { 42 if (size != sz) {
39 printk(KERN_WARNING "adfs: adfs_fplus_read: directory header size\n" 43 printk(KERN_WARNING "adfs: adfs_fplus_read:"
40 " does not match directory size\n"); 44 " directory header size %X\n"
45 " does not match directory size %X\n",
46 size, sz);
41 } 47 }
42 48
43 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || 49 if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 ||
44 h->bigdirversion[2] != 0 || size & 2047 || 50 h->bigdirversion[2] != 0 || size & 2047 ||
45 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) 51 h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) {
52 printk(KERN_WARNING "adfs: dir object %X has"
53 " malformed dir header\n", id);
46 goto out; 54 goto out;
55 }
47 56
48 size >>= sb->s_blocksize_bits; 57 size >>= sb->s_blocksize_bits;
58 if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) {
59 /* this directory is too big for fixed bh set, must allocate */
60 struct buffer_head **bh_fplus =
61 kzalloc(size * sizeof(struct buffer_head *),
62 GFP_KERNEL);
63 if (!bh_fplus) {
64 adfs_error(sb, "not enough memory for"
65 " dir object %X (%d blocks)", id, size);
66 goto out;
67 }
68 dir->bh_fplus = bh_fplus;
69 /* copy over the pointer to the block that we've already read */
70 dir->bh_fplus[0] = dir->bh[0];
71 }
72
49 for (blk = 1; blk < size; blk++) { 73 for (blk = 1; blk < size; blk++) {
50 block = __adfs_block_map(sb, id, blk); 74 block = __adfs_block_map(sb, id, blk);
51 if (!block) { 75 if (!block) {
@@ -53,25 +77,44 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct
53 goto out; 77 goto out;
54 } 78 }
55 79
56 dir->bh[blk] = sb_bread(sb, block); 80 dir->bh_fplus[blk] = sb_bread(sb, block);
57 if (!dir->bh[blk]) 81 if (!dir->bh_fplus[blk]) {
82 adfs_error(sb, "dir object %X failed read for"
83 " offset %d, mapped block %X",
84 id, blk, block);
58 goto out; 85 goto out;
59 dir->nr_buffers = blk; 86 }
87
88 dir->nr_buffers += 1;
60 } 89 }
61 90
62 t = (struct adfs_bigdirtail *)(dir->bh[size - 1]->b_data + (sb->s_blocksize - 8)); 91 t = (struct adfs_bigdirtail *)
92 (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8));
63 93
64 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || 94 if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) ||
65 t->bigdirendmasseq != h->startmasseq || 95 t->bigdirendmasseq != h->startmasseq ||
66 t->reserved[0] != 0 || t->reserved[1] != 0) 96 t->reserved[0] != 0 || t->reserved[1] != 0) {
97 printk(KERN_WARNING "adfs: dir object %X has "
98 "malformed dir end\n", id);
67 goto out; 99 goto out;
100 }
68 101
69 dir->parent_id = le32_to_cpu(h->bigdirparent); 102 dir->parent_id = le32_to_cpu(h->bigdirparent);
70 dir->sb = sb; 103 dir->sb = sb;
71 return 0; 104 return 0;
105
72out: 106out:
73 for (i = 0; i < dir->nr_buffers; i++) 107 if (dir->bh_fplus) {
74 brelse(dir->bh[i]); 108 for (i = 0; i < dir->nr_buffers; i++)
109 brelse(dir->bh_fplus[i]);
110
111 if (&dir->bh[0] != dir->bh_fplus)
112 kfree(dir->bh_fplus);
113
114 dir->bh_fplus = NULL;
115 }
116
117 dir->nr_buffers = 0;
75 dir->sb = NULL; 118 dir->sb = NULL;
76 return ret; 119 return ret;
77} 120}
@@ -79,7 +122,8 @@ out:
79static int 122static int
80adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) 123adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos)
81{ 124{
82 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 125 struct adfs_bigdirheader *h =
126 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
83 int ret = -ENOENT; 127 int ret = -ENOENT;
84 128
85 if (fpos <= le32_to_cpu(h->bigdirentries)) { 129 if (fpos <= le32_to_cpu(h->bigdirentries)) {
@@ -102,21 +146,27 @@ dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len)
102 partial = sb->s_blocksize - offset; 146 partial = sb->s_blocksize - offset;
103 147
104 if (partial >= len) 148 if (partial >= len)
105 memcpy(to, dir->bh[buffer]->b_data + offset, len); 149 memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len);
106 else { 150 else {
107 char *c = (char *)to; 151 char *c = (char *)to;
108 152
109 remainder = len - partial; 153 remainder = len - partial;
110 154
111 memcpy(c, dir->bh[buffer]->b_data + offset, partial); 155 memcpy(c,
112 memcpy(c + partial, dir->bh[buffer + 1]->b_data, remainder); 156 dir->bh_fplus[buffer]->b_data + offset,
157 partial);
158
159 memcpy(c + partial,
160 dir->bh_fplus[buffer + 1]->b_data,
161 remainder);
113 } 162 }
114} 163}
115 164
116static int 165static int
117adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) 166adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
118{ 167{
119 struct adfs_bigdirheader *h = (struct adfs_bigdirheader *)dir->bh[0]->b_data; 168 struct adfs_bigdirheader *h =
169 (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data;
120 struct adfs_bigdirentry bde; 170 struct adfs_bigdirentry bde;
121 unsigned int offset; 171 unsigned int offset;
122 int i, ret = -ENOENT; 172 int i, ret = -ENOENT;
@@ -147,6 +197,24 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
147 if (obj->name[i] == '/') 197 if (obj->name[i] == '/')
148 obj->name[i] = '.'; 198 obj->name[i] = '.';
149 199
200 obj->filetype = -1;
201
202 /*
203 * object is a file and is filetyped and timestamped?
204 * RISC OS 12-bit filetype is stored in load_address[19:8]
205 */
206 if ((0 == (obj->attr & ADFS_NDA_DIRECTORY)) &&
207 (0xfff00000 == (0xfff00000 & obj->loadaddr))) {
208 obj->filetype = (__u16) ((0x000fff00 & obj->loadaddr) >> 8);
209
210 /* optionally append the ,xyz hex filetype suffix */
211 if (ADFS_SB(dir->sb)->s_ftsuffix)
212 obj->name_len +=
213 append_filetype_suffix(
214 &obj->name[obj->name_len],
215 obj->filetype);
216 }
217
150 dir->pos += 1; 218 dir->pos += 1;
151 ret = 0; 219 ret = 0;
152out: 220out:
@@ -160,7 +228,7 @@ adfs_fplus_sync(struct adfs_dir *dir)
160 int i; 228 int i;
161 229
162 for (i = dir->nr_buffers - 1; i >= 0; i--) { 230 for (i = dir->nr_buffers - 1; i >= 0; i--) {
163 struct buffer_head *bh = dir->bh[i]; 231 struct buffer_head *bh = dir->bh_fplus[i];
164 sync_dirty_buffer(bh); 232 sync_dirty_buffer(bh);
165 if (buffer_req(bh) && !buffer_uptodate(bh)) 233 if (buffer_req(bh) && !buffer_uptodate(bh))
166 err = -EIO; 234 err = -EIO;
@@ -174,8 +242,17 @@ adfs_fplus_free(struct adfs_dir *dir)
174{ 242{
175 int i; 243 int i;
176 244
177 for (i = 0; i < dir->nr_buffers; i++) 245 if (dir->bh_fplus) {
178 brelse(dir->bh[i]); 246 for (i = 0; i < dir->nr_buffers; i++)
247 brelse(dir->bh_fplus[i]);
248
249 if (&dir->bh[0] != dir->bh_fplus)
250 kfree(dir->bh_fplus);
251
252 dir->bh_fplus = NULL;
253 }
254
255 dir->nr_buffers = 0;
179 dir->sb = NULL; 256 dir->sb = NULL;
180} 257}
181 258
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 65794b8fe79e..d5250c5aae21 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -7,7 +7,6 @@
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10#include <linux/smp_lock.h>
11#include <linux/buffer_head.h> 10#include <linux/buffer_head.h>
12#include <linux/writeback.h> 11#include <linux/writeback.h>
13#include "adfs.h" 12#include "adfs.h"
@@ -73,32 +72,18 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block)
73static const struct address_space_operations adfs_aops = { 72static const struct address_space_operations adfs_aops = {
74 .readpage = adfs_readpage, 73 .readpage = adfs_readpage,
75 .writepage = adfs_writepage, 74 .writepage = adfs_writepage,
76 .sync_page = block_sync_page,
77 .write_begin = adfs_write_begin, 75 .write_begin = adfs_write_begin,
78 .write_end = generic_write_end, 76 .write_end = generic_write_end,
79 .bmap = _adfs_bmap 77 .bmap = _adfs_bmap
80}; 78};
81 79
82static inline unsigned int
83adfs_filetype(struct inode *inode)
84{
85 unsigned int type;
86
87 if (ADFS_I(inode)->stamped)
88 type = (ADFS_I(inode)->loadaddr >> 8) & 0xfff;
89 else
90 type = (unsigned int) -1;
91
92 return type;
93}
94
95/* 80/*
96 * Convert ADFS attributes and filetype to Linux permission. 81 * Convert ADFS attributes and filetype to Linux permission.
97 */ 82 */
98static umode_t 83static umode_t
99adfs_atts2mode(struct super_block *sb, struct inode *inode) 84adfs_atts2mode(struct super_block *sb, struct inode *inode)
100{ 85{
101 unsigned int filetype, attr = ADFS_I(inode)->attr; 86 unsigned int attr = ADFS_I(inode)->attr;
102 umode_t mode, rmask; 87 umode_t mode, rmask;
103 struct adfs_sb_info *asb = ADFS_SB(sb); 88 struct adfs_sb_info *asb = ADFS_SB(sb);
104 89
@@ -107,9 +92,7 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode)
107 return S_IFDIR | S_IXUGO | mode; 92 return S_IFDIR | S_IXUGO | mode;
108 } 93 }
109 94
110 filetype = adfs_filetype(inode); 95 switch (ADFS_I(inode)->filetype) {
111
112 switch (filetype) {
113 case 0xfc0: /* LinkFS */ 96 case 0xfc0: /* LinkFS */
114 return S_IFLNK|S_IRWXUGO; 97 return S_IFLNK|S_IRWXUGO;
115 98
@@ -175,50 +158,48 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode)
175 158
176/* 159/*
177 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time 160 * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time
178 * referenced to 1 Jan 1900 (til 2248) 161 * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds
162 * of time to convert from RISC OS epoch to Unix epoch.
179 */ 163 */
180static void 164static void
181adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) 165adfs_adfs2unix_time(struct timespec *tv, struct inode *inode)
182{ 166{
183 unsigned int high, low; 167 unsigned int high, low;
168 /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since
169 * 01 Jan 1900 00:00:00 (RISC OS epoch)
170 */
171 static const s64 nsec_unix_epoch_diff_risc_os_epoch =
172 2208988800000000000LL;
173 s64 nsec;
184 174
185 if (ADFS_I(inode)->stamped == 0) 175 if (ADFS_I(inode)->stamped == 0)
186 goto cur_time; 176 goto cur_time;
187 177
188 high = ADFS_I(inode)->loadaddr << 24; 178 high = ADFS_I(inode)->loadaddr & 0xFF; /* top 8 bits of timestamp */
189 low = ADFS_I(inode)->execaddr; 179 low = ADFS_I(inode)->execaddr; /* bottom 32 bits of timestamp */
190 180
191 high |= low >> 8; 181 /* convert 40-bit centi-seconds to 32-bit seconds
192 low &= 255; 182 * going via nanoseconds to retain precision
183 */
184 nsec = (((s64) high << 32) | (s64) low) * 10000000; /* cs to ns */
193 185
194 /* Files dated pre 01 Jan 1970 00:00:00. */ 186 /* Files dated pre 01 Jan 1970 00:00:00. */
195 if (high < 0x336e996a) 187 if (nsec < nsec_unix_epoch_diff_risc_os_epoch)
196 goto too_early; 188 goto too_early;
197 189
198 /* Files dated post 18 Jan 2038 03:14:05. */ 190 /* convert from RISC OS to Unix epoch */
199 if (high >= 0x656e9969) 191 nsec -= nsec_unix_epoch_diff_risc_os_epoch;
200 goto too_late;
201
202 /* discard 2208988800 (0x336e996a00) seconds of time */
203 high -= 0x336e996a;
204 192
205 /* convert 40-bit centi-seconds to 32-bit seconds */ 193 *tv = ns_to_timespec(nsec);
206 tv->tv_sec = (((high % 100) << 8) + low) / 100 + (high / 100 << 8);
207 tv->tv_nsec = 0;
208 return; 194 return;
209 195
210 cur_time: 196 cur_time:
211 *tv = CURRENT_TIME_SEC; 197 *tv = CURRENT_TIME;
212 return; 198 return;
213 199
214 too_early: 200 too_early:
215 tv->tv_sec = tv->tv_nsec = 0; 201 tv->tv_sec = tv->tv_nsec = 0;
216 return; 202 return;
217
218 too_late:
219 tv->tv_sec = 0x7ffffffd;
220 tv->tv_nsec = 0;
221 return;
222} 203}
223 204
224/* 205/*
@@ -280,7 +261,8 @@ adfs_iget(struct super_block *sb, struct object_info *obj)
280 ADFS_I(inode)->loadaddr = obj->loadaddr; 261 ADFS_I(inode)->loadaddr = obj->loadaddr;
281 ADFS_I(inode)->execaddr = obj->execaddr; 262 ADFS_I(inode)->execaddr = obj->execaddr;
282 ADFS_I(inode)->attr = obj->attr; 263 ADFS_I(inode)->attr = obj->attr;
283 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); 264 ADFS_I(inode)->filetype = obj->filetype;
265 ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000);
284 266
285 inode->i_mode = adfs_atts2mode(sb, inode); 267 inode->i_mode = adfs_atts2mode(sb, inode);
286 adfs_adfs2unix_time(&inode->i_mtime, inode); 268 adfs_adfs2unix_time(&inode->i_mtime, inode);
@@ -316,8 +298,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
316 unsigned int ia_valid = attr->ia_valid; 298 unsigned int ia_valid = attr->ia_valid;
317 int error; 299 int error;
318 300
319 lock_kernel();
320
321 error = inode_change_ok(inode, attr); 301 error = inode_change_ok(inode, attr);
322 302
323 /* 303 /*
@@ -359,7 +339,6 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
359 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE)) 339 if (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MODE))
360 mark_inode_dirty(inode); 340 mark_inode_dirty(inode);
361out: 341out:
362 unlock_kernel();
363 return error; 342 return error;
364} 343}
365 344
@@ -374,7 +353,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
374 struct object_info obj; 353 struct object_info obj;
375 int ret; 354 int ret;
376 355
377 lock_kernel();
378 obj.file_id = inode->i_ino; 356 obj.file_id = inode->i_ino;
379 obj.name_len = 0; 357 obj.name_len = 0;
380 obj.parent_id = ADFS_I(inode)->parent_id; 358 obj.parent_id = ADFS_I(inode)->parent_id;
@@ -384,6 +362,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc)
384 obj.size = inode->i_size; 362 obj.size = inode->i_size;
385 363
386 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); 364 ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL);
387 unlock_kernel();
388 return ret; 365 return ret;
389} 366}
diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index d1a5932bb0f1..6935f05202ac 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -51,7 +51,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
51 51
52/* 52/*
53 * This is fun. We need to load up to 19 bits from the map at an 53 * This is fun. We need to load up to 19 bits from the map at an
54 * arbitary bit alignment. (We're limited to 19 bits by F+ version 2). 54 * arbitrary bit alignment. (We're limited to 19 bits by F+ version 2).
55 */ 55 */
56#define GET_FRAG_ID(_map,_start,_idmask) \ 56#define GET_FRAG_ID(_map,_start,_idmask) \
57 ({ \ 57 ({ \
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 2d7954049fbe..c8bf36a1996a 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -14,7 +14,6 @@
14#include <linux/mount.h> 14#include <linux/mount.h>
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/smp_lock.h>
18#include <linux/statfs.h> 17#include <linux/statfs.h>
19#include "adfs.h" 18#include "adfs.h"
20#include "dir_f.h" 19#include "dir_f.h"
@@ -120,15 +119,11 @@ static void adfs_put_super(struct super_block *sb)
120 int i; 119 int i;
121 struct adfs_sb_info *asb = ADFS_SB(sb); 120 struct adfs_sb_info *asb = ADFS_SB(sb);
122 121
123 lock_kernel();
124
125 for (i = 0; i < asb->s_map_size; i++) 122 for (i = 0; i < asb->s_map_size; i++)
126 brelse(asb->s_map[i].dm_bh); 123 brelse(asb->s_map[i].dm_bh);
127 kfree(asb->s_map); 124 kfree(asb->s_map);
128 kfree(asb); 125 kfree(asb);
129 sb->s_fs_info = NULL; 126 sb->s_fs_info = NULL;
130
131 unlock_kernel();
132} 127}
133 128
134static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) 129static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
@@ -143,17 +138,20 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
143 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); 138 seq_printf(seq, ",ownmask=%o", asb->s_owner_mask);
144 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) 139 if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK)
145 seq_printf(seq, ",othmask=%o", asb->s_other_mask); 140 seq_printf(seq, ",othmask=%o", asb->s_other_mask);
141 if (asb->s_ftsuffix != 0)
142 seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix);
146 143
147 return 0; 144 return 0;
148} 145}
149 146
150enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; 147enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
151 148
152static const match_table_t tokens = { 149static const match_table_t tokens = {
153 {Opt_uid, "uid=%u"}, 150 {Opt_uid, "uid=%u"},
154 {Opt_gid, "gid=%u"}, 151 {Opt_gid, "gid=%u"},
155 {Opt_ownmask, "ownmask=%o"}, 152 {Opt_ownmask, "ownmask=%o"},
156 {Opt_othmask, "othmask=%o"}, 153 {Opt_othmask, "othmask=%o"},
154 {Opt_ftsuffix, "ftsuffix=%u"},
157 {Opt_err, NULL} 155 {Opt_err, NULL}
158}; 156};
159 157
@@ -194,6 +192,11 @@ static int parse_options(struct super_block *sb, char *options)
194 return -EINVAL; 192 return -EINVAL;
195 asb->s_other_mask = option; 193 asb->s_other_mask = option;
196 break; 194 break;
195 case Opt_ftsuffix:
196 if (match_int(args, &option))
197 return -EINVAL;
198 asb->s_ftsuffix = option;
199 break;
197 default: 200 default:
198 printk("ADFS-fs: unrecognised mount option \"%s\" " 201 printk("ADFS-fs: unrecognised mount option \"%s\" "
199 "or missing value\n", p); 202 "or missing value\n", p);
@@ -359,15 +362,11 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
359 struct adfs_sb_info *asb; 362 struct adfs_sb_info *asb;
360 struct inode *root; 363 struct inode *root;
361 364
362 lock_kernel();
363
364 sb->s_flags |= MS_NODIRATIME; 365 sb->s_flags |= MS_NODIRATIME;
365 366
366 asb = kzalloc(sizeof(*asb), GFP_KERNEL); 367 asb = kzalloc(sizeof(*asb), GFP_KERNEL);
367 if (!asb) { 368 if (!asb)
368 unlock_kernel();
369 return -ENOMEM; 369 return -ENOMEM;
370 }
371 sb->s_fs_info = asb; 370 sb->s_fs_info = asb;
372 371
373 /* set default options */ 372 /* set default options */
@@ -375,6 +374,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
375 asb->s_gid = 0; 374 asb->s_gid = 0;
376 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; 375 asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
377 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; 376 asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
377 asb->s_ftsuffix = 0;
378 378
379 if (parse_options(sb, data)) 379 if (parse_options(sb, data))
380 goto error; 380 goto error;
@@ -454,11 +454,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
454 454
455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); 455 root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root);
456 root_obj.name_len = 0; 456 root_obj.name_len = 0;
457 root_obj.loadaddr = 0; 457 /* Set root object date as 01 Jan 1987 00:00:00 */
458 root_obj.execaddr = 0; 458 root_obj.loadaddr = 0xfff0003f;
459 root_obj.execaddr = 0xec22c000;
459 root_obj.size = ADFS_NEWDIR_SIZE; 460 root_obj.size = ADFS_NEWDIR_SIZE;
460 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | 461 root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ |
461 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; 462 ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ;
463 root_obj.filetype = -1;
462 464
463 /* 465 /*
464 * If this is a F+ disk with variable length directories, 466 * If this is a F+ disk with variable length directories,
@@ -472,6 +474,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
472 asb->s_dir = &adfs_f_dir_ops; 474 asb->s_dir = &adfs_f_dir_ops;
473 asb->s_namelen = ADFS_F_NAME_LEN; 475 asb->s_namelen = ADFS_F_NAME_LEN;
474 } 476 }
477 /*
478 * ,xyz hex filetype suffix may be added by driver
479 * to files that have valid RISC OS filetype
480 */
481 if (asb->s_ftsuffix)
482 asb->s_namelen += 4;
475 483
476 sb->s_d_op = &adfs_dentry_operations; 484 sb->s_d_op = &adfs_dentry_operations;
477 root = adfs_iget(sb, &root_obj); 485 root = adfs_iget(sb, &root_obj);
@@ -485,7 +493,6 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent)
485 adfs_error(sb, "get root inode failed\n"); 493 adfs_error(sb, "get root inode failed\n");
486 goto error; 494 goto error;
487 } 495 }
488 unlock_kernel();
489 return 0; 496 return 0;
490 497
491error_free_bh: 498error_free_bh:
@@ -493,7 +500,6 @@ error_free_bh:
493error: 500error:
494 sb->s_fs_info = NULL; 501 sb->s_fs_info = NULL;
495 kfree(asb); 502 kfree(asb);
496 unlock_kernel();
497 return -EINVAL; 503 return -EINVAL;
498} 504}
499 505
diff --git a/fs/affs/Makefile b/fs/affs/Makefile
index b2c4f54446f3..3988b4a78339 100644
--- a/fs/affs/Makefile
+++ b/fs/affs/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the Linux affs filesystem routines. 2# Makefile for the Linux affs filesystem routines.
3# 3#
4 4
5#EXTRA_CFLAGS=-DDEBUG=1 5#ccflags-y := -DDEBUG=1
6 6
7obj-$(CONFIG_AFFS_FS) += affs.o 7obj-$(CONFIG_AFFS_FS) += affs.o
8 8
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0a90dcd46de2..acf321b70fcd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -429,7 +429,6 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block)
429const struct address_space_operations affs_aops = { 429const struct address_space_operations affs_aops = {
430 .readpage = affs_readpage, 430 .readpage = affs_readpage,
431 .writepage = affs_writepage, 431 .writepage = affs_writepage,
432 .sync_page = block_sync_page,
433 .write_begin = affs_write_begin, 432 .write_begin = affs_write_begin,
434 .write_end = generic_write_end, 433 .write_end = generic_write_end,
435 .bmap = _affs_bmap 434 .bmap = _affs_bmap
@@ -786,7 +785,6 @@ out:
786const struct address_space_operations affs_aops_ofs = { 785const struct address_space_operations affs_aops_ofs = {
787 .readpage = affs_readpage_ofs, 786 .readpage = affs_readpage_ofs,
788 //.writepage = affs_writepage_ofs, 787 //.writepage = affs_writepage_ofs,
789 //.sync_page = affs_sync_page_ofs,
790 .write_begin = affs_write_begin_ofs, 788 .write_begin = affs_write_begin_ofs,
791 .write_end = affs_write_end_ofs 789 .write_end = affs_write_end_ofs
792}; 790};
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
index 0fb315dd4d2a..577763c3d88b 100644
--- a/fs/afs/cache.c
+++ b/fs/afs/cache.c
@@ -98,7 +98,7 @@ static uint16_t afs_cell_cache_get_key(const void *cookie_netfs_data,
98} 98}
99 99
100/* 100/*
101 * provide new auxilliary cache data 101 * provide new auxiliary cache data
102 */ 102 */
103static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data, 103static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
104 void *buffer, uint16_t bufmax) 104 void *buffer, uint16_t bufmax)
@@ -117,7 +117,7 @@ static uint16_t afs_cell_cache_get_aux(const void *cookie_netfs_data,
117} 117}
118 118
119/* 119/*
120 * check that the auxilliary data indicates that the entry is still valid 120 * check that the auxiliary data indicates that the entry is still valid
121 */ 121 */
122static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data, 122static enum fscache_checkaux afs_cell_cache_check_aux(void *cookie_netfs_data,
123 const void *buffer, 123 const void *buffer,
@@ -150,7 +150,7 @@ static uint16_t afs_vlocation_cache_get_key(const void *cookie_netfs_data,
150} 150}
151 151
152/* 152/*
153 * provide new auxilliary cache data 153 * provide new auxiliary cache data
154 */ 154 */
155static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data, 155static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
156 void *buffer, uint16_t bufmax) 156 void *buffer, uint16_t bufmax)
@@ -172,7 +172,7 @@ static uint16_t afs_vlocation_cache_get_aux(const void *cookie_netfs_data,
172} 172}
173 173
174/* 174/*
175 * check that the auxilliary data indicates that the entry is still valid 175 * check that the auxiliary data indicates that the entry is still valid
176 */ 176 */
177static 177static
178enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data, 178enum fscache_checkaux afs_vlocation_cache_check_aux(void *cookie_netfs_data,
@@ -283,7 +283,7 @@ static void afs_vnode_cache_get_attr(const void *cookie_netfs_data,
283} 283}
284 284
285/* 285/*
286 * provide new auxilliary cache data 286 * provide new auxiliary cache data
287 */ 287 */
288static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, 288static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
289 void *buffer, uint16_t bufmax) 289 void *buffer, uint16_t bufmax)
@@ -309,7 +309,7 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data,
309} 309}
310 310
311/* 311/*
312 * check that the auxilliary data indicates that the entry is still valid 312 * check that the auxiliary data indicates that the entry is still valid
313 */ 313 */
314static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, 314static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data,
315 const void *buffer, 315 const void *buffer,
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 0d5eeadf6121..3c090b7555ea 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -293,7 +293,7 @@ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz,
293 if (!cell) { 293 if (!cell) {
294 /* this should not happen unless user tries to mount 294 /* this should not happen unless user tries to mount
295 * when root cell is not set. Return an impossibly 295 * when root cell is not set. Return an impossibly
296 * bizzare errno to alert the user. Things like 296 * bizarre errno to alert the user. Things like
297 * ENOENT might be "more appropriate" but they happen 297 * ENOENT might be "more appropriate" but they happen
298 * for other reasons. 298 * for other reasons.
299 */ 299 */
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
140 candidate->first = candidate->last = index; 140 candidate->first = candidate->last = index;
141 candidate->offset_first = from; 141 candidate->offset_first = from;
142 candidate->to_last = to; 142 candidate->to_last = to;
143 INIT_LIST_HEAD(&candidate->link);
143 candidate->usage = 1; 144 candidate->usage = 1;
144 candidate->state = AFS_WBACK_PENDING; 145 candidate->state = AFS_WBACK_PENDING;
145 init_waitqueue_head(&candidate->waitq); 146 init_waitqueue_head(&candidate->waitq);
diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..e29ec485af25 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -34,8 +34,6 @@
34#include <linux/security.h> 34#include <linux/security.h>
35#include <linux/eventfd.h> 35#include <linux/eventfd.h>
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h>
38#include <linux/hash.h>
39#include <linux/compat.h> 37#include <linux/compat.h>
40 38
41#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
@@ -65,14 +63,6 @@ static DECLARE_WORK(fput_work, aio_fput_routine);
65static DEFINE_SPINLOCK(fput_lock); 63static DEFINE_SPINLOCK(fput_lock);
66static LIST_HEAD(fput_head); 64static LIST_HEAD(fput_head);
67 65
68#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */
69#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS)
70struct aio_batch_entry {
71 struct hlist_node list;
72 struct address_space *mapping;
73};
74mempool_t *abe_pool;
75
76static void aio_kick_handler(struct work_struct *); 66static void aio_kick_handler(struct work_struct *);
77static void aio_queue_work(struct kioctx *); 67static void aio_queue_work(struct kioctx *);
78 68
@@ -85,9 +75,8 @@ static int __init aio_setup(void)
85 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); 75 kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
86 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); 76 kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
87 77
88 aio_wq = create_workqueue("aio"); 78 aio_wq = alloc_workqueue("aio", 0, 1); /* used to limit concurrency */
89 abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); 79 BUG_ON(!aio_wq);
90 BUG_ON(!aio_wq || !abe_pool);
91 80
92 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); 81 pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
93 82
@@ -239,15 +228,23 @@ static void __put_ioctx(struct kioctx *ctx)
239 call_rcu(&ctx->rcu_head, ctx_rcu_free); 228 call_rcu(&ctx->rcu_head, ctx_rcu_free);
240} 229}
241 230
242#define get_ioctx(kioctx) do { \ 231static inline void get_ioctx(struct kioctx *kioctx)
243 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 232{
244 atomic_inc(&(kioctx)->users); \ 233 BUG_ON(atomic_read(&kioctx->users) <= 0);
245} while (0) 234 atomic_inc(&kioctx->users);
246#define put_ioctx(kioctx) do { \ 235}
247 BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ 236
248 if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ 237static inline int try_get_ioctx(struct kioctx *kioctx)
249 __put_ioctx(kioctx); \ 238{
250} while (0) 239 return atomic_inc_not_zero(&kioctx->users);
240}
241
242static inline void put_ioctx(struct kioctx *kioctx)
243{
244 BUG_ON(atomic_read(&kioctx->users) <= 0);
245 if (unlikely(atomic_dec_and_test(&kioctx->users)))
246 __put_ioctx(kioctx);
247}
251 248
252/* ioctx_alloc 249/* ioctx_alloc
253 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 250 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
@@ -512,7 +509,7 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req)
512 ctx->reqs_active--; 509 ctx->reqs_active--;
513 510
514 if (unlikely(!ctx->reqs_active && ctx->dead)) 511 if (unlikely(!ctx->reqs_active && ctx->dead))
515 wake_up(&ctx->wait); 512 wake_up_all(&ctx->wait);
516} 513}
517 514
518static void aio_fput_routine(struct work_struct *data) 515static void aio_fput_routine(struct work_struct *data)
@@ -569,7 +566,7 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
569 spin_lock(&fput_lock); 566 spin_lock(&fput_lock);
570 list_add(&req->ki_list, &fput_head); 567 list_add(&req->ki_list, &fput_head);
571 spin_unlock(&fput_lock); 568 spin_unlock(&fput_lock);
572 queue_work(aio_wq, &fput_work); 569 schedule_work(&fput_work);
573 } else { 570 } else {
574 req->ki_filp = NULL; 571 req->ki_filp = NULL;
575 really_put_req(ctx, req); 572 really_put_req(ctx, req);
@@ -601,8 +598,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
601 rcu_read_lock(); 598 rcu_read_lock();
602 599
603 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { 600 hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
604 if (ctx->user_id == ctx_id && !ctx->dead) { 601 /*
605 get_ioctx(ctx); 602 * RCU protects us against accessing freed memory but
603 * we have to be careful not to get a reference when the
604 * reference count already dropped to 0 (ctx->dead test
605 * is unreliable because of races).
606 */
607 if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
606 ret = ctx; 608 ret = ctx;
607 break; 609 break;
608 } 610 }
@@ -1216,7 +1218,7 @@ static void io_destroy(struct kioctx *ioctx)
1216 * by other CPUs at this point. Right now, we rely on the 1218 * by other CPUs at this point. Right now, we rely on the
1217 * locking done by the above calls to ensure this consistency. 1219 * locking done by the above calls to ensure this consistency.
1218 */ 1220 */
1219 wake_up(&ioctx->wait); 1221 wake_up_all(&ioctx->wait);
1220 put_ioctx(ioctx); /* once for the lookup */ 1222 put_ioctx(ioctx); /* once for the lookup */
1221} 1223}
1222 1224
@@ -1512,57 +1514,8 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1512 return 0; 1514 return 0;
1513} 1515}
1514 1516
1515static void aio_batch_add(struct address_space *mapping,
1516 struct hlist_head *batch_hash)
1517{
1518 struct aio_batch_entry *abe;
1519 struct hlist_node *pos;
1520 unsigned bucket;
1521
1522 bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
1523 hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
1524 if (abe->mapping == mapping)
1525 return;
1526 }
1527
1528 abe = mempool_alloc(abe_pool, GFP_KERNEL);
1529
1530 /*
1531 * we should be using igrab here, but
1532 * we don't want to hammer on the global
1533 * inode spinlock just to take an extra
1534 * reference on a file that we must already
1535 * have a reference to.
1536 *
1537 * When we're called, we always have a reference
1538 * on the file, so we must always have a reference
1539 * on the inode, so ihold() is safe here.
1540 */
1541 ihold(mapping->host);
1542 abe->mapping = mapping;
1543 hlist_add_head(&abe->list, &batch_hash[bucket]);
1544 return;
1545}
1546
1547static void aio_batch_free(struct hlist_head *batch_hash)
1548{
1549 struct aio_batch_entry *abe;
1550 struct hlist_node *pos, *n;
1551 int i;
1552
1553 for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
1554 hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
1555 blk_run_address_space(abe->mapping);
1556 iput(abe->mapping->host);
1557 hlist_del(&abe->list);
1558 mempool_free(abe, abe_pool);
1559 }
1560 }
1561}
1562
1563static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1517static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1564 struct iocb *iocb, struct hlist_head *batch_hash, 1518 struct iocb *iocb, bool compat)
1565 bool compat)
1566{ 1519{
1567 struct kiocb *req; 1520 struct kiocb *req;
1568 struct file *file; 1521 struct file *file;
@@ -1629,6 +1582,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1629 goto out_put_req; 1582 goto out_put_req;
1630 1583
1631 spin_lock_irq(&ctx->ctx_lock); 1584 spin_lock_irq(&ctx->ctx_lock);
1585 /*
1586 * We could have raced with io_destroy() and are currently holding a
1587 * reference to ctx which should be destroyed. We cannot submit IO
1588 * since ctx gets freed as soon as io_submit() puts its reference. The
1589 * check here is reliable: io_destroy() sets ctx->dead before waiting
1590 * for outstanding IO and the barrier between these two is realized by
1591 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock. Analogously we
1592 * increment ctx->reqs_active before checking for ctx->dead and the
1593 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
1594 * don't see ctx->dead set here, io_destroy() waits for our IO to
1595 * finish.
1596 */
1597 if (ctx->dead) {
1598 spin_unlock_irq(&ctx->ctx_lock);
1599 ret = -EINVAL;
1600 goto out_put_req;
1601 }
1632 aio_run_iocb(req); 1602 aio_run_iocb(req);
1633 if (!list_empty(&ctx->run_list)) { 1603 if (!list_empty(&ctx->run_list)) {
1634 /* drain the run list */ 1604 /* drain the run list */
@@ -1636,11 +1606,6 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1636 ; 1606 ;
1637 } 1607 }
1638 spin_unlock_irq(&ctx->ctx_lock); 1608 spin_unlock_irq(&ctx->ctx_lock);
1639 if (req->ki_opcode == IOCB_CMD_PREAD ||
1640 req->ki_opcode == IOCB_CMD_PREADV ||
1641 req->ki_opcode == IOCB_CMD_PWRITE ||
1642 req->ki_opcode == IOCB_CMD_PWRITEV)
1643 aio_batch_add(file->f_mapping, batch_hash);
1644 1609
1645 aio_put_req(req); /* drop extra ref to req */ 1610 aio_put_req(req); /* drop extra ref to req */
1646 return 0; 1611 return 0;
@@ -1657,7 +1622,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1657 struct kioctx *ctx; 1622 struct kioctx *ctx;
1658 long ret = 0; 1623 long ret = 0;
1659 int i; 1624 int i;
1660 struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; 1625 struct blk_plug plug;
1661 1626
1662 if (unlikely(nr < 0)) 1627 if (unlikely(nr < 0))
1663 return -EINVAL; 1628 return -EINVAL;
@@ -1674,6 +1639,8 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1674 return -EINVAL; 1639 return -EINVAL;
1675 } 1640 }
1676 1641
1642 blk_start_plug(&plug);
1643
1677 /* 1644 /*
1678 * AKPM: should this return a partial result if some of the IOs were 1645 * AKPM: should this return a partial result if some of the IOs were
1679 * successfully submitted? 1646 * successfully submitted?
@@ -1692,11 +1659,11 @@ long do_io_submit(aio_context_t ctx_id, long nr,
1692 break; 1659 break;
1693 } 1660 }
1694 1661
1695 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat); 1662 ret = io_submit_one(ctx, user_iocb, &tmp, compat);
1696 if (ret) 1663 if (ret)
1697 break; 1664 break;
1698 } 1665 }
1699 aio_batch_free(batch_hash); 1666 blk_finish_plug(&plug);
1700 1667
1701 put_ioctx(ctx); 1668 put_ioctx(ctx);
1702 return i ? i : ret; 1669 return i ? i : ret;
diff --git a/fs/attr.c b/fs/attr.c
index 7ca41811afa1..91dbe2a107f2 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -59,7 +59,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
59 59
60 /* Make sure a caller can chmod. */ 60 /* Make sure a caller can chmod. */
61 if (ia_valid & ATTR_MODE) { 61 if (ia_valid & ATTR_MODE) {
62 if (!is_owner_or_cap(inode)) 62 if (!inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 /* Also check the setgid bit! */ 64 /* Also check the setgid bit! */
65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : 65 if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
@@ -69,7 +69,7 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
69 69
70 /* Check for setting the inode time. */ 70 /* Check for setting the inode time. */
71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { 71 if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
72 if (!is_owner_or_cap(inode)) 72 if (!inode_owner_or_capable(inode))
73 return -EPERM; 73 return -EPERM;
74 } 74 }
75 75
@@ -128,7 +128,7 @@ EXPORT_SYMBOL(inode_newsize_ok);
128 * setattr_copy must be called with i_mutex held. 128 * setattr_copy must be called with i_mutex held.
129 * 129 *
130 * setattr_copy updates the inode's metadata with that specified 130 * setattr_copy updates the inode's metadata with that specified
131 * in attr. Noticably missing is inode size update, which is more complex 131 * in attr. Noticeably missing is inode size update, which is more complex
132 * as it requires pagecache updates. 132 * as it requires pagecache updates.
133 * 133 *
134 * The inode is not marked as dirty after this operation. The rationale is 134 * The inode is not marked as dirty after this operation. The rationale is
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 54f923792728..475f9c597cb7 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -61,8 +61,6 @@ do { \
61 current->pid, __func__, ##args); \ 61 current->pid, __func__, ##args); \
62} while (0) 62} while (0)
63 63
64extern spinlock_t autofs4_lock;
65
66/* Unified info structure. This is pointed to by both the dentry and 64/* Unified info structure. This is pointed to by both the dentry and
67 inode structures. Each file in the filesystem has an instance of this 65 inode structures. Each file in the filesystem has an instance of this
68 structure. It holds a reference to the dentry, so dentries are never 66 structure. It holds a reference to the dentry, so dentries are never
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 1442da4860e5..509fe1eb66ae 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -372,6 +372,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
372 return -EBUSY; 372 return -EBUSY;
373 } else { 373 } else {
374 struct file *pipe = fget(pipefd); 374 struct file *pipe = fget(pipefd);
375 if (!pipe) {
376 err = -EBADF;
377 goto out;
378 }
375 if (!pipe->f_op || !pipe->f_op->write) { 379 if (!pipe->f_op || !pipe->f_op->write) {
376 err = -EPIPE; 380 err = -EPIPE;
377 fput(pipe); 381 fput(pipe);
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index f43100b9662b..450f529a4eae 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -87,18 +87,70 @@ done:
87} 87}
88 88
89/* 89/*
90 * Calculate and dget next entry in the subdirs list under root.
91 */
92static struct dentry *get_next_positive_subdir(struct dentry *prev,
93 struct dentry *root)
94{
95 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
96 struct list_head *next;
97 struct dentry *p, *q;
98
99 spin_lock(&sbi->lookup_lock);
100
101 if (prev == NULL) {
102 spin_lock(&root->d_lock);
103 prev = dget_dlock(root);
104 next = prev->d_subdirs.next;
105 p = prev;
106 goto start;
107 }
108
109 p = prev;
110 spin_lock(&p->d_lock);
111again:
112 next = p->d_u.d_child.next;
113start:
114 if (next == &root->d_subdirs) {
115 spin_unlock(&p->d_lock);
116 spin_unlock(&sbi->lookup_lock);
117 dput(prev);
118 return NULL;
119 }
120
121 q = list_entry(next, struct dentry, d_u.d_child);
122
123 spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED);
124 /* Negative dentry - try next */
125 if (!simple_positive(q)) {
126 spin_unlock(&p->d_lock);
127 p = q;
128 goto again;
129 }
130 dget_dlock(q);
131 spin_unlock(&q->d_lock);
132 spin_unlock(&p->d_lock);
133 spin_unlock(&sbi->lookup_lock);
134
135 dput(prev);
136
137 return q;
138}
139
140/*
90 * Calculate and dget next entry in top down tree traversal. 141 * Calculate and dget next entry in top down tree traversal.
91 */ 142 */
92static struct dentry *get_next_positive_dentry(struct dentry *prev, 143static struct dentry *get_next_positive_dentry(struct dentry *prev,
93 struct dentry *root) 144 struct dentry *root)
94{ 145{
146 struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
95 struct list_head *next; 147 struct list_head *next;
96 struct dentry *p, *ret; 148 struct dentry *p, *ret;
97 149
98 if (prev == NULL) 150 if (prev == NULL)
99 return dget(root); 151 return dget(root);
100 152
101 spin_lock(&autofs4_lock); 153 spin_lock(&sbi->lookup_lock);
102relock: 154relock:
103 p = prev; 155 p = prev;
104 spin_lock(&p->d_lock); 156 spin_lock(&p->d_lock);
@@ -110,7 +162,7 @@ again:
110 162
111 if (p == root) { 163 if (p == root) {
112 spin_unlock(&p->d_lock); 164 spin_unlock(&p->d_lock);
113 spin_unlock(&autofs4_lock); 165 spin_unlock(&sbi->lookup_lock);
114 dput(prev); 166 dput(prev);
115 return NULL; 167 return NULL;
116 } 168 }
@@ -140,7 +192,7 @@ again:
140 dget_dlock(ret); 192 dget_dlock(ret);
141 spin_unlock(&ret->d_lock); 193 spin_unlock(&ret->d_lock);
142 spin_unlock(&p->d_lock); 194 spin_unlock(&p->d_lock);
143 spin_unlock(&autofs4_lock); 195 spin_unlock(&sbi->lookup_lock);
144 196
145 dput(prev); 197 dput(prev);
146 198
@@ -290,11 +342,8 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
290 spin_lock(&sbi->fs_lock); 342 spin_lock(&sbi->fs_lock);
291 ino = autofs4_dentry_ino(root); 343 ino = autofs4_dentry_ino(root);
292 /* No point expiring a pending mount */ 344 /* No point expiring a pending mount */
293 if (ino->flags & AUTOFS_INF_PENDING) { 345 if (ino->flags & AUTOFS_INF_PENDING)
294 spin_unlock(&sbi->fs_lock); 346 goto out;
295 return NULL;
296 }
297 managed_dentry_set_transit(root);
298 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { 347 if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
299 struct autofs_info *ino = autofs4_dentry_ino(root); 348 struct autofs_info *ino = autofs4_dentry_ino(root);
300 ino->flags |= AUTOFS_INF_EXPIRING; 349 ino->flags |= AUTOFS_INF_EXPIRING;
@@ -302,7 +351,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
302 spin_unlock(&sbi->fs_lock); 351 spin_unlock(&sbi->fs_lock);
303 return root; 352 return root;
304 } 353 }
305 managed_dentry_clear_transit(root); 354out:
306 spin_unlock(&sbi->fs_lock); 355 spin_unlock(&sbi->fs_lock);
307 dput(root); 356 dput(root);
308 357
@@ -336,13 +385,12 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
336 timeout = sbi->exp_timeout; 385 timeout = sbi->exp_timeout;
337 386
338 dentry = NULL; 387 dentry = NULL;
339 while ((dentry = get_next_positive_dentry(dentry, root))) { 388 while ((dentry = get_next_positive_subdir(dentry, root))) {
340 spin_lock(&sbi->fs_lock); 389 spin_lock(&sbi->fs_lock);
341 ino = autofs4_dentry_ino(dentry); 390 ino = autofs4_dentry_ino(dentry);
342 /* No point expiring a pending mount */ 391 /* No point expiring a pending mount */
343 if (ino->flags & AUTOFS_INF_PENDING) 392 if (ino->flags & AUTOFS_INF_PENDING)
344 goto cont; 393 goto next;
345 managed_dentry_set_transit(dentry);
346 394
347 /* 395 /*
348 * Case 1: (i) indirect mount or top level pseudo direct mount 396 * Case 1: (i) indirect mount or top level pseudo direct mount
@@ -402,8 +450,6 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb,
402 } 450 }
403 } 451 }
404next: 452next:
405 managed_dentry_clear_transit(dentry);
406cont:
407 spin_unlock(&sbi->fs_lock); 453 spin_unlock(&sbi->fs_lock);
408 } 454 }
409 return NULL; 455 return NULL;
@@ -415,13 +461,13 @@ found:
415 ino->flags |= AUTOFS_INF_EXPIRING; 461 ino->flags |= AUTOFS_INF_EXPIRING;
416 init_completion(&ino->expire_complete); 462 init_completion(&ino->expire_complete);
417 spin_unlock(&sbi->fs_lock); 463 spin_unlock(&sbi->fs_lock);
418 spin_lock(&autofs4_lock); 464 spin_lock(&sbi->lookup_lock);
419 spin_lock(&expired->d_parent->d_lock); 465 spin_lock(&expired->d_parent->d_lock);
420 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED); 466 spin_lock_nested(&expired->d_lock, DENTRY_D_LOCK_NESTED);
421 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); 467 list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
422 spin_unlock(&expired->d_lock); 468 spin_unlock(&expired->d_lock);
423 spin_unlock(&expired->d_parent->d_lock); 469 spin_unlock(&expired->d_parent->d_lock);
424 spin_unlock(&autofs4_lock); 470 spin_unlock(&sbi->lookup_lock);
425 return expired; 471 return expired;
426} 472}
427 473
@@ -484,8 +530,6 @@ int autofs4_expire_run(struct super_block *sb,
484 spin_lock(&sbi->fs_lock); 530 spin_lock(&sbi->fs_lock);
485 ino = autofs4_dentry_ino(dentry); 531 ino = autofs4_dentry_ino(dentry);
486 ino->flags &= ~AUTOFS_INF_EXPIRING; 532 ino->flags &= ~AUTOFS_INF_EXPIRING;
487 if (!d_unhashed(dentry))
488 managed_dentry_clear_transit(dentry);
489 complete_all(&ino->expire_complete); 533 complete_all(&ino->expire_complete);
490 spin_unlock(&sbi->fs_lock); 534 spin_unlock(&sbi->fs_lock);
491 535
@@ -513,9 +557,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
513 spin_lock(&sbi->fs_lock); 557 spin_lock(&sbi->fs_lock);
514 ino->flags &= ~AUTOFS_INF_EXPIRING; 558 ino->flags &= ~AUTOFS_INF_EXPIRING;
515 spin_lock(&dentry->d_lock); 559 spin_lock(&dentry->d_lock);
516 if (ret) 560 if (!ret) {
517 __managed_dentry_clear_transit(dentry);
518 else {
519 if ((IS_ROOT(dentry) || 561 if ((IS_ROOT(dentry) ||
520 (autofs_type_indirect(sbi->type) && 562 (autofs_type_indirect(sbi->type) &&
521 IS_ROOT(dentry->d_parent))) && 563 IS_ROOT(dentry->d_parent))) &&
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 014e7aba3b08..f55ae23b137e 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -23,8 +23,6 @@
23 23
24#include "autofs_i.h" 24#include "autofs_i.h"
25 25
26DEFINE_SPINLOCK(autofs4_lock);
27
28static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 26static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
29static int autofs4_dir_unlink(struct inode *,struct dentry *); 27static int autofs4_dir_unlink(struct inode *,struct dentry *);
30static int autofs4_dir_rmdir(struct inode *,struct dentry *); 28static int autofs4_dir_rmdir(struct inode *,struct dentry *);
@@ -36,7 +34,7 @@ static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
36static int autofs4_dir_open(struct inode *inode, struct file *file); 34static int autofs4_dir_open(struct inode *inode, struct file *file);
37static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 35static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
38static struct vfsmount *autofs4_d_automount(struct path *); 36static struct vfsmount *autofs4_d_automount(struct path *);
39static int autofs4_d_manage(struct dentry *, bool, bool); 37static int autofs4_d_manage(struct dentry *, bool);
40static void autofs4_dentry_release(struct dentry *); 38static void autofs4_dentry_release(struct dentry *);
41 39
42const struct file_operations autofs4_root_operations = { 40const struct file_operations autofs4_root_operations = {
@@ -125,15 +123,15 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
125 * autofs file system so just let the libfs routines handle 123 * autofs file system so just let the libfs routines handle
126 * it. 124 * it.
127 */ 125 */
128 spin_lock(&autofs4_lock); 126 spin_lock(&sbi->lookup_lock);
129 spin_lock(&dentry->d_lock); 127 spin_lock(&dentry->d_lock);
130 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) { 128 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
131 spin_unlock(&dentry->d_lock); 129 spin_unlock(&dentry->d_lock);
132 spin_unlock(&autofs4_lock); 130 spin_unlock(&sbi->lookup_lock);
133 return -ENOENT; 131 return -ENOENT;
134 } 132 }
135 spin_unlock(&dentry->d_lock); 133 spin_unlock(&dentry->d_lock);
136 spin_unlock(&autofs4_lock); 134 spin_unlock(&sbi->lookup_lock);
137 135
138out: 136out:
139 return dcache_dir_open(inode, file); 137 return dcache_dir_open(inode, file);
@@ -171,7 +169,6 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
171 const unsigned char *str = name->name; 169 const unsigned char *str = name->name;
172 struct list_head *p, *head; 170 struct list_head *p, *head;
173 171
174 spin_lock(&autofs4_lock);
175 spin_lock(&sbi->lookup_lock); 172 spin_lock(&sbi->lookup_lock);
176 head = &sbi->active_list; 173 head = &sbi->active_list;
177 list_for_each(p, head) { 174 list_for_each(p, head) {
@@ -204,14 +201,12 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
204 dget_dlock(active); 201 dget_dlock(active);
205 spin_unlock(&active->d_lock); 202 spin_unlock(&active->d_lock);
206 spin_unlock(&sbi->lookup_lock); 203 spin_unlock(&sbi->lookup_lock);
207 spin_unlock(&autofs4_lock);
208 return active; 204 return active;
209 } 205 }
210next: 206next:
211 spin_unlock(&active->d_lock); 207 spin_unlock(&active->d_lock);
212 } 208 }
213 spin_unlock(&sbi->lookup_lock); 209 spin_unlock(&sbi->lookup_lock);
214 spin_unlock(&autofs4_lock);
215 210
216 return NULL; 211 return NULL;
217} 212}
@@ -226,7 +221,6 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
226 const unsigned char *str = name->name; 221 const unsigned char *str = name->name;
227 struct list_head *p, *head; 222 struct list_head *p, *head;
228 223
229 spin_lock(&autofs4_lock);
230 spin_lock(&sbi->lookup_lock); 224 spin_lock(&sbi->lookup_lock);
231 head = &sbi->expiring_list; 225 head = &sbi->expiring_list;
232 list_for_each(p, head) { 226 list_for_each(p, head) {
@@ -259,14 +253,12 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
259 dget_dlock(expiring); 253 dget_dlock(expiring);
260 spin_unlock(&expiring->d_lock); 254 spin_unlock(&expiring->d_lock);
261 spin_unlock(&sbi->lookup_lock); 255 spin_unlock(&sbi->lookup_lock);
262 spin_unlock(&autofs4_lock);
263 return expiring; 256 return expiring;
264 } 257 }
265next: 258next:
266 spin_unlock(&expiring->d_lock); 259 spin_unlock(&expiring->d_lock);
267 } 260 }
268 spin_unlock(&sbi->lookup_lock); 261 spin_unlock(&sbi->lookup_lock);
269 spin_unlock(&autofs4_lock);
270 262
271 return NULL; 263 return NULL;
272} 264}
@@ -275,17 +267,16 @@ static int autofs4_mount_wait(struct dentry *dentry)
275{ 267{
276 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 268 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
277 struct autofs_info *ino = autofs4_dentry_ino(dentry); 269 struct autofs_info *ino = autofs4_dentry_ino(dentry);
278 int status; 270 int status = 0;
279 271
280 if (ino->flags & AUTOFS_INF_PENDING) { 272 if (ino->flags & AUTOFS_INF_PENDING) {
281 DPRINTK("waiting for mount name=%.*s", 273 DPRINTK("waiting for mount name=%.*s",
282 dentry->d_name.len, dentry->d_name.name); 274 dentry->d_name.len, dentry->d_name.name);
283 status = autofs4_wait(sbi, dentry, NFY_MOUNT); 275 status = autofs4_wait(sbi, dentry, NFY_MOUNT);
284 DPRINTK("mount wait done status=%d", status); 276 DPRINTK("mount wait done status=%d", status);
285 ino->last_used = jiffies;
286 return status;
287 } 277 }
288 return 0; 278 ino->last_used = jiffies;
279 return status;
289} 280}
290 281
291static int do_expire_wait(struct dentry *dentry) 282static int do_expire_wait(struct dentry *dentry)
@@ -319,9 +310,12 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
319 */ 310 */
320 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { 311 if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
321 struct dentry *parent = dentry->d_parent; 312 struct dentry *parent = dentry->d_parent;
313 struct autofs_info *ino;
322 struct dentry *new = d_lookup(parent, &dentry->d_name); 314 struct dentry *new = d_lookup(parent, &dentry->d_name);
323 if (!new) 315 if (!new)
324 return NULL; 316 return NULL;
317 ino = autofs4_dentry_ino(new);
318 ino->last_used = jiffies;
325 dput(path->dentry); 319 dput(path->dentry);
326 path->dentry = new; 320 path->dentry = new;
327 } 321 }
@@ -338,18 +332,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
338 DPRINTK("dentry=%p %.*s", 332 DPRINTK("dentry=%p %.*s",
339 dentry, dentry->d_name.len, dentry->d_name.name); 333 dentry, dentry->d_name.len, dentry->d_name.name);
340 334
341 /*
342 * Someone may have manually umounted this or it was a submount
343 * that has gone away.
344 */
345 spin_lock(&dentry->d_lock);
346 if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
347 if (!(dentry->d_flags & DCACHE_MANAGE_TRANSIT) &&
348 (dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
349 __managed_dentry_set_transit(path->dentry);
350 }
351 spin_unlock(&dentry->d_lock);
352
353 /* The daemon never triggers a mount. */ 335 /* The daemon never triggers a mount. */
354 if (autofs4_oz_mode(sbi)) 336 if (autofs4_oz_mode(sbi))
355 return NULL; 337 return NULL;
@@ -418,18 +400,17 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
418done: 400done:
419 if (!(ino->flags & AUTOFS_INF_EXPIRING)) { 401 if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
420 /* 402 /*
421 * Any needed mounting has been completed and the path updated 403 * Any needed mounting has been completed and the path
422 * so turn this into a normal dentry so we don't continually 404 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
423 * call ->d_automount() and ->d_manage(). 405 * call ->d_automount() on rootless multi-mounts since
424 */ 406 * it can lead to an incorrect ELOOP error return.
425 spin_lock(&dentry->d_lock); 407 *
426 __managed_dentry_clear_transit(dentry);
427 /*
428 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and 408 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
429 * symlinks as in all other cases the dentry will be covered by 409 * symlinks as in all other cases the dentry will be covered by
430 * an actual mount so ->d_automount() won't be called during 410 * an actual mount so ->d_automount() won't be called during
431 * the follow. 411 * the follow.
432 */ 412 */
413 spin_lock(&dentry->d_lock);
433 if ((!d_mountpoint(dentry) && 414 if ((!d_mountpoint(dentry) &&
434 !list_empty(&dentry->d_subdirs)) || 415 !list_empty(&dentry->d_subdirs)) ||
435 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))) 416 (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
@@ -446,7 +427,7 @@ done:
446 return NULL; 427 return NULL;
447} 428}
448 429
449int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk) 430int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
450{ 431{
451 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); 432 struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
452 433
@@ -454,7 +435,9 @@ int autofs4_d_manage(struct dentry *dentry, bool mounting_here, bool rcu_walk)
454 dentry, dentry->d_name.len, dentry->d_name.name); 435 dentry, dentry->d_name.len, dentry->d_name.name);
455 436
456 /* The daemon never waits. */ 437 /* The daemon never waits. */
457 if (autofs4_oz_mode(sbi) || mounting_here) { 438 if (autofs4_oz_mode(sbi)) {
439 if (rcu_walk)
440 return 0;
458 if (!d_mountpoint(dentry)) 441 if (!d_mountpoint(dentry))
459 return -EISDIR; 442 return -EISDIR;
460 return 0; 443 return 0;
@@ -612,12 +595,12 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
612 595
613 dir->i_mtime = CURRENT_TIME; 596 dir->i_mtime = CURRENT_TIME;
614 597
615 spin_lock(&autofs4_lock); 598 spin_lock(&sbi->lookup_lock);
616 autofs4_add_expiring(dentry); 599 __autofs4_add_expiring(dentry);
617 spin_lock(&dentry->d_lock); 600 spin_lock(&dentry->d_lock);
618 __d_drop(dentry); 601 __d_drop(dentry);
619 spin_unlock(&dentry->d_lock); 602 spin_unlock(&dentry->d_lock);
620 spin_unlock(&autofs4_lock); 603 spin_unlock(&sbi->lookup_lock);
621 604
622 return 0; 605 return 0;
623} 606}
@@ -629,7 +612,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
629 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves 612 * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves
630 * of the directory tree. There is no need to clear the automount flag 613 * of the directory tree. There is no need to clear the automount flag
631 * following a mount or restore it after an expire because these mounts 614 * following a mount or restore it after an expire because these mounts
632 * are always covered. However, it is neccessary to ensure that these 615 * are always covered. However, it is necessary to ensure that these
633 * flags are clear on non-empty directories to avoid unnecessary calls 616 * flags are clear on non-empty directories to avoid unnecessary calls
634 * during path walks. 617 * during path walks.
635 */ 618 */
@@ -686,20 +669,17 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
686 if (!autofs4_oz_mode(sbi)) 669 if (!autofs4_oz_mode(sbi))
687 return -EACCES; 670 return -EACCES;
688 671
689 spin_lock(&autofs4_lock);
690 spin_lock(&sbi->lookup_lock); 672 spin_lock(&sbi->lookup_lock);
691 spin_lock(&dentry->d_lock); 673 spin_lock(&dentry->d_lock);
692 if (!list_empty(&dentry->d_subdirs)) { 674 if (!list_empty(&dentry->d_subdirs)) {
693 spin_unlock(&dentry->d_lock); 675 spin_unlock(&dentry->d_lock);
694 spin_unlock(&sbi->lookup_lock); 676 spin_unlock(&sbi->lookup_lock);
695 spin_unlock(&autofs4_lock);
696 return -ENOTEMPTY; 677 return -ENOTEMPTY;
697 } 678 }
698 __autofs4_add_expiring(dentry); 679 __autofs4_add_expiring(dentry);
699 spin_unlock(&sbi->lookup_lock);
700 __d_drop(dentry); 680 __d_drop(dentry);
701 spin_unlock(&dentry->d_lock); 681 spin_unlock(&dentry->d_lock);
702 spin_unlock(&autofs4_lock); 682 spin_unlock(&sbi->lookup_lock);
703 683
704 if (sbi->version < 5) 684 if (sbi->version < 5)
705 autofs_clear_leaf_automount_flags(dentry); 685 autofs_clear_leaf_automount_flags(dentry);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 56010056b2e6..25435987d6ae 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -197,12 +197,12 @@ rename_retry:
197 197
198 seq = read_seqbegin(&rename_lock); 198 seq = read_seqbegin(&rename_lock);
199 rcu_read_lock(); 199 rcu_read_lock();
200 spin_lock(&autofs4_lock); 200 spin_lock(&sbi->fs_lock);
201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) 201 for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
202 len += tmp->d_name.len + 1; 202 len += tmp->d_name.len + 1;
203 203
204 if (!len || --len > NAME_MAX) { 204 if (!len || --len > NAME_MAX) {
205 spin_unlock(&autofs4_lock); 205 spin_unlock(&sbi->fs_lock);
206 rcu_read_unlock(); 206 rcu_read_unlock();
207 if (read_seqretry(&rename_lock, seq)) 207 if (read_seqretry(&rename_lock, seq))
208 goto rename_retry; 208 goto rename_retry;
@@ -218,7 +218,7 @@ rename_retry:
218 p -= tmp->d_name.len; 218 p -= tmp->d_name.len;
219 strncpy(p, tmp->d_name.name, tmp->d_name.len); 219 strncpy(p, tmp->d_name.name, tmp->d_name.len);
220 } 220 }
221 spin_unlock(&autofs4_lock); 221 spin_unlock(&sbi->fs_lock);
222 rcu_read_unlock(); 222 rcu_read_unlock();
223 if (read_seqretry(&rename_lock, seq)) 223 if (read_seqretry(&rename_lock, seq))
224 goto rename_retry; 224 goto rename_retry;
diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog
index ce8c787916be..75a461cfaca6 100644
--- a/fs/befs/ChangeLog
+++ b/fs/befs/ChangeLog
@@ -24,7 +24,7 @@ Version 0.9 (2002-03-14)
24 24
25Version 0.64 (2002-02-07) 25Version 0.64 (2002-02-07)
26========== 26==========
27* Did the string comparision really right this time (btree.c) [WD] 27* Did the string comparison really right this time (btree.c) [WD]
28 28
29* Fixed up some places where I assumed that a long int could hold 29* Fixed up some places where I assumed that a long int could hold
30 a pointer value. (btree.c) [WD] 30 a pointer value. (btree.c) [WD]
@@ -114,7 +114,7 @@ Version 0.6 (2001-12-15)
114 More flexible. Will soon be controllable at mount time 114 More flexible. Will soon be controllable at mount time
115 (see TODO). [WD] 115 (see TODO). [WD]
116 116
117* Rewrote datastream positon lookups. 117* Rewrote datastream position lookups.
118 (datastream.c) [WD] 118 (datastream.c) [WD]
119 119
120* Moved the TODO list to its own file. 120* Moved the TODO list to its own file.
@@ -150,7 +150,7 @@ Version 0.50 (2001-11-13)
150* Anton also told me that the blocksize is not allowed to be larger than 150* Anton also told me that the blocksize is not allowed to be larger than
151 the page size in linux, which is 4k i386. Oops. Added a test for 151 the page size in linux, which is 4k i386. Oops. Added a test for
152 (blocksize > PAGE_SIZE), and refuse to mount in that case. What this 152 (blocksize > PAGE_SIZE), and refuse to mount in that case. What this
153 practicaly means is that 8k blocksize volumes won't work without a major 153 practically means is that 8k blocksize volumes won't work without a major
154 restructuring of the driver (or an alpha or other 64bit hardware). [WD] 154 restructuring of the driver (or an alpha or other 64bit hardware). [WD]
155 155
156* Cleaned up the befs_count_blocks() function. Much smarter now. 156* Cleaned up the befs_count_blocks() function. Much smarter now.
@@ -183,7 +183,7 @@ Version 0.45 (2001-10-29)
183 structures into the generic pointer fields of the public structures 183 structures into the generic pointer fields of the public structures
184 with kmalloc(). put_super and put_inode free them. This allows us not 184 with kmalloc(). put_super and put_inode free them. This allows us not
185 to have to touch the definitions of the public structures in 185 to have to touch the definitions of the public structures in
186 include/linux/fs.h. Also, befs_inode_info is huge (becuase of the 186 include/linux/fs.h. Also, befs_inode_info is huge (because of the
187 symlink string). (super.c, inode.c, befs_fs.h) [WD] 187 symlink string). (super.c, inode.c, befs_fs.h) [WD]
188 188
189* Fixed a thinko that was corrupting file reads after the first block_run 189* Fixed a thinko that was corrupting file reads after the first block_run
@@ -404,7 +404,7 @@ Version 0.4 (2001-10-28)
404 404
405* Fixed compile errors on 2.4.1 kernel (WD) 405* Fixed compile errors on 2.4.1 kernel (WD)
406 Resolve rejected patches 406 Resolve rejected patches
407 Accomodate changed NLS interface (util.h) 407 Accommodate changed NLS interface (util.h)
408 Needed to include <linux/slab.h> in most files 408 Needed to include <linux/slab.h> in most files
409 Makefile changes 409 Makefile changes
410 fs/Config.in changes 410 fs/Config.in changes
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index 7893eaa1e58c..eb557d9dc8be 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -234,7 +234,7 @@ typedef struct {
234} PACKED befs_btree_super; 234} PACKED befs_btree_super;
235 235
236/* 236/*
237 * Header stucture of each btree node 237 * Header structure of each btree node
238 */ 238 */
239typedef struct { 239typedef struct {
240 fs64 left; 240 fs64 left;
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 4202db7496cb..a66c9b1136e0 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * Licensed under the GNU GPL. See the file COPYING for details. 6 * Licensed under the GNU GPL. See the file COPYING for details.
7 * 7 *
8 * 2002-02-05: Sergey S. Kostyliov added binary search withing 8 * 2002-02-05: Sergey S. Kostyliov added binary search within
9 * btree nodes. 9 * btree nodes.
10 * 10 *
11 * Many thanks to: 11 * Many thanks to:
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b1d0c794747b..54b8c28bebc8 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -75,7 +75,6 @@ static const struct inode_operations befs_dir_inode_operations = {
75 75
76static const struct address_space_operations befs_aops = { 76static const struct address_space_operations befs_aops = {
77 .readpage = befs_readpage, 77 .readpage = befs_readpage,
78 .sync_page = block_sync_page,
79 .bmap = befs_bmap, 78 .bmap = befs_bmap,
80}; 79};
81 80
@@ -735,7 +734,7 @@ parse_options(char *options, befs_mount_options * opts)
735 734
736/* This function has the responsibiltiy of getting the 735/* This function has the responsibiltiy of getting the
737 * filesystem ready for unmounting. 736 * filesystem ready for unmounting.
738 * Basicly, we free everything that we allocated in 737 * Basically, we free everything that we allocated in
739 * befs_read_inode 738 * befs_read_inode
740 */ 739 */
741static void 740static void
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 685ecff3ab31..b14cebfd9047 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -97,7 +97,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
97 if (!inode) 97 if (!inode)
98 return -ENOSPC; 98 return -ENOSPC;
99 mutex_lock(&info->bfs_lock); 99 mutex_lock(&info->bfs_lock);
100 ino = find_first_zero_bit(info->si_imap, info->si_lasti); 100 ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1);
101 if (ino > info->si_lasti) { 101 if (ino > info->si_lasti) {
102 mutex_unlock(&info->bfs_lock); 102 mutex_unlock(&info->bfs_lock);
103 iput(inode); 103 iput(inode);
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index eb67edd0f8ea..f20e8a71062f 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -186,7 +186,6 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block)
186const struct address_space_operations bfs_aops = { 186const struct address_space_operations bfs_aops = {
187 .readpage = bfs_readpage, 187 .readpage = bfs_readpage,
188 .writepage = bfs_writepage, 188 .writepage = bfs_writepage,
189 .sync_page = block_sync_page,
190 .write_begin = bfs_write_begin, 189 .write_begin = bfs_write_begin,
191 .write_end = generic_write_end, 190 .write_end = generic_write_end,
192 .bmap = bfs_bmap, 191 .bmap = bfs_bmap,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d5b640ba6cb1..303983fabfd6 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -570,7 +570,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
570 unsigned long elf_entry; 570 unsigned long elf_entry;
571 unsigned long interp_load_addr = 0; 571 unsigned long interp_load_addr = 0;
572 unsigned long start_code, end_code, start_data, end_data; 572 unsigned long start_code, end_code, start_data, end_data;
573 unsigned long reloc_func_desc = 0; 573 unsigned long reloc_func_desc __maybe_unused = 0;
574 int executable_stack = EXSTACK_DEFAULT; 574 int executable_stack = EXSTACK_DEFAULT;
575 unsigned long def_flags = 0; 575 unsigned long def_flags = 0;
576 struct { 576 struct {
@@ -941,9 +941,13 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
941 current->mm->start_stack = bprm->p; 941 current->mm->start_stack = bprm->p;
942 942
943#ifdef arch_randomize_brk 943#ifdef arch_randomize_brk
944 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) 944 if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
945 current->mm->brk = current->mm->start_brk = 945 current->mm->brk = current->mm->start_brk =
946 arch_randomize_brk(current->mm); 946 arch_randomize_brk(current->mm);
947#ifdef CONFIG_COMPAT_BRK
948 current->brk_randomized = 1;
949#endif
950 }
947#endif 951#endif
948 952
949 if (current->personality & MMAP_PAGE_ZERO) { 953 if (current->personality & MMAP_PAGE_ZERO) {
@@ -1906,7 +1910,7 @@ static int elf_core_dump(struct coredump_params *cprm)
1906 segs = current->mm->map_count; 1910 segs = current->mm->map_count;
1907 segs += elf_core_extra_phdrs(); 1911 segs += elf_core_extra_phdrs();
1908 1912
1909 gate_vma = get_gate_vma(current); 1913 gate_vma = get_gate_vma(current->mm);
1910 if (gate_vma != NULL) 1914 if (gate_vma != NULL)
1911 segs++; 1915 segs++;
1912 1916
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 811384bec8de..397d3057d336 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -717,7 +717,7 @@ static int load_flat_file(struct linux_binprm * bprm,
717 * help simplify all this mumbo jumbo 717 * help simplify all this mumbo jumbo
718 * 718 *
719 * We've got two different sections of relocation entries. 719 * We've got two different sections of relocation entries.
720 * The first is the GOT which resides at the begining of the data segment 720 * The first is the GOT which resides at the beginning of the data segment
721 * and is terminated with a -1. This one can be relocated in place. 721 * and is terminated with a -1. This one can be relocated in place.
722 * The second is the extra relocation entries tacked after the image's 722 * The second is the extra relocation entries tacked after the image's
723 * data segment. These require a little more processing as the entry is 723 * data segment. These require a little more processing as the entry is
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index e49cce234c65..9c5e6b2cd11a 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -761,6 +761,9 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
761{ 761{
762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); 762 unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
763 763
764 if (bs->bio_integrity_pool)
765 return 0;
766
764 bs->bio_integrity_pool = 767 bs->bio_integrity_pool =
765 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); 768 mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
766 769
diff --git a/fs/bio.c b/fs/bio.c
index 4bd454fa844e..840a0d755248 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -43,7 +43,7 @@ static mempool_t *bio_split_pool __read_mostly;
43 * unsigned short 43 * unsigned short
44 */ 44 */
45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } 45#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
46struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { 46static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), 47 BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
48}; 48};
49#undef BV 49#undef BV
@@ -111,7 +111,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
111 if (!slab) 111 if (!slab)
112 goto out_unlock; 112 goto out_unlock;
113 113
114 printk("bio: create slab <%s> at %d\n", bslab->name, entry); 114 printk(KERN_INFO "bio: create slab <%s> at %d\n", bslab->name, entry);
115 bslab->slab = slab; 115 bslab->slab = slab;
116 bslab->slab_ref = 1; 116 bslab->slab_ref = 1;
117 bslab->slab_size = sz; 117 bslab->slab_size = sz;
@@ -1436,7 +1436,7 @@ EXPORT_SYMBOL(bio_flush_dcache_pages);
1436 * preferred way to end I/O on a bio, it takes care of clearing 1436 * preferred way to end I/O on a bio, it takes care of clearing
1437 * BIO_UPTODATE on error. @error is 0 on success, and and one of the 1437 * BIO_UPTODATE on error. @error is 0 on success, and and one of the
1438 * established -Exxxx (-EIO, for instance) error values in case 1438 * established -Exxxx (-EIO, for instance) error values in case
1439 * something went wrong. Noone should call bi_end_io() directly on a 1439 * something went wrong. No one should call bi_end_io() directly on a
1440 * bio unless they own it and thus know that it has an end_io 1440 * bio unless they own it and thus know that it has an end_io
1441 * function. 1441 * function.
1442 **/ 1442 **/
@@ -1636,9 +1636,6 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
1636 if (!bs->bio_pool) 1636 if (!bs->bio_pool)
1637 goto bad; 1637 goto bad;
1638 1638
1639 if (bioset_integrity_create(bs, pool_size))
1640 goto bad;
1641
1642 if (!biovec_create_pools(bs, pool_size)) 1639 if (!biovec_create_pools(bs, pool_size))
1643 return bs; 1640 return bs;
1644 1641
@@ -1656,12 +1653,10 @@ static void __init biovec_init_slabs(void)
1656 int size; 1653 int size;
1657 struct biovec_slab *bvs = bvec_slabs + i; 1654 struct biovec_slab *bvs = bvec_slabs + i;
1658 1655
1659#ifndef CONFIG_BLK_DEV_INTEGRITY
1660 if (bvs->nr_vecs <= BIO_INLINE_VECS) { 1656 if (bvs->nr_vecs <= BIO_INLINE_VECS) {
1661 bvs->slab = NULL; 1657 bvs->slab = NULL;
1662 continue; 1658 continue;
1663 } 1659 }
1664#endif
1665 1660
1666 size = bvs->nr_vecs * sizeof(struct bio_vec); 1661 size = bvs->nr_vecs * sizeof(struct bio_vec);
1667 bvs->slab = kmem_cache_create(bvs->name, size, 0, 1662 bvs->slab = kmem_cache_create(bvs->name, size, 0,
@@ -1684,6 +1679,9 @@ static int __init init_bio(void)
1684 if (!fs_bio_set) 1679 if (!fs_bio_set)
1685 panic("bio: can't allocate bios\n"); 1680 panic("bio: can't allocate bios\n");
1686 1681
1682 if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
1683 panic("bio: can't create integrity pool\n");
1684
1687 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES, 1685 bio_split_pool = mempool_create_kmalloc_pool(BIO_SPLIT_ENTRIES,
1688 sizeof(struct bio_pair)); 1686 sizeof(struct bio_pair));
1689 if (!bio_split_pool) 1687 if (!bio_split_pool)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9c..5147bdd3b8e1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -55,11 +55,13 @@ EXPORT_SYMBOL(I_BDEV);
55static void bdev_inode_switch_bdi(struct inode *inode, 55static void bdev_inode_switch_bdi(struct inode *inode,
56 struct backing_dev_info *dst) 56 struct backing_dev_info *dst)
57{ 57{
58 spin_lock(&inode_lock); 58 spin_lock(&inode_wb_list_lock);
59 spin_lock(&inode->i_lock);
59 inode->i_data.backing_dev_info = dst; 60 inode->i_data.backing_dev_info = dst;
60 if (inode->i_state & I_DIRTY) 61 if (inode->i_state & I_DIRTY)
61 list_move(&inode->i_wb_list, &dst->wb.b_dirty); 62 list_move(&inode->i_wb_list, &dst->wb.b_dirty);
62 spin_unlock(&inode_lock); 63 spin_unlock(&inode->i_lock);
64 spin_unlock(&inode_wb_list_lock);
63} 65}
64 66
65static sector_t max_block(struct block_device *bdev) 67static sector_t max_block(struct block_device *bdev)
@@ -651,7 +653,7 @@ void bd_forget(struct inode *inode)
651 * @whole: whole block device containing @bdev, may equal @bdev 653 * @whole: whole block device containing @bdev, may equal @bdev
652 * @holder: holder trying to claim @bdev 654 * @holder: holder trying to claim @bdev
653 * 655 *
654 * Test whther @bdev can be claimed by @holder. 656 * Test whether @bdev can be claimed by @holder.
655 * 657 *
656 * CONTEXT: 658 * CONTEXT:
657 * spin_lock(&bdev_lock). 659 * spin_lock(&bdev_lock).
@@ -873,6 +875,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
873 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); 875 ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
874 if (ret) 876 if (ret)
875 goto out_del; 877 goto out_del;
878 /*
879 * bdev could be deleted beneath us which would implicitly destroy
880 * the holder directory. Hold on to it.
881 */
882 kobject_get(bdev->bd_part->holder_dir);
876 883
877 list_add(&holder->list, &bdev->bd_holder_disks); 884 list_add(&holder->list, &bdev->bd_holder_disks);
878 goto out_unlock; 885 goto out_unlock;
@@ -909,6 +916,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
909 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); 916 del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
910 del_symlink(bdev->bd_part->holder_dir, 917 del_symlink(bdev->bd_part->holder_dir,
911 &disk_to_dev(disk)->kobj); 918 &disk_to_dev(disk)->kobj);
919 kobject_put(bdev->bd_part->holder_dir);
912 list_del_init(&holder->list); 920 list_del_init(&holder->list);
913 kfree(holder); 921 kfree(holder);
914 } 922 }
@@ -922,14 +930,15 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
922 * flush_disk - invalidates all buffer-cache entries on a disk 930 * flush_disk - invalidates all buffer-cache entries on a disk
923 * 931 *
924 * @bdev: struct block device to be flushed 932 * @bdev: struct block device to be flushed
933 * @kill_dirty: flag to guide handling of dirty inodes
925 * 934 *
926 * Invalidates all buffer-cache entries on a disk. It should be called 935 * Invalidates all buffer-cache entries on a disk. It should be called
927 * when a disk has been changed -- either by a media change or online 936 * when a disk has been changed -- either by a media change or online
928 * resize. 937 * resize.
929 */ 938 */
930static void flush_disk(struct block_device *bdev) 939static void flush_disk(struct block_device *bdev, bool kill_dirty)
931{ 940{
932 if (__invalidate_device(bdev)) { 941 if (__invalidate_device(bdev, kill_dirty)) {
933 char name[BDEVNAME_SIZE] = ""; 942 char name[BDEVNAME_SIZE] = "";
934 943
935 if (bdev->bd_disk) 944 if (bdev->bd_disk)
@@ -966,7 +975,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
966 "%s: detected capacity change from %lld to %lld\n", 975 "%s: detected capacity change from %lld to %lld\n",
967 name, bdev_size, disk_size); 976 name, bdev_size, disk_size);
968 i_size_write(bdev->bd_inode, disk_size); 977 i_size_write(bdev->bd_inode, disk_size);
969 flush_disk(bdev); 978 flush_disk(bdev, false);
970 } 979 }
971} 980}
972EXPORT_SYMBOL(check_disk_size_change); 981EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1028,7 @@ int check_disk_change(struct block_device *bdev)
1019 if (!(events & DISK_EVENT_MEDIA_CHANGE)) 1028 if (!(events & DISK_EVENT_MEDIA_CHANGE))
1020 return 0; 1029 return 0;
1021 1030
1022 flush_disk(bdev); 1031 flush_disk(bdev, true);
1023 if (bdops->revalidate_disk) 1032 if (bdops->revalidate_disk)
1024 bdops->revalidate_disk(bdev->bd_disk); 1033 bdops->revalidate_disk(bdev->bd_disk);
1025 return 1; 1034 return 1;
@@ -1080,6 +1089,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1080 if (!disk) 1089 if (!disk)
1081 goto out; 1090 goto out;
1082 1091
1092 disk_block_events(disk);
1083 mutex_lock_nested(&bdev->bd_mutex, for_part); 1093 mutex_lock_nested(&bdev->bd_mutex, for_part);
1084 if (!bdev->bd_openers) { 1094 if (!bdev->bd_openers) {
1085 bdev->bd_disk = disk; 1095 bdev->bd_disk = disk;
@@ -1101,10 +1111,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1101 */ 1111 */
1102 disk_put_part(bdev->bd_part); 1112 disk_put_part(bdev->bd_part);
1103 bdev->bd_part = NULL; 1113 bdev->bd_part = NULL;
1104 module_put(disk->fops->owner);
1105 put_disk(disk);
1106 bdev->bd_disk = NULL; 1114 bdev->bd_disk = NULL;
1107 mutex_unlock(&bdev->bd_mutex); 1115 mutex_unlock(&bdev->bd_mutex);
1116 disk_unblock_events(disk);
1117 module_put(disk->fops->owner);
1118 put_disk(disk);
1108 goto restart; 1119 goto restart;
1109 } 1120 }
1110 if (ret) 1121 if (ret)
@@ -1141,9 +1152,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1141 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); 1152 bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
1142 } 1153 }
1143 } else { 1154 } else {
1144 module_put(disk->fops->owner);
1145 put_disk(disk);
1146 disk = NULL;
1147 if (bdev->bd_contains == bdev) { 1155 if (bdev->bd_contains == bdev) {
1148 if (bdev->bd_disk->fops->open) { 1156 if (bdev->bd_disk->fops->open) {
1149 ret = bdev->bd_disk->fops->open(bdev, mode); 1157 ret = bdev->bd_disk->fops->open(bdev, mode);
@@ -1153,11 +1161,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1153 if (bdev->bd_invalidated) 1161 if (bdev->bd_invalidated)
1154 rescan_partitions(bdev->bd_disk, bdev); 1162 rescan_partitions(bdev->bd_disk, bdev);
1155 } 1163 }
1164 /* only one opener holds refs to the module and disk */
1165 module_put(disk->fops->owner);
1166 put_disk(disk);
1156 } 1167 }
1157 bdev->bd_openers++; 1168 bdev->bd_openers++;
1158 if (for_part) 1169 if (for_part)
1159 bdev->bd_part_count++; 1170 bdev->bd_part_count++;
1160 mutex_unlock(&bdev->bd_mutex); 1171 mutex_unlock(&bdev->bd_mutex);
1172 disk_unblock_events(disk);
1161 return 0; 1173 return 0;
1162 1174
1163 out_clear: 1175 out_clear:
@@ -1170,10 +1182,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1170 bdev->bd_contains = NULL; 1182 bdev->bd_contains = NULL;
1171 out_unlock_bdev: 1183 out_unlock_bdev:
1172 mutex_unlock(&bdev->bd_mutex); 1184 mutex_unlock(&bdev->bd_mutex);
1173 out: 1185 disk_unblock_events(disk);
1174 if (disk) 1186 module_put(disk->fops->owner);
1175 module_put(disk->fops->owner);
1176 put_disk(disk); 1187 put_disk(disk);
1188 out:
1177 bdput(bdev); 1189 bdput(bdev);
1178 1190
1179 return ret; 1191 return ret;
@@ -1215,12 +1227,6 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
1215 1227
1216 res = __blkdev_get(bdev, mode, 0); 1228 res = __blkdev_get(bdev, mode, 0);
1217 1229
1218 /* __blkdev_get() may alter read only status, check it afterwards */
1219 if (!res && (mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1220 __blkdev_put(bdev, mode, 0);
1221 res = -EACCES;
1222 }
1223
1224 if (whole) { 1230 if (whole) {
1225 /* finish claiming */ 1231 /* finish claiming */
1226 mutex_lock(&bdev->bd_mutex); 1232 mutex_lock(&bdev->bd_mutex);
@@ -1298,6 +1304,11 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode,
1298 if (err) 1304 if (err)
1299 return ERR_PTR(err); 1305 return ERR_PTR(err);
1300 1306
1307 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) {
1308 blkdev_put(bdev, mode);
1309 return ERR_PTR(-EACCES);
1310 }
1311
1301 return bdev; 1312 return bdev;
1302} 1313}
1303EXPORT_SYMBOL(blkdev_get_by_path); 1314EXPORT_SYMBOL(blkdev_get_by_path);
@@ -1440,14 +1451,13 @@ int blkdev_put(struct block_device *bdev, fmode_t mode)
1440 if (bdev_free) { 1451 if (bdev_free) {
1441 if (bdev->bd_write_holder) { 1452 if (bdev->bd_write_holder) {
1442 disk_unblock_events(bdev->bd_disk); 1453 disk_unblock_events(bdev->bd_disk);
1443 bdev->bd_write_holder = false;
1444 } else
1445 disk_check_events(bdev->bd_disk); 1454 disk_check_events(bdev->bd_disk);
1455 bdev->bd_write_holder = false;
1456 }
1446 } 1457 }
1447 1458
1448 mutex_unlock(&bdev->bd_mutex); 1459 mutex_unlock(&bdev->bd_mutex);
1449 } else 1460 }
1450 disk_check_events(bdev->bd_disk);
1451 1461
1452 return __blkdev_put(bdev, mode, 0); 1462 return __blkdev_put(bdev, mode, 0);
1453} 1463}
@@ -1521,7 +1531,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
1521static const struct address_space_operations def_blk_aops = { 1531static const struct address_space_operations def_blk_aops = {
1522 .readpage = blkdev_readpage, 1532 .readpage = blkdev_readpage,
1523 .writepage = blkdev_writepage, 1533 .writepage = blkdev_writepage,
1524 .sync_page = block_sync_page,
1525 .write_begin = blkdev_write_begin, 1534 .write_begin = blkdev_write_begin,
1526 .write_end = blkdev_write_end, 1535 .write_end = blkdev_write_end,
1527 .writepages = generic_writepages, 1536 .writepages = generic_writepages,
@@ -1601,7 +1610,7 @@ fail:
1601} 1610}
1602EXPORT_SYMBOL(lookup_bdev); 1611EXPORT_SYMBOL(lookup_bdev);
1603 1612
1604int __invalidate_device(struct block_device *bdev) 1613int __invalidate_device(struct block_device *bdev, bool kill_dirty)
1605{ 1614{
1606 struct super_block *sb = get_super(bdev); 1615 struct super_block *sb = get_super(bdev);
1607 int res = 0; 1616 int res = 0;
@@ -1614,7 +1623,7 @@ int __invalidate_device(struct block_device *bdev)
1614 * hold). 1623 * hold).
1615 */ 1624 */
1616 shrink_dcache_sb(sb); 1625 shrink_dcache_sb(sb);
1617 res = invalidate_inodes(sb); 1626 res = invalidate_inodes(sb, kill_dirty);
1618 drop_super(sb); 1627 drop_super(sb);
1619 } 1628 }
1620 invalidate_bdev(bdev); 1629 invalidate_bdev(bdev);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 15b5ca2a2606..5d505aaa72fb 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
37 char *value = NULL; 37 char *value = NULL;
38 struct posix_acl *acl; 38 struct posix_acl *acl;
39 39
40 if (!IS_POSIXACL(inode))
41 return NULL;
42
40 acl = get_cached_acl(inode, type); 43 acl = get_cached_acl(inode, type);
41 if (acl != ACL_NOT_CACHED) 44 if (acl != ACL_NOT_CACHED)
42 return acl; 45 return acl;
@@ -84,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
84 struct posix_acl *acl; 87 struct posix_acl *acl;
85 int ret = 0; 88 int ret = 0;
86 89
90 if (!IS_POSIXACL(dentry->d_inode))
91 return -EOPNOTSUPP;
92
87 acl = btrfs_get_acl(dentry->d_inode, type); 93 acl = btrfs_get_acl(dentry->d_inode, type);
88 94
89 if (IS_ERR(acl)) 95 if (IS_ERR(acl))
@@ -164,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
164 int ret; 170 int ret;
165 struct posix_acl *acl = NULL; 171 struct posix_acl *acl = NULL;
166 172
167 if (!is_owner_or_cap(dentry->d_inode)) 173 if (!inode_owner_or_capable(dentry->d_inode))
168 return -EPERM; 174 return -EPERM;
169 175
170 if (!IS_POSIXACL(dentry->d_inode)) 176 if (!IS_POSIXACL(dentry->d_inode))
@@ -172,16 +178,17 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
172 178
173 if (value) { 179 if (value) {
174 acl = posix_acl_from_xattr(value, size); 180 acl = posix_acl_from_xattr(value, size);
175 if (acl == NULL) { 181 if (acl) {
176 value = NULL; 182 ret = posix_acl_valid(acl);
177 size = 0; 183 if (ret)
184 goto out;
178 } else if (IS_ERR(acl)) { 185 } else if (IS_ERR(acl)) {
179 return PTR_ERR(acl); 186 return PTR_ERR(acl);
180 } 187 }
181 } 188 }
182 189
183 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); 190 ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
184 191out:
185 posix_acl_release(acl); 192 posix_acl_release(acl);
186 193
187 return ret; 194 return ret;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ccc991c542df..57c3bb2884ce 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -136,9 +136,8 @@ struct btrfs_inode {
136 * items we think we'll end up using, and reserved_extents is the number 136 * items we think we'll end up using, and reserved_extents is the number
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents; 139 atomic_t outstanding_extents;
141 int reserved_extents; 140 atomic_t reserved_extents;
142 141
143 /* 142 /*
144 * ordered_data_close is set by truncate when a file that used 143 * ordered_data_close is set by truncate when a file that used
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index f745287fbf2e..41d1d7c70e29 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -340,6 +340,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
340 340
341 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); 341 WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
342 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 342 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
343 if (!cb)
344 return -ENOMEM;
343 atomic_set(&cb->pending_bios, 0); 345 atomic_set(&cb->pending_bios, 0);
344 cb->errors = 0; 346 cb->errors = 0;
345 cb->inode = inode; 347 cb->inode = inode;
@@ -354,6 +356,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
354 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 356 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
355 357
356 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); 358 bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
359 if(!bio) {
360 kfree(cb);
361 return -ENOMEM;
362 }
357 bio->bi_private = cb; 363 bio->bi_private = cb;
358 bio->bi_end_io = end_compressed_bio_write; 364 bio->bi_end_io = end_compressed_bio_write;
359 atomic_inc(&cb->pending_bios); 365 atomic_inc(&cb->pending_bios);
@@ -562,7 +568,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
562 u64 em_len; 568 u64 em_len;
563 u64 em_start; 569 u64 em_start;
564 struct extent_map *em; 570 struct extent_map *em;
565 int ret; 571 int ret = -ENOMEM;
566 u32 *sums; 572 u32 *sums;
567 573
568 tree = &BTRFS_I(inode)->io_tree; 574 tree = &BTRFS_I(inode)->io_tree;
@@ -577,6 +583,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
577 583
578 compressed_len = em->block_len; 584 compressed_len = em->block_len;
579 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); 585 cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
586 if (!cb)
587 goto out;
588
580 atomic_set(&cb->pending_bios, 0); 589 atomic_set(&cb->pending_bios, 0);
581 cb->errors = 0; 590 cb->errors = 0;
582 cb->inode = inode; 591 cb->inode = inode;
@@ -597,13 +606,18 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
597 606
598 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 607 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
599 PAGE_CACHE_SIZE; 608 PAGE_CACHE_SIZE;
600 cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, 609 cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
601 GFP_NOFS); 610 GFP_NOFS);
611 if (!cb->compressed_pages)
612 goto fail1;
613
602 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; 614 bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
603 615
604 for (page_index = 0; page_index < nr_pages; page_index++) { 616 for (page_index = 0; page_index < nr_pages; page_index++) {
605 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | 617 cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
606 __GFP_HIGHMEM); 618 __GFP_HIGHMEM);
619 if (!cb->compressed_pages[page_index])
620 goto fail2;
607 } 621 }
608 cb->nr_pages = nr_pages; 622 cb->nr_pages = nr_pages;
609 623
@@ -614,6 +628,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
614 cb->len = uncompressed_len; 628 cb->len = uncompressed_len;
615 629
616 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); 630 comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
631 if (!comp_bio)
632 goto fail2;
617 comp_bio->bi_private = cb; 633 comp_bio->bi_private = cb;
618 comp_bio->bi_end_io = end_compressed_bio_read; 634 comp_bio->bi_end_io = end_compressed_bio_read;
619 atomic_inc(&cb->pending_bios); 635 atomic_inc(&cb->pending_bios);
@@ -647,8 +663,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
647 atomic_inc(&cb->pending_bios); 663 atomic_inc(&cb->pending_bios);
648 664
649 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 665 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
650 btrfs_lookup_bio_sums(root, inode, comp_bio, 666 ret = btrfs_lookup_bio_sums(root, inode,
651 sums); 667 comp_bio, sums);
668 BUG_ON(ret);
652 } 669 }
653 sums += (comp_bio->bi_size + root->sectorsize - 1) / 670 sums += (comp_bio->bi_size + root->sectorsize - 1) /
654 root->sectorsize; 671 root->sectorsize;
@@ -673,14 +690,27 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
673 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); 690 ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
674 BUG_ON(ret); 691 BUG_ON(ret);
675 692
676 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) 693 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
677 btrfs_lookup_bio_sums(root, inode, comp_bio, sums); 694 ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
695 BUG_ON(ret);
696 }
678 697
679 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); 698 ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
680 BUG_ON(ret); 699 BUG_ON(ret);
681 700
682 bio_put(comp_bio); 701 bio_put(comp_bio);
683 return 0; 702 return 0;
703
704fail2:
705 for (page_index = 0; page_index < nr_pages; page_index++)
706 free_page((unsigned long)cb->compressed_pages[page_index]);
707
708 kfree(cb->compressed_pages);
709fail1:
710 kfree(cb);
711out:
712 free_extent_map(em);
713 return ret;
684} 714}
685 715
686static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES]; 716static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
@@ -900,7 +930,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
900 return ret; 930 return ret;
901} 931}
902 932
903void __exit btrfs_exit_compress(void) 933void btrfs_exit_compress(void)
904{ 934{
905 free_workspaces(); 935 free_workspaces();
906} 936}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index b5baff0dccfe..84d7ca1fe0ba 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -147,10 +147,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
147struct extent_buffer *btrfs_root_node(struct btrfs_root *root) 147struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
148{ 148{
149 struct extent_buffer *eb; 149 struct extent_buffer *eb;
150 spin_lock(&root->node_lock); 150
151 eb = root->node; 151 rcu_read_lock();
152 eb = rcu_dereference(root->node);
152 extent_buffer_get(eb); 153 extent_buffer_get(eb);
153 spin_unlock(&root->node_lock); 154 rcu_read_unlock();
154 return eb; 155 return eb;
155} 156}
156 157
@@ -165,14 +166,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
165 while (1) { 166 while (1) {
166 eb = btrfs_root_node(root); 167 eb = btrfs_root_node(root);
167 btrfs_tree_lock(eb); 168 btrfs_tree_lock(eb);
168 169 if (eb == root->node)
169 spin_lock(&root->node_lock);
170 if (eb == root->node) {
171 spin_unlock(&root->node_lock);
172 break; 170 break;
173 }
174 spin_unlock(&root->node_lock);
175
176 btrfs_tree_unlock(eb); 171 btrfs_tree_unlock(eb);
177 free_extent_buffer(eb); 172 free_extent_buffer(eb);
178 } 173 }
@@ -458,10 +453,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
458 else 453 else
459 parent_start = 0; 454 parent_start = 0;
460 455
461 spin_lock(&root->node_lock);
462 root->node = cow;
463 extent_buffer_get(cow); 456 extent_buffer_get(cow);
464 spin_unlock(&root->node_lock); 457 rcu_assign_pointer(root->node, cow);
465 458
466 btrfs_free_tree_block(trans, root, buf, parent_start, 459 btrfs_free_tree_block(trans, root, buf, parent_start,
467 last_ref); 460 last_ref);
@@ -542,6 +535,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
542 535
543 ret = __btrfs_cow_block(trans, root, buf, parent, 536 ret = __btrfs_cow_block(trans, root, buf, parent,
544 parent_slot, cow_ret, search_start, 0); 537 parent_slot, cow_ret, search_start, 0);
538
539 trace_btrfs_cow_block(root, buf, *cow_ret);
540
545 return ret; 541 return ret;
546} 542}
547 543
@@ -686,6 +682,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
686 if (!cur) { 682 if (!cur) {
687 cur = read_tree_block(root, blocknr, 683 cur = read_tree_block(root, blocknr,
688 blocksize, gen); 684 blocksize, gen);
685 if (!cur)
686 return -EIO;
689 } else if (!uptodate) { 687 } else if (!uptodate) {
690 btrfs_read_buffer(cur, gen); 688 btrfs_read_buffer(cur, gen);
691 } 689 }
@@ -732,122 +730,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
732 return btrfs_item_offset_nr(leaf, nr - 1); 730 return btrfs_item_offset_nr(leaf, nr - 1);
733} 731}
734 732
735/*
736 * extra debugging checks to make sure all the items in a key are
737 * well formed and in the proper order
738 */
739static int check_node(struct btrfs_root *root, struct btrfs_path *path,
740 int level)
741{
742 struct extent_buffer *parent = NULL;
743 struct extent_buffer *node = path->nodes[level];
744 struct btrfs_disk_key parent_key;
745 struct btrfs_disk_key node_key;
746 int parent_slot;
747 int slot;
748 struct btrfs_key cpukey;
749 u32 nritems = btrfs_header_nritems(node);
750
751 if (path->nodes[level + 1])
752 parent = path->nodes[level + 1];
753
754 slot = path->slots[level];
755 BUG_ON(nritems == 0);
756 if (parent) {
757 parent_slot = path->slots[level + 1];
758 btrfs_node_key(parent, &parent_key, parent_slot);
759 btrfs_node_key(node, &node_key, 0);
760 BUG_ON(memcmp(&parent_key, &node_key,
761 sizeof(struct btrfs_disk_key)));
762 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
763 btrfs_header_bytenr(node));
764 }
765 BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
766 if (slot != 0) {
767 btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
768 btrfs_node_key(node, &node_key, slot);
769 BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
770 }
771 if (slot < nritems - 1) {
772 btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
773 btrfs_node_key(node, &node_key, slot);
774 BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
775 }
776 return 0;
777}
778
779/*
780 * extra checking to make sure all the items in a leaf are
781 * well formed and in the proper order
782 */
783static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
784 int level)
785{
786 struct extent_buffer *leaf = path->nodes[level];
787 struct extent_buffer *parent = NULL;
788 int parent_slot;
789 struct btrfs_key cpukey;
790 struct btrfs_disk_key parent_key;
791 struct btrfs_disk_key leaf_key;
792 int slot = path->slots[0];
793
794 u32 nritems = btrfs_header_nritems(leaf);
795
796 if (path->nodes[level + 1])
797 parent = path->nodes[level + 1];
798
799 if (nritems == 0)
800 return 0;
801
802 if (parent) {
803 parent_slot = path->slots[level + 1];
804 btrfs_node_key(parent, &parent_key, parent_slot);
805 btrfs_item_key(leaf, &leaf_key, 0);
806
807 BUG_ON(memcmp(&parent_key, &leaf_key,
808 sizeof(struct btrfs_disk_key)));
809 BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
810 btrfs_header_bytenr(leaf));
811 }
812 if (slot != 0 && slot < nritems - 1) {
813 btrfs_item_key(leaf, &leaf_key, slot);
814 btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
815 if (comp_keys(&leaf_key, &cpukey) <= 0) {
816 btrfs_print_leaf(root, leaf);
817 printk(KERN_CRIT "slot %d offset bad key\n", slot);
818 BUG_ON(1);
819 }
820 if (btrfs_item_offset_nr(leaf, slot - 1) !=
821 btrfs_item_end_nr(leaf, slot)) {
822 btrfs_print_leaf(root, leaf);
823 printk(KERN_CRIT "slot %d offset bad\n", slot);
824 BUG_ON(1);
825 }
826 }
827 if (slot < nritems - 1) {
828 btrfs_item_key(leaf, &leaf_key, slot);
829 btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
830 BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
831 if (btrfs_item_offset_nr(leaf, slot) !=
832 btrfs_item_end_nr(leaf, slot + 1)) {
833 btrfs_print_leaf(root, leaf);
834 printk(KERN_CRIT "slot %d offset bad\n", slot);
835 BUG_ON(1);
836 }
837 }
838 BUG_ON(btrfs_item_offset_nr(leaf, 0) +
839 btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
840 return 0;
841}
842
843static noinline int check_block(struct btrfs_root *root,
844 struct btrfs_path *path, int level)
845{
846 return 0;
847 if (level == 0)
848 return check_leaf(root, path, level);
849 return check_node(root, path, level);
850}
851 733
852/* 734/*
853 * search for key in the extent_buffer. The items start at offset p, 735 * search for key in the extent_buffer. The items start at offset p,
@@ -1046,9 +928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1046 goto enospc; 928 goto enospc;
1047 } 929 }
1048 930
1049 spin_lock(&root->node_lock); 931 rcu_assign_pointer(root->node, child);
1050 root->node = child;
1051 spin_unlock(&root->node_lock);
1052 932
1053 add_root_to_dirty_list(root); 933 add_root_to_dirty_list(root);
1054 btrfs_tree_unlock(child); 934 btrfs_tree_unlock(child);
@@ -1188,7 +1068,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1188 } 1068 }
1189 } 1069 }
1190 /* double check we haven't messed things up */ 1070 /* double check we haven't messed things up */
1191 check_block(root, path, level);
1192 if (orig_ptr != 1071 if (orig_ptr !=
1193 btrfs_node_blockptr(path->nodes[level], path->slots[level])) 1072 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1194 BUG(); 1073 BUG();
@@ -1798,12 +1677,6 @@ cow_done:
1798 if (!cow) 1677 if (!cow)
1799 btrfs_unlock_up_safe(p, level + 1); 1678 btrfs_unlock_up_safe(p, level + 1);
1800 1679
1801 ret = check_block(root, p, level);
1802 if (ret) {
1803 ret = -1;
1804 goto done;
1805 }
1806
1807 ret = bin_search(b, key, level, &slot); 1680 ret = bin_search(b, key, level, &slot);
1808 1681
1809 if (level != 0) { 1682 if (level != 0) {
@@ -2130,10 +2003,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2130 2003
2131 btrfs_mark_buffer_dirty(c); 2004 btrfs_mark_buffer_dirty(c);
2132 2005
2133 spin_lock(&root->node_lock);
2134 old = root->node; 2006 old = root->node;
2135 root->node = c; 2007 rcu_assign_pointer(root->node, c);
2136 spin_unlock(&root->node_lock);
2137 2008
2138 /* the super has an extra ref to root->node */ 2009 /* the super has an extra ref to root->node */
2139 free_extent_buffer(old); 2010 free_extent_buffer(old);
@@ -3840,7 +3711,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3840 unsigned long ptr; 3711 unsigned long ptr;
3841 3712
3842 path = btrfs_alloc_path(); 3713 path = btrfs_alloc_path();
3843 BUG_ON(!path); 3714 if (!path)
3715 return -ENOMEM;
3844 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 3716 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3845 if (!ret) { 3717 if (!ret) {
3846 leaf = path->nodes[0]; 3718 leaf = path->nodes[0];
@@ -4217,6 +4089,7 @@ find_next_key:
4217 } 4089 }
4218 btrfs_set_path_blocking(path); 4090 btrfs_set_path_blocking(path);
4219 cur = read_node_slot(root, cur, slot); 4091 cur = read_node_slot(root, cur, slot);
4092 BUG_ON(!cur);
4220 4093
4221 btrfs_tree_lock(cur); 4094 btrfs_tree_lock(cur);
4222 4095
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2c98b3af6052..2e61fe1b6b8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -28,6 +28,7 @@
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h>
31#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
32#include "extent_io.h" 33#include "extent_io.h"
33#include "extent_map.h" 34#include "extent_map.h"
@@ -40,6 +41,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
40extern struct kmem_cache *btrfs_transaction_cachep; 41extern struct kmem_cache *btrfs_transaction_cachep;
41extern struct kmem_cache *btrfs_bit_radix_cachep; 42extern struct kmem_cache *btrfs_bit_radix_cachep;
42extern struct kmem_cache *btrfs_path_cachep; 43extern struct kmem_cache *btrfs_path_cachep;
44extern struct kmem_cache *btrfs_free_space_cachep;
43struct btrfs_ordered_sum; 45struct btrfs_ordered_sum;
44 46
45#define BTRFS_MAGIC "_BHRfS_M" 47#define BTRFS_MAGIC "_BHRfS_M"
@@ -729,8 +731,19 @@ struct btrfs_space_info {
729 u64 disk_total; /* total bytes on disk, takes mirrors into 731 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */ 732 account */
731 733
732 int full; /* indicates that we cannot allocate any more 734 /*
735 * we bump reservation progress every time we decrement
736 * bytes_reserved. This way people waiting for reservations
737 * know something good has happened and they can check
738 * for progress. The number here isn't to be trusted, it
739 * just shows reclaim activity
740 */
741 unsigned long reservation_progress;
742
743 int full:1; /* indicates that we cannot allocate any more
733 chunks for this space */ 744 chunks for this space */
745 int chunk_alloc:1; /* set if we are allocating a chunk */
746
734 int force_alloc; /* set if we need to force a chunk alloc for 747 int force_alloc; /* set if we need to force a chunk alloc for
735 this space */ 748 this space */
736 749
@@ -773,9 +786,6 @@ struct btrfs_free_cluster {
773 /* first extent starting offset */ 786 /* first extent starting offset */
774 u64 window_start; 787 u64 window_start;
775 788
776 /* if this cluster simply points at a bitmap in the block group */
777 bool points_to_bitmap;
778
779 struct btrfs_block_group_cache *block_group; 789 struct btrfs_block_group_cache *block_group;
780 /* 790 /*
781 * when a cluster is allocated from a block group, we put the 791 * when a cluster is allocated from a block group, we put the
@@ -1254,6 +1264,7 @@ struct btrfs_root {
1254#define BTRFS_MOUNT_SPACE_CACHE (1 << 12) 1264#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1255#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) 1265#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1256#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14) 1266#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1267#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1257 1268
1258#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1269#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1259#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1270#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1273,6 +1284,9 @@ struct btrfs_root {
1273#define BTRFS_INODE_NODUMP (1 << 8) 1284#define BTRFS_INODE_NODUMP (1 << 8)
1274#define BTRFS_INODE_NOATIME (1 << 9) 1285#define BTRFS_INODE_NOATIME (1 << 9)
1275#define BTRFS_INODE_DIRSYNC (1 << 10) 1286#define BTRFS_INODE_DIRSYNC (1 << 10)
1287#define BTRFS_INODE_COMPRESS (1 << 11)
1288
1289#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
1276 1290
1277/* some macros to generate set/get funcs for the struct fields. This 1291/* some macros to generate set/get funcs for the struct fields. This
1278 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1292 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
@@ -2147,6 +2161,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2147 u64 root_objectid, u64 owner, u64 offset); 2161 u64 root_objectid, u64 owner, u64 offset);
2148 2162
2149int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2163int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2164int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
2165 u64 num_bytes, int reserve, int sinfo);
2150int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2166int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2151 struct btrfs_root *root); 2167 struct btrfs_root *root);
2152int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2168int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2217,8 +2233,12 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2217int btrfs_error_unpin_extent_range(struct btrfs_root *root, 2233int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end); 2234 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 2235int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes); 2236 u64 num_bytes, u64 *actual_bytes);
2237int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2238 struct btrfs_root *root, u64 type);
2239int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2221 2240
2241int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2222/* ctree.c */ 2242/* ctree.c */
2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2243int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2224 int level, int *slot); 2244 int level, int *slot);
@@ -2343,6 +2363,8 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2343int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 2363int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2344int btrfs_set_root_node(struct btrfs_root_item *item, 2364int btrfs_set_root_node(struct btrfs_root_item *item,
2345 struct extent_buffer *node); 2365 struct extent_buffer *node);
2366void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
2367
2346/* dir-item.c */ 2368/* dir-item.c */
2347int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2369int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2348 struct btrfs_root *root, const char *name, 2370 struct btrfs_root *root, const char *name,
@@ -2380,6 +2402,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
2380 struct btrfs_path *path, u64 dir, 2402 struct btrfs_path *path, u64 dir,
2381 const char *name, u16 name_len, 2403 const char *name, u16 name_len,
2382 int mod); 2404 int mod);
2405int verify_dir_item(struct btrfs_root *root,
2406 struct extent_buffer *leaf,
2407 struct btrfs_dir_item *dir_item);
2383 2408
2384/* orphan.c */ 2409/* orphan.c */
2385int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, 2410int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2516,7 +2541,7 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2516 struct inode *inode); 2541 struct inode *inode);
2517int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2542int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2518int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2543int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2519void btrfs_orphan_cleanup(struct btrfs_root *root); 2544int btrfs_orphan_cleanup(struct btrfs_root *root);
2520void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, 2545void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2521 struct btrfs_pending_snapshot *pending, 2546 struct btrfs_pending_snapshot *pending,
2522 u64 *bytes_to_reserve); 2547 u64 *bytes_to_reserve);
@@ -2524,7 +2549,7 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2524 struct btrfs_pending_snapshot *pending); 2549 struct btrfs_pending_snapshot *pending);
2525void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2550void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2526 struct btrfs_root *root); 2551 struct btrfs_root *root);
2527int btrfs_cont_expand(struct inode *inode, loff_t size); 2552int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
2528int btrfs_invalidate_inodes(struct btrfs_root *root); 2553int btrfs_invalidate_inodes(struct btrfs_root *root);
2529void btrfs_add_delayed_iput(struct inode *inode); 2554void btrfs_add_delayed_iput(struct inode *inode);
2530void btrfs_run_delayed_iputs(struct btrfs_root *root); 2555void btrfs_run_delayed_iputs(struct btrfs_root *root);
@@ -2553,6 +2578,11 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
2553int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2578int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2554 struct inode *inode, u64 start, u64 end); 2579 struct inode *inode, u64 start, u64 end);
2555int btrfs_release_file(struct inode *inode, struct file *file); 2580int btrfs_release_file(struct inode *inode, struct file *file);
2581void btrfs_drop_pages(struct page **pages, size_t num_pages);
2582int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
2583 struct page **pages, size_t num_pages,
2584 loff_t pos, size_t write_bytes,
2585 struct extent_state **cached);
2556 2586
2557/* tree-defrag.c */ 2587/* tree-defrag.c */
2558int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 2588int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..bce28f653899 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -483,6 +483,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
483 INIT_LIST_HEAD(&head_ref->cluster); 483 INIT_LIST_HEAD(&head_ref->cluster);
484 mutex_init(&head_ref->mutex); 484 mutex_init(&head_ref->mutex);
485 485
486 trace_btrfs_delayed_ref_head(ref, head_ref, action);
487
486 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 488 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
487 489
488 if (existing) { 490 if (existing) {
@@ -537,6 +539,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
537 } 539 }
538 full_ref->level = level; 540 full_ref->level = level;
539 541
542 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
543
540 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 544 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
541 545
542 if (existing) { 546 if (existing) {
@@ -591,6 +595,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
591 full_ref->objectid = owner; 595 full_ref->objectid = owner;
592 full_ref->offset = offset; 596 full_ref->offset = offset;
593 597
598 trace_btrfs_delayed_data_ref(ref, full_ref, action);
599
594 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 600 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
595 601
596 if (existing) { 602 if (existing) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index f0cad5ae5be7..c62f02f6ae69 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -151,7 +151,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
151 ret = PTR_ERR(dir_item); 151 ret = PTR_ERR(dir_item);
152 if (ret == -EEXIST) 152 if (ret == -EEXIST)
153 goto second_insert; 153 goto second_insert;
154 goto out; 154 goto out_free;
155 } 155 }
156 156
157 leaf = path->nodes[0]; 157 leaf = path->nodes[0];
@@ -170,7 +170,7 @@ second_insert:
170 /* FIXME, use some real flag for selecting the extra index */ 170 /* FIXME, use some real flag for selecting the extra index */
171 if (root == root->fs_info->tree_root) { 171 if (root == root->fs_info->tree_root) {
172 ret = 0; 172 ret = 0;
173 goto out; 173 goto out_free;
174 } 174 }
175 btrfs_release_path(root, path); 175 btrfs_release_path(root, path);
176 176
@@ -180,7 +180,7 @@ second_insert:
180 name, name_len); 180 name, name_len);
181 if (IS_ERR(dir_item)) { 181 if (IS_ERR(dir_item)) {
182 ret2 = PTR_ERR(dir_item); 182 ret2 = PTR_ERR(dir_item);
183 goto out; 183 goto out_free;
184 } 184 }
185 leaf = path->nodes[0]; 185 leaf = path->nodes[0];
186 btrfs_cpu_key_to_disk(&disk_key, location); 186 btrfs_cpu_key_to_disk(&disk_key, location);
@@ -192,7 +192,9 @@ second_insert:
192 name_ptr = (unsigned long)(dir_item + 1); 192 name_ptr = (unsigned long)(dir_item + 1);
193 write_extent_buffer(leaf, name, name_ptr, name_len); 193 write_extent_buffer(leaf, name, name_ptr, name_len);
194 btrfs_mark_buffer_dirty(leaf); 194 btrfs_mark_buffer_dirty(leaf);
195out: 195
196out_free:
197
196 btrfs_free_path(path); 198 btrfs_free_path(path);
197 if (ret) 199 if (ret)
198 return ret; 200 return ret;
@@ -377,6 +379,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
377 379
378 leaf = path->nodes[0]; 380 leaf = path->nodes[0];
379 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 381 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
382 if (verify_dir_item(root, leaf, dir_item))
383 return NULL;
384
380 total_len = btrfs_item_size_nr(leaf, path->slots[0]); 385 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
381 while (cur < total_len) { 386 while (cur < total_len) {
382 this_len = sizeof(*dir_item) + 387 this_len = sizeof(*dir_item) +
@@ -429,3 +434,35 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
429 } 434 }
430 return ret; 435 return ret;
431} 436}
437
438int verify_dir_item(struct btrfs_root *root,
439 struct extent_buffer *leaf,
440 struct btrfs_dir_item *dir_item)
441{
442 u16 namelen = BTRFS_NAME_LEN;
443 u8 type = btrfs_dir_type(leaf, dir_item);
444
445 if (type >= BTRFS_FT_MAX) {
446 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
447 (int)type);
448 return 1;
449 }
450
451 if (type == BTRFS_FT_XATTR)
452 namelen = XATTR_NAME_MAX;
453
454 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
455 printk(KERN_CRIT "btrfS: invalid dir item name len: %u\n",
456 (unsigned)btrfs_dir_data_len(leaf, dir_item));
457 return 1;
458 }
459
460 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
461 if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
462 printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
463 (unsigned)btrfs_dir_data_len(leaf, dir_item));
464 return 1;
465 }
466
467 return 0;
468}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b531c36455d8..68c84c8c24bd 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -29,6 +29,7 @@
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h> 31#include <linux/migrate.h>
32#include <asm/unaligned.h>
32#include "compat.h" 33#include "compat.h"
33#include "ctree.h" 34#include "ctree.h"
34#include "disk-io.h" 35#include "disk-io.h"
@@ -198,7 +199,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
198 199
199void btrfs_csum_final(u32 crc, char *result) 200void btrfs_csum_final(u32 crc, char *result)
200{ 201{
201 *(__le32 *)result = ~cpu_to_le32(crc); 202 put_unaligned_le32(~crc, result);
202} 203}
203 204
204/* 205/*
@@ -323,6 +324,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
323 int num_copies = 0; 324 int num_copies = 0;
324 int mirror_num = 0; 325 int mirror_num = 0;
325 326
327 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
326 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 328 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
327 while (1) { 329 while (1) {
328 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 330 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -331,6 +333,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
331 !verify_parent_transid(io_tree, eb, parent_transid)) 333 !verify_parent_transid(io_tree, eb, parent_transid))
332 return ret; 334 return ret;
333 335
336 /*
337 * This buffer's crc is fine, but its contents are corrupted, so
338 * there is no reason to read the other copies, they won't be
339 * any less wrong.
340 */
341 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
342 return ret;
343
334 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 344 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
335 eb->start, eb->len); 345 eb->start, eb->len);
336 if (num_copies == 1) 346 if (num_copies == 1)
@@ -359,10 +369,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
359 369
360 tree = &BTRFS_I(page->mapping->host)->io_tree; 370 tree = &BTRFS_I(page->mapping->host)->io_tree;
361 371
362 if (page->private == EXTENT_PAGE_PRIVATE) 372 if (page->private == EXTENT_PAGE_PRIVATE) {
373 WARN_ON(1);
363 goto out; 374 goto out;
364 if (!page->private) 375 }
376 if (!page->private) {
377 WARN_ON(1);
365 goto out; 378 goto out;
379 }
366 len = page->private >> 2; 380 len = page->private >> 2;
367 WARN_ON(len == 0); 381 WARN_ON(len == 0);
368 382
@@ -415,6 +429,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
415 return ret; 429 return ret;
416} 430}
417 431
432#define CORRUPT(reason, eb, root, slot) \
433 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu," \
434 "root=%llu, slot=%d\n", reason, \
435 (unsigned long long)btrfs_header_bytenr(eb), \
436 (unsigned long long)root->objectid, slot)
437
438static noinline int check_leaf(struct btrfs_root *root,
439 struct extent_buffer *leaf)
440{
441 struct btrfs_key key;
442 struct btrfs_key leaf_key;
443 u32 nritems = btrfs_header_nritems(leaf);
444 int slot;
445
446 if (nritems == 0)
447 return 0;
448
449 /* Check the 0 item */
450 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
451 BTRFS_LEAF_DATA_SIZE(root)) {
452 CORRUPT("invalid item offset size pair", leaf, root, 0);
453 return -EIO;
454 }
455
456 /*
457 * Check to make sure each items keys are in the correct order and their
458 * offsets make sense. We only have to loop through nritems-1 because
459 * we check the current slot against the next slot, which verifies the
460 * next slot's offset+size makes sense and that the current's slot
461 * offset is correct.
462 */
463 for (slot = 0; slot < nritems - 1; slot++) {
464 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
465 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
466
467 /* Make sure the keys are in the right order */
468 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
469 CORRUPT("bad key order", leaf, root, slot);
470 return -EIO;
471 }
472
473 /*
474 * Make sure the offset and ends are right, remember that the
475 * item data starts at the end of the leaf and grows towards the
476 * front.
477 */
478 if (btrfs_item_offset_nr(leaf, slot) !=
479 btrfs_item_end_nr(leaf, slot + 1)) {
480 CORRUPT("slot offset bad", leaf, root, slot);
481 return -EIO;
482 }
483
484 /*
485 * Check to make sure that we don't point outside of the leaf,
486 * just incase all the items are consistent to eachother, but
487 * all point outside of the leaf.
488 */
489 if (btrfs_item_end_nr(leaf, slot) >
490 BTRFS_LEAF_DATA_SIZE(root)) {
491 CORRUPT("slot end outside of leaf", leaf, root, slot);
492 return -EIO;
493 }
494 }
495
496 return 0;
497}
498
418#ifdef CONFIG_DEBUG_LOCK_ALLOC 499#ifdef CONFIG_DEBUG_LOCK_ALLOC
419void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) 500void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
420{ 501{
@@ -481,8 +562,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
481 btrfs_set_buffer_lockdep_class(eb, found_level); 562 btrfs_set_buffer_lockdep_class(eb, found_level);
482 563
483 ret = csum_tree_block(root, eb, 1); 564 ret = csum_tree_block(root, eb, 1);
484 if (ret) 565 if (ret) {
566 ret = -EIO;
567 goto err;
568 }
569
570 /*
571 * If this is a leaf block and it is corrupt, set the corrupt bit so
572 * that we don't try and read the other copies of this block, just
573 * return -EIO.
574 */
575 if (found_level == 0 && check_leaf(root, eb)) {
576 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
485 ret = -EIO; 577 ret = -EIO;
578 }
486 579
487 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 580 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
488 end = eb->start + end - 1; 581 end = eb->start + end - 1;
@@ -843,7 +936,6 @@ static const struct address_space_operations btree_aops = {
843 .writepages = btree_writepages, 936 .writepages = btree_writepages,
844 .releasepage = btree_releasepage, 937 .releasepage = btree_releasepage,
845 .invalidatepage = btree_invalidatepage, 938 .invalidatepage = btree_invalidatepage,
846 .sync_page = block_sync_page,
847#ifdef CONFIG_MIGRATION 939#ifdef CONFIG_MIGRATION
848 .migratepage = btree_migratepage, 940 .migratepage = btree_migratepage,
849#endif 941#endif
@@ -1156,7 +1248,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1156 root, fs_info, location->objectid); 1248 root, fs_info, location->objectid);
1157 1249
1158 path = btrfs_alloc_path(); 1250 path = btrfs_alloc_path();
1159 BUG_ON(!path); 1251 if (!path) {
1252 kfree(root);
1253 return ERR_PTR(-ENOMEM);
1254 }
1160 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1255 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1161 if (ret == 0) { 1256 if (ret == 0) {
1162 l = path->nodes[0]; 1257 l = path->nodes[0];
@@ -1180,8 +1275,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1180 root->commit_root = btrfs_root_node(root); 1275 root->commit_root = btrfs_root_node(root);
1181 BUG_ON(!root->node); 1276 BUG_ON(!root->node);
1182out: 1277out:
1183 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) 1278 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1184 root->ref_cows = 1; 1279 root->ref_cows = 1;
1280 btrfs_check_and_init_root_item(&root->root_item);
1281 }
1185 1282
1186 return root; 1283 return root;
1187} 1284}
@@ -1327,82 +1424,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1327} 1424}
1328 1425
1329/* 1426/*
1330 * this unplugs every device on the box, and it is only used when page
1331 * is null
1332 */
1333static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1334{
1335 struct btrfs_device *device;
1336 struct btrfs_fs_info *info;
1337
1338 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1339 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1340 if (!device->bdev)
1341 continue;
1342
1343 bdi = blk_get_backing_dev_info(device->bdev);
1344 if (bdi->unplug_io_fn)
1345 bdi->unplug_io_fn(bdi, page);
1346 }
1347}
1348
1349static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1350{
1351 struct inode *inode;
1352 struct extent_map_tree *em_tree;
1353 struct extent_map *em;
1354 struct address_space *mapping;
1355 u64 offset;
1356
1357 /* the generic O_DIRECT read code does this */
1358 if (1 || !page) {
1359 __unplug_io_fn(bdi, page);
1360 return;
1361 }
1362
1363 /*
1364 * page->mapping may change at any time. Get a consistent copy
1365 * and use that for everything below
1366 */
1367 smp_mb();
1368 mapping = page->mapping;
1369 if (!mapping)
1370 return;
1371
1372 inode = mapping->host;
1373
1374 /*
1375 * don't do the expensive searching for a small number of
1376 * devices
1377 */
1378 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1379 __unplug_io_fn(bdi, page);
1380 return;
1381 }
1382
1383 offset = page_offset(page);
1384
1385 em_tree = &BTRFS_I(inode)->extent_tree;
1386 read_lock(&em_tree->lock);
1387 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1388 read_unlock(&em_tree->lock);
1389 if (!em) {
1390 __unplug_io_fn(bdi, page);
1391 return;
1392 }
1393
1394 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1395 free_extent_map(em);
1396 __unplug_io_fn(bdi, page);
1397 return;
1398 }
1399 offset = offset - em->start;
1400 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1401 em->block_start + offset, page);
1402 free_extent_map(em);
1403}
1404
1405/*
1406 * If this fails, caller must call bdi_destroy() to get rid of the 1427 * If this fails, caller must call bdi_destroy() to get rid of the
1407 * bdi again. 1428 * bdi again.
1408 */ 1429 */
@@ -1416,8 +1437,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1416 return err; 1437 return err;
1417 1438
1418 bdi->ra_pages = default_backing_dev_info.ra_pages; 1439 bdi->ra_pages = default_backing_dev_info.ra_pages;
1419 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1420 bdi->unplug_io_data = info;
1421 bdi->congested_fn = btrfs_congested_fn; 1440 bdi->congested_fn = btrfs_congested_fn;
1422 bdi->congested_data = info; 1441 bdi->congested_data = info;
1423 return 0; 1442 return 0;
@@ -1550,6 +1569,7 @@ static int transaction_kthread(void *arg)
1550 spin_unlock(&root->fs_info->new_trans_lock); 1569 spin_unlock(&root->fs_info->new_trans_lock);
1551 1570
1552 trans = btrfs_join_transaction(root, 1); 1571 trans = btrfs_join_transaction(root, 1);
1572 BUG_ON(IS_ERR(trans));
1553 if (transid == trans->transid) { 1573 if (transid == trans->transid) {
1554 ret = btrfs_commit_transaction(trans, root); 1574 ret = btrfs_commit_transaction(trans, root);
1555 BUG_ON(ret); 1575 BUG_ON(ret);
@@ -1627,6 +1647,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 goto fail_bdi; 1647 goto fail_bdi;
1628 } 1648 }
1629 1649
1650 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
1651
1630 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1652 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1631 INIT_LIST_HEAD(&fs_info->trans_list); 1653 INIT_LIST_HEAD(&fs_info->trans_list);
1632 INIT_LIST_HEAD(&fs_info->dead_roots); 1654 INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1757,6 +1779,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1757 1779
1758 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 1780 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1759 1781
1782 /*
1783 * In the long term, we'll store the compression type in the super
1784 * block, and it'll be used for per file compression control.
1785 */
1786 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1787
1760 ret = btrfs_parse_options(tree_root, options); 1788 ret = btrfs_parse_options(tree_root, options);
1761 if (ret) { 1789 if (ret) {
1762 err = ret; 1790 err = ret;
@@ -1962,6 +1990,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1962 fs_info->metadata_alloc_profile = (u64)-1; 1990 fs_info->metadata_alloc_profile = (u64)-1;
1963 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1991 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1964 1992
1993 ret = btrfs_init_space_info(fs_info);
1994 if (ret) {
1995 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
1996 goto fail_block_groups;
1997 }
1998
1965 ret = btrfs_read_block_groups(extent_root); 1999 ret = btrfs_read_block_groups(extent_root);
1966 if (ret) { 2000 if (ret) {
1967 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 2001 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -2053,9 +2087,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
2053 2087
2054 if (!(sb->s_flags & MS_RDONLY)) { 2088 if (!(sb->s_flags & MS_RDONLY)) {
2055 down_read(&fs_info->cleanup_work_sem); 2089 down_read(&fs_info->cleanup_work_sem);
2056 btrfs_orphan_cleanup(fs_info->fs_root); 2090 err = btrfs_orphan_cleanup(fs_info->fs_root);
2057 btrfs_orphan_cleanup(fs_info->tree_root); 2091 if (!err)
2092 err = btrfs_orphan_cleanup(fs_info->tree_root);
2058 up_read(&fs_info->cleanup_work_sem); 2093 up_read(&fs_info->cleanup_work_sem);
2094 if (err) {
2095 close_ctree(tree_root);
2096 return ERR_PTR(err);
2097 }
2059 } 2098 }
2060 2099
2061 return tree_root; 2100 return tree_root;
@@ -2430,8 +2469,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2430 2469
2431 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2470 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2432 for (i = 0; i < ret; i++) { 2471 for (i = 0; i < ret; i++) {
2472 int err;
2473
2433 root_objectid = gang[i]->root_key.objectid; 2474 root_objectid = gang[i]->root_key.objectid;
2434 btrfs_orphan_cleanup(gang[i]); 2475 err = btrfs_orphan_cleanup(gang[i]);
2476 if (err)
2477 return err;
2435 } 2478 }
2436 root_objectid++; 2479 root_objectid++;
2437 } 2480 }
@@ -2453,10 +2496,14 @@ int btrfs_commit_super(struct btrfs_root *root)
2453 up_write(&root->fs_info->cleanup_work_sem); 2496 up_write(&root->fs_info->cleanup_work_sem);
2454 2497
2455 trans = btrfs_join_transaction(root, 1); 2498 trans = btrfs_join_transaction(root, 1);
2499 if (IS_ERR(trans))
2500 return PTR_ERR(trans);
2456 ret = btrfs_commit_transaction(trans, root); 2501 ret = btrfs_commit_transaction(trans, root);
2457 BUG_ON(ret); 2502 BUG_ON(ret);
2458 /* run commit again to drop the original snapshot */ 2503 /* run commit again to drop the original snapshot */
2459 trans = btrfs_join_transaction(root, 1); 2504 trans = btrfs_join_transaction(root, 1);
2505 if (IS_ERR(trans))
2506 return PTR_ERR(trans);
2460 btrfs_commit_transaction(trans, root); 2507 btrfs_commit_transaction(trans, root);
2461 ret = btrfs_write_and_wait_transaction(NULL, root); 2508 ret = btrfs_write_and_wait_transaction(NULL, root);
2462 BUG_ON(ret); 2509 BUG_ON(ret);
@@ -2484,7 +2531,7 @@ int close_ctree(struct btrfs_root *root)
2484 * ERROR state on disk. 2531 * ERROR state on disk.
2485 * 2532 *
2486 * 2. when btrfs flips readonly just in btrfs_commit_super, 2533 * 2. when btrfs flips readonly just in btrfs_commit_super,
2487 * and in such case, btrfs cannnot write sb via btrfs_commit_super, 2534 * and in such case, btrfs cannot write sb via btrfs_commit_super,
2488 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag, 2535 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2489 * btrfs will cleanup all FS resources first and write sb then. 2536 * btrfs will cleanup all FS resources first and write sb then.
2490 */ 2537 */
@@ -2554,6 +2601,8 @@ int close_ctree(struct btrfs_root *root)
2554 kfree(fs_info->chunk_root); 2601 kfree(fs_info->chunk_root);
2555 kfree(fs_info->dev_root); 2602 kfree(fs_info->dev_root);
2556 kfree(fs_info->csum_root); 2603 kfree(fs_info->csum_root);
2604 kfree(fs_info);
2605
2557 return 0; 2606 return 0;
2558} 2607}
2559 2608
@@ -2936,7 +2985,10 @@ static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2936 break; 2985 break;
2937 2986
2938 /* opt_discard */ 2987 /* opt_discard */
2939 ret = btrfs_error_discard_extent(root, start, end + 1 - start); 2988 if (btrfs_test_opt(root, DISCARD))
2989 ret = btrfs_error_discard_extent(root, start,
2990 end + 1 - start,
2991 NULL);
2940 2992
2941 clear_extent_dirty(unpin, start, end, GFP_NOFS); 2993 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2942 btrfs_error_unpin_extent_range(root, start, end); 2994 btrfs_error_unpin_extent_range(root, start, end);
@@ -3005,7 +3057,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3005 btrfs_destroy_pinned_extent(root, 3057 btrfs_destroy_pinned_extent(root,
3006 root->fs_info->pinned_extents); 3058 root->fs_info->pinned_extents);
3007 3059
3008 t->use_count = 0; 3060 atomic_set(&t->use_count, 0);
3009 list_del_init(&t->list); 3061 list_del_init(&t->list);
3010 memset(t, 0, sizeof(*t)); 3062 memset(t, 0, sizeof(*t));
3011 kmem_cache_free(btrfs_transaction_cachep, t); 3063 kmem_cache_free(btrfs_transaction_cachep, t);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 9786963b07e5..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
@@ -171,6 +175,8 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
171 int ret; 175 int ret;
172 176
173 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
174 180
175 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
176 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b55269340cec..31f33ba56fe8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
33#include "locking.h" 33#include "locking.h"
34#include "free-space-cache.h" 34#include "free-space-cache.h"
35 35
36/* control flags for do_chunk_alloc's force field
37 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
38 * if we really need one.
39 *
40 * CHUNK_ALLOC_FORCE means it must try to allocate one
41 *
42 * CHUNK_ALLOC_LIMITED means to only try and allocate one
43 * if we have very few chunks already allocated. This is
44 * used as part of the clustering code to help make sure
45 * we have a good pool of storage to cluster in, without
46 * filling the FS with empty chunks
47 *
48 */
49enum {
50 CHUNK_ALLOC_NO_FORCE = 0,
51 CHUNK_ALLOC_FORCE = 1,
52 CHUNK_ALLOC_LIMITED = 2,
53};
54
36static int update_block_group(struct btrfs_trans_handle *trans, 55static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 56 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc); 57 u64 bytenr, u64 num_bytes, int alloc);
39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve, int sinfo);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 58static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 59 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 60 u64 bytenr, u64 num_bytes, u64 parent,
@@ -320,11 +337,6 @@ static int caching_kthread(void *data)
320 if (!path) 337 if (!path)
321 return -ENOMEM; 338 return -ENOMEM;
322 339
323 exclude_super_stripes(extent_root, block_group);
324 spin_lock(&block_group->space_info->lock);
325 block_group->space_info->bytes_readonly += block_group->bytes_super;
326 spin_unlock(&block_group->space_info->lock);
327
328 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 340 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
329 341
330 /* 342 /*
@@ -447,7 +459,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
447 * allocate blocks for the tree root we can't do the fast caching since 459 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks. 460 * we likely hold important locks.
449 */ 461 */
450 if (!trans->transaction->in_commit && 462 if (trans && (!trans->transaction->in_commit) &&
451 (root && root != root->fs_info->tree_root)) { 463 (root && root != root->fs_info->tree_root)) {
452 spin_lock(&cache->lock); 464 spin_lock(&cache->lock);
453 if (cache->cached != BTRFS_CACHE_NO) { 465 if (cache->cached != BTRFS_CACHE_NO) {
@@ -467,14 +479,16 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
467 cache->cached = BTRFS_CACHE_NO; 479 cache->cached = BTRFS_CACHE_NO;
468 } 480 }
469 spin_unlock(&cache->lock); 481 spin_unlock(&cache->lock);
470 if (ret == 1) 482 if (ret == 1) {
483 free_excluded_extents(fs_info->extent_root, cache);
471 return 0; 484 return 0;
485 }
472 } 486 }
473 487
474 if (load_cache_only) 488 if (load_cache_only)
475 return 0; 489 return 0;
476 490
477 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 491 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
478 BUG_ON(!caching_ctl); 492 BUG_ON(!caching_ctl);
479 493
480 INIT_LIST_HEAD(&caching_ctl->list); 494 INIT_LIST_HEAD(&caching_ctl->list);
@@ -1743,39 +1757,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1743 return ret; 1757 return ret;
1744} 1758}
1745 1759
1746static void btrfs_issue_discard(struct block_device *bdev, 1760static int btrfs_issue_discard(struct block_device *bdev,
1747 u64 start, u64 len) 1761 u64 start, u64 len)
1748{ 1762{
1749 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); 1763 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1750} 1764}
1751 1765
1752static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1766static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1753 u64 num_bytes) 1767 u64 num_bytes, u64 *actual_bytes)
1754{ 1768{
1755 int ret; 1769 int ret;
1756 u64 map_length = num_bytes; 1770 u64 discarded_bytes = 0;
1757 struct btrfs_multi_bio *multi = NULL; 1771 struct btrfs_multi_bio *multi = NULL;
1758 1772
1759 if (!btrfs_test_opt(root, DISCARD))
1760 return 0;
1761 1773
1762 /* Tell the block device(s) that the sectors can be discarded */ 1774 /* Tell the block device(s) that the sectors can be discarded */
1763 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1775 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1764 bytenr, &map_length, &multi, 0); 1776 bytenr, &num_bytes, &multi, 0);
1765 if (!ret) { 1777 if (!ret) {
1766 struct btrfs_bio_stripe *stripe = multi->stripes; 1778 struct btrfs_bio_stripe *stripe = multi->stripes;
1767 int i; 1779 int i;
1768 1780
1769 if (map_length > num_bytes)
1770 map_length = num_bytes;
1771 1781
1772 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1782 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1773 btrfs_issue_discard(stripe->dev->bdev, 1783 ret = btrfs_issue_discard(stripe->dev->bdev,
1774 stripe->physical, 1784 stripe->physical,
1775 map_length); 1785 stripe->length);
1786 if (!ret)
1787 discarded_bytes += stripe->length;
1788 else if (ret != -EOPNOTSUPP)
1789 break;
1776 } 1790 }
1777 kfree(multi); 1791 kfree(multi);
1778 } 1792 }
1793 if (discarded_bytes && ret == -EOPNOTSUPP)
1794 ret = 0;
1795
1796 if (actual_bytes)
1797 *actual_bytes = discarded_bytes;
1798
1779 1799
1780 return ret; 1800 return ret;
1781} 1801}
@@ -3018,7 +3038,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3018 found->bytes_readonly = 0; 3038 found->bytes_readonly = 0;
3019 found->bytes_may_use = 0; 3039 found->bytes_may_use = 0;
3020 found->full = 0; 3040 found->full = 0;
3021 found->force_alloc = 0; 3041 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3042 found->chunk_alloc = 0;
3022 *space_info = found; 3043 *space_info = found;
3023 list_add_rcu(&found->list, &info->space_info); 3044 list_add_rcu(&found->list, &info->space_info);
3024 atomic_set(&found->caching_threads, 0); 3045 atomic_set(&found->caching_threads, 0);
@@ -3149,7 +3170,7 @@ again:
3149 if (!data_sinfo->full && alloc_chunk) { 3170 if (!data_sinfo->full && alloc_chunk) {
3150 u64 alloc_target; 3171 u64 alloc_target;
3151 3172
3152 data_sinfo->force_alloc = 1; 3173 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3153 spin_unlock(&data_sinfo->lock); 3174 spin_unlock(&data_sinfo->lock);
3154alloc: 3175alloc:
3155 alloc_target = btrfs_get_alloc_profile(root, 1); 3176 alloc_target = btrfs_get_alloc_profile(root, 1);
@@ -3159,7 +3180,8 @@ alloc:
3159 3180
3160 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3181 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3161 bytes + 2 * 1024 * 1024, 3182 bytes + 2 * 1024 * 1024,
3162 alloc_target, 0); 3183 alloc_target,
3184 CHUNK_ALLOC_NO_FORCE);
3163 btrfs_end_transaction(trans, root); 3185 btrfs_end_transaction(trans, root);
3164 if (ret < 0) { 3186 if (ret < 0) {
3165 if (ret != -ENOSPC) 3187 if (ret != -ENOSPC)
@@ -3238,31 +3260,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3238 rcu_read_lock(); 3260 rcu_read_lock();
3239 list_for_each_entry_rcu(found, head, list) { 3261 list_for_each_entry_rcu(found, head, list) {
3240 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3262 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3241 found->force_alloc = 1; 3263 found->force_alloc = CHUNK_ALLOC_FORCE;
3242 } 3264 }
3243 rcu_read_unlock(); 3265 rcu_read_unlock();
3244} 3266}
3245 3267
3246static int should_alloc_chunk(struct btrfs_root *root, 3268static int should_alloc_chunk(struct btrfs_root *root,
3247 struct btrfs_space_info *sinfo, u64 alloc_bytes) 3269 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3270 int force)
3248{ 3271{
3249 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3272 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3273 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3250 u64 thresh; 3274 u64 thresh;
3251 3275
3252 if (sinfo->bytes_used + sinfo->bytes_reserved + 3276 if (force == CHUNK_ALLOC_FORCE)
3253 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3277 return 1;
3278
3279 /*
3280 * in limited mode, we want to have some free space up to
3281 * about 1% of the FS size.
3282 */
3283 if (force == CHUNK_ALLOC_LIMITED) {
3284 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3285 thresh = max_t(u64, 64 * 1024 * 1024,
3286 div_factor_fine(thresh, 1));
3287
3288 if (num_bytes - num_allocated < thresh)
3289 return 1;
3290 }
3291
3292 /*
3293 * we have two similar checks here, one based on percentage
3294 * and once based on a hard number of 256MB. The idea
3295 * is that if we have a good amount of free
3296 * room, don't allocate a chunk. A good mount is
3297 * less than 80% utilized of the chunks we have allocated,
3298 * or more than 256MB free
3299 */
3300 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3254 return 0; 3301 return 0;
3255 3302
3256 if (sinfo->bytes_used + sinfo->bytes_reserved + 3303 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3257 alloc_bytes < div_factor(num_bytes, 8))
3258 return 0; 3304 return 0;
3259 3305
3260 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3306 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3307
3308 /* 256MB or 5% of the FS */
3261 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3309 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3262 3310
3263 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3)) 3311 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3264 return 0; 3312 return 0;
3265
3266 return 1; 3313 return 1;
3267} 3314}
3268 3315
@@ -3272,10 +3319,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3272{ 3319{
3273 struct btrfs_space_info *space_info; 3320 struct btrfs_space_info *space_info;
3274 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3321 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3322 int wait_for_alloc = 0;
3275 int ret = 0; 3323 int ret = 0;
3276 3324
3277 mutex_lock(&fs_info->chunk_mutex);
3278
3279 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3325 flags = btrfs_reduce_alloc_profile(extent_root, flags);
3280 3326
3281 space_info = __find_space_info(extent_root->fs_info, flags); 3327 space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3286,21 +3332,40 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3286 } 3332 }
3287 BUG_ON(!space_info); 3333 BUG_ON(!space_info);
3288 3334
3335again:
3289 spin_lock(&space_info->lock); 3336 spin_lock(&space_info->lock);
3290 if (space_info->force_alloc) 3337 if (space_info->force_alloc)
3291 force = 1; 3338 force = space_info->force_alloc;
3292 if (space_info->full) { 3339 if (space_info->full) {
3293 spin_unlock(&space_info->lock); 3340 spin_unlock(&space_info->lock);
3294 goto out; 3341 return 0;
3295 } 3342 }
3296 3343
3297 if (!force && !should_alloc_chunk(extent_root, space_info, 3344 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3298 alloc_bytes)) {
3299 spin_unlock(&space_info->lock); 3345 spin_unlock(&space_info->lock);
3300 goto out; 3346 return 0;
3347 } else if (space_info->chunk_alloc) {
3348 wait_for_alloc = 1;
3349 } else {
3350 space_info->chunk_alloc = 1;
3301 } 3351 }
3352
3302 spin_unlock(&space_info->lock); 3353 spin_unlock(&space_info->lock);
3303 3354
3355 mutex_lock(&fs_info->chunk_mutex);
3356
3357 /*
3358 * The chunk_mutex is held throughout the entirety of a chunk
3359 * allocation, so once we've acquired the chunk_mutex we know that the
3360 * other guy is done and we need to recheck and see if we should
3361 * allocate.
3362 */
3363 if (wait_for_alloc) {
3364 mutex_unlock(&fs_info->chunk_mutex);
3365 wait_for_alloc = 0;
3366 goto again;
3367 }
3368
3304 /* 3369 /*
3305 * If we have mixed data/metadata chunks we want to make sure we keep 3370 * If we have mixed data/metadata chunks we want to make sure we keep
3306 * allocating mixed chunks instead of individual chunks. 3371 * allocating mixed chunks instead of individual chunks.
@@ -3326,9 +3391,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3326 space_info->full = 1; 3391 space_info->full = 1;
3327 else 3392 else
3328 ret = 1; 3393 ret = 1;
3329 space_info->force_alloc = 0; 3394
3395 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3396 space_info->chunk_alloc = 0;
3330 spin_unlock(&space_info->lock); 3397 spin_unlock(&space_info->lock);
3331out:
3332 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3398 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3333 return ret; 3399 return ret;
3334} 3400}
@@ -3344,21 +3410,24 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3344 u64 reserved; 3410 u64 reserved;
3345 u64 max_reclaim; 3411 u64 max_reclaim;
3346 u64 reclaimed = 0; 3412 u64 reclaimed = 0;
3347 int pause = 1; 3413 long time_left;
3348 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3414 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3415 int loops = 0;
3416 unsigned long progress;
3349 3417
3350 block_rsv = &root->fs_info->delalloc_block_rsv; 3418 block_rsv = &root->fs_info->delalloc_block_rsv;
3351 space_info = block_rsv->space_info; 3419 space_info = block_rsv->space_info;
3352 3420
3353 smp_mb(); 3421 smp_mb();
3354 reserved = space_info->bytes_reserved; 3422 reserved = space_info->bytes_reserved;
3423 progress = space_info->reservation_progress;
3355 3424
3356 if (reserved == 0) 3425 if (reserved == 0)
3357 return 0; 3426 return 0;
3358 3427
3359 max_reclaim = min(reserved, to_reclaim); 3428 max_reclaim = min(reserved, to_reclaim);
3360 3429
3361 while (1) { 3430 while (loops < 1024) {
3362 /* have the flusher threads jump in and do some IO */ 3431 /* have the flusher threads jump in and do some IO */
3363 smp_mb(); 3432 smp_mb();
3364 nr_pages = min_t(unsigned long, nr_pages, 3433 nr_pages = min_t(unsigned long, nr_pages,
@@ -3371,17 +3440,31 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3371 reserved = space_info->bytes_reserved; 3440 reserved = space_info->bytes_reserved;
3372 spin_unlock(&space_info->lock); 3441 spin_unlock(&space_info->lock);
3373 3442
3443 loops++;
3444
3374 if (reserved == 0 || reclaimed >= max_reclaim) 3445 if (reserved == 0 || reclaimed >= max_reclaim)
3375 break; 3446 break;
3376 3447
3377 if (trans && trans->transaction->blocked) 3448 if (trans && trans->transaction->blocked)
3378 return -EAGAIN; 3449 return -EAGAIN;
3379 3450
3380 __set_current_state(TASK_INTERRUPTIBLE); 3451 time_left = schedule_timeout_interruptible(1);
3381 schedule_timeout(pause); 3452
3382 pause <<= 1; 3453 /* We were interrupted, exit */
3383 if (pause > HZ / 10) 3454 if (time_left)
3384 pause = HZ / 10; 3455 break;
3456
3457 /* we've kicked the IO a few times, if anything has been freed,
3458 * exit. There is no sense in looping here for a long time
3459 * when we really need to commit the transaction, or there are
3460 * just too many writers without enough free space
3461 */
3462
3463 if (loops > 3) {
3464 smp_mb();
3465 if (progress != space_info->reservation_progress)
3466 break;
3467 }
3385 3468
3386 } 3469 }
3387 return reclaimed >= to_reclaim; 3470 return reclaimed >= to_reclaim;
@@ -3588,10 +3671,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3588 3671
3589 if (num_bytes > 0) { 3672 if (num_bytes > 0) {
3590 if (dest) { 3673 if (dest) {
3591 block_rsv_add_bytes(dest, num_bytes, 0); 3674 spin_lock(&dest->lock);
3592 } else { 3675 if (!dest->full) {
3676 u64 bytes_to_add;
3677
3678 bytes_to_add = dest->size - dest->reserved;
3679 bytes_to_add = min(num_bytes, bytes_to_add);
3680 dest->reserved += bytes_to_add;
3681 if (dest->reserved >= dest->size)
3682 dest->full = 1;
3683 num_bytes -= bytes_to_add;
3684 }
3685 spin_unlock(&dest->lock);
3686 }
3687 if (num_bytes) {
3593 spin_lock(&space_info->lock); 3688 spin_lock(&space_info->lock);
3594 space_info->bytes_reserved -= num_bytes; 3689 space_info->bytes_reserved -= num_bytes;
3690 space_info->reservation_progress++;
3595 spin_unlock(&space_info->lock); 3691 spin_unlock(&space_info->lock);
3596 } 3692 }
3597 } 3693 }
@@ -3824,6 +3920,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3824 if (block_rsv->reserved >= block_rsv->size) { 3920 if (block_rsv->reserved >= block_rsv->size) {
3825 num_bytes = block_rsv->reserved - block_rsv->size; 3921 num_bytes = block_rsv->reserved - block_rsv->size;
3826 sinfo->bytes_reserved -= num_bytes; 3922 sinfo->bytes_reserved -= num_bytes;
3923 sinfo->reservation_progress++;
3827 block_rsv->reserved = block_rsv->size; 3924 block_rsv->reserved = block_rsv->size;
3828 block_rsv->full = 1; 3925 block_rsv->full = 1;
3829 } 3926 }
@@ -3968,6 +4065,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3968 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4065 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3969 u64 to_reserve; 4066 u64 to_reserve;
3970 int nr_extents; 4067 int nr_extents;
4068 int reserved_extents;
3971 int ret; 4069 int ret;
3972 4070
3973 if (btrfs_transaction_in_commit(root->fs_info)) 4071 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3975,26 +4073,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3975 4073
3976 num_bytes = ALIGN(num_bytes, root->sectorsize); 4074 num_bytes = ALIGN(num_bytes, root->sectorsize);
3977 4075
3978 spin_lock(&BTRFS_I(inode)->accounting_lock);
3979 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4076 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3980 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 4077 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3981 nr_extents -= BTRFS_I(inode)->reserved_extents; 4078
4079 if (nr_extents > reserved_extents) {
4080 nr_extents -= reserved_extents;
3982 to_reserve = calc_trans_metadata_size(root, nr_extents); 4081 to_reserve = calc_trans_metadata_size(root, nr_extents);
3983 } else { 4082 } else {
3984 nr_extents = 0; 4083 nr_extents = 0;
3985 to_reserve = 0; 4084 to_reserve = 0;
3986 } 4085 }
3987 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3988 4086
3989 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4087 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3990 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4088 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3991 if (ret) 4089 if (ret)
3992 return ret; 4090 return ret;
3993 4091
3994 spin_lock(&BTRFS_I(inode)->accounting_lock); 4092 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3995 BTRFS_I(inode)->reserved_extents += nr_extents;
3996 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 4093 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3997 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3998 4094
3999 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4095 block_rsv_add_bytes(block_rsv, to_reserve, 1);
4000 4096
@@ -4009,19 +4105,30 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4009 struct btrfs_root *root = BTRFS_I(inode)->root; 4105 struct btrfs_root *root = BTRFS_I(inode)->root;
4010 u64 to_free; 4106 u64 to_free;
4011 int nr_extents; 4107 int nr_extents;
4108 int reserved_extents;
4012 4109
4013 num_bytes = ALIGN(num_bytes, root->sectorsize); 4110 num_bytes = ALIGN(num_bytes, root->sectorsize);
4014 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4111 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4112 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4015 4113
4016 spin_lock(&BTRFS_I(inode)->accounting_lock); 4114 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4017 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4115 do {
4018 if (nr_extents < BTRFS_I(inode)->reserved_extents) { 4116 int old, new;
4019 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; 4117
4020 BTRFS_I(inode)->reserved_extents -= nr_extents; 4118 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4021 } else { 4119 if (nr_extents >= reserved_extents) {
4022 nr_extents = 0; 4120 nr_extents = 0;
4023 } 4121 break;
4024 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4122 }
4123 old = reserved_extents;
4124 nr_extents = reserved_extents - nr_extents;
4125 new = reserved_extents - nr_extents;
4126 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4127 reserved_extents, new);
4128 if (likely(old == reserved_extents))
4129 break;
4130 reserved_extents = old;
4131 } while (1);
4025 4132
4026 to_free = calc_csum_metadata_size(inode, num_bytes); 4133 to_free = calc_csum_metadata_size(inode, num_bytes);
4027 if (nr_extents > 0) 4134 if (nr_extents > 0)
@@ -4112,6 +4219,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4112 btrfs_set_block_group_used(&cache->item, old_val); 4219 btrfs_set_block_group_used(&cache->item, old_val);
4113 cache->reserved -= num_bytes; 4220 cache->reserved -= num_bytes;
4114 cache->space_info->bytes_reserved -= num_bytes; 4221 cache->space_info->bytes_reserved -= num_bytes;
4222 cache->space_info->reservation_progress++;
4115 cache->space_info->bytes_used += num_bytes; 4223 cache->space_info->bytes_used += num_bytes;
4116 cache->space_info->disk_used += num_bytes * factor; 4224 cache->space_info->disk_used += num_bytes * factor;
4117 spin_unlock(&cache->lock); 4225 spin_unlock(&cache->lock);
@@ -4163,6 +4271,7 @@ static int pin_down_extent(struct btrfs_root *root,
4163 if (reserved) { 4271 if (reserved) {
4164 cache->reserved -= num_bytes; 4272 cache->reserved -= num_bytes;
4165 cache->space_info->bytes_reserved -= num_bytes; 4273 cache->space_info->bytes_reserved -= num_bytes;
4274 cache->space_info->reservation_progress++;
4166 } 4275 }
4167 spin_unlock(&cache->lock); 4276 spin_unlock(&cache->lock);
4168 spin_unlock(&cache->space_info->lock); 4277 spin_unlock(&cache->space_info->lock);
@@ -4193,8 +4302,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
4193 * update size of reserved extents. this function may return -EAGAIN 4302 * update size of reserved extents. this function may return -EAGAIN
4194 * if 'reserve' is true or 'sinfo' is false. 4303 * if 'reserve' is true or 'sinfo' is false.
4195 */ 4304 */
4196static int update_reserved_bytes(struct btrfs_block_group_cache *cache, 4305int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4197 u64 num_bytes, int reserve, int sinfo) 4306 u64 num_bytes, int reserve, int sinfo)
4198{ 4307{
4199 int ret = 0; 4308 int ret = 0;
4200 if (sinfo) { 4309 if (sinfo) {
@@ -4213,6 +4322,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4213 space_info->bytes_readonly += num_bytes; 4322 space_info->bytes_readonly += num_bytes;
4214 cache->reserved -= num_bytes; 4323 cache->reserved -= num_bytes;
4215 space_info->bytes_reserved -= num_bytes; 4324 space_info->bytes_reserved -= num_bytes;
4325 space_info->reservation_progress++;
4216 } 4326 }
4217 spin_unlock(&cache->lock); 4327 spin_unlock(&cache->lock);
4218 spin_unlock(&space_info->lock); 4328 spin_unlock(&space_info->lock);
@@ -4332,7 +4442,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4332 if (ret) 4442 if (ret)
4333 break; 4443 break;
4334 4444
4335 ret = btrfs_discard_extent(root, start, end + 1 - start); 4445 if (btrfs_test_opt(root, DISCARD))
4446 ret = btrfs_discard_extent(root, start,
4447 end + 1 - start, NULL);
4336 4448
4337 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4449 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4338 unpin_extent_range(root, start, end); 4450 unpin_extent_range(root, start, end);
@@ -4673,10 +4785,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4673 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4785 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4674 4786
4675 btrfs_add_free_space(cache, buf->start, buf->len); 4787 btrfs_add_free_space(cache, buf->start, buf->len);
4676 ret = update_reserved_bytes(cache, buf->len, 0, 0); 4788 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4677 if (ret == -EAGAIN) { 4789 if (ret == -EAGAIN) {
4678 /* block group became read-only */ 4790 /* block group became read-only */
4679 update_reserved_bytes(cache, buf->len, 0, 1); 4791 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4680 goto out; 4792 goto out;
4681 } 4793 }
4682 4794
@@ -4691,6 +4803,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4691 if (ret) { 4803 if (ret) {
4692 spin_lock(&cache->space_info->lock); 4804 spin_lock(&cache->space_info->lock);
4693 cache->space_info->bytes_reserved -= buf->len; 4805 cache->space_info->bytes_reserved -= buf->len;
4806 cache->space_info->reservation_progress++;
4694 spin_unlock(&cache->space_info->lock); 4807 spin_unlock(&cache->space_info->lock);
4695 } 4808 }
4696 goto out; 4809 goto out;
@@ -4712,6 +4825,11 @@ pin:
4712 } 4825 }
4713 } 4826 }
4714out: 4827out:
4828 /*
4829 * Deleting the buffer, clear the corrupt flag since it doesn't matter
4830 * anymore.
4831 */
4832 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4715 btrfs_put_block_group(cache); 4833 btrfs_put_block_group(cache);
4716} 4834}
4717 4835
@@ -5159,7 +5277,7 @@ checks:
5159 search_start - offset); 5277 search_start - offset);
5160 BUG_ON(offset > search_start); 5278 BUG_ON(offset > search_start);
5161 5279
5162 ret = update_reserved_bytes(block_group, num_bytes, 1, 5280 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
5163 (data & BTRFS_BLOCK_GROUP_DATA)); 5281 (data & BTRFS_BLOCK_GROUP_DATA));
5164 if (ret == -EAGAIN) { 5282 if (ret == -EAGAIN) {
5165 btrfs_add_free_space(block_group, offset, num_bytes); 5283 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -5250,11 +5368,13 @@ loop:
5250 5368
5251 if (allowed_chunk_alloc) { 5369 if (allowed_chunk_alloc) {
5252 ret = do_chunk_alloc(trans, root, num_bytes + 5370 ret = do_chunk_alloc(trans, root, num_bytes +
5253 2 * 1024 * 1024, data, 1); 5371 2 * 1024 * 1024, data,
5372 CHUNK_ALLOC_LIMITED);
5254 allowed_chunk_alloc = 0; 5373 allowed_chunk_alloc = 0;
5255 done_chunk_alloc = 1; 5374 done_chunk_alloc = 1;
5256 } else if (!done_chunk_alloc) { 5375 } else if (!done_chunk_alloc &&
5257 space_info->force_alloc = 1; 5376 space_info->force_alloc == CHUNK_ALLOC_NO_FORCE) {
5377 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5258 } 5378 }
5259 5379
5260 if (loop < LOOP_NO_EMPTY_SIZE) { 5380 if (loop < LOOP_NO_EMPTY_SIZE) {
@@ -5340,7 +5460,8 @@ again:
5340 */ 5460 */
5341 if (empty_size || root->ref_cows) 5461 if (empty_size || root->ref_cows)
5342 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5462 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5343 num_bytes + 2 * 1024 * 1024, data, 0); 5463 num_bytes + 2 * 1024 * 1024, data,
5464 CHUNK_ALLOC_NO_FORCE);
5344 5465
5345 WARN_ON(num_bytes < root->sectorsize); 5466 WARN_ON(num_bytes < root->sectorsize);
5346 ret = find_free_extent(trans, root, num_bytes, empty_size, 5467 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5352,10 +5473,10 @@ again:
5352 num_bytes = num_bytes & ~(root->sectorsize - 1); 5473 num_bytes = num_bytes & ~(root->sectorsize - 1);
5353 num_bytes = max(num_bytes, min_alloc_size); 5474 num_bytes = max(num_bytes, min_alloc_size);
5354 do_chunk_alloc(trans, root->fs_info->extent_root, 5475 do_chunk_alloc(trans, root->fs_info->extent_root,
5355 num_bytes, data, 1); 5476 num_bytes, data, CHUNK_ALLOC_FORCE);
5356 goto again; 5477 goto again;
5357 } 5478 }
5358 if (ret == -ENOSPC) { 5479 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5359 struct btrfs_space_info *sinfo; 5480 struct btrfs_space_info *sinfo;
5360 5481
5361 sinfo = __find_space_info(root->fs_info, data); 5482 sinfo = __find_space_info(root->fs_info, data);
@@ -5365,6 +5486,8 @@ again:
5365 dump_space_info(sinfo, num_bytes, 1); 5486 dump_space_info(sinfo, num_bytes, 1);
5366 } 5487 }
5367 5488
5489 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5490
5368 return ret; 5491 return ret;
5369} 5492}
5370 5493
@@ -5380,12 +5503,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5380 return -ENOSPC; 5503 return -ENOSPC;
5381 } 5504 }
5382 5505
5383 ret = btrfs_discard_extent(root, start, len); 5506 if (btrfs_test_opt(root, DISCARD))
5507 ret = btrfs_discard_extent(root, start, len, NULL);
5384 5508
5385 btrfs_add_free_space(cache, start, len); 5509 btrfs_add_free_space(cache, start, len);
5386 update_reserved_bytes(cache, len, 0, 1); 5510 btrfs_update_reserved_bytes(cache, len, 0, 1);
5387 btrfs_put_block_group(cache); 5511 btrfs_put_block_group(cache);
5388 5512
5513 trace_btrfs_reserved_extent_free(root, start, len);
5514
5389 return ret; 5515 return ret;
5390} 5516}
5391 5517
@@ -5412,7 +5538,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5412 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5538 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5413 5539
5414 path = btrfs_alloc_path(); 5540 path = btrfs_alloc_path();
5415 BUG_ON(!path); 5541 if (!path)
5542 return -ENOMEM;
5416 5543
5417 path->leave_spinning = 1; 5544 path->leave_spinning = 1;
5418 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5545 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5582,7 +5709,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5582 put_caching_control(caching_ctl); 5709 put_caching_control(caching_ctl);
5583 } 5710 }
5584 5711
5585 ret = update_reserved_bytes(block_group, ins->offset, 1, 1); 5712 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5586 BUG_ON(ret); 5713 BUG_ON(ret);
5587 btrfs_put_block_group(block_group); 5714 btrfs_put_block_group(block_group);
5588 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5715 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5633,6 +5760,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5633 struct btrfs_root *root, u32 blocksize) 5760 struct btrfs_root *root, u32 blocksize)
5634{ 5761{
5635 struct btrfs_block_rsv *block_rsv; 5762 struct btrfs_block_rsv *block_rsv;
5763 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5636 int ret; 5764 int ret;
5637 5765
5638 block_rsv = get_block_rsv(trans, root); 5766 block_rsv = get_block_rsv(trans, root);
@@ -5640,14 +5768,39 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5640 if (block_rsv->size == 0) { 5768 if (block_rsv->size == 0) {
5641 ret = reserve_metadata_bytes(trans, root, block_rsv, 5769 ret = reserve_metadata_bytes(trans, root, block_rsv,
5642 blocksize, 0); 5770 blocksize, 0);
5643 if (ret) 5771 /*
5772 * If we couldn't reserve metadata bytes try and use some from
5773 * the global reserve.
5774 */
5775 if (ret && block_rsv != global_rsv) {
5776 ret = block_rsv_use_bytes(global_rsv, blocksize);
5777 if (!ret)
5778 return global_rsv;
5644 return ERR_PTR(ret); 5779 return ERR_PTR(ret);
5780 } else if (ret) {
5781 return ERR_PTR(ret);
5782 }
5645 return block_rsv; 5783 return block_rsv;
5646 } 5784 }
5647 5785
5648 ret = block_rsv_use_bytes(block_rsv, blocksize); 5786 ret = block_rsv_use_bytes(block_rsv, blocksize);
5649 if (!ret) 5787 if (!ret)
5650 return block_rsv; 5788 return block_rsv;
5789 if (ret) {
5790 WARN_ON(1);
5791 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5792 0);
5793 if (!ret) {
5794 spin_lock(&block_rsv->lock);
5795 block_rsv->size += blocksize;
5796 spin_unlock(&block_rsv->lock);
5797 return block_rsv;
5798 } else if (ret && block_rsv != global_rsv) {
5799 ret = block_rsv_use_bytes(global_rsv, blocksize);
5800 if (!ret)
5801 return global_rsv;
5802 }
5803 }
5651 5804
5652 return ERR_PTR(-ENOSPC); 5805 return ERR_PTR(-ENOSPC);
5653} 5806}
@@ -5989,6 +6142,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5989 if (reada && level == 1) 6142 if (reada && level == 1)
5990 reada_walk_down(trans, root, wc, path); 6143 reada_walk_down(trans, root, wc, path);
5991 next = read_tree_block(root, bytenr, blocksize, generation); 6144 next = read_tree_block(root, bytenr, blocksize, generation);
6145 if (!next)
6146 return -EIO;
5992 btrfs_tree_lock(next); 6147 btrfs_tree_lock(next);
5993 btrfs_set_lock_blocking(next); 6148 btrfs_set_lock_blocking(next);
5994 } 6149 }
@@ -6221,6 +6376,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6221 BUG_ON(!wc); 6376 BUG_ON(!wc);
6222 6377
6223 trans = btrfs_start_transaction(tree_root, 0); 6378 trans = btrfs_start_transaction(tree_root, 0);
6379 BUG_ON(IS_ERR(trans));
6380
6224 if (block_rsv) 6381 if (block_rsv)
6225 trans->block_rsv = block_rsv; 6382 trans->block_rsv = block_rsv;
6226 6383
@@ -6318,6 +6475,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6318 6475
6319 btrfs_end_transaction_throttle(trans, tree_root); 6476 btrfs_end_transaction_throttle(trans, tree_root);
6320 trans = btrfs_start_transaction(tree_root, 0); 6477 trans = btrfs_start_transaction(tree_root, 0);
6478 BUG_ON(IS_ERR(trans));
6321 if (block_rsv) 6479 if (block_rsv)
6322 trans->block_rsv = block_rsv; 6480 trans->block_rsv = block_rsv;
6323 } 6481 }
@@ -6377,10 +6535,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6377 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6535 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6378 6536
6379 path = btrfs_alloc_path(); 6537 path = btrfs_alloc_path();
6380 BUG_ON(!path); 6538 if (!path)
6539 return -ENOMEM;
6381 6540
6382 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6541 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6383 BUG_ON(!wc); 6542 if (!wc) {
6543 btrfs_free_path(path);
6544 return -ENOMEM;
6545 }
6384 6546
6385 btrfs_assert_tree_locked(parent); 6547 btrfs_assert_tree_locked(parent);
6386 parent_level = btrfs_header_level(parent); 6548 parent_level = btrfs_header_level(parent);
@@ -6446,6 +6608,8 @@ static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6446 int ret = 0; 6608 int ret = 0;
6447 6609
6448 ra = kzalloc(sizeof(*ra), GFP_NOFS); 6610 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6611 if (!ra)
6612 return -ENOMEM;
6449 6613
6450 mutex_lock(&inode->i_mutex); 6614 mutex_lock(&inode->i_mutex);
6451 first_index = start >> PAGE_CACHE_SHIFT; 6615 first_index = start >> PAGE_CACHE_SHIFT;
@@ -6531,7 +6695,7 @@ static noinline int relocate_data_extent(struct inode *reloc_inode,
6531 u64 end = start + extent_key->offset - 1; 6695 u64 end = start + extent_key->offset - 1;
6532 6696
6533 em = alloc_extent_map(GFP_NOFS); 6697 em = alloc_extent_map(GFP_NOFS);
6534 BUG_ON(!em || IS_ERR(em)); 6698 BUG_ON(!em);
6535 6699
6536 em->start = start; 6700 em->start = start;
6537 em->len = extent_key->offset; 6701 em->len = extent_key->offset;
@@ -6836,7 +7000,11 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6836 } 7000 }
6837 7001
6838 path = btrfs_alloc_path(); 7002 path = btrfs_alloc_path();
6839 BUG_ON(!path); 7003 if (!path) {
7004 if (exts != *extents)
7005 kfree(exts);
7006 return -ENOMEM;
7007 }
6840 7008
6841 cur_pos = extent_key->objectid - offset; 7009 cur_pos = extent_key->objectid - offset;
6842 last_byte = extent_key->objectid + extent_key->offset; 7010 last_byte = extent_key->objectid + extent_key->offset;
@@ -6878,6 +7046,10 @@ static noinline int get_new_locations(struct inode *reloc_inode,
6878 struct disk_extent *old = exts; 7046 struct disk_extent *old = exts;
6879 max *= 2; 7047 max *= 2;
6880 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); 7048 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
7049 if (!exts) {
7050 ret = -ENOMEM;
7051 goto out;
7052 }
6881 memcpy(exts, old, sizeof(*exts) * nr); 7053 memcpy(exts, old, sizeof(*exts) * nr);
6882 if (old != *extents) 7054 if (old != *extents)
6883 kfree(old); 7055 kfree(old);
@@ -7360,7 +7532,8 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7360 int ret; 7532 int ret;
7361 7533
7362 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); 7534 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7363 BUG_ON(!new_extent); 7535 if (!new_extent)
7536 return -ENOMEM;
7364 7537
7365 ref = btrfs_lookup_leaf_ref(root, leaf->start); 7538 ref = btrfs_lookup_leaf_ref(root, leaf->start);
7366 BUG_ON(!ref); 7539 BUG_ON(!ref);
@@ -7477,7 +7650,7 @@ int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7477 BUG_ON(reloc_root->commit_root != NULL); 7650 BUG_ON(reloc_root->commit_root != NULL);
7478 while (1) { 7651 while (1) {
7479 trans = btrfs_join_transaction(root, 1); 7652 trans = btrfs_join_transaction(root, 1);
7480 BUG_ON(!trans); 7653 BUG_ON(IS_ERR(trans));
7481 7654
7482 mutex_lock(&root->fs_info->drop_mutex); 7655 mutex_lock(&root->fs_info->drop_mutex);
7483 ret = btrfs_drop_snapshot(trans, reloc_root); 7656 ret = btrfs_drop_snapshot(trans, reloc_root);
@@ -7535,7 +7708,7 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7535 7708
7536 if (found) { 7709 if (found) {
7537 trans = btrfs_start_transaction(root, 1); 7710 trans = btrfs_start_transaction(root, 1);
7538 BUG_ON(!trans); 7711 BUG_ON(IS_ERR(trans));
7539 ret = btrfs_commit_transaction(trans, root); 7712 ret = btrfs_commit_transaction(trans, root);
7540 BUG_ON(ret); 7713 BUG_ON(ret);
7541 } 7714 }
@@ -7546,7 +7719,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7546 7719
7547 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 7720 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7548 BUG_ON(!reloc_root); 7721 BUG_ON(!reloc_root);
7549 btrfs_orphan_cleanup(reloc_root); 7722 ret = btrfs_orphan_cleanup(reloc_root);
7723 BUG_ON(ret);
7550 return 0; 7724 return 0;
7551} 7725}
7552 7726
@@ -7564,7 +7738,8 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7564 return 0; 7738 return 0;
7565 7739
7566 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 7740 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7567 BUG_ON(!root_item); 7741 if (!root_item)
7742 return -ENOMEM;
7568 7743
7569 ret = btrfs_copy_root(trans, root, root->commit_root, 7744 ret = btrfs_copy_root(trans, root, root->commit_root,
7570 &eb, BTRFS_TREE_RELOC_OBJECTID); 7745 &eb, BTRFS_TREE_RELOC_OBJECTID);
@@ -7590,7 +7765,7 @@ static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7590 7765
7591 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 7766 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7592 &root_key); 7767 &root_key);
7593 BUG_ON(!reloc_root); 7768 BUG_ON(IS_ERR(reloc_root));
7594 reloc_root->last_trans = trans->transid; 7769 reloc_root->last_trans = trans->transid;
7595 reloc_root->commit_root = NULL; 7770 reloc_root->commit_root = NULL;
7596 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; 7771 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
@@ -7779,7 +7954,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7779 7954
7780 7955
7781 trans = btrfs_start_transaction(extent_root, 1); 7956 trans = btrfs_start_transaction(extent_root, 1);
7782 BUG_ON(!trans); 7957 BUG_ON(IS_ERR(trans));
7783 7958
7784 if (extent_key->objectid == 0) { 7959 if (extent_key->objectid == 0) {
7785 ret = del_extent_zero(trans, extent_root, path, extent_key); 7960 ret = del_extent_zero(trans, extent_root, path, extent_key);
@@ -7843,6 +8018,10 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7843 8018
7844 eb = read_tree_block(found_root, block_start, 8019 eb = read_tree_block(found_root, block_start,
7845 block_size, 0); 8020 block_size, 0);
8021 if (!eb) {
8022 ret = -EIO;
8023 goto out;
8024 }
7846 btrfs_tree_lock(eb); 8025 btrfs_tree_lock(eb);
7847 BUG_ON(level != btrfs_header_level(eb)); 8026 BUG_ON(level != btrfs_header_level(eb));
7848 8027
@@ -7998,13 +8177,15 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7998 8177
7999 alloc_flags = update_block_group_flags(root, cache->flags); 8178 alloc_flags = update_block_group_flags(root, cache->flags);
8000 if (alloc_flags != cache->flags) 8179 if (alloc_flags != cache->flags)
8001 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8180 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8181 CHUNK_ALLOC_FORCE);
8002 8182
8003 ret = set_block_group_ro(cache); 8183 ret = set_block_group_ro(cache);
8004 if (!ret) 8184 if (!ret)
8005 goto out; 8185 goto out;
8006 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 8186 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8007 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 8187 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8188 CHUNK_ALLOC_FORCE);
8008 if (ret < 0) 8189 if (ret < 0)
8009 goto out; 8190 goto out;
8010 ret = set_block_group_ro(cache); 8191 ret = set_block_group_ro(cache);
@@ -8013,6 +8194,14 @@ out:
8013 return ret; 8194 return ret;
8014} 8195}
8015 8196
8197int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
8198 struct btrfs_root *root, u64 type)
8199{
8200 u64 alloc_flags = get_alloc_profile(root, type);
8201 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
8202 CHUNK_ALLOC_FORCE);
8203}
8204
8016/* 8205/*
8017 * helper to account the unused space of all the readonly block group in the 8206 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account. 8207 * list. takes mirrors into account.
@@ -8270,6 +8459,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
8270 if (block_group->cached == BTRFS_CACHE_STARTED) 8459 if (block_group->cached == BTRFS_CACHE_STARTED)
8271 wait_block_group_cache_done(block_group); 8460 wait_block_group_cache_done(block_group);
8272 8461
8462 /*
8463 * We haven't cached this block group, which means we could
8464 * possibly have excluded extents on this block group.
8465 */
8466 if (block_group->cached == BTRFS_CACHE_NO)
8467 free_excluded_extents(info->extent_root, block_group);
8468
8273 btrfs_remove_free_space_cache(block_group); 8469 btrfs_remove_free_space_cache(block_group);
8274 btrfs_put_block_group(block_group); 8470 btrfs_put_block_group(block_group);
8275 8471
@@ -8385,6 +8581,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8385 cache->sectorsize = root->sectorsize; 8581 cache->sectorsize = root->sectorsize;
8386 8582
8387 /* 8583 /*
8584 * We need to exclude the super stripes now so that the space
8585 * info has super bytes accounted for, otherwise we'll think
8586 * we have more space than we actually do.
8587 */
8588 exclude_super_stripes(root, cache);
8589
8590 /*
8388 * check for two cases, either we are full, and therefore 8591 * check for two cases, either we are full, and therefore
8389 * don't need to bother with the caching work since we won't 8592 * don't need to bother with the caching work since we won't
8390 * find any space, or we are empty, and we can just add all 8593 * find any space, or we are empty, and we can just add all
@@ -8392,12 +8595,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
8392 * time, particularly in the full case. 8595 * time, particularly in the full case.
8393 */ 8596 */
8394 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 8597 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8395 exclude_super_stripes(root, cache);
8396 cache->last_byte_to_unpin = (u64)-1; 8598 cache->last_byte_to_unpin = (u64)-1;
8397 cache->cached = BTRFS_CACHE_FINISHED; 8599 cache->cached = BTRFS_CACHE_FINISHED;
8398 free_excluded_extents(root, cache); 8600 free_excluded_extents(root, cache);
8399 } else if (btrfs_block_group_used(&cache->item) == 0) { 8601 } else if (btrfs_block_group_used(&cache->item) == 0) {
8400 exclude_super_stripes(root, cache);
8401 cache->last_byte_to_unpin = (u64)-1; 8602 cache->last_byte_to_unpin = (u64)-1;
8402 cache->cached = BTRFS_CACHE_FINISHED; 8603 cache->cached = BTRFS_CACHE_FINISHED;
8403 add_new_free_space(cache, root->fs_info, 8604 add_new_free_space(cache, root->fs_info,
@@ -8539,6 +8740,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8539 BUG_ON(!block_group); 8740 BUG_ON(!block_group);
8540 BUG_ON(!block_group->ro); 8741 BUG_ON(!block_group->ro);
8541 8742
8743 /*
8744 * Free the reserved super bytes from this block group before
8745 * remove it.
8746 */
8747 free_excluded_extents(root, block_group);
8748
8542 memcpy(&key, &block_group->key, sizeof(key)); 8749 memcpy(&key, &block_group->key, sizeof(key));
8543 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP | 8750 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8544 BTRFS_BLOCK_GROUP_RAID1 | 8751 BTRFS_BLOCK_GROUP_RAID1 |
@@ -8642,13 +8849,84 @@ out:
8642 return ret; 8849 return ret;
8643} 8850}
8644 8851
8852int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8853{
8854 struct btrfs_space_info *space_info;
8855 int ret;
8856
8857 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM, 0, 0,
8858 &space_info);
8859 if (ret)
8860 return ret;
8861
8862 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA, 0, 0,
8863 &space_info);
8864 if (ret)
8865 return ret;
8866
8867 ret = update_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA, 0, 0,
8868 &space_info);
8869 if (ret)
8870 return ret;
8871
8872 return ret;
8873}
8874
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) 8875int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{ 8876{
8647 return unpin_extent_range(root, start, end); 8877 return unpin_extent_range(root, start, end);
8648} 8878}
8649 8879
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, 8880int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes) 8881 u64 num_bytes, u64 *actual_bytes)
8652{ 8882{
8653 return btrfs_discard_extent(root, bytenr, num_bytes); 8883 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8884}
8885
8886int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8887{
8888 struct btrfs_fs_info *fs_info = root->fs_info;
8889 struct btrfs_block_group_cache *cache = NULL;
8890 u64 group_trimmed;
8891 u64 start;
8892 u64 end;
8893 u64 trimmed = 0;
8894 int ret = 0;
8895
8896 cache = btrfs_lookup_block_group(fs_info, range->start);
8897
8898 while (cache) {
8899 if (cache->key.objectid >= (range->start + range->len)) {
8900 btrfs_put_block_group(cache);
8901 break;
8902 }
8903
8904 start = max(range->start, cache->key.objectid);
8905 end = min(range->start + range->len,
8906 cache->key.objectid + cache->key.offset);
8907
8908 if (end - start >= range->minlen) {
8909 if (!block_group_cache_done(cache)) {
8910 ret = cache_block_group(cache, NULL, root, 0);
8911 if (!ret)
8912 wait_block_group_cache_done(cache);
8913 }
8914 ret = btrfs_trim_block_group(cache,
8915 &group_trimmed,
8916 start,
8917 end,
8918 range->minlen);
8919
8920 trimmed += group_trimmed;
8921 if (ret) {
8922 btrfs_put_block_group(cache);
8923 break;
8924 }
8925 }
8926
8927 cache = next_block_group(fs_info->tree_root, cache);
8928 }
8929
8930 range->len = trimmed;
8931 return ret;
8654} 8932}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e993cf1766e..315138605088 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -690,6 +690,15 @@ static void cache_state(struct extent_state *state,
690 } 690 }
691} 691}
692 692
693static void uncache_state(struct extent_state **cached_ptr)
694{
695 if (cached_ptr && (*cached_ptr)) {
696 struct extent_state *state = *cached_ptr;
697 *cached_ptr = NULL;
698 free_extent_state(state);
699 }
700}
701
693/* 702/*
694 * set some bits on a range in the tree. This may require allocations or 703 * set some bits on a range in the tree. This may require allocations or
695 * sleeping, so the gfp mask is used to indicate what is allowed. 704 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -940,10 +949,10 @@ static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
940} 949}
941 950
942int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 951int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
943 gfp_t mask) 952 struct extent_state **cached_state, gfp_t mask)
944{ 953{
945 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 954 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
946 NULL, mask); 955 NULL, cached_state, mask);
947} 956}
948 957
949static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 958static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1012,8 +1021,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1012 mask); 1021 mask);
1013} 1022}
1014 1023
1015int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1024int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1016 gfp_t mask)
1017{ 1025{
1018 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1026 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1019 mask); 1027 mask);
@@ -1433,12 +1441,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1433 */ 1441 */
1434u64 count_range_bits(struct extent_io_tree *tree, 1442u64 count_range_bits(struct extent_io_tree *tree,
1435 u64 *start, u64 search_end, u64 max_bytes, 1443 u64 *start, u64 search_end, u64 max_bytes,
1436 unsigned long bits) 1444 unsigned long bits, int contig)
1437{ 1445{
1438 struct rb_node *node; 1446 struct rb_node *node;
1439 struct extent_state *state; 1447 struct extent_state *state;
1440 u64 cur_start = *start; 1448 u64 cur_start = *start;
1441 u64 total_bytes = 0; 1449 u64 total_bytes = 0;
1450 u64 last = 0;
1442 int found = 0; 1451 int found = 0;
1443 1452
1444 if (search_end <= cur_start) { 1453 if (search_end <= cur_start) {
@@ -1463,7 +1472,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1463 state = rb_entry(node, struct extent_state, rb_node); 1472 state = rb_entry(node, struct extent_state, rb_node);
1464 if (state->start > search_end) 1473 if (state->start > search_end)
1465 break; 1474 break;
1466 if (state->end >= cur_start && (state->state & bits)) { 1475 if (contig && found && state->start > last + 1)
1476 break;
1477 if (state->end >= cur_start && (state->state & bits) == bits) {
1467 total_bytes += min(search_end, state->end) + 1 - 1478 total_bytes += min(search_end, state->end) + 1 -
1468 max(cur_start, state->start); 1479 max(cur_start, state->start);
1469 if (total_bytes >= max_bytes) 1480 if (total_bytes >= max_bytes)
@@ -1472,6 +1483,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
1472 *start = state->start; 1483 *start = state->start;
1473 found = 1; 1484 found = 1;
1474 } 1485 }
1486 last = state->end;
1487 } else if (contig && found) {
1488 break;
1475 } 1489 }
1476 node = rb_next(node); 1490 node = rb_next(node);
1477 if (!node) 1491 if (!node)
@@ -1729,6 +1743,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1729 1743
1730 do { 1744 do {
1731 struct page *page = bvec->bv_page; 1745 struct page *page = bvec->bv_page;
1746 struct extent_state *cached = NULL;
1747 struct extent_state *state;
1748
1732 tree = &BTRFS_I(page->mapping->host)->io_tree; 1749 tree = &BTRFS_I(page->mapping->host)->io_tree;
1733 1750
1734 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1751 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1743,9 +1760,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1743 if (++bvec <= bvec_end) 1760 if (++bvec <= bvec_end)
1744 prefetchw(&bvec->bv_page->flags); 1761 prefetchw(&bvec->bv_page->flags);
1745 1762
1763 spin_lock(&tree->lock);
1764 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1765 if (state && state->start == start) {
1766 /*
1767 * take a reference on the state, unlock will drop
1768 * the ref
1769 */
1770 cache_state(state, &cached);
1771 }
1772 spin_unlock(&tree->lock);
1773
1746 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1774 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1747 ret = tree->ops->readpage_end_io_hook(page, start, end, 1775 ret = tree->ops->readpage_end_io_hook(page, start, end,
1748 NULL); 1776 state);
1749 if (ret) 1777 if (ret)
1750 uptodate = 0; 1778 uptodate = 0;
1751 } 1779 }
@@ -1758,15 +1786,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1758 test_bit(BIO_UPTODATE, &bio->bi_flags); 1786 test_bit(BIO_UPTODATE, &bio->bi_flags);
1759 if (err) 1787 if (err)
1760 uptodate = 0; 1788 uptodate = 0;
1789 uncache_state(&cached);
1761 continue; 1790 continue;
1762 } 1791 }
1763 } 1792 }
1764 1793
1765 if (uptodate) { 1794 if (uptodate) {
1766 set_extent_uptodate(tree, start, end, 1795 set_extent_uptodate(tree, start, end, &cached,
1767 GFP_ATOMIC); 1796 GFP_ATOMIC);
1768 } 1797 }
1769 unlock_extent(tree, start, end, GFP_ATOMIC); 1798 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1770 1799
1771 if (whole_page) { 1800 if (whole_page) {
1772 if (uptodate) { 1801 if (uptodate) {
@@ -1805,6 +1834,7 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1805 1834
1806 do { 1835 do {
1807 struct page *page = bvec->bv_page; 1836 struct page *page = bvec->bv_page;
1837 struct extent_state *cached = NULL;
1808 tree = &BTRFS_I(page->mapping->host)->io_tree; 1838 tree = &BTRFS_I(page->mapping->host)->io_tree;
1809 1839
1810 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1840 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1815,13 +1845,14 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1815 prefetchw(&bvec->bv_page->flags); 1845 prefetchw(&bvec->bv_page->flags);
1816 1846
1817 if (uptodate) { 1847 if (uptodate) {
1818 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1848 set_extent_uptodate(tree, start, end, &cached,
1849 GFP_ATOMIC);
1819 } else { 1850 } else {
1820 ClearPageUptodate(page); 1851 ClearPageUptodate(page);
1821 SetPageError(page); 1852 SetPageError(page);
1822 } 1853 }
1823 1854
1824 unlock_extent(tree, start, end, GFP_ATOMIC); 1855 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1825 1856
1826 } while (bvec >= bio->bi_io_vec); 1857 } while (bvec >= bio->bi_io_vec);
1827 1858
@@ -1865,7 +1896,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1865 bio_get(bio); 1896 bio_get(bio);
1866 1897
1867 if (tree->ops && tree->ops->submit_bio_hook) 1898 if (tree->ops && tree->ops->submit_bio_hook)
1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1899 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1869 mirror_num, bio_flags, start); 1900 mirror_num, bio_flags, start);
1870 else 1901 else
1871 submit_bio(rw, bio); 1902 submit_bio(rw, bio);
@@ -1920,6 +1951,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1920 nr = bio_get_nr_vecs(bdev); 1951 nr = bio_get_nr_vecs(bdev);
1921 1952
1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1953 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1954 if (!bio)
1955 return -ENOMEM;
1923 1956
1924 bio_add_page(bio, page, page_size, offset); 1957 bio_add_page(bio, page, page_size, offset);
1925 bio->bi_end_io = end_io_func; 1958 bio->bi_end_io = end_io_func;
@@ -1944,6 +1977,7 @@ void set_page_extent_mapped(struct page *page)
1944 1977
1945static void set_page_extent_head(struct page *page, unsigned long len) 1978static void set_page_extent_head(struct page *page, unsigned long len)
1946{ 1979{
1980 WARN_ON(!PagePrivate(page));
1947 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1981 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1948} 1982}
1949 1983
@@ -2007,14 +2041,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2007 while (cur <= end) { 2041 while (cur <= end) {
2008 if (cur >= last_byte) { 2042 if (cur >= last_byte) {
2009 char *userpage; 2043 char *userpage;
2044 struct extent_state *cached = NULL;
2045
2010 iosize = PAGE_CACHE_SIZE - page_offset; 2046 iosize = PAGE_CACHE_SIZE - page_offset;
2011 userpage = kmap_atomic(page, KM_USER0); 2047 userpage = kmap_atomic(page, KM_USER0);
2012 memset(userpage + page_offset, 0, iosize); 2048 memset(userpage + page_offset, 0, iosize);
2013 flush_dcache_page(page); 2049 flush_dcache_page(page);
2014 kunmap_atomic(userpage, KM_USER0); 2050 kunmap_atomic(userpage, KM_USER0);
2015 set_extent_uptodate(tree, cur, cur + iosize - 1, 2051 set_extent_uptodate(tree, cur, cur + iosize - 1,
2016 GFP_NOFS); 2052 &cached, GFP_NOFS);
2017 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2053 unlock_extent_cached(tree, cur, cur + iosize - 1,
2054 &cached, GFP_NOFS);
2018 break; 2055 break;
2019 } 2056 }
2020 em = get_extent(inode, page, page_offset, cur, 2057 em = get_extent(inode, page, page_offset, cur,
@@ -2054,14 +2091,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2054 /* we've found a hole, just zero and go on */ 2091 /* we've found a hole, just zero and go on */
2055 if (block_start == EXTENT_MAP_HOLE) { 2092 if (block_start == EXTENT_MAP_HOLE) {
2056 char *userpage; 2093 char *userpage;
2094 struct extent_state *cached = NULL;
2095
2057 userpage = kmap_atomic(page, KM_USER0); 2096 userpage = kmap_atomic(page, KM_USER0);
2058 memset(userpage + page_offset, 0, iosize); 2097 memset(userpage + page_offset, 0, iosize);
2059 flush_dcache_page(page); 2098 flush_dcache_page(page);
2060 kunmap_atomic(userpage, KM_USER0); 2099 kunmap_atomic(userpage, KM_USER0);
2061 2100
2062 set_extent_uptodate(tree, cur, cur + iosize - 1, 2101 set_extent_uptodate(tree, cur, cur + iosize - 1,
2063 GFP_NOFS); 2102 &cached, GFP_NOFS);
2064 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2103 unlock_extent_cached(tree, cur, cur + iosize - 1,
2104 &cached, GFP_NOFS);
2065 cur = cur + iosize; 2105 cur = cur + iosize;
2066 page_offset += iosize; 2106 page_offset += iosize;
2067 continue; 2107 continue;
@@ -2126,7 +2166,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2126 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2166 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2127 &bio_flags); 2167 &bio_flags);
2128 if (bio) 2168 if (bio)
2129 submit_one_bio(READ, bio, 0, bio_flags); 2169 ret = submit_one_bio(READ, bio, 0, bio_flags);
2130 return ret; 2170 return ret;
2131} 2171}
2132 2172
@@ -2179,10 +2219,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2179 unsigned long nr_written = 0; 2219 unsigned long nr_written = 0;
2180 2220
2181 if (wbc->sync_mode == WB_SYNC_ALL) 2221 if (wbc->sync_mode == WB_SYNC_ALL)
2182 write_flags = WRITE_SYNC_PLUG; 2222 write_flags = WRITE_SYNC;
2183 else 2223 else
2184 write_flags = WRITE; 2224 write_flags = WRITE;
2185 2225
2226 trace___extent_writepage(page, inode, wbc);
2227
2186 WARN_ON(!PageLocked(page)); 2228 WARN_ON(!PageLocked(page));
2187 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2229 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2188 if (page->index > end_index || 2230 if (page->index > end_index ||
@@ -2778,9 +2820,12 @@ int extent_prepare_write(struct extent_io_tree *tree,
2778 iocount++; 2820 iocount++;
2779 block_start = block_start + iosize; 2821 block_start = block_start + iosize;
2780 } else { 2822 } else {
2781 set_extent_uptodate(tree, block_start, cur_end, 2823 struct extent_state *cached = NULL;
2824
2825 set_extent_uptodate(tree, block_start, cur_end, &cached,
2782 GFP_NOFS); 2826 GFP_NOFS);
2783 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2827 unlock_extent_cached(tree, block_start, cur_end,
2828 &cached, GFP_NOFS);
2784 block_start = cur_end + 1; 2829 block_start = cur_end + 1;
2785 } 2830 }
2786 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2831 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
@@ -2819,9 +2864,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2819 * at this point we can safely clear everything except the 2864 * at this point we can safely clear everything except the
2820 * locked bit and the nodatasum bit 2865 * locked bit and the nodatasum bit
2821 */ 2866 */
2822 clear_extent_bit(tree, start, end, 2867 ret = clear_extent_bit(tree, start, end,
2823 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2868 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2824 0, 0, NULL, mask); 2869 0, 0, NULL, mask);
2870
2871 /* if clear_extent_bit failed for enomem reasons,
2872 * we can't allow the release to continue.
2873 */
2874 if (ret < 0)
2875 ret = 0;
2876 else
2877 ret = 1;
2825 } 2878 }
2826 return ret; 2879 return ret;
2827} 2880}
@@ -2901,6 +2954,46 @@ out:
2901 return sector; 2954 return sector;
2902} 2955}
2903 2956
2957/*
2958 * helper function for fiemap, which doesn't want to see any holes.
2959 * This maps until we find something past 'last'
2960 */
2961static struct extent_map *get_extent_skip_holes(struct inode *inode,
2962 u64 offset,
2963 u64 last,
2964 get_extent_t *get_extent)
2965{
2966 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2967 struct extent_map *em;
2968 u64 len;
2969
2970 if (offset >= last)
2971 return NULL;
2972
2973 while(1) {
2974 len = last - offset;
2975 if (len == 0)
2976 break;
2977 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2978 em = get_extent(inode, NULL, 0, offset, len, 0);
2979 if (!em || IS_ERR(em))
2980 return em;
2981
2982 /* if this isn't a hole return it */
2983 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2984 em->block_start != EXTENT_MAP_HOLE) {
2985 return em;
2986 }
2987
2988 /* this is a hole, advance to the next extent */
2989 offset = extent_map_end(em);
2990 free_extent_map(em);
2991 if (offset >= last)
2992 break;
2993 }
2994 return NULL;
2995}
2996
2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2997int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2905 __u64 start, __u64 len, get_extent_t *get_extent) 2998 __u64 start, __u64 len, get_extent_t *get_extent)
2906{ 2999{
@@ -2910,16 +3003,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2910 u32 flags = 0; 3003 u32 flags = 0;
2911 u32 found_type; 3004 u32 found_type;
2912 u64 last; 3005 u64 last;
3006 u64 last_for_get_extent = 0;
2913 u64 disko = 0; 3007 u64 disko = 0;
3008 u64 isize = i_size_read(inode);
2914 struct btrfs_key found_key; 3009 struct btrfs_key found_key;
2915 struct extent_map *em = NULL; 3010 struct extent_map *em = NULL;
2916 struct extent_state *cached_state = NULL; 3011 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path; 3012 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item; 3013 struct btrfs_file_extent_item *item;
2919 int end = 0; 3014 int end = 0;
2920 u64 em_start = 0, em_len = 0; 3015 u64 em_start = 0;
3016 u64 em_len = 0;
3017 u64 em_end = 0;
2921 unsigned long emflags; 3018 unsigned long emflags;
2922 int hole = 0;
2923 3019
2924 if (len == 0) 3020 if (len == 0)
2925 return -EINVAL; 3021 return -EINVAL;
@@ -2929,6 +3025,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2929 return -ENOMEM; 3025 return -ENOMEM;
2930 path->leave_spinning = 1; 3026 path->leave_spinning = 1;
2931 3027
3028 /*
3029 * lookup the last file extent. We're not using i_size here
3030 * because there might be preallocation past i_size
3031 */
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, 3032 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0); 3033 path, inode->i_ino, -1, 0);
2934 if (ret < 0) { 3034 if (ret < 0) {
@@ -2942,18 +3042,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 3042 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key); 3043 found_type = btrfs_key_type(&found_key);
2944 3044
2945 /* No extents, just return */ 3045 /* No extents, but there might be delalloc bits */
2946 if (found_key.objectid != inode->i_ino || 3046 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) { 3047 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path); 3048 /* have to trust i_size as the end */
2949 return 0; 3049 last = (u64)-1;
3050 last_for_get_extent = isize;
3051 } else {
3052 /*
3053 * remember the start of the last extent. There are a
3054 * bunch of different factors that go into the length of the
3055 * extent, so its much less complex to remember where it started
3056 */
3057 last = found_key.offset;
3058 last_for_get_extent = last + 1;
2950 } 3059 }
2951 last = found_key.offset;
2952 btrfs_free_path(path); 3060 btrfs_free_path(path);
2953 3061
3062 /*
3063 * we might have some extents allocated but more delalloc past those
3064 * extents. so, we trust isize unless the start of the last extent is
3065 * beyond isize
3066 */
3067 if (last < isize) {
3068 last = (u64)-1;
3069 last_for_get_extent = isize;
3070 }
3071
2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3072 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2955 &cached_state, GFP_NOFS); 3073 &cached_state, GFP_NOFS);
2956 em = get_extent(inode, NULL, 0, off, max - off, 0); 3074
3075 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3076 get_extent);
2957 if (!em) 3077 if (!em)
2958 goto out; 3078 goto out;
2959 if (IS_ERR(em)) { 3079 if (IS_ERR(em)) {
@@ -2962,22 +3082,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2962 } 3082 }
2963 3083
2964 while (!end) { 3084 while (!end) {
2965 hole = 0; 3085 u64 offset_in_extent;
2966 off = em->start + em->len;
2967 if (off >= max)
2968 end = 1;
2969 3086
2970 if (em->block_start == EXTENT_MAP_HOLE) { 3087 /* break if the extent we found is outside the range */
2971 hole = 1; 3088 if (em->start >= max || extent_map_end(em) < off)
2972 goto next; 3089 break;
2973 }
2974 3090
2975 em_start = em->start; 3091 /*
2976 em_len = em->len; 3092 * get_extent may return an extent that starts before our
3093 * requested range. We have to make sure the ranges
3094 * we return to fiemap always move forward and don't
3095 * overlap, so adjust the offsets here
3096 */
3097 em_start = max(em->start, off);
2977 3098
3099 /*
3100 * record the offset from the start of the extent
3101 * for adjusting the disk offset below
3102 */
3103 offset_in_extent = em_start - em->start;
3104 em_end = extent_map_end(em);
3105 em_len = em_end - em_start;
3106 emflags = em->flags;
2978 disko = 0; 3107 disko = 0;
2979 flags = 0; 3108 flags = 0;
2980 3109
3110 /*
3111 * bump off for our next call to get_extent
3112 */
3113 off = extent_map_end(em);
3114 if (off >= max)
3115 end = 1;
3116
2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 3117 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2982 end = 1; 3118 end = 1;
2983 flags |= FIEMAP_EXTENT_LAST; 3119 flags |= FIEMAP_EXTENT_LAST;
@@ -2988,42 +3124,34 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2988 flags |= (FIEMAP_EXTENT_DELALLOC | 3124 flags |= (FIEMAP_EXTENT_DELALLOC |
2989 FIEMAP_EXTENT_UNKNOWN); 3125 FIEMAP_EXTENT_UNKNOWN);
2990 } else { 3126 } else {
2991 disko = em->block_start; 3127 disko = em->block_start + offset_in_extent;
2992 } 3128 }
2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 3129 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2994 flags |= FIEMAP_EXTENT_ENCODED; 3130 flags |= FIEMAP_EXTENT_ENCODED;
2995 3131
2996next:
2997 emflags = em->flags;
2998 free_extent_map(em); 3132 free_extent_map(em);
2999 em = NULL; 3133 em = NULL;
3000 if (!end) { 3134 if ((em_start >= last) || em_len == (u64)-1 ||
3001 em = get_extent(inode, NULL, 0, off, max - off, 0); 3135 (last == (u64)-1 && isize <= em_end)) {
3002 if (!em)
3003 goto out;
3004 if (IS_ERR(em)) {
3005 ret = PTR_ERR(em);
3006 goto out;
3007 }
3008 emflags = em->flags;
3009 }
3010
3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3012 flags |= FIEMAP_EXTENT_LAST; 3136 flags |= FIEMAP_EXTENT_LAST;
3013 end = 1; 3137 end = 1;
3014 } 3138 }
3015 3139
3016 if (em_start == last) { 3140 /* now scan forward to see if this is really the last extent. */
3141 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3142 get_extent);
3143 if (IS_ERR(em)) {
3144 ret = PTR_ERR(em);
3145 goto out;
3146 }
3147 if (!em) {
3017 flags |= FIEMAP_EXTENT_LAST; 3148 flags |= FIEMAP_EXTENT_LAST;
3018 end = 1; 3149 end = 1;
3019 } 3150 }
3020 3151 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3021 if (!hole) { 3152 em_len, flags);
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3153 if (ret)
3023 em_len, flags); 3154 goto out_free;
3024 if (ret)
3025 goto out_free;
3026 }
3027 } 3155 }
3028out_free: 3156out_free:
3029 free_extent_map(em); 3157 free_extent_map(em);
@@ -3192,7 +3320,13 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3192 } 3320 }
3193 if (!PageUptodate(p)) 3321 if (!PageUptodate(p))
3194 uptodate = 0; 3322 uptodate = 0;
3195 unlock_page(p); 3323
3324 /*
3325 * see below about how we avoid a nasty race with release page
3326 * and why we unlock later
3327 */
3328 if (i != 0)
3329 unlock_page(p);
3196 } 3330 }
3197 if (uptodate) 3331 if (uptodate)
3198 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3332 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -3216,9 +3350,26 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3216 atomic_inc(&eb->refs); 3350 atomic_inc(&eb->refs);
3217 spin_unlock(&tree->buffer_lock); 3351 spin_unlock(&tree->buffer_lock);
3218 radix_tree_preload_end(); 3352 radix_tree_preload_end();
3353
3354 /*
3355 * there is a race where release page may have
3356 * tried to find this extent buffer in the radix
3357 * but failed. It will tell the VM it is safe to
3358 * reclaim the, and it will clear the page private bit.
3359 * We must make sure to set the page private bit properly
3360 * after the extent buffer is in the radix tree so
3361 * it doesn't get lost
3362 */
3363 set_page_extent_mapped(eb->first_page);
3364 set_page_extent_head(eb->first_page, eb->len);
3365 if (!page0)
3366 unlock_page(eb->first_page);
3219 return eb; 3367 return eb;
3220 3368
3221free_eb: 3369free_eb:
3370 if (eb->first_page && !page0)
3371 unlock_page(eb->first_page);
3372
3222 if (!atomic_dec_and_test(&eb->refs)) 3373 if (!atomic_dec_and_test(&eb->refs))
3223 return exists; 3374 return exists;
3224 btrfs_release_extent_buffer(eb); 3375 btrfs_release_extent_buffer(eb);
@@ -3269,10 +3420,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3269 continue; 3420 continue;
3270 3421
3271 lock_page(page); 3422 lock_page(page);
3423 WARN_ON(!PagePrivate(page));
3424
3425 set_page_extent_mapped(page);
3272 if (i == 0) 3426 if (i == 0)
3273 set_page_extent_head(page, eb->len); 3427 set_page_extent_head(page, eb->len);
3274 else
3275 set_page_private(page, EXTENT_PAGE_PRIVATE);
3276 3428
3277 clear_page_dirty_for_io(page); 3429 clear_page_dirty_for_io(page);
3278 spin_lock_irq(&page->mapping->tree_lock); 3430 spin_lock_irq(&page->mapping->tree_lock);
@@ -3339,7 +3491,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3339 num_pages = num_extent_pages(eb->start, eb->len); 3491 num_pages = num_extent_pages(eb->start, eb->len);
3340 3492
3341 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3493 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3342 GFP_NOFS); 3494 NULL, GFP_NOFS);
3343 for (i = 0; i < num_pages; i++) { 3495 for (i = 0; i < num_pages; i++) {
3344 page = extent_buffer_page(eb, i); 3496 page = extent_buffer_page(eb, i);
3345 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3497 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3462,6 +3614,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3462 3614
3463 for (i = start_i; i < num_pages; i++) { 3615 for (i = start_i; i < num_pages; i++) {
3464 page = extent_buffer_page(eb, i); 3616 page = extent_buffer_page(eb, i);
3617
3618 WARN_ON(!PagePrivate(page));
3619
3620 set_page_extent_mapped(page);
3621 if (i == 0)
3622 set_page_extent_head(page, eb->len);
3623
3465 if (inc_all_pages) 3624 if (inc_all_pages)
3466 page_cache_get(page); 3625 page_cache_get(page);
3467 if (!PageUptodate(page)) { 3626 if (!PageUptodate(page)) {
@@ -3567,6 +3726,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3567 "wanted %lu %lu\n", (unsigned long long)eb->start, 3726 "wanted %lu %lu\n", (unsigned long long)eb->start,
3568 eb->len, start, min_len); 3727 eb->len, start, min_len);
3569 WARN_ON(1); 3728 WARN_ON(1);
3729 return -EINVAL;
3570 } 3730 }
3571 3731
3572 p = extent_buffer_page(eb, i); 3732 p = extent_buffer_page(eb, i);
@@ -3759,6 +3919,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3759 kunmap_atomic(dst_kaddr, KM_USER0); 3919 kunmap_atomic(dst_kaddr, KM_USER0);
3760} 3920}
3761 3921
3922static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3923{
3924 unsigned long distance = (src > dst) ? src - dst : dst - src;
3925 return distance < len;
3926}
3927
3762static void copy_pages(struct page *dst_page, struct page *src_page, 3928static void copy_pages(struct page *dst_page, struct page *src_page,
3763 unsigned long dst_off, unsigned long src_off, 3929 unsigned long dst_off, unsigned long src_off,
3764 unsigned long len) 3930 unsigned long len)
@@ -3766,10 +3932,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3766 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3932 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3767 char *src_kaddr; 3933 char *src_kaddr;
3768 3934
3769 if (dst_page != src_page) 3935 if (dst_page != src_page) {
3770 src_kaddr = kmap_atomic(src_page, KM_USER1); 3936 src_kaddr = kmap_atomic(src_page, KM_USER1);
3771 else 3937 } else {
3772 src_kaddr = dst_kaddr; 3938 src_kaddr = dst_kaddr;
3939 BUG_ON(areas_overlap(src_off, dst_off, len));
3940 }
3773 3941
3774 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3942 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3775 kunmap_atomic(dst_kaddr, KM_USER0); 3943 kunmap_atomic(dst_kaddr, KM_USER0);
@@ -3844,7 +4012,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3844 "len %lu len %lu\n", dst_offset, len, dst->len); 4012 "len %lu len %lu\n", dst_offset, len, dst->len);
3845 BUG_ON(1); 4013 BUG_ON(1);
3846 } 4014 }
3847 if (dst_offset < src_offset) { 4015 if (!areas_overlap(src_offset, dst_offset, len)) {
3848 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 4016 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3849 return; 4017 return;
3850 } 4018 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..af2d7179c372 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -31,6 +31,7 @@
31#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
32#define EXTENT_BUFFER_BLOCKING 1 32#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 33#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3
34 35
35/* these are flags for extent_clear_unlock_delalloc */ 36/* these are flags for extent_clear_unlock_delalloc */
36#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -191,7 +192,7 @@ void extent_io_exit(void);
191 192
192u64 count_range_bits(struct extent_io_tree *tree, 193u64 count_range_bits(struct extent_io_tree *tree,
193 u64 *start, u64 search_end, 194 u64 *start, u64 search_end,
194 u64 max_bytes, unsigned long bits); 195 u64 max_bytes, unsigned long bits, int contig);
195 196
196void free_extent_state(struct extent_state *state); 197void free_extent_state(struct extent_state *state);
197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 198int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -207,7 +208,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
207 int bits, int exclusive_bits, u64 *failed_start, 208 int bits, int exclusive_bits, u64 *failed_start,
208 struct extent_state **cached_state, gfp_t mask); 209 struct extent_state **cached_state, gfp_t mask);
209int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 210int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
210 gfp_t mask); 211 struct extent_state **cached_state, gfp_t mask);
211int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 212int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
212 gfp_t mask); 213 gfp_t mask);
213int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 214int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b0e1fce12530..a24a3f2fa13e 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -51,8 +51,8 @@ struct extent_map *alloc_extent_map(gfp_t mask)
51{ 51{
52 struct extent_map *em; 52 struct extent_map *em;
53 em = kmem_cache_alloc(extent_map_cache, mask); 53 em = kmem_cache_alloc(extent_map_cache, mask);
54 if (!em || IS_ERR(em)) 54 if (!em)
55 return em; 55 return NULL;
56 em->in_tree = 0; 56 em->in_tree = 0;
57 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE; 58 em->compress_type = BTRFS_COMPRESS_NONE;
@@ -243,7 +243,7 @@ out:
243 * Insert @em into @tree or perform a simple forward/backward merge with 243 * Insert @em into @tree or perform a simple forward/backward merge with
244 * existing mappings. The extent_map struct passed in will be inserted 244 * existing mappings. The extent_map struct passed in will be inserted
245 * into the tree directly, with an additional reference taken, or a 245 * into the tree directly, with an additional reference taken, or a
246 * reference dropped if the merge attempt was successfull. 246 * reference dropped if the merge attempt was successful.
247 */ 247 */
248int add_extent_mapping(struct extent_map_tree *tree, 248int add_extent_mapping(struct extent_map_tree *tree,
249 struct extent_map *em) 249 struct extent_map *em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..a6a9d4e8b491 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
48 struct extent_buffer *leaf; 48 struct extent_buffer *leaf;
49 49
50 path = btrfs_alloc_path(); 50 path = btrfs_alloc_path();
51 BUG_ON(!path); 51 if (!path)
52 return -ENOMEM;
52 file_key.objectid = objectid; 53 file_key.objectid = objectid;
53 file_key.offset = pos; 54 file_key.offset = pos;
54 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 55 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
169 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
170 171
171 path = btrfs_alloc_path(); 172 path = btrfs_alloc_path();
173 if (!path)
174 return -ENOMEM;
172 if (bio->bi_size > PAGE_CACHE_SIZE * 8) 175 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
173 path->reada = 2; 176 path->reada = 2;
174 177
@@ -536,6 +539,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 539 root = root->fs_info->csum_root;
537 540
538 path = btrfs_alloc_path(); 541 path = btrfs_alloc_path();
542 if (!path)
543 return -ENOMEM;
539 544
540 while (1) { 545 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 546 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -548,7 +553,10 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
548 if (path->slots[0] == 0) 553 if (path->slots[0] == 0)
549 goto out; 554 goto out;
550 path->slots[0]--; 555 path->slots[0]--;
556 } else if (ret < 0) {
557 goto out;
551 } 558 }
559
552 leaf = path->nodes[0]; 560 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 561 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 562
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c800d58f3013..75899a01dded 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -45,14 +45,14 @@
45 * and be replaced with calls into generic code. 45 * and be replaced with calls into generic code.
46 */ 46 */
47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
48 int write_bytes, 48 size_t write_bytes,
49 struct page **prepared_pages, 49 struct page **prepared_pages,
50 struct iov_iter *i) 50 struct iov_iter *i)
51{ 51{
52 size_t copied = 0; 52 size_t copied = 0;
53 size_t total_copied = 0;
53 int pg = 0; 54 int pg = 0;
54 int offset = pos & (PAGE_CACHE_SIZE - 1); 55 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
56 56
57 while (write_bytes > 0) { 57 while (write_bytes > 0) {
58 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
@@ -70,14 +70,26 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
70 70
71 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
72 flush_dcache_page(page); 72 flush_dcache_page(page);
73
74 /*
75 * if we get a partial write, we can end up with
76 * partially up to date pages. These add
77 * a lot of complexity, so make sure they don't
78 * happen by forcing this copy to be retried.
79 *
80 * The rest of the btrfs_file_write code will fall
81 * back to page at a time copies after we return 0.
82 */
83 if (!PageUptodate(page) && copied < count)
84 copied = 0;
85
73 iov_iter_advance(i, copied); 86 iov_iter_advance(i, copied);
74 write_bytes -= copied; 87 write_bytes -= copied;
75 total_copied += copied; 88 total_copied += copied;
76 89
77 /* Return to btrfs_file_aio_write to fault page */ 90 /* Return to btrfs_file_aio_write to fault page */
78 if (unlikely(copied == 0)) { 91 if (unlikely(copied == 0))
79 break; 92 break;
80 }
81 93
82 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 94 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
83 offset += copied; 95 offset += copied;
@@ -92,12 +104,10 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
92/* 104/*
93 * unlocks pages after btrfs_file_write is done with them 105 * unlocks pages after btrfs_file_write is done with them
94 */ 106 */
95static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 107void btrfs_drop_pages(struct page **pages, size_t num_pages)
96{ 108{
97 size_t i; 109 size_t i;
98 for (i = 0; i < num_pages; i++) { 110 for (i = 0; i < num_pages; i++) {
99 if (!pages[i])
100 break;
101 /* page checked is some magic around finding pages that 111 /* page checked is some magic around finding pages that
102 * have been modified without going through btrfs_set_page_dirty 112 * have been modified without going through btrfs_set_page_dirty
103 * clear it here 113 * clear it here
@@ -117,17 +127,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
117 * this also makes the decision about creating an inline extent vs 127 * this also makes the decision about creating an inline extent vs
118 * doing real data extents, marking pages dirty and delalloc as required. 128 * doing real data extents, marking pages dirty and delalloc as required.
119 */ 129 */
120static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 130int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
121 struct btrfs_root *root, 131 struct page **pages, size_t num_pages,
122 struct file *file, 132 loff_t pos, size_t write_bytes,
123 struct page **pages, 133 struct extent_state **cached)
124 size_t num_pages,
125 loff_t pos,
126 size_t write_bytes)
127{ 134{
128 int err = 0; 135 int err = 0;
129 int i; 136 int i;
130 struct inode *inode = fdentry(file)->d_inode;
131 u64 num_bytes; 137 u64 num_bytes;
132 u64 start_pos; 138 u64 start_pos;
133 u64 end_of_last_block; 139 u64 end_of_last_block;
@@ -140,8 +146,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
140 146
141 end_of_last_block = start_pos + num_bytes - 1; 147 end_of_last_block = start_pos + num_bytes - 1;
142 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 148 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
143 NULL); 149 cached);
144 BUG_ON(err); 150 if (err)
151 return err;
145 152
146 for (i = 0; i < num_pages; i++) { 153 for (i = 0; i < num_pages; i++) {
147 struct page *p = pages[i]; 154 struct page *p = pages[i];
@@ -149,13 +156,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
149 ClearPageChecked(p); 156 ClearPageChecked(p);
150 set_page_dirty(p); 157 set_page_dirty(p);
151 } 158 }
152 if (end_pos > isize) { 159
160 /*
161 * we've only changed i_size in ram, and we haven't updated
162 * the disk i_size. There is no need to log the inode
163 * at this time.
164 */
165 if (end_pos > isize)
153 i_size_write(inode, end_pos); 166 i_size_write(inode, end_pos);
154 /* we've only changed i_size in ram, and we haven't updated
155 * the disk i_size. There is no need to log the inode
156 * at this time.
157 */
158 }
159 return 0; 167 return 0;
160} 168}
161 169
@@ -186,6 +194,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
186 split = alloc_extent_map(GFP_NOFS); 194 split = alloc_extent_map(GFP_NOFS);
187 if (!split2) 195 if (!split2)
188 split2 = alloc_extent_map(GFP_NOFS); 196 split2 = alloc_extent_map(GFP_NOFS);
197 BUG_ON(!split || !split2);
189 198
190 write_lock(&em_tree->lock); 199 write_lock(&em_tree->lock);
191 em = lookup_extent_mapping(em_tree, start, len); 200 em = lookup_extent_mapping(em_tree, start, len);
@@ -596,6 +605,8 @@ again:
596 key.offset = split; 605 key.offset = split;
597 606
598 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 607 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
608 if (ret < 0)
609 goto out;
599 if (ret > 0 && path->slots[0] > 0) 610 if (ret > 0 && path->slots[0] > 0)
600 path->slots[0]--; 611 path->slots[0]--;
601 612
@@ -762,6 +773,27 @@ out:
762} 773}
763 774
764/* 775/*
776 * on error we return an unlocked page and the error value
777 * on success we return a locked page and 0
778 */
779static int prepare_uptodate_page(struct page *page, u64 pos)
780{
781 int ret = 0;
782
783 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
784 ret = btrfs_readpage(NULL, page);
785 if (ret)
786 return ret;
787 lock_page(page);
788 if (!PageUptodate(page)) {
789 unlock_page(page);
790 return -EIO;
791 }
792 }
793 return 0;
794}
795
796/*
765 * this gets pages into the page cache and locks them down, it also properly 797 * this gets pages into the page cache and locks them down, it also properly
766 * waits for data=ordered extents to finish before allowing the pages to be 798 * waits for data=ordered extents to finish before allowing the pages to be
767 * modified. 799 * modified.
@@ -776,6 +808,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 unsigned long index = pos >> PAGE_CACHE_SHIFT; 808 unsigned long index = pos >> PAGE_CACHE_SHIFT;
777 struct inode *inode = fdentry(file)->d_inode; 809 struct inode *inode = fdentry(file)->d_inode;
778 int err = 0; 810 int err = 0;
811 int faili = 0;
779 u64 start_pos; 812 u64 start_pos;
780 u64 last_pos; 813 u64 last_pos;
781 814
@@ -783,21 +816,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
783 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 816 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
784 817
785 if (start_pos > inode->i_size) { 818 if (start_pos > inode->i_size) {
786 err = btrfs_cont_expand(inode, start_pos); 819 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
787 if (err) 820 if (err)
788 return err; 821 return err;
789 } 822 }
790 823
791 memset(pages, 0, num_pages * sizeof(struct page *));
792again: 824again:
793 for (i = 0; i < num_pages; i++) { 825 for (i = 0; i < num_pages; i++) {
794 pages[i] = grab_cache_page(inode->i_mapping, index + i); 826 pages[i] = grab_cache_page(inode->i_mapping, index + i);
795 if (!pages[i]) { 827 if (!pages[i]) {
828 faili = i - 1;
796 err = -ENOMEM; 829 err = -ENOMEM;
797 BUG_ON(1); 830 goto fail;
831 }
832
833 if (i == 0)
834 err = prepare_uptodate_page(pages[i], pos);
835 if (i == num_pages - 1)
836 err = prepare_uptodate_page(pages[i],
837 pos + write_bytes);
838 if (err) {
839 page_cache_release(pages[i]);
840 faili = i - 1;
841 goto fail;
798 } 842 }
799 wait_on_page_writeback(pages[i]); 843 wait_on_page_writeback(pages[i]);
800 } 844 }
845 err = 0;
801 if (start_pos < inode->i_size) { 846 if (start_pos < inode->i_size) {
802 struct btrfs_ordered_extent *ordered; 847 struct btrfs_ordered_extent *ordered;
803 lock_extent_bits(&BTRFS_I(inode)->io_tree, 848 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -837,187 +882,103 @@ again:
837 WARN_ON(!PageLocked(pages[i])); 882 WARN_ON(!PageLocked(pages[i]));
838 } 883 }
839 return 0; 884 return 0;
885fail:
886 while (faili >= 0) {
887 unlock_page(pages[faili]);
888 page_cache_release(pages[faili]);
889 faili--;
890 }
891 return err;
892
840} 893}
841 894
842static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 895static noinline ssize_t __btrfs_buffered_write(struct file *file,
843 const struct iovec *iov, 896 struct iov_iter *i,
844 unsigned long nr_segs, loff_t pos) 897 loff_t pos)
845{ 898{
846 struct file *file = iocb->ki_filp;
847 struct inode *inode = fdentry(file)->d_inode; 899 struct inode *inode = fdentry(file)->d_inode;
848 struct btrfs_root *root = BTRFS_I(inode)->root; 900 struct btrfs_root *root = BTRFS_I(inode)->root;
849 struct page *pinned[2];
850 struct page **pages = NULL; 901 struct page **pages = NULL;
851 struct iov_iter i;
852 loff_t *ppos = &iocb->ki_pos;
853 loff_t start_pos;
854 ssize_t num_written = 0;
855 ssize_t err = 0;
856 size_t count;
857 size_t ocount;
858 int ret = 0;
859 int nrptrs;
860 unsigned long first_index; 902 unsigned long first_index;
861 unsigned long last_index; 903 unsigned long last_index;
862 int will_write; 904 size_t num_written = 0;
863 int buffered = 0; 905 int nrptrs;
864 int copied = 0; 906 int ret = 0;
865 int dirty_pages = 0;
866
867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
868 (file->f_flags & O_DIRECT));
869
870 pinned[0] = NULL;
871 pinned[1] = NULL;
872
873 start_pos = pos;
874
875 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
876
877 mutex_lock(&inode->i_mutex);
878
879 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
880 if (err)
881 goto out;
882 count = ocount;
883
884 current->backing_dev_info = inode->i_mapping->backing_dev_info;
885 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
886 if (err)
887 goto out;
888
889 if (count == 0)
890 goto out;
891
892 err = file_remove_suid(file);
893 if (err)
894 goto out;
895
896 /*
897 * If BTRFS flips readonly due to some impossible error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
899 * although we have opened a file as writable, we have
900 * to stop this write operation to ensure FS consistency.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
907 file_update_time(file);
908 BTRFS_I(inode)->sequence++;
909
910 if (unlikely(file->f_flags & O_DIRECT)) {
911 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
912 pos, ppos, count,
913 ocount);
914 /*
915 * the generic O_DIRECT will update in-memory i_size after the
916 * DIOs are done. But our endio handlers that update the on
917 * disk i_size never update past the in memory i_size. So we
918 * need one more update here to catch any additions to the
919 * file
920 */
921 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
922 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
923 mark_inode_dirty(inode);
924 }
925
926 if (num_written < 0) {
927 ret = num_written;
928 num_written = 0;
929 goto out;
930 } else if (num_written == count) {
931 /* pick up pos changes done by the generic code */
932 pos = *ppos;
933 goto out;
934 }
935 /*
936 * We are going to do buffered for the rest of the range, so we
937 * need to make sure to invalidate the buffered pages when we're
938 * done.
939 */
940 buffered = 1;
941 pos += num_written;
942 }
943 907
944 iov_iter_init(&i, iov, nr_segs, count, num_written); 908 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
945 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
946 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 909 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
947 (sizeof(struct page *))); 910 (sizeof(struct page *)));
948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 911 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
949 912 if (!pages)
950 /* generic_write_checks can change our pos */ 913 return -ENOMEM;
951 start_pos = pos;
952 914
953 first_index = pos >> PAGE_CACHE_SHIFT; 915 first_index = pos >> PAGE_CACHE_SHIFT;
954 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 916 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
955 917
956 /* 918 while (iov_iter_count(i) > 0) {
957 * there are lots of better ways to do this, but this code
958 * makes sure the first and last page in the file range are
959 * up to date and ready for cow
960 */
961 if ((pos & (PAGE_CACHE_SIZE - 1))) {
962 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
963 if (!PageUptodate(pinned[0])) {
964 ret = btrfs_readpage(NULL, pinned[0]);
965 BUG_ON(ret);
966 wait_on_page_locked(pinned[0]);
967 } else {
968 unlock_page(pinned[0]);
969 }
970 }
971 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
972 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
973 if (!PageUptodate(pinned[1])) {
974 ret = btrfs_readpage(NULL, pinned[1]);
975 BUG_ON(ret);
976 wait_on_page_locked(pinned[1]);
977 } else {
978 unlock_page(pinned[1]);
979 }
980 }
981
982 while (iov_iter_count(&i) > 0) {
983 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 919 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
984 size_t write_bytes = min(iov_iter_count(&i), 920 size_t write_bytes = min(iov_iter_count(i),
985 nrptrs * (size_t)PAGE_CACHE_SIZE - 921 nrptrs * (size_t)PAGE_CACHE_SIZE -
986 offset); 922 offset);
987 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 923 size_t num_pages = (write_bytes + offset +
988 PAGE_CACHE_SHIFT; 924 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
925 size_t dirty_pages;
926 size_t copied;
989 927
990 WARN_ON(num_pages > nrptrs); 928 WARN_ON(num_pages > nrptrs);
991 memset(pages, 0, sizeof(struct page *) * nrptrs);
992 929
993 /* 930 /*
994 * Fault pages before locking them in prepare_pages 931 * Fault pages before locking them in prepare_pages
995 * to avoid recursive lock 932 * to avoid recursive lock
996 */ 933 */
997 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) { 934 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
998 ret = -EFAULT; 935 ret = -EFAULT;
999 goto out; 936 break;
1000 } 937 }
1001 938
1002 ret = btrfs_delalloc_reserve_space(inode, 939 ret = btrfs_delalloc_reserve_space(inode,
1003 num_pages << PAGE_CACHE_SHIFT); 940 num_pages << PAGE_CACHE_SHIFT);
1004 if (ret) 941 if (ret)
1005 goto out; 942 break;
1006 943
944 /*
945 * This is going to setup the pages array with the number of
946 * pages we want, so we don't really need to worry about the
947 * contents of pages from loop to loop
948 */
1007 ret = prepare_pages(root, file, pages, num_pages, 949 ret = prepare_pages(root, file, pages, num_pages,
1008 pos, first_index, last_index, 950 pos, first_index, last_index,
1009 write_bytes); 951 write_bytes);
1010 if (ret) { 952 if (ret) {
1011 btrfs_delalloc_release_space(inode, 953 btrfs_delalloc_release_space(inode,
1012 num_pages << PAGE_CACHE_SHIFT); 954 num_pages << PAGE_CACHE_SHIFT);
1013 goto out; 955 break;
1014 } 956 }
1015 957
1016 copied = btrfs_copy_from_user(pos, num_pages, 958 copied = btrfs_copy_from_user(pos, num_pages,
1017 write_bytes, pages, &i); 959 write_bytes, pages, i);
1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >> 960
1019 PAGE_CACHE_SHIFT; 961 /*
962 * if we have trouble faulting in the pages, fall
963 * back to one page at a time
964 */
965 if (copied < write_bytes)
966 nrptrs = 1;
967
968 if (copied == 0)
969 dirty_pages = 0;
970 else
971 dirty_pages = (copied + offset +
972 PAGE_CACHE_SIZE - 1) >>
973 PAGE_CACHE_SHIFT;
1020 974
975 /*
976 * If we had a short copy we need to release the excess delaloc
977 * bytes we reserved. We need to increment outstanding_extents
978 * because btrfs_delalloc_release_space will decrement it, but
979 * we still have an outstanding extent for the chunk we actually
980 * managed to copy.
981 */
1021 if (num_pages > dirty_pages) { 982 if (num_pages > dirty_pages) {
1022 if (copied > 0) 983 if (copied > 0)
1023 atomic_inc( 984 atomic_inc(
@@ -1028,43 +989,157 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1028 } 989 }
1029 990
1030 if (copied > 0) { 991 if (copied > 0) {
1031 dirty_and_release_pages(NULL, root, file, pages, 992 ret = btrfs_dirty_pages(root, inode, pages,
1032 dirty_pages, pos, copied); 993 dirty_pages, pos, copied,
994 NULL);
995 if (ret) {
996 btrfs_delalloc_release_space(inode,
997 dirty_pages << PAGE_CACHE_SHIFT);
998 btrfs_drop_pages(pages, num_pages);
999 break;
1000 }
1033 } 1001 }
1034 1002
1035 btrfs_drop_pages(pages, num_pages); 1003 btrfs_drop_pages(pages, num_pages);
1036 1004
1037 if (copied > 0) { 1005 cond_resched();
1038 if (will_write) { 1006
1039 filemap_fdatawrite_range(inode->i_mapping, pos, 1007 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1040 pos + copied - 1); 1008 dirty_pages);
1041 } else { 1009 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1042 balance_dirty_pages_ratelimited_nr( 1010 btrfs_btree_balance_dirty(root, 1);
1043 inode->i_mapping, 1011 btrfs_throttle(root);
1044 dirty_pages);
1045 if (dirty_pages <
1046 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1047 btrfs_btree_balance_dirty(root, 1);
1048 btrfs_throttle(root);
1049 }
1050 }
1051 1012
1052 pos += copied; 1013 pos += copied;
1053 num_written += copied; 1014 num_written += copied;
1015 }
1054 1016
1055 cond_resched(); 1017 kfree(pages);
1018
1019 return num_written ? num_written : ret;
1020}
1021
1022static ssize_t __btrfs_direct_write(struct kiocb *iocb,
1023 const struct iovec *iov,
1024 unsigned long nr_segs, loff_t pos,
1025 loff_t *ppos, size_t count, size_t ocount)
1026{
1027 struct file *file = iocb->ki_filp;
1028 struct inode *inode = fdentry(file)->d_inode;
1029 struct iov_iter i;
1030 ssize_t written;
1031 ssize_t written_buffered;
1032 loff_t endbyte;
1033 int err;
1034
1035 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1036 count, ocount);
1037
1038 /*
1039 * the generic O_DIRECT will update in-memory i_size after the
1040 * DIOs are done. But our endio handlers that update the on
1041 * disk i_size never update past the in memory i_size. So we
1042 * need one more update here to catch any additions to the
1043 * file
1044 */
1045 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
1046 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
1047 mark_inode_dirty(inode);
1056 } 1048 }
1049
1050 if (written < 0 || written == count)
1051 return written;
1052
1053 pos += written;
1054 count -= written;
1055 iov_iter_init(&i, iov, nr_segs, count, written);
1056 written_buffered = __btrfs_buffered_write(file, &i, pos);
1057 if (written_buffered < 0) {
1058 err = written_buffered;
1059 goto out;
1060 }
1061 endbyte = pos + written_buffered - 1;
1062 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1063 if (err)
1064 goto out;
1065 written += written_buffered;
1066 *ppos = pos + written_buffered;
1067 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1068 endbyte >> PAGE_CACHE_SHIFT);
1057out: 1069out:
1058 mutex_unlock(&inode->i_mutex); 1070 return written ? written : err;
1059 if (ret) 1071}
1060 err = ret;
1061 1072
1062 kfree(pages); 1073static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1063 if (pinned[0]) 1074 const struct iovec *iov,
1064 page_cache_release(pinned[0]); 1075 unsigned long nr_segs, loff_t pos)
1065 if (pinned[1]) 1076{
1066 page_cache_release(pinned[1]); 1077 struct file *file = iocb->ki_filp;
1067 *ppos = pos; 1078 struct inode *inode = fdentry(file)->d_inode;
1079 struct btrfs_root *root = BTRFS_I(inode)->root;
1080 loff_t *ppos = &iocb->ki_pos;
1081 ssize_t num_written = 0;
1082 ssize_t err = 0;
1083 size_t count, ocount;
1084
1085 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1086
1087 mutex_lock(&inode->i_mutex);
1088
1089 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
1090 if (err) {
1091 mutex_unlock(&inode->i_mutex);
1092 goto out;
1093 }
1094 count = ocount;
1095
1096 current->backing_dev_info = inode->i_mapping->backing_dev_info;
1097 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1098 if (err) {
1099 mutex_unlock(&inode->i_mutex);
1100 goto out;
1101 }
1102
1103 if (count == 0) {
1104 mutex_unlock(&inode->i_mutex);
1105 goto out;
1106 }
1107
1108 err = file_remove_suid(file);
1109 if (err) {
1110 mutex_unlock(&inode->i_mutex);
1111 goto out;
1112 }
1113
1114 /*
1115 * If BTRFS flips readonly due to some impossible error
1116 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1117 * although we have opened a file as writable, we have
1118 * to stop this write operation to ensure FS consistency.
1119 */
1120 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1121 mutex_unlock(&inode->i_mutex);
1122 err = -EROFS;
1123 goto out;
1124 }
1125
1126 file_update_time(file);
1127 BTRFS_I(inode)->sequence++;
1128
1129 if (unlikely(file->f_flags & O_DIRECT)) {
1130 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1131 pos, ppos, count, ocount);
1132 } else {
1133 struct iov_iter i;
1134
1135 iov_iter_init(&i, iov, nr_segs, count, num_written);
1136
1137 num_written = __btrfs_buffered_write(file, &i, pos);
1138 if (num_written > 0)
1139 *ppos = pos + num_written;
1140 }
1141
1142 mutex_unlock(&inode->i_mutex);
1068 1143
1069 /* 1144 /*
1070 * we want to make sure fsync finds this change 1145 * we want to make sure fsync finds this change
@@ -1079,43 +1154,12 @@ out:
1079 * one running right now. 1154 * one running right now.
1080 */ 1155 */
1081 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1156 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1082 1157 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1083 if (num_written > 0 && will_write) { 1158 err = generic_write_sync(file, pos, num_written);
1084 struct btrfs_trans_handle *trans; 1159 if (err < 0 && num_written > 0)
1085
1086 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1087 if (err)
1088 num_written = err; 1160 num_written = err;
1089
1090 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1091 trans = btrfs_start_transaction(root, 0);
1092 if (IS_ERR(trans)) {
1093 num_written = PTR_ERR(trans);
1094 goto done;
1095 }
1096 mutex_lock(&inode->i_mutex);
1097 ret = btrfs_log_dentry_safe(trans, root,
1098 file->f_dentry);
1099 mutex_unlock(&inode->i_mutex);
1100 if (ret == 0) {
1101 ret = btrfs_sync_log(trans, root);
1102 if (ret == 0)
1103 btrfs_end_transaction(trans, root);
1104 else
1105 btrfs_commit_transaction(trans, root);
1106 } else if (ret != BTRFS_NO_LOG_SYNC) {
1107 btrfs_commit_transaction(trans, root);
1108 } else {
1109 btrfs_end_transaction(trans, root);
1110 }
1111 }
1112 if (file->f_flags & O_DIRECT && buffered) {
1113 invalidate_mapping_pages(inode->i_mapping,
1114 start_pos >> PAGE_CACHE_SHIFT,
1115 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1116 }
1117 } 1161 }
1118done: 1162out:
1119 current->backing_dev_info = NULL; 1163 current->backing_dev_info = NULL;
1120 return num_written ? num_written : err; 1164 return num_written ? num_written : err;
1121} 1165}
@@ -1158,6 +1202,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1158 int ret = 0; 1202 int ret = 0;
1159 struct btrfs_trans_handle *trans; 1203 struct btrfs_trans_handle *trans;
1160 1204
1205 trace_btrfs_sync_file(file, datasync);
1161 1206
1162 /* we wait first, since the writeback may change the inode */ 1207 /* we wait first, since the writeback may change the inode */
1163 root->log_batch++; 1208 root->log_batch++;
@@ -1285,7 +1330,8 @@ static long btrfs_fallocate(struct file *file, int mode,
1285 goto out; 1330 goto out;
1286 1331
1287 if (alloc_start > inode->i_size) { 1332 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start); 1333 ret = btrfs_cont_expand(inode, i_size_read(inode),
1334 alloc_start);
1289 if (ret) 1335 if (ret)
1290 goto out; 1336 goto out;
1291 } 1337 }
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 60d684266959..11d2e9cea09e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -24,6 +24,7 @@
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h" 26#include "disk-io.h"
27#include "extent_io.h"
27 28
28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 29#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 30#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
@@ -81,6 +82,8 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
81 return ERR_PTR(-ENOENT); 82 return ERR_PTR(-ENOENT);
82 } 83 }
83 84
85 inode->i_mapping->flags &= ~__GFP_FS;
86
84 spin_lock(&block_group->lock); 87 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) { 88 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode); 89 block_group->inode = igrab(inode);
@@ -222,6 +225,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
222 u64 num_entries; 225 u64 num_entries;
223 u64 num_bitmaps; 226 u64 num_bitmaps;
224 u64 generation; 227 u64 generation;
228 u64 used = btrfs_block_group_used(&block_group->item);
225 u32 cur_crc = ~(u32)0; 229 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0; 230 pgoff_t index = 0;
227 unsigned long first_page_offset; 231 unsigned long first_page_offset;
@@ -393,7 +397,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
393 break; 397 break;
394 398
395 need_loop = 1; 399 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); 400 e = kmem_cache_zalloc(btrfs_free_space_cachep,
401 GFP_NOFS);
397 if (!e) { 402 if (!e) {
398 kunmap(page); 403 kunmap(page);
399 unlock_page(page); 404 unlock_page(page);
@@ -405,7 +410,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
405 e->bytes = le64_to_cpu(entry->bytes); 410 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) { 411 if (!e->bytes) {
407 kunmap(page); 412 kunmap(page);
408 kfree(e); 413 kmem_cache_free(btrfs_free_space_cachep, e);
409 unlock_page(page); 414 unlock_page(page);
410 page_cache_release(page); 415 page_cache_release(page);
411 goto free_cache; 416 goto free_cache;
@@ -420,7 +425,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 425 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) { 426 if (!e->bitmap) {
422 kunmap(page); 427 kunmap(page);
423 kfree(e); 428 kmem_cache_free(
429 btrfs_free_space_cachep, e);
424 unlock_page(page); 430 unlock_page(page);
425 page_cache_release(page); 431 page_cache_release(page);
426 goto free_cache; 432 goto free_cache;
@@ -465,6 +471,17 @@ next:
465 index++; 471 index++;
466 } 472 }
467 473
474 spin_lock(&block_group->tree_lock);
475 if (block_group->free_space != (block_group->key.offset - used -
476 block_group->bytes_super)) {
477 spin_unlock(&block_group->tree_lock);
478 printk(KERN_ERR "block group %llu has an wrong amount of free "
479 "space\n", block_group->key.objectid);
480 ret = 0;
481 goto free_cache;
482 }
483 spin_unlock(&block_group->tree_lock);
484
468 ret = 1; 485 ret = 1;
469out: 486out:
470 kfree(checksums); 487 kfree(checksums);
@@ -491,18 +508,23 @@ int btrfs_write_out_cache(struct btrfs_root *root,
491 struct inode *inode; 508 struct inode *inode;
492 struct rb_node *node; 509 struct rb_node *node;
493 struct list_head *pos, *n; 510 struct list_head *pos, *n;
511 struct page **pages;
494 struct page *page; 512 struct page *page;
495 struct extent_state *cached_state = NULL; 513 struct extent_state *cached_state = NULL;
514 struct btrfs_free_cluster *cluster = NULL;
515 struct extent_io_tree *unpin = NULL;
496 struct list_head bitmap_list; 516 struct list_head bitmap_list;
497 struct btrfs_key key; 517 struct btrfs_key key;
518 u64 start, end, len;
498 u64 bytes = 0; 519 u64 bytes = 0;
499 u32 *crc, *checksums; 520 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset; 521 unsigned long first_page_offset;
502 int num_checksums; 522 int index = 0, num_pages = 0;
503 int entries = 0; 523 int entries = 0;
504 int bitmaps = 0; 524 int bitmaps = 0;
505 int ret = 0; 525 int ret = 0;
526 bool next_page = false;
527 bool out_of_space = false;
506 528
507 root = root->fs_info->tree_root; 529 root = root->fs_info->tree_root;
508 530
@@ -530,24 +552,43 @@ int btrfs_write_out_cache(struct btrfs_root *root,
530 return 0; 552 return 0;
531 } 553 }
532 554
533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 555 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
556 PAGE_CACHE_SHIFT;
534 filemap_write_and_wait(inode->i_mapping); 557 filemap_write_and_wait(inode->i_mapping);
535 btrfs_wait_ordered_range(inode, inode->i_size & 558 btrfs_wait_ordered_range(inode, inode->i_size &
536 ~(root->sectorsize - 1), (u64)-1); 559 ~(root->sectorsize - 1), (u64)-1);
537 560
538 /* We need a checksum per page. */ 561 /* We need a checksum per page. */
539 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE; 562 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
540 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
541 if (!crc) { 563 if (!crc) {
542 iput(inode); 564 iput(inode);
543 return 0; 565 return 0;
544 } 566 }
545 567
568 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
569 if (!pages) {
570 kfree(crc);
571 iput(inode);
572 return 0;
573 }
574
546 /* Since the first page has all of our checksums and our generation we 575 /* Since the first page has all of our checksums and our generation we
547 * need to calculate the offset into the page that we can start writing 576 * need to calculate the offset into the page that we can start writing
548 * our entries. 577 * our entries.
549 */ 578 */
550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64); 579 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
580
581 /* Get the cluster for this block_group if it exists */
582 if (!list_empty(&block_group->cluster_list))
583 cluster = list_entry(block_group->cluster_list.next,
584 struct btrfs_free_cluster,
585 block_group_list);
586
587 /*
588 * We shouldn't have switched the pinned extents yet so this is the
589 * right one
590 */
591 unpin = root->fs_info->pinned_extents;
551 592
552 /* 593 /*
553 * Lock all pages first so we can lock the extent safely. 594 * Lock all pages first so we can lock the extent safely.
@@ -557,20 +598,18 @@ int btrfs_write_out_cache(struct btrfs_root *root,
557 * after find_get_page at this point. Just putting this here so people 598 * after find_get_page at this point. Just putting this here so people
558 * know and don't freak out. 599 * know and don't freak out.
559 */ 600 */
560 while (index <= last_index) { 601 while (index < num_pages) {
561 page = grab_cache_page(inode->i_mapping, index); 602 page = grab_cache_page(inode->i_mapping, index);
562 if (!page) { 603 if (!page) {
563 pgoff_t i = 0; 604 int i;
564 605
565 while (i < index) { 606 for (i = 0; i < num_pages; i++) {
566 page = find_get_page(inode->i_mapping, i); 607 unlock_page(pages[i]);
567 unlock_page(page); 608 page_cache_release(pages[i]);
568 page_cache_release(page);
569 page_cache_release(page);
570 i++;
571 } 609 }
572 goto out_free; 610 goto out_free;
573 } 611 }
612 pages[index] = page;
574 index++; 613 index++;
575 } 614 }
576 615
@@ -578,6 +617,12 @@ int btrfs_write_out_cache(struct btrfs_root *root,
578 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 617 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
579 0, &cached_state, GFP_NOFS); 618 0, &cached_state, GFP_NOFS);
580 619
620 /*
621 * When searching for pinned extents, we need to start at our start
622 * offset.
623 */
624 start = block_group->key.objectid;
625
581 /* Write out the extent entries */ 626 /* Write out the extent entries */
582 do { 627 do {
583 struct btrfs_free_space_entry *entry; 628 struct btrfs_free_space_entry *entry;
@@ -585,18 +630,25 @@ int btrfs_write_out_cache(struct btrfs_root *root,
585 unsigned long offset = 0; 630 unsigned long offset = 0;
586 unsigned long start_offset = 0; 631 unsigned long start_offset = 0;
587 632
633 next_page = false;
634
588 if (index == 0) { 635 if (index == 0) {
589 start_offset = first_page_offset; 636 start_offset = first_page_offset;
590 offset = start_offset; 637 offset = start_offset;
591 } 638 }
592 639
593 page = find_get_page(inode->i_mapping, index); 640 if (index >= num_pages) {
641 out_of_space = true;
642 break;
643 }
644
645 page = pages[index];
594 646
595 addr = kmap(page); 647 addr = kmap(page);
596 entry = addr + start_offset; 648 entry = addr + start_offset;
597 649
598 memset(addr, 0, PAGE_CACHE_SIZE); 650 memset(addr, 0, PAGE_CACHE_SIZE);
599 while (1) { 651 while (node && !next_page) {
600 struct btrfs_free_space *e; 652 struct btrfs_free_space *e;
601 653
602 e = rb_entry(node, struct btrfs_free_space, offset_index); 654 e = rb_entry(node, struct btrfs_free_space, offset_index);
@@ -612,12 +664,49 @@ int btrfs_write_out_cache(struct btrfs_root *root,
612 entry->type = BTRFS_FREE_SPACE_EXTENT; 664 entry->type = BTRFS_FREE_SPACE_EXTENT;
613 } 665 }
614 node = rb_next(node); 666 node = rb_next(node);
615 if (!node) 667 if (!node && cluster) {
616 break; 668 node = rb_first(&cluster->root);
669 cluster = NULL;
670 }
617 offset += sizeof(struct btrfs_free_space_entry); 671 offset += sizeof(struct btrfs_free_space_entry);
618 if (offset + sizeof(struct btrfs_free_space_entry) >= 672 if (offset + sizeof(struct btrfs_free_space_entry) >=
619 PAGE_CACHE_SIZE) 673 PAGE_CACHE_SIZE)
674 next_page = true;
675 entry++;
676 }
677
678 /*
679 * We want to add any pinned extents to our free space cache
680 * so we don't leak the space
681 */
682 while (!next_page && (start < block_group->key.objectid +
683 block_group->key.offset)) {
684 ret = find_first_extent_bit(unpin, start, &start, &end,
685 EXTENT_DIRTY);
686 if (ret) {
687 ret = 0;
620 break; 688 break;
689 }
690
691 /* This pinned extent is out of our range */
692 if (start >= block_group->key.objectid +
693 block_group->key.offset)
694 break;
695
696 len = block_group->key.objectid +
697 block_group->key.offset - start;
698 len = min(len, end + 1 - start);
699
700 entries++;
701 entry->offset = cpu_to_le64(start);
702 entry->bytes = cpu_to_le64(len);
703 entry->type = BTRFS_FREE_SPACE_EXTENT;
704
705 start = end + 1;
706 offset += sizeof(struct btrfs_free_space_entry);
707 if (offset + sizeof(struct btrfs_free_space_entry) >=
708 PAGE_CACHE_SIZE)
709 next_page = true;
621 entry++; 710 entry++;
622 } 711 }
623 *crc = ~(u32)0; 712 *crc = ~(u32)0;
@@ -630,25 +719,8 @@ int btrfs_write_out_cache(struct btrfs_root *root,
630 719
631 bytes += PAGE_CACHE_SIZE; 720 bytes += PAGE_CACHE_SIZE;
632 721
633 ClearPageChecked(page);
634 set_page_extent_mapped(page);
635 SetPageUptodate(page);
636 set_page_dirty(page);
637
638 /*
639 * We need to release our reference we got for grab_cache_page,
640 * except for the first page which will hold our checksums, we
641 * do that below.
642 */
643 if (index != 0) {
644 unlock_page(page);
645 page_cache_release(page);
646 }
647
648 page_cache_release(page);
649
650 index++; 722 index++;
651 } while (node); 723 } while (node || next_page);
652 724
653 /* Write out the bitmaps */ 725 /* Write out the bitmaps */
654 list_for_each_safe(pos, n, &bitmap_list) { 726 list_for_each_safe(pos, n, &bitmap_list) {
@@ -656,7 +728,11 @@ int btrfs_write_out_cache(struct btrfs_root *root,
656 struct btrfs_free_space *entry = 728 struct btrfs_free_space *entry =
657 list_entry(pos, struct btrfs_free_space, list); 729 list_entry(pos, struct btrfs_free_space, list);
658 730
659 page = find_get_page(inode->i_mapping, index); 731 if (index >= num_pages) {
732 out_of_space = true;
733 break;
734 }
735 page = pages[index];
660 736
661 addr = kmap(page); 737 addr = kmap(page);
662 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 738 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
@@ -667,64 +743,58 @@ int btrfs_write_out_cache(struct btrfs_root *root,
667 crc++; 743 crc++;
668 bytes += PAGE_CACHE_SIZE; 744 bytes += PAGE_CACHE_SIZE;
669 745
670 ClearPageChecked(page);
671 set_page_extent_mapped(page);
672 SetPageUptodate(page);
673 set_page_dirty(page);
674 unlock_page(page);
675 page_cache_release(page);
676 page_cache_release(page);
677 list_del_init(&entry->list); 746 list_del_init(&entry->list);
678 index++; 747 index++;
679 } 748 }
680 749
750 if (out_of_space) {
751 btrfs_drop_pages(pages, num_pages);
752 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
753 i_size_read(inode) - 1, &cached_state,
754 GFP_NOFS);
755 ret = 0;
756 goto out_free;
757 }
758
681 /* Zero out the rest of the pages just to make sure */ 759 /* Zero out the rest of the pages just to make sure */
682 while (index <= last_index) { 760 while (index < num_pages) {
683 void *addr; 761 void *addr;
684 762
685 page = find_get_page(inode->i_mapping, index); 763 page = pages[index];
686
687 addr = kmap(page); 764 addr = kmap(page);
688 memset(addr, 0, PAGE_CACHE_SIZE); 765 memset(addr, 0, PAGE_CACHE_SIZE);
689 kunmap(page); 766 kunmap(page);
690 ClearPageChecked(page);
691 set_page_extent_mapped(page);
692 SetPageUptodate(page);
693 set_page_dirty(page);
694 unlock_page(page);
695 page_cache_release(page);
696 page_cache_release(page);
697 bytes += PAGE_CACHE_SIZE; 767 bytes += PAGE_CACHE_SIZE;
698 index++; 768 index++;
699 } 769 }
700 770
701 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
702
703 /* Write the checksums and trans id to the first page */ 771 /* Write the checksums and trans id to the first page */
704 { 772 {
705 void *addr; 773 void *addr;
706 u64 *gen; 774 u64 *gen;
707 775
708 page = find_get_page(inode->i_mapping, 0); 776 page = pages[0];
709 777
710 addr = kmap(page); 778 addr = kmap(page);
711 memcpy(addr, checksums, sizeof(u32) * num_checksums); 779 memcpy(addr, checksums, sizeof(u32) * num_pages);
712 gen = addr + (sizeof(u32) * num_checksums); 780 gen = addr + (sizeof(u32) * num_pages);
713 *gen = trans->transid; 781 *gen = trans->transid;
714 kunmap(page); 782 kunmap(page);
715 ClearPageChecked(page);
716 set_page_extent_mapped(page);
717 SetPageUptodate(page);
718 set_page_dirty(page);
719 unlock_page(page);
720 page_cache_release(page);
721 page_cache_release(page);
722 } 783 }
723 BTRFS_I(inode)->generation = trans->transid;
724 784
785 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
786 bytes, &cached_state);
787 btrfs_drop_pages(pages, num_pages);
725 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 788 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
726 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 789 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
727 790
791 if (ret) {
792 ret = 0;
793 goto out_free;
794 }
795
796 BTRFS_I(inode)->generation = trans->transid;
797
728 filemap_write_and_wait(inode->i_mapping); 798 filemap_write_and_wait(inode->i_mapping);
729 799
730 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 800 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -775,6 +845,7 @@ out_free:
775 BTRFS_I(inode)->generation = 0; 845 BTRFS_I(inode)->generation = 0;
776 } 846 }
777 kfree(checksums); 847 kfree(checksums);
848 kfree(pages);
778 btrfs_update_inode(trans, root, inode); 849 btrfs_update_inode(trans, root, inode);
779 iput(inode); 850 iput(inode);
780 return ret; 851 return ret;
@@ -987,11 +1058,18 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
987 return entry; 1058 return entry;
988} 1059}
989 1060
990static void unlink_free_space(struct btrfs_block_group_cache *block_group, 1061static inline void
991 struct btrfs_free_space *info) 1062__unlink_free_space(struct btrfs_block_group_cache *block_group,
1063 struct btrfs_free_space *info)
992{ 1064{
993 rb_erase(&info->offset_index, &block_group->free_space_offset); 1065 rb_erase(&info->offset_index, &block_group->free_space_offset);
994 block_group->free_extents--; 1066 block_group->free_extents--;
1067}
1068
1069static void unlink_free_space(struct btrfs_block_group_cache *block_group,
1070 struct btrfs_free_space *info)
1071{
1072 __unlink_free_space(block_group, info);
995 block_group->free_space -= info->bytes; 1073 block_group->free_space -= info->bytes;
996} 1074}
997 1075
@@ -1016,14 +1094,18 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
1016 u64 max_bytes; 1094 u64 max_bytes;
1017 u64 bitmap_bytes; 1095 u64 bitmap_bytes;
1018 u64 extent_bytes; 1096 u64 extent_bytes;
1097 u64 size = block_group->key.offset;
1019 1098
1020 /* 1099 /*
1021 * The goal is to keep the total amount of memory used per 1gb of space 1100 * The goal is to keep the total amount of memory used per 1gb of space
1022 * at or below 32k, so we need to adjust how much memory we allow to be 1101 * at or below 32k, so we need to adjust how much memory we allow to be
1023 * used by extent based free space tracking 1102 * used by extent based free space tracking
1024 */ 1103 */
1025 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1104 if (size < 1024 * 1024 * 1024)
1026 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1105 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1106 else
1107 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1108 div64_u64(size, 1024 * 1024 * 1024);
1027 1109
1028 /* 1110 /*
1029 * we want to account for 1 more bitmap than what we have so we can make 1111 * we want to account for 1 more bitmap than what we have so we can make
@@ -1171,6 +1253,16 @@ static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
1171 recalculate_thresholds(block_group); 1253 recalculate_thresholds(block_group);
1172} 1254}
1173 1255
1256static void free_bitmap(struct btrfs_block_group_cache *block_group,
1257 struct btrfs_free_space *bitmap_info)
1258{
1259 unlink_free_space(block_group, bitmap_info);
1260 kfree(bitmap_info->bitmap);
1261 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1262 block_group->total_bitmaps--;
1263 recalculate_thresholds(block_group);
1264}
1265
1174static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1266static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
1175 struct btrfs_free_space *bitmap_info, 1267 struct btrfs_free_space *bitmap_info,
1176 u64 *offset, u64 *bytes) 1268 u64 *offset, u64 *bytes)
@@ -1195,6 +1287,7 @@ again:
1195 */ 1287 */
1196 search_start = *offset; 1288 search_start = *offset;
1197 search_bytes = *bytes; 1289 search_bytes = *bytes;
1290 search_bytes = min(search_bytes, end - search_start + 1);
1198 ret = search_bitmap(block_group, bitmap_info, &search_start, 1291 ret = search_bitmap(block_group, bitmap_info, &search_start,
1199 &search_bytes); 1292 &search_bytes);
1200 BUG_ON(ret < 0 || search_start != *offset); 1293 BUG_ON(ret < 0 || search_start != *offset);
@@ -1211,13 +1304,8 @@ again:
1211 1304
1212 if (*bytes) { 1305 if (*bytes) {
1213 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1306 struct rb_node *next = rb_next(&bitmap_info->offset_index);
1214 if (!bitmap_info->bytes) { 1307 if (!bitmap_info->bytes)
1215 unlink_free_space(block_group, bitmap_info); 1308 free_bitmap(block_group, bitmap_info);
1216 kfree(bitmap_info->bitmap);
1217 kfree(bitmap_info);
1218 block_group->total_bitmaps--;
1219 recalculate_thresholds(block_group);
1220 }
1221 1309
1222 /* 1310 /*
1223 * no entry after this bitmap, but we still have bytes to 1311 * no entry after this bitmap, but we still have bytes to
@@ -1250,13 +1338,8 @@ again:
1250 return -EAGAIN; 1338 return -EAGAIN;
1251 1339
1252 goto again; 1340 goto again;
1253 } else if (!bitmap_info->bytes) { 1341 } else if (!bitmap_info->bytes)
1254 unlink_free_space(block_group, bitmap_info); 1342 free_bitmap(block_group, bitmap_info);
1255 kfree(bitmap_info->bitmap);
1256 kfree(bitmap_info);
1257 block_group->total_bitmaps--;
1258 recalculate_thresholds(block_group);
1259 }
1260 1343
1261 return 0; 1344 return 0;
1262} 1345}
@@ -1273,9 +1356,22 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
1273 * If we are below the extents threshold then we can add this as an 1356 * If we are below the extents threshold then we can add this as an
1274 * extent, and don't have to deal with the bitmap 1357 * extent, and don't have to deal with the bitmap
1275 */ 1358 */
1276 if (block_group->free_extents < block_group->extents_thresh && 1359 if (block_group->free_extents < block_group->extents_thresh) {
1277 info->bytes > block_group->sectorsize * 4) 1360 /*
1278 return 0; 1361 * If this block group has some small extents we don't want to
1362 * use up all of our free slots in the cache with them, we want
1363 * to reserve them to larger extents, however if we have plent
1364 * of cache left then go ahead an dadd them, no sense in adding
1365 * the overhead of a bitmap if we don't have to.
1366 */
1367 if (info->bytes <= block_group->sectorsize * 4) {
1368 if (block_group->free_extents * 2 <=
1369 block_group->extents_thresh)
1370 return 0;
1371 } else {
1372 return 0;
1373 }
1374 }
1279 1375
1280 /* 1376 /*
1281 * some block groups are so tiny they can't be enveloped by a bitmap, so 1377 * some block groups are so tiny they can't be enveloped by a bitmap, so
@@ -1330,8 +1426,8 @@ new_bitmap:
1330 1426
1331 /* no pre-allocated info, allocate a new one */ 1427 /* no pre-allocated info, allocate a new one */
1332 if (!info) { 1428 if (!info) {
1333 info = kzalloc(sizeof(struct btrfs_free_space), 1429 info = kmem_cache_zalloc(btrfs_free_space_cachep,
1334 GFP_NOFS); 1430 GFP_NOFS);
1335 if (!info) { 1431 if (!info) {
1336 spin_lock(&block_group->tree_lock); 1432 spin_lock(&block_group->tree_lock);
1337 ret = -ENOMEM; 1433 ret = -ENOMEM;
@@ -1353,28 +1449,20 @@ out:
1353 if (info) { 1449 if (info) {
1354 if (info->bitmap) 1450 if (info->bitmap)
1355 kfree(info->bitmap); 1451 kfree(info->bitmap);
1356 kfree(info); 1452 kmem_cache_free(btrfs_free_space_cachep, info);
1357 } 1453 }
1358 1454
1359 return ret; 1455 return ret;
1360} 1456}
1361 1457
1362int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1458bool try_merge_free_space(struct btrfs_block_group_cache *block_group,
1363 u64 offset, u64 bytes) 1459 struct btrfs_free_space *info, bool update_stat)
1364{ 1460{
1365 struct btrfs_free_space *right_info = NULL; 1461 struct btrfs_free_space *left_info;
1366 struct btrfs_free_space *left_info = NULL; 1462 struct btrfs_free_space *right_info;
1367 struct btrfs_free_space *info = NULL; 1463 bool merged = false;
1368 int ret = 0; 1464 u64 offset = info->offset;
1369 1465 u64 bytes = info->bytes;
1370 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
1371 if (!info)
1372 return -ENOMEM;
1373
1374 info->offset = offset;
1375 info->bytes = bytes;
1376
1377 spin_lock(&block_group->tree_lock);
1378 1466
1379 /* 1467 /*
1380 * first we want to see if there is free space adjacent to the range we 1468 * first we want to see if there is free space adjacent to the range we
@@ -1388,40 +1476,65 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1388 else 1476 else
1389 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1477 left_info = tree_search_offset(block_group, offset - 1, 0, 0);
1390 1478
1391 /*
1392 * If there was no extent directly to the left or right of this new
1393 * extent then we know we're going to have to allocate a new extent, so
1394 * before we do that see if we need to drop this into a bitmap
1395 */
1396 if ((!left_info || left_info->bitmap) &&
1397 (!right_info || right_info->bitmap)) {
1398 ret = insert_into_bitmap(block_group, info);
1399
1400 if (ret < 0) {
1401 goto out;
1402 } else if (ret) {
1403 ret = 0;
1404 goto out;
1405 }
1406 }
1407
1408 if (right_info && !right_info->bitmap) { 1479 if (right_info && !right_info->bitmap) {
1409 unlink_free_space(block_group, right_info); 1480 if (update_stat)
1481 unlink_free_space(block_group, right_info);
1482 else
1483 __unlink_free_space(block_group, right_info);
1410 info->bytes += right_info->bytes; 1484 info->bytes += right_info->bytes;
1411 kfree(right_info); 1485 kmem_cache_free(btrfs_free_space_cachep, right_info);
1486 merged = true;
1412 } 1487 }
1413 1488
1414 if (left_info && !left_info->bitmap && 1489 if (left_info && !left_info->bitmap &&
1415 left_info->offset + left_info->bytes == offset) { 1490 left_info->offset + left_info->bytes == offset) {
1416 unlink_free_space(block_group, left_info); 1491 if (update_stat)
1492 unlink_free_space(block_group, left_info);
1493 else
1494 __unlink_free_space(block_group, left_info);
1417 info->offset = left_info->offset; 1495 info->offset = left_info->offset;
1418 info->bytes += left_info->bytes; 1496 info->bytes += left_info->bytes;
1419 kfree(left_info); 1497 kmem_cache_free(btrfs_free_space_cachep, left_info);
1498 merged = true;
1420 } 1499 }
1421 1500
1501 return merged;
1502}
1503
1504int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
1505 u64 offset, u64 bytes)
1506{
1507 struct btrfs_free_space *info;
1508 int ret = 0;
1509
1510 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1511 if (!info)
1512 return -ENOMEM;
1513
1514 info->offset = offset;
1515 info->bytes = bytes;
1516
1517 spin_lock(&block_group->tree_lock);
1518
1519 if (try_merge_free_space(block_group, info, true))
1520 goto link;
1521
1522 /*
1523 * There was no extent directly to the left or right of this new
1524 * extent then we know we're going to have to allocate a new extent, so
1525 * before we do that see if we need to drop this into a bitmap
1526 */
1527 ret = insert_into_bitmap(block_group, info);
1528 if (ret < 0) {
1529 goto out;
1530 } else if (ret) {
1531 ret = 0;
1532 goto out;
1533 }
1534link:
1422 ret = link_free_space(block_group, info); 1535 ret = link_free_space(block_group, info);
1423 if (ret) 1536 if (ret)
1424 kfree(info); 1537 kmem_cache_free(btrfs_free_space_cachep, info);
1425out: 1538out:
1426 spin_unlock(&block_group->tree_lock); 1539 spin_unlock(&block_group->tree_lock);
1427 1540
@@ -1491,7 +1604,7 @@ again:
1491 kfree(info->bitmap); 1604 kfree(info->bitmap);
1492 block_group->total_bitmaps--; 1605 block_group->total_bitmaps--;
1493 } 1606 }
1494 kfree(info); 1607 kmem_cache_free(btrfs_free_space_cachep, info);
1495 goto out_lock; 1608 goto out_lock;
1496 } 1609 }
1497 1610
@@ -1527,7 +1640,7 @@ again:
1527 /* the hole we're creating ends at the end 1640 /* the hole we're creating ends at the end
1528 * of the info struct, just free the info 1641 * of the info struct, just free the info
1529 */ 1642 */
1530 kfree(info); 1643 kmem_cache_free(btrfs_free_space_cachep, info);
1531 } 1644 }
1532 spin_unlock(&block_group->tree_lock); 1645 spin_unlock(&block_group->tree_lock);
1533 1646
@@ -1600,29 +1713,28 @@ __btrfs_return_cluster_to_free_space(
1600{ 1713{
1601 struct btrfs_free_space *entry; 1714 struct btrfs_free_space *entry;
1602 struct rb_node *node; 1715 struct rb_node *node;
1603 bool bitmap;
1604 1716
1605 spin_lock(&cluster->lock); 1717 spin_lock(&cluster->lock);
1606 if (cluster->block_group != block_group) 1718 if (cluster->block_group != block_group)
1607 goto out; 1719 goto out;
1608 1720
1609 bitmap = cluster->points_to_bitmap;
1610 cluster->block_group = NULL; 1721 cluster->block_group = NULL;
1611 cluster->window_start = 0; 1722 cluster->window_start = 0;
1612 list_del_init(&cluster->block_group_list); 1723 list_del_init(&cluster->block_group_list);
1613 cluster->points_to_bitmap = false;
1614
1615 if (bitmap)
1616 goto out;
1617 1724
1618 node = rb_first(&cluster->root); 1725 node = rb_first(&cluster->root);
1619 while (node) { 1726 while (node) {
1727 bool bitmap;
1728
1620 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1729 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1621 node = rb_next(&entry->offset_index); 1730 node = rb_next(&entry->offset_index);
1622 rb_erase(&entry->offset_index, &cluster->root); 1731 rb_erase(&entry->offset_index, &cluster->root);
1623 BUG_ON(entry->bitmap); 1732
1733 bitmap = (entry->bitmap != NULL);
1734 if (!bitmap)
1735 try_merge_free_space(block_group, entry, false);
1624 tree_insert_offset(&block_group->free_space_offset, 1736 tree_insert_offset(&block_group->free_space_offset,
1625 entry->offset, &entry->offset_index, 0); 1737 entry->offset, &entry->offset_index, bitmap);
1626 } 1738 }
1627 cluster->root = RB_ROOT; 1739 cluster->root = RB_ROOT;
1628 1740
@@ -1659,7 +1771,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1659 unlink_free_space(block_group, info); 1771 unlink_free_space(block_group, info);
1660 if (info->bitmap) 1772 if (info->bitmap)
1661 kfree(info->bitmap); 1773 kfree(info->bitmap);
1662 kfree(info); 1774 kmem_cache_free(btrfs_free_space_cachep, info);
1663 if (need_resched()) { 1775 if (need_resched()) {
1664 spin_unlock(&block_group->tree_lock); 1776 spin_unlock(&block_group->tree_lock);
1665 cond_resched(); 1777 cond_resched();
@@ -1685,19 +1797,14 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
1685 ret = offset; 1797 ret = offset;
1686 if (entry->bitmap) { 1798 if (entry->bitmap) {
1687 bitmap_clear_bits(block_group, entry, offset, bytes); 1799 bitmap_clear_bits(block_group, entry, offset, bytes);
1688 if (!entry->bytes) { 1800 if (!entry->bytes)
1689 unlink_free_space(block_group, entry); 1801 free_bitmap(block_group, entry);
1690 kfree(entry->bitmap);
1691 kfree(entry);
1692 block_group->total_bitmaps--;
1693 recalculate_thresholds(block_group);
1694 }
1695 } else { 1802 } else {
1696 unlink_free_space(block_group, entry); 1803 unlink_free_space(block_group, entry);
1697 entry->offset += bytes; 1804 entry->offset += bytes;
1698 entry->bytes -= bytes; 1805 entry->bytes -= bytes;
1699 if (!entry->bytes) 1806 if (!entry->bytes)
1700 kfree(entry); 1807 kmem_cache_free(btrfs_free_space_cachep, entry);
1701 else 1808 else
1702 link_free_space(block_group, entry); 1809 link_free_space(block_group, entry);
1703 } 1810 }
@@ -1750,48 +1857,24 @@ int btrfs_return_cluster_to_free_space(
1750 1857
1751static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 1858static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
1752 struct btrfs_free_cluster *cluster, 1859 struct btrfs_free_cluster *cluster,
1860 struct btrfs_free_space *entry,
1753 u64 bytes, u64 min_start) 1861 u64 bytes, u64 min_start)
1754{ 1862{
1755 struct btrfs_free_space *entry;
1756 int err; 1863 int err;
1757 u64 search_start = cluster->window_start; 1864 u64 search_start = cluster->window_start;
1758 u64 search_bytes = bytes; 1865 u64 search_bytes = bytes;
1759 u64 ret = 0; 1866 u64 ret = 0;
1760 1867
1761 spin_lock(&block_group->tree_lock);
1762 spin_lock(&cluster->lock);
1763
1764 if (!cluster->points_to_bitmap)
1765 goto out;
1766
1767 if (cluster->block_group != block_group)
1768 goto out;
1769
1770 /*
1771 * search_start is the beginning of the bitmap, but at some point it may
1772 * be a good idea to point to the actual start of the free area in the
1773 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1774 * to 1 to make sure we get the bitmap entry
1775 */
1776 entry = tree_search_offset(block_group,
1777 offset_to_bitmap(block_group, search_start),
1778 1, 0);
1779 if (!entry || !entry->bitmap)
1780 goto out;
1781
1782 search_start = min_start; 1868 search_start = min_start;
1783 search_bytes = bytes; 1869 search_bytes = bytes;
1784 1870
1785 err = search_bitmap(block_group, entry, &search_start, 1871 err = search_bitmap(block_group, entry, &search_start,
1786 &search_bytes); 1872 &search_bytes);
1787 if (err) 1873 if (err)
1788 goto out; 1874 return 0;
1789 1875
1790 ret = search_start; 1876 ret = search_start;
1791 bitmap_clear_bits(block_group, entry, ret, bytes); 1877 bitmap_clear_bits(block_group, entry, ret, bytes);
1792out:
1793 spin_unlock(&cluster->lock);
1794 spin_unlock(&block_group->tree_lock);
1795 1878
1796 return ret; 1879 return ret;
1797} 1880}
@@ -1809,10 +1892,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1809 struct rb_node *node; 1892 struct rb_node *node;
1810 u64 ret = 0; 1893 u64 ret = 0;
1811 1894
1812 if (cluster->points_to_bitmap)
1813 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1814 min_start);
1815
1816 spin_lock(&cluster->lock); 1895 spin_lock(&cluster->lock);
1817 if (bytes > cluster->max_size) 1896 if (bytes > cluster->max_size)
1818 goto out; 1897 goto out;
@@ -1825,9 +1904,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1825 goto out; 1904 goto out;
1826 1905
1827 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1906 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1828
1829 while(1) { 1907 while(1) {
1830 if (entry->bytes < bytes || entry->offset < min_start) { 1908 if (entry->bytes < bytes ||
1909 (!entry->bitmap && entry->offset < min_start)) {
1831 struct rb_node *node; 1910 struct rb_node *node;
1832 1911
1833 node = rb_next(&entry->offset_index); 1912 node = rb_next(&entry->offset_index);
@@ -1837,20 +1916,53 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1837 offset_index); 1916 offset_index);
1838 continue; 1917 continue;
1839 } 1918 }
1840 ret = entry->offset;
1841 1919
1842 entry->offset += bytes; 1920 if (entry->bitmap) {
1843 entry->bytes -= bytes; 1921 ret = btrfs_alloc_from_bitmap(block_group,
1922 cluster, entry, bytes,
1923 min_start);
1924 if (ret == 0) {
1925 struct rb_node *node;
1926 node = rb_next(&entry->offset_index);
1927 if (!node)
1928 break;
1929 entry = rb_entry(node, struct btrfs_free_space,
1930 offset_index);
1931 continue;
1932 }
1933 } else {
1844 1934
1845 if (entry->bytes == 0) { 1935 ret = entry->offset;
1846 rb_erase(&entry->offset_index, &cluster->root); 1936
1847 kfree(entry); 1937 entry->offset += bytes;
1938 entry->bytes -= bytes;
1848 } 1939 }
1940
1941 if (entry->bytes == 0)
1942 rb_erase(&entry->offset_index, &cluster->root);
1849 break; 1943 break;
1850 } 1944 }
1851out: 1945out:
1852 spin_unlock(&cluster->lock); 1946 spin_unlock(&cluster->lock);
1853 1947
1948 if (!ret)
1949 return 0;
1950
1951 spin_lock(&block_group->tree_lock);
1952
1953 block_group->free_space -= bytes;
1954 if (entry->bytes == 0) {
1955 block_group->free_extents--;
1956 if (entry->bitmap) {
1957 kfree(entry->bitmap);
1958 block_group->total_bitmaps--;
1959 recalculate_thresholds(block_group);
1960 }
1961 kmem_cache_free(btrfs_free_space_cachep, entry);
1962 }
1963
1964 spin_unlock(&block_group->tree_lock);
1965
1854 return ret; 1966 return ret;
1855} 1967}
1856 1968
@@ -1866,12 +1978,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1866 unsigned long found_bits; 1978 unsigned long found_bits;
1867 unsigned long start = 0; 1979 unsigned long start = 0;
1868 unsigned long total_found = 0; 1980 unsigned long total_found = 0;
1981 int ret;
1869 bool found = false; 1982 bool found = false;
1870 1983
1871 i = offset_to_bit(entry->offset, block_group->sectorsize, 1984 i = offset_to_bit(entry->offset, block_group->sectorsize,
1872 max_t(u64, offset, entry->offset)); 1985 max_t(u64, offset, entry->offset));
1873 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 1986 search_bits = bytes_to_bits(bytes, block_group->sectorsize);
1874 total_bits = bytes_to_bits(bytes, block_group->sectorsize); 1987 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1875 1988
1876again: 1989again:
1877 found_bits = 0; 1990 found_bits = 0;
@@ -1888,7 +2001,7 @@ again:
1888 } 2001 }
1889 2002
1890 if (!found_bits) 2003 if (!found_bits)
1891 return -1; 2004 return -ENOSPC;
1892 2005
1893 if (!found) { 2006 if (!found) {
1894 start = i; 2007 start = i;
@@ -1912,189 +2025,208 @@ again:
1912 2025
1913 cluster->window_start = start * block_group->sectorsize + 2026 cluster->window_start = start * block_group->sectorsize +
1914 entry->offset; 2027 entry->offset;
1915 cluster->points_to_bitmap = true; 2028 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2029 ret = tree_insert_offset(&cluster->root, entry->offset,
2030 &entry->offset_index, 1);
2031 BUG_ON(ret);
1916 2032
1917 return 0; 2033 return 0;
1918} 2034}
1919 2035
1920/* 2036/*
1921 * here we try to find a cluster of blocks in a block group. The goal 2037 * This searches the block group for just extents to fill the cluster with.
1922 * is to find at least bytes free and up to empty_size + bytes free.
1923 * We might not find them all in one contiguous area.
1924 *
1925 * returns zero and sets up cluster if things worked out, otherwise
1926 * it returns -enospc
1927 */ 2038 */
1928int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 2039static int setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
1929 struct btrfs_root *root, 2040 struct btrfs_free_cluster *cluster,
1930 struct btrfs_block_group_cache *block_group, 2041 u64 offset, u64 bytes, u64 min_bytes)
1931 struct btrfs_free_cluster *cluster,
1932 u64 offset, u64 bytes, u64 empty_size)
1933{ 2042{
2043 struct btrfs_free_space *first = NULL;
1934 struct btrfs_free_space *entry = NULL; 2044 struct btrfs_free_space *entry = NULL;
2045 struct btrfs_free_space *prev = NULL;
2046 struct btrfs_free_space *last;
1935 struct rb_node *node; 2047 struct rb_node *node;
1936 struct btrfs_free_space *next;
1937 struct btrfs_free_space *last = NULL;
1938 u64 min_bytes;
1939 u64 window_start; 2048 u64 window_start;
1940 u64 window_free; 2049 u64 window_free;
1941 u64 max_extent = 0; 2050 u64 max_extent;
1942 bool found_bitmap = false; 2051 u64 max_gap = 128 * 1024;
1943 int ret;
1944 2052
1945 /* for metadata, allow allocates with more holes */ 2053 entry = tree_search_offset(block_group, offset, 0, 1);
1946 if (btrfs_test_opt(root, SSD_SPREAD)) { 2054 if (!entry)
1947 min_bytes = bytes + empty_size; 2055 return -ENOSPC;
1948 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1949 /*
1950 * we want to do larger allocations when we are
1951 * flushing out the delayed refs, it helps prevent
1952 * making more work as we go along.
1953 */
1954 if (trans->transaction->delayed_refs.flushing)
1955 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1956 else
1957 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1958 } else
1959 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1960
1961 spin_lock(&block_group->tree_lock);
1962 spin_lock(&cluster->lock);
1963
1964 /* someone already found a cluster, hooray */
1965 if (cluster->block_group) {
1966 ret = 0;
1967 goto out;
1968 }
1969again:
1970 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1971 if (!entry) {
1972 ret = -ENOSPC;
1973 goto out;
1974 }
1975 2056
1976 /* 2057 /*
1977 * If found_bitmap is true, we exhausted our search for extent entries, 2058 * We don't want bitmaps, so just move along until we find a normal
1978 * and we just want to search all of the bitmaps that we can find, and 2059 * extent entry.
1979 * ignore any extent entries we find.
1980 */ 2060 */
1981 while (entry->bitmap || found_bitmap || 2061 while (entry->bitmap) {
1982 (!entry->bitmap && entry->bytes < min_bytes)) { 2062 node = rb_next(&entry->offset_index);
1983 struct rb_node *node = rb_next(&entry->offset_index); 2063 if (!node)
1984 2064 return -ENOSPC;
1985 if (entry->bitmap && entry->bytes > bytes + empty_size) {
1986 ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1987 offset, bytes + empty_size,
1988 min_bytes);
1989 if (!ret)
1990 goto got_it;
1991 }
1992
1993 if (!node) {
1994 ret = -ENOSPC;
1995 goto out;
1996 }
1997 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2065 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1998 } 2066 }
1999 2067
2000 /*
2001 * We already searched all the extent entries from the passed in offset
2002 * to the end and didn't find enough space for the cluster, and we also
2003 * didn't find any bitmaps that met our criteria, just go ahead and exit
2004 */
2005 if (found_bitmap) {
2006 ret = -ENOSPC;
2007 goto out;
2008 }
2009
2010 cluster->points_to_bitmap = false;
2011 window_start = entry->offset; 2068 window_start = entry->offset;
2012 window_free = entry->bytes; 2069 window_free = entry->bytes;
2013 last = entry;
2014 max_extent = entry->bytes; 2070 max_extent = entry->bytes;
2071 first = entry;
2072 last = entry;
2073 prev = entry;
2015 2074
2016 while (1) { 2075 while (window_free <= min_bytes) {
2017 /* out window is just right, lets fill it */ 2076 node = rb_next(&entry->offset_index);
2018 if (window_free >= bytes + empty_size) 2077 if (!node)
2019 break; 2078 return -ENOSPC;
2020 2079 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2021 node = rb_next(&last->offset_index);
2022 if (!node) {
2023 if (found_bitmap)
2024 goto again;
2025 ret = -ENOSPC;
2026 goto out;
2027 }
2028 next = rb_entry(node, struct btrfs_free_space, offset_index);
2029 2080
2030 /* 2081 if (entry->bitmap)
2031 * we found a bitmap, so if this search doesn't result in a
2032 * cluster, we know to go and search again for the bitmaps and
2033 * start looking for space there
2034 */
2035 if (next->bitmap) {
2036 if (!found_bitmap)
2037 offset = next->offset;
2038 found_bitmap = true;
2039 last = next;
2040 continue; 2082 continue;
2041 }
2042
2043 /* 2083 /*
2044 * we haven't filled the empty size and the window is 2084 * we haven't filled the empty size and the window is
2045 * very large. reset and try again 2085 * very large. reset and try again
2046 */ 2086 */
2047 if (next->offset - (last->offset + last->bytes) > 128 * 1024 || 2087 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
2048 next->offset - window_start > (bytes + empty_size) * 2) { 2088 entry->offset - window_start > (min_bytes * 2)) {
2049 entry = next; 2089 first = entry;
2050 window_start = entry->offset; 2090 window_start = entry->offset;
2051 window_free = entry->bytes; 2091 window_free = entry->bytes;
2052 last = entry; 2092 last = entry;
2053 max_extent = entry->bytes; 2093 max_extent = entry->bytes;
2054 } else { 2094 } else {
2055 last = next; 2095 last = entry;
2056 window_free += next->bytes; 2096 window_free += entry->bytes;
2057 if (entry->bytes > max_extent) 2097 if (entry->bytes > max_extent)
2058 max_extent = entry->bytes; 2098 max_extent = entry->bytes;
2059 } 2099 }
2100 prev = entry;
2060 } 2101 }
2061 2102
2062 cluster->window_start = entry->offset; 2103 cluster->window_start = first->offset;
2104
2105 node = &first->offset_index;
2063 2106
2064 /* 2107 /*
2065 * now we've found our entries, pull them out of the free space 2108 * now we've found our entries, pull them out of the free space
2066 * cache and put them into the cluster rbtree 2109 * cache and put them into the cluster rbtree
2067 *
2068 * The cluster includes an rbtree, but only uses the offset index
2069 * of each free space cache entry.
2070 */ 2110 */
2071 while (1) { 2111 do {
2112 int ret;
2113
2114 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2072 node = rb_next(&entry->offset_index); 2115 node = rb_next(&entry->offset_index);
2073 if (entry->bitmap && node) { 2116 if (entry->bitmap)
2074 entry = rb_entry(node, struct btrfs_free_space,
2075 offset_index);
2076 continue; 2117 continue;
2077 } else if (entry->bitmap && !node) {
2078 break;
2079 }
2080 2118
2081 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2119 rb_erase(&entry->offset_index, &block_group->free_space_offset);
2082 ret = tree_insert_offset(&cluster->root, entry->offset, 2120 ret = tree_insert_offset(&cluster->root, entry->offset,
2083 &entry->offset_index, 0); 2121 &entry->offset_index, 0);
2084 BUG_ON(ret); 2122 BUG_ON(ret);
2123 } while (node && entry != last);
2085 2124
2086 if (!node || entry == last) 2125 cluster->max_size = max_extent;
2087 break; 2126
2127 return 0;
2128}
2129
2130/*
2131 * This specifically looks for bitmaps that may work in the cluster, we assume
2132 * that we have already failed to find extents that will work.
2133 */
2134static int setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2135 struct btrfs_free_cluster *cluster,
2136 u64 offset, u64 bytes, u64 min_bytes)
2137{
2138 struct btrfs_free_space *entry;
2139 struct rb_node *node;
2140 int ret = -ENOSPC;
2141
2142 if (block_group->total_bitmaps == 0)
2143 return -ENOSPC;
2144
2145 entry = tree_search_offset(block_group,
2146 offset_to_bitmap(block_group, offset),
2147 0, 1);
2148 if (!entry)
2149 return -ENOSPC;
2088 2150
2151 node = &entry->offset_index;
2152 do {
2089 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2153 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2154 node = rb_next(&entry->offset_index);
2155 if (!entry->bitmap)
2156 continue;
2157 if (entry->bytes < min_bytes)
2158 continue;
2159 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2160 bytes, min_bytes);
2161 } while (ret && node);
2162
2163 return ret;
2164}
2165
2166/*
2167 * here we try to find a cluster of blocks in a block group. The goal
2168 * is to find at least bytes free and up to empty_size + bytes free.
2169 * We might not find them all in one contiguous area.
2170 *
2171 * returns zero and sets up cluster if things worked out, otherwise
2172 * it returns -enospc
2173 */
2174int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2175 struct btrfs_root *root,
2176 struct btrfs_block_group_cache *block_group,
2177 struct btrfs_free_cluster *cluster,
2178 u64 offset, u64 bytes, u64 empty_size)
2179{
2180 u64 min_bytes;
2181 int ret;
2182
2183 /* for metadata, allow allocates with more holes */
2184 if (btrfs_test_opt(root, SSD_SPREAD)) {
2185 min_bytes = bytes + empty_size;
2186 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2187 /*
2188 * we want to do larger allocations when we are
2189 * flushing out the delayed refs, it helps prevent
2190 * making more work as we go along.
2191 */
2192 if (trans->transaction->delayed_refs.flushing)
2193 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2194 else
2195 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2196 } else
2197 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2198
2199 spin_lock(&block_group->tree_lock);
2200
2201 /*
2202 * If we know we don't have enough space to make a cluster don't even
2203 * bother doing all the work to try and find one.
2204 */
2205 if (block_group->free_space < min_bytes) {
2206 spin_unlock(&block_group->tree_lock);
2207 return -ENOSPC;
2090 } 2208 }
2091 2209
2092 cluster->max_size = max_extent; 2210 spin_lock(&cluster->lock);
2093got_it: 2211
2094 ret = 0; 2212 /* someone already found a cluster, hooray */
2095 atomic_inc(&block_group->count); 2213 if (cluster->block_group) {
2096 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2214 ret = 0;
2097 cluster->block_group = block_group; 2215 goto out;
2216 }
2217
2218 ret = setup_cluster_no_bitmap(block_group, cluster, offset, bytes,
2219 min_bytes);
2220 if (ret)
2221 ret = setup_cluster_bitmap(block_group, cluster, offset,
2222 bytes, min_bytes);
2223
2224 if (!ret) {
2225 atomic_inc(&block_group->count);
2226 list_add_tail(&cluster->block_group_list,
2227 &block_group->cluster_list);
2228 cluster->block_group = block_group;
2229 }
2098out: 2230out:
2099 spin_unlock(&cluster->lock); 2231 spin_unlock(&cluster->lock);
2100 spin_unlock(&block_group->tree_lock); 2232 spin_unlock(&block_group->tree_lock);
@@ -2111,8 +2243,99 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2111 spin_lock_init(&cluster->refill_lock); 2243 spin_lock_init(&cluster->refill_lock);
2112 cluster->root = RB_ROOT; 2244 cluster->root = RB_ROOT;
2113 cluster->max_size = 0; 2245 cluster->max_size = 0;
2114 cluster->points_to_bitmap = false;
2115 INIT_LIST_HEAD(&cluster->block_group_list); 2246 INIT_LIST_HEAD(&cluster->block_group_list);
2116 cluster->block_group = NULL; 2247 cluster->block_group = NULL;
2117} 2248}
2118 2249
2250int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2251 u64 *trimmed, u64 start, u64 end, u64 minlen)
2252{
2253 struct btrfs_free_space *entry = NULL;
2254 struct btrfs_fs_info *fs_info = block_group->fs_info;
2255 u64 bytes = 0;
2256 u64 actually_trimmed;
2257 int ret = 0;
2258
2259 *trimmed = 0;
2260
2261 while (start < end) {
2262 spin_lock(&block_group->tree_lock);
2263
2264 if (block_group->free_space < minlen) {
2265 spin_unlock(&block_group->tree_lock);
2266 break;
2267 }
2268
2269 entry = tree_search_offset(block_group, start, 0, 1);
2270 if (!entry)
2271 entry = tree_search_offset(block_group,
2272 offset_to_bitmap(block_group,
2273 start),
2274 1, 1);
2275
2276 if (!entry || entry->offset >= end) {
2277 spin_unlock(&block_group->tree_lock);
2278 break;
2279 }
2280
2281 if (entry->bitmap) {
2282 ret = search_bitmap(block_group, entry, &start, &bytes);
2283 if (!ret) {
2284 if (start >= end) {
2285 spin_unlock(&block_group->tree_lock);
2286 break;
2287 }
2288 bytes = min(bytes, end - start);
2289 bitmap_clear_bits(block_group, entry,
2290 start, bytes);
2291 if (entry->bytes == 0)
2292 free_bitmap(block_group, entry);
2293 } else {
2294 start = entry->offset + BITS_PER_BITMAP *
2295 block_group->sectorsize;
2296 spin_unlock(&block_group->tree_lock);
2297 ret = 0;
2298 continue;
2299 }
2300 } else {
2301 start = entry->offset;
2302 bytes = min(entry->bytes, end - start);
2303 unlink_free_space(block_group, entry);
2304 kfree(entry);
2305 }
2306
2307 spin_unlock(&block_group->tree_lock);
2308
2309 if (bytes >= minlen) {
2310 int update_ret;
2311 update_ret = btrfs_update_reserved_bytes(block_group,
2312 bytes, 1, 1);
2313
2314 ret = btrfs_error_discard_extent(fs_info->extent_root,
2315 start,
2316 bytes,
2317 &actually_trimmed);
2318
2319 btrfs_add_free_space(block_group,
2320 start, bytes);
2321 if (!update_ret)
2322 btrfs_update_reserved_bytes(block_group,
2323 bytes, 0, 1);
2324
2325 if (ret)
2326 break;
2327 *trimmed += actually_trimmed;
2328 }
2329 start += bytes;
2330 bytes = 0;
2331
2332 if (fatal_signal_pending(current)) {
2333 ret = -ERESTARTSYS;
2334 break;
2335 }
2336
2337 cond_resched();
2338 }
2339
2340 return ret;
2341}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index e49ca5c321b5..65c3b935289f 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -68,4 +68,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
68int btrfs_return_cluster_to_free_space( 68int btrfs_return_cluster_to_free_space(
69 struct btrfs_block_group_cache *block_group, 69 struct btrfs_block_group_cache *block_group,
70 struct btrfs_free_cluster *cluster); 70 struct btrfs_free_cluster *cluster);
71int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
72 u64 *trimmed, u64 start, u64 end, u64 minlen);
71#endif 73#endif
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..c05a08f4c411 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -30,7 +30,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
30 int slot; 30 int slot;
31 31
32 path = btrfs_alloc_path(); 32 path = btrfs_alloc_path();
33 BUG_ON(!path); 33 if (!path)
34 return -ENOMEM;
34 35
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 36 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1; 37 search_key.type = -1;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 160b55b3e132..fcd66b6a8086 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -50,6 +50,7 @@
50#include "tree-log.h" 50#include "tree-log.h"
51#include "compression.h" 51#include "compression.h"
52#include "locking.h" 52#include "locking.h"
53#include "free-space-cache.h"
53 54
54struct btrfs_iget_args { 55struct btrfs_iget_args {
55 u64 ino; 56 u64 ino;
@@ -70,6 +71,7 @@ static struct kmem_cache *btrfs_inode_cachep;
70struct kmem_cache *btrfs_trans_handle_cachep; 71struct kmem_cache *btrfs_trans_handle_cachep;
71struct kmem_cache *btrfs_transaction_cachep; 72struct kmem_cache *btrfs_transaction_cachep;
72struct kmem_cache *btrfs_path_cachep; 73struct kmem_cache *btrfs_path_cachep;
74struct kmem_cache *btrfs_free_space_cachep;
73 75
74#define S_SHIFT 12 76#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 77static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +84,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 84 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83}; 85};
84 86
85static void btrfs_truncate(struct inode *inode); 87static int btrfs_setsize(struct inode *inode, loff_t newsize);
88static int btrfs_truncate(struct inode *inode);
86static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 89static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
87static noinline int cow_file_range(struct inode *inode, 90static noinline int cow_file_range(struct inode *inode,
88 struct page *locked_page, 91 struct page *locked_page,
@@ -90,13 +93,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 93 unsigned long *nr_written, int unlock);
91 94
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 95static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 96 struct inode *inode, struct inode *dir,
97 const struct qstr *qstr)
94{ 98{
95 int err; 99 int err;
96 100
97 err = btrfs_init_acl(trans, inode, dir); 101 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 102 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 103 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 104 return err;
101} 105}
102 106
@@ -108,6 +112,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
108static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 112static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root, struct inode *inode, 113 struct btrfs_root *root, struct inode *inode,
110 u64 start, size_t size, size_t compressed_size, 114 u64 start, size_t size, size_t compressed_size,
115 int compress_type,
111 struct page **compressed_pages) 116 struct page **compressed_pages)
112{ 117{
113 struct btrfs_key key; 118 struct btrfs_key key;
@@ -122,12 +127,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 127 size_t cur_size = size;
123 size_t datasize; 128 size_t datasize;
124 unsigned long offset; 129 unsigned long offset;
125 int compress_type = BTRFS_COMPRESS_NONE;
126 130
127 if (compressed_size && compressed_pages) { 131 if (compressed_size && compressed_pages)
128 compress_type = root->fs_info->compress_type;
129 cur_size = compressed_size; 132 cur_size = compressed_size;
130 }
131 133
132 path = btrfs_alloc_path(); 134 path = btrfs_alloc_path();
133 if (!path) 135 if (!path)
@@ -217,7 +219,7 @@ fail:
217static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 219static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
218 struct btrfs_root *root, 220 struct btrfs_root *root,
219 struct inode *inode, u64 start, u64 end, 221 struct inode *inode, u64 start, u64 end,
220 size_t compressed_size, 222 size_t compressed_size, int compress_type,
221 struct page **compressed_pages) 223 struct page **compressed_pages)
222{ 224{
223 u64 isize = i_size_read(inode); 225 u64 isize = i_size_read(inode);
@@ -250,7 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
250 inline_len = min_t(u64, isize, actual_end); 252 inline_len = min_t(u64, isize, actual_end);
251 ret = insert_inline_extent(trans, root, inode, start, 253 ret = insert_inline_extent(trans, root, inode, start,
252 inline_len, compressed_size, 254 inline_len, compressed_size,
253 compressed_pages); 255 compress_type, compressed_pages);
254 BUG_ON(ret); 256 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start); 257 btrfs_delalloc_release_metadata(inode, end + 1 - start);
256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 258 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -287,6 +289,7 @@ static noinline int add_async_extent(struct async_cow *cow,
287 struct async_extent *async_extent; 289 struct async_extent *async_extent;
288 290
289 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 291 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
292 BUG_ON(!async_extent);
290 async_extent->start = start; 293 async_extent->start = start;
291 async_extent->ram_size = ram_size; 294 async_extent->ram_size = ram_size;
292 async_extent->compressed_size = compressed_size; 295 async_extent->compressed_size = compressed_size;
@@ -381,9 +384,11 @@ again:
381 */ 384 */
382 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 385 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
383 (btrfs_test_opt(root, COMPRESS) || 386 (btrfs_test_opt(root, COMPRESS) ||
384 (BTRFS_I(inode)->force_compress))) { 387 (BTRFS_I(inode)->force_compress) ||
388 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
385 WARN_ON(pages); 389 WARN_ON(pages);
386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 390 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
391 BUG_ON(!pages);
387 392
388 if (BTRFS_I(inode)->force_compress) 393 if (BTRFS_I(inode)->force_compress)
389 compress_type = BTRFS_I(inode)->force_compress; 394 compress_type = BTRFS_I(inode)->force_compress;
@@ -416,7 +421,7 @@ again:
416 } 421 }
417 if (start == 0) { 422 if (start == 0) {
418 trans = btrfs_join_transaction(root, 1); 423 trans = btrfs_join_transaction(root, 1);
419 BUG_ON(!trans); 424 BUG_ON(IS_ERR(trans));
420 btrfs_set_trans_block_group(trans, inode); 425 btrfs_set_trans_block_group(trans, inode);
421 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 426 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
422 427
@@ -426,12 +431,13 @@ again:
426 * to make an uncompressed inline extent. 431 * to make an uncompressed inline extent.
427 */ 432 */
428 ret = cow_file_range_inline(trans, root, inode, 433 ret = cow_file_range_inline(trans, root, inode,
429 start, end, 0, NULL); 434 start, end, 0, 0, NULL);
430 } else { 435 } else {
431 /* try making a compressed inline extent */ 436 /* try making a compressed inline extent */
432 ret = cow_file_range_inline(trans, root, inode, 437 ret = cow_file_range_inline(trans, root, inode,
433 start, end, 438 start, end,
434 total_compressed, pages); 439 total_compressed,
440 compress_type, pages);
435 } 441 }
436 if (ret == 0) { 442 if (ret == 0) {
437 /* 443 /*
@@ -612,6 +618,7 @@ retry:
612 GFP_NOFS); 618 GFP_NOFS);
613 619
614 trans = btrfs_join_transaction(root, 1); 620 trans = btrfs_join_transaction(root, 1);
621 BUG_ON(IS_ERR(trans));
615 ret = btrfs_reserve_extent(trans, root, 622 ret = btrfs_reserve_extent(trans, root,
616 async_extent->compressed_size, 623 async_extent->compressed_size,
617 async_extent->compressed_size, 624 async_extent->compressed_size,
@@ -643,6 +650,7 @@ retry:
643 async_extent->ram_size - 1, 0); 650 async_extent->ram_size - 1, 0);
644 651
645 em = alloc_extent_map(GFP_NOFS); 652 em = alloc_extent_map(GFP_NOFS);
653 BUG_ON(!em);
646 em->start = async_extent->start; 654 em->start = async_extent->start;
647 em->len = async_extent->ram_size; 655 em->len = async_extent->ram_size;
648 em->orig_start = em->start; 656 em->orig_start = em->start;
@@ -771,7 +779,7 @@ static noinline int cow_file_range(struct inode *inode,
771 779
772 BUG_ON(root == root->fs_info->tree_root); 780 BUG_ON(root == root->fs_info->tree_root);
773 trans = btrfs_join_transaction(root, 1); 781 trans = btrfs_join_transaction(root, 1);
774 BUG_ON(!trans); 782 BUG_ON(IS_ERR(trans));
775 btrfs_set_trans_block_group(trans, inode); 783 btrfs_set_trans_block_group(trans, inode);
776 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 784 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
777 785
@@ -783,7 +791,7 @@ static noinline int cow_file_range(struct inode *inode,
783 if (start == 0) { 791 if (start == 0) {
784 /* lets try to make an inline extent */ 792 /* lets try to make an inline extent */
785 ret = cow_file_range_inline(trans, root, inode, 793 ret = cow_file_range_inline(trans, root, inode,
786 start, end, 0, NULL); 794 start, end, 0, 0, NULL);
787 if (ret == 0) { 795 if (ret == 0) {
788 extent_clear_unlock_delalloc(inode, 796 extent_clear_unlock_delalloc(inode,
789 &BTRFS_I(inode)->io_tree, 797 &BTRFS_I(inode)->io_tree,
@@ -819,6 +827,7 @@ static noinline int cow_file_range(struct inode *inode,
819 BUG_ON(ret); 827 BUG_ON(ret);
820 828
821 em = alloc_extent_map(GFP_NOFS); 829 em = alloc_extent_map(GFP_NOFS);
830 BUG_ON(!em);
822 em->start = start; 831 em->start = start;
823 em->orig_start = em->start; 832 em->orig_start = em->start;
824 ram_size = ins.offset; 833 ram_size = ins.offset;
@@ -1049,7 +1058,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1049 } else { 1058 } else {
1050 trans = btrfs_join_transaction(root, 1); 1059 trans = btrfs_join_transaction(root, 1);
1051 } 1060 }
1052 BUG_ON(!trans); 1061 BUG_ON(IS_ERR(trans));
1053 1062
1054 cow_start = (u64)-1; 1063 cow_start = (u64)-1;
1055 cur_offset = start; 1064 cur_offset = start;
@@ -1168,6 +1177,7 @@ out_check:
1168 struct extent_map_tree *em_tree; 1177 struct extent_map_tree *em_tree;
1169 em_tree = &BTRFS_I(inode)->extent_tree; 1178 em_tree = &BTRFS_I(inode)->extent_tree;
1170 em = alloc_extent_map(GFP_NOFS); 1179 em = alloc_extent_map(GFP_NOFS);
1180 BUG_ON(!em);
1171 em->start = cur_offset; 1181 em->start = cur_offset;
1172 em->orig_start = em->start; 1182 em->orig_start = em->start;
1173 em->len = num_bytes; 1183 em->len = num_bytes;
@@ -1249,7 +1259,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1249 ret = run_delalloc_nocow(inode, locked_page, start, end, 1259 ret = run_delalloc_nocow(inode, locked_page, start, end,
1250 page_started, 0, nr_written); 1260 page_started, 0, nr_written);
1251 else if (!btrfs_test_opt(root, COMPRESS) && 1261 else if (!btrfs_test_opt(root, COMPRESS) &&
1252 !(BTRFS_I(inode)->force_compress)) 1262 !(BTRFS_I(inode)->force_compress) &&
1263 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1253 ret = cow_file_range(inode, locked_page, start, end, 1264 ret = cow_file_range(inode, locked_page, start, end,
1254 page_started, nr_written, 1); 1265 page_started, nr_written, 1);
1255 else 1266 else
@@ -1456,8 +1467,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1456 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1467 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1457 return btrfs_submit_compressed_read(inode, bio, 1468 return btrfs_submit_compressed_read(inode, bio,
1458 mirror_num, bio_flags); 1469 mirror_num, bio_flags);
1459 } else if (!skip_sum) 1470 } else if (!skip_sum) {
1460 btrfs_lookup_bio_sums(root, inode, bio, NULL); 1471 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1472 if (ret)
1473 return ret;
1474 }
1461 goto mapit; 1475 goto mapit;
1462 } else if (!skip_sum) { 1476 } else if (!skip_sum) {
1463 /* csum items have already been cloned */ 1477 /* csum items have already been cloned */
@@ -1557,6 +1571,7 @@ out:
1557out_page: 1571out_page:
1558 unlock_page(page); 1572 unlock_page(page);
1559 page_cache_release(page); 1573 page_cache_release(page);
1574 kfree(fixup);
1560} 1575}
1561 1576
1562/* 1577/*
@@ -1703,7 +1718,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1703 trans = btrfs_join_transaction_nolock(root, 1); 1718 trans = btrfs_join_transaction_nolock(root, 1);
1704 else 1719 else
1705 trans = btrfs_join_transaction(root, 1); 1720 trans = btrfs_join_transaction(root, 1);
1706 BUG_ON(!trans); 1721 BUG_ON(IS_ERR(trans));
1707 btrfs_set_trans_block_group(trans, inode); 1722 btrfs_set_trans_block_group(trans, inode);
1708 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1723 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1709 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
@@ -1720,6 +1735,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1720 trans = btrfs_join_transaction_nolock(root, 1); 1735 trans = btrfs_join_transaction_nolock(root, 1);
1721 else 1736 else
1722 trans = btrfs_join_transaction(root, 1); 1737 trans = btrfs_join_transaction(root, 1);
1738 BUG_ON(IS_ERR(trans));
1723 btrfs_set_trans_block_group(trans, inode); 1739 btrfs_set_trans_block_group(trans, inode);
1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1740 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1725 1741
@@ -1754,9 +1770,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1754 add_pending_csums(trans, inode, ordered_extent->file_offset, 1770 add_pending_csums(trans, inode, ordered_extent->file_offset,
1755 &ordered_extent->list); 1771 &ordered_extent->list);
1756 1772
1757 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1773 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1758 ret = btrfs_update_inode(trans, root, inode); 1774 if (!ret) {
1759 BUG_ON(ret); 1775 ret = btrfs_update_inode(trans, root, inode);
1776 BUG_ON(ret);
1777 }
1778 ret = 0;
1760out: 1779out:
1761 if (nolock) { 1780 if (nolock) {
1762 if (trans) 1781 if (trans)
@@ -1778,6 +1797,8 @@ out:
1778static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1797static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1779 struct extent_state *state, int uptodate) 1798 struct extent_state *state, int uptodate)
1780{ 1799{
1800 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1801
1781 ClearPagePrivate2(page); 1802 ClearPagePrivate2(page);
1782 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1803 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1783} 1804}
@@ -1888,10 +1909,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1888 else 1909 else
1889 rw = READ; 1910 rw = READ;
1890 1911
1891 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1912 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1892 failrec->last_mirror, 1913 failrec->last_mirror,
1893 failrec->bio_flags, 0); 1914 failrec->bio_flags, 0);
1894 return 0; 1915 return ret;
1895} 1916}
1896 1917
1897/* 1918/*
@@ -1907,7 +1928,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1907 1928
1908 private = 0; 1929 private = 0;
1909 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1930 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1910 (u64)-1, 1, EXTENT_DIRTY)) { 1931 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1911 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1932 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1912 start, &private_failure); 1933 start, &private_failure);
1913 if (ret == 0) { 1934 if (ret == 0) {
@@ -2203,8 +2224,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2203 insert = 1; 2224 insert = 1;
2204#endif 2225#endif
2205 insert = 1; 2226 insert = 1;
2206 } else {
2207 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2208 } 2227 }
2209 2228
2210 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2229 if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2275,7 +2294,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2275 * this cleans up any orphans that may be left on the list from the last use 2294 * this cleans up any orphans that may be left on the list from the last use
2276 * of this root. 2295 * of this root.
2277 */ 2296 */
2278void btrfs_orphan_cleanup(struct btrfs_root *root) 2297int btrfs_orphan_cleanup(struct btrfs_root *root)
2279{ 2298{
2280 struct btrfs_path *path; 2299 struct btrfs_path *path;
2281 struct extent_buffer *leaf; 2300 struct extent_buffer *leaf;
@@ -2285,10 +2304,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2285 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2304 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2286 2305
2287 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2306 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2288 return; 2307 return 0;
2289 2308
2290 path = btrfs_alloc_path(); 2309 path = btrfs_alloc_path();
2291 BUG_ON(!path); 2310 if (!path) {
2311 ret = -ENOMEM;
2312 goto out;
2313 }
2292 path->reada = -1; 2314 path->reada = -1;
2293 2315
2294 key.objectid = BTRFS_ORPHAN_OBJECTID; 2316 key.objectid = BTRFS_ORPHAN_OBJECTID;
@@ -2297,18 +2319,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2297 2319
2298 while (1) { 2320 while (1) {
2299 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2321 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2300 if (ret < 0) { 2322 if (ret < 0)
2301 printk(KERN_ERR "Error searching slot for orphan: %d" 2323 goto out;
2302 "\n", ret);
2303 break;
2304 }
2305 2324
2306 /* 2325 /*
2307 * if ret == 0 means we found what we were searching for, which 2326 * if ret == 0 means we found what we were searching for, which
2308 * is weird, but possible, so only screw with path if we didnt 2327 * is weird, but possible, so only screw with path if we didn't
2309 * find the key and see if we have stuff that matches 2328 * find the key and see if we have stuff that matches
2310 */ 2329 */
2311 if (ret > 0) { 2330 if (ret > 0) {
2331 ret = 0;
2312 if (path->slots[0] == 0) 2332 if (path->slots[0] == 0)
2313 break; 2333 break;
2314 path->slots[0]--; 2334 path->slots[0]--;
@@ -2336,7 +2356,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2336 found_key.type = BTRFS_INODE_ITEM_KEY; 2356 found_key.type = BTRFS_INODE_ITEM_KEY;
2337 found_key.offset = 0; 2357 found_key.offset = 0;
2338 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2358 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2339 BUG_ON(IS_ERR(inode)); 2359 if (IS_ERR(inode)) {
2360 ret = PTR_ERR(inode);
2361 goto out;
2362 }
2340 2363
2341 /* 2364 /*
2342 * add this inode to the orphan list so btrfs_orphan_del does 2365 * add this inode to the orphan list so btrfs_orphan_del does
@@ -2354,6 +2377,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2354 */ 2377 */
2355 if (is_bad_inode(inode)) { 2378 if (is_bad_inode(inode)) {
2356 trans = btrfs_start_transaction(root, 0); 2379 trans = btrfs_start_transaction(root, 0);
2380 if (IS_ERR(trans)) {
2381 ret = PTR_ERR(trans);
2382 goto out;
2383 }
2357 btrfs_orphan_del(trans, inode); 2384 btrfs_orphan_del(trans, inode);
2358 btrfs_end_transaction(trans, root); 2385 btrfs_end_transaction(trans, root);
2359 iput(inode); 2386 iput(inode);
@@ -2362,17 +2389,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2362 2389
2363 /* if we have links, this was a truncate, lets do that */ 2390 /* if we have links, this was a truncate, lets do that */
2364 if (inode->i_nlink) { 2391 if (inode->i_nlink) {
2392 if (!S_ISREG(inode->i_mode)) {
2393 WARN_ON(1);
2394 iput(inode);
2395 continue;
2396 }
2365 nr_truncate++; 2397 nr_truncate++;
2366 btrfs_truncate(inode); 2398 ret = btrfs_truncate(inode);
2367 } else { 2399 } else {
2368 nr_unlink++; 2400 nr_unlink++;
2369 } 2401 }
2370 2402
2371 /* this will do delete_inode and everything for us */ 2403 /* this will do delete_inode and everything for us */
2372 iput(inode); 2404 iput(inode);
2405 if (ret)
2406 goto out;
2373 } 2407 }
2374 btrfs_free_path(path);
2375
2376 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2408 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2377 2409
2378 if (root->orphan_block_rsv) 2410 if (root->orphan_block_rsv)
@@ -2381,13 +2413,20 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2381 2413
2382 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2414 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2383 trans = btrfs_join_transaction(root, 1); 2415 trans = btrfs_join_transaction(root, 1);
2384 btrfs_end_transaction(trans, root); 2416 if (!IS_ERR(trans))
2417 btrfs_end_transaction(trans, root);
2385 } 2418 }
2386 2419
2387 if (nr_unlink) 2420 if (nr_unlink)
2388 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2421 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2389 if (nr_truncate) 2422 if (nr_truncate)
2390 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2423 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2424
2425out:
2426 if (ret)
2427 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2428 btrfs_free_path(path);
2429 return ret;
2391} 2430}
2392 2431
2393/* 2432/*
@@ -2554,6 +2593,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2554 struct btrfs_inode_item *item, 2593 struct btrfs_inode_item *item,
2555 struct inode *inode) 2594 struct inode *inode)
2556{ 2595{
2596 if (!leaf->map_token)
2597 map_private_extent_buffer(leaf, (unsigned long)item,
2598 sizeof(struct btrfs_inode_item),
2599 &leaf->map_token, &leaf->kaddr,
2600 &leaf->map_start, &leaf->map_len,
2601 KM_USER1);
2602
2557 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2603 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2558 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2604 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2559 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2605 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2582,6 +2628,11 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2582 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2628 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2583 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2629 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2584 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2630 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group);
2631
2632 if (leaf->map_token) {
2633 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2634 leaf->map_token = NULL;
2635 }
2585} 2636}
2586 2637
2587/* 2638/*
@@ -2626,10 +2677,10 @@ failed:
2626 * recovery code. It remove a link in a directory with a given name, and 2677 * recovery code. It remove a link in a directory with a given name, and
2627 * also drops the back refs in the inode to the directory 2678 * also drops the back refs in the inode to the directory
2628 */ 2679 */
2629int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2680static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2630 struct btrfs_root *root, 2681 struct btrfs_root *root,
2631 struct inode *dir, struct inode *inode, 2682 struct inode *dir, struct inode *inode,
2632 const char *name, int name_len) 2683 const char *name, int name_len)
2633{ 2684{
2634 struct btrfs_path *path; 2685 struct btrfs_path *path;
2635 int ret = 0; 2686 int ret = 0;
@@ -2641,7 +2692,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2641 path = btrfs_alloc_path(); 2692 path = btrfs_alloc_path();
2642 if (!path) { 2693 if (!path) {
2643 ret = -ENOMEM; 2694 ret = -ENOMEM;
2644 goto err; 2695 goto out;
2645 } 2696 }
2646 2697
2647 path->leave_spinning = 1; 2698 path->leave_spinning = 1;
@@ -2701,12 +2752,25 @@ err:
2701 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2752 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2702 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2753 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2703 btrfs_update_inode(trans, root, dir); 2754 btrfs_update_inode(trans, root, dir);
2704 btrfs_drop_nlink(inode);
2705 ret = btrfs_update_inode(trans, root, inode);
2706out: 2755out:
2707 return ret; 2756 return ret;
2708} 2757}
2709 2758
2759int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2760 struct btrfs_root *root,
2761 struct inode *dir, struct inode *inode,
2762 const char *name, int name_len)
2763{
2764 int ret;
2765 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2766 if (!ret) {
2767 btrfs_drop_nlink(inode);
2768 ret = btrfs_update_inode(trans, root, inode);
2769 }
2770 return ret;
2771}
2772
2773
2710/* helper to check if there is any shared block in the path */ 2774/* helper to check if there is any shared block in the path */
2711static int check_path_shared(struct btrfs_root *root, 2775static int check_path_shared(struct btrfs_root *root,
2712 struct btrfs_path *path) 2776 struct btrfs_path *path)
@@ -2714,9 +2778,10 @@ static int check_path_shared(struct btrfs_root *root,
2714 struct extent_buffer *eb; 2778 struct extent_buffer *eb;
2715 int level; 2779 int level;
2716 u64 refs = 1; 2780 u64 refs = 1;
2717 int uninitialized_var(ret);
2718 2781
2719 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2782 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2783 int ret;
2784
2720 if (!path->nodes[level]) 2785 if (!path->nodes[level])
2721 break; 2786 break;
2722 eb = path->nodes[level]; 2787 eb = path->nodes[level];
@@ -2727,7 +2792,7 @@ static int check_path_shared(struct btrfs_root *root,
2727 if (refs > 1) 2792 if (refs > 1)
2728 return 1; 2793 return 1;
2729 } 2794 }
2730 return ret; /* XXX callers? */ 2795 return 0;
2731} 2796}
2732 2797
2733/* 2798/*
@@ -3527,7 +3592,13 @@ out:
3527 return ret; 3592 return ret;
3528} 3593}
3529 3594
3530int btrfs_cont_expand(struct inode *inode, loff_t size) 3595/*
3596 * This function puts in dummy file extents for the area we're creating a hole
3597 * for. So if we are truncating this file to a larger size we need to insert
3598 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
3599 * the range between oldsize and size
3600 */
3601int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3531{ 3602{
3532 struct btrfs_trans_handle *trans; 3603 struct btrfs_trans_handle *trans;
3533 struct btrfs_root *root = BTRFS_I(inode)->root; 3604 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3535,7 +3606,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3535 struct extent_map *em = NULL; 3606 struct extent_map *em = NULL;
3536 struct extent_state *cached_state = NULL; 3607 struct extent_state *cached_state = NULL;
3537 u64 mask = root->sectorsize - 1; 3608 u64 mask = root->sectorsize - 1;
3538 u64 hole_start = (inode->i_size + mask) & ~mask; 3609 u64 hole_start = (oldsize + mask) & ~mask;
3539 u64 block_end = (size + mask) & ~mask; 3610 u64 block_end = (size + mask) & ~mask;
3540 u64 last_byte; 3611 u64 last_byte;
3541 u64 cur_offset; 3612 u64 cur_offset;
@@ -3580,13 +3651,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3580 err = btrfs_drop_extents(trans, inode, cur_offset, 3651 err = btrfs_drop_extents(trans, inode, cur_offset,
3581 cur_offset + hole_size, 3652 cur_offset + hole_size,
3582 &hint_byte, 1); 3653 &hint_byte, 1);
3583 BUG_ON(err); 3654 if (err)
3655 break;
3584 3656
3585 err = btrfs_insert_file_extent(trans, root, 3657 err = btrfs_insert_file_extent(trans, root,
3586 inode->i_ino, cur_offset, 0, 3658 inode->i_ino, cur_offset, 0,
3587 0, hole_size, 0, hole_size, 3659 0, hole_size, 0, hole_size,
3588 0, 0, 0); 3660 0, 0, 0);
3589 BUG_ON(err); 3661 if (err)
3662 break;
3590 3663
3591 btrfs_drop_extent_cache(inode, hole_start, 3664 btrfs_drop_extent_cache(inode, hole_start,
3592 last_byte - 1, 0); 3665 last_byte - 1, 0);
@@ -3606,81 +3679,41 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3606 return err; 3679 return err;
3607} 3680}
3608 3681
3609static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3682static int btrfs_setsize(struct inode *inode, loff_t newsize)
3610{ 3683{
3611 struct btrfs_root *root = BTRFS_I(inode)->root; 3684 loff_t oldsize = i_size_read(inode);
3612 struct btrfs_trans_handle *trans;
3613 unsigned long nr;
3614 int ret; 3685 int ret;
3615 3686
3616 if (attr->ia_size == inode->i_size) 3687 if (newsize == oldsize)
3617 return 0; 3688 return 0;
3618 3689
3619 if (attr->ia_size > inode->i_size) { 3690 if (newsize > oldsize) {
3620 unsigned long limit; 3691 i_size_write(inode, newsize);
3621 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3692 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3622 if (attr->ia_size > inode->i_sb->s_maxbytes) 3693 truncate_pagecache(inode, oldsize, newsize);
3623 return -EFBIG; 3694 ret = btrfs_cont_expand(inode, oldsize, newsize);
3624 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3625 send_sig(SIGXFSZ, current, 0);
3626 return -EFBIG;
3627 }
3628 }
3629
3630 trans = btrfs_start_transaction(root, 5);
3631 if (IS_ERR(trans))
3632 return PTR_ERR(trans);
3633
3634 btrfs_set_trans_block_group(trans, inode);
3635
3636 ret = btrfs_orphan_add(trans, inode);
3637 BUG_ON(ret);
3638
3639 nr = trans->blocks_used;
3640 btrfs_end_transaction(trans, root);
3641 btrfs_btree_balance_dirty(root, nr);
3642
3643 if (attr->ia_size > inode->i_size) {
3644 ret = btrfs_cont_expand(inode, attr->ia_size);
3645 if (ret) { 3695 if (ret) {
3646 btrfs_truncate(inode); 3696 btrfs_setsize(inode, oldsize);
3647 return ret; 3697 return ret;
3648 } 3698 }
3649 3699
3650 i_size_write(inode, attr->ia_size); 3700 mark_inode_dirty(inode);
3651 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3701 } else {
3652 3702
3653 trans = btrfs_start_transaction(root, 0); 3703 /*
3654 BUG_ON(IS_ERR(trans)); 3704 * We're truncating a file that used to have good data down to
3655 btrfs_set_trans_block_group(trans, inode); 3705 * zero. Make sure it gets into the ordered flush list so that
3656 trans->block_rsv = root->orphan_block_rsv; 3706 * any new writes get down to disk quickly.
3657 BUG_ON(!trans->block_rsv); 3707 */
3708 if (newsize == 0)
3709 BTRFS_I(inode)->ordered_data_close = 1;
3658 3710
3659 ret = btrfs_update_inode(trans, root, inode); 3711 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3660 BUG_ON(ret); 3712 truncate_setsize(inode, newsize);
3661 if (inode->i_nlink > 0) { 3713 ret = btrfs_truncate(inode);
3662 ret = btrfs_orphan_del(trans, inode);
3663 BUG_ON(ret);
3664 }
3665 nr = trans->blocks_used;
3666 btrfs_end_transaction(trans, root);
3667 btrfs_btree_balance_dirty(root, nr);
3668 return 0;
3669 } 3714 }
3670 3715
3671 /* 3716 return ret;
3672 * We're truncating a file that used to have good data down to
3673 * zero. Make sure it gets into the ordered flush list so that
3674 * any new writes get down to disk quickly.
3675 */
3676 if (attr->ia_size == 0)
3677 BTRFS_I(inode)->ordered_data_close = 1;
3678
3679 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3680 ret = vmtruncate(inode, attr->ia_size);
3681 BUG_ON(ret);
3682
3683 return 0;
3684} 3717}
3685 3718
3686static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3719static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
@@ -3697,7 +3730,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3697 return err; 3730 return err;
3698 3731
3699 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3732 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3700 err = btrfs_setattr_size(inode, attr); 3733 err = btrfs_setsize(inode, attr->ia_size);
3701 if (err) 3734 if (err)
3702 return err; 3735 return err;
3703 } 3736 }
@@ -3720,6 +3753,8 @@ void btrfs_evict_inode(struct inode *inode)
3720 unsigned long nr; 3753 unsigned long nr;
3721 int ret; 3754 int ret;
3722 3755
3756 trace_btrfs_inode_evict(inode);
3757
3723 truncate_inode_pages(&inode->i_data, 0); 3758 truncate_inode_pages(&inode->i_data, 0);
3724 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3759 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3725 root == root->fs_info->tree_root)) 3760 root == root->fs_info->tree_root))
@@ -4062,7 +4097,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4062 BTRFS_I(inode)->root = root; 4097 BTRFS_I(inode)->root = root;
4063 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 4098 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4064 btrfs_read_locked_inode(inode); 4099 btrfs_read_locked_inode(inode);
4065
4066 inode_tree_add(inode); 4100 inode_tree_add(inode);
4067 unlock_new_inode(inode); 4101 unlock_new_inode(inode);
4068 if (new) 4102 if (new)
@@ -4134,11 +4168,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4134 } 4168 }
4135 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4169 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4136 4170
4137 if (root != sub_root) { 4171 if (!IS_ERR(inode) && root != sub_root) {
4138 down_read(&root->fs_info->cleanup_work_sem); 4172 down_read(&root->fs_info->cleanup_work_sem);
4139 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4173 if (!(inode->i_sb->s_flags & MS_RDONLY))
4140 btrfs_orphan_cleanup(sub_root); 4174 ret = btrfs_orphan_cleanup(sub_root);
4141 up_read(&root->fs_info->cleanup_work_sem); 4175 up_read(&root->fs_info->cleanup_work_sem);
4176 if (ret)
4177 inode = ERR_PTR(ret);
4142 } 4178 }
4143 4179
4144 return inode; 4180 return inode;
@@ -4186,10 +4222,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4186 struct btrfs_key found_key; 4222 struct btrfs_key found_key;
4187 struct btrfs_path *path; 4223 struct btrfs_path *path;
4188 int ret; 4224 int ret;
4189 u32 nritems;
4190 struct extent_buffer *leaf; 4225 struct extent_buffer *leaf;
4191 int slot; 4226 int slot;
4192 int advance;
4193 unsigned char d_type; 4227 unsigned char d_type;
4194 int over = 0; 4228 int over = 0;
4195 u32 di_cur; 4229 u32 di_cur;
@@ -4232,27 +4266,19 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4232 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4266 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4233 if (ret < 0) 4267 if (ret < 0)
4234 goto err; 4268 goto err;
4235 advance = 0;
4236 4269
4237 while (1) { 4270 while (1) {
4238 leaf = path->nodes[0]; 4271 leaf = path->nodes[0];
4239 nritems = btrfs_header_nritems(leaf);
4240 slot = path->slots[0]; 4272 slot = path->slots[0];
4241 if (advance || slot >= nritems) { 4273 if (slot >= btrfs_header_nritems(leaf)) {
4242 if (slot >= nritems - 1) { 4274 ret = btrfs_next_leaf(root, path);
4243 ret = btrfs_next_leaf(root, path); 4275 if (ret < 0)
4244 if (ret) 4276 goto err;
4245 break; 4277 else if (ret > 0)
4246 leaf = path->nodes[0]; 4278 break;
4247 nritems = btrfs_header_nritems(leaf); 4279 continue;
4248 slot = path->slots[0];
4249 } else {
4250 slot++;
4251 path->slots[0]++;
4252 }
4253 } 4280 }
4254 4281
4255 advance = 1;
4256 item = btrfs_item_nr(leaf, slot); 4282 item = btrfs_item_nr(leaf, slot);
4257 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4283 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4258 4284
@@ -4261,7 +4287,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4261 if (btrfs_key_type(&found_key) != key_type) 4287 if (btrfs_key_type(&found_key) != key_type)
4262 break; 4288 break;
4263 if (found_key.offset < filp->f_pos) 4289 if (found_key.offset < filp->f_pos)
4264 continue; 4290 goto next;
4265 4291
4266 filp->f_pos = found_key.offset; 4292 filp->f_pos = found_key.offset;
4267 4293
@@ -4272,6 +4298,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4272 while (di_cur < di_total) { 4298 while (di_cur < di_total) {
4273 struct btrfs_key location; 4299 struct btrfs_key location;
4274 4300
4301 if (verify_dir_item(root, leaf, di))
4302 break;
4303
4275 name_len = btrfs_dir_name_len(leaf, di); 4304 name_len = btrfs_dir_name_len(leaf, di);
4276 if (name_len <= sizeof(tmp_name)) { 4305 if (name_len <= sizeof(tmp_name)) {
4277 name_ptr = tmp_name; 4306 name_ptr = tmp_name;
@@ -4311,6 +4340,8 @@ skip:
4311 di_cur += di_len; 4340 di_cur += di_len;
4312 di = (struct btrfs_dir_item *)((char *)di + di_len); 4341 di = (struct btrfs_dir_item *)((char *)di + di_len);
4313 } 4342 }
4343next:
4344 path->slots[0]++;
4314 } 4345 }
4315 4346
4316 /* Reached end of directory/root. Bump pos past the last item. */ 4347 /* Reached end of directory/root. Bump pos past the last item. */
@@ -4347,6 +4378,8 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4347 trans = btrfs_join_transaction_nolock(root, 1); 4378 trans = btrfs_join_transaction_nolock(root, 1);
4348 else 4379 else
4349 trans = btrfs_join_transaction(root, 1); 4380 trans = btrfs_join_transaction(root, 1);
4381 if (IS_ERR(trans))
4382 return PTR_ERR(trans);
4350 btrfs_set_trans_block_group(trans, inode); 4383 btrfs_set_trans_block_group(trans, inode);
4351 if (nolock) 4384 if (nolock)
4352 ret = btrfs_end_transaction_nolock(trans, root); 4385 ret = btrfs_end_transaction_nolock(trans, root);
@@ -4372,6 +4405,7 @@ void btrfs_dirty_inode(struct inode *inode)
4372 return; 4405 return;
4373 4406
4374 trans = btrfs_join_transaction(root, 1); 4407 trans = btrfs_join_transaction(root, 1);
4408 BUG_ON(IS_ERR(trans));
4375 btrfs_set_trans_block_group(trans, inode); 4409 btrfs_set_trans_block_group(trans, inode);
4376 4410
4377 ret = btrfs_update_inode(trans, root, inode); 4411 ret = btrfs_update_inode(trans, root, inode);
@@ -4500,12 +4534,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4500 BUG_ON(!path); 4534 BUG_ON(!path);
4501 4535
4502 inode = new_inode(root->fs_info->sb); 4536 inode = new_inode(root->fs_info->sb);
4503 if (!inode) 4537 if (!inode) {
4538 btrfs_free_path(path);
4504 return ERR_PTR(-ENOMEM); 4539 return ERR_PTR(-ENOMEM);
4540 }
4505 4541
4506 if (dir) { 4542 if (dir) {
4543 trace_btrfs_inode_request(dir);
4544
4507 ret = btrfs_set_inode_index(dir, index); 4545 ret = btrfs_set_inode_index(dir, index);
4508 if (ret) { 4546 if (ret) {
4547 btrfs_free_path(path);
4509 iput(inode); 4548 iput(inode);
4510 return ERR_PTR(ret); 4549 return ERR_PTR(ret);
4511 } 4550 }
@@ -4572,12 +4611,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4572 if ((mode & S_IFREG)) { 4611 if ((mode & S_IFREG)) {
4573 if (btrfs_test_opt(root, NODATASUM)) 4612 if (btrfs_test_opt(root, NODATASUM))
4574 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4613 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4575 if (btrfs_test_opt(root, NODATACOW)) 4614 if (btrfs_test_opt(root, NODATACOW) ||
4615 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4576 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4616 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4577 } 4617 }
4578 4618
4579 insert_inode_hash(inode); 4619 insert_inode_hash(inode);
4580 inode_tree_add(inode); 4620 inode_tree_add(inode);
4621
4622 trace_btrfs_inode_new(inode);
4623
4581 return inode; 4624 return inode;
4582fail: 4625fail:
4583 if (dir) 4626 if (dir)
@@ -4692,7 +4735,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4692 if (IS_ERR(inode)) 4735 if (IS_ERR(inode))
4693 goto out_unlock; 4736 goto out_unlock;
4694 4737
4695 err = btrfs_init_inode_security(trans, inode, dir); 4738 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4696 if (err) { 4739 if (err) {
4697 drop_inode = 1; 4740 drop_inode = 1;
4698 goto out_unlock; 4741 goto out_unlock;
@@ -4753,7 +4796,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4753 if (IS_ERR(inode)) 4796 if (IS_ERR(inode))
4754 goto out_unlock; 4797 goto out_unlock;
4755 4798
4756 err = btrfs_init_inode_security(trans, inode, dir); 4799 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4757 if (err) { 4800 if (err) {
4758 drop_inode = 1; 4801 drop_inode = 1;
4759 goto out_unlock; 4802 goto out_unlock;
@@ -4794,30 +4837,31 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4794 int err; 4837 int err;
4795 int drop_inode = 0; 4838 int drop_inode = 0;
4796 4839
4797 if (inode->i_nlink == 0)
4798 return -ENOENT;
4799
4800 /* do not allow sys_link's with other subvols of the same device */ 4840 /* do not allow sys_link's with other subvols of the same device */
4801 if (root->objectid != BTRFS_I(inode)->root->objectid) 4841 if (root->objectid != BTRFS_I(inode)->root->objectid)
4802 return -EPERM; 4842 return -EXDEV;
4803 4843
4804 btrfs_inc_nlink(inode); 4844 if (inode->i_nlink == ~0U)
4805 inode->i_ctime = CURRENT_TIME; 4845 return -EMLINK;
4806 4846
4807 err = btrfs_set_inode_index(dir, &index); 4847 err = btrfs_set_inode_index(dir, &index);
4808 if (err) 4848 if (err)
4809 goto fail; 4849 goto fail;
4810 4850
4811 /* 4851 /*
4812 * 1 item for inode ref 4852 * 2 items for inode and inode ref
4813 * 2 items for dir items 4853 * 2 items for dir items
4854 * 1 item for parent inode
4814 */ 4855 */
4815 trans = btrfs_start_transaction(root, 3); 4856 trans = btrfs_start_transaction(root, 5);
4816 if (IS_ERR(trans)) { 4857 if (IS_ERR(trans)) {
4817 err = PTR_ERR(trans); 4858 err = PTR_ERR(trans);
4818 goto fail; 4859 goto fail;
4819 } 4860 }
4820 4861
4862 btrfs_inc_nlink(inode);
4863 inode->i_ctime = CURRENT_TIME;
4864
4821 btrfs_set_trans_block_group(trans, dir); 4865 btrfs_set_trans_block_group(trans, dir);
4822 ihold(inode); 4866 ihold(inode);
4823 4867
@@ -4881,7 +4925,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4881 4925
4882 drop_on_err = 1; 4926 drop_on_err = 1;
4883 4927
4884 err = btrfs_init_inode_security(trans, inode, dir); 4928 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4885 if (err) 4929 if (err)
4886 goto out_fail; 4930 goto out_fail;
4887 4931
@@ -5176,6 +5220,8 @@ again:
5176 em = NULL; 5220 em = NULL;
5177 btrfs_release_path(root, path); 5221 btrfs_release_path(root, path);
5178 trans = btrfs_join_transaction(root, 1); 5222 trans = btrfs_join_transaction(root, 1);
5223 if (IS_ERR(trans))
5224 return ERR_CAST(trans);
5179 goto again; 5225 goto again;
5180 } 5226 }
5181 map = kmap(page); 5227 map = kmap(page);
@@ -5185,7 +5231,7 @@ again:
5185 btrfs_mark_buffer_dirty(leaf); 5231 btrfs_mark_buffer_dirty(leaf);
5186 } 5232 }
5187 set_extent_uptodate(io_tree, em->start, 5233 set_extent_uptodate(io_tree, em->start,
5188 extent_map_end(em) - 1, GFP_NOFS); 5234 extent_map_end(em) - 1, NULL, GFP_NOFS);
5189 goto insert; 5235 goto insert;
5190 } else { 5236 } else {
5191 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5237 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5252,6 +5298,9 @@ insert:
5252 } 5298 }
5253 write_unlock(&em_tree->lock); 5299 write_unlock(&em_tree->lock);
5254out: 5300out:
5301
5302 trace_btrfs_get_extent(root, em);
5303
5255 if (path) 5304 if (path)
5256 btrfs_free_path(path); 5305 btrfs_free_path(path);
5257 if (trans) { 5306 if (trans) {
@@ -5266,22 +5315,157 @@ out:
5266 return em; 5315 return em;
5267} 5316}
5268 5317
5318struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5319 size_t pg_offset, u64 start, u64 len,
5320 int create)
5321{
5322 struct extent_map *em;
5323 struct extent_map *hole_em = NULL;
5324 u64 range_start = start;
5325 u64 end;
5326 u64 found;
5327 u64 found_end;
5328 int err = 0;
5329
5330 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5331 if (IS_ERR(em))
5332 return em;
5333 if (em) {
5334 /*
5335 * if our em maps to a hole, there might
5336 * actually be delalloc bytes behind it
5337 */
5338 if (em->block_start != EXTENT_MAP_HOLE)
5339 return em;
5340 else
5341 hole_em = em;
5342 }
5343
5344 /* check to see if we've wrapped (len == -1 or similar) */
5345 end = start + len;
5346 if (end < start)
5347 end = (u64)-1;
5348 else
5349 end -= 1;
5350
5351 em = NULL;
5352
5353 /* ok, we didn't find anything, lets look for delalloc */
5354 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5355 end, len, EXTENT_DELALLOC, 1);
5356 found_end = range_start + found;
5357 if (found_end < range_start)
5358 found_end = (u64)-1;
5359
5360 /*
5361 * we didn't find anything useful, return
5362 * the original results from get_extent()
5363 */
5364 if (range_start > end || found_end <= start) {
5365 em = hole_em;
5366 hole_em = NULL;
5367 goto out;
5368 }
5369
5370 /* adjust the range_start to make sure it doesn't
5371 * go backwards from the start they passed in
5372 */
5373 range_start = max(start,range_start);
5374 found = found_end - range_start;
5375
5376 if (found > 0) {
5377 u64 hole_start = start;
5378 u64 hole_len = len;
5379
5380 em = alloc_extent_map(GFP_NOFS);
5381 if (!em) {
5382 err = -ENOMEM;
5383 goto out;
5384 }
5385 /*
5386 * when btrfs_get_extent can't find anything it
5387 * returns one huge hole
5388 *
5389 * make sure what it found really fits our range, and
5390 * adjust to make sure it is based on the start from
5391 * the caller
5392 */
5393 if (hole_em) {
5394 u64 calc_end = extent_map_end(hole_em);
5395
5396 if (calc_end <= start || (hole_em->start > end)) {
5397 free_extent_map(hole_em);
5398 hole_em = NULL;
5399 } else {
5400 hole_start = max(hole_em->start, start);
5401 hole_len = calc_end - hole_start;
5402 }
5403 }
5404 em->bdev = NULL;
5405 if (hole_em && range_start > hole_start) {
5406 /* our hole starts before our delalloc, so we
5407 * have to return just the parts of the hole
5408 * that go until the delalloc starts
5409 */
5410 em->len = min(hole_len,
5411 range_start - hole_start);
5412 em->start = hole_start;
5413 em->orig_start = hole_start;
5414 /*
5415 * don't adjust block start at all,
5416 * it is fixed at EXTENT_MAP_HOLE
5417 */
5418 em->block_start = hole_em->block_start;
5419 em->block_len = hole_len;
5420 } else {
5421 em->start = range_start;
5422 em->len = found;
5423 em->orig_start = range_start;
5424 em->block_start = EXTENT_MAP_DELALLOC;
5425 em->block_len = found;
5426 }
5427 } else if (hole_em) {
5428 return hole_em;
5429 }
5430out:
5431
5432 free_extent_map(hole_em);
5433 if (err) {
5434 free_extent_map(em);
5435 return ERR_PTR(err);
5436 }
5437 return em;
5438}
5439
5269static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5440static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5441 struct extent_map *em,
5270 u64 start, u64 len) 5442 u64 start, u64 len)
5271{ 5443{
5272 struct btrfs_root *root = BTRFS_I(inode)->root; 5444 struct btrfs_root *root = BTRFS_I(inode)->root;
5273 struct btrfs_trans_handle *trans; 5445 struct btrfs_trans_handle *trans;
5274 struct extent_map *em;
5275 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5446 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5276 struct btrfs_key ins; 5447 struct btrfs_key ins;
5277 u64 alloc_hint; 5448 u64 alloc_hint;
5278 int ret; 5449 int ret;
5450 bool insert = false;
5279 5451
5280 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5452 /*
5453 * Ok if the extent map we looked up is a hole and is for the exact
5454 * range we want, there is no reason to allocate a new one, however if
5455 * it is not right then we need to free this one and drop the cache for
5456 * our range.
5457 */
5458 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5459 em->len != len) {
5460 free_extent_map(em);
5461 em = NULL;
5462 insert = true;
5463 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5464 }
5281 5465
5282 trans = btrfs_join_transaction(root, 0); 5466 trans = btrfs_join_transaction(root, 0);
5283 if (!trans) 5467 if (IS_ERR(trans))
5284 return ERR_PTR(-ENOMEM); 5468 return ERR_CAST(trans);
5285 5469
5286 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5470 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5287 5471
@@ -5293,10 +5477,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5293 goto out; 5477 goto out;
5294 } 5478 }
5295 5479
5296 em = alloc_extent_map(GFP_NOFS);
5297 if (!em) { 5480 if (!em) {
5298 em = ERR_PTR(-ENOMEM); 5481 em = alloc_extent_map(GFP_NOFS);
5299 goto out; 5482 if (!em) {
5483 em = ERR_PTR(-ENOMEM);
5484 goto out;
5485 }
5300 } 5486 }
5301 5487
5302 em->start = start; 5488 em->start = start;
@@ -5306,9 +5492,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5306 em->block_start = ins.objectid; 5492 em->block_start = ins.objectid;
5307 em->block_len = ins.offset; 5493 em->block_len = ins.offset;
5308 em->bdev = root->fs_info->fs_devices->latest_bdev; 5494 em->bdev = root->fs_info->fs_devices->latest_bdev;
5495
5496 /*
5497 * We need to do this because if we're using the original em we searched
5498 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5499 */
5500 em->flags = 0;
5309 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5501 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5310 5502
5311 while (1) { 5503 while (insert) {
5312 write_lock(&em_tree->lock); 5504 write_lock(&em_tree->lock);
5313 ret = add_extent_mapping(em_tree, em); 5505 ret = add_extent_mapping(em_tree, em);
5314 write_unlock(&em_tree->lock); 5506 write_unlock(&em_tree->lock);
@@ -5505,7 +5697,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5505 * while we look for nocow cross refs 5697 * while we look for nocow cross refs
5506 */ 5698 */
5507 trans = btrfs_join_transaction(root, 0); 5699 trans = btrfs_join_transaction(root, 0);
5508 if (!trans) 5700 if (IS_ERR(trans))
5509 goto must_cow; 5701 goto must_cow;
5510 5702
5511 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5703 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5526,8 +5718,7 @@ must_cow:
5526 * it above 5718 * it above
5527 */ 5719 */
5528 len = bh_result->b_size; 5720 len = bh_result->b_size;
5529 free_extent_map(em); 5721 em = btrfs_new_extent_direct(inode, em, start, len);
5530 em = btrfs_new_extent_direct(inode, start, len);
5531 if (IS_ERR(em)) 5722 if (IS_ERR(em))
5532 return PTR_ERR(em); 5723 return PTR_ERR(em);
5533 len = min(len, em->len - (start - em->start)); 5724 len = min(len, em->len - (start - em->start));
@@ -5613,6 +5804,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5613 5804
5614 kfree(dip->csums); 5805 kfree(dip->csums);
5615 kfree(dip); 5806 kfree(dip);
5807
5808 /* If we had a csum failure make sure to clear the uptodate flag */
5809 if (err)
5810 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5616 dio_end_io(bio, err); 5811 dio_end_io(bio, err);
5617} 5812}
5618 5813
@@ -5640,7 +5835,7 @@ again:
5640 BUG_ON(!ordered); 5835 BUG_ON(!ordered);
5641 5836
5642 trans = btrfs_join_transaction(root, 1); 5837 trans = btrfs_join_transaction(root, 1);
5643 if (!trans) { 5838 if (IS_ERR(trans)) {
5644 err = -ENOMEM; 5839 err = -ENOMEM;
5645 goto out; 5840 goto out;
5646 } 5841 }
@@ -5686,8 +5881,10 @@ again:
5686 } 5881 }
5687 5882
5688 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5883 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5689 btrfs_ordered_update_i_size(inode, 0, ordered); 5884 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5690 btrfs_update_inode(trans, root, inode); 5885 if (!ret)
5886 btrfs_update_inode(trans, root, inode);
5887 ret = 0;
5691out_unlock: 5888out_unlock:
5692 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5889 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5693 ordered->file_offset + ordered->len - 1, 5890 ordered->file_offset + ordered->len - 1,
@@ -5714,6 +5911,10 @@ out_done:
5714 5911
5715 kfree(dip->csums); 5912 kfree(dip->csums);
5716 kfree(dip); 5913 kfree(dip);
5914
5915 /* If we had an error make sure to clear the uptodate flag */
5916 if (err)
5917 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5717 dio_end_io(bio, err); 5918 dio_end_io(bio, err);
5718} 5919}
5719 5920
@@ -5769,7 +5970,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5769 5970
5770static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, 5971static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5771 int rw, u64 file_offset, int skip_sum, 5972 int rw, u64 file_offset, int skip_sum,
5772 u32 *csums) 5973 u32 *csums, int async_submit)
5773{ 5974{
5774 int write = rw & REQ_WRITE; 5975 int write = rw & REQ_WRITE;
5775 struct btrfs_root *root = BTRFS_I(inode)->root; 5976 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -5780,18 +5981,33 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5780 if (ret) 5981 if (ret)
5781 goto err; 5982 goto err;
5782 5983
5783 if (write && !skip_sum) { 5984 if (skip_sum)
5985 goto map;
5986
5987 if (write && async_submit) {
5784 ret = btrfs_wq_submit_bio(root->fs_info, 5988 ret = btrfs_wq_submit_bio(root->fs_info,
5785 inode, rw, bio, 0, 0, 5989 inode, rw, bio, 0, 0,
5786 file_offset, 5990 file_offset,
5787 __btrfs_submit_bio_start_direct_io, 5991 __btrfs_submit_bio_start_direct_io,
5788 __btrfs_submit_bio_done); 5992 __btrfs_submit_bio_done);
5789 goto err; 5993 goto err;
5790 } else if (!skip_sum) 5994 } else if (write) {
5791 btrfs_lookup_bio_sums_dio(root, inode, bio, 5995 /*
5996 * If we aren't doing async submit, calculate the csum of the
5997 * bio now.
5998 */
5999 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6000 if (ret)
6001 goto err;
6002 } else if (!skip_sum) {
6003 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5792 file_offset, csums); 6004 file_offset, csums);
6005 if (ret)
6006 goto err;
6007 }
5793 6008
5794 ret = btrfs_map_bio(root, rw, bio, 0, 1); 6009map:
6010 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5795err: 6011err:
5796 bio_put(bio); 6012 bio_put(bio);
5797 return ret; 6013 return ret;
@@ -5813,13 +6029,8 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5813 int nr_pages = 0; 6029 int nr_pages = 0;
5814 u32 *csums = dip->csums; 6030 u32 *csums = dip->csums;
5815 int ret = 0; 6031 int ret = 0;
5816 6032 int async_submit = 0;
5817 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); 6033 int write = rw & REQ_WRITE;
5818 if (!bio)
5819 return -ENOMEM;
5820 bio->bi_private = dip;
5821 bio->bi_end_io = btrfs_end_dio_bio;
5822 atomic_inc(&dip->pending_bios);
5823 6034
5824 map_length = orig_bio->bi_size; 6035 map_length = orig_bio->bi_size;
5825 ret = btrfs_map_block(map_tree, READ, start_sector << 9, 6036 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
@@ -5829,6 +6040,19 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5829 return -EIO; 6040 return -EIO;
5830 } 6041 }
5831 6042
6043 if (map_length >= orig_bio->bi_size) {
6044 bio = orig_bio;
6045 goto submit;
6046 }
6047
6048 async_submit = 1;
6049 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6050 if (!bio)
6051 return -ENOMEM;
6052 bio->bi_private = dip;
6053 bio->bi_end_io = btrfs_end_dio_bio;
6054 atomic_inc(&dip->pending_bios);
6055
5832 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { 6056 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5833 if (unlikely(map_length < submit_len + bvec->bv_len || 6057 if (unlikely(map_length < submit_len + bvec->bv_len ||
5834 bio_add_page(bio, bvec->bv_page, bvec->bv_len, 6058 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
@@ -5842,14 +6066,15 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5842 atomic_inc(&dip->pending_bios); 6066 atomic_inc(&dip->pending_bios);
5843 ret = __btrfs_submit_dio_bio(bio, inode, rw, 6067 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5844 file_offset, skip_sum, 6068 file_offset, skip_sum,
5845 csums); 6069 csums, async_submit);
5846 if (ret) { 6070 if (ret) {
5847 bio_put(bio); 6071 bio_put(bio);
5848 atomic_dec(&dip->pending_bios); 6072 atomic_dec(&dip->pending_bios);
5849 goto out_err; 6073 goto out_err;
5850 } 6074 }
5851 6075
5852 if (!skip_sum) 6076 /* Write's use the ordered csums */
6077 if (!write && !skip_sum)
5853 csums = csums + nr_pages; 6078 csums = csums + nr_pages;
5854 start_sector += submit_len >> 9; 6079 start_sector += submit_len >> 9;
5855 file_offset += submit_len; 6080 file_offset += submit_len;
@@ -5878,8 +6103,9 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5878 } 6103 }
5879 } 6104 }
5880 6105
6106submit:
5881 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum, 6107 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5882 csums); 6108 csums, async_submit);
5883 if (!ret) 6109 if (!ret)
5884 return 0; 6110 return 0;
5885 6111
@@ -5917,9 +6143,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5917 } 6143 }
5918 dip->csums = NULL; 6144 dip->csums = NULL;
5919 6145
5920 if (!skip_sum) { 6146 /* Write's use the ordered csum stuff, so we don't need dip->csums */
6147 if (!write && !skip_sum) {
5921 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6148 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5922 if (!dip->csums) { 6149 if (!dip->csums) {
6150 kfree(dip);
5923 ret = -ENOMEM; 6151 ret = -ENOMEM;
5924 goto free_ordered; 6152 goto free_ordered;
5925 } 6153 }
@@ -5972,6 +6200,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5972 unsigned long nr_segs) 6200 unsigned long nr_segs)
5973{ 6201{
5974 int seg; 6202 int seg;
6203 int i;
5975 size_t size; 6204 size_t size;
5976 unsigned long addr; 6205 unsigned long addr;
5977 unsigned blocksize_mask = root->sectorsize - 1; 6206 unsigned blocksize_mask = root->sectorsize - 1;
@@ -5986,8 +6215,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5986 addr = (unsigned long)iov[seg].iov_base; 6215 addr = (unsigned long)iov[seg].iov_base;
5987 size = iov[seg].iov_len; 6216 size = iov[seg].iov_len;
5988 end += size; 6217 end += size;
5989 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6218 if ((addr & blocksize_mask) || (size & blocksize_mask))
5990 goto out; 6219 goto out;
6220
6221 /* If this is a write we don't need to check anymore */
6222 if (rw & WRITE)
6223 continue;
6224
6225 /*
6226 * Check to make sure we don't have duplicate iov_base's in this
6227 * iovec, if so return EINVAL, otherwise we'll get csum errors
6228 * when reading back.
6229 */
6230 for (i = seg + 1; i < nr_segs; i++) {
6231 if (iov[seg].iov_base == iov[i].iov_base)
6232 goto out;
6233 }
5991 } 6234 }
5992 retval = 0; 6235 retval = 0;
5993out: 6236out:
@@ -6088,7 +6331,7 @@ out:
6088static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6331static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
6089 __u64 start, __u64 len) 6332 __u64 start, __u64 len)
6090{ 6333{
6091 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6334 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
6092} 6335}
6093 6336
6094int btrfs_readpage(struct file *file, struct page *page) 6337int btrfs_readpage(struct file *file, struct page *page)
@@ -6338,28 +6581,42 @@ out:
6338 return ret; 6581 return ret;
6339} 6582}
6340 6583
6341static void btrfs_truncate(struct inode *inode) 6584static int btrfs_truncate(struct inode *inode)
6342{ 6585{
6343 struct btrfs_root *root = BTRFS_I(inode)->root; 6586 struct btrfs_root *root = BTRFS_I(inode)->root;
6344 int ret; 6587 int ret;
6588 int err = 0;
6345 struct btrfs_trans_handle *trans; 6589 struct btrfs_trans_handle *trans;
6346 unsigned long nr; 6590 unsigned long nr;
6347 u64 mask = root->sectorsize - 1; 6591 u64 mask = root->sectorsize - 1;
6348 6592
6349 if (!S_ISREG(inode->i_mode)) {
6350 WARN_ON(1);
6351 return;
6352 }
6353
6354 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6593 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6355 if (ret) 6594 if (ret)
6356 return; 6595 return ret;
6357 6596
6358 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6597 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6359 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6598 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6360 6599
6600 trans = btrfs_start_transaction(root, 5);
6601 if (IS_ERR(trans))
6602 return PTR_ERR(trans);
6603
6604 btrfs_set_trans_block_group(trans, inode);
6605
6606 ret = btrfs_orphan_add(trans, inode);
6607 if (ret) {
6608 btrfs_end_transaction(trans, root);
6609 return ret;
6610 }
6611
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /* Now start a transaction for the truncate */
6361 trans = btrfs_start_transaction(root, 0); 6617 trans = btrfs_start_transaction(root, 0);
6362 BUG_ON(IS_ERR(trans)); 6618 if (IS_ERR(trans))
6619 return PTR_ERR(trans);
6363 btrfs_set_trans_block_group(trans, inode); 6620 btrfs_set_trans_block_group(trans, inode);
6364 trans->block_rsv = root->orphan_block_rsv; 6621 trans->block_rsv = root->orphan_block_rsv;
6365 6622
@@ -6386,29 +6643,38 @@ static void btrfs_truncate(struct inode *inode)
6386 while (1) { 6643 while (1) {
6387 if (!trans) { 6644 if (!trans) {
6388 trans = btrfs_start_transaction(root, 0); 6645 trans = btrfs_start_transaction(root, 0);
6389 BUG_ON(IS_ERR(trans)); 6646 if (IS_ERR(trans))
6647 return PTR_ERR(trans);
6390 btrfs_set_trans_block_group(trans, inode); 6648 btrfs_set_trans_block_group(trans, inode);
6391 trans->block_rsv = root->orphan_block_rsv; 6649 trans->block_rsv = root->orphan_block_rsv;
6392 } 6650 }
6393 6651
6394 ret = btrfs_block_rsv_check(trans, root, 6652 ret = btrfs_block_rsv_check(trans, root,
6395 root->orphan_block_rsv, 0, 5); 6653 root->orphan_block_rsv, 0, 5);
6396 if (ret) { 6654 if (ret == -EAGAIN) {
6397 BUG_ON(ret != -EAGAIN);
6398 ret = btrfs_commit_transaction(trans, root); 6655 ret = btrfs_commit_transaction(trans, root);
6399 BUG_ON(ret); 6656 if (ret)
6657 return ret;
6400 trans = NULL; 6658 trans = NULL;
6401 continue; 6659 continue;
6660 } else if (ret) {
6661 err = ret;
6662 break;
6402 } 6663 }
6403 6664
6404 ret = btrfs_truncate_inode_items(trans, root, inode, 6665 ret = btrfs_truncate_inode_items(trans, root, inode,
6405 inode->i_size, 6666 inode->i_size,
6406 BTRFS_EXTENT_DATA_KEY); 6667 BTRFS_EXTENT_DATA_KEY);
6407 if (ret != -EAGAIN) 6668 if (ret != -EAGAIN) {
6669 err = ret;
6408 break; 6670 break;
6671 }
6409 6672
6410 ret = btrfs_update_inode(trans, root, inode); 6673 ret = btrfs_update_inode(trans, root, inode);
6411 BUG_ON(ret); 6674 if (ret) {
6675 err = ret;
6676 break;
6677 }
6412 6678
6413 nr = trans->blocks_used; 6679 nr = trans->blocks_used;
6414 btrfs_end_transaction(trans, root); 6680 btrfs_end_transaction(trans, root);
@@ -6418,16 +6684,27 @@ static void btrfs_truncate(struct inode *inode)
6418 6684
6419 if (ret == 0 && inode->i_nlink > 0) { 6685 if (ret == 0 && inode->i_nlink > 0) {
6420 ret = btrfs_orphan_del(trans, inode); 6686 ret = btrfs_orphan_del(trans, inode);
6421 BUG_ON(ret); 6687 if (ret)
6688 err = ret;
6689 } else if (ret && inode->i_nlink > 0) {
6690 /*
6691 * Failed to do the truncate, remove us from the in memory
6692 * orphan list.
6693 */
6694 ret = btrfs_orphan_del(NULL, inode);
6422 } 6695 }
6423 6696
6424 ret = btrfs_update_inode(trans, root, inode); 6697 ret = btrfs_update_inode(trans, root, inode);
6425 BUG_ON(ret); 6698 if (ret && !err)
6699 err = ret;
6426 6700
6427 nr = trans->blocks_used; 6701 nr = trans->blocks_used;
6428 ret = btrfs_end_transaction_throttle(trans, root); 6702 ret = btrfs_end_transaction_throttle(trans, root);
6429 BUG_ON(ret); 6703 if (ret && !err)
6704 err = ret;
6430 btrfs_btree_balance_dirty(root, nr); 6705 btrfs_btree_balance_dirty(root, nr);
6706
6707 return err;
6431} 6708}
6432 6709
6433/* 6710/*
@@ -6494,9 +6771,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6494 ei->index_cnt = (u64)-1; 6771 ei->index_cnt = (u64)-1;
6495 ei->last_unlink_trans = 0; 6772 ei->last_unlink_trans = 0;
6496 6773
6497 spin_lock_init(&ei->accounting_lock);
6498 atomic_set(&ei->outstanding_extents, 0); 6774 atomic_set(&ei->outstanding_extents, 0);
6499 ei->reserved_extents = 0; 6775 atomic_set(&ei->reserved_extents, 0);
6500 6776
6501 ei->ordered_data_close = 0; 6777 ei->ordered_data_close = 0;
6502 ei->orphan_meta_reserved = 0; 6778 ei->orphan_meta_reserved = 0;
@@ -6532,7 +6808,7 @@ void btrfs_destroy_inode(struct inode *inode)
6532 WARN_ON(!list_empty(&inode->i_dentry)); 6808 WARN_ON(!list_empty(&inode->i_dentry));
6533 WARN_ON(inode->i_data.nrpages); 6809 WARN_ON(inode->i_data.nrpages);
6534 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6810 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6535 WARN_ON(BTRFS_I(inode)->reserved_extents); 6811 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
6536 6812
6537 /* 6813 /*
6538 * This can happen where we create an inode, but somebody else also 6814 * This can happen where we create an inode, but somebody else also
@@ -6624,6 +6900,8 @@ void btrfs_destroy_cachep(void)
6624 kmem_cache_destroy(btrfs_transaction_cachep); 6900 kmem_cache_destroy(btrfs_transaction_cachep);
6625 if (btrfs_path_cachep) 6901 if (btrfs_path_cachep)
6626 kmem_cache_destroy(btrfs_path_cachep); 6902 kmem_cache_destroy(btrfs_path_cachep);
6903 if (btrfs_free_space_cachep)
6904 kmem_cache_destroy(btrfs_free_space_cachep);
6627} 6905}
6628 6906
6629int btrfs_init_cachep(void) 6907int btrfs_init_cachep(void)
@@ -6652,6 +6930,12 @@ int btrfs_init_cachep(void)
6652 if (!btrfs_path_cachep) 6930 if (!btrfs_path_cachep)
6653 goto fail; 6931 goto fail;
6654 6932
6933 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6934 sizeof(struct btrfs_free_space), 0,
6935 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6936 if (!btrfs_free_space_cachep)
6937 goto fail;
6938
6655 return 0; 6939 return 0;
6656fail: 6940fail:
6657 btrfs_destroy_cachep(); 6941 btrfs_destroy_cachep();
@@ -6670,6 +6954,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
6670 return 0; 6954 return 0;
6671} 6955}
6672 6956
6957/*
6958 * If a file is moved, it will inherit the cow and compression flags of the new
6959 * directory.
6960 */
6961static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6962{
6963 struct btrfs_inode *b_dir = BTRFS_I(dir);
6964 struct btrfs_inode *b_inode = BTRFS_I(inode);
6965
6966 if (b_dir->flags & BTRFS_INODE_NODATACOW)
6967 b_inode->flags |= BTRFS_INODE_NODATACOW;
6968 else
6969 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6970
6971 if (b_dir->flags & BTRFS_INODE_COMPRESS)
6972 b_inode->flags |= BTRFS_INODE_COMPRESS;
6973 else
6974 b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6975}
6976
6673static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 6977static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6674 struct inode *new_dir, struct dentry *new_dentry) 6978 struct inode *new_dir, struct dentry *new_dentry)
6675{ 6979{
@@ -6718,8 +7022,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6718 * should cover the worst case number of items we'll modify. 7022 * should cover the worst case number of items we'll modify.
6719 */ 7023 */
6720 trans = btrfs_start_transaction(root, 20); 7024 trans = btrfs_start_transaction(root, 20);
6721 if (IS_ERR(trans)) 7025 if (IS_ERR(trans)) {
6722 return PTR_ERR(trans); 7026 ret = PTR_ERR(trans);
7027 goto out_notrans;
7028 }
6723 7029
6724 btrfs_set_trans_block_group(trans, new_dir); 7030 btrfs_set_trans_block_group(trans, new_dir);
6725 7031
@@ -6772,11 +7078,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6772 old_dentry->d_name.name, 7078 old_dentry->d_name.name,
6773 old_dentry->d_name.len); 7079 old_dentry->d_name.len);
6774 } else { 7080 } else {
6775 btrfs_inc_nlink(old_dentry->d_inode); 7081 ret = __btrfs_unlink_inode(trans, root, old_dir,
6776 ret = btrfs_unlink_inode(trans, root, old_dir, 7082 old_dentry->d_inode,
6777 old_dentry->d_inode, 7083 old_dentry->d_name.name,
6778 old_dentry->d_name.name, 7084 old_dentry->d_name.len);
6779 old_dentry->d_name.len); 7085 if (!ret)
7086 ret = btrfs_update_inode(trans, root, old_inode);
6780 } 7087 }
6781 BUG_ON(ret); 7088 BUG_ON(ret);
6782 7089
@@ -6803,6 +7110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6803 } 7110 }
6804 } 7111 }
6805 7112
7113 fixup_inode_flags(new_dir, old_inode);
7114
6806 ret = btrfs_add_link(trans, new_dir, old_inode, 7115 ret = btrfs_add_link(trans, new_dir, old_inode,
6807 new_dentry->d_name.name, 7116 new_dentry->d_name.name,
6808 new_dentry->d_name.len, 0, index); 7117 new_dentry->d_name.len, 0, index);
@@ -6816,7 +7125,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6816 } 7125 }
6817out_fail: 7126out_fail:
6818 btrfs_end_transaction_throttle(trans, root); 7127 btrfs_end_transaction_throttle(trans, root);
6819 7128out_notrans:
6820 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7129 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
6821 up_read(&root->fs_info->subvol_sem); 7130 up_read(&root->fs_info->subvol_sem);
6822 7131
@@ -6968,7 +7277,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6968 if (IS_ERR(inode)) 7277 if (IS_ERR(inode))
6969 goto out_unlock; 7278 goto out_unlock;
6970 7279
6971 err = btrfs_init_inode_security(trans, inode, dir); 7280 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6972 if (err) { 7281 if (err) {
6973 drop_inode = 1; 7282 drop_inode = 1;
6974 goto out_unlock; 7283 goto out_unlock;
@@ -7204,7 +7513,6 @@ static const struct address_space_operations btrfs_aops = {
7204 .writepage = btrfs_writepage, 7513 .writepage = btrfs_writepage,
7205 .writepages = btrfs_writepages, 7514 .writepages = btrfs_writepages,
7206 .readpages = btrfs_readpages, 7515 .readpages = btrfs_readpages,
7207 .sync_page = block_sync_page,
7208 .direct_IO = btrfs_direct_IO, 7516 .direct_IO = btrfs_direct_IO,
7209 .invalidatepage = btrfs_invalidatepage, 7517 .invalidatepage = btrfs_invalidatepage,
7210 .releasepage = btrfs_releasepage, 7518 .releasepage = btrfs_releasepage,
@@ -7220,7 +7528,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
7220}; 7528};
7221 7529
7222static const struct inode_operations btrfs_file_inode_operations = { 7530static const struct inode_operations btrfs_file_inode_operations = {
7223 .truncate = btrfs_truncate,
7224 .getattr = btrfs_getattr, 7531 .getattr = btrfs_getattr,
7225 .setattr = btrfs_setattr, 7532 .setattr = btrfs_setattr,
7226 .setxattr = btrfs_setxattr, 7533 .setxattr = btrfs_setxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a506a22b522a..ffb48d6c5433 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -138,6 +139,24 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
138 return 0; 139 return 0;
139} 140}
140 141
142static int check_flags(unsigned int flags)
143{
144 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
145 FS_NOATIME_FL | FS_NODUMP_FL | \
146 FS_SYNC_FL | FS_DIRSYNC_FL | \
147 FS_NOCOMP_FL | FS_COMPR_FL | \
148 FS_NOCOW_FL | FS_COW_FL))
149 return -EOPNOTSUPP;
150
151 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
152 return -EINVAL;
153
154 if ((flags & FS_NOCOW_FL) && (flags & FS_COW_FL))
155 return -EINVAL;
156
157 return 0;
158}
159
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 160static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{ 161{
143 struct inode *inode = file->f_path.dentry->d_inode; 162 struct inode *inode = file->f_path.dentry->d_inode;
@@ -153,12 +172,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
153 if (copy_from_user(&flags, arg, sizeof(flags))) 172 if (copy_from_user(&flags, arg, sizeof(flags)))
154 return -EFAULT; 173 return -EFAULT;
155 174
156 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 175 ret = check_flags(flags);
157 FS_NOATIME_FL | FS_NODUMP_FL | \ 176 if (ret)
158 FS_SYNC_FL | FS_DIRSYNC_FL)) 177 return ret;
159 return -EOPNOTSUPP;
160 178
161 if (!is_owner_or_cap(inode)) 179 if (!inode_owner_or_capable(inode))
162 return -EACCES; 180 return -EACCES;
163 181
164 mutex_lock(&inode->i_mutex); 182 mutex_lock(&inode->i_mutex);
@@ -201,9 +219,25 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
201 else 219 else
202 ip->flags &= ~BTRFS_INODE_DIRSYNC; 220 ip->flags &= ~BTRFS_INODE_DIRSYNC;
203 221
222 /*
223 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
224 * flag may be changed automatically if compression code won't make
225 * things smaller.
226 */
227 if (flags & FS_NOCOMP_FL) {
228 ip->flags &= ~BTRFS_INODE_COMPRESS;
229 ip->flags |= BTRFS_INODE_NOCOMPRESS;
230 } else if (flags & FS_COMPR_FL) {
231 ip->flags |= BTRFS_INODE_COMPRESS;
232 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
233 }
234 if (flags & FS_NOCOW_FL)
235 ip->flags |= BTRFS_INODE_NODATACOW;
236 else if (flags & FS_COW_FL)
237 ip->flags &= ~BTRFS_INODE_NODATACOW;
204 238
205 trans = btrfs_join_transaction(root, 1); 239 trans = btrfs_join_transaction(root, 1);
206 BUG_ON(!trans); 240 BUG_ON(IS_ERR(trans));
207 241
208 ret = btrfs_update_inode(trans, root, inode); 242 ret = btrfs_update_inode(trans, root, inode);
209 BUG_ON(ret); 243 BUG_ON(ret);
@@ -213,9 +247,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
213 btrfs_end_transaction(trans, root); 247 btrfs_end_transaction(trans, root);
214 248
215 mnt_drop_write(file->f_path.mnt); 249 mnt_drop_write(file->f_path.mnt);
250
251 ret = 0;
216 out_unlock: 252 out_unlock:
217 mutex_unlock(&inode->i_mutex); 253 mutex_unlock(&inode->i_mutex);
218 return 0; 254 return ret;
219} 255}
220 256
221static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 257static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
@@ -225,6 +261,49 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
225 return put_user(inode->i_generation, arg); 261 return put_user(inode->i_generation, arg);
226} 262}
227 263
264static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
265{
266 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
267 struct btrfs_fs_info *fs_info = root->fs_info;
268 struct btrfs_device *device;
269 struct request_queue *q;
270 struct fstrim_range range;
271 u64 minlen = ULLONG_MAX;
272 u64 num_devices = 0;
273 int ret;
274
275 if (!capable(CAP_SYS_ADMIN))
276 return -EPERM;
277
278 mutex_lock(&fs_info->fs_devices->device_list_mutex);
279 list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
280 if (!device->bdev)
281 continue;
282 q = bdev_get_queue(device->bdev);
283 if (blk_queue_discard(q)) {
284 num_devices++;
285 minlen = min((u64)q->limits.discard_granularity,
286 minlen);
287 }
288 }
289 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
290 if (!num_devices)
291 return -EOPNOTSUPP;
292
293 if (copy_from_user(&range, arg, sizeof(range)))
294 return -EFAULT;
295
296 range.minlen = max(range.minlen, minlen);
297 ret = btrfs_trim_fs(root, &range);
298 if (ret < 0)
299 return ret;
300
301 if (copy_to_user(arg, &range, sizeof(range)))
302 return -EFAULT;
303
304 return 0;
305}
306
228static noinline int create_subvol(struct btrfs_root *root, 307static noinline int create_subvol(struct btrfs_root *root,
229 struct dentry *dentry, 308 struct dentry *dentry,
230 char *name, int namelen, 309 char *name, int namelen,
@@ -294,6 +373,10 @@ static noinline int create_subvol(struct btrfs_root *root,
294 inode_item->nbytes = cpu_to_le64(root->leafsize); 373 inode_item->nbytes = cpu_to_le64(root->leafsize);
295 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 374 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
296 375
376 root_item.flags = 0;
377 root_item.byte_limit = 0;
378 inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
379
297 btrfs_set_root_bytenr(&root_item, leaf->start); 380 btrfs_set_root_bytenr(&root_item, leaf->start);
298 btrfs_set_root_generation(&root_item, trans->transid); 381 btrfs_set_root_generation(&root_item, trans->transid);
299 btrfs_set_root_level(&root_item, 0); 382 btrfs_set_root_level(&root_item, 0);
@@ -409,7 +492,9 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
409 if (ret) 492 if (ret)
410 goto fail; 493 goto fail;
411 494
412 btrfs_orphan_cleanup(pending_snapshot->snap); 495 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
496 if (ret)
497 goto fail;
413 498
414 parent = dget_parent(dentry); 499 parent = dget_parent(dentry);
415 inode = btrfs_lookup_dentry(parent->d_inode, dentry); 500 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
@@ -907,6 +992,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
907 992
908 if (new_size > old_size) { 993 if (new_size > old_size) {
909 trans = btrfs_start_transaction(root, 0); 994 trans = btrfs_start_transaction(root, 0);
995 if (IS_ERR(trans)) {
996 ret = PTR_ERR(trans);
997 goto out_unlock;
998 }
910 ret = btrfs_grow_device(trans, device, new_size); 999 ret = btrfs_grow_device(trans, device, new_size);
911 btrfs_commit_transaction(trans, root); 1000 btrfs_commit_transaction(trans, root);
912 } else { 1001 } else {
@@ -1067,12 +1156,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1067 if (copy_from_user(&flags, arg, sizeof(flags))) 1156 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT; 1157 return -EFAULT;
1069 1158
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC) 1159 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL; 1160 return -EINVAL;
1072 1161
1073 if (flags & ~BTRFS_SUBVOL_RDONLY) 1162 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP; 1163 return -EOPNOTSUPP;
1075 1164
1165 if (!inode_owner_or_capable(inode))
1166 return -EACCES;
1167
1076 down_write(&root->fs_info->subvol_sem); 1168 down_write(&root->fs_info->subvol_sem);
1077 1169
1078 /* nothing to do */ 1170 /* nothing to do */
@@ -1093,7 +1185,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1093 goto out_reset; 1185 goto out_reset;
1094 } 1186 }
1095 1187
1096 ret = btrfs_update_root(trans, root, 1188 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1097 &root->root_key, &root->root_item); 1189 &root->root_key, &root->root_item);
1098 1190
1099 btrfs_commit_transaction(trans, root); 1191 btrfs_commit_transaction(trans, root);
@@ -1898,7 +1990,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1898 1990
1899 memcpy(&new_key, &key, sizeof(new_key)); 1991 memcpy(&new_key, &key, sizeof(new_key));
1900 new_key.objectid = inode->i_ino; 1992 new_key.objectid = inode->i_ino;
1901 new_key.offset = key.offset + destoff - off; 1993 if (off <= key.offset)
1994 new_key.offset = key.offset + destoff - off;
1995 else
1996 new_key.offset = destoff;
1902 1997
1903 trans = btrfs_start_transaction(root, 1); 1998 trans = btrfs_start_transaction(root, 1);
1904 if (IS_ERR(trans)) { 1999 if (IS_ERR(trans)) {
@@ -2082,7 +2177,7 @@ static long btrfs_ioctl_trans_start(struct file *file)
2082 2177
2083 ret = -ENOMEM; 2178 ret = -ENOMEM;
2084 trans = btrfs_start_ioctl_transaction(root, 0); 2179 trans = btrfs_start_ioctl_transaction(root, 0);
2085 if (!trans) 2180 if (IS_ERR(trans))
2086 goto out_drop; 2181 goto out_drop;
2087 2182
2088 file->private_data = trans; 2183 file->private_data = trans;
@@ -2138,9 +2233,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2138 path->leave_spinning = 1; 2233 path->leave_spinning = 1;
2139 2234
2140 trans = btrfs_start_transaction(root, 1); 2235 trans = btrfs_start_transaction(root, 1);
2141 if (!trans) { 2236 if (IS_ERR(trans)) {
2142 btrfs_free_path(path); 2237 btrfs_free_path(path);
2143 return -ENOMEM; 2238 return PTR_ERR(trans);
2144 } 2239 }
2145 2240
2146 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2241 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -2192,7 +2287,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2192 struct btrfs_ioctl_space_info space; 2287 struct btrfs_ioctl_space_info space;
2193 struct btrfs_ioctl_space_info *dest; 2288 struct btrfs_ioctl_space_info *dest;
2194 struct btrfs_ioctl_space_info *dest_orig; 2289 struct btrfs_ioctl_space_info *dest_orig;
2195 struct btrfs_ioctl_space_info *user_dest; 2290 struct btrfs_ioctl_space_info __user *user_dest;
2196 struct btrfs_space_info *info; 2291 struct btrfs_space_info *info;
2197 u64 types[] = {BTRFS_BLOCK_GROUP_DATA, 2292 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2198 BTRFS_BLOCK_GROUP_SYSTEM, 2293 BTRFS_BLOCK_GROUP_SYSTEM,
@@ -2201,7 +2296,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2201 int num_types = 4; 2296 int num_types = 4;
2202 int alloc_size; 2297 int alloc_size;
2203 int ret = 0; 2298 int ret = 0;
2204 int slot_count = 0; 2299 u64 slot_count = 0;
2205 int i, c; 2300 int i, c;
2206 2301
2207 if (copy_from_user(&space_args, 2302 if (copy_from_user(&space_args,
@@ -2240,7 +2335,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2240 goto out; 2335 goto out;
2241 } 2336 }
2242 2337
2243 slot_count = min_t(int, space_args.space_slots, slot_count); 2338 slot_count = min_t(u64, space_args.space_slots, slot_count);
2244 2339
2245 alloc_size = sizeof(*dest) * slot_count; 2340 alloc_size = sizeof(*dest) * slot_count;
2246 2341
@@ -2260,6 +2355,9 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2260 for (i = 0; i < num_types; i++) { 2355 for (i = 0; i < num_types; i++) {
2261 struct btrfs_space_info *tmp; 2356 struct btrfs_space_info *tmp;
2262 2357
2358 if (!slot_count)
2359 break;
2360
2263 info = NULL; 2361 info = NULL;
2264 rcu_read_lock(); 2362 rcu_read_lock();
2265 list_for_each_entry_rcu(tmp, &root->fs_info->space_info, 2363 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
@@ -2281,7 +2379,10 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2281 memcpy(dest, &space, sizeof(space)); 2379 memcpy(dest, &space, sizeof(space));
2282 dest++; 2380 dest++;
2283 space_args.total_spaces++; 2381 space_args.total_spaces++;
2382 slot_count--;
2284 } 2383 }
2384 if (!slot_count)
2385 break;
2285 } 2386 }
2286 up_read(&info->groups_sem); 2387 up_read(&info->groups_sem);
2287 } 2388 }
@@ -2332,10 +2433,17 @@ static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp
2332 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root; 2433 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2333 struct btrfs_trans_handle *trans; 2434 struct btrfs_trans_handle *trans;
2334 u64 transid; 2435 u64 transid;
2436 int ret;
2335 2437
2336 trans = btrfs_start_transaction(root, 0); 2438 trans = btrfs_start_transaction(root, 0);
2439 if (IS_ERR(trans))
2440 return PTR_ERR(trans);
2337 transid = trans->transid; 2441 transid = trans->transid;
2338 btrfs_commit_transaction_async(trans, root, 0); 2442 ret = btrfs_commit_transaction_async(trans, root, 0);
2443 if (ret) {
2444 btrfs_end_transaction(trans, root);
2445 return ret;
2446 }
2339 2447
2340 if (argp) 2448 if (argp)
2341 if (copy_to_user(argp, &transid, sizeof(transid))) 2449 if (copy_to_user(argp, &transid, sizeof(transid)))
@@ -2370,6 +2478,8 @@ long btrfs_ioctl(struct file *file, unsigned int
2370 return btrfs_ioctl_setflags(file, argp); 2478 return btrfs_ioctl_setflags(file, argp);
2371 case FS_IOC_GETVERSION: 2479 case FS_IOC_GETVERSION:
2372 return btrfs_ioctl_getversion(file, argp); 2480 return btrfs_ioctl_getversion(file, argp);
2481 case FITRIM:
2482 return btrfs_ioctl_fitrim(file, argp);
2373 case BTRFS_IOC_SNAP_CREATE: 2483 case BTRFS_IOC_SNAP_CREATE:
2374 return btrfs_ioctl_snap_create(file, argp, 0); 2484 return btrfs_ioctl_snap_create(file, argp, 0);
2375 case BTRFS_IOC_SNAP_CREATE_V2: 2485 case BTRFS_IOC_SNAP_CREATE_V2:
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
280 unsigned long tot_out; 280 unsigned long tot_out;
281 unsigned long tot_len; 281 unsigned long tot_len;
282 char *buf; 282 char *buf;
283 bool may_late_unmap, need_unmap;
283 284
284 data_in = kmap(pages_in[0]); 285 data_in = kmap(pages_in[0]);
285 tot_len = read_compress_length(data_in); 286 tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
300 301
301 tot_in += in_len; 302 tot_in += in_len;
302 working_bytes = in_len; 303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
303 305
304 /* fast path: avoid using the working buffer */ 306 /* fast path: avoid using the working buffer */
305 if (in_page_bytes_left >= in_len) { 307 if (in_page_bytes_left >= in_len) {
306 buf = data_in + in_offset; 308 buf = data_in + in_offset;
307 bytes = in_len; 309 bytes = in_len;
310 may_late_unmap = true;
308 goto cont; 311 goto cont;
309 } 312 }
310 313
@@ -329,14 +332,17 @@ cont:
329 if (working_bytes == 0 && tot_in >= tot_len) 332 if (working_bytes == 0 && tot_in >= tot_len)
330 break; 333 break;
331 334
332 kunmap(pages_in[page_in_index]); 335 if (page_in_index + 1 >= total_pages_in) {
333 page_in_index++;
334 if (page_in_index >= total_pages_in) {
335 ret = -1; 336 ret = -1;
336 data_in = NULL;
337 goto done; 337 goto done;
338 } 338 }
339 data_in = kmap(pages_in[page_in_index]); 339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
340 346
341 in_page_bytes_left = PAGE_CACHE_SIZE; 347 in_page_bytes_left = PAGE_CACHE_SIZE;
342 in_offset = 0; 348 in_offset = 0;
@@ -346,6 +352,8 @@ cont:
346 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE); 352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
347 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, 353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
348 &out_len); 354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
349 if (ret != LZO_E_OK) { 357 if (ret != LZO_E_OK) {
350 printk(KERN_WARNING "btrfs decompress failed\n"); 358 printk(KERN_WARNING "btrfs decompress failed\n");
351 ret = -1; 359 ret = -1;
@@ -363,8 +371,7 @@ cont:
363 break; 371 break;
364 } 372 }
365done: 373done:
366 if (data_in) 374 kunmap(pages_in[page_in_index]);
367 kunmap(pages_in[page_in_index]);
368 return ret; 375 return ret;
369} 376}
370 377
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2b61e1ddcd99..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
@@ -202,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
202 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
203 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
204 204
205 trace_btrfs_ordered_extent_add(inode, entry);
206
205 spin_lock(&tree->lock); 207 spin_lock(&tree->lock);
206 node = tree_insert(&tree->tree, file_offset, 208 node = tree_insert(&tree->tree, file_offset,
207 &entry->rb_node); 209 &entry->rb_node);
@@ -387,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
387 struct list_head *cur; 389 struct list_head *cur;
388 struct btrfs_ordered_sum *sum; 390 struct btrfs_ordered_sum *sum;
389 391
392 trace_btrfs_ordered_extent_put(entry->inode, entry);
393
390 if (atomic_dec_and_test(&entry->refs)) { 394 if (atomic_dec_and_test(&entry->refs)) {
391 while (!list_empty(&entry->list)) { 395 while (!list_empty(&entry->list)) {
392 cur = entry->list.next; 396 cur = entry->list.next;
@@ -420,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
420 spin_lock(&root->fs_info->ordered_extent_lock); 424 spin_lock(&root->fs_info->ordered_extent_lock);
421 list_del_init(&entry->root_extent_list); 425 list_del_init(&entry->root_extent_list);
422 426
427 trace_btrfs_ordered_extent_remove(inode, entry);
428
423 /* 429 /*
424 * we have no more ordered extents for this inode and 430 * we have no more ordered extents for this inode and
425 * no dirty pages. We can safely remove it from the 431 * no dirty pages. We can safely remove it from the
@@ -585,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
585 u64 start = entry->file_offset; 591 u64 start = entry->file_offset;
586 u64 end = start + entry->len - 1; 592 u64 end = start + entry->len - 1;
587 593
594 trace_btrfs_ordered_extent_start(inode, entry);
595
588 /* 596 /*
589 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
590 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 045c9c2b2d7e..199a80134312 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1157,6 +1157,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1157 new_node->bytenr = dest->node->start; 1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level; 1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest; 1159 new_node->lowest = node->lowest;
1160 new_node->checked = 1;
1160 new_node->root = dest; 1161 new_node->root = dest;
1161 1162
1162 if (!node->lowest) { 1163 if (!node->lowest) {
@@ -1723,6 +1724,7 @@ again:
1723 1724
1724 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1725 old_ptr_gen); 1726 old_ptr_gen);
1727 BUG_ON(!eb);
1726 btrfs_tree_lock(eb); 1728 btrfs_tree_lock(eb);
1727 if (cow) { 1729 if (cow) {
1728 ret = btrfs_cow_block(trans, dest, eb, parent, 1730 ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -2028,6 +2030,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2028 2030
2029 while (1) { 2031 while (1) {
2030 trans = btrfs_start_transaction(root, 0); 2032 trans = btrfs_start_transaction(root, 0);
2033 BUG_ON(IS_ERR(trans));
2031 trans->block_rsv = rc->block_rsv; 2034 trans->block_rsv = rc->block_rsv;
2032 2035
2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2036 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2147,6 +2150,12 @@ again:
2147 } 2150 }
2148 2151
2149 trans = btrfs_join_transaction(rc->extent_root, 1); 2152 trans = btrfs_join_transaction(rc->extent_root, 1);
2153 if (IS_ERR(trans)) {
2154 if (!err)
2155 btrfs_block_rsv_release(rc->extent_root,
2156 rc->block_rsv, num_bytes);
2157 return PTR_ERR(trans);
2158 }
2150 2159
2151 if (!err) { 2160 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) { 2161 if (num_bytes != rc->merging_rsv_size) {
@@ -2337,7 +2346,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2337 root = next->root; 2346 root = next->root;
2338 BUG_ON(!root); 2347 BUG_ON(!root);
2339 2348
2340 /* no other choice for non-refernce counted tree */ 2349 /* no other choice for non-references counted tree */
2341 if (!root->ref_cows) 2350 if (!root->ref_cows)
2342 return root; 2351 return root;
2343 2352
@@ -2505,6 +2514,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2505 blocksize = btrfs_level_size(root, node->level); 2514 blocksize = btrfs_level_size(root, node->level);
2506 generation = btrfs_node_ptr_generation(upper->eb, slot); 2515 generation = btrfs_node_ptr_generation(upper->eb, slot);
2507 eb = read_tree_block(root, bytenr, blocksize, generation); 2516 eb = read_tree_block(root, bytenr, blocksize, generation);
2517 if (!eb) {
2518 err = -EIO;
2519 goto next;
2520 }
2508 btrfs_tree_lock(eb); 2521 btrfs_tree_lock(eb);
2509 btrfs_set_lock_blocking(eb); 2522 btrfs_set_lock_blocking(eb);
2510 2523
@@ -2662,6 +2675,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2662 BUG_ON(block->key_ready); 2675 BUG_ON(block->key_ready);
2663 eb = read_tree_block(rc->extent_root, block->bytenr, 2676 eb = read_tree_block(rc->extent_root, block->bytenr,
2664 block->key.objectid, block->key.offset); 2677 block->key.objectid, block->key.offset);
2678 BUG_ON(!eb);
2665 WARN_ON(btrfs_header_level(eb) != block->level); 2679 WARN_ON(btrfs_header_level(eb) != block->level);
2666 if (block->level == 0) 2680 if (block->level == 0)
2667 btrfs_item_key_to_cpu(eb, &block->key, 0); 2681 btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -3222,6 +3236,7 @@ truncate:
3222 trans = btrfs_join_transaction(root, 0); 3236 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) { 3237 if (IS_ERR(trans)) {
3224 btrfs_free_path(path); 3238 btrfs_free_path(path);
3239 ret = PTR_ERR(trans);
3225 goto out; 3240 goto out;
3226 } 3241 }
3227 3242
@@ -3628,6 +3643,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3628 set_reloc_control(rc); 3643 set_reloc_control(rc);
3629 3644
3630 trans = btrfs_join_transaction(rc->extent_root, 1); 3645 trans = btrfs_join_transaction(rc->extent_root, 1);
3646 BUG_ON(IS_ERR(trans));
3631 btrfs_commit_transaction(trans, rc->extent_root); 3647 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0; 3648 return 0;
3633} 3649}
@@ -3644,6 +3660,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3644 u32 item_size; 3660 u32 item_size;
3645 int ret; 3661 int ret;
3646 int err = 0; 3662 int err = 0;
3663 int progress = 0;
3647 3664
3648 path = btrfs_alloc_path(); 3665 path = btrfs_alloc_path();
3649 if (!path) 3666 if (!path)
@@ -3656,8 +3673,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3656 } 3673 }
3657 3674
3658 while (1) { 3675 while (1) {
3676 progress++;
3659 trans = btrfs_start_transaction(rc->extent_root, 0); 3677 trans = btrfs_start_transaction(rc->extent_root, 0);
3660 3678 BUG_ON(IS_ERR(trans));
3679restart:
3661 if (update_backref_cache(trans, &rc->backref_cache)) { 3680 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root); 3681 btrfs_end_transaction(trans, rc->extent_root);
3663 continue; 3682 continue;
@@ -3770,6 +3789,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3770 } 3789 }
3771 } 3790 }
3772 } 3791 }
3792 if (trans && progress && err == -ENOSPC) {
3793 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3794 rc->block_group->flags);
3795 if (ret == 0) {
3796 err = 0;
3797 progress = 0;
3798 goto restart;
3799 }
3800 }
3773 3801
3774 btrfs_release_path(rc->extent_root, path); 3802 btrfs_release_path(rc->extent_root, path);
3775 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3803 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
@@ -3804,7 +3832,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3804 3832
3805 /* get rid of pinned extents */ 3833 /* get rid of pinned extents */
3806 trans = btrfs_join_transaction(rc->extent_root, 1); 3834 trans = btrfs_join_transaction(rc->extent_root, 1);
3807 btrfs_commit_transaction(trans, rc->extent_root); 3835 if (IS_ERR(trans))
3836 err = PTR_ERR(trans);
3837 else
3838 btrfs_commit_transaction(trans, rc->extent_root);
3808out_free: 3839out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3840 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path); 3841 btrfs_free_path(path);
@@ -4022,6 +4053,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4022 int ret; 4053 int ret;
4023 4054
4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4055 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4056 BUG_ON(IS_ERR(trans));
4025 4057
4026 memset(&root->root_item.drop_progress, 0, 4058 memset(&root->root_item.drop_progress, 0,
4027 sizeof(root->root_item.drop_progress)); 4059 sizeof(root->root_item.drop_progress));
@@ -4125,6 +4157,11 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4125 set_reloc_control(rc); 4157 set_reloc_control(rc);
4126 4158
4127 trans = btrfs_join_transaction(rc->extent_root, 1); 4159 trans = btrfs_join_transaction(rc->extent_root, 1);
4160 if (IS_ERR(trans)) {
4161 unset_reloc_control(rc);
4162 err = PTR_ERR(trans);
4163 goto out_free;
4164 }
4128 4165
4129 rc->merge_reloc_tree = 1; 4166 rc->merge_reloc_tree = 1;
4130 4167
@@ -4154,9 +4191,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4154 unset_reloc_control(rc); 4191 unset_reloc_control(rc);
4155 4192
4156 trans = btrfs_join_transaction(rc->extent_root, 1); 4193 trans = btrfs_join_transaction(rc->extent_root, 1);
4157 btrfs_commit_transaction(trans, rc->extent_root); 4194 if (IS_ERR(trans))
4158out: 4195 err = PTR_ERR(trans);
4196 else
4197 btrfs_commit_transaction(trans, rc->extent_root);
4198out_free:
4159 kfree(rc); 4199 kfree(rc);
4200out:
4160 while (!list_empty(&reloc_roots)) { 4201 while (!list_empty(&reloc_roots)) {
4161 reloc_root = list_entry(reloc_roots.next, 4202 reloc_root = list_entry(reloc_roots.next,
4162 struct btrfs_root, root_list); 4203 struct btrfs_root, root_list);
@@ -4174,7 +4215,7 @@ out:
4174 if (IS_ERR(fs_root)) 4215 if (IS_ERR(fs_root))
4175 err = PTR_ERR(fs_root); 4216 err = PTR_ERR(fs_root);
4176 else 4217 else
4177 btrfs_orphan_cleanup(fs_root); 4218 err = btrfs_orphan_cleanup(fs_root);
4178 } 4219 }
4179 return err; 4220 return err;
4180} 4221}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 6a1086e83ffc..6928bff62daa 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -88,7 +88,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
88 search_key.offset = (u64)-1; 88 search_key.offset = (u64)-1;
89 89
90 path = btrfs_alloc_path(); 90 path = btrfs_alloc_path();
91 BUG_ON(!path); 91 if (!path)
92 return -ENOMEM;
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 93 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0) 94 if (ret < 0)
94 goto out; 95 goto out;
@@ -332,7 +333,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
332 struct extent_buffer *leaf; 333 struct extent_buffer *leaf;
333 334
334 path = btrfs_alloc_path(); 335 path = btrfs_alloc_path();
335 BUG_ON(!path); 336 if (!path)
337 return -ENOMEM;
336 ret = btrfs_search_slot(trans, root, key, path, -1, 1); 338 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
337 if (ret < 0) 339 if (ret < 0)
338 goto out; 340 goto out;
@@ -471,3 +473,21 @@ again:
471 btrfs_free_path(path); 473 btrfs_free_path(path);
472 return 0; 474 return 0;
473} 475}
476
477/*
478 * Old btrfs forgets to init root_item->flags and root_item->byte_limit
479 * for subvolumes. To work around this problem, we steal a bit from
480 * root_item->inode_item->flags, and use it to indicate if those fields
481 * have been properly initialized.
482 */
483void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
484{
485 u64 inode_flags = le64_to_cpu(root_item->inode.flags);
486
487 if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
488 inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
489 root_item->inode.flags = cpu_to_le64(inode_flags);
490 root_item->flags = 0;
491 root_item->byte_limit = 0;
492 }
493}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b2130c46fdb5..0ac712efcdf2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,6 +52,9 @@
52#include "export.h" 52#include "export.h"
53#include "compression.h" 53#include "compression.h"
54 54
55#define CREATE_TRACE_POINTS
56#include <trace/events/btrfs.h>
57
55static const struct super_operations btrfs_super_ops; 58static const struct super_operations btrfs_super_ops;
56 59
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 60static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
@@ -155,7 +158,8 @@ enum {
155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 158 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type, 159 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 160 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err, 161 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
162 Opt_enospc_debug, Opt_subvolrootid, Opt_err,
159}; 163};
160 164
161static match_table_t tokens = { 165static match_table_t tokens = {
@@ -184,6 +188,8 @@ static match_table_t tokens = {
184 {Opt_space_cache, "space_cache"}, 188 {Opt_space_cache, "space_cache"},
185 {Opt_clear_cache, "clear_cache"}, 189 {Opt_clear_cache, "clear_cache"},
186 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, 190 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
191 {Opt_enospc_debug, "enospc_debug"},
192 {Opt_subvolrootid, "subvolrootid=%d"},
187 {Opt_err, NULL}, 193 {Opt_err, NULL},
188}; 194};
189 195
@@ -227,6 +233,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
227 break; 233 break;
228 case Opt_subvol: 234 case Opt_subvol:
229 case Opt_subvolid: 235 case Opt_subvolid:
236 case Opt_subvolrootid:
230 case Opt_device: 237 case Opt_device:
231 /* 238 /*
232 * These are parsed by btrfs_parse_early_options 239 * These are parsed by btrfs_parse_early_options
@@ -358,6 +365,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
358 case Opt_user_subvol_rm_allowed: 365 case Opt_user_subvol_rm_allowed:
359 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); 366 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
360 break; 367 break;
368 case Opt_enospc_debug:
369 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
370 break;
361 case Opt_err: 371 case Opt_err:
362 printk(KERN_INFO "btrfs: unrecognized mount option " 372 printk(KERN_INFO "btrfs: unrecognized mount option "
363 "'%s'\n", p); 373 "'%s'\n", p);
@@ -380,10 +390,10 @@ out:
380 */ 390 */
381static int btrfs_parse_early_options(const char *options, fmode_t flags, 391static int btrfs_parse_early_options(const char *options, fmode_t flags,
382 void *holder, char **subvol_name, u64 *subvol_objectid, 392 void *holder, char **subvol_name, u64 *subvol_objectid,
383 struct btrfs_fs_devices **fs_devices) 393 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
384{ 394{
385 substring_t args[MAX_OPT_ARGS]; 395 substring_t args[MAX_OPT_ARGS];
386 char *opts, *p; 396 char *opts, *orig, *p;
387 int error = 0; 397 int error = 0;
388 int intarg; 398 int intarg;
389 399
@@ -397,6 +407,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
397 opts = kstrdup(options, GFP_KERNEL); 407 opts = kstrdup(options, GFP_KERNEL);
398 if (!opts) 408 if (!opts)
399 return -ENOMEM; 409 return -ENOMEM;
410 orig = opts;
400 411
401 while ((p = strsep(&opts, ",")) != NULL) { 412 while ((p = strsep(&opts, ",")) != NULL) {
402 int token; 413 int token;
@@ -420,6 +431,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
420 *subvol_objectid = intarg; 431 *subvol_objectid = intarg;
421 } 432 }
422 break; 433 break;
434 case Opt_subvolrootid:
435 intarg = 0;
436 error = match_int(&args[0], &intarg);
437 if (!error) {
438 /* we want the original fs_tree */
439 if (!intarg)
440 *subvol_rootid =
441 BTRFS_FS_TREE_OBJECTID;
442 else
443 *subvol_rootid = intarg;
444 }
445 break;
423 case Opt_device: 446 case Opt_device:
424 error = btrfs_scan_one_device(match_strdup(&args[0]), 447 error = btrfs_scan_one_device(match_strdup(&args[0]),
425 flags, holder, fs_devices); 448 flags, holder, fs_devices);
@@ -432,7 +455,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
432 } 455 }
433 456
434 out_free_opts: 457 out_free_opts:
435 kfree(opts); 458 kfree(orig);
436 out: 459 out:
437 /* 460 /*
438 * If no subvolume name is specified we use the default one. Allocate 461 * If no subvolume name is specified we use the default one. Allocate
@@ -614,6 +637,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
614 struct btrfs_root *root = btrfs_sb(sb); 637 struct btrfs_root *root = btrfs_sb(sb);
615 int ret; 638 int ret;
616 639
640 trace_btrfs_sync_fs(wait);
641
617 if (!wait) { 642 if (!wait) {
618 filemap_flush(root->fs_info->btree_inode->i_mapping); 643 filemap_flush(root->fs_info->btree_inode->i_mapping);
619 return 0; 644 return 0;
@@ -623,6 +648,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
623 btrfs_wait_ordered_extents(root, 0, 0); 648 btrfs_wait_ordered_extents(root, 0, 0);
624 649
625 trans = btrfs_start_transaction(root, 0); 650 trans = btrfs_start_transaction(root, 0);
651 if (IS_ERR(trans))
652 return PTR_ERR(trans);
626 ret = btrfs_commit_transaction(trans, root); 653 ret = btrfs_commit_transaction(trans, root);
627 return ret; 654 return ret;
628} 655}
@@ -631,6 +658,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
631{ 658{
632 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 659 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
633 struct btrfs_fs_info *info = root->fs_info; 660 struct btrfs_fs_info *info = root->fs_info;
661 char *compress_type;
634 662
635 if (btrfs_test_opt(root, DEGRADED)) 663 if (btrfs_test_opt(root, DEGRADED))
636 seq_puts(seq, ",degraded"); 664 seq_puts(seq, ",degraded");
@@ -649,8 +677,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
649 if (info->thread_pool_size != min_t(unsigned long, 677 if (info->thread_pool_size != min_t(unsigned long,
650 num_online_cpus() + 2, 8)) 678 num_online_cpus() + 2, 8))
651 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 679 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
652 if (btrfs_test_opt(root, COMPRESS)) 680 if (btrfs_test_opt(root, COMPRESS)) {
653 seq_puts(seq, ",compress"); 681 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
682 compress_type = "zlib";
683 else
684 compress_type = "lzo";
685 if (btrfs_test_opt(root, FORCE_COMPRESS))
686 seq_printf(seq, ",compress-force=%s", compress_type);
687 else
688 seq_printf(seq, ",compress=%s", compress_type);
689 }
654 if (btrfs_test_opt(root, NOSSD)) 690 if (btrfs_test_opt(root, NOSSD))
655 seq_puts(seq, ",nossd"); 691 seq_puts(seq, ",nossd");
656 if (btrfs_test_opt(root, SSD_SPREAD)) 692 if (btrfs_test_opt(root, SSD_SPREAD))
@@ -665,6 +701,12 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
665 seq_puts(seq, ",discard"); 701 seq_puts(seq, ",discard");
666 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 702 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
667 seq_puts(seq, ",noacl"); 703 seq_puts(seq, ",noacl");
704 if (btrfs_test_opt(root, SPACE_CACHE))
705 seq_puts(seq, ",space_cache");
706 if (btrfs_test_opt(root, CLEAR_CACHE))
707 seq_puts(seq, ",clear_cache");
708 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
709 seq_puts(seq, ",user_subvol_rm_allowed");
668 return 0; 710 return 0;
669} 711}
670 712
@@ -708,6 +750,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
708 fmode_t mode = FMODE_READ; 750 fmode_t mode = FMODE_READ;
709 char *subvol_name = NULL; 751 char *subvol_name = NULL;
710 u64 subvol_objectid = 0; 752 u64 subvol_objectid = 0;
753 u64 subvol_rootid = 0;
711 int error = 0; 754 int error = 0;
712 755
713 if (!(flags & MS_RDONLY)) 756 if (!(flags & MS_RDONLY))
@@ -715,7 +758,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
715 758
716 error = btrfs_parse_early_options(data, mode, fs_type, 759 error = btrfs_parse_early_options(data, mode, fs_type,
717 &subvol_name, &subvol_objectid, 760 &subvol_name, &subvol_objectid,
718 &fs_devices); 761 &subvol_rootid, &fs_devices);
719 if (error) 762 if (error)
720 return ERR_PTR(error); 763 return ERR_PTR(error);
721 764
@@ -761,6 +804,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
761 } 804 }
762 805
763 btrfs_close_devices(fs_devices); 806 btrfs_close_devices(fs_devices);
807 kfree(fs_info);
808 kfree(tree_root);
764 } else { 809 } else {
765 char b[BDEVNAME_SIZE]; 810 char b[BDEVNAME_SIZE];
766 811
@@ -777,15 +822,17 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
777 s->s_flags |= MS_ACTIVE; 822 s->s_flags |= MS_ACTIVE;
778 } 823 }
779 824
780 root = get_default_root(s, subvol_objectid);
781 if (IS_ERR(root)) {
782 error = PTR_ERR(root);
783 deactivate_locked_super(s);
784 goto error_free_subvol_name;
785 }
786 /* if they gave us a subvolume name bind mount into that */ 825 /* if they gave us a subvolume name bind mount into that */
787 if (strcmp(subvol_name, ".")) { 826 if (strcmp(subvol_name, ".")) {
788 struct dentry *new_root; 827 struct dentry *new_root;
828
829 root = get_default_root(s, subvol_rootid);
830 if (IS_ERR(root)) {
831 error = PTR_ERR(root);
832 deactivate_locked_super(s);
833 goto error_free_subvol_name;
834 }
835
789 mutex_lock(&root->d_inode->i_mutex); 836 mutex_lock(&root->d_inode->i_mutex);
790 new_root = lookup_one_len(subvol_name, root, 837 new_root = lookup_one_len(subvol_name, root,
791 strlen(subvol_name)); 838 strlen(subvol_name));
@@ -806,6 +853,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
806 } 853 }
807 dput(root); 854 dput(root);
808 root = new_root; 855 root = new_root;
856 } else {
857 root = get_default_root(s, subvol_objectid);
858 if (IS_ERR(root)) {
859 error = PTR_ERR(root);
860 deactivate_locked_super(s);
861 goto error_free_subvol_name;
862 }
809 } 863 }
810 864
811 kfree(subvol_name); 865 kfree(subvol_name);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index bae5c7b8bbe2..c571734d5e5a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -32,10 +32,8 @@
32 32
33static noinline void put_transaction(struct btrfs_transaction *transaction) 33static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 34{
35 WARN_ON(transaction->use_count == 0); 35 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 36 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) {
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 37 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 38 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 39 }
@@ -57,16 +55,17 @@ static noinline int join_transaction(struct btrfs_root *root)
57 if (!cur_trans) { 55 if (!cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 56 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
59 GFP_NOFS); 57 GFP_NOFS);
60 BUG_ON(!cur_trans); 58 if (!cur_trans)
59 return -ENOMEM;
61 root->fs_info->generation++; 60 root->fs_info->generation++;
62 cur_trans->num_writers = 1; 61 atomic_set(&cur_trans->num_writers, 1);
63 cur_trans->num_joined = 0; 62 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation; 63 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait); 64 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait); 65 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0; 66 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0; 67 cur_trans->blocked = 0;
69 cur_trans->use_count = 1; 68 atomic_set(&cur_trans->use_count, 1);
70 cur_trans->commit_done = 0; 69 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds(); 70 cur_trans->start_time = get_seconds();
72 71
@@ -87,7 +86,7 @@ static noinline int join_transaction(struct btrfs_root *root)
87 root->fs_info->running_transaction = cur_trans; 86 root->fs_info->running_transaction = cur_trans;
88 spin_unlock(&root->fs_info->new_trans_lock); 87 spin_unlock(&root->fs_info->new_trans_lock);
89 } else { 88 } else {
90 cur_trans->num_writers++; 89 atomic_inc(&cur_trans->num_writers);
91 cur_trans->num_joined++; 90 cur_trans->num_joined++;
92 } 91 }
93 92
@@ -144,7 +143,7 @@ static void wait_current_trans(struct btrfs_root *root)
144 cur_trans = root->fs_info->running_transaction; 143 cur_trans = root->fs_info->running_transaction;
145 if (cur_trans && cur_trans->blocked) { 144 if (cur_trans && cur_trans->blocked) {
146 DEFINE_WAIT(wait); 145 DEFINE_WAIT(wait);
147 cur_trans->use_count++; 146 atomic_inc(&cur_trans->use_count);
148 while (1) { 147 while (1) {
149 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 148 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
150 TASK_UNINTERRUPTIBLE); 149 TASK_UNINTERRUPTIBLE);
@@ -180,6 +179,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
180{ 179{
181 struct btrfs_trans_handle *h; 180 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans; 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
183 int ret; 183 int ret;
184 184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -195,10 +195,15 @@ again:
195 wait_current_trans(root); 195 wait_current_trans(root);
196 196
197 ret = join_transaction(root); 197 ret = join_transaction(root);
198 BUG_ON(ret); 198 if (ret < 0) {
199 kmem_cache_free(btrfs_trans_handle_cachep, h);
200 if (type != TRANS_JOIN_NOLOCK)
201 mutex_unlock(&root->fs_info->trans_mutex);
202 return ERR_PTR(ret);
203 }
199 204
200 cur_trans = root->fs_info->running_transaction; 205 cur_trans = root->fs_info->running_transaction;
201 cur_trans->use_count++; 206 atomic_inc(&cur_trans->use_count);
202 if (type != TRANS_JOIN_NOLOCK) 207 if (type != TRANS_JOIN_NOLOCK)
203 mutex_unlock(&root->fs_info->trans_mutex); 208 mutex_unlock(&root->fs_info->trans_mutex);
204 209
@@ -218,10 +223,18 @@ again:
218 223
219 if (num_items > 0) { 224 if (num_items > 0) {
220 ret = btrfs_trans_reserve_metadata(h, root, num_items); 225 ret = btrfs_trans_reserve_metadata(h, root, num_items);
221 if (ret == -EAGAIN) { 226 if (ret == -EAGAIN && !retries) {
227 retries++;
222 btrfs_commit_transaction(h, root); 228 btrfs_commit_transaction(h, root);
223 goto again; 229 goto again;
230 } else if (ret == -EAGAIN) {
231 /*
232 * We have already retried and got EAGAIN, so really we
233 * don't have space, so set ret to -ENOSPC.
234 */
235 ret = -ENOSPC;
224 } 236 }
237
225 if (ret < 0) { 238 if (ret < 0) {
226 btrfs_end_transaction(h, root); 239 btrfs_end_transaction(h, root);
227 return ERR_PTR(ret); 240 return ERR_PTR(ret);
@@ -321,7 +334,7 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
321 goto out_unlock; /* nothing committing|committed */ 334 goto out_unlock; /* nothing committing|committed */
322 } 335 }
323 336
324 cur_trans->use_count++; 337 atomic_inc(&cur_trans->use_count);
325 mutex_unlock(&root->fs_info->trans_mutex); 338 mutex_unlock(&root->fs_info->trans_mutex);
326 339
327 wait_for_commit(root, cur_trans); 340 wait_for_commit(root, cur_trans);
@@ -451,18 +464,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
451 wake_up_process(info->transaction_kthread); 464 wake_up_process(info->transaction_kthread);
452 } 465 }
453 466
454 if (lock)
455 mutex_lock(&info->trans_mutex);
456 WARN_ON(cur_trans != info->running_transaction); 467 WARN_ON(cur_trans != info->running_transaction);
457 WARN_ON(cur_trans->num_writers < 1); 468 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
458 cur_trans->num_writers--; 469 atomic_dec(&cur_trans->num_writers);
459 470
460 smp_mb(); 471 smp_mb();
461 if (waitqueue_active(&cur_trans->writer_wait)) 472 if (waitqueue_active(&cur_trans->writer_wait))
462 wake_up(&cur_trans->writer_wait); 473 wake_up(&cur_trans->writer_wait);
463 put_transaction(cur_trans); 474 put_transaction(cur_trans);
464 if (lock)
465 mutex_unlock(&info->trans_mutex);
466 475
467 if (current->journal_info == trans) 476 if (current->journal_info == trans)
468 current->journal_info = NULL; 477 current->journal_info = NULL;
@@ -970,6 +979,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
970 record_root_in_trans(trans, root); 979 record_root_in_trans(trans, root);
971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 980 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 981 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
982 btrfs_check_and_init_root_item(new_root_item);
973 983
974 root_flags = btrfs_root_flags(new_root_item); 984 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly) 985 if (pending->readonly)
@@ -1156,16 +1166,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1156 struct btrfs_transaction *cur_trans; 1166 struct btrfs_transaction *cur_trans;
1157 1167
1158 ac = kmalloc(sizeof(*ac), GFP_NOFS); 1168 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1159 BUG_ON(!ac); 1169 if (!ac)
1170 return -ENOMEM;
1160 1171
1161 INIT_DELAYED_WORK(&ac->work, do_async_commit); 1172 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1162 ac->root = root; 1173 ac->root = root;
1163 ac->newtrans = btrfs_join_transaction(root, 0); 1174 ac->newtrans = btrfs_join_transaction(root, 0);
1175 if (IS_ERR(ac->newtrans)) {
1176 int err = PTR_ERR(ac->newtrans);
1177 kfree(ac);
1178 return err;
1179 }
1164 1180
1165 /* take transaction reference */ 1181 /* take transaction reference */
1166 mutex_lock(&root->fs_info->trans_mutex); 1182 mutex_lock(&root->fs_info->trans_mutex);
1167 cur_trans = trans->transaction; 1183 cur_trans = trans->transaction;
1168 cur_trans->use_count++; 1184 atomic_inc(&cur_trans->use_count);
1169 mutex_unlock(&root->fs_info->trans_mutex); 1185 mutex_unlock(&root->fs_info->trans_mutex);
1170 1186
1171 btrfs_end_transaction(trans, root); 1187 btrfs_end_transaction(trans, root);
@@ -1224,7 +1240,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1224 1240
1225 mutex_lock(&root->fs_info->trans_mutex); 1241 mutex_lock(&root->fs_info->trans_mutex);
1226 if (cur_trans->in_commit) { 1242 if (cur_trans->in_commit) {
1227 cur_trans->use_count++; 1243 atomic_inc(&cur_trans->use_count);
1228 mutex_unlock(&root->fs_info->trans_mutex); 1244 mutex_unlock(&root->fs_info->trans_mutex);
1229 btrfs_end_transaction(trans, root); 1245 btrfs_end_transaction(trans, root);
1230 1246
@@ -1246,7 +1262,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1246 prev_trans = list_entry(cur_trans->list.prev, 1262 prev_trans = list_entry(cur_trans->list.prev,
1247 struct btrfs_transaction, list); 1263 struct btrfs_transaction, list);
1248 if (!prev_trans->commit_done) { 1264 if (!prev_trans->commit_done) {
1249 prev_trans->use_count++; 1265 atomic_inc(&prev_trans->use_count);
1250 mutex_unlock(&root->fs_info->trans_mutex); 1266 mutex_unlock(&root->fs_info->trans_mutex);
1251 1267
1252 wait_for_commit(root, prev_trans); 1268 wait_for_commit(root, prev_trans);
@@ -1287,14 +1303,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1287 TASK_UNINTERRUPTIBLE); 1303 TASK_UNINTERRUPTIBLE);
1288 1304
1289 smp_mb(); 1305 smp_mb();
1290 if (cur_trans->num_writers > 1) 1306 if (atomic_read(&cur_trans->num_writers) > 1)
1291 schedule_timeout(MAX_SCHEDULE_TIMEOUT); 1307 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1292 else if (should_grow) 1308 else if (should_grow)
1293 schedule_timeout(1); 1309 schedule_timeout(1);
1294 1310
1295 mutex_lock(&root->fs_info->trans_mutex); 1311 mutex_lock(&root->fs_info->trans_mutex);
1296 finish_wait(&cur_trans->writer_wait, &wait); 1312 finish_wait(&cur_trans->writer_wait, &wait);
1297 } while (cur_trans->num_writers > 1 || 1313 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1298 (should_grow && cur_trans->num_joined != joined)); 1314 (should_grow && cur_trans->num_joined != joined));
1299 1315
1300 ret = create_pending_snapshots(trans, root->fs_info); 1316 ret = create_pending_snapshots(trans, root->fs_info);
@@ -1381,9 +1397,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1381 1397
1382 wake_up(&cur_trans->commit_wait); 1398 wake_up(&cur_trans->commit_wait);
1383 1399
1400 list_del_init(&cur_trans->list);
1384 put_transaction(cur_trans); 1401 put_transaction(cur_trans);
1385 put_transaction(cur_trans); 1402 put_transaction(cur_trans);
1386 1403
1404 trace_btrfs_transaction_commit(root);
1405
1387 mutex_unlock(&root->fs_info->trans_mutex); 1406 mutex_unlock(&root->fs_info->trans_mutex);
1388 1407
1389 if (current->journal_info == trans) 1408 if (current->journal_info == trans)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 229a594cacd5..e441acc6c584 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,11 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 31
32 unsigned long num_joined; 32 unsigned long num_joined;
33 int in_commit; 33 int in_commit;
34 int use_count; 34 atomic_t use_count;
35 int commit_done; 35 int commit_done;
36 int blocked; 36 int blocked;
37 struct list_head list; 37 struct list_head list;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 054744ac5719..c50271ad3157 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -338,6 +338,12 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(root, path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -665,6 +671,9 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(root, path);
670 679
@@ -744,6 +753,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 753 int match = 0;
745 754
746 path = btrfs_alloc_path(); 755 path = btrfs_alloc_path();
756 if (!path)
757 return -ENOMEM;
758
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 759 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 760 if (ret != 0)
749 goto out; 761 goto out;
@@ -787,12 +799,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
787 struct inode *dir; 799 struct inode *dir;
788 int ret; 800 int ret;
789 struct btrfs_inode_ref *ref; 801 struct btrfs_inode_ref *ref;
790 struct btrfs_dir_item *di;
791 struct inode *inode; 802 struct inode *inode;
792 char *name; 803 char *name;
793 int namelen; 804 int namelen;
794 unsigned long ref_ptr; 805 unsigned long ref_ptr;
795 unsigned long ref_end; 806 unsigned long ref_end;
807 int search_done = 0;
796 808
797 /* 809 /*
798 * it is possible that we didn't log all the parent directories 810 * it is possible that we didn't log all the parent directories
@@ -833,7 +845,10 @@ again:
833 * existing back reference, and we don't want to create 845 * existing back reference, and we don't want to create
834 * dangling pointers in the directory. 846 * dangling pointers in the directory.
835 */ 847 */
836conflict_again: 848
849 if (search_done)
850 goto insert;
851
837 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 852 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
838 if (ret == 0) { 853 if (ret == 0) {
839 char *victim_name; 854 char *victim_name;
@@ -874,37 +889,21 @@ conflict_again:
874 ret = btrfs_unlink_inode(trans, root, dir, 889 ret = btrfs_unlink_inode(trans, root, dir,
875 inode, victim_name, 890 inode, victim_name,
876 victim_name_len); 891 victim_name_len);
877 kfree(victim_name);
878 btrfs_release_path(root, path);
879 goto conflict_again;
880 } 892 }
881 kfree(victim_name); 893 kfree(victim_name);
882 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 894 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
883 } 895 }
884 BUG_ON(ret); 896 BUG_ON(ret);
885 }
886 btrfs_release_path(root, path);
887 897
888 /* look for a conflicting sequence number */ 898 /*
889 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 899 * NOTE: we have searched root tree and checked the
890 btrfs_inode_ref_index(eb, ref), 900 * coresponding ref, it does not need to check again.
891 name, namelen, 0); 901 */
892 if (di && !IS_ERR(di)) { 902 search_done = 1;
893 ret = drop_one_dir_item(trans, root, path, dir, di);
894 BUG_ON(ret);
895 }
896 btrfs_release_path(root, path);
897
898
899 /* look for a conflicting name */
900 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
901 name, namelen, 0);
902 if (di && !IS_ERR(di)) {
903 ret = drop_one_dir_item(trans, root, path, dir, di);
904 BUG_ON(ret);
905 } 903 }
906 btrfs_release_path(root, path); 904 btrfs_release_path(root, path);
907 905
906insert:
908 /* insert our name */ 907 /* insert our name */
909 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 908 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
910 btrfs_inode_ref_index(eb, ref)); 909 btrfs_inode_ref_index(eb, ref));
@@ -967,6 +966,8 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
967 key.offset = (u64)-1; 966 key.offset = (u64)-1;
968 967
969 path = btrfs_alloc_path(); 968 path = btrfs_alloc_path();
969 if (!path)
970 return -ENOMEM;
970 971
971 while (1) { 972 while (1) {
972 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 973 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -1178,6 +1179,9 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1178 1179
1179 name_len = btrfs_dir_name_len(eb, di); 1180 name_len = btrfs_dir_name_len(eb, di);
1180 name = kmalloc(name_len, GFP_NOFS); 1181 name = kmalloc(name_len, GFP_NOFS);
1182 if (!name)
1183 return -ENOMEM;
1184
1181 log_type = btrfs_dir_type(eb, di); 1185 log_type = btrfs_dir_type(eb, di);
1182 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1186 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1183 name_len); 1187 name_len);
@@ -1269,6 +1273,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1269 ptr_end = ptr + item_size; 1273 ptr_end = ptr + item_size;
1270 while (ptr < ptr_end) { 1274 while (ptr < ptr_end) {
1271 di = (struct btrfs_dir_item *)ptr; 1275 di = (struct btrfs_dir_item *)ptr;
1276 if (verify_dir_item(root, eb, di))
1277 return -EIO;
1272 name_len = btrfs_dir_name_len(eb, di); 1278 name_len = btrfs_dir_name_len(eb, di);
1273 ret = replay_one_name(trans, root, path, eb, di, key); 1279 ret = replay_one_name(trans, root, path, eb, di, key);
1274 BUG_ON(ret); 1280 BUG_ON(ret);
@@ -1395,6 +1401,11 @@ again:
1395 ptr_end = ptr + item_size; 1401 ptr_end = ptr + item_size;
1396 while (ptr < ptr_end) { 1402 while (ptr < ptr_end) {
1397 di = (struct btrfs_dir_item *)ptr; 1403 di = (struct btrfs_dir_item *)ptr;
1404 if (verify_dir_item(root, eb, di)) {
1405 ret = -EIO;
1406 goto out;
1407 }
1408
1398 name_len = btrfs_dir_name_len(eb, di); 1409 name_len = btrfs_dir_name_len(eb, di);
1399 name = kmalloc(name_len, GFP_NOFS); 1410 name = kmalloc(name_len, GFP_NOFS);
1400 if (!name) { 1411 if (!name) {
@@ -1692,6 +1703,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1692 root_owner = btrfs_header_owner(parent); 1703 root_owner = btrfs_header_owner(parent);
1693 1704
1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1705 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1706 if (!next)
1707 return -ENOMEM;
1695 1708
1696 if (*level == 1) { 1709 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen); 1710 wc->process_func(root, next, wc, ptr_gen);
@@ -1802,7 +1815,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1802 int orig_level; 1815 int orig_level;
1803 1816
1804 path = btrfs_alloc_path(); 1817 path = btrfs_alloc_path();
1805 BUG_ON(!path); 1818 if (!path)
1819 return -ENOMEM;
1806 1820
1807 level = btrfs_header_level(log->node); 1821 level = btrfs_header_level(log->node);
1808 orig_level = level; 1822 orig_level = level;
@@ -2032,6 +2046,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2032 wait_log_commit(trans, log_root_tree, 2046 wait_log_commit(trans, log_root_tree,
2033 log_root_tree->log_transid); 2047 log_root_tree->log_transid);
2034 mutex_unlock(&log_root_tree->log_mutex); 2048 mutex_unlock(&log_root_tree->log_mutex);
2049 ret = 0;
2035 goto out; 2050 goto out;
2036 } 2051 }
2037 atomic_set(&log_root_tree->log_commit[index2], 1); 2052 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2096,7 +2111,7 @@ out:
2096 smp_mb(); 2111 smp_mb();
2097 if (waitqueue_active(&root->log_commit_wait[index1])) 2112 if (waitqueue_active(&root->log_commit_wait[index1]))
2098 wake_up(&root->log_commit_wait[index1]); 2113 wake_up(&root->log_commit_wait[index1]);
2099 return 0; 2114 return ret;
2100} 2115}
2101 2116
2102static void free_log_tree(struct btrfs_trans_handle *trans, 2117static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2194,6 +2209,9 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2194 2209
2195 log = root->log_root; 2210 log = root->log_root;
2196 path = btrfs_alloc_path(); 2211 path = btrfs_alloc_path();
2212 if (!path)
2213 return -ENOMEM;
2214
2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2215 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2198 name, name_len, -1); 2216 name, name_len, -1);
2199 if (IS_ERR(di)) { 2217 if (IS_ERR(di)) {
@@ -2594,6 +2612,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2594 2612
2595 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2613 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2596 nr * sizeof(u32), GFP_NOFS); 2614 nr * sizeof(u32), GFP_NOFS);
2615 if (!ins_data)
2616 return -ENOMEM;
2617
2597 ins_sizes = (u32 *)ins_data; 2618 ins_sizes = (u32 *)ins_data;
2598 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2619 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2599 2620
@@ -2725,7 +2746,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2725 log = root->log_root; 2746 log = root->log_root;
2726 2747
2727 path = btrfs_alloc_path(); 2748 path = btrfs_alloc_path();
2749 if (!path)
2750 return -ENOMEM;
2728 dst_path = btrfs_alloc_path(); 2751 dst_path = btrfs_alloc_path();
2752 if (!dst_path) {
2753 btrfs_free_path(path);
2754 return -ENOMEM;
2755 }
2729 2756
2730 min_key.objectid = inode->i_ino; 2757 min_key.objectid = inode->i_ino;
2731 min_key.type = BTRFS_INODE_ITEM_KEY; 2758 min_key.type = BTRFS_INODE_ITEM_KEY;
@@ -3075,16 +3102,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3075 .stage = 0, 3102 .stage = 0,
3076 }; 3103 };
3077 3104
3078 fs_info->log_root_recovering = 1;
3079 path = btrfs_alloc_path(); 3105 path = btrfs_alloc_path();
3080 BUG_ON(!path); 3106 if (!path)
3107 return -ENOMEM;
3108
3109 fs_info->log_root_recovering = 1;
3081 3110
3082 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3111 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3112 BUG_ON(IS_ERR(trans));
3083 3113
3084 wc.trans = trans; 3114 wc.trans = trans;
3085 wc.pin = 1; 3115 wc.pin = 1;
3086 3116
3087 walk_log_tree(trans, log_root_tree, &wc); 3117 ret = walk_log_tree(trans, log_root_tree, &wc);
3118 BUG_ON(ret);
3088 3119
3089again: 3120again:
3090 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3121 key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3108,8 +3139,7 @@ again:
3108 3139
3109 log = btrfs_read_fs_root_no_radix(log_root_tree, 3140 log = btrfs_read_fs_root_no_radix(log_root_tree,
3110 &found_key); 3141 &found_key);
3111 BUG_ON(!log); 3142 BUG_ON(IS_ERR(log));
3112
3113 3143
3114 tmp_key.objectid = found_key.offset; 3144 tmp_key.objectid = found_key.offset;
3115 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3145 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d158530233b7..309a57b9fc85 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -33,17 +33,6 @@
33#include "volumes.h" 33#include "volumes.h"
34#include "async-thread.h" 34#include "async-thread.h"
35 35
36struct map_lookup {
37 u64 type;
38 int io_align;
39 int io_width;
40 int stripe_len;
41 int sector_size;
42 int num_stripes;
43 int sub_stripes;
44 struct btrfs_bio_stripe stripes[];
45};
46
47static int init_first_rw_device(struct btrfs_trans_handle *trans, 36static int init_first_rw_device(struct btrfs_trans_handle *trans,
48 struct btrfs_root *root, 37 struct btrfs_root *root,
49 struct btrfs_device *device); 38 struct btrfs_device *device);
@@ -162,7 +151,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
162 struct bio *cur; 151 struct bio *cur;
163 int again = 0; 152 int again = 0;
164 unsigned long num_run; 153 unsigned long num_run;
165 unsigned long num_sync_run;
166 unsigned long batch_run = 0; 154 unsigned long batch_run = 0;
167 unsigned long limit; 155 unsigned long limit;
168 unsigned long last_waited = 0; 156 unsigned long last_waited = 0;
@@ -173,11 +161,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
173 limit = btrfs_async_submit_limit(fs_info); 161 limit = btrfs_async_submit_limit(fs_info);
174 limit = limit * 2 / 3; 162 limit = limit * 2 / 3;
175 163
176 /* we want to make sure that every time we switch from the sync
177 * list to the normal list, we unplug
178 */
179 num_sync_run = 0;
180
181loop: 164loop:
182 spin_lock(&device->io_lock); 165 spin_lock(&device->io_lock);
183 166
@@ -223,15 +206,6 @@ loop_lock:
223 206
224 spin_unlock(&device->io_lock); 207 spin_unlock(&device->io_lock);
225 208
226 /*
227 * if we're doing the regular priority list, make sure we unplug
228 * for any high prio bios we've sent down
229 */
230 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
231 num_sync_run = 0;
232 blk_run_backing_dev(bdi, NULL);
233 }
234
235 while (pending) { 209 while (pending) {
236 210
237 rmb(); 211 rmb();
@@ -259,19 +233,11 @@ loop_lock:
259 233
260 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 234 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
261 235
262 if (cur->bi_rw & REQ_SYNC)
263 num_sync_run++;
264
265 submit_bio(cur->bi_rw, cur); 236 submit_bio(cur->bi_rw, cur);
266 num_run++; 237 num_run++;
267 batch_run++; 238 batch_run++;
268 if (need_resched()) { 239 if (need_resched())
269 if (num_sync_run) {
270 blk_run_backing_dev(bdi, NULL);
271 num_sync_run = 0;
272 }
273 cond_resched(); 240 cond_resched();
274 }
275 241
276 /* 242 /*
277 * we made progress, there is more work to do and the bdi 243 * we made progress, there is more work to do and the bdi
@@ -304,13 +270,8 @@ loop_lock:
304 * against it before looping 270 * against it before looping
305 */ 271 */
306 last_waited = ioc->last_waited; 272 last_waited = ioc->last_waited;
307 if (need_resched()) { 273 if (need_resched())
308 if (num_sync_run) {
309 blk_run_backing_dev(bdi, NULL);
310 num_sync_run = 0;
311 }
312 cond_resched(); 274 cond_resched();
313 }
314 continue; 275 continue;
315 } 276 }
316 spin_lock(&device->io_lock); 277 spin_lock(&device->io_lock);
@@ -323,22 +284,6 @@ loop_lock:
323 } 284 }
324 } 285 }
325 286
326 if (num_sync_run) {
327 num_sync_run = 0;
328 blk_run_backing_dev(bdi, NULL);
329 }
330 /*
331 * IO has already been through a long path to get here. Checksumming,
332 * async helper threads, perhaps compression. We've done a pretty
333 * good job of collecting a batch of IO and should just unplug
334 * the device right away.
335 *
336 * This will help anyone who is waiting on the IO, they might have
337 * already unplugged, but managed to do so before the bio they
338 * cared about found its way down here.
339 */
340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched(); 287 cond_resched();
343 if (again) 288 if (again)
344 goto loop; 289 goto loop;
@@ -1213,6 +1158,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1213 return -ENOMEM; 1158 return -ENOMEM;
1214 1159
1215 trans = btrfs_start_transaction(root, 0); 1160 trans = btrfs_start_transaction(root, 0);
1161 if (IS_ERR(trans)) {
1162 btrfs_free_path(path);
1163 return PTR_ERR(trans);
1164 }
1216 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1165 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1217 key.type = BTRFS_DEV_ITEM_KEY; 1166 key.type = BTRFS_DEV_ITEM_KEY;
1218 key.offset = device->devid; 1167 key.offset = device->devid;
@@ -1334,11 +1283,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1334 1283
1335 ret = btrfs_shrink_device(device, 0); 1284 ret = btrfs_shrink_device(device, 0);
1336 if (ret) 1285 if (ret)
1337 goto error_brelse; 1286 goto error_undo;
1338 1287
1339 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1288 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1340 if (ret) 1289 if (ret)
1341 goto error_brelse; 1290 goto error_undo;
1342 1291
1343 device->in_fs_metadata = 0; 1292 device->in_fs_metadata = 0;
1344 1293
@@ -1412,6 +1361,13 @@ out:
1412 mutex_unlock(&root->fs_info->volume_mutex); 1361 mutex_unlock(&root->fs_info->volume_mutex);
1413 mutex_unlock(&uuid_mutex); 1362 mutex_unlock(&uuid_mutex);
1414 return ret; 1363 return ret;
1364error_undo:
1365 if (device->writeable) {
1366 list_add(&device->dev_alloc_list,
1367 &root->fs_info->fs_devices->alloc_list);
1368 root->fs_info->fs_devices->rw_devices++;
1369 }
1370 goto error_brelse;
1415} 1371}
1416 1372
1417/* 1373/*
@@ -1601,11 +1557,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1601 1557
1602 ret = find_next_devid(root, &device->devid); 1558 ret = find_next_devid(root, &device->devid);
1603 if (ret) { 1559 if (ret) {
1560 kfree(device->name);
1604 kfree(device); 1561 kfree(device);
1605 goto error; 1562 goto error;
1606 } 1563 }
1607 1564
1608 trans = btrfs_start_transaction(root, 0); 1565 trans = btrfs_start_transaction(root, 0);
1566 if (IS_ERR(trans)) {
1567 kfree(device->name);
1568 kfree(device);
1569 ret = PTR_ERR(trans);
1570 goto error;
1571 }
1572
1609 lock_chunks(root); 1573 lock_chunks(root);
1610 1574
1611 device->writeable = 1; 1575 device->writeable = 1;
@@ -1621,7 +1585,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1621 device->dev_root = root->fs_info->dev_root; 1585 device->dev_root = root->fs_info->dev_root;
1622 device->bdev = bdev; 1586 device->bdev = bdev;
1623 device->in_fs_metadata = 1; 1587 device->in_fs_metadata = 1;
1624 device->mode = 0; 1588 device->mode = FMODE_EXCL;
1625 set_blocksize(device->bdev, 4096); 1589 set_blocksize(device->bdev, 4096);
1626 1590
1627 if (seeding_dev) { 1591 if (seeding_dev) {
@@ -1873,7 +1837,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1873 return ret; 1837 return ret;
1874 1838
1875 trans = btrfs_start_transaction(root, 0); 1839 trans = btrfs_start_transaction(root, 0);
1876 BUG_ON(!trans); 1840 BUG_ON(IS_ERR(trans));
1877 1841
1878 lock_chunks(root); 1842 lock_chunks(root);
1879 1843
@@ -1904,6 +1868,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1904 1868
1905 BUG_ON(ret); 1869 BUG_ON(ret);
1906 1870
1871 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1872
1907 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1873 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1908 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1874 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1909 BUG_ON(ret); 1875 BUG_ON(ret);
@@ -2047,7 +2013,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
2047 BUG_ON(ret); 2013 BUG_ON(ret);
2048 2014
2049 trans = btrfs_start_transaction(dev_root, 0); 2015 trans = btrfs_start_transaction(dev_root, 0);
2050 BUG_ON(!trans); 2016 BUG_ON(IS_ERR(trans));
2051 2017
2052 ret = btrfs_grow_device(trans, device, old_size); 2018 ret = btrfs_grow_device(trans, device, old_size);
2053 BUG_ON(ret); 2019 BUG_ON(ret);
@@ -2213,6 +2179,11 @@ again:
2213 2179
2214 /* Shrinking succeeded, else we would be at "done". */ 2180 /* Shrinking succeeded, else we would be at "done". */
2215 trans = btrfs_start_transaction(root, 0); 2181 trans = btrfs_start_transaction(root, 0);
2182 if (IS_ERR(trans)) {
2183 ret = PTR_ERR(trans);
2184 goto done;
2185 }
2186
2216 lock_chunks(root); 2187 lock_chunks(root);
2217 2188
2218 device->disk_total_bytes = new_size; 2189 device->disk_total_bytes = new_size;
@@ -2626,6 +2597,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2626 *num_bytes = chunk_bytes_by_type(type, calc_size, 2597 *num_bytes = chunk_bytes_by_type(type, calc_size,
2627 map->num_stripes, sub_stripes); 2598 map->num_stripes, sub_stripes);
2628 2599
2600 trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
2601
2629 em = alloc_extent_map(GFP_NOFS); 2602 em = alloc_extent_map(GFP_NOFS);
2630 if (!em) { 2603 if (!em) {
2631 ret = -ENOMEM; 2604 ret = -ENOMEM;
@@ -2734,6 +2707,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2734 item_size); 2707 item_size);
2735 BUG_ON(ret); 2708 BUG_ON(ret);
2736 } 2709 }
2710
2737 kfree(chunk); 2711 kfree(chunk);
2738 return 0; 2712 return 0;
2739} 2713}
@@ -2931,14 +2905,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2931static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2905static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2932 u64 logical, u64 *length, 2906 u64 logical, u64 *length,
2933 struct btrfs_multi_bio **multi_ret, 2907 struct btrfs_multi_bio **multi_ret,
2934 int mirror_num, struct page *unplug_page) 2908 int mirror_num)
2935{ 2909{
2936 struct extent_map *em; 2910 struct extent_map *em;
2937 struct map_lookup *map; 2911 struct map_lookup *map;
2938 struct extent_map_tree *em_tree = &map_tree->map_tree; 2912 struct extent_map_tree *em_tree = &map_tree->map_tree;
2939 u64 offset; 2913 u64 offset;
2940 u64 stripe_offset; 2914 u64 stripe_offset;
2915 u64 stripe_end_offset;
2941 u64 stripe_nr; 2916 u64 stripe_nr;
2917 u64 stripe_nr_orig;
2918 u64 stripe_nr_end;
2942 int stripes_allocated = 8; 2919 int stripes_allocated = 8;
2943 int stripes_required = 1; 2920 int stripes_required = 1;
2944 int stripe_index; 2921 int stripe_index;
@@ -2947,7 +2924,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2947 int max_errors = 0; 2924 int max_errors = 0;
2948 struct btrfs_multi_bio *multi = NULL; 2925 struct btrfs_multi_bio *multi = NULL;
2949 2926
2950 if (multi_ret && !(rw & REQ_WRITE)) 2927 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2951 stripes_allocated = 1; 2928 stripes_allocated = 1;
2952again: 2929again:
2953 if (multi_ret) { 2930 if (multi_ret) {
@@ -2963,11 +2940,6 @@ again:
2963 em = lookup_extent_mapping(em_tree, logical, *length); 2940 em = lookup_extent_mapping(em_tree, logical, *length);
2964 read_unlock(&em_tree->lock); 2941 read_unlock(&em_tree->lock);
2965 2942
2966 if (!em && unplug_page) {
2967 kfree(multi);
2968 return 0;
2969 }
2970
2971 if (!em) { 2943 if (!em) {
2972 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2944 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2973 (unsigned long long)logical, 2945 (unsigned long long)logical,
@@ -2993,7 +2965,15 @@ again:
2993 max_errors = 1; 2965 max_errors = 1;
2994 } 2966 }
2995 } 2967 }
2996 if (multi_ret && (rw & REQ_WRITE) && 2968 if (rw & REQ_DISCARD) {
2969 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2970 BTRFS_BLOCK_GROUP_RAID1 |
2971 BTRFS_BLOCK_GROUP_DUP |
2972 BTRFS_BLOCK_GROUP_RAID10)) {
2973 stripes_required = map->num_stripes;
2974 }
2975 }
2976 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2997 stripes_allocated < stripes_required) { 2977 stripes_allocated < stripes_required) {
2998 stripes_allocated = map->num_stripes; 2978 stripes_allocated = map->num_stripes;
2999 free_extent_map(em); 2979 free_extent_map(em);
@@ -3013,23 +2993,37 @@ again:
3013 /* stripe_offset is the offset of this block in its stripe*/ 2993 /* stripe_offset is the offset of this block in its stripe*/
3014 stripe_offset = offset - stripe_offset; 2994 stripe_offset = offset - stripe_offset;
3015 2995
3016 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2996 if (rw & REQ_DISCARD)
3017 BTRFS_BLOCK_GROUP_RAID10 | 2997 *length = min_t(u64, em->len - offset, *length);
3018 BTRFS_BLOCK_GROUP_DUP)) { 2998 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2999 BTRFS_BLOCK_GROUP_RAID1 |
3000 BTRFS_BLOCK_GROUP_RAID10 |
3001 BTRFS_BLOCK_GROUP_DUP)) {
3019 /* we limit the length of each bio to what fits in a stripe */ 3002 /* we limit the length of each bio to what fits in a stripe */
3020 *length = min_t(u64, em->len - offset, 3003 *length = min_t(u64, em->len - offset,
3021 map->stripe_len - stripe_offset); 3004 map->stripe_len - stripe_offset);
3022 } else { 3005 } else {
3023 *length = em->len - offset; 3006 *length = em->len - offset;
3024 } 3007 }
3025 3008
3026 if (!multi_ret && !unplug_page) 3009 if (!multi_ret)
3027 goto out; 3010 goto out;
3028 3011
3029 num_stripes = 1; 3012 num_stripes = 1;
3030 stripe_index = 0; 3013 stripe_index = 0;
3031 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3014 stripe_nr_orig = stripe_nr;
3032 if (unplug_page || (rw & REQ_WRITE)) 3015 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3016 (~(map->stripe_len - 1));
3017 do_div(stripe_nr_end, map->stripe_len);
3018 stripe_end_offset = stripe_nr_end * map->stripe_len -
3019 (offset + *length);
3020 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3021 if (rw & REQ_DISCARD)
3022 num_stripes = min_t(u64, map->num_stripes,
3023 stripe_nr_end - stripe_nr_orig);
3024 stripe_index = do_div(stripe_nr, map->num_stripes);
3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3026 if (rw & (REQ_WRITE | REQ_DISCARD))
3033 num_stripes = map->num_stripes; 3027 num_stripes = map->num_stripes;
3034 else if (mirror_num) 3028 else if (mirror_num)
3035 stripe_index = mirror_num - 1; 3029 stripe_index = mirror_num - 1;
@@ -3040,7 +3034,7 @@ again:
3040 } 3034 }
3041 3035
3042 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3036 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3043 if (rw & REQ_WRITE) 3037 if (rw & (REQ_WRITE | REQ_DISCARD))
3044 num_stripes = map->num_stripes; 3038 num_stripes = map->num_stripes;
3045 else if (mirror_num) 3039 else if (mirror_num)
3046 stripe_index = mirror_num - 1; 3040 stripe_index = mirror_num - 1;
@@ -3051,8 +3045,12 @@ again:
3051 stripe_index = do_div(stripe_nr, factor); 3045 stripe_index = do_div(stripe_nr, factor);
3052 stripe_index *= map->sub_stripes; 3046 stripe_index *= map->sub_stripes;
3053 3047
3054 if (unplug_page || (rw & REQ_WRITE)) 3048 if (rw & REQ_WRITE)
3055 num_stripes = map->sub_stripes; 3049 num_stripes = map->sub_stripes;
3050 else if (rw & REQ_DISCARD)
3051 num_stripes = min_t(u64, map->sub_stripes *
3052 (stripe_nr_end - stripe_nr_orig),
3053 map->num_stripes);
3056 else if (mirror_num) 3054 else if (mirror_num)
3057 stripe_index += mirror_num - 1; 3055 stripe_index += mirror_num - 1;
3058 else { 3056 else {
@@ -3070,24 +3068,101 @@ again:
3070 } 3068 }
3071 BUG_ON(stripe_index >= map->num_stripes); 3069 BUG_ON(stripe_index >= map->num_stripes);
3072 3070
3073 for (i = 0; i < num_stripes; i++) { 3071 if (rw & REQ_DISCARD) {
3074 if (unplug_page) { 3072 for (i = 0; i < num_stripes; i++) {
3075 struct btrfs_device *device;
3076 struct backing_dev_info *bdi;
3077
3078 device = map->stripes[stripe_index].dev;
3079 if (device->bdev) {
3080 bdi = blk_get_backing_dev_info(device->bdev);
3081 if (bdi->unplug_io_fn)
3082 bdi->unplug_io_fn(bdi, unplug_page);
3083 }
3084 } else {
3085 multi->stripes[i].physical = 3073 multi->stripes[i].physical =
3086 map->stripes[stripe_index].physical + 3074 map->stripes[stripe_index].physical +
3087 stripe_offset + stripe_nr * map->stripe_len; 3075 stripe_offset + stripe_nr * map->stripe_len;
3088 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3076 multi->stripes[i].dev = map->stripes[stripe_index].dev;
3077
3078 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3079 u64 stripes;
3080 u32 last_stripe = 0;
3081 int j;
3082
3083 div_u64_rem(stripe_nr_end - 1,
3084 map->num_stripes,
3085 &last_stripe);
3086
3087 for (j = 0; j < map->num_stripes; j++) {
3088 u32 test;
3089
3090 div_u64_rem(stripe_nr_end - 1 - j,
3091 map->num_stripes, &test);
3092 if (test == stripe_index)
3093 break;
3094 }
3095 stripes = stripe_nr_end - 1 - j;
3096 do_div(stripes, map->num_stripes);
3097 multi->stripes[i].length = map->stripe_len *
3098 (stripes - stripe_nr + 1);
3099
3100 if (i == 0) {
3101 multi->stripes[i].length -=
3102 stripe_offset;
3103 stripe_offset = 0;
3104 }
3105 if (stripe_index == last_stripe)
3106 multi->stripes[i].length -=
3107 stripe_end_offset;
3108 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3109 u64 stripes;
3110 int j;
3111 int factor = map->num_stripes /
3112 map->sub_stripes;
3113 u32 last_stripe = 0;
3114
3115 div_u64_rem(stripe_nr_end - 1,
3116 factor, &last_stripe);
3117 last_stripe *= map->sub_stripes;
3118
3119 for (j = 0; j < factor; j++) {
3120 u32 test;
3121
3122 div_u64_rem(stripe_nr_end - 1 - j,
3123 factor, &test);
3124
3125 if (test ==
3126 stripe_index / map->sub_stripes)
3127 break;
3128 }
3129 stripes = stripe_nr_end - 1 - j;
3130 do_div(stripes, factor);
3131 multi->stripes[i].length = map->stripe_len *
3132 (stripes - stripe_nr + 1);
3133
3134 if (i < map->sub_stripes) {
3135 multi->stripes[i].length -=
3136 stripe_offset;
3137 if (i == map->sub_stripes - 1)
3138 stripe_offset = 0;
3139 }
3140 if (stripe_index >= last_stripe &&
3141 stripe_index <= (last_stripe +
3142 map->sub_stripes - 1)) {
3143 multi->stripes[i].length -=
3144 stripe_end_offset;
3145 }
3146 } else
3147 multi->stripes[i].length = *length;
3148
3149 stripe_index++;
3150 if (stripe_index == map->num_stripes) {
3151 /* This could only happen for RAID0/10 */
3152 stripe_index = 0;
3153 stripe_nr++;
3154 }
3155 }
3156 } else {
3157 for (i = 0; i < num_stripes; i++) {
3158 multi->stripes[i].physical =
3159 map->stripes[stripe_index].physical +
3160 stripe_offset +
3161 stripe_nr * map->stripe_len;
3162 multi->stripes[i].dev =
3163 map->stripes[stripe_index].dev;
3164 stripe_index++;
3089 } 3165 }
3090 stripe_index++;
3091 } 3166 }
3092 if (multi_ret) { 3167 if (multi_ret) {
3093 *multi_ret = multi; 3168 *multi_ret = multi;
@@ -3104,7 +3179,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3104 struct btrfs_multi_bio **multi_ret, int mirror_num) 3179 struct btrfs_multi_bio **multi_ret, int mirror_num)
3105{ 3180{
3106 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3181 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3107 mirror_num, NULL); 3182 mirror_num);
3108} 3183}
3109 3184
3110int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3185int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3172,14 +3247,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3172 return 0; 3247 return 0;
3173} 3248}
3174 3249
3175int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
3176 u64 logical, struct page *page)
3177{
3178 u64 length = PAGE_CACHE_SIZE;
3179 return __btrfs_map_block(map_tree, READ, logical, &length,
3180 NULL, 0, page);
3181}
3182
3183static void end_bio_multi_stripe(struct bio *bio, int err) 3250static void end_bio_multi_stripe(struct bio *bio, int err)
3184{ 3251{
3185 struct btrfs_multi_bio *multi = bio->bi_private; 3252 struct btrfs_multi_bio *multi = bio->bi_private;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7fb59d45fe8c..cc2eadaf7a27 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -126,6 +126,7 @@ struct btrfs_fs_devices {
126struct btrfs_bio_stripe { 126struct btrfs_bio_stripe {
127 struct btrfs_device *dev; 127 struct btrfs_device *dev;
128 u64 physical; 128 u64 physical;
129 u64 length; /* only used for discard mappings */
129}; 130};
130 131
131struct btrfs_multi_bio { 132struct btrfs_multi_bio {
@@ -145,6 +146,17 @@ struct btrfs_device_info {
145 u64 max_avail; 146 u64 max_avail;
146}; 147};
147 148
149struct map_lookup {
150 u64 type;
151 int io_align;
152 int io_width;
153 int stripe_len;
154 int sector_size;
155 int num_stripes;
156 int sub_stripes;
157 struct btrfs_bio_stripe stripes[];
158};
159
148/* Used to sort the devices by max_avail(descending sort) */ 160/* Used to sort the devices by max_avail(descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2); 161int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150 162
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index a5776531dc2b..cfd660550ded 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -180,11 +180,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
180 struct btrfs_path *path; 180 struct btrfs_path *path;
181 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
182 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
183 int ret = 0, slot, advance; 183 int ret = 0, slot;
184 size_t total_size = 0, size_left = size; 184 size_t total_size = 0, size_left = size;
185 unsigned long name_ptr; 185 unsigned long name_ptr;
186 size_t name_len; 186 size_t name_len;
187 u32 nritems;
188 187
189 /* 188 /*
190 * ok we want all objects associated with this id. 189 * ok we want all objects associated with this id.
@@ -204,34 +203,24 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
204 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 203 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
205 if (ret < 0) 204 if (ret < 0)
206 goto err; 205 goto err;
207 advance = 0; 206
208 while (1) { 207 while (1) {
209 leaf = path->nodes[0]; 208 leaf = path->nodes[0];
210 nritems = btrfs_header_nritems(leaf);
211 slot = path->slots[0]; 209 slot = path->slots[0];
212 210
213 /* this is where we start walking through the path */ 211 /* this is where we start walking through the path */
214 if (advance || slot >= nritems) { 212 if (slot >= btrfs_header_nritems(leaf)) {
215 /* 213 /*
216 * if we've reached the last slot in this leaf we need 214 * if we've reached the last slot in this leaf we need
217 * to go to the next leaf and reset everything 215 * to go to the next leaf and reset everything
218 */ 216 */
219 if (slot >= nritems-1) { 217 ret = btrfs_next_leaf(root, path);
220 ret = btrfs_next_leaf(root, path); 218 if (ret < 0)
221 if (ret) 219 goto err;
222 break; 220 else if (ret > 0)
223 leaf = path->nodes[0]; 221 break;
224 nritems = btrfs_header_nritems(leaf); 222 continue;
225 slot = path->slots[0];
226 } else {
227 /*
228 * just walking through the slots on this leaf
229 */
230 slot++;
231 path->slots[0]++;
232 }
233 } 223 }
234 advance = 1;
235 224
236 btrfs_item_key_to_cpu(leaf, &found_key, slot); 225 btrfs_item_key_to_cpu(leaf, &found_key, slot);
237 226
@@ -242,13 +231,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
242 break; 231 break;
243 232
244 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 233 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
234 if (verify_dir_item(root, leaf, di))
235 continue;
245 236
246 name_len = btrfs_dir_name_len(leaf, di); 237 name_len = btrfs_dir_name_len(leaf, di);
247 total_size += name_len + 1; 238 total_size += name_len + 1;
248 239
249 /* we are just looking for how big our buffer needs to be */ 240 /* we are just looking for how big our buffer needs to be */
250 if (!size) 241 if (!size)
251 continue; 242 goto next;
252 243
253 if (!buffer || (name_len + 1) > size_left) { 244 if (!buffer || (name_len + 1) > size_left) {
254 ret = -ERANGE; 245 ret = -ERANGE;
@@ -261,6 +252,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
261 252
262 size_left -= name_len + 1; 253 size_left -= name_len + 1;
263 buffer += name_len + 1; 254 buffer += name_len + 1;
255next:
256 path->slots[0]++;
264 } 257 }
265 ret = total_size; 258 ret = total_size;
266 259
@@ -370,7 +363,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
370} 363}
371 364
372int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 365int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
373 struct inode *inode, struct inode *dir) 366 struct inode *inode, struct inode *dir,
367 const struct qstr *qstr)
374{ 368{
375 int err; 369 int err;
376 size_t len; 370 size_t len;
@@ -378,7 +372,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
378 char *suffix; 372 char *suffix;
379 char *name; 373 char *name;
380 374
381 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 375 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
376 &len);
382 if (err) { 377 if (err) {
383 if (err == -EOPNOTSUPP) 378 if (err == -EOPNOTSUPP)
384 return 0; 379 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index f5ec2d44150d..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -57,7 +57,8 @@ static struct list_head *zlib_alloc_workspace(void)
57 if (!workspace) 57 if (!workspace)
58 return ERR_PTR(-ENOMEM); 58 return ERR_PTR(-ENOMEM);
59 59
60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
61 MAX_WBITS, MAX_MEM_LEVEL));
61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
63 if (!workspace->def_strm.workspace || 64 if (!workspace->def_strm.workspace ||
diff --git a/fs/buffer.c b/fs/buffer.c
index 2219a76e2caf..a08bb8e61c6f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -54,23 +54,15 @@ init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
54} 54}
55EXPORT_SYMBOL(init_buffer); 55EXPORT_SYMBOL(init_buffer);
56 56
57static int sync_buffer(void *word) 57static int sleep_on_buffer(void *word)
58{ 58{
59 struct block_device *bd;
60 struct buffer_head *bh
61 = container_of(word, struct buffer_head, b_state);
62
63 smp_mb();
64 bd = bh->b_bdev;
65 if (bd)
66 blk_run_address_space(bd->bd_inode->i_mapping);
67 io_schedule(); 59 io_schedule();
68 return 0; 60 return 0;
69} 61}
70 62
71void __lock_buffer(struct buffer_head *bh) 63void __lock_buffer(struct buffer_head *bh)
72{ 64{
73 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer, 65 wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
74 TASK_UNINTERRUPTIBLE); 66 TASK_UNINTERRUPTIBLE);
75} 67}
76EXPORT_SYMBOL(__lock_buffer); 68EXPORT_SYMBOL(__lock_buffer);
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(unlock_buffer);
90 */ 82 */
91void __wait_on_buffer(struct buffer_head * bh) 83void __wait_on_buffer(struct buffer_head * bh)
92{ 84{
93 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE); 85 wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
94} 86}
95EXPORT_SYMBOL(__wait_on_buffer); 87EXPORT_SYMBOL(__wait_on_buffer);
96 88
@@ -749,10 +741,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
749{ 741{
750 struct buffer_head *bh; 742 struct buffer_head *bh;
751 struct list_head tmp; 743 struct list_head tmp;
752 struct address_space *mapping, *prev_mapping = NULL; 744 struct address_space *mapping;
753 int err = 0, err2; 745 int err = 0, err2;
746 struct blk_plug plug;
754 747
755 INIT_LIST_HEAD(&tmp); 748 INIT_LIST_HEAD(&tmp);
749 blk_start_plug(&plug);
756 750
757 spin_lock(lock); 751 spin_lock(lock);
758 while (!list_empty(list)) { 752 while (!list_empty(list)) {
@@ -775,7 +769,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
775 * still in flight on potentially older 769 * still in flight on potentially older
776 * contents. 770 * contents.
777 */ 771 */
778 write_dirty_buffer(bh, WRITE_SYNC_PLUG); 772 write_dirty_buffer(bh, WRITE_SYNC);
779 773
780 /* 774 /*
781 * Kick off IO for the previous mapping. Note 775 * Kick off IO for the previous mapping. Note
@@ -783,16 +777,16 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
783 * wait_on_buffer() will do that for us 777 * wait_on_buffer() will do that for us
784 * through sync_buffer(). 778 * through sync_buffer().
785 */ 779 */
786 if (prev_mapping && prev_mapping != mapping)
787 blk_run_address_space(prev_mapping);
788 prev_mapping = mapping;
789
790 brelse(bh); 780 brelse(bh);
791 spin_lock(lock); 781 spin_lock(lock);
792 } 782 }
793 } 783 }
794 } 784 }
795 785
786 spin_unlock(lock);
787 blk_finish_plug(&plug);
788 spin_lock(lock);
789
796 while (!list_empty(&tmp)) { 790 while (!list_empty(&tmp)) {
797 bh = BH_ENTRY(tmp.prev); 791 bh = BH_ENTRY(tmp.prev);
798 get_bh(bh); 792 get_bh(bh);
@@ -1144,7 +1138,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
1144 * inode list. 1138 * inode list.
1145 * 1139 *
1146 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, 1140 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1147 * mapping->tree_lock and the global inode_lock. 1141 * mapping->tree_lock and mapping->host->i_lock.
1148 */ 1142 */
1149void mark_buffer_dirty(struct buffer_head *bh) 1143void mark_buffer_dirty(struct buffer_head *bh)
1150{ 1144{
@@ -1614,14 +1608,8 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
1614 * prevents this contention from occurring. 1608 * prevents this contention from occurring.
1615 * 1609 *
1616 * If block_write_full_page() is called with wbc->sync_mode == 1610 * If block_write_full_page() is called with wbc->sync_mode ==
1617 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this 1611 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1618 * causes the writes to be flagged as synchronous writes, but the 1612 * causes the writes to be flagged as synchronous writes.
1619 * block device queue will NOT be unplugged, since usually many pages
1620 * will be pushed to the out before the higher-level caller actually
1621 * waits for the writes to be completed. The various wait functions,
1622 * such as wait_on_writeback_range() will ultimately call sync_page()
1623 * which will ultimately call blk_run_backing_dev(), which will end up
1624 * unplugging the device queue.
1625 */ 1613 */
1626static int __block_write_full_page(struct inode *inode, struct page *page, 1614static int __block_write_full_page(struct inode *inode, struct page *page,
1627 get_block_t *get_block, struct writeback_control *wbc, 1615 get_block_t *get_block, struct writeback_control *wbc,
@@ -1634,7 +1622,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
1634 const unsigned blocksize = 1 << inode->i_blkbits; 1622 const unsigned blocksize = 1 << inode->i_blkbits;
1635 int nr_underway = 0; 1623 int nr_underway = 0;
1636 int write_op = (wbc->sync_mode == WB_SYNC_ALL ? 1624 int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1637 WRITE_SYNC_PLUG : WRITE); 1625 WRITE_SYNC : WRITE);
1638 1626
1639 BUG_ON(!PageLocked(page)); 1627 BUG_ON(!PageLocked(page));
1640 1628
@@ -3138,17 +3126,6 @@ out:
3138} 3126}
3139EXPORT_SYMBOL(try_to_free_buffers); 3127EXPORT_SYMBOL(try_to_free_buffers);
3140 3128
3141void block_sync_page(struct page *page)
3142{
3143 struct address_space *mapping;
3144
3145 smp_mb();
3146 mapping = page_mapping(page);
3147 if (mapping)
3148 blk_run_backing_dev(mapping->backing_dev_info, page);
3149}
3150EXPORT_SYMBOL(block_sync_page);
3151
3152/* 3129/*
3153 * There are no bdflush tunables left. But distributions are 3130 * There are no bdflush tunables left. But distributions are
3154 * still running obsolete flush daemons, so we terminate them here. 3131 * still running obsolete flush daemons, so we terminate them here.
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 37fe101a4e0d..1064805e653b 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -197,7 +197,7 @@ struct fscache_object *cachefiles_grab_object(struct fscache_object *_object)
197} 197}
198 198
199/* 199/*
200 * update the auxilliary data for an object object on disk 200 * update the auxiliary data for an object object on disk
201 */ 201 */
202static void cachefiles_update_object(struct fscache_object *_object) 202static void cachefiles_update_object(struct fscache_object *_object)
203{ 203{
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 42c7fafc8bfe..a0358c2189cb 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -275,6 +275,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
275 bool preemptive) 275 bool preemptive)
276{ 276{
277 struct dentry *grave, *trap; 277 struct dentry *grave, *trap;
278 struct path path, path_to_graveyard;
278 char nbuffer[8 + 8 + 1]; 279 char nbuffer[8 + 8 + 1];
279 int ret; 280 int ret;
280 281
@@ -287,10 +288,18 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
287 /* non-directories can just be unlinked */ 288 /* non-directories can just be unlinked */
288 if (!S_ISDIR(rep->d_inode->i_mode)) { 289 if (!S_ISDIR(rep->d_inode->i_mode)) {
289 _debug("unlink stale object"); 290 _debug("unlink stale object");
290 ret = vfs_unlink(dir->d_inode, rep);
291 291
292 if (preemptive) 292 path.mnt = cache->mnt;
293 cachefiles_mark_object_buried(cache, rep); 293 path.dentry = dir;
294 ret = security_path_unlink(&path, rep);
295 if (ret < 0) {
296 cachefiles_io_error(cache, "Unlink security error");
297 } else {
298 ret = vfs_unlink(dir->d_inode, rep);
299
300 if (preemptive)
301 cachefiles_mark_object_buried(cache, rep);
302 }
294 303
295 mutex_unlock(&dir->d_inode->i_mutex); 304 mutex_unlock(&dir->d_inode->i_mutex);
296 305
@@ -379,12 +388,23 @@ try_again:
379 } 388 }
380 389
381 /* attempt the rename */ 390 /* attempt the rename */
382 ret = vfs_rename(dir->d_inode, rep, cache->graveyard->d_inode, grave); 391 path.mnt = cache->mnt;
383 if (ret != 0 && ret != -ENOMEM) 392 path.dentry = dir;
384 cachefiles_io_error(cache, "Rename failed with error %d", ret); 393 path_to_graveyard.mnt = cache->mnt;
394 path_to_graveyard.dentry = cache->graveyard;
395 ret = security_path_rename(&path, rep, &path_to_graveyard, grave);
396 if (ret < 0) {
397 cachefiles_io_error(cache, "Rename security error %d", ret);
398 } else {
399 ret = vfs_rename(dir->d_inode, rep,
400 cache->graveyard->d_inode, grave);
401 if (ret != 0 && ret != -ENOMEM)
402 cachefiles_io_error(cache,
403 "Rename failed with error %d", ret);
385 404
386 if (preemptive) 405 if (preemptive)
387 cachefiles_mark_object_buried(cache, rep); 406 cachefiles_mark_object_buried(cache, rep);
407 }
388 408
389 unlock_rename(cache->graveyard, dir); 409 unlock_rename(cache->graveyard, dir);
390 dput(grave); 410 dput(grave);
@@ -448,6 +468,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
448{ 468{
449 struct cachefiles_cache *cache; 469 struct cachefiles_cache *cache;
450 struct dentry *dir, *next = NULL; 470 struct dentry *dir, *next = NULL;
471 struct path path;
451 unsigned long start; 472 unsigned long start;
452 const char *name; 473 const char *name;
453 int ret, nlen; 474 int ret, nlen;
@@ -458,6 +479,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
458 479
459 cache = container_of(parent->fscache.cache, 480 cache = container_of(parent->fscache.cache,
460 struct cachefiles_cache, cache); 481 struct cachefiles_cache, cache);
482 path.mnt = cache->mnt;
461 483
462 ASSERT(parent->dentry); 484 ASSERT(parent->dentry);
463 ASSERT(parent->dentry->d_inode); 485 ASSERT(parent->dentry->d_inode);
@@ -511,6 +533,10 @@ lookup_again:
511 if (ret < 0) 533 if (ret < 0)
512 goto create_error; 534 goto create_error;
513 535
536 path.dentry = dir;
537 ret = security_path_mkdir(&path, next, 0);
538 if (ret < 0)
539 goto create_error;
514 start = jiffies; 540 start = jiffies;
515 ret = vfs_mkdir(dir->d_inode, next, 0); 541 ret = vfs_mkdir(dir->d_inode, next, 0);
516 cachefiles_hist(cachefiles_mkdir_histogram, start); 542 cachefiles_hist(cachefiles_mkdir_histogram, start);
@@ -536,6 +562,10 @@ lookup_again:
536 if (ret < 0) 562 if (ret < 0)
537 goto create_error; 563 goto create_error;
538 564
565 path.dentry = dir;
566 ret = security_path_mknod(&path, next, S_IFREG, 0);
567 if (ret < 0)
568 goto create_error;
539 start = jiffies; 569 start = jiffies;
540 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL); 570 ret = vfs_create(dir->d_inode, next, S_IFREG, NULL);
541 cachefiles_hist(cachefiles_create_histogram, start); 571 cachefiles_hist(cachefiles_create_histogram, start);
@@ -692,6 +722,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
692{ 722{
693 struct dentry *subdir; 723 struct dentry *subdir;
694 unsigned long start; 724 unsigned long start;
725 struct path path;
695 int ret; 726 int ret;
696 727
697 _enter(",,%s", dirname); 728 _enter(",,%s", dirname);
@@ -719,6 +750,11 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
719 750
720 _debug("attempt mkdir"); 751 _debug("attempt mkdir");
721 752
753 path.mnt = cache->mnt;
754 path.dentry = dir;
755 ret = security_path_mkdir(&path, subdir, 0700);
756 if (ret < 0)
757 goto mkdir_error;
722 ret = vfs_mkdir(dir->d_inode, subdir, 0700); 758 ret = vfs_mkdir(dir->d_inode, subdir, 0700);
723 if (ret < 0) 759 if (ret < 0)
724 goto mkdir_error; 760 goto mkdir_error;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 561438b6a50c..e159c529fd2b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -24,7 +24,7 @@
24 * context needs to be associated with the osd write during writeback. 24 * context needs to be associated with the osd write during writeback.
25 * 25 *
26 * Similarly, struct ceph_inode_info maintains a set of counters to 26 * Similarly, struct ceph_inode_info maintains a set of counters to
27 * count dirty pages on the inode. In the absense of snapshots, 27 * count dirty pages on the inode. In the absence of snapshots,
28 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. 28 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
29 * 29 *
30 * When a snapshot is taken (that is, when the client receives 30 * When a snapshot is taken (that is, when the client receives
@@ -92,7 +92,7 @@ static int ceph_set_page_dirty(struct page *page)
92 ci->i_head_snapc = ceph_get_snap_context(snapc); 92 ci->i_head_snapc = ceph_get_snap_context(snapc);
93 ++ci->i_wrbuffer_ref_head; 93 ++ci->i_wrbuffer_ref_head;
94 if (ci->i_wrbuffer_ref == 0) 94 if (ci->i_wrbuffer_ref == 0)
95 igrab(inode); 95 ihold(inode);
96 ++ci->i_wrbuffer_ref; 96 ++ci->i_wrbuffer_ref;
97 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " 97 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
98 "snapc %p seq %lld (%d snaps)\n", 98 "snapc %p seq %lld (%d snaps)\n",
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6b61ded701e1..5323c330bbf3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -765,7 +765,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
765 if (touch) { 765 if (touch) {
766 struct rb_node *q; 766 struct rb_node *q;
767 767
768 /* touch this + preceeding caps */ 768 /* touch this + preceding caps */
769 __touch_cap(cap); 769 __touch_cap(cap);
770 for (q = rb_first(&ci->i_caps); q != p; 770 for (q = rb_first(&ci->i_caps); q != p;
771 q = rb_next(q)) { 771 q = rb_next(q)) {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 08f65faac112..0dba6915712b 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -210,8 +210,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
210 if (!fsc->debugfs_congestion_kb) 210 if (!fsc->debugfs_congestion_kb)
211 goto out; 211 goto out;
212 212
213 dout("a\n");
214
215 snprintf(name, sizeof(name), "../../bdi/%s", 213 snprintf(name, sizeof(name), "../../bdi/%s",
216 dev_name(fsc->backing_dev_info.dev)); 214 dev_name(fsc->backing_dev_info.dev));
217 fsc->debugfs_bdi = 215 fsc->debugfs_bdi =
@@ -221,7 +219,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
221 if (!fsc->debugfs_bdi) 219 if (!fsc->debugfs_bdi)
222 goto out; 220 goto out;
223 221
224 dout("b\n");
225 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap", 222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
226 0600, 223 0600,
227 fsc->client->debugfs_dir, 224 fsc->client->debugfs_dir,
@@ -230,7 +227,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
230 if (!fsc->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
231 goto out; 228 goto out;
232 229
233 dout("ca\n");
234 fsc->debugfs_mdsc = debugfs_create_file("mdsc", 230 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
235 0600, 231 0600,
236 fsc->client->debugfs_dir, 232 fsc->client->debugfs_dir,
@@ -239,7 +235,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
239 if (!fsc->debugfs_mdsc) 235 if (!fsc->debugfs_mdsc)
240 goto out; 236 goto out;
241 237
242 dout("da\n");
243 fsc->debugfs_caps = debugfs_create_file("caps", 238 fsc->debugfs_caps = debugfs_create_file("caps",
244 0400, 239 0400,
245 fsc->client->debugfs_dir, 240 fsc->client->debugfs_dir,
@@ -248,7 +243,6 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
248 if (!fsc->debugfs_caps) 243 if (!fsc->debugfs_caps)
249 goto out; 244 goto out;
250 245
251 dout("ea\n");
252 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 246 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
253 0600, 247 0600,
254 fsc->client->debugfs_dir, 248 fsc->client->debugfs_dir,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..1a867a3601ae 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -161,7 +161,7 @@ more:
161 filp->f_pos = di->offset; 161 filp->f_pos = di->offset;
162 err = filldir(dirent, dentry->d_name.name, 162 err = filldir(dirent, dentry->d_name.name,
163 dentry->d_name.len, di->offset, 163 dentry->d_name.len, di->offset,
164 dentry->d_inode->i_ino, 164 ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
165 dentry->d_inode->i_mode >> 12); 165 dentry->d_inode->i_mode >> 12);
166 166
167 if (last) { 167 if (last) {
@@ -245,15 +245,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
245 245
246 dout("readdir off 0 -> '.'\n"); 246 dout("readdir off 0 -> '.'\n");
247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0), 247 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
248 inode->i_ino, inode->i_mode >> 12) < 0) 248 ceph_translate_ino(inode->i_sb, inode->i_ino),
249 inode->i_mode >> 12) < 0)
249 return 0; 250 return 0;
250 filp->f_pos = 1; 251 filp->f_pos = 1;
251 off = 1; 252 off = 1;
252 } 253 }
253 if (filp->f_pos == 1) { 254 if (filp->f_pos == 1) {
255 ino_t ino = filp->f_dentry->d_parent->d_inode->i_ino;
254 dout("readdir off 1 -> '..'\n"); 256 dout("readdir off 1 -> '..'\n");
255 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1), 257 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
256 filp->f_dentry->d_parent->d_inode->i_ino, 258 ceph_translate_ino(inode->i_sb, ino),
257 inode->i_mode >> 12) < 0) 259 inode->i_mode >> 12) < 0)
258 return 0; 260 return 0;
259 filp->f_pos = 2; 261 filp->f_pos = 2;
@@ -377,7 +379,8 @@ more:
377 if (filldir(dirent, 379 if (filldir(dirent,
378 rinfo->dir_dname[off - fi->offset], 380 rinfo->dir_dname[off - fi->offset],
379 rinfo->dir_dname_len[off - fi->offset], 381 rinfo->dir_dname_len[off - fi->offset],
380 pos, ino, ftype) < 0) { 382 pos,
383 ceph_translate_ino(inode->i_sb, ino), ftype) < 0) {
381 dout("filldir stopping us...\n"); 384 dout("filldir stopping us...\n");
382 return 0; 385 return 0;
383 } 386 }
@@ -409,7 +412,7 @@ more:
409 spin_lock(&inode->i_lock); 412 spin_lock(&inode->i_lock);
410 if (ci->i_release_count == fi->dir_release_count) { 413 if (ci->i_release_count == fi->dir_release_count) {
411 dout(" marking %p complete\n", inode); 414 dout(" marking %p complete\n", inode);
412 ci->i_ceph_flags |= CEPH_I_COMPLETE; 415 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
413 ci->i_max_offset = filp->f_pos; 416 ci->i_max_offset = filp->f_pos;
414 } 417 }
415 spin_unlock(&inode->i_lock); 418 spin_unlock(&inode->i_lock);
@@ -496,6 +499,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
496 499
497 /* .snap dir? */ 500 /* .snap dir? */
498 if (err == -ENOENT && 501 if (err == -ENOENT &&
502 ceph_snap(parent) == CEPH_NOSNAP &&
499 strcmp(dentry->d_name.name, 503 strcmp(dentry->d_name.name,
500 fsc->mount_options->snapdir_name) == 0) { 504 fsc->mount_options->snapdir_name) == 0) {
501 struct inode *inode = ceph_get_snapdir(parent); 505 struct inode *inode = ceph_get_snapdir(parent);
@@ -992,7 +996,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
992{ 996{
993 struct inode *dir; 997 struct inode *dir;
994 998
995 if (nd->flags & LOOKUP_RCU) 999 if (nd && nd->flags & LOOKUP_RCU)
996 return -ECHILD; 1000 return -ECHILD;
997 1001
998 dir = dentry->d_parent->d_inode; 1002 dir = dentry->d_parent->d_inode;
@@ -1023,34 +1027,13 @@ out_touch:
1023} 1027}
1024 1028
1025/* 1029/*
1026 * When a dentry is released, clear the dir I_COMPLETE if it was part 1030 * Release our ceph_dentry_info.
1027 * of the current dir gen or if this is in the snapshot namespace.
1028 */ 1031 */
1029static void ceph_dentry_release(struct dentry *dentry) 1032static void ceph_d_release(struct dentry *dentry)
1030{ 1033{
1031 struct ceph_dentry_info *di = ceph_dentry(dentry); 1034 struct ceph_dentry_info *di = ceph_dentry(dentry);
1032 struct inode *parent_inode = NULL;
1033 u64 snapid = CEPH_NOSNAP;
1034 1035
1035 if (!IS_ROOT(dentry)) { 1036 dout("d_release %p\n", dentry);
1036 parent_inode = dentry->d_parent->d_inode;
1037 if (parent_inode)
1038 snapid = ceph_snap(parent_inode);
1039 }
1040 dout("dentry_release %p parent %p\n", dentry, parent_inode);
1041 if (parent_inode && snapid != CEPH_SNAPDIR) {
1042 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1043
1044 spin_lock(&parent_inode->i_lock);
1045 if (ci->i_shared_gen == di->lease_shared_gen ||
1046 snapid <= CEPH_MAXSNAP) {
1047 dout(" clearing %p complete (d_release)\n",
1048 parent_inode);
1049 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1050 ci->i_release_count++;
1051 }
1052 spin_unlock(&parent_inode->i_lock);
1053 }
1054 if (di) { 1037 if (di) {
1055 ceph_dentry_lru_del(dentry); 1038 ceph_dentry_lru_del(dentry);
1056 if (di->lease_session) 1039 if (di->lease_session)
@@ -1275,14 +1258,14 @@ const struct inode_operations ceph_dir_iops = {
1275 1258
1276const struct dentry_operations ceph_dentry_ops = { 1259const struct dentry_operations ceph_dentry_ops = {
1277 .d_revalidate = ceph_d_revalidate, 1260 .d_revalidate = ceph_d_revalidate,
1278 .d_release = ceph_dentry_release, 1261 .d_release = ceph_d_release,
1279}; 1262};
1280 1263
1281const struct dentry_operations ceph_snapdir_dentry_ops = { 1264const struct dentry_operations ceph_snapdir_dentry_ops = {
1282 .d_revalidate = ceph_snapdir_d_revalidate, 1265 .d_revalidate = ceph_snapdir_d_revalidate,
1283 .d_release = ceph_dentry_release, 1266 .d_release = ceph_d_release,
1284}; 1267};
1285 1268
1286const struct dentry_operations ceph_snap_dentry_ops = { 1269const struct dentry_operations ceph_snap_dentry_ops = {
1287 .d_release = ceph_dentry_release, 1270 .d_release = ceph_d_release,
1288}; 1271};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d0e4a82d898..159b512d5a27 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -564,11 +564,19 @@ more:
564 * start_request so that a tid has been assigned. 564 * start_request so that a tid has been assigned.
565 */ 565 */
566 spin_lock(&ci->i_unsafe_lock); 566 spin_lock(&ci->i_unsafe_lock);
567 list_add(&req->r_unsafe_item, &ci->i_unsafe_writes); 567 list_add_tail(&req->r_unsafe_item,
568 &ci->i_unsafe_writes);
568 spin_unlock(&ci->i_unsafe_lock); 569 spin_unlock(&ci->i_unsafe_lock);
569 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 570 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
570 } 571 }
572
571 ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 573 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
574 if (ret < 0 && req->r_safe_callback) {
575 spin_lock(&ci->i_unsafe_lock);
576 list_del_init(&req->r_unsafe_item);
577 spin_unlock(&ci->i_unsafe_lock);
578 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
579 }
572 } 580 }
573 581
574 if (file->f_flags & O_DIRECT) 582 if (file->f_flags & O_DIRECT)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa479..b54c97da1c43 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -36,6 +36,13 @@ static void ceph_vmtruncate_work(struct work_struct *work);
36/* 36/*
37 * find or create an inode, given the ceph ino number 37 * find or create an inode, given the ceph ino number
38 */ 38 */
39static int ceph_set_ino_cb(struct inode *inode, void *data)
40{
41 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
42 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
43 return 0;
44}
45
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) 46struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{ 47{
41 struct inode *inode; 48 struct inode *inode;
@@ -707,7 +714,7 @@ static int fill_inode(struct inode *inode,
707 (issued & CEPH_CAP_FILE_EXCL) == 0 && 714 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
708 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { 715 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
709 dout(" marking %p complete (empty)\n", inode); 716 dout(" marking %p complete (empty)\n", inode);
710 ci->i_ceph_flags |= CEPH_I_COMPLETE; 717 /* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
711 ci->i_max_offset = 2; 718 ci->i_max_offset = 2;
712 } 719 }
713 break; 720 break;
@@ -1030,9 +1037,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1030 dout("fill_trace doing d_move %p -> %p\n", 1037 dout("fill_trace doing d_move %p -> %p\n",
1031 req->r_old_dentry, dn); 1038 req->r_old_dentry, dn);
1032 1039
1033 /* d_move screws up d_subdirs order */
1034 ceph_i_clear(dir, CEPH_I_COMPLETE);
1035
1036 d_move(req->r_old_dentry, dn); 1040 d_move(req->r_old_dentry, dn);
1037 dout(" src %p '%.*s' dst %p '%.*s'\n", 1041 dout(" src %p '%.*s' dst %p '%.*s'\n",
1038 req->r_old_dentry, 1042 req->r_old_dentry,
@@ -1044,12 +1048,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1044 rehashing bug in vfs_rename_dir */ 1048 rehashing bug in vfs_rename_dir */
1045 ceph_invalidate_dentry_lease(dn); 1049 ceph_invalidate_dentry_lease(dn);
1046 1050
1047 /* take overwritten dentry's readdir offset */ 1051 /*
1048 dout("dn %p gets %p offset %lld (old offset %lld)\n", 1052 * d_move() puts the renamed dentry at the end of
1049 req->r_old_dentry, dn, ceph_dentry(dn)->offset, 1053 * d_subdirs. We need to assign it an appropriate
1054 * directory offset so we can behave when holding
1055 * I_COMPLETE.
1056 */
1057 ceph_set_dentry_offset(req->r_old_dentry);
1058 dout("dn %p gets new offset %lld\n", req->r_old_dentry,
1050 ceph_dentry(req->r_old_dentry)->offset); 1059 ceph_dentry(req->r_old_dentry)->offset);
1051 ceph_dentry(req->r_old_dentry)->offset =
1052 ceph_dentry(dn)->offset;
1053 1060
1054 dn = req->r_old_dentry; /* use old_dentry */ 1061 dn = req->r_old_dentry; /* use old_dentry */
1055 in = dn->d_inode; 1062 in = dn->d_inode;
@@ -1809,7 +1816,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1809 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); 1816 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1810 if (!err) { 1817 if (!err) {
1811 generic_fillattr(inode, stat); 1818 generic_fillattr(inode, stat);
1812 stat->ino = inode->i_ino; 1819 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
1813 if (ceph_snap(inode) != CEPH_NOSNAP) 1820 if (ceph_snap(inode) != CEPH_NOSNAP)
1814 stat->dev = ceph_snap(inode); 1821 stat->dev = ceph_snap(inode);
1815 else 1822 else
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a1ee8fa3a8e7..f60b07b0feb0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3215,9 +3215,15 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3215{ 3215{
3216 struct ceph_mds_client *mdsc = fsc->mdsc; 3216 struct ceph_mds_client *mdsc = fsc->mdsc;
3217 3217
3218 dout("mdsc_destroy %p\n", mdsc);
3218 ceph_mdsc_stop(mdsc); 3219 ceph_mdsc_stop(mdsc);
3220
3221 /* flush out any connection work with references to us */
3222 ceph_msgr_flush();
3223
3219 fsc->mdsc = NULL; 3224 fsc->mdsc = NULL;
3220 kfree(mdsc); 3225 kfree(mdsc);
3226 dout("mdsc_destroy %p done\n", mdsc);
3221} 3227}
3222 3228
3223 3229
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd062..e86ec1155f8f 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -342,7 +342,7 @@ static int build_snap_context(struct ceph_snap_realm *realm)
342 num = 0; 342 num = 0;
343 snapc->seq = realm->seq; 343 snapc->seq = realm->seq;
344 if (parent) { 344 if (parent) {
345 /* include any of parent's snaps occuring _after_ my 345 /* include any of parent's snaps occurring _after_ my
346 parent became my parent */ 346 parent became my parent */
347 for (i = 0; i < parent->cached_context->num_snaps; i++) 347 for (i = 0; i < parent->cached_context->num_snaps; i++)
348 if (parent->cached_context->snaps[i] >= 348 if (parent->cached_context->snaps[i] >=
@@ -463,8 +463,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
463 463
464 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, 464 dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
465 capsnap, snapc); 465 capsnap, snapc);
466 igrab(inode); 466 ihold(inode);
467 467
468 atomic_set(&capsnap->nref, 1); 468 atomic_set(&capsnap->nref, 1);
469 capsnap->ci = ci; 469 capsnap->ci = ci;
470 INIT_LIST_HEAD(&capsnap->ci_item); 470 INIT_LIST_HEAD(&capsnap->ci_item);
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
584 if (lastinode) 584 if (lastinode)
585 iput(lastinode); 585 iput(lastinode);
586 586
587 dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); 587 list_for_each_entry(child, &realm->children, child_item) {
588 list_for_each_entry(child, &realm->children, child_item) 588 dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
589 queue_realm_cap_snaps(child); 589 realm, realm->ino, child, child->ino);
590 list_del_init(&child->dirty_item);
591 list_add(&child->dirty_item, &realm->dirty_item);
592 }
590 593
594 list_del_init(&realm->dirty_item);
591 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 595 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
592} 596}
593 597
@@ -683,7 +687,9 @@ more:
683 * queue cap snaps _after_ we've built the new snap contexts, 687 * queue cap snaps _after_ we've built the new snap contexts,
684 * so that i_head_snapc can be set appropriately. 688 * so that i_head_snapc can be set appropriately.
685 */ 689 */
686 list_for_each_entry(realm, &dirty_realms, dirty_item) { 690 while (!list_empty(&dirty_realms)) {
691 realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
692 dirty_item);
687 queue_realm_cap_snaps(realm); 693 queue_realm_cap_snaps(realm);
688 } 694 }
689 695
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9c5085465a63..f2f77fd3c14c 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -131,6 +131,7 @@ enum {
131 Opt_rbytes, 131 Opt_rbytes,
132 Opt_norbytes, 132 Opt_norbytes,
133 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
134 Opt_ino32,
134}; 135};
135 136
136static match_table_t fsopt_tokens = { 137static match_table_t fsopt_tokens = {
@@ -150,6 +151,7 @@ static match_table_t fsopt_tokens = {
150 {Opt_rbytes, "rbytes"}, 151 {Opt_rbytes, "rbytes"},
151 {Opt_norbytes, "norbytes"}, 152 {Opt_norbytes, "norbytes"},
152 {Opt_noasyncreaddir, "noasyncreaddir"}, 153 {Opt_noasyncreaddir, "noasyncreaddir"},
154 {Opt_ino32, "ino32"},
153 {-1, NULL} 155 {-1, NULL}
154}; 156};
155 157
@@ -225,6 +227,9 @@ static int parse_fsopt_token(char *c, void *private)
225 case Opt_noasyncreaddir: 227 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR; 228 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break; 229 break;
230 case Opt_ino32:
231 fsopt->flags |= CEPH_MOUNT_OPT_INO32;
232 break;
228 default: 233 default:
229 BUG_ON(token); 234 BUG_ON(token);
230 } 235 }
@@ -288,7 +293,7 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
288 fsopt->sb_flags = flags; 293 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; 294 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290 295
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 296 fsopt->rsize = CEPH_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 297 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; 298 fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
294 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 299 fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
@@ -348,7 +353,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
348 353
349 if (opt->name) 354 if (opt->name)
350 seq_printf(m, ",name=%s", opt->name); 355 seq_printf(m, ",name=%s", opt->name);
351 if (opt->secret) 356 if (opt->key)
352 seq_puts(m, ",secret=<hidden>"); 357 seq_puts(m, ",secret=<hidden>");
353 358
354 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) 359 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
@@ -370,7 +375,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
370 375
371 if (fsopt->wsize) 376 if (fsopt->wsize)
372 seq_printf(m, ",wsize=%d", fsopt->wsize); 377 seq_printf(m, ",wsize=%d", fsopt->wsize);
373 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT) 378 if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
374 seq_printf(m, ",rsize=%d", fsopt->rsize); 379 seq_printf(m, ",rsize=%d", fsopt->rsize);
375 if (fsopt->congestion_kb != default_congestion_kb()) 380 if (fsopt->congestion_kb != default_congestion_kb())
376 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb); 381 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 20b907d76ae2..619fe719968f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -27,6 +27,7 @@
27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
30#define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */
30 31
31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) 32#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
32 33
@@ -35,6 +36,7 @@
35#define ceph_test_mount_opt(fsc, opt) \ 36#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) 37 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
37 38
39#define CEPH_RSIZE_DEFAULT (512*1024) /* readahead */
38#define CEPH_MAX_READDIR_DEFAULT 1024 40#define CEPH_MAX_READDIR_DEFAULT 1024
39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) 41#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
40#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 42#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
@@ -319,6 +321,16 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
319 return container_of(inode, struct ceph_inode_info, vfs_inode); 321 return container_of(inode, struct ceph_inode_info, vfs_inode);
320} 322}
321 323
324static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
325{
326 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
327}
328
329static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
330{
331 return (struct ceph_fs_client *)sb->s_fs_info;
332}
333
322static inline struct ceph_vino ceph_vino(struct inode *inode) 334static inline struct ceph_vino ceph_vino(struct inode *inode)
323{ 335{
324 return ceph_inode(inode)->i_vino; 336 return ceph_inode(inode)->i_vino;
@@ -327,19 +339,49 @@ static inline struct ceph_vino ceph_vino(struct inode *inode)
327/* 339/*
328 * ino_t is <64 bits on many architectures, blech. 340 * ino_t is <64 bits on many architectures, blech.
329 * 341 *
330 * don't include snap in ino hash, at least for now. 342 * i_ino (kernel inode) st_ino (userspace)
343 * i386 32 32
344 * x86_64+ino32 64 32
345 * x86_64 64 64
346 */
347static inline u32 ceph_ino_to_ino32(ino_t ino)
348{
349 ino ^= ino >> (sizeof(ino) * 8 - 32);
350 if (!ino)
351 ino = 1;
352 return ino;
353}
354
355/*
356 * kernel i_ino value
331 */ 357 */
332static inline ino_t ceph_vino_to_ino(struct ceph_vino vino) 358static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
333{ 359{
334 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */ 360 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
335#if BITS_PER_LONG == 32 361#if BITS_PER_LONG == 32
336 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8; 362 ino = ceph_ino_to_ino32(ino);
337 if (!ino)
338 ino = 1;
339#endif 363#endif
340 return ino; 364 return ino;
341} 365}
342 366
367/*
368 * user-visible ino (stat, filldir)
369 */
370#if BITS_PER_LONG == 32
371static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
372{
373 return ino;
374}
375#else
376static inline ino_t ceph_translate_ino(struct super_block *sb, ino_t ino)
377{
378 if (ceph_test_mount_opt(ceph_sb_to_client(sb), INO32))
379 ino = ceph_ino_to_ino32(ino);
380 return ino;
381}
382#endif
383
384
343/* for printf-style formatting */ 385/* for printf-style formatting */
344#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap 386#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
345 387
@@ -428,13 +470,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
428 return ((loff_t)frag << 32) | (loff_t)off; 470 return ((loff_t)frag << 32) | (loff_t)off;
429} 471}
430 472
431static inline int ceph_set_ino_cb(struct inode *inode, void *data)
432{
433 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
434 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
435 return 0;
436}
437
438/* 473/*
439 * caps helpers 474 * caps helpers
440 */ 475 */
@@ -503,15 +538,6 @@ extern void ceph_reservation_status(struct ceph_fs_client *client,
503 int *total, int *avail, int *used, 538 int *total, int *avail, int *used,
504 int *reserved, int *min); 539 int *reserved, int *min);
505 540
506static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
507{
508 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
509}
510
511static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
512{
513 return (struct ceph_fs_client *)sb->s_fs_info;
514}
515 541
516 542
517/* 543/*
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
index 7f7fa3c302af..ea940b1db77b 100644
--- a/fs/cifs/AUTHORS
+++ b/fs/cifs/AUTHORS
@@ -35,7 +35,7 @@ Adrian Bunk (kcalloc cleanups)
35Miklos Szeredi 35Miklos Szeredi
36Kazeon team for various fixes especially for 2.4 version. 36Kazeon team for various fixes especially for 2.4 version.
37Asser Ferno (Change Notify support) 37Asser Ferno (Change Notify support)
38Shaggy (Dave Kleikamp) for inumerable small fs suggestions and some good cleanup 38Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
39Gunter Kukkukk (testing and suggestions for support of old servers) 39Gunter Kukkukk (testing and suggestions for support of old servers)
40Igor Mammedov (DFS support) 40Igor Mammedov (DFS support)
41Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code) 41Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index ee45648b0d1a..7cb0f7f847e4 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -3,6 +3,7 @@ config CIFS
3 depends on INET 3 depends on INET
4 select NLS 4 select NLS
5 select CRYPTO 5 select CRYPTO
6 select CRYPTO_MD4
6 select CRYPTO_MD5 7 select CRYPTO_MD5
7 select CRYPTO_HMAC 8 select CRYPTO_HMAC
8 select CRYPTO_ARC4 9 select CRYPTO_ARC4
diff --git a/fs/cifs/README b/fs/cifs/README
index fe1683590828..74ab165fc646 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -685,22 +685,6 @@ LinuxExtensionsEnabled If set to one then the client will attempt to
685 support and want to map the uid and gid fields 685 support and want to map the uid and gid fields
686 to values supplied at mount (rather than the 686 to values supplied at mount (rather than the
687 actual values, then set this to zero. (default 1) 687 actual values, then set this to zero. (default 1)
688Experimental When set to 1 used to enable certain experimental
689 features (currently enables multipage writes
690 when signing is enabled, the multipage write
691 performance enhancement was disabled when
692 signing turned on in case buffer was modified
693 just before it was sent, also this flag will
694 be used to use the new experimental directory change
695 notification code). When set to 2 enables
696 an additional experimental feature, "raw ntlmssp"
697 session establishment support (which allows
698 specifying "sec=ntlmssp" on mount). The Linux cifs
699 module will use ntlmv2 authentication encapsulated
700 in "raw ntlmssp" (not using SPNEGO) when
701 "sec=ntlmssp" is specified on mount.
702 This support also requires building cifs with
703 the CONFIG_CIFS_EXPERIMENTAL configuration flag.
704 688
705These experimental features and tracing can be enabled by changing flags in 689These experimental features and tracing can be enabled by changing flags in
706/proc/fs/cifs (after the cifs module has been installed or built into the 690/proc/fs/cifs (after the cifs module has been installed or built into the
diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c
index e654dfd092c3..53d57a3fe427 100644
--- a/fs/cifs/cache.c
+++ b/fs/cifs/cache.c
@@ -50,7 +50,7 @@ void cifs_fscache_unregister(void)
50 */ 50 */
51struct cifs_server_key { 51struct cifs_server_key {
52 uint16_t family; /* address family */ 52 uint16_t family; /* address family */
53 uint16_t port; /* IP port */ 53 __be16 port; /* IP port */
54 union { 54 union {
55 struct in_addr ipv4_addr; 55 struct in_addr ipv4_addr;
56 struct in6_addr ipv6_addr; 56 struct in6_addr ipv6_addr;
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 65829d32128c..30d01bc90855 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -423,7 +423,6 @@ static const struct file_operations cifs_lookup_cache_proc_fops;
423static const struct file_operations traceSMB_proc_fops; 423static const struct file_operations traceSMB_proc_fops;
424static const struct file_operations cifs_multiuser_mount_proc_fops; 424static const struct file_operations cifs_multiuser_mount_proc_fops;
425static const struct file_operations cifs_security_flags_proc_fops; 425static const struct file_operations cifs_security_flags_proc_fops;
426static const struct file_operations cifs_experimental_proc_fops;
427static const struct file_operations cifs_linux_ext_proc_fops; 426static const struct file_operations cifs_linux_ext_proc_fops;
428 427
429void 428void
@@ -441,8 +440,6 @@ cifs_proc_init(void)
441 proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops); 440 proc_create("cifsFYI", 0, proc_fs_cifs, &cifsFYI_proc_fops);
442 proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops); 441 proc_create("traceSMB", 0, proc_fs_cifs, &traceSMB_proc_fops);
443 proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops); 442 proc_create("OplockEnabled", 0, proc_fs_cifs, &cifs_oplock_proc_fops);
444 proc_create("Experimental", 0, proc_fs_cifs,
445 &cifs_experimental_proc_fops);
446 proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs, 443 proc_create("LinuxExtensionsEnabled", 0, proc_fs_cifs,
447 &cifs_linux_ext_proc_fops); 444 &cifs_linux_ext_proc_fops);
448 proc_create("MultiuserMount", 0, proc_fs_cifs, 445 proc_create("MultiuserMount", 0, proc_fs_cifs,
@@ -469,7 +466,6 @@ cifs_proc_clean(void)
469 remove_proc_entry("OplockEnabled", proc_fs_cifs); 466 remove_proc_entry("OplockEnabled", proc_fs_cifs);
470 remove_proc_entry("SecurityFlags", proc_fs_cifs); 467 remove_proc_entry("SecurityFlags", proc_fs_cifs);
471 remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); 468 remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs);
472 remove_proc_entry("Experimental", proc_fs_cifs);
473 remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); 469 remove_proc_entry("LookupCacheEnabled", proc_fs_cifs);
474 remove_proc_entry("fs/cifs", NULL); 470 remove_proc_entry("fs/cifs", NULL);
475} 471}
@@ -550,45 +546,6 @@ static const struct file_operations cifs_oplock_proc_fops = {
550 .write = cifs_oplock_proc_write, 546 .write = cifs_oplock_proc_write,
551}; 547};
552 548
553static int cifs_experimental_proc_show(struct seq_file *m, void *v)
554{
555 seq_printf(m, "%d\n", experimEnabled);
556 return 0;
557}
558
559static int cifs_experimental_proc_open(struct inode *inode, struct file *file)
560{
561 return single_open(file, cifs_experimental_proc_show, NULL);
562}
563
564static ssize_t cifs_experimental_proc_write(struct file *file,
565 const char __user *buffer, size_t count, loff_t *ppos)
566{
567 char c;
568 int rc;
569
570 rc = get_user(c, buffer);
571 if (rc)
572 return rc;
573 if (c == '0' || c == 'n' || c == 'N')
574 experimEnabled = 0;
575 else if (c == '1' || c == 'y' || c == 'Y')
576 experimEnabled = 1;
577 else if (c == '2')
578 experimEnabled = 2;
579
580 return count;
581}
582
583static const struct file_operations cifs_experimental_proc_fops = {
584 .owner = THIS_MODULE,
585 .open = cifs_experimental_proc_open,
586 .read = seq_read,
587 .llseek = seq_lseek,
588 .release = single_release,
589 .write = cifs_experimental_proc_write,
590};
591
592static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) 549static int cifs_linux_ext_proc_show(struct seq_file *m, void *v)
593{ 550{
594 seq_printf(m, "%d\n", linuxExtEnabled); 551 seq_printf(m, "%d\n", linuxExtEnabled);
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index f1c68629f277..2b68ac57d97d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -53,7 +53,7 @@ void cifs_dfs_release_automount_timer(void)
53 * 53 *
54 * Extracts sharename from full UNC. 54 * Extracts sharename from full UNC.
55 * i.e. strips from UNC trailing path that is not part of share 55 * i.e. strips from UNC trailing path that is not part of share
56 * name and fixup missing '\' in the begining of DFS node referral 56 * name and fixup missing '\' in the beginning of DFS node referral
57 * if necessary. 57 * if necessary.
58 * Returns pointer to share name on success or ERR_PTR on error. 58 * Returns pointer to share name on success or ERR_PTR on error.
59 * Caller is responsible for freeing returned string. 59 * Caller is responsible for freeing returned string.
@@ -282,8 +282,6 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
282 cFYI(1, "in %s", __func__); 282 cFYI(1, "in %s", __func__);
283 BUG_ON(IS_ROOT(mntpt)); 283 BUG_ON(IS_ROOT(mntpt));
284 284
285 xid = GetXid();
286
287 /* 285 /*
288 * The MSDFS spec states that paths in DFS referral requests and 286 * The MSDFS spec states that paths in DFS referral requests and
289 * responses must be prefixed by a single '\' character instead of 287 * responses must be prefixed by a single '\' character instead of
@@ -293,7 +291,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
293 mnt = ERR_PTR(-ENOMEM); 291 mnt = ERR_PTR(-ENOMEM);
294 full_path = build_path_from_dentry(mntpt); 292 full_path = build_path_from_dentry(mntpt);
295 if (full_path == NULL) 293 if (full_path == NULL)
296 goto free_xid; 294 goto cdda_exit;
297 295
298 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); 296 cifs_sb = CIFS_SB(mntpt->d_inode->i_sb);
299 tlink = cifs_sb_tlink(cifs_sb); 297 tlink = cifs_sb_tlink(cifs_sb);
@@ -303,9 +301,11 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
303 } 301 }
304 ses = tlink_tcon(tlink)->ses; 302 ses = tlink_tcon(tlink)->ses;
305 303
304 xid = GetXid();
306 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, 305 rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls,
307 &num_referrals, &referrals, 306 &num_referrals, &referrals,
308 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); 307 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
308 FreeXid(xid);
309 309
310 cifs_put_tlink(tlink); 310 cifs_put_tlink(tlink);
311 311
@@ -338,8 +338,7 @@ success:
338 free_dfs_info_array(referrals, num_referrals); 338 free_dfs_info_array(referrals, num_referrals);
339free_full_path: 339free_full_path:
340 kfree(full_path); 340 kfree(full_path);
341free_xid: 341cdda_exit:
342 FreeXid(xid);
343 cFYI(1, "leaving %s" , __func__); 342 cFYI(1, "leaving %s" , __func__);
344 return mnt; 343 return mnt;
345} 344}
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 4dfba8283165..33d221394aca 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -113,7 +113,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
113 MAX_MECH_STR_LEN + 113 MAX_MECH_STR_LEN +
114 UID_KEY_LEN + (sizeof(uid_t) * 2) + 114 UID_KEY_LEN + (sizeof(uid_t) * 2) +
115 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + 115 CREDUID_KEY_LEN + (sizeof(uid_t) * 2) +
116 USER_KEY_LEN + strlen(sesInfo->userName) + 116 USER_KEY_LEN + strlen(sesInfo->user_name) +
117 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; 117 PID_KEY_LEN + (sizeof(pid_t) * 2) + 1;
118 118
119 spnego_key = ERR_PTR(-ENOMEM); 119 spnego_key = ERR_PTR(-ENOMEM);
@@ -153,7 +153,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
153 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid); 153 sprintf(dp, ";creduid=0x%x", sesInfo->cred_uid);
154 154
155 dp = description + strlen(description); 155 dp = description + strlen(description);
156 sprintf(dp, ";user=%s", sesInfo->userName); 156 sprintf(dp, ";user=%s", sesInfo->user_name);
157 157
158 dp = description + strlen(description); 158 dp = description + strlen(description);
159 sprintf(dp, ";pid=0x%x", current->pid); 159 sprintf(dp, ";pid=0x%x", current->pid);
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index fc0fd4fde306..23d43cde4306 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -90,7 +90,7 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp,
90 case UNI_COLON: 90 case UNI_COLON:
91 *target = ':'; 91 *target = ':';
92 break; 92 break;
93 case UNI_ASTERIK: 93 case UNI_ASTERISK:
94 *target = '*'; 94 *target = '*';
95 break; 95 break;
96 case UNI_QUESTION: 96 case UNI_QUESTION:
@@ -264,40 +264,40 @@ cifs_strndup_from_ucs(const char *src, const int maxlen, const bool is_unicode,
264 * names are little endian 16 bit Unicode on the wire 264 * names are little endian 16 bit Unicode on the wire
265 */ 265 */
266int 266int
267cifsConvertToUCS(__le16 *target, const char *source, int maxlen, 267cifsConvertToUCS(__le16 *target, const char *source, int srclen,
268 const struct nls_table *cp, int mapChars) 268 const struct nls_table *cp, int mapChars)
269{ 269{
270 int i, j, charlen; 270 int i, j, charlen;
271 int len_remaining = maxlen;
272 char src_char; 271 char src_char;
273 __u16 temp; 272 __le16 dst_char;
273 wchar_t tmp;
274 274
275 if (!mapChars) 275 if (!mapChars)
276 return cifs_strtoUCS(target, source, PATH_MAX, cp); 276 return cifs_strtoUCS(target, source, PATH_MAX, cp);
277 277
278 for (i = 0, j = 0; i < maxlen; j++) { 278 for (i = 0, j = 0; i < srclen; j++) {
279 src_char = source[i]; 279 src_char = source[i];
280 switch (src_char) { 280 switch (src_char) {
281 case 0: 281 case 0:
282 put_unaligned_le16(0, &target[j]); 282 put_unaligned(0, &target[j]);
283 goto ctoUCS_out; 283 goto ctoUCS_out;
284 case ':': 284 case ':':
285 temp = UNI_COLON; 285 dst_char = cpu_to_le16(UNI_COLON);
286 break; 286 break;
287 case '*': 287 case '*':
288 temp = UNI_ASTERIK; 288 dst_char = cpu_to_le16(UNI_ASTERISK);
289 break; 289 break;
290 case '?': 290 case '?':
291 temp = UNI_QUESTION; 291 dst_char = cpu_to_le16(UNI_QUESTION);
292 break; 292 break;
293 case '<': 293 case '<':
294 temp = UNI_LESSTHAN; 294 dst_char = cpu_to_le16(UNI_LESSTHAN);
295 break; 295 break;
296 case '>': 296 case '>':
297 temp = UNI_GRTRTHAN; 297 dst_char = cpu_to_le16(UNI_GRTRTHAN);
298 break; 298 break;
299 case '|': 299 case '|':
300 temp = UNI_PIPE; 300 dst_char = cpu_to_le16(UNI_PIPE);
301 break; 301 break;
302 /* 302 /*
303 * FIXME: We can not handle remapping backslash (UNI_SLASH) 303 * FIXME: We can not handle remapping backslash (UNI_SLASH)
@@ -305,17 +305,17 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
305 * as they use backslash as separator. 305 * as they use backslash as separator.
306 */ 306 */
307 default: 307 default:
308 charlen = cp->char2uni(source+i, len_remaining, 308 charlen = cp->char2uni(source + i, srclen - i, &tmp);
309 &temp); 309 dst_char = cpu_to_le16(tmp);
310
310 /* 311 /*
311 * if no match, use question mark, which at least in 312 * if no match, use question mark, which at least in
312 * some cases serves as wild card 313 * some cases serves as wild card
313 */ 314 */
314 if (charlen < 1) { 315 if (charlen < 1) {
315 temp = 0x003f; 316 dst_char = cpu_to_le16(0x003f);
316 charlen = 1; 317 charlen = 1;
317 } 318 }
318 len_remaining -= charlen;
319 /* 319 /*
320 * character may take more than one byte in the source 320 * character may take more than one byte in the source
321 * string, but will take exactly two bytes in the 321 * string, but will take exactly two bytes in the
@@ -324,9 +324,8 @@ cifsConvertToUCS(__le16 *target, const char *source, int maxlen,
324 i += charlen; 324 i += charlen;
325 continue; 325 continue;
326 } 326 }
327 put_unaligned_le16(temp, &target[j]); 327 put_unaligned(dst_char, &target[j]);
328 i++; /* move to next char in source string */ 328 i++; /* move to next char in source string */
329 len_remaining--;
330 } 329 }
331 330
332ctoUCS_out: 331ctoUCS_out:
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 7fe6b52df507..644dd882a560 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -44,7 +44,7 @@
44 * reserved symbols (along with \ and /), otherwise illegal to store 44 * reserved symbols (along with \ and /), otherwise illegal to store
45 * in filenames in NTFS 45 * in filenames in NTFS
46 */ 46 */
47#define UNI_ASTERIK (__u16) ('*' + 0xF000) 47#define UNI_ASTERISK (__u16) ('*' + 0xF000)
48#define UNI_QUESTION (__u16) ('?' + 0xF000) 48#define UNI_QUESTION (__u16) ('?' + 0xF000)
49#define UNI_COLON (__u16) (':' + 0xF000) 49#define UNI_COLON (__u16) (':' + 0xF000)
50#define UNI_GRTRTHAN (__u16) ('>' + 0xF000) 50#define UNI_GRTRTHAN (__u16) ('>' + 0xF000)
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1e7636b145a8..beeebf194234 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -372,6 +372,10 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl,
372 372
373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *), 373 ppace = kmalloc(num_aces * sizeof(struct cifs_ace *),
374 GFP_KERNEL); 374 GFP_KERNEL);
375 if (!ppace) {
376 cERROR(1, "DACL memory allocation error");
377 return;
378 }
375 379
376 for (i = 0; i < num_aces; ++i) { 380 for (i = 0; i < num_aces; ++i) {
377 ppace[i] = (struct cifs_ace *) (acl_base + acl_size); 381 ppace[i] = (struct cifs_ace *) (acl_base + acl_size);
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 0db5f1de0227..d1a016be73ba 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -30,12 +30,13 @@
30#include <linux/ctype.h> 30#include <linux/ctype.h>
31#include <linux/random.h> 31#include <linux/random.h>
32 32
33/* Calculate and return the CIFS signature based on the mac key and SMB PDU */ 33/*
34/* the 16 byte signature must be allocated by the caller */ 34 * Calculate and return the CIFS signature based on the mac key and SMB PDU.
35/* Note we only use the 1st eight bytes */ 35 * The 16 byte signature must be allocated by the caller. Note we only use the
36/* Note that the smb header signature field on input contains the 36 * 1st eight bytes and that the smb header signature field on input contains
37 sequence number before this function is called */ 37 * the sequence number before this function is called. Also, this function
38 38 * should be called with the server->srv_mutex held.
39 */
39static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, 40static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
40 struct TCP_Server_Info *server, char *signature) 41 struct TCP_Server_Info *server, char *signature)
41{ 42{
@@ -209,8 +210,10 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
209 cpu_to_le32(expected_sequence_number); 210 cpu_to_le32(expected_sequence_number);
210 cifs_pdu->Signature.Sequence.Reserved = 0; 211 cifs_pdu->Signature.Sequence.Reserved = 0;
211 212
213 mutex_lock(&server->srv_mutex);
212 rc = cifs_calculate_signature(cifs_pdu, server, 214 rc = cifs_calculate_signature(cifs_pdu, server,
213 what_we_think_sig_should_be); 215 what_we_think_sig_should_be);
216 mutex_unlock(&server->srv_mutex);
214 217
215 if (rc) 218 if (rc)
216 return rc; 219 return rc;
@@ -469,15 +472,15 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, char *ntlmv2_hash,
469 return rc; 472 return rc;
470 } 473 }
471 474
472 /* convert ses->userName to unicode and uppercase */ 475 /* convert ses->user_name to unicode and uppercase */
473 len = strlen(ses->userName); 476 len = strlen(ses->user_name);
474 user = kmalloc(2 + (len * 2), GFP_KERNEL); 477 user = kmalloc(2 + (len * 2), GFP_KERNEL);
475 if (user == NULL) { 478 if (user == NULL) {
476 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); 479 cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
477 rc = -ENOMEM; 480 rc = -ENOMEM;
478 goto calc_exit_2; 481 goto calc_exit_2;
479 } 482 }
480 len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); 483 len = cifs_strtoUCS((__le16 *)user, ses->user_name, len, nls_cp);
481 UniStrupr(user); 484 UniStrupr(user);
482 485
483 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, 486 crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
@@ -657,9 +660,10 @@ calc_seckey(struct cifsSesInfo *ses)
657 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); 660 get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE);
658 661
659 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC); 662 tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
660 if (!tfm_arc4 || IS_ERR(tfm_arc4)) { 663 if (IS_ERR(tfm_arc4)) {
664 rc = PTR_ERR(tfm_arc4);
661 cERROR(1, "could not allocate crypto API arc4\n"); 665 cERROR(1, "could not allocate crypto API arc4\n");
662 return PTR_ERR(tfm_arc4); 666 return rc;
663 } 667 }
664 668
665 desc.tfm = tfm_arc4; 669 desc.tfm = tfm_arc4;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f2970136d17d..5c412b33cd7c 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -53,7 +53,6 @@ int cifsFYI = 0;
53int cifsERROR = 1; 53int cifsERROR = 1;
54int traceSMB = 0; 54int traceSMB = 0;
55unsigned int oplockEnabled = 1; 55unsigned int oplockEnabled = 1;
56unsigned int experimEnabled = 0;
57unsigned int linuxExtEnabled = 1; 56unsigned int linuxExtEnabled = 1;
58unsigned int lookupCacheEnabled = 1; 57unsigned int lookupCacheEnabled = 1;
59unsigned int multiuser_mount = 0; 58unsigned int multiuser_mount = 0;
@@ -127,6 +126,7 @@ cifs_read_super(struct super_block *sb, void *data,
127 kfree(cifs_sb); 126 kfree(cifs_sb);
128 return rc; 127 return rc;
129 } 128 }
129 cifs_sb->bdi.ra_pages = default_backing_dev_info.ra_pages;
130 130
131#ifdef CONFIG_CIFS_DFS_UPCALL 131#ifdef CONFIG_CIFS_DFS_UPCALL
132 /* copy mount params to sb for use in submounts */ 132 /* copy mount params to sb for use in submounts */
@@ -409,8 +409,8 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m)
409 409
410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) 410 if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)
411 seq_printf(s, ",multiuser"); 411 seq_printf(s, ",multiuser");
412 else if (tcon->ses->userName) 412 else if (tcon->ses->user_name)
413 seq_printf(s, ",username=%s", tcon->ses->userName); 413 seq_printf(s, ",username=%s", tcon->ses->user_name);
414 414
415 if (tcon->ses->domainName) 415 if (tcon->ses->domainName)
416 seq_printf(s, ",domain=%s", tcon->ses->domainName); 416 seq_printf(s, ",domain=%s", tcon->ses->domainName);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 14789a97304e..a9371b6578c0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
127extern const struct export_operations cifs_export_ops; 127extern const struct export_operations cifs_export_ops;
128#endif /* EXPERIMENTAL */ 128#endif /* EXPERIMENTAL */
129 129
130#define CIFS_VERSION "1.69" 130#define CIFS_VERSION "1.71"
131#endif /* _CIFSFS_H */ 131#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index edd5b29b53c9..a5d1106fcbde 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -37,10 +37,9 @@
37 37
38#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1) 38#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
39#define MAX_SERVER_SIZE 15 39#define MAX_SERVER_SIZE 15
40#define MAX_SHARE_SIZE 64 /* used to be 20, this should still be enough */ 40#define MAX_SHARE_SIZE 80
41#define MAX_USERNAME_SIZE 32 /* 32 is to allow for 15 char names + null 41#define MAX_USERNAME_SIZE 256 /* reasonable maximum for current servers */
42 termination then *2 for unicode versions */ 42#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
43#define MAX_PASSWORD_SIZE 512 /* max for windows seems to be 256 wide chars */
44 43
45#define CIFS_MIN_RCV_POOL 4 44#define CIFS_MIN_RCV_POOL 4
46 45
@@ -92,7 +91,8 @@ enum statusEnum {
92 CifsNew = 0, 91 CifsNew = 0,
93 CifsGood, 92 CifsGood,
94 CifsExiting, 93 CifsExiting,
95 CifsNeedReconnect 94 CifsNeedReconnect,
95 CifsNeedNegotiate
96}; 96};
97 97
98enum securityEnum { 98enum securityEnum {
@@ -188,6 +188,8 @@ struct TCP_Server_Info {
188 /* multiplexed reads or writes */ 188 /* multiplexed reads or writes */
189 unsigned int maxBuf; /* maxBuf specifies the maximum */ 189 unsigned int maxBuf; /* maxBuf specifies the maximum */
190 /* message size the server can send or receive for non-raw SMBs */ 190 /* message size the server can send or receive for non-raw SMBs */
191 /* maxBuf is returned by SMB NegotiateProtocol so maxBuf is only 0 */
192 /* when socket is setup (and during reconnect) before NegProt sent */
191 unsigned int max_rw; /* maxRw specifies the maximum */ 193 unsigned int max_rw; /* maxRw specifies the maximum */
192 /* message size the server can send or receive for */ 194 /* message size the server can send or receive for */
193 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ 195 /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */
@@ -272,7 +274,7 @@ struct cifsSesInfo {
272 int capabilities; 274 int capabilities;
273 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for 275 char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for
274 TCP names - will ipv6 and sctp addresses fit? */ 276 TCP names - will ipv6 and sctp addresses fit? */
275 char userName[MAX_USERNAME_SIZE + 1]; 277 char *user_name;
276 char *domainName; 278 char *domainName;
277 char *password; 279 char *password;
278 struct session_key auth_key; 280 struct session_key auth_key;
@@ -652,7 +654,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
652#define MID_REQUEST_SUBMITTED 2 654#define MID_REQUEST_SUBMITTED 2
653#define MID_RESPONSE_RECEIVED 4 655#define MID_RESPONSE_RECEIVED 4
654#define MID_RETRY_NEEDED 8 /* session closed while this request out */ 656#define MID_RETRY_NEEDED 8 /* session closed while this request out */
655#define MID_NO_RESP_NEEDED 0x10 657#define MID_RESPONSE_MALFORMED 0x10
656 658
657/* Types of response buffer returned from SendReceive2 */ 659/* Types of response buffer returned from SendReceive2 */
658#define CIFS_NO_BUFFER 0 /* Response buffer not returned */ 660#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
@@ -815,7 +817,6 @@ GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions
815 have the uid/password or Kerberos credential 817 have the uid/password or Kerberos credential
816 or equivalent for current user */ 818 or equivalent for current user */
817GLOBAL_EXTERN unsigned int oplockEnabled; 819GLOBAL_EXTERN unsigned int oplockEnabled;
818GLOBAL_EXTERN unsigned int experimEnabled;
819GLOBAL_EXTERN unsigned int lookupCacheEnabled; 820GLOBAL_EXTERN unsigned int lookupCacheEnabled;
820GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent 821GLOBAL_EXTERN unsigned int global_secflags; /* if on, session setup sent
821 with more secure ntlmssp2 challenge/resp */ 822 with more secure ntlmssp2 challenge/resp */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 3106f5e5c633..df959bae6728 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -136,18 +136,15 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
136 } 136 }
137 } 137 }
138 138
139 if (ses->status == CifsExiting)
140 return -EIO;
141
142 /* 139 /*
143 * Give demultiplex thread up to 10 seconds to reconnect, should be 140 * Give demultiplex thread up to 10 seconds to reconnect, should be
144 * greater than cifs socket timeout which is 7 seconds 141 * greater than cifs socket timeout which is 7 seconds
145 */ 142 */
146 while (server->tcpStatus == CifsNeedReconnect) { 143 while (server->tcpStatus == CifsNeedReconnect) {
147 wait_event_interruptible_timeout(server->response_q, 144 wait_event_interruptible_timeout(server->response_q,
148 (server->tcpStatus == CifsGood), 10 * HZ); 145 (server->tcpStatus != CifsNeedReconnect), 10 * HZ);
149 146
150 /* is TCP session is reestablished now ?*/ 147 /* are we still trying to reconnect? */
151 if (server->tcpStatus != CifsNeedReconnect) 148 if (server->tcpStatus != CifsNeedReconnect)
152 break; 149 break;
153 150
@@ -156,7 +153,7 @@ cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command)
156 * retrying until process is killed or server comes 153 * retrying until process is killed or server comes
157 * back on-line 154 * back on-line
158 */ 155 */
159 if (!tcon->retry || ses->status == CifsExiting) { 156 if (!tcon->retry) {
160 cFYI(1, "gave up waiting on reconnect in smb_init"); 157 cFYI(1, "gave up waiting on reconnect in smb_init");
161 return -EHOSTDOWN; 158 return -EHOSTDOWN;
162 } 159 }
@@ -732,7 +729,7 @@ CIFSSMBEcho(struct TCP_Server_Info *server)
732 return rc; 729 return rc;
733 730
734 /* set up echo request */ 731 /* set up echo request */
735 smb->hdr.Tid = cpu_to_le16(0xffff); 732 smb->hdr.Tid = 0xffff;
736 smb->hdr.WordCount = 1; 733 smb->hdr.WordCount = 1;
737 put_unaligned_le16(1, &smb->EchoCount); 734 put_unaligned_le16(1, &smb->EchoCount);
738 put_bcc_le(1, &smb->hdr); 735 put_bcc_le(1, &smb->hdr);
@@ -1887,10 +1884,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon,
1887 __constant_cpu_to_le16(CIFS_WRLCK)) 1884 __constant_cpu_to_le16(CIFS_WRLCK))
1888 pLockData->fl_type = F_WRLCK; 1885 pLockData->fl_type = F_WRLCK;
1889 1886
1890 pLockData->fl_start = parm_data->start; 1887 pLockData->fl_start = le64_to_cpu(parm_data->start);
1891 pLockData->fl_end = parm_data->start + 1888 pLockData->fl_end = pLockData->fl_start +
1892 parm_data->length - 1; 1889 le64_to_cpu(parm_data->length) - 1;
1893 pLockData->fl_pid = parm_data->pid; 1890 pLockData->fl_pid = le32_to_cpu(parm_data->pid);
1894 } 1891 }
1895 } 1892 }
1896 1893
@@ -4914,7 +4911,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4914 __u16 fid, __u32 pid_of_opener, bool SetAllocation) 4911 __u16 fid, __u32 pid_of_opener, bool SetAllocation)
4915{ 4912{
4916 struct smb_com_transaction2_sfi_req *pSMB = NULL; 4913 struct smb_com_transaction2_sfi_req *pSMB = NULL;
4917 char *data_offset;
4918 struct file_end_of_file_info *parm_data; 4914 struct file_end_of_file_info *parm_data;
4919 int rc = 0; 4915 int rc = 0;
4920 __u16 params, param_offset, offset, byte_count, count; 4916 __u16 params, param_offset, offset, byte_count, count;
@@ -4938,8 +4934,6 @@ CIFSSMBSetFileSize(const int xid, struct cifsTconInfo *tcon, __u64 size,
4938 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; 4934 param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4;
4939 offset = param_offset + params; 4935 offset = param_offset + params;
4940 4936
4941 data_offset = (char *) (&pSMB->hdr.Protocol) + offset;
4942
4943 count = sizeof(struct file_end_of_file_info); 4937 count = sizeof(struct file_end_of_file_info);
4944 pSMB->MaxParameterCount = cpu_to_le16(2); 4938 pSMB->MaxParameterCount = cpu_to_le16(2);
4945 /* BB find exact max SMB PDU from sess structure BB */ 4939 /* BB find exact max SMB PDU from sess structure BB */
@@ -5253,7 +5247,7 @@ cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset,
5253 * Samba server ignores set of file size to zero due to bugs in some 5247 * Samba server ignores set of file size to zero due to bugs in some
5254 * older clients, but we should be precise - we use SetFileSize to 5248 * older clients, but we should be precise - we use SetFileSize to
5255 * set file size and do not want to truncate file size to zero 5249 * set file size and do not want to truncate file size to zero
5256 * accidently as happened on one Samba server beta by putting 5250 * accidentally as happened on one Samba server beta by putting
5257 * zero instead of -1 here 5251 * zero instead of -1 here
5258 */ 5252 */
5259 data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64); 5253 data_offset->EndOfFile = cpu_to_le64(NO_CHANGE_64);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 47d8ff623683..db9d55b507d0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -199,8 +199,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
199 } 199 }
200 spin_unlock(&GlobalMid_Lock); 200 spin_unlock(&GlobalMid_Lock);
201 201
202 while ((server->tcpStatus != CifsExiting) && 202 while (server->tcpStatus == CifsNeedReconnect) {
203 (server->tcpStatus != CifsGood)) {
204 try_to_freeze(); 203 try_to_freeze();
205 204
206 /* we should try only the port we connected to before */ 205 /* we should try only the port we connected to before */
@@ -212,7 +211,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
212 atomic_inc(&tcpSesReconnectCount); 211 atomic_inc(&tcpSesReconnectCount);
213 spin_lock(&GlobalMid_Lock); 212 spin_lock(&GlobalMid_Lock);
214 if (server->tcpStatus != CifsExiting) 213 if (server->tcpStatus != CifsExiting)
215 server->tcpStatus = CifsGood; 214 server->tcpStatus = CifsNeedNegotiate;
216 spin_unlock(&GlobalMid_Lock); 215 spin_unlock(&GlobalMid_Lock);
217 } 216 }
218 } 217 }
@@ -248,24 +247,24 @@ static int check2ndT2(struct smb_hdr *pSMB, unsigned int maxBufSize)
248 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount); 247 total_data_size = get_unaligned_le16(&pSMBt->t2_rsp.TotalDataCount);
249 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount); 248 data_in_this_rsp = get_unaligned_le16(&pSMBt->t2_rsp.DataCount);
250 249
251 remaining = total_data_size - data_in_this_rsp; 250 if (total_data_size == data_in_this_rsp)
252
253 if (remaining == 0)
254 return 0; 251 return 0;
255 else if (remaining < 0) { 252 else if (total_data_size < data_in_this_rsp) {
256 cFYI(1, "total data %d smaller than data in frame %d", 253 cFYI(1, "total data %d smaller than data in frame %d",
257 total_data_size, data_in_this_rsp); 254 total_data_size, data_in_this_rsp);
258 return -EINVAL; 255 return -EINVAL;
259 } else {
260 cFYI(1, "missing %d bytes from transact2, check next response",
261 remaining);
262 if (total_data_size > maxBufSize) {
263 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
264 total_data_size, maxBufSize);
265 return -EINVAL;
266 }
267 return remaining;
268 } 256 }
257
258 remaining = total_data_size - data_in_this_rsp;
259
260 cFYI(1, "missing %d bytes from transact2, check next response",
261 remaining);
262 if (total_data_size > maxBufSize) {
263 cERROR(1, "TotalDataSize %d is over maximum buffer %d",
264 total_data_size, maxBufSize);
265 return -EINVAL;
266 }
267 return remaining;
269} 268}
270 269
271static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB) 270static int coalesce_t2(struct smb_hdr *psecond, struct smb_hdr *pTargetSMB)
@@ -337,8 +336,13 @@ cifs_echo_request(struct work_struct *work)
337 struct TCP_Server_Info *server = container_of(work, 336 struct TCP_Server_Info *server = container_of(work,
338 struct TCP_Server_Info, echo.work); 337 struct TCP_Server_Info, echo.work);
339 338
340 /* no need to ping if we got a response recently */ 339 /*
341 if (time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) 340 * We cannot send an echo until the NEGOTIATE_PROTOCOL request is
341 * done, which is indicated by maxBuf != 0. Also, no need to ping if
342 * we got a response recently
343 */
344 if (server->maxBuf == 0 ||
345 time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
342 goto requeue_echo; 346 goto requeue_echo;
343 347
344 rc = CIFSSMBEcho(server); 348 rc = CIFSSMBEcho(server);
@@ -416,7 +420,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server)
416 pdu_length = 4; /* enough to get RFC1001 header */ 420 pdu_length = 4; /* enough to get RFC1001 header */
417 421
418incomplete_rcv: 422incomplete_rcv:
419 if (echo_retries > 0 && 423 if (echo_retries > 0 && server->tcpStatus == CifsGood &&
420 time_after(jiffies, server->lstrp + 424 time_after(jiffies, server->lstrp +
421 (echo_retries * SMB_ECHO_INTERVAL))) { 425 (echo_retries * SMB_ECHO_INTERVAL))) {
422 cERROR(1, "Server %s has not responded in %d seconds. " 426 cERROR(1, "Server %s has not responded in %d seconds. "
@@ -578,14 +582,23 @@ incomplete_rcv:
578 else if (reconnect == 1) 582 else if (reconnect == 1)
579 continue; 583 continue;
580 584
581 length += 4; /* account for rfc1002 hdr */ 585 total_read += 4; /* account for rfc1002 hdr */
582 586
587 dump_smb(smb_buffer, total_read);
583 588
584 dump_smb(smb_buffer, length); 589 /*
585 if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { 590 * We know that we received enough to get to the MID as we
586 cifs_dump_mem("Bad SMB: ", smb_buffer, 48); 591 * checked the pdu_length earlier. Now check to see
587 continue; 592 * if the rest of the header is OK. We borrow the length
588 } 593 * var for the rest of the loop to avoid a new stack var.
594 *
595 * 48 bytes is enough to display the header and a little bit
596 * into the payload for debugging purposes.
597 */
598 length = checkSMB(smb_buffer, smb_buffer->Mid, total_read);
599 if (length != 0)
600 cifs_dump_mem("Bad SMB: ", smb_buffer,
601 min_t(unsigned int, total_read, 48));
589 602
590 mid_entry = NULL; 603 mid_entry = NULL;
591 server->lstrp = jiffies; 604 server->lstrp = jiffies;
@@ -597,7 +610,8 @@ incomplete_rcv:
597 if ((mid_entry->mid == smb_buffer->Mid) && 610 if ((mid_entry->mid == smb_buffer->Mid) &&
598 (mid_entry->midState == MID_REQUEST_SUBMITTED) && 611 (mid_entry->midState == MID_REQUEST_SUBMITTED) &&
599 (mid_entry->command == smb_buffer->Command)) { 612 (mid_entry->command == smb_buffer->Command)) {
600 if (check2ndT2(smb_buffer,server->maxBuf) > 0) { 613 if (length == 0 &&
614 check2ndT2(smb_buffer, server->maxBuf) > 0) {
601 /* We have a multipart transact2 resp */ 615 /* We have a multipart transact2 resp */
602 isMultiRsp = true; 616 isMultiRsp = true;
603 if (mid_entry->resp_buf) { 617 if (mid_entry->resp_buf) {
@@ -632,12 +646,17 @@ incomplete_rcv:
632 mid_entry->resp_buf = smb_buffer; 646 mid_entry->resp_buf = smb_buffer;
633 mid_entry->largeBuf = isLargeBuf; 647 mid_entry->largeBuf = isLargeBuf;
634multi_t2_fnd: 648multi_t2_fnd:
635 mid_entry->midState = MID_RESPONSE_RECEIVED; 649 if (length == 0)
636 list_del_init(&mid_entry->qhead); 650 mid_entry->midState =
637 mid_entry->callback(mid_entry); 651 MID_RESPONSE_RECEIVED;
652 else
653 mid_entry->midState =
654 MID_RESPONSE_MALFORMED;
638#ifdef CONFIG_CIFS_STATS2 655#ifdef CONFIG_CIFS_STATS2
639 mid_entry->when_received = jiffies; 656 mid_entry->when_received = jiffies;
640#endif 657#endif
658 list_del_init(&mid_entry->qhead);
659 mid_entry->callback(mid_entry);
641 break; 660 break;
642 } 661 }
643 mid_entry = NULL; 662 mid_entry = NULL;
@@ -653,6 +672,9 @@ multi_t2_fnd:
653 else 672 else
654 smallbuf = NULL; 673 smallbuf = NULL;
655 } 674 }
675 } else if (length != 0) {
676 /* response sanity checks failed */
677 continue;
656 } else if (!is_valid_oplock_break(smb_buffer, server) && 678 } else if (!is_valid_oplock_break(smb_buffer, server) &&
657 !isMultiRsp) { 679 !isMultiRsp) {
658 cERROR(1, "No task to wake, unknown frame received! " 680 cERROR(1, "No task to wake, unknown frame received! "
@@ -858,7 +880,8 @@ cifs_parse_mount_options(char *options, const char *devname,
858 /* null user, ie anonymous, authentication */ 880 /* null user, ie anonymous, authentication */
859 vol->nullauth = 1; 881 vol->nullauth = 1;
860 } 882 }
861 if (strnlen(value, 200) < 200) { 883 if (strnlen(value, MAX_USERNAME_SIZE) <
884 MAX_USERNAME_SIZE) {
862 vol->username = value; 885 vol->username = value;
863 } else { 886 } else {
864 printk(KERN_WARNING "CIFS: username too long\n"); 887 printk(KERN_WARNING "CIFS: username too long\n");
@@ -1449,7 +1472,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs)
1449static bool 1472static bool
1450match_port(struct TCP_Server_Info *server, struct sockaddr *addr) 1473match_port(struct TCP_Server_Info *server, struct sockaddr *addr)
1451{ 1474{
1452 unsigned short int port, *sport; 1475 __be16 port, *sport;
1453 1476
1454 switch (addr->sa_family) { 1477 switch (addr->sa_family) {
1455 case AF_INET: 1478 case AF_INET:
@@ -1549,7 +1572,7 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
1549 return false; 1572 return false;
1550 } 1573 }
1551 1574
1552 /* now check if signing mode is acceptible */ 1575 /* now check if signing mode is acceptable */
1553 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 && 1576 if ((secFlags & CIFSSEC_MAY_SIGN) == 0 &&
1554 (server->secMode & SECMODE_SIGN_REQUIRED)) 1577 (server->secMode & SECMODE_SIGN_REQUIRED))
1555 return false; 1578 return false;
@@ -1742,6 +1765,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
1742 module_put(THIS_MODULE); 1765 module_put(THIS_MODULE);
1743 goto out_err_crypto_release; 1766 goto out_err_crypto_release;
1744 } 1767 }
1768 tcp_ses->tcpStatus = CifsNeedNegotiate;
1745 1769
1746 /* thread spawned, put it on the list */ 1770 /* thread spawned, put it on the list */
1747 spin_lock(&cifs_tcp_ses_lock); 1771 spin_lock(&cifs_tcp_ses_lock);
@@ -1785,7 +1809,9 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
1785 break; 1809 break;
1786 default: 1810 default:
1787 /* anything else takes username/password */ 1811 /* anything else takes username/password */
1788 if (strncmp(ses->userName, vol->username, 1812 if (ses->user_name == NULL)
1813 continue;
1814 if (strncmp(ses->user_name, vol->username,
1789 MAX_USERNAME_SIZE)) 1815 MAX_USERNAME_SIZE))
1790 continue; 1816 continue;
1791 if (strlen(vol->username) != 0 && 1817 if (strlen(vol->username) != 0 &&
@@ -1828,6 +1854,8 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
1828 cifs_put_tcp_session(server); 1854 cifs_put_tcp_session(server);
1829} 1855}
1830 1856
1857static bool warned_on_ntlm; /* globals init to false automatically */
1858
1831static struct cifsSesInfo * 1859static struct cifsSesInfo *
1832cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) 1860cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1833{ 1861{
@@ -1883,9 +1911,11 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1883 else 1911 else
1884 sprintf(ses->serverName, "%pI4", &addr->sin_addr); 1912 sprintf(ses->serverName, "%pI4", &addr->sin_addr);
1885 1913
1886 if (volume_info->username) 1914 if (volume_info->username) {
1887 strncpy(ses->userName, volume_info->username, 1915 ses->user_name = kstrdup(volume_info->username, GFP_KERNEL);
1888 MAX_USERNAME_SIZE); 1916 if (!ses->user_name)
1917 goto get_ses_fail;
1918 }
1889 1919
1890 /* volume_info->password freed at unmount */ 1920 /* volume_info->password freed at unmount */
1891 if (volume_info->password) { 1921 if (volume_info->password) {
@@ -1900,6 +1930,15 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
1900 } 1930 }
1901 ses->cred_uid = volume_info->cred_uid; 1931 ses->cred_uid = volume_info->cred_uid;
1902 ses->linux_uid = volume_info->linux_uid; 1932 ses->linux_uid = volume_info->linux_uid;
1933
1934 /* ntlmv2 is much stronger than ntlm security, and has been broadly
1935 supported for many years, time to update default security mechanism */
1936 if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
1937 warned_on_ntlm = true;
1938 cERROR(1, "default security mechanism requested. The default "
1939 "security mechanism will be upgraded from ntlm to "
1940 "ntlmv2 in kernel release 2.6.41");
1941 }
1903 ses->overrideSecFlg = volume_info->secFlg; 1942 ses->overrideSecFlg = volume_info->secFlg;
1904 1943
1905 mutex_lock(&ses->session_mutex); 1944 mutex_lock(&ses->session_mutex);
@@ -2253,7 +2292,7 @@ static int
2253generic_ip_connect(struct TCP_Server_Info *server) 2292generic_ip_connect(struct TCP_Server_Info *server)
2254{ 2293{
2255 int rc = 0; 2294 int rc = 0;
2256 unsigned short int sport; 2295 __be16 sport;
2257 int slen, sfamily; 2296 int slen, sfamily;
2258 struct socket *socket = server->ssocket; 2297 struct socket *socket = server->ssocket;
2259 struct sockaddr *saddr; 2298 struct sockaddr *saddr;
@@ -2338,7 +2377,7 @@ generic_ip_connect(struct TCP_Server_Info *server)
2338static int 2377static int
2339ip_connect(struct TCP_Server_Info *server) 2378ip_connect(struct TCP_Server_Info *server)
2340{ 2379{
2341 unsigned short int *sport; 2380 __be16 *sport;
2342 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr; 2381 struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&server->dstaddr;
2343 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr; 2382 struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
2344 2383
@@ -2803,7 +2842,7 @@ try_mount_again:
2803 2842
2804remote_path_check: 2843remote_path_check:
2805 /* check if a whole path (including prepath) is not remote */ 2844 /* check if a whole path (including prepath) is not remote */
2806 if (!rc && cifs_sb->prepathlen && tcon) { 2845 if (!rc && tcon) {
2807 /* build_path_to_root works only when we have a valid tcon */ 2846 /* build_path_to_root works only when we have a valid tcon */
2808 full_path = cifs_build_path_to_root(cifs_sb, tcon); 2847 full_path = cifs_build_path_to_root(cifs_sb, tcon);
2809 if (full_path == NULL) { 2848 if (full_path == NULL) {
@@ -2910,7 +2949,7 @@ mount_fail_check:
2910 if (mount_data != mount_data_global) 2949 if (mount_data != mount_data_global)
2911 kfree(mount_data); 2950 kfree(mount_data);
2912 /* If find_unc succeeded then rc == 0 so we can not end */ 2951 /* If find_unc succeeded then rc == 0 so we can not end */
2913 /* up accidently freeing someone elses tcon struct */ 2952 /* up accidentally freeing someone elses tcon struct */
2914 if (tcon) 2953 if (tcon)
2915 cifs_put_tcon(tcon); 2954 cifs_put_tcon(tcon);
2916 else if (pSesInfo) 2955 else if (pSesInfo)
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index dd5f22918c33..9ea65cf36714 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -189,7 +189,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
189 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid); 189 inode->i_sb, mode, oflags, &oplock, &fileHandle, xid);
190 /* EIO could indicate that (posix open) operation is not 190 /* EIO could indicate that (posix open) operation is not
191 supported, despite what server claimed in capability 191 supported, despite what server claimed in capability
192 negotation. EREMOTE indicates DFS junction, which is not 192 negotiation. EREMOTE indicates DFS junction, which is not
193 handled in posix open */ 193 handled in posix open */
194 194
195 if (rc == 0) { 195 if (rc == 0) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0de17c1db608..faf59529e847 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -346,7 +346,6 @@ int cifs_open(struct inode *inode, struct file *file)
346 struct cifsTconInfo *tcon; 346 struct cifsTconInfo *tcon;
347 struct tcon_link *tlink; 347 struct tcon_link *tlink;
348 struct cifsFileInfo *pCifsFile = NULL; 348 struct cifsFileInfo *pCifsFile = NULL;
349 struct cifsInodeInfo *pCifsInode;
350 char *full_path = NULL; 349 char *full_path = NULL;
351 bool posix_open_ok = false; 350 bool posix_open_ok = false;
352 __u16 netfid; 351 __u16 netfid;
@@ -361,8 +360,6 @@ int cifs_open(struct inode *inode, struct file *file)
361 } 360 }
362 tcon = tlink_tcon(tlink); 361 tcon = tlink_tcon(tlink);
363 362
364 pCifsInode = CIFS_I(file->f_path.dentry->d_inode);
365
366 full_path = build_path_from_dentry(file->f_path.dentry); 363 full_path = build_path_from_dentry(file->f_path.dentry);
367 if (full_path == NULL) { 364 if (full_path == NULL) {
368 rc = -ENOMEM; 365 rc = -ENOMEM;
@@ -578,8 +575,10 @@ reopen_error_exit:
578 575
579int cifs_close(struct inode *inode, struct file *file) 576int cifs_close(struct inode *inode, struct file *file)
580{ 577{
581 cifsFileInfo_put(file->private_data); 578 if (file->private_data != NULL) {
582 file->private_data = NULL; 579 cifsFileInfo_put(file->private_data);
580 file->private_data = NULL;
581 }
583 582
584 /* return code from the ->release op is always ignored */ 583 /* return code from the ->release op is always ignored */
585 return 0; 584 return 0;
@@ -973,6 +972,9 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
973 total_written += bytes_written) { 972 total_written += bytes_written) {
974 rc = -EAGAIN; 973 rc = -EAGAIN;
975 while (rc == -EAGAIN) { 974 while (rc == -EAGAIN) {
975 struct kvec iov[2];
976 unsigned int len;
977
976 if (open_file->invalidHandle) { 978 if (open_file->invalidHandle) {
977 /* we could deadlock if we called 979 /* we could deadlock if we called
978 filemap_fdatawait from here so tell 980 filemap_fdatawait from here so tell
@@ -982,31 +984,14 @@ static ssize_t cifs_write(struct cifsFileInfo *open_file,
982 if (rc != 0) 984 if (rc != 0)
983 break; 985 break;
984 } 986 }
985 if (experimEnabled || (pTcon->ses->server && 987
986 ((pTcon->ses->server->secMode & 988 len = min((size_t)cifs_sb->wsize,
987 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) 989 write_size - total_written);
988 == 0))) { 990 /* iov[0] is reserved for smb header */
989 struct kvec iov[2]; 991 iov[1].iov_base = (char *)write_data + total_written;
990 unsigned int len; 992 iov[1].iov_len = len;
991 993 rc = CIFSSMBWrite2(xid, pTcon, open_file->netfid, len,
992 len = min((size_t)cifs_sb->wsize, 994 *poffset, &bytes_written, iov, 1, 0);
993 write_size - total_written);
994 /* iov[0] is reserved for smb header */
995 iov[1].iov_base = (char *)write_data +
996 total_written;
997 iov[1].iov_len = len;
998 rc = CIFSSMBWrite2(xid, pTcon,
999 open_file->netfid, len,
1000 *poffset, &bytes_written,
1001 iov, 1, 0);
1002 } else
1003 rc = CIFSSMBWrite(xid, pTcon,
1004 open_file->netfid,
1005 min_t(const int, cifs_sb->wsize,
1006 write_size - total_written),
1007 *poffset, &bytes_written,
1008 write_data + total_written,
1009 NULL, 0);
1010 } 995 }
1011 if (rc || (bytes_written == 0)) { 996 if (rc || (bytes_written == 0)) {
1012 if (total_written) 997 if (total_written)
@@ -1146,7 +1131,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1146 char *write_data; 1131 char *write_data;
1147 int rc = -EFAULT; 1132 int rc = -EFAULT;
1148 int bytes_written = 0; 1133 int bytes_written = 0;
1149 struct cifs_sb_info *cifs_sb;
1150 struct inode *inode; 1134 struct inode *inode;
1151 struct cifsFileInfo *open_file; 1135 struct cifsFileInfo *open_file;
1152 1136
@@ -1154,7 +1138,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
1154 return -EFAULT; 1138 return -EFAULT;
1155 1139
1156 inode = page->mapping->host; 1140 inode = page->mapping->host;
1157 cifs_sb = CIFS_SB(inode->i_sb);
1158 1141
1159 offset += (loff_t)from; 1142 offset += (loff_t)from;
1160 write_data = kmap(page); 1143 write_data = kmap(page);
@@ -1245,12 +1228,6 @@ static int cifs_writepages(struct address_space *mapping,
1245 } 1228 }
1246 1229
1247 tcon = tlink_tcon(open_file->tlink); 1230 tcon = tlink_tcon(open_file->tlink);
1248 if (!experimEnabled && tcon->ses->server->secMode &
1249 (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
1250 cifsFileInfo_put(open_file);
1251 kfree(iov);
1252 return generic_writepages(mapping, wbc);
1253 }
1254 cifsFileInfo_put(open_file); 1231 cifsFileInfo_put(open_file);
1255 1232
1256 xid = GetXid(); 1233 xid = GetXid();
@@ -1574,34 +1551,6 @@ int cifs_fsync(struct file *file, int datasync)
1574 return rc; 1551 return rc;
1575} 1552}
1576 1553
1577/* static void cifs_sync_page(struct page *page)
1578{
1579 struct address_space *mapping;
1580 struct inode *inode;
1581 unsigned long index = page->index;
1582 unsigned int rpages = 0;
1583 int rc = 0;
1584
1585 cFYI(1, "sync page %p", page);
1586 mapping = page->mapping;
1587 if (!mapping)
1588 return 0;
1589 inode = mapping->host;
1590 if (!inode)
1591 return; */
1592
1593/* fill in rpages then
1594 result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */
1595
1596/* cFYI(1, "rpages is %d for sync page of Index %ld", rpages, index);
1597
1598#if 0
1599 if (rc < 0)
1600 return rc;
1601 return 0;
1602#endif
1603} */
1604
1605/* 1554/*
1606 * As file closes, flush all cached write data for this inode checking 1555 * As file closes, flush all cached write data for this inode checking
1607 * for write behind errors. 1556 * for write behind errors.
@@ -1667,9 +1616,10 @@ static ssize_t
1667cifs_iovec_write(struct file *file, const struct iovec *iov, 1616cifs_iovec_write(struct file *file, const struct iovec *iov,
1668 unsigned long nr_segs, loff_t *poffset) 1617 unsigned long nr_segs, loff_t *poffset)
1669{ 1618{
1670 size_t total_written = 0, written = 0; 1619 unsigned int written;
1671 unsigned long num_pages, npages; 1620 unsigned long num_pages, npages, i;
1672 size_t copied, len, cur_len, i; 1621 size_t copied, len, cur_len;
1622 ssize_t total_written = 0;
1673 struct kvec *to_send; 1623 struct kvec *to_send;
1674 struct page **pages; 1624 struct page **pages;
1675 struct iov_iter it; 1625 struct iov_iter it;
@@ -1825,7 +1775,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov,
1825{ 1775{
1826 int rc; 1776 int rc;
1827 int xid; 1777 int xid;
1828 unsigned int total_read, bytes_read = 0; 1778 ssize_t total_read;
1779 unsigned int bytes_read = 0;
1829 size_t len, cur_len; 1780 size_t len, cur_len;
1830 int iov_offset = 0; 1781 int iov_offset = 0;
1831 struct cifs_sb_info *cifs_sb; 1782 struct cifs_sb_info *cifs_sb;
@@ -2011,6 +1962,24 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
2011 return total_read; 1962 return total_read;
2012} 1963}
2013 1964
1965/*
1966 * If the page is mmap'ed into a process' page tables, then we need to make
1967 * sure that it doesn't change while being written back.
1968 */
1969static int
1970cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1971{
1972 struct page *page = vmf->page;
1973
1974 lock_page(page);
1975 return VM_FAULT_LOCKED;
1976}
1977
1978static struct vm_operations_struct cifs_file_vm_ops = {
1979 .fault = filemap_fault,
1980 .page_mkwrite = cifs_page_mkwrite,
1981};
1982
2014int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 1983int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
2015{ 1984{
2016 int rc, xid; 1985 int rc, xid;
@@ -2022,6 +1991,8 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
2022 cifs_invalidate_mapping(inode); 1991 cifs_invalidate_mapping(inode);
2023 1992
2024 rc = generic_file_mmap(file, vma); 1993 rc = generic_file_mmap(file, vma);
1994 if (rc == 0)
1995 vma->vm_ops = &cifs_file_vm_ops;
2025 FreeXid(xid); 1996 FreeXid(xid);
2026 return rc; 1997 return rc;
2027} 1998}
@@ -2038,6 +2009,8 @@ int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
2038 return rc; 2009 return rc;
2039 } 2010 }
2040 rc = generic_file_mmap(file, vma); 2011 rc = generic_file_mmap(file, vma);
2012 if (rc == 0)
2013 vma->vm_ops = &cifs_file_vm_ops;
2041 FreeXid(xid); 2014 FreeXid(xid);
2042 return rc; 2015 return rc;
2043} 2016}
@@ -2513,7 +2486,6 @@ const struct address_space_operations cifs_addr_ops = {
2513 .set_page_dirty = __set_page_dirty_nobuffers, 2486 .set_page_dirty = __set_page_dirty_nobuffers,
2514 .releasepage = cifs_release_page, 2487 .releasepage = cifs_release_page,
2515 .invalidatepage = cifs_invalidate_page, 2488 .invalidatepage = cifs_invalidate_page,
2516 /* .sync_page = cifs_sync_page, */
2517 /* .direct_IO = */ 2489 /* .direct_IO = */
2518}; 2490};
2519 2491
@@ -2531,6 +2503,5 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
2531 .set_page_dirty = __set_page_dirty_nobuffers, 2503 .set_page_dirty = __set_page_dirty_nobuffers,
2532 .releasepage = cifs_release_page, 2504 .releasepage = cifs_release_page,
2533 .invalidatepage = cifs_invalidate_page, 2505 .invalidatepage = cifs_invalidate_page,
2534 /* .sync_page = cifs_sync_page, */
2535 /* .direct_IO = */ 2506 /* .direct_IO = */
2536}; 2507};
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 02cd60aefbff..ce417a9764a3 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -55,8 +55,9 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash)
55 55
56 md5 = crypto_alloc_shash("md5", 0, 0); 56 md5 = crypto_alloc_shash("md5", 0, 0);
57 if (IS_ERR(md5)) { 57 if (IS_ERR(md5)) {
58 rc = PTR_ERR(md5);
58 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc); 59 cERROR(1, "%s: Crypto md5 allocation error %d\n", __func__, rc);
59 return PTR_ERR(md5); 60 return rc;
60 } 61 }
61 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5); 62 size = sizeof(struct shash_desc) + crypto_shash_descsize(md5);
62 sdescmd5 = kmalloc(size, GFP_KERNEL); 63 sdescmd5 = kmalloc(size, GFP_KERNEL);
@@ -238,7 +239,7 @@ CIFSQueryMFSymLink(const int xid, struct cifsTconInfo *tcon,
238 if (rc != 0) 239 if (rc != 0)
239 return rc; 240 return rc;
240 241
241 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { 242 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
242 CIFSSMBClose(xid, tcon, netfid); 243 CIFSSMBClose(xid, tcon, netfid);
243 /* it's not a symlink */ 244 /* it's not a symlink */
244 return -EINVAL; 245 return -EINVAL;
@@ -315,7 +316,7 @@ CIFSCheckMFSymlink(struct cifs_fattr *fattr,
315 if (rc != 0) 316 if (rc != 0)
316 goto out; 317 goto out;
317 318
318 if (file_info.EndOfFile != CIFS_MF_SYMLINK_FILE_SIZE) { 319 if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
319 CIFSSMBClose(xid, pTcon, netfid); 320 CIFSSMBClose(xid, pTcon, netfid);
320 /* it's not a symlink */ 321 /* it's not a symlink */
321 goto out; 322 goto out;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index a09e077ba925..0c684ae4c071 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -100,6 +100,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free)
100 memset(buf_to_free->password, 0, strlen(buf_to_free->password)); 100 memset(buf_to_free->password, 0, strlen(buf_to_free->password));
101 kfree(buf_to_free->password); 101 kfree(buf_to_free->password);
102 } 102 }
103 kfree(buf_to_free->user_name);
103 kfree(buf_to_free->domainName); 104 kfree(buf_to_free->domainName);
104 kfree(buf_to_free); 105 kfree(buf_to_free);
105} 106}
@@ -236,10 +237,7 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
236{ 237{
237 __u16 mid = 0; 238 __u16 mid = 0;
238 __u16 last_mid; 239 __u16 last_mid;
239 int collision; 240 bool collision;
240
241 if (server == NULL)
242 return mid;
243 241
244 spin_lock(&GlobalMid_Lock); 242 spin_lock(&GlobalMid_Lock);
245 last_mid = server->CurrentMid; /* we do not want to loop forever */ 243 last_mid = server->CurrentMid; /* we do not want to loop forever */
@@ -252,24 +250,38 @@ __u16 GetNextMid(struct TCP_Server_Info *server)
252 (and it would also have to have been a request that 250 (and it would also have to have been a request that
253 did not time out) */ 251 did not time out) */
254 while (server->CurrentMid != last_mid) { 252 while (server->CurrentMid != last_mid) {
255 struct list_head *tmp;
256 struct mid_q_entry *mid_entry; 253 struct mid_q_entry *mid_entry;
254 unsigned int num_mids;
257 255
258 collision = 0; 256 collision = false;
259 if (server->CurrentMid == 0) 257 if (server->CurrentMid == 0)
260 server->CurrentMid++; 258 server->CurrentMid++;
261 259
262 list_for_each(tmp, &server->pending_mid_q) { 260 num_mids = 0;
263 mid_entry = list_entry(tmp, struct mid_q_entry, qhead); 261 list_for_each_entry(mid_entry, &server->pending_mid_q, qhead) {
264 262 ++num_mids;
265 if ((mid_entry->mid == server->CurrentMid) && 263 if (mid_entry->mid == server->CurrentMid &&
266 (mid_entry->midState == MID_REQUEST_SUBMITTED)) { 264 mid_entry->midState == MID_REQUEST_SUBMITTED) {
267 /* This mid is in use, try a different one */ 265 /* This mid is in use, try a different one */
268 collision = 1; 266 collision = true;
269 break; 267 break;
270 } 268 }
271 } 269 }
272 if (collision == 0) { 270
271 /*
272 * if we have more than 32k mids in the list, then something
273 * is very wrong. Possibly a local user is trying to DoS the
274 * box by issuing long-running calls and SIGKILL'ing them. If
275 * we get to 2^16 mids then we're in big trouble as this
276 * function could loop forever.
277 *
278 * Go ahead and assign out the mid in this situation, but force
279 * an eventual reconnect to clean out the pending_mid_q.
280 */
281 if (num_mids > 32768)
282 server->tcpStatus = CifsNeedReconnect;
283
284 if (!collision) {
273 mid = server->CurrentMid; 285 mid = server->CurrentMid;
274 break; 286 break;
275 } 287 }
@@ -381,29 +393,31 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ ,
381} 393}
382 394
383static int 395static int
384checkSMBhdr(struct smb_hdr *smb, __u16 mid) 396check_smb_hdr(struct smb_hdr *smb, __u16 mid)
385{ 397{
386 /* Make sure that this really is an SMB, that it is a response, 398 /* does it have the right SMB "signature" ? */
387 and that the message ids match */ 399 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff)) {
388 if ((*(__le32 *) smb->Protocol == cpu_to_le32(0x424d53ff)) && 400 cERROR(1, "Bad protocol string signature header 0x%x",
389 (mid == smb->Mid)) { 401 *(unsigned int *)smb->Protocol);
390 if (smb->Flags & SMBFLG_RESPONSE) 402 return 1;
391 return 0; 403 }
392 else { 404
393 /* only one valid case where server sends us request */ 405 /* Make sure that message ids match */
394 if (smb->Command == SMB_COM_LOCKING_ANDX) 406 if (mid != smb->Mid) {
395 return 0; 407 cERROR(1, "Mids do not match. received=%u expected=%u",
396 else 408 smb->Mid, mid);
397 cERROR(1, "Received Request not response"); 409 return 1;
398 }
399 } else { /* bad signature or mid */
400 if (*(__le32 *) smb->Protocol != cpu_to_le32(0x424d53ff))
401 cERROR(1, "Bad protocol string signature header %x",
402 *(unsigned int *) smb->Protocol);
403 if (mid != smb->Mid)
404 cERROR(1, "Mids do not match");
405 } 410 }
406 cERROR(1, "bad smb detected. The Mid=%d", smb->Mid); 411
412 /* if it's a response then accept */
413 if (smb->Flags & SMBFLG_RESPONSE)
414 return 0;
415
416 /* only one valid case where server sends us request */
417 if (smb->Command == SMB_COM_LOCKING_ANDX)
418 return 0;
419
420 cERROR(1, "Server sent request, not response. mid=%u", smb->Mid);
407 return 1; 421 return 1;
408} 422}
409 423
@@ -448,7 +462,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
448 return 1; 462 return 1;
449 } 463 }
450 464
451 if (checkSMBhdr(smb, mid)) 465 if (check_smb_hdr(smb, mid))
452 return 1; 466 return 1;
453 clc_len = smbCalcSize_LE(smb); 467 clc_len = smbCalcSize_LE(smb);
454 468
@@ -465,25 +479,26 @@ checkSMB(struct smb_hdr *smb, __u16 mid, unsigned int length)
465 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) 479 if (((4 + len) & 0xFFFF) == (clc_len & 0xFFFF))
466 return 0; /* bcc wrapped */ 480 return 0; /* bcc wrapped */
467 } 481 }
468 cFYI(1, "Calculated size %d vs length %d mismatch for mid %d", 482 cFYI(1, "Calculated size %u vs length %u mismatch for mid=%u",
469 clc_len, 4 + len, smb->Mid); 483 clc_len, 4 + len, smb->Mid);
470 /* Windows XP can return a few bytes too much, presumably 484
471 an illegal pad, at the end of byte range lock responses 485 if (4 + len < clc_len) {
472 so we allow for that three byte pad, as long as actual 486 cERROR(1, "RFC1001 size %u smaller than SMB for mid=%u",
473 received length is as long or longer than calculated length */
474 /* We have now had to extend this more, since there is a
475 case in which it needs to be bigger still to handle a
476 malformed response to transact2 findfirst from WinXP when
477 access denied is returned and thus bcc and wct are zero
478 but server says length is 0x21 bytes too long as if the server
479 forget to reset the smb rfc1001 length when it reset the
480 wct and bcc to minimum size and drop the t2 parms and data */
481 if ((4+len > clc_len) && (len <= clc_len + 512))
482 return 0;
483 else {
484 cERROR(1, "RFC1001 size %d bigger than SMB for Mid=%d",
485 len, smb->Mid); 487 len, smb->Mid);
486 return 1; 488 return 1;
489 } else if (len > clc_len + 512) {
490 /*
491 * Some servers (Windows XP in particular) send more
492 * data than the lengths in the SMB packet would
493 * indicate on certain calls (byte range locks and
494 * trans2 find first calls in particular). While the
495 * client can handle such a frame by ignoring the
496 * trailing data, we choose limit the amount of extra
497 * data to 512 bytes.
498 */
499 cERROR(1, "RFC1001 size %u more than 512 bytes larger "
500 "than SMB for mid=%u", len, smb->Mid);
501 return 1;
487 } 502 }
488 } 503 }
489 return 0; 504 return 0;
@@ -506,7 +521,7 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv)
506 (struct smb_com_transaction_change_notify_rsp *)buf; 521 (struct smb_com_transaction_change_notify_rsp *)buf;
507 struct file_notify_information *pnotify; 522 struct file_notify_information *pnotify;
508 __u32 data_offset = 0; 523 __u32 data_offset = 0;
509 if (pSMBr->ByteCount > sizeof(struct file_notify_information)) { 524 if (get_bcc_le(buf) > sizeof(struct file_notify_information)) {
510 data_offset = le32_to_cpu(pSMBr->DataOffset); 525 data_offset = le32_to_cpu(pSMBr->DataOffset);
511 526
512 pnotify = (struct file_notify_information *) 527 pnotify = (struct file_notify_information *)
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f64477..79f641eeda30 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
170{ 170{
171 int rc, alen, slen; 171 int rc, alen, slen;
172 const char *pct; 172 const char *pct;
173 char *endp, scope_id[13]; 173 char scope_id[13];
174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst; 174 struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; 175 struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
176 176
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
197 memcpy(scope_id, pct + 1, slen); 197 memcpy(scope_id, pct + 1, slen);
198 scope_id[slen] = '\0'; 198 scope_id[slen] = '\0';
199 199
200 s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); 200 rc = strict_strtoul(scope_id, 0,
201 if (endp != scope_id + slen) 201 (unsigned long *)&s6->sin6_scope_id);
202 return 0; 202 rc = (rc == 0) ? 1 : 0;
203 } 203 }
204 204
205 return rc; 205 return rc;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 7f25cc3d2256..f8e4cd2a7912 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -764,7 +764,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
764{ 764{
765 int rc = 0; 765 int rc = 0;
766 int xid, i; 766 int xid, i;
767 struct cifs_sb_info *cifs_sb;
768 struct cifsTconInfo *pTcon; 767 struct cifsTconInfo *pTcon;
769 struct cifsFileInfo *cifsFile = NULL; 768 struct cifsFileInfo *cifsFile = NULL;
770 char *current_entry; 769 char *current_entry;
@@ -775,8 +774,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir)
775 774
776 xid = GetXid(); 775 xid = GetXid();
777 776
778 cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
779
780 /* 777 /*
781 * Ensure FindFirst doesn't fail before doing filldir() for '.' and 778 * Ensure FindFirst doesn't fail before doing filldir() for '.' and
782 * '..'. Otherwise we won't be able to notify VFS in case of failure. 779 * '..'. Otherwise we won't be able to notify VFS in case of failure.
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a344..f6728eb6f4b9 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -219,12 +219,12 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
219 bcc_ptr++; 219 bcc_ptr++;
220 } */ 220 } */
221 /* copy user */ 221 /* copy user */
222 if (ses->userName == NULL) { 222 if (ses->user_name == NULL) {
223 /* null user mount */ 223 /* null user mount */
224 *bcc_ptr = 0; 224 *bcc_ptr = 0;
225 *(bcc_ptr+1) = 0; 225 *(bcc_ptr+1) = 0;
226 } else { 226 } else {
227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, 227 bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->user_name,
228 MAX_USERNAME_SIZE, nls_cp); 228 MAX_USERNAME_SIZE, nls_cp);
229 } 229 }
230 bcc_ptr += 2 * bytes_ret; 230 bcc_ptr += 2 * bytes_ret;
@@ -244,12 +244,11 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifsSesInfo *ses,
244 /* copy user */ 244 /* copy user */
245 /* BB what about null user mounts - check that we do this BB */ 245 /* BB what about null user mounts - check that we do this BB */
246 /* copy user */ 246 /* copy user */
247 if (ses->userName == NULL) { 247 if (ses->user_name != NULL)
248 /* BB what about null user mounts - check that we do this BB */ 248 strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
249 } else { 249 /* else null user mount */
250 strncpy(bcc_ptr, ses->userName, MAX_USERNAME_SIZE); 250
251 } 251 bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
252 bcc_ptr += strnlen(ses->userName, MAX_USERNAME_SIZE);
253 *bcc_ptr = 0; 252 *bcc_ptr = 0;
254 bcc_ptr++; /* account for null termination */ 253 bcc_ptr++; /* account for null termination */
255 254
@@ -405,8 +404,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
405 /* BB spec says that if AvId field of MsvAvTimestamp is populated then 404 /* BB spec says that if AvId field of MsvAvTimestamp is populated then
406 we must set the MIC field of the AUTHENTICATE_MESSAGE */ 405 we must set the MIC field of the AUTHENTICATE_MESSAGE */
407 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags); 406 ses->ntlmssp->server_flags = le32_to_cpu(pblob->NegotiateFlags);
408 tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); 407 tioffset = le32_to_cpu(pblob->TargetInfoArray.BufferOffset);
409 tilen = cpu_to_le16(pblob->TargetInfoArray.Length); 408 tilen = le16_to_cpu(pblob->TargetInfoArray.Length);
410 if (tilen) { 409 if (tilen) {
411 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL); 410 ses->auth_key.response = kmalloc(tilen, GFP_KERNEL);
412 if (!ses->auth_key.response) { 411 if (!ses->auth_key.response) {
@@ -523,14 +522,14 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
523 tmp += len; 522 tmp += len;
524 } 523 }
525 524
526 if (ses->userName == NULL) { 525 if (ses->user_name == NULL) {
527 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 526 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
528 sec_blob->UserName.Length = 0; 527 sec_blob->UserName.Length = 0;
529 sec_blob->UserName.MaximumLength = 0; 528 sec_blob->UserName.MaximumLength = 0;
530 tmp += 2; 529 tmp += 2;
531 } else { 530 } else {
532 int len; 531 int len;
533 len = cifs_strtoUCS((__le16 *)tmp, ses->userName, 532 len = cifs_strtoUCS((__le16 *)tmp, ses->user_name,
534 MAX_USERNAME_SIZE, nls_cp); 533 MAX_USERNAME_SIZE, nls_cp);
535 len *= 2; /* unicode is 2 bytes each */ 534 len *= 2; /* unicode is 2 bytes each */
536 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); 535 sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -656,13 +655,13 @@ ssetup_ntlmssp_authenticate:
656 655
657 if (type == LANMAN) { 656 if (type == LANMAN) {
658#ifdef CONFIG_CIFS_WEAK_PW_HASH 657#ifdef CONFIG_CIFS_WEAK_PW_HASH
659 char lnm_session_key[CIFS_SESS_KEY_SIZE]; 658 char lnm_session_key[CIFS_AUTH_RESP_SIZE];
660 659
661 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; 660 pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
662 661
663 /* no capabilities flags in old lanman negotiation */ 662 /* no capabilities flags in old lanman negotiation */
664 663
665 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); 664 pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
666 665
667 /* Calculate hash with password and copy into bcc_ptr. 666 /* Calculate hash with password and copy into bcc_ptr.
668 * Encryption Key (stored as in cryptkey) gets used if the 667 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +674,8 @@ ssetup_ntlmssp_authenticate:
675 true : false, lnm_session_key); 674 true : false, lnm_session_key);
676 675
677 ses->flags |= CIFS_SES_LANMAN; 676 ses->flags |= CIFS_SES_LANMAN;
678 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); 677 memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
679 bcc_ptr += CIFS_SESS_KEY_SIZE; 678 bcc_ptr += CIFS_AUTH_RESP_SIZE;
680 679
681 /* can not sign if LANMAN negotiated so no need 680 /* can not sign if LANMAN negotiated so no need
682 to calculate signing key? but what if server 681 to calculate signing key? but what if server
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index b5450e9f40c0..b5041c849981 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -58,8 +58,9 @@ mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len)
58 58
59 md4 = crypto_alloc_shash("md4", 0, 0); 59 md4 = crypto_alloc_shash("md4", 0, 0);
60 if (IS_ERR(md4)) { 60 if (IS_ERR(md4)) {
61 rc = PTR_ERR(md4);
61 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc); 62 cERROR(1, "%s: Crypto md4 allocation error %d\n", __func__, rc);
62 return PTR_ERR(md4); 63 return rc;
63 } 64 }
64 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4); 65 size = sizeof(struct shash_desc) + crypto_shash_descsize(md4);
65 sdescmd4 = kmalloc(size, GFP_KERNEL); 66 sdescmd4 = kmalloc(size, GFP_KERNEL);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index c1ccca1a933f..46d8756f2b24 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -236,9 +236,9 @@ smb_sendv(struct TCP_Server_Info *server, struct kvec *iov, int n_vec)
236 server->tcpStatus = CifsNeedReconnect; 236 server->tcpStatus = CifsNeedReconnect;
237 } 237 }
238 238
239 if (rc < 0) { 239 if (rc < 0 && rc != -EINTR)
240 cERROR(1, "Error %d sending data on socket to server", rc); 240 cERROR(1, "Error %d sending data on socket to server", rc);
241 } else 241 else
242 rc = 0; 242 rc = 0;
243 243
244 /* Don't want to modify the buffer as a 244 /* Don't want to modify the buffer as a
@@ -359,6 +359,10 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_hdr *in_buf,
359 if (rc) 359 if (rc)
360 return rc; 360 return rc;
361 361
362 /* enable signing if server requires it */
363 if (server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
364 in_buf->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
365
362 mutex_lock(&server->srv_mutex); 366 mutex_lock(&server->srv_mutex);
363 mid = AllocMidQEntry(in_buf, server); 367 mid = AllocMidQEntry(in_buf, server);
364 if (mid == NULL) { 368 if (mid == NULL) {
@@ -453,6 +457,9 @@ sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
453 case MID_RETRY_NEEDED: 457 case MID_RETRY_NEEDED:
454 rc = -EAGAIN; 458 rc = -EAGAIN;
455 break; 459 break;
460 case MID_RESPONSE_MALFORMED:
461 rc = -EIO;
462 break;
456 default: 463 default:
457 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__, 464 cERROR(1, "%s: invalid mid state mid=%d state=%d", __func__,
458 mid->mid, mid->midState); 465 mid->mid, mid->midState);
@@ -570,17 +577,33 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
570#endif 577#endif
571 578
572 mutex_unlock(&ses->server->srv_mutex); 579 mutex_unlock(&ses->server->srv_mutex);
573 cifs_small_buf_release(in_buf);
574 580
575 if (rc < 0) 581 if (rc < 0) {
582 cifs_small_buf_release(in_buf);
576 goto out; 583 goto out;
584 }
577 585
578 if (long_op == CIFS_ASYNC_OP) 586 if (long_op == CIFS_ASYNC_OP) {
587 cifs_small_buf_release(in_buf);
579 goto out; 588 goto out;
589 }
580 590
581 rc = wait_for_response(ses->server, midQ); 591 rc = wait_for_response(ses->server, midQ);
582 if (rc != 0) 592 if (rc != 0) {
583 goto out; 593 send_nt_cancel(ses->server, in_buf, midQ);
594 spin_lock(&GlobalMid_Lock);
595 if (midQ->midState == MID_REQUEST_SUBMITTED) {
596 midQ->callback = DeleteMidQEntry;
597 spin_unlock(&GlobalMid_Lock);
598 cifs_small_buf_release(in_buf);
599 atomic_dec(&ses->server->inFlight);
600 wake_up(&ses->server->request_q);
601 return rc;
602 }
603 spin_unlock(&GlobalMid_Lock);
604 }
605
606 cifs_small_buf_release(in_buf);
584 607
585 rc = sync_mid_result(midQ, ses->server); 608 rc = sync_mid_result(midQ, ses->server);
586 if (rc != 0) { 609 if (rc != 0) {
@@ -724,8 +747,19 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
724 goto out; 747 goto out;
725 748
726 rc = wait_for_response(ses->server, midQ); 749 rc = wait_for_response(ses->server, midQ);
727 if (rc != 0) 750 if (rc != 0) {
728 goto out; 751 send_nt_cancel(ses->server, in_buf, midQ);
752 spin_lock(&GlobalMid_Lock);
753 if (midQ->midState == MID_REQUEST_SUBMITTED) {
754 /* no longer considered to be "in-flight" */
755 midQ->callback = DeleteMidQEntry;
756 spin_unlock(&GlobalMid_Lock);
757 atomic_dec(&ses->server->inFlight);
758 wake_up(&ses->server->request_q);
759 return rc;
760 }
761 spin_unlock(&GlobalMid_Lock);
762 }
729 763
730 rc = sync_mid_result(midQ, ses->server); 764 rc = sync_mid_result(midQ, ses->server);
731 if (rc != 0) { 765 if (rc != 0) {
@@ -922,10 +956,21 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
922 } 956 }
923 } 957 }
924 958
925 if (wait_for_response(ses->server, midQ) == 0) { 959 rc = wait_for_response(ses->server, midQ);
926 /* We got the response - restart system call. */ 960 if (rc) {
927 rstart = 1; 961 send_nt_cancel(ses->server, in_buf, midQ);
962 spin_lock(&GlobalMid_Lock);
963 if (midQ->midState == MID_REQUEST_SUBMITTED) {
964 /* no longer considered to be "in-flight" */
965 midQ->callback = DeleteMidQEntry;
966 spin_unlock(&GlobalMid_Lock);
967 return rc;
968 }
969 spin_unlock(&GlobalMid_Lock);
928 } 970 }
971
972 /* We got the response - restart system call. */
973 rstart = 1;
929 } 974 }
930 975
931 rc = sync_mid_result(midQ, ses->server); 976 rc = sync_mid_result(midQ, ses->server);
diff --git a/fs/coda/Makefile b/fs/coda/Makefile
index 6c22e61da397..1bab69a0d347 100644
--- a/fs/coda/Makefile
+++ b/fs/coda/Makefile
@@ -9,4 +9,4 @@ coda-objs := psdev.o cache.o cnode.o inode.o dir.o file.o upcall.o \
9 9
10# If you want debugging output, please uncomment the following line. 10# If you want debugging output, please uncomment the following line.
11 11
12# EXTRA_CFLAGS += -DDEBUG -DDEBUG_SMB_MALLOC=1 12# ccflags-y := -DDEBUG -DDEBUG_SMB_MALLOC=1
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index c6405ce3c50e..af56ad56a89a 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -13,7 +13,6 @@
13 13
14#ifdef CONFIG_SYSCTL 14#ifdef CONFIG_SYSCTL
15static struct ctl_table_header *fs_table_header; 15static struct ctl_table_header *fs_table_header;
16#endif
17 16
18static ctl_table coda_table[] = { 17static ctl_table coda_table[] = {
19 { 18 {
@@ -40,7 +39,6 @@ static ctl_table coda_table[] = {
40 {} 39 {}
41}; 40};
42 41
43#ifdef CONFIG_SYSCTL
44static ctl_table fs_table[] = { 42static ctl_table fs_table[] = {
45 { 43 {
46 .procname = "coda", 44 .procname = "coda",
@@ -49,22 +47,27 @@ static ctl_table fs_table[] = {
49 }, 47 },
50 {} 48 {}
51}; 49};
52#endif
53 50
54void coda_sysctl_init(void) 51void coda_sysctl_init(void)
55{ 52{
56#ifdef CONFIG_SYSCTL
57 if ( !fs_table_header ) 53 if ( !fs_table_header )
58 fs_table_header = register_sysctl_table(fs_table); 54 fs_table_header = register_sysctl_table(fs_table);
59#endif
60} 55}
61 56
62void coda_sysctl_clean(void) 57void coda_sysctl_clean(void)
63{ 58{
64#ifdef CONFIG_SYSCTL
65 if ( fs_table_header ) { 59 if ( fs_table_header ) {
66 unregister_sysctl_table(fs_table_header); 60 unregister_sysctl_table(fs_table_header);
67 fs_table_header = NULL; 61 fs_table_header = NULL;
68 } 62 }
69#endif
70} 63}
64
65#else
66void coda_sysctl_init(void)
67{
68}
69
70void coda_sysctl_clean(void)
71{
72}
73#endif
diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..72fe6cda9108 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
262 */ 262 */
263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf) 263asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
264{ 264{
265 struct path path; 265 struct kstatfs tmp;
266 int error; 266 int error = user_statfs(pathname, &tmp);
267 267 if (!error)
268 error = user_path(pathname, &path); 268 error = put_compat_statfs(buf, &tmp);
269 if (!error) {
270 struct kstatfs tmp;
271 error = vfs_statfs(&path, &tmp);
272 if (!error)
273 error = put_compat_statfs(buf, &tmp);
274 path_put(&path);
275 }
276 return error; 269 return error;
277} 270}
278 271
279asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf) 272asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
280{ 273{
281 struct file * file;
282 struct kstatfs tmp; 274 struct kstatfs tmp;
283 int error; 275 int error = fd_statfs(fd, &tmp);
284
285 error = -EBADF;
286 file = fget(fd);
287 if (!file)
288 goto out;
289 error = vfs_statfs(&file->f_path, &tmp);
290 if (!error) 276 if (!error)
291 error = put_compat_statfs(buf, &tmp); 277 error = put_compat_statfs(buf, &tmp);
292 fput(file);
293out:
294 return error; 278 return error;
295} 279}
296 280
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
329 313
330asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf) 314asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
331{ 315{
332 struct path path; 316 struct kstatfs tmp;
333 int error; 317 int error;
334 318
335 if (sz != sizeof(*buf)) 319 if (sz != sizeof(*buf))
336 return -EINVAL; 320 return -EINVAL;
337 321
338 error = user_path(pathname, &path); 322 error = user_statfs(pathname, &tmp);
339 if (!error) { 323 if (!error)
340 struct kstatfs tmp; 324 error = put_compat_statfs64(buf, &tmp);
341 error = vfs_statfs(&path, &tmp);
342 if (!error)
343 error = put_compat_statfs64(buf, &tmp);
344 path_put(&path);
345 }
346 return error; 325 return error;
347} 326}
348 327
349asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf) 328asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
350{ 329{
351 struct file * file;
352 struct kstatfs tmp; 330 struct kstatfs tmp;
353 int error; 331 int error;
354 332
355 if (sz != sizeof(*buf)) 333 if (sz != sizeof(*buf))
356 return -EINVAL; 334 return -EINVAL;
357 335
358 error = -EBADF; 336 error = fd_statfs(fd, &tmp);
359 file = fget(fd);
360 if (!file)
361 goto out;
362 error = vfs_statfs(&file->f_path, &tmp);
363 if (!error) 337 if (!error)
364 error = put_compat_statfs64(buf, &tmp); 338 error = put_compat_statfs64(buf, &tmp);
365 fput(file);
366out:
367 return error; 339 return error;
368} 340}
369 341
@@ -1228,7 +1200,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
1228 file = fget_light(fd, &fput_needed); 1200 file = fget_light(fd, &fput_needed);
1229 if (!file) 1201 if (!file)
1230 return -EBADF; 1202 return -EBADF;
1231 ret = compat_readv(file, vec, vlen, &pos); 1203 ret = -ESPIPE;
1204 if (file->f_mode & FMODE_PREAD)
1205 ret = compat_readv(file, vec, vlen, &pos);
1232 fput_light(file, fput_needed); 1206 fput_light(file, fput_needed);
1233 return ret; 1207 return ret;
1234} 1208}
@@ -1285,7 +1259,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
1285 file = fget_light(fd, &fput_needed); 1259 file = fget_light(fd, &fput_needed);
1286 if (!file) 1260 if (!file)
1287 return -EBADF; 1261 return -EBADF;
1288 ret = compat_writev(file, vec, vlen, &pos); 1262 ret = -ESPIPE;
1263 if (file->f_mode & FMODE_PWRITE)
1264 ret = compat_writev(file, vec, vlen, &pos);
1289 fput_light(file, fput_needed); 1265 fput_light(file, fput_needed);
1290 return ret; 1266 return ret;
1291} 1267}
@@ -1695,9 +1671,6 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
1695 * Update: ERESTARTSYS breaks at least the xview clock binary, so 1671 * Update: ERESTARTSYS breaks at least the xview clock binary, so
1696 * I'm trying ERESTARTNOHAND which restart only when you want to. 1672 * I'm trying ERESTARTNOHAND which restart only when you want to.
1697 */ 1673 */
1698#define MAX_SELECT_SECONDS \
1699 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
1700
1701int compat_core_sys_select(int n, compat_ulong_t __user *inp, 1674int compat_core_sys_select(int n, compat_ulong_t __user *inp,
1702 compat_ulong_t __user *outp, compat_ulong_t __user *exp, 1675 compat_ulong_t __user *outp, compat_ulong_t __user *exp,
1703 struct timespec *end_time) 1676 struct timespec *end_time)
@@ -2308,3 +2281,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
2308} 2281}
2309 2282
2310#endif /* CONFIG_TIMERFD */ 2283#endif /* CONFIG_TIMERFD */
2284
2285#ifdef CONFIG_FHANDLE
2286/*
2287 * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
2288 * doesn't set the O_LARGEFILE flag.
2289 */
2290asmlinkage long
2291compat_sys_open_by_handle_at(int mountdirfd,
2292 struct file_handle __user *handle, int flags)
2293{
2294 return do_handle_open(mountdirfd, handle, flags);
2295}
2296#endif
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 90ff3cb10de3..3313dd19f543 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -990,7 +990,7 @@ static int configfs_dump(struct configfs_dirent *sd, int level)
990 * This describes these functions and their helpers. 990 * This describes these functions and their helpers.
991 * 991 *
992 * Allow another kernel system to depend on a config_item. If this 992 * Allow another kernel system to depend on a config_item. If this
993 * happens, the item cannot go away until the dependant can live without 993 * happens, the item cannot go away until the dependent can live without
994 * it. The idea is to give client modules as simple an interface as 994 * it. The idea is to give client modules as simple an interface as
995 * possible. When a system asks them to depend on an item, they just 995 * possible. When a system asks them to depend on an item, they just
996 * call configfs_depend_item(). If the item is live and the client 996 * call configfs_depend_item(). If the item is live and the client
diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae97..129a35730994 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
296 __releases(parent->d_lock) 296 __releases(parent->d_lock)
297 __releases(dentry->d_inode->i_lock) 297 __releases(dentry->d_inode->i_lock)
298{ 298{
299 dentry->d_parent = NULL;
300 list_del(&dentry->d_u.d_child); 299 list_del(&dentry->d_u.d_child);
300 /*
301 * Inform try_to_ascend() that we are no longer attached to the
302 * dentry tree
303 */
304 dentry->d_flags |= DCACHE_DISCONNECTED;
301 if (parent) 305 if (parent)
302 spin_unlock(&parent->d_lock); 306 spin_unlock(&parent->d_lock);
303 dentry_iput(dentry); 307 dentry_iput(dentry);
@@ -1012,6 +1016,35 @@ void shrink_dcache_for_umount(struct super_block *sb)
1012} 1016}
1013 1017
1014/* 1018/*
1019 * This tries to ascend one level of parenthood, but
1020 * we can race with renaming, so we need to re-check
1021 * the parenthood after dropping the lock and check
1022 * that the sequence number still matches.
1023 */
1024static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
1025{
1026 struct dentry *new = old->d_parent;
1027
1028 rcu_read_lock();
1029 spin_unlock(&old->d_lock);
1030 spin_lock(&new->d_lock);
1031
1032 /*
1033 * might go back up the wrong parent if we have had a rename
1034 * or deletion
1035 */
1036 if (new != old->d_parent ||
1037 (old->d_flags & DCACHE_DISCONNECTED) ||
1038 (!locked && read_seqretry(&rename_lock, seq))) {
1039 spin_unlock(&new->d_lock);
1040 new = NULL;
1041 }
1042 rcu_read_unlock();
1043 return new;
1044}
1045
1046
1047/*
1015 * Search for at least 1 mount point in the dentry's subdirs. 1048 * Search for at least 1 mount point in the dentry's subdirs.
1016 * We descend to the next level whenever the d_subdirs 1049 * We descend to the next level whenever the d_subdirs
1017 * list is non-empty and continue searching. 1050 * list is non-empty and continue searching.
@@ -1066,24 +1099,10 @@ resume:
1066 * All done at this level ... ascend and resume the search. 1099 * All done at this level ... ascend and resume the search.
1067 */ 1100 */
1068 if (this_parent != parent) { 1101 if (this_parent != parent) {
1069 struct dentry *tmp; 1102 struct dentry *child = this_parent;
1070 struct dentry *child; 1103 this_parent = try_to_ascend(this_parent, locked, seq);
1071 1104 if (!this_parent)
1072 tmp = this_parent->d_parent;
1073 rcu_read_lock();
1074 spin_unlock(&this_parent->d_lock);
1075 child = this_parent;
1076 this_parent = tmp;
1077 spin_lock(&this_parent->d_lock);
1078 /* might go back up the wrong parent if we have had a rename
1079 * or deletion */
1080 if (this_parent != child->d_parent ||
1081 (!locked && read_seqretry(&rename_lock, seq))) {
1082 spin_unlock(&this_parent->d_lock);
1083 rcu_read_unlock();
1084 goto rename_retry; 1105 goto rename_retry;
1085 }
1086 rcu_read_unlock();
1087 next = child->d_u.d_child.next; 1106 next = child->d_u.d_child.next;
1088 goto resume; 1107 goto resume;
1089 } 1108 }
@@ -1181,24 +1200,10 @@ resume:
1181 * All done at this level ... ascend and resume the search. 1200 * All done at this level ... ascend and resume the search.
1182 */ 1201 */
1183 if (this_parent != parent) { 1202 if (this_parent != parent) {
1184 struct dentry *tmp; 1203 struct dentry *child = this_parent;
1185 struct dentry *child; 1204 this_parent = try_to_ascend(this_parent, locked, seq);
1186 1205 if (!this_parent)
1187 tmp = this_parent->d_parent;
1188 rcu_read_lock();
1189 spin_unlock(&this_parent->d_lock);
1190 child = this_parent;
1191 this_parent = tmp;
1192 spin_lock(&this_parent->d_lock);
1193 /* might go back up the wrong parent if we have had a rename
1194 * or deletion */
1195 if (this_parent != child->d_parent ||
1196 (!locked && read_seqretry(&rename_lock, seq))) {
1197 spin_unlock(&this_parent->d_lock);
1198 rcu_read_unlock();
1199 goto rename_retry; 1206 goto rename_retry;
1200 }
1201 rcu_read_unlock();
1202 next = child->d_u.d_child.next; 1207 next = child->d_u.d_child.next;
1203 goto resume; 1208 goto resume;
1204 } 1209 }
@@ -1523,6 +1528,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
1523} 1528}
1524EXPORT_SYMBOL(d_alloc_root); 1529EXPORT_SYMBOL(d_alloc_root);
1525 1530
1531static struct dentry * __d_find_any_alias(struct inode *inode)
1532{
1533 struct dentry *alias;
1534
1535 if (list_empty(&inode->i_dentry))
1536 return NULL;
1537 alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
1538 __dget(alias);
1539 return alias;
1540}
1541
1542static struct dentry * d_find_any_alias(struct inode *inode)
1543{
1544 struct dentry *de;
1545
1546 spin_lock(&inode->i_lock);
1547 de = __d_find_any_alias(inode);
1548 spin_unlock(&inode->i_lock);
1549 return de;
1550}
1551
1552
1526/** 1553/**
1527 * d_obtain_alias - find or allocate a dentry for a given inode 1554 * d_obtain_alias - find or allocate a dentry for a given inode
1528 * @inode: inode to allocate the dentry for 1555 * @inode: inode to allocate the dentry for
@@ -1552,7 +1579,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1552 if (IS_ERR(inode)) 1579 if (IS_ERR(inode))
1553 return ERR_CAST(inode); 1580 return ERR_CAST(inode);
1554 1581
1555 res = d_find_alias(inode); 1582 res = d_find_any_alias(inode);
1556 if (res) 1583 if (res)
1557 goto out_iput; 1584 goto out_iput;
1558 1585
@@ -1565,7 +1592,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
1565 1592
1566 1593
1567 spin_lock(&inode->i_lock); 1594 spin_lock(&inode->i_lock);
1568 res = __d_find_alias(inode, 0); 1595 res = __d_find_any_alias(inode);
1569 if (res) { 1596 if (res) {
1570 spin_unlock(&inode->i_lock); 1597 spin_unlock(&inode->i_lock);
1571 dput(tmp); 1598 dput(tmp);
@@ -1585,10 +1612,13 @@ struct dentry *d_obtain_alias(struct inode *inode)
1585 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first); 1612 __bit_spin_unlock(0, (unsigned long *)&tmp->d_sb->s_anon.first);
1586 spin_unlock(&tmp->d_lock); 1613 spin_unlock(&tmp->d_lock);
1587 spin_unlock(&inode->i_lock); 1614 spin_unlock(&inode->i_lock);
1615 security_d_instantiate(tmp, inode);
1588 1616
1589 return tmp; 1617 return tmp;
1590 1618
1591 out_iput: 1619 out_iput:
1620 if (res && !IS_ERR(res))
1621 security_d_instantiate(res, inode);
1592 iput(inode); 1622 iput(inode);
1593 return res; 1623 return res;
1594} 1624}
@@ -1781,7 +1811,7 @@ struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name,
1781 * false-negative result. d_lookup() protects against concurrent 1811 * false-negative result. d_lookup() protects against concurrent
1782 * renames using rename_lock seqlock. 1812 * renames using rename_lock seqlock.
1783 * 1813 *
1784 * See Documentation/vfs/dcache-locking.txt for more details. 1814 * See Documentation/filesystems/path-lookup.txt for more details.
1785 */ 1815 */
1786 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) { 1816 hlist_bl_for_each_entry_rcu(dentry, node, &b->head, d_hash) {
1787 struct inode *i; 1817 struct inode *i;
@@ -1901,7 +1931,7 @@ struct dentry *__d_lookup(struct dentry *parent, struct qstr *name)
1901 * false-negative result. d_lookup() protects against concurrent 1931 * false-negative result. d_lookup() protects against concurrent
1902 * renames using rename_lock seqlock. 1932 * renames using rename_lock seqlock.
1903 * 1933 *
1904 * See Documentation/vfs/dcache-locking.txt for more details. 1934 * See Documentation/filesystems/path-lookup.txt for more details.
1905 */ 1935 */
1906 rcu_read_lock(); 1936 rcu_read_lock();
1907 1937
@@ -2101,7 +2131,7 @@ EXPORT_SYMBOL(d_rehash);
2101 */ 2131 */
2102void dentry_update_name_case(struct dentry *dentry, struct qstr *name) 2132void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
2103{ 2133{
2104 BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); 2134 BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
2105 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ 2135 BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
2106 2136
2107 spin_lock(&dentry->d_lock); 2137 spin_lock(&dentry->d_lock);
@@ -2920,28 +2950,14 @@ resume:
2920 spin_unlock(&dentry->d_lock); 2950 spin_unlock(&dentry->d_lock);
2921 } 2951 }
2922 if (this_parent != root) { 2952 if (this_parent != root) {
2923 struct dentry *tmp; 2953 struct dentry *child = this_parent;
2924 struct dentry *child;
2925
2926 tmp = this_parent->d_parent;
2927 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) { 2954 if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
2928 this_parent->d_flags |= DCACHE_GENOCIDE; 2955 this_parent->d_flags |= DCACHE_GENOCIDE;
2929 this_parent->d_count--; 2956 this_parent->d_count--;
2930 } 2957 }
2931 rcu_read_lock(); 2958 this_parent = try_to_ascend(this_parent, locked, seq);
2932 spin_unlock(&this_parent->d_lock); 2959 if (!this_parent)
2933 child = this_parent;
2934 this_parent = tmp;
2935 spin_lock(&this_parent->d_lock);
2936 /* might go back up the wrong parent if we have had a rename
2937 * or deletion */
2938 if (this_parent != child->d_parent ||
2939 (!locked && read_seqretry(&rename_lock, seq))) {
2940 spin_unlock(&this_parent->d_lock);
2941 rcu_read_unlock();
2942 goto rename_retry; 2960 goto rename_retry;
2943 }
2944 rcu_read_unlock();
2945 next = child->d_u.d_child.next; 2961 next = child->d_u.d_child.next;
2946 goto resume; 2962 goto resume;
2947 } 2963 }
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 37a8ca7c1222..e7a7a2f07324 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -13,9 +13,6 @@
13 * 13 *
14 */ 14 */
15 15
16/* uncomment to get debug messages from the debug filesystem, ah the irony. */
17/* #define DEBUG */
18
19#include <linux/module.h> 16#include <linux/module.h>
20#include <linux/fs.h> 17#include <linux/fs.h>
21#include <linux/mount.h> 18#include <linux/mount.h>
@@ -310,7 +307,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
310} 307}
311EXPORT_SYMBOL_GPL(debugfs_create_symlink); 308EXPORT_SYMBOL_GPL(debugfs_create_symlink);
312 309
313static void __debugfs_remove(struct dentry *dentry, struct dentry *parent) 310static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
314{ 311{
315 int ret = 0; 312 int ret = 0;
316 313
@@ -333,6 +330,7 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
333 dput(dentry); 330 dput(dentry);
334 } 331 }
335 } 332 }
333 return ret;
336} 334}
337 335
338/** 336/**
@@ -351,7 +349,8 @@ static void __debugfs_remove(struct dentry *dentry, struct dentry *parent)
351void debugfs_remove(struct dentry *dentry) 349void debugfs_remove(struct dentry *dentry)
352{ 350{
353 struct dentry *parent; 351 struct dentry *parent;
354 352 int ret;
353
355 if (!dentry) 354 if (!dentry)
356 return; 355 return;
357 356
@@ -360,9 +359,10 @@ void debugfs_remove(struct dentry *dentry)
360 return; 359 return;
361 360
362 mutex_lock(&parent->d_inode->i_mutex); 361 mutex_lock(&parent->d_inode->i_mutex);
363 __debugfs_remove(dentry, parent); 362 ret = __debugfs_remove(dentry, parent);
364 mutex_unlock(&parent->d_inode->i_mutex); 363 mutex_unlock(&parent->d_inode->i_mutex);
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count); 364 if (!ret)
365 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
366} 366}
367EXPORT_SYMBOL_GPL(debugfs_remove); 367EXPORT_SYMBOL_GPL(debugfs_remove);
368 368
@@ -540,17 +540,5 @@ static int __init debugfs_init(void)
540 540
541 return retval; 541 return retval;
542} 542}
543
544static void __exit debugfs_exit(void)
545{
546 debugfs_registered = false;
547
548 simple_release_fs(&debugfs_mount, &debugfs_mount_count);
549 unregister_filesystem(&debug_fs_type);
550 kobject_put(debug_kobj);
551}
552
553core_initcall(debugfs_init); 543core_initcall(debugfs_init);
554module_exit(debugfs_exit);
555MODULE_LICENSE("GPL");
556 544
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1bb547c9cad6..2f27e578d466 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -479,6 +479,7 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
479 struct dentry *root = sb->s_root; 479 struct dentry *root = sb->s_root;
480 struct pts_fs_info *fsi = DEVPTS_SB(sb); 480 struct pts_fs_info *fsi = DEVPTS_SB(sb);
481 struct pts_mount_opts *opts = &fsi->mount_opts; 481 struct pts_mount_opts *opts = &fsi->mount_opts;
482 int ret = 0;
482 char s[12]; 483 char s[12];
483 484
484 /* We're supposed to be given the slave end of a pty */ 485 /* We're supposed to be given the slave end of a pty */
@@ -501,14 +502,17 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
501 mutex_lock(&root->d_inode->i_mutex); 502 mutex_lock(&root->d_inode->i_mutex);
502 503
503 dentry = d_alloc_name(root, s); 504 dentry = d_alloc_name(root, s);
504 if (!IS_ERR(dentry)) { 505 if (dentry) {
505 d_add(dentry, inode); 506 d_add(dentry, inode);
506 fsnotify_create(root->d_inode, dentry); 507 fsnotify_create(root->d_inode, dentry);
508 } else {
509 iput(inode);
510 ret = -ENOMEM;
507 } 511 }
508 512
509 mutex_unlock(&root->d_inode->i_mutex); 513 mutex_unlock(&root->d_inode->i_mutex);
510 514
511 return 0; 515 return ret;
512} 516}
513 517
514struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) 518struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
@@ -544,17 +548,12 @@ void devpts_pty_kill(struct tty_struct *tty)
544 mutex_lock(&root->d_inode->i_mutex); 548 mutex_lock(&root->d_inode->i_mutex);
545 549
546 dentry = d_find_alias(inode); 550 dentry = d_find_alias(inode);
547 if (IS_ERR(dentry))
548 goto out;
549
550 if (dentry) {
551 inode->i_nlink--;
552 d_delete(dentry);
553 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
554 }
555 551
552 inode->i_nlink--;
553 d_delete(dentry);
554 dput(dentry); /* d_alloc_name() in devpts_pty_new() */
556 dput(dentry); /* d_find_alias above */ 555 dput(dentry); /* d_find_alias above */
557out: 556
558 mutex_unlock(&root->d_inode->i_mutex); 557 mutex_unlock(&root->d_inode->i_mutex);
559} 558}
560 559
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b044705eedd4..ac5f164170e3 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -645,11 +645,11 @@ static int dio_send_cur_page(struct dio *dio)
645 /* 645 /*
646 * See whether this new request is contiguous with the old. 646 * See whether this new request is contiguous with the old.
647 * 647 *
648 * Btrfs cannot handl having logically non-contiguous requests 648 * Btrfs cannot handle having logically non-contiguous requests
649 * submitted. For exmple if you have 649 * submitted. For example if you have
650 * 650 *
651 * Logical: [0-4095][HOLE][8192-12287] 651 * Logical: [0-4095][HOLE][8192-12287]
652 * Phyiscal: [0-4095] [4096-8181] 652 * Physical: [0-4095] [4096-8191]
653 * 653 *
654 * We cannot submit those pages together as one BIO. So if our 654 * We cannot submit those pages together as one BIO. So if our
655 * current logical offset in the file does not equal what would 655 * current logical offset in the file does not equal what would
@@ -1110,11 +1110,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1110 ((rw & READ) || (dio->result == dio->size))) 1110 ((rw & READ) || (dio->result == dio->size)))
1111 ret = -EIOCBQUEUED; 1111 ret = -EIOCBQUEUED;
1112 1112
1113 if (ret != -EIOCBQUEUED) { 1113 if (ret != -EIOCBQUEUED)
1114 /* All IO is now issued, send it on its way */
1115 blk_run_address_space(inode->i_mapping);
1116 dio_await_completion(dio); 1114 dio_await_completion(dio);
1117 }
1118 1115
1119 /* 1116 /*
1120 * Sync will always be dropping the final ref and completing the 1117 * Sync will always be dropping the final ref and completing the
@@ -1176,7 +1173,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1176 struct dio *dio; 1173 struct dio *dio;
1177 1174
1178 if (rw & WRITE) 1175 if (rw & WRITE)
1179 rw = WRITE_ODIRECT_PLUG; 1176 rw = WRITE_ODIRECT;
1180 1177
1181 if (bdev) 1178 if (bdev)
1182 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); 1179 bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 4314f0d48d85..abc49f292454 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -18,6 +18,7 @@
18 18
19#define WAKE_ASTS 0 19#define WAKE_ASTS 0
20 20
21static uint64_t ast_seq_count;
21static struct list_head ast_queue; 22static struct list_head ast_queue;
22static spinlock_t ast_queue_lock; 23static spinlock_t ast_queue_lock;
23static struct task_struct * astd_task; 24static struct task_struct * astd_task;
@@ -25,40 +26,186 @@ static unsigned long astd_wakeflags;
25static struct mutex astd_running; 26static struct mutex astd_running;
26 27
27 28
29static void dlm_dump_lkb_callbacks(struct dlm_lkb *lkb)
30{
31 int i;
32
33 log_print("last_bast %x %llu flags %x mode %d sb %d %x",
34 lkb->lkb_id,
35 (unsigned long long)lkb->lkb_last_bast.seq,
36 lkb->lkb_last_bast.flags,
37 lkb->lkb_last_bast.mode,
38 lkb->lkb_last_bast.sb_status,
39 lkb->lkb_last_bast.sb_flags);
40
41 log_print("last_cast %x %llu flags %x mode %d sb %d %x",
42 lkb->lkb_id,
43 (unsigned long long)lkb->lkb_last_cast.seq,
44 lkb->lkb_last_cast.flags,
45 lkb->lkb_last_cast.mode,
46 lkb->lkb_last_cast.sb_status,
47 lkb->lkb_last_cast.sb_flags);
48
49 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
50 log_print("cb %x %llu flags %x mode %d sb %d %x",
51 lkb->lkb_id,
52 (unsigned long long)lkb->lkb_callbacks[i].seq,
53 lkb->lkb_callbacks[i].flags,
54 lkb->lkb_callbacks[i].mode,
55 lkb->lkb_callbacks[i].sb_status,
56 lkb->lkb_callbacks[i].sb_flags);
57 }
58}
59
28void dlm_del_ast(struct dlm_lkb *lkb) 60void dlm_del_ast(struct dlm_lkb *lkb)
29{ 61{
30 spin_lock(&ast_queue_lock); 62 spin_lock(&ast_queue_lock);
31 if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) 63 if (!list_empty(&lkb->lkb_astqueue))
32 list_del(&lkb->lkb_astqueue); 64 list_del_init(&lkb->lkb_astqueue);
33 spin_unlock(&ast_queue_lock); 65 spin_unlock(&ast_queue_lock);
34} 66}
35 67
36void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode) 68int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
69 int status, uint32_t sbflags, uint64_t seq)
37{ 70{
71 struct dlm_ls *ls = lkb->lkb_resource->res_ls;
72 uint64_t prev_seq;
73 int prev_mode;
74 int i;
75
76 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
77 if (lkb->lkb_callbacks[i].seq)
78 continue;
79
80 /*
81 * Suppress some redundant basts here, do more on removal.
82 * Don't even add a bast if the callback just before it
83 * is a bast for the same mode or a more restrictive mode.
84 * (the addional > PR check is needed for PR/CW inversion)
85 */
86
87 if ((i > 0) && (flags & DLM_CB_BAST) &&
88 (lkb->lkb_callbacks[i-1].flags & DLM_CB_BAST)) {
89
90 prev_seq = lkb->lkb_callbacks[i-1].seq;
91 prev_mode = lkb->lkb_callbacks[i-1].mode;
92
93 if ((prev_mode == mode) ||
94 (prev_mode > mode && prev_mode > DLM_LOCK_PR)) {
95
96 log_debug(ls, "skip %x add bast %llu mode %d "
97 "for bast %llu mode %d",
98 lkb->lkb_id,
99 (unsigned long long)seq,
100 mode,
101 (unsigned long long)prev_seq,
102 prev_mode);
103 return 0;
104 }
105 }
106
107 lkb->lkb_callbacks[i].seq = seq;
108 lkb->lkb_callbacks[i].flags = flags;
109 lkb->lkb_callbacks[i].mode = mode;
110 lkb->lkb_callbacks[i].sb_status = status;
111 lkb->lkb_callbacks[i].sb_flags = (sbflags & 0x000000FF);
112 break;
113 }
114
115 if (i == DLM_CALLBACKS_SIZE) {
116 log_error(ls, "no callbacks %x %llu flags %x mode %d sb %d %x",
117 lkb->lkb_id, (unsigned long long)seq,
118 flags, mode, status, sbflags);
119 dlm_dump_lkb_callbacks(lkb);
120 return -1;
121 }
122
123 return 0;
124}
125
126int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
127 struct dlm_callback *cb, int *resid)
128{
129 int i;
130
131 *resid = 0;
132
133 if (!lkb->lkb_callbacks[0].seq)
134 return -ENOENT;
135
136 /* oldest undelivered cb is callbacks[0] */
137
138 memcpy(cb, &lkb->lkb_callbacks[0], sizeof(struct dlm_callback));
139 memset(&lkb->lkb_callbacks[0], 0, sizeof(struct dlm_callback));
140
141 /* shift others down */
142
143 for (i = 1; i < DLM_CALLBACKS_SIZE; i++) {
144 if (!lkb->lkb_callbacks[i].seq)
145 break;
146 memcpy(&lkb->lkb_callbacks[i-1], &lkb->lkb_callbacks[i],
147 sizeof(struct dlm_callback));
148 memset(&lkb->lkb_callbacks[i], 0, sizeof(struct dlm_callback));
149 (*resid)++;
150 }
151
152 /* if cb is a bast, it should be skipped if the blocking mode is
153 compatible with the last granted mode */
154
155 if ((cb->flags & DLM_CB_BAST) && lkb->lkb_last_cast.seq) {
156 if (dlm_modes_compat(cb->mode, lkb->lkb_last_cast.mode)) {
157 cb->flags |= DLM_CB_SKIP;
158
159 log_debug(ls, "skip %x bast %llu mode %d "
160 "for cast %llu mode %d",
161 lkb->lkb_id,
162 (unsigned long long)cb->seq,
163 cb->mode,
164 (unsigned long long)lkb->lkb_last_cast.seq,
165 lkb->lkb_last_cast.mode);
166 return 0;
167 }
168 }
169
170 if (cb->flags & DLM_CB_CAST) {
171 memcpy(&lkb->lkb_last_cast, cb, sizeof(struct dlm_callback));
172 lkb->lkb_last_cast_time = ktime_get();
173 }
174
175 if (cb->flags & DLM_CB_BAST) {
176 memcpy(&lkb->lkb_last_bast, cb, sizeof(struct dlm_callback));
177 lkb->lkb_last_bast_time = ktime_get();
178 }
179
180 return 0;
181}
182
183void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
184 uint32_t sbflags)
185{
186 uint64_t seq;
187 int rv;
188
189 spin_lock(&ast_queue_lock);
190
191 seq = ++ast_seq_count;
192
38 if (lkb->lkb_flags & DLM_IFL_USER) { 193 if (lkb->lkb_flags & DLM_IFL_USER) {
39 dlm_user_add_ast(lkb, type, mode); 194 spin_unlock(&ast_queue_lock);
195 dlm_user_add_ast(lkb, flags, mode, status, sbflags, seq);
40 return; 196 return;
41 } 197 }
42 198
43 spin_lock(&ast_queue_lock); 199 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
44 if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 200 if (rv < 0) {
201 spin_unlock(&ast_queue_lock);
202 return;
203 }
204
205 if (list_empty(&lkb->lkb_astqueue)) {
45 kref_get(&lkb->lkb_ref); 206 kref_get(&lkb->lkb_ref);
46 list_add_tail(&lkb->lkb_astqueue, &ast_queue); 207 list_add_tail(&lkb->lkb_astqueue, &ast_queue);
47 lkb->lkb_ast_first = type;
48 } 208 }
49
50 /* sanity check, this should not happen */
51
52 if ((type == AST_COMP) && (lkb->lkb_ast_type & AST_COMP))
53 log_print("repeat cast %d castmode %d lock %x %s",
54 mode, lkb->lkb_castmode,
55 lkb->lkb_id, lkb->lkb_resource->res_name);
56
57 lkb->lkb_ast_type |= type;
58 if (type == AST_BAST)
59 lkb->lkb_bastmode = mode;
60 else
61 lkb->lkb_castmode = mode;
62 spin_unlock(&ast_queue_lock); 209 spin_unlock(&ast_queue_lock);
63 210
64 set_bit(WAKE_ASTS, &astd_wakeflags); 211 set_bit(WAKE_ASTS, &astd_wakeflags);
@@ -72,7 +219,8 @@ static void process_asts(void)
72 struct dlm_lkb *lkb; 219 struct dlm_lkb *lkb;
73 void (*castfn) (void *astparam); 220 void (*castfn) (void *astparam);
74 void (*bastfn) (void *astparam, int mode); 221 void (*bastfn) (void *astparam, int mode);
75 int type, first, bastmode, castmode, do_bast, do_cast, last_castmode; 222 struct dlm_callback callbacks[DLM_CALLBACKS_SIZE];
223 int i, rv, resid;
76 224
77repeat: 225repeat:
78 spin_lock(&ast_queue_lock); 226 spin_lock(&ast_queue_lock);
@@ -83,54 +231,45 @@ repeat:
83 if (dlm_locking_stopped(ls)) 231 if (dlm_locking_stopped(ls))
84 continue; 232 continue;
85 233
86 list_del(&lkb->lkb_astqueue); 234 /* we remove from astqueue list and remove everything in
87 type = lkb->lkb_ast_type; 235 lkb_callbacks before releasing the spinlock so empty
88 lkb->lkb_ast_type = 0; 236 lkb_astqueue is always consistent with empty lkb_callbacks */
89 first = lkb->lkb_ast_first; 237
90 lkb->lkb_ast_first = 0; 238 list_del_init(&lkb->lkb_astqueue);
91 bastmode = lkb->lkb_bastmode; 239
92 castmode = lkb->lkb_castmode;
93 castfn = lkb->lkb_astfn; 240 castfn = lkb->lkb_astfn;
94 bastfn = lkb->lkb_bastfn; 241 bastfn = lkb->lkb_bastfn;
95 spin_unlock(&ast_queue_lock);
96 242
97 do_cast = (type & AST_COMP) && castfn; 243 memset(&callbacks, 0, sizeof(callbacks));
98 do_bast = (type & AST_BAST) && bastfn;
99 244
100 /* Skip a bast if its blocking mode is compatible with the 245 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
101 granted mode of the preceding cast. */ 246 rv = dlm_rem_lkb_callback(ls, lkb, &callbacks[i], &resid);
247 if (rv < 0)
248 break;
249 }
250 spin_unlock(&ast_queue_lock);
102 251
103 if (do_bast) { 252 if (resid) {
104 if (first == AST_COMP) 253 /* shouldn't happen, for loop should have removed all */
105 last_castmode = castmode; 254 log_error(ls, "callback resid %d lkb %x",
106 else 255 resid, lkb->lkb_id);
107 last_castmode = lkb->lkb_castmode_done;
108 if (dlm_modes_compat(bastmode, last_castmode))
109 do_bast = 0;
110 } 256 }
111 257
112 if (first == AST_COMP) { 258 for (i = 0; i < DLM_CALLBACKS_SIZE; i++) {
113 if (do_cast) 259 if (!callbacks[i].seq)
114 castfn(lkb->lkb_astparam); 260 break;
115 if (do_bast) 261 if (callbacks[i].flags & DLM_CB_SKIP) {
116 bastfn(lkb->lkb_astparam, bastmode); 262 continue;
117 } else if (first == AST_BAST) { 263 } else if (callbacks[i].flags & DLM_CB_BAST) {
118 if (do_bast) 264 bastfn(lkb->lkb_astparam, callbacks[i].mode);
119 bastfn(lkb->lkb_astparam, bastmode); 265 } else if (callbacks[i].flags & DLM_CB_CAST) {
120 if (do_cast) 266 lkb->lkb_lksb->sb_status = callbacks[i].sb_status;
267 lkb->lkb_lksb->sb_flags = callbacks[i].sb_flags;
121 castfn(lkb->lkb_astparam); 268 castfn(lkb->lkb_astparam);
122 } else { 269 }
123 log_error(ls, "bad ast_first %d ast_type %d",
124 first, type);
125 } 270 }
126 271
127 if (do_cast) 272 /* removes ref for ast_queue, may cause lkb to be freed */
128 lkb->lkb_castmode_done = castmode;
129 if (do_bast)
130 lkb->lkb_bastmode_done = bastmode;
131
132 /* this removes the reference added by dlm_add_ast
133 and may result in the lkb being freed */
134 dlm_put_lkb(lkb); 273 dlm_put_lkb(lkb);
135 274
136 cond_resched(); 275 cond_resched();
diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h
index bcb1aaba519d..8aa89c9b5611 100644
--- a/fs/dlm/ast.h
+++ b/fs/dlm/ast.h
@@ -13,8 +13,13 @@
13#ifndef __ASTD_DOT_H__ 13#ifndef __ASTD_DOT_H__
14#define __ASTD_DOT_H__ 14#define __ASTD_DOT_H__
15 15
16void dlm_add_ast(struct dlm_lkb *lkb, int type, int mode);
17void dlm_del_ast(struct dlm_lkb *lkb); 16void dlm_del_ast(struct dlm_lkb *lkb);
17int dlm_add_lkb_callback(struct dlm_lkb *lkb, uint32_t flags, int mode,
18 int status, uint32_t sbflags, uint64_t seq);
19int dlm_rem_lkb_callback(struct dlm_ls *ls, struct dlm_lkb *lkb,
20 struct dlm_callback *cb, int *resid);
21void dlm_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
22 uint32_t sbflags);
18 23
19void dlm_astd_wake(void); 24void dlm_astd_wake(void);
20int dlm_astd_start(void); 25int dlm_astd_start(void);
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index b54bca03d92f..0d329ff8ed4c 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -977,9 +977,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num)
977/* Config file defaults */ 977/* Config file defaults */
978#define DEFAULT_TCP_PORT 21064 978#define DEFAULT_TCP_PORT 21064
979#define DEFAULT_BUFFER_SIZE 4096 979#define DEFAULT_BUFFER_SIZE 4096
980#define DEFAULT_RSBTBL_SIZE 256 980#define DEFAULT_RSBTBL_SIZE 1024
981#define DEFAULT_LKBTBL_SIZE 1024 981#define DEFAULT_LKBTBL_SIZE 1024
982#define DEFAULT_DIRTBL_SIZE 512 982#define DEFAULT_DIRTBL_SIZE 1024
983#define DEFAULT_RECOVER_TIMER 5 983#define DEFAULT_RECOVER_TIMER 5
984#define DEFAULT_TOSS_SECS 10 984#define DEFAULT_TOSS_SECS 10
985#define DEFAULT_SCAN_SECS 5 985#define DEFAULT_SCAN_SECS 5
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 6b42ba807dfd..59779237e2b4 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -257,12 +257,12 @@ static int print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb,
257 lkb->lkb_status, 257 lkb->lkb_status,
258 lkb->lkb_grmode, 258 lkb->lkb_grmode,
259 lkb->lkb_rqmode, 259 lkb->lkb_rqmode,
260 lkb->lkb_bastmode, 260 lkb->lkb_last_bast.mode,
261 rsb_lookup, 261 rsb_lookup,
262 lkb->lkb_wait_type, 262 lkb->lkb_wait_type,
263 lkb->lkb_lvbseq, 263 lkb->lkb_lvbseq,
264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), 264 (unsigned long long)ktime_to_ns(lkb->lkb_timestamp),
265 (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); 265 (unsigned long long)ktime_to_ns(lkb->lkb_last_bast_time));
266 return rv; 266 return rv;
267} 267}
268 268
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index f632b58cd222..b94204913011 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -192,11 +192,6 @@ struct dlm_args {
192 * lkb is a process copy, the nodeid specifies the lock master. 192 * lkb is a process copy, the nodeid specifies the lock master.
193 */ 193 */
194 194
195/* lkb_ast_type */
196
197#define AST_COMP 1
198#define AST_BAST 2
199
200/* lkb_status */ 195/* lkb_status */
201 196
202#define DLM_LKSTS_WAITING 1 197#define DLM_LKSTS_WAITING 1
@@ -217,6 +212,20 @@ struct dlm_args {
217#define DLM_IFL_USER 0x00000001 212#define DLM_IFL_USER 0x00000001
218#define DLM_IFL_ORPHAN 0x00000002 213#define DLM_IFL_ORPHAN 0x00000002
219 214
215#define DLM_CALLBACKS_SIZE 6
216
217#define DLM_CB_CAST 0x00000001
218#define DLM_CB_BAST 0x00000002
219#define DLM_CB_SKIP 0x00000004
220
221struct dlm_callback {
222 uint64_t seq;
223 uint32_t flags; /* DLM_CBF_ */
224 int sb_status; /* copy to lksb status */
225 uint8_t sb_flags; /* copy to lksb flags */
226 int8_t mode; /* rq mode of bast, gr mode of cast */
227};
228
220struct dlm_lkb { 229struct dlm_lkb {
221 struct dlm_rsb *lkb_resource; /* the rsb */ 230 struct dlm_rsb *lkb_resource; /* the rsb */
222 struct kref lkb_ref; 231 struct kref lkb_ref;
@@ -236,13 +245,6 @@ struct dlm_lkb {
236 245
237 int8_t lkb_wait_type; /* type of reply waiting for */ 246 int8_t lkb_wait_type; /* type of reply waiting for */
238 int8_t lkb_wait_count; 247 int8_t lkb_wait_count;
239 int8_t lkb_ast_type; /* type of ast queued for */
240 int8_t lkb_ast_first; /* type of first ast queued */
241
242 int8_t lkb_bastmode; /* req mode of queued bast */
243 int8_t lkb_castmode; /* gr mode of queued cast */
244 int8_t lkb_bastmode_done; /* last delivered bastmode */
245 int8_t lkb_castmode_done; /* last delivered castmode */
246 248
247 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */ 249 struct list_head lkb_idtbl_list; /* lockspace lkbtbl */
248 struct list_head lkb_statequeue; /* rsb g/c/w list */ 250 struct list_head lkb_statequeue; /* rsb g/c/w list */
@@ -251,10 +253,15 @@ struct dlm_lkb {
251 struct list_head lkb_astqueue; /* need ast to be sent */ 253 struct list_head lkb_astqueue; /* need ast to be sent */
252 struct list_head lkb_ownqueue; /* list of locks for a process */ 254 struct list_head lkb_ownqueue; /* list of locks for a process */
253 struct list_head lkb_time_list; 255 struct list_head lkb_time_list;
254 ktime_t lkb_time_bast; /* for debugging */
255 ktime_t lkb_timestamp; 256 ktime_t lkb_timestamp;
256 unsigned long lkb_timeout_cs; 257 unsigned long lkb_timeout_cs;
257 258
259 struct dlm_callback lkb_callbacks[DLM_CALLBACKS_SIZE];
260 struct dlm_callback lkb_last_cast;
261 struct dlm_callback lkb_last_bast;
262 ktime_t lkb_last_cast_time; /* for debugging */
263 ktime_t lkb_last_bast_time; /* for debugging */
264
258 char *lkb_lvbptr; 265 char *lkb_lvbptr;
259 struct dlm_lksb *lkb_lksb; /* caller's status block */ 266 struct dlm_lksb *lkb_lksb; /* caller's status block */
260 void (*lkb_astfn) (void *astparam); 267 void (*lkb_astfn) (void *astparam);
@@ -544,8 +551,6 @@ struct dlm_user_args {
544 (dlm_user_proc) on the struct file, 551 (dlm_user_proc) on the struct file,
545 the process's locks point back to it*/ 552 the process's locks point back to it*/
546 struct dlm_lksb lksb; 553 struct dlm_lksb lksb;
547 int old_mode;
548 int update_user_lvb;
549 struct dlm_lksb __user *user_lksb; 554 struct dlm_lksb __user *user_lksb;
550 void __user *castparam; 555 void __user *castparam;
551 void __user *castaddr; 556 void __user *castaddr;
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 64e5f3efdd81..56d6bfcc1e48 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -160,10 +160,10 @@ static const int __quecvt_compat_matrix[8][8] = {
160void dlm_print_lkb(struct dlm_lkb *lkb) 160void dlm_print_lkb(struct dlm_lkb *lkb)
161{ 161{
162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" 162 printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n"
163 " status %d rqmode %d grmode %d wait_type %d ast_type %d\n", 163 " status %d rqmode %d grmode %d wait_type %d\n",
164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 164 lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 165 lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
166 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); 166 lkb->lkb_grmode, lkb->lkb_wait_type);
167} 167}
168 168
169static void dlm_print_rsb(struct dlm_rsb *r) 169static void dlm_print_rsb(struct dlm_rsb *r)
@@ -305,10 +305,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
305 rv = -EDEADLK; 305 rv = -EDEADLK;
306 } 306 }
307 307
308 lkb->lkb_lksb->sb_status = rv; 308 dlm_add_ast(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
309 lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;
310
311 dlm_add_ast(lkb, AST_COMP, lkb->lkb_grmode);
312} 309}
313 310
314static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) 311static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
@@ -319,13 +316,10 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
319 316
320static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) 317static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
321{ 318{
322 lkb->lkb_time_bast = ktime_get();
323
324 if (is_master_copy(lkb)) { 319 if (is_master_copy(lkb)) {
325 lkb->lkb_bastmode = rqmode; /* printed by debugfs */
326 send_bast(r, lkb, rqmode); 320 send_bast(r, lkb, rqmode);
327 } else { 321 } else {
328 dlm_add_ast(lkb, AST_BAST, rqmode); 322 dlm_add_ast(lkb, DLM_CB_BAST, rqmode, 0, 0);
329 } 323 }
330} 324}
331 325
@@ -525,7 +519,7 @@ static void toss_rsb(struct kref *kref)
525 } 519 }
526} 520}
527 521
528/* When all references to the rsb are gone it's transfered to 522/* When all references to the rsb are gone it's transferred to
529 the tossed list for later disposal. */ 523 the tossed list for later disposal. */
530 524
531static void put_rsb(struct dlm_rsb *r) 525static void put_rsb(struct dlm_rsb *r)
@@ -600,6 +594,7 @@ static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
600 INIT_LIST_HEAD(&lkb->lkb_ownqueue); 594 INIT_LIST_HEAD(&lkb->lkb_ownqueue);
601 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); 595 INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
602 INIT_LIST_HEAD(&lkb->lkb_time_list); 596 INIT_LIST_HEAD(&lkb->lkb_time_list);
597 INIT_LIST_HEAD(&lkb->lkb_astqueue);
603 598
604 get_random_bytes(&bucket, sizeof(bucket)); 599 get_random_bytes(&bucket, sizeof(bucket));
605 bucket &= (ls->ls_lkbtbl_size - 1); 600 bucket &= (ls->ls_lkbtbl_size - 1);
@@ -2819,9 +2814,9 @@ static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
2819 not from lkb fields */ 2814 not from lkb fields */
2820 2815
2821 if (lkb->lkb_bastfn) 2816 if (lkb->lkb_bastfn)
2822 ms->m_asts |= AST_BAST; 2817 ms->m_asts |= DLM_CB_BAST;
2823 if (lkb->lkb_astfn) 2818 if (lkb->lkb_astfn)
2824 ms->m_asts |= AST_COMP; 2819 ms->m_asts |= DLM_CB_CAST;
2825 2820
2826 /* compare with switch in create_message; send_remove() doesn't 2821 /* compare with switch in create_message; send_remove() doesn't
2827 use send_args() */ 2822 use send_args() */
@@ -3122,8 +3117,8 @@ static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3122 lkb->lkb_grmode = DLM_LOCK_IV; 3117 lkb->lkb_grmode = DLM_LOCK_IV;
3123 lkb->lkb_rqmode = ms->m_rqmode; 3118 lkb->lkb_rqmode = ms->m_rqmode;
3124 3119
3125 lkb->lkb_bastfn = (ms->m_asts & AST_BAST) ? &fake_bastfn : NULL; 3120 lkb->lkb_bastfn = (ms->m_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
3126 lkb->lkb_astfn = (ms->m_asts & AST_COMP) ? &fake_astfn : NULL; 3121 lkb->lkb_astfn = (ms->m_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
3127 3122
3128 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3123 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3129 /* lkb was just created so there won't be an lvb yet */ 3124 /* lkb was just created so there won't be an lvb yet */
@@ -4412,8 +4407,8 @@ static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
4412 lkb->lkb_grmode = rl->rl_grmode; 4407 lkb->lkb_grmode = rl->rl_grmode;
4413 /* don't set lkb_status because add_lkb wants to itself */ 4408 /* don't set lkb_status because add_lkb wants to itself */
4414 4409
4415 lkb->lkb_bastfn = (rl->rl_asts & AST_BAST) ? &fake_bastfn : NULL; 4410 lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
4416 lkb->lkb_astfn = (rl->rl_asts & AST_COMP) ? &fake_astfn : NULL; 4411 lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
4417 4412
4418 if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 4413 if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
4419 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 4414 int lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) -
@@ -4589,7 +4584,6 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4589 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, 4584 error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
4590 fake_astfn, ua, fake_bastfn, &args); 4585 fake_astfn, ua, fake_bastfn, &args);
4591 lkb->lkb_flags |= DLM_IFL_USER; 4586 lkb->lkb_flags |= DLM_IFL_USER;
4592 ua->old_mode = DLM_LOCK_IV;
4593 4587
4594 if (error) { 4588 if (error) {
4595 __put_lkb(ls, lkb); 4589 __put_lkb(ls, lkb);
@@ -4658,7 +4652,6 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4658 ua->bastparam = ua_tmp->bastparam; 4652 ua->bastparam = ua_tmp->bastparam;
4659 ua->bastaddr = ua_tmp->bastaddr; 4653 ua->bastaddr = ua_tmp->bastaddr;
4660 ua->user_lksb = ua_tmp->user_lksb; 4654 ua->user_lksb = ua_tmp->user_lksb;
4661 ua->old_mode = lkb->lkb_grmode;
4662 4655
4663 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, 4656 error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
4664 fake_astfn, ua, fake_bastfn, &args); 4657 fake_astfn, ua, fake_bastfn, &args);
@@ -4917,8 +4910,9 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4917 } 4910 }
4918 4911
4919 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4912 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4920 lkb->lkb_ast_type = 0; 4913 memset(&lkb->lkb_callbacks, 0,
4921 list_del(&lkb->lkb_astqueue); 4914 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4915 list_del_init(&lkb->lkb_astqueue);
4922 dlm_put_lkb(lkb); 4916 dlm_put_lkb(lkb);
4923 } 4917 }
4924 4918
@@ -4958,7 +4952,9 @@ static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
4958 4952
4959 spin_lock(&proc->asts_spin); 4953 spin_lock(&proc->asts_spin);
4960 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) { 4954 list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_astqueue) {
4961 list_del(&lkb->lkb_astqueue); 4955 memset(&lkb->lkb_callbacks, 0,
4956 sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
4957 list_del_init(&lkb->lkb_astqueue);
4962 dlm_put_lkb(lkb); 4958 dlm_put_lkb(lkb);
4963 } 4959 }
4964 spin_unlock(&proc->asts_spin); 4960 spin_unlock(&proc->asts_spin);
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 9c64ae9e4c1a..5e2c71f05e46 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -810,7 +810,7 @@ static int tcp_accept_from_sock(struct connection *con)
810 810
811 /* 811 /*
812 * Add it to the active queue in case we got data 812 * Add it to the active queue in case we got data
813 * beween processing the accept adding the socket 813 * between processing the accept adding the socket
814 * to the read_sockets list 814 * to the read_sockets list
815 */ 815 */
816 if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags)) 816 if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
@@ -1468,15 +1468,15 @@ static void work_stop(void)
1468 1468
1469static int work_start(void) 1469static int work_start(void)
1470{ 1470{
1471 recv_workqueue = alloc_workqueue("dlm_recv", WQ_MEM_RECLAIM | 1471 recv_workqueue = alloc_workqueue("dlm_recv",
1472 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1472 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1473 if (!recv_workqueue) { 1473 if (!recv_workqueue) {
1474 log_print("can't start dlm_recv"); 1474 log_print("can't start dlm_recv");
1475 return -ENOMEM; 1475 return -ENOMEM;
1476 } 1476 }
1477 1477
1478 send_workqueue = alloc_workqueue("dlm_send", WQ_MEM_RECLAIM | 1478 send_workqueue = alloc_workqueue("dlm_send",
1479 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1479 WQ_UNBOUND | WQ_MEM_RECLAIM, 1);
1480 if (!send_workqueue) { 1480 if (!send_workqueue) {
1481 log_print("can't start dlm_send"); 1481 log_print("can't start dlm_send");
1482 destroy_workqueue(recv_workqueue); 1482 destroy_workqueue(recv_workqueue);
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 3c83a49a48a3..f10a50f24e8f 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -321,9 +321,9 @@ static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type); 321 rl->rl_wait_type = cpu_to_le16(lkb->lkb_wait_type);
322 322
323 if (lkb->lkb_bastfn) 323 if (lkb->lkb_bastfn)
324 rl->rl_asts |= AST_BAST; 324 rl->rl_asts |= DLM_CB_BAST;
325 if (lkb->lkb_astfn) 325 if (lkb->lkb_astfn)
326 rl->rl_asts |= AST_COMP; 326 rl->rl_asts |= DLM_CB_CAST;
327 327
328 rl->rl_namelen = cpu_to_le16(r->res_length); 328 rl->rl_namelen = cpu_to_le16(r->res_length);
329 memcpy(rl->rl_name, r->res_name, r->res_length); 329 memcpy(rl->rl_name, r->res_name, r->res_length);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index eda43f362616..14638235f7b2 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -304,7 +304,7 @@ static void set_master_lkbs(struct dlm_rsb *r)
304} 304}
305 305
306/* 306/*
307 * Propogate the new master nodeid to locks 307 * Propagate the new master nodeid to locks
308 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider. 308 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
309 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which 309 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
310 * rsb's to consider. 310 * rsb's to consider.
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 66d6c16bf440..d5ab3fe7c198 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -24,6 +24,7 @@
24#include "lock.h" 24#include "lock.h"
25#include "lvb_table.h" 25#include "lvb_table.h"
26#include "user.h" 26#include "user.h"
27#include "ast.h"
27 28
28static const char name_prefix[] = "dlm"; 29static const char name_prefix[] = "dlm";
29static const struct file_operations device_fops; 30static const struct file_operations device_fops;
@@ -152,19 +153,16 @@ static void compat_output(struct dlm_lock_result *res,
152 not related to the lifetime of the lkb struct which is managed 153 not related to the lifetime of the lkb struct which is managed
153 entirely by refcount. */ 154 entirely by refcount. */
154 155
155static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) 156static int lkb_is_endoflife(int mode, int status)
156{ 157{
157 switch (sb_status) { 158 switch (status) {
158 case -DLM_EUNLOCK: 159 case -DLM_EUNLOCK:
159 return 1; 160 return 1;
160 case -DLM_ECANCEL: 161 case -DLM_ECANCEL:
161 case -ETIMEDOUT: 162 case -ETIMEDOUT:
162 case -EDEADLK: 163 case -EDEADLK:
163 if (lkb->lkb_grmode == DLM_LOCK_IV)
164 return 1;
165 break;
166 case -EAGAIN: 164 case -EAGAIN:
167 if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV) 165 if (mode == DLM_LOCK_IV)
168 return 1; 166 return 1;
169 break; 167 break;
170 } 168 }
@@ -174,12 +172,13 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
174/* we could possibly check if the cancel of an orphan has resulted in the lkb 172/* we could possibly check if the cancel of an orphan has resulted in the lkb
175 being removed and then remove that lkb from the orphans list and free it */ 173 being removed and then remove that lkb from the orphans list and free it */
176 174
177void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode) 175void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
176 int status, uint32_t sbflags, uint64_t seq)
178{ 177{
179 struct dlm_ls *ls; 178 struct dlm_ls *ls;
180 struct dlm_user_args *ua; 179 struct dlm_user_args *ua;
181 struct dlm_user_proc *proc; 180 struct dlm_user_proc *proc;
182 int eol = 0, ast_type; 181 int rv;
183 182
184 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD)) 183 if (lkb->lkb_flags & (DLM_IFL_ORPHAN | DLM_IFL_DEAD))
185 return; 184 return;
@@ -200,49 +199,29 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode)
200 ua = lkb->lkb_ua; 199 ua = lkb->lkb_ua;
201 proc = ua->proc; 200 proc = ua->proc;
202 201
203 if (type == AST_BAST && ua->bastaddr == NULL) 202 if ((flags & DLM_CB_BAST) && ua->bastaddr == NULL)
204 goto out; 203 goto out;
205 204
205 if ((flags & DLM_CB_CAST) && lkb_is_endoflife(mode, status))
206 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
207
206 spin_lock(&proc->asts_spin); 208 spin_lock(&proc->asts_spin);
207 209
208 ast_type = lkb->lkb_ast_type; 210 rv = dlm_add_lkb_callback(lkb, flags, mode, status, sbflags, seq);
209 lkb->lkb_ast_type |= type; 211 if (rv < 0) {
210 if (type == AST_BAST) 212 spin_unlock(&proc->asts_spin);
211 lkb->lkb_bastmode = mode; 213 goto out;
212 else 214 }
213 lkb->lkb_castmode = mode;
214 215
215 if (!ast_type) { 216 if (list_empty(&lkb->lkb_astqueue)) {
216 kref_get(&lkb->lkb_ref); 217 kref_get(&lkb->lkb_ref);
217 list_add_tail(&lkb->lkb_astqueue, &proc->asts); 218 list_add_tail(&lkb->lkb_astqueue, &proc->asts);
218 lkb->lkb_ast_first = type;
219 wake_up_interruptible(&proc->wait); 219 wake_up_interruptible(&proc->wait);
220 } 220 }
221 if (type == AST_COMP && (ast_type & AST_COMP))
222 log_debug(ls, "ast overlap %x status %x %x",
223 lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
224
225 eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
226 if (eol) {
227 lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
228 }
229
230 /* We want to copy the lvb to userspace when the completion
231 ast is read if the status is 0, the lock has an lvb and
232 lvb_ops says we should. We could probably have set_lvb_lock()
233 set update_user_lvb instead and not need old_mode */
234
235 if ((lkb->lkb_ast_type & AST_COMP) &&
236 (lkb->lkb_lksb->sb_status == 0) &&
237 lkb->lkb_lksb->sb_lvbptr &&
238 dlm_lvb_operations[ua->old_mode + 1][lkb->lkb_grmode + 1])
239 ua->update_user_lvb = 1;
240 else
241 ua->update_user_lvb = 0;
242
243 spin_unlock(&proc->asts_spin); 221 spin_unlock(&proc->asts_spin);
244 222
245 if (eol) { 223 if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
224 /* N.B. spin_lock locks_spin, not asts_spin */
246 spin_lock(&proc->locks_spin); 225 spin_lock(&proc->locks_spin);
247 if (!list_empty(&lkb->lkb_ownqueue)) { 226 if (!list_empty(&lkb->lkb_ownqueue)) {
248 list_del_init(&lkb->lkb_ownqueue); 227 list_del_init(&lkb->lkb_ownqueue);
@@ -705,8 +684,9 @@ static int device_close(struct inode *inode, struct file *file)
705 return 0; 684 return 0;
706} 685}
707 686
708static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type, 687static int copy_result_to_user(struct dlm_user_args *ua, int compat,
709 int mode, char __user *buf, size_t count) 688 uint32_t flags, int mode, int copy_lvb,
689 char __user *buf, size_t count)
710{ 690{
711#ifdef CONFIG_COMPAT 691#ifdef CONFIG_COMPAT
712 struct dlm_lock_result32 result32; 692 struct dlm_lock_result32 result32;
@@ -730,7 +710,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
730 notes that a new blocking AST address and parameter are set even if 710 notes that a new blocking AST address and parameter are set even if
731 the conversion fails, so maybe we should just do that. */ 711 the conversion fails, so maybe we should just do that. */
732 712
733 if (type == AST_BAST) { 713 if (flags & DLM_CB_BAST) {
734 result.user_astaddr = ua->bastaddr; 714 result.user_astaddr = ua->bastaddr;
735 result.user_astparam = ua->bastparam; 715 result.user_astparam = ua->bastparam;
736 result.bast_mode = mode; 716 result.bast_mode = mode;
@@ -750,8 +730,7 @@ static int copy_result_to_user(struct dlm_user_args *ua, int compat, int type,
750 /* copy lvb to userspace if there is one, it's been updated, and 730 /* copy lvb to userspace if there is one, it's been updated, and
751 the user buffer has space for it */ 731 the user buffer has space for it */
752 732
753 if (ua->update_user_lvb && ua->lksb.sb_lvbptr && 733 if (copy_lvb && ua->lksb.sb_lvbptr && count >= len + DLM_USER_LVB_LEN) {
754 count >= len + DLM_USER_LVB_LEN) {
755 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr, 734 if (copy_to_user(buf+len, ua->lksb.sb_lvbptr,
756 DLM_USER_LVB_LEN)) { 735 DLM_USER_LVB_LEN)) {
757 error = -EFAULT; 736 error = -EFAULT;
@@ -801,13 +780,12 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
801 struct dlm_user_proc *proc = file->private_data; 780 struct dlm_user_proc *proc = file->private_data;
802 struct dlm_lkb *lkb; 781 struct dlm_lkb *lkb;
803 DECLARE_WAITQUEUE(wait, current); 782 DECLARE_WAITQUEUE(wait, current);
804 int error = 0, removed; 783 struct dlm_callback cb;
805 int ret_type, ret_mode; 784 int rv, resid, copy_lvb = 0;
806 int bastmode, castmode, do_bast, do_cast;
807 785
808 if (count == sizeof(struct dlm_device_version)) { 786 if (count == sizeof(struct dlm_device_version)) {
809 error = copy_version_to_user(buf, count); 787 rv = copy_version_to_user(buf, count);
810 return error; 788 return rv;
811 } 789 }
812 790
813 if (!proc) { 791 if (!proc) {
@@ -854,92 +832,57 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
854 } 832 }
855 } 833 }
856 834
857 /* there may be both completion and blocking asts to return for 835 /* if we empty lkb_callbacks, we don't want to unlock the spinlock
858 the lkb, don't remove lkb from asts list unless no asts remain */ 836 without removing lkb_astqueue; so empty lkb_astqueue is always
837 consistent with empty lkb_callbacks */
859 838
860 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue); 839 lkb = list_entry(proc->asts.next, struct dlm_lkb, lkb_astqueue);
861 840
862 removed = 0; 841 rv = dlm_rem_lkb_callback(lkb->lkb_resource->res_ls, lkb, &cb, &resid);
863 ret_type = 0; 842 if (rv < 0) {
864 ret_mode = 0; 843 /* this shouldn't happen; lkb should have been removed from
865 do_bast = lkb->lkb_ast_type & AST_BAST; 844 list when resid was zero */
866 do_cast = lkb->lkb_ast_type & AST_COMP; 845 log_print("dlm_rem_lkb_callback empty %x", lkb->lkb_id);
867 bastmode = lkb->lkb_bastmode; 846 list_del_init(&lkb->lkb_astqueue);
868 castmode = lkb->lkb_castmode; 847 spin_unlock(&proc->asts_spin);
869 848 /* removes ref for proc->asts, may cause lkb to be freed */
870 /* when both are queued figure out which to do first and 849 dlm_put_lkb(lkb);
871 switch first so the other goes in the next read */ 850 goto try_another;
872
873 if (do_cast && do_bast) {
874 if (lkb->lkb_ast_first == AST_COMP) {
875 ret_type = AST_COMP;
876 ret_mode = castmode;
877 lkb->lkb_ast_type &= ~AST_COMP;
878 lkb->lkb_ast_first = AST_BAST;
879 } else {
880 ret_type = AST_BAST;
881 ret_mode = bastmode;
882 lkb->lkb_ast_type &= ~AST_BAST;
883 lkb->lkb_ast_first = AST_COMP;
884 }
885 } else {
886 ret_type = lkb->lkb_ast_first;
887 ret_mode = (ret_type == AST_COMP) ? castmode : bastmode;
888 lkb->lkb_ast_type &= ~ret_type;
889 lkb->lkb_ast_first = 0;
890 } 851 }
852 if (!resid)
853 list_del_init(&lkb->lkb_astqueue);
854 spin_unlock(&proc->asts_spin);
891 855
892 /* if we're doing a bast but the bast is unnecessary, then 856 if (cb.flags & DLM_CB_SKIP) {
893 switch to do nothing or do a cast if that was needed next */ 857 /* removes ref for proc->asts, may cause lkb to be freed */
894 858 if (!resid)
895 if ((ret_type == AST_BAST) && 859 dlm_put_lkb(lkb);
896 dlm_modes_compat(bastmode, lkb->lkb_castmode_done)) { 860 goto try_another;
897 ret_type = 0;
898 ret_mode = 0;
899
900 if (do_cast) {
901 ret_type = AST_COMP;
902 ret_mode = castmode;
903 lkb->lkb_ast_type &= ~AST_COMP;
904 lkb->lkb_ast_first = 0;
905 }
906 } 861 }
907 862
908 if (lkb->lkb_ast_first != lkb->lkb_ast_type) { 863 if (cb.flags & DLM_CB_CAST) {
909 log_print("device_read %x ast_first %x ast_type %x", 864 int old_mode, new_mode;
910 lkb->lkb_id, lkb->lkb_ast_first, lkb->lkb_ast_type);
911 }
912 865
913 if (!lkb->lkb_ast_type) { 866 old_mode = lkb->lkb_last_cast.mode;
914 list_del(&lkb->lkb_astqueue); 867 new_mode = cb.mode;
915 removed = 1;
916 }
917 spin_unlock(&proc->asts_spin);
918 868
919 if (ret_type) { 869 if (!cb.sb_status && lkb->lkb_lksb->sb_lvbptr &&
920 error = copy_result_to_user(lkb->lkb_ua, 870 dlm_lvb_operations[old_mode + 1][new_mode + 1])
921 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags), 871 copy_lvb = 1;
922 ret_type, ret_mode, buf, count);
923 872
924 if (ret_type == AST_COMP) 873 lkb->lkb_lksb->sb_status = cb.sb_status;
925 lkb->lkb_castmode_done = castmode; 874 lkb->lkb_lksb->sb_flags = cb.sb_flags;
926 if (ret_type == AST_BAST)
927 lkb->lkb_bastmode_done = bastmode;
928 } 875 }
929 876
930 /* removes reference for the proc->asts lists added by 877 rv = copy_result_to_user(lkb->lkb_ua,
931 dlm_user_add_ast() and may result in the lkb being freed */ 878 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
879 cb.flags, cb.mode, copy_lvb, buf, count);
932 880
933 if (removed) 881 /* removes ref for proc->asts, may cause lkb to be freed */
882 if (!resid)
934 dlm_put_lkb(lkb); 883 dlm_put_lkb(lkb);
935 884
936 /* the bast that was queued was eliminated (see unnecessary above), 885 return rv;
937 leaving nothing to return */
938
939 if (!ret_type)
940 goto try_another;
941
942 return error;
943} 886}
944 887
945static unsigned int device_poll(struct file *file, poll_table *wait) 888static unsigned int device_poll(struct file *file, poll_table *wait)
diff --git a/fs/dlm/user.h b/fs/dlm/user.h
index f196091dd7ff..00499ab8835f 100644
--- a/fs/dlm/user.h
+++ b/fs/dlm/user.h
@@ -9,7 +9,8 @@
9#ifndef __USER_DOT_H__ 9#ifndef __USER_DOT_H__
10#define __USER_DOT_H__ 10#define __USER_DOT_H__
11 11
12void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int mode); 12void dlm_user_add_ast(struct dlm_lkb *lkb, uint32_t flags, int mode,
13 int status, uint32_t sbflags, uint64_t seq);
13int dlm_user_init(void); 14int dlm_user_init(void);
14void dlm_user_exit(void); 15void dlm_user_exit(void);
15int dlm_device_deregister(struct dlm_ls *ls); 16int dlm_device_deregister(struct dlm_ls *ls);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2195c213ab2f..98b77c89494c 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,6 +8,7 @@
8#include <linux/writeback.h> 8#include <linux/writeback.h>
9#include <linux/sysctl.h> 9#include <linux/sysctl.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include "internal.h"
11 12
12/* A global variable is a bit ugly, but it keeps the code simple */ 13/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 14int sysctl_drop_caches;
@@ -16,20 +17,23 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 17{
17 struct inode *inode, *toput_inode = NULL; 18 struct inode *inode, *toput_inode = NULL;
18 19
19 spin_lock(&inode_lock); 20 spin_lock(&inode_sb_list_lock);
20 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 21 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
21 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 22 spin_lock(&inode->i_lock);
22 continue; 23 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
23 if (inode->i_mapping->nrpages == 0) 24 (inode->i_mapping->nrpages == 0)) {
25 spin_unlock(&inode->i_lock);
24 continue; 26 continue;
27 }
25 __iget(inode); 28 __iget(inode);
26 spin_unlock(&inode_lock); 29 spin_unlock(&inode->i_lock);
30 spin_unlock(&inode_sb_list_lock);
27 invalidate_mapping_pages(inode->i_mapping, 0, -1); 31 invalidate_mapping_pages(inode->i_mapping, 0, -1);
28 iput(toput_inode); 32 iput(toput_inode);
29 toput_inode = inode; 33 toput_inode = inode;
30 spin_lock(&inode_lock); 34 spin_lock(&inode_sb_list_lock);
31 } 35 }
32 spin_unlock(&inode_lock); 36 spin_unlock(&inode_sb_list_lock);
33 iput(toput_inode); 37 iput(toput_inode);
34} 38}
35 39
@@ -45,7 +49,11 @@ static void drop_slab(void)
45int drop_caches_sysctl_handler(ctl_table *table, int write, 49int drop_caches_sysctl_handler(ctl_table *table, int write,
46 void __user *buffer, size_t *length, loff_t *ppos) 50 void __user *buffer, size_t *length, loff_t *ppos)
47{ 51{
48 proc_dointvec_minmax(table, write, buffer, length, ppos); 52 int ret;
53
54 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
55 if (ret)
56 return ret;
49 if (write) { 57 if (write) {
50 if (sysctl_drop_caches & 1) 58 if (sysctl_drop_caches & 1)
51 iterate_supers(drop_pagecache_sb, NULL); 59 iterate_supers(drop_pagecache_sb, NULL);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index bfd8b680e648..d2a70a4561f9 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -266,7 +266,6 @@ void ecryptfs_destroy_mount_crypt_stat(
266 &mount_crypt_stat->global_auth_tok_list, 266 &mount_crypt_stat->global_auth_tok_list,
267 mount_crypt_stat_list) { 267 mount_crypt_stat_list) {
268 list_del(&auth_tok->mount_crypt_stat_list); 268 list_del(&auth_tok->mount_crypt_stat_list);
269 mount_crypt_stat->num_global_auth_toks--;
270 if (auth_tok->global_auth_tok_key 269 if (auth_tok->global_auth_tok_key
271 && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID)) 270 && !(auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID))
272 key_put(auth_tok->global_auth_tok_key); 271 key_put(auth_tok->global_auth_tok_key);
@@ -1389,6 +1388,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry)
1389 rc = -ENOMEM; 1388 rc = -ENOMEM;
1390 goto out; 1389 goto out;
1391 } 1390 }
1391 /* Zeroed page ensures the in-header unencrypted i_size is set to 0 */
1392 rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, 1392 rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat,
1393 ecryptfs_dentry); 1393 ecryptfs_dentry);
1394 if (unlikely(rc)) { 1394 if (unlikely(rc)) {
diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b550..534c1d46e69e 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
46{ 46{
47 struct dentry *lower_dentry; 47 struct dentry *lower_dentry;
48 struct vfsmount *lower_mnt; 48 struct vfsmount *lower_mnt;
49 struct dentry *dentry_save; 49 struct dentry *dentry_save = NULL;
50 struct vfsmount *vfsmount_save; 50 struct vfsmount *vfsmount_save = NULL;
51 int rc = 1; 51 int rc = 1;
52 52
53 if (nd->flags & LOOKUP_RCU) 53 if (nd && nd->flags & LOOKUP_RCU)
54 return -ECHILD; 54 return -ECHILD;
55 55
56 lower_dentry = ecryptfs_dentry_to_lower(dentry); 56 lower_dentry = ecryptfs_dentry_to_lower(dentry);
57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry); 57 lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate) 58 if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
59 goto out; 59 goto out;
60 dentry_save = nd->path.dentry; 60 if (nd) {
61 vfsmount_save = nd->path.mnt; 61 dentry_save = nd->path.dentry;
62 nd->path.dentry = lower_dentry; 62 vfsmount_save = nd->path.mnt;
63 nd->path.mnt = lower_mnt; 63 nd->path.dentry = lower_dentry;
64 nd->path.mnt = lower_mnt;
65 }
64 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd); 66 rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
65 nd->path.dentry = dentry_save; 67 if (nd) {
66 nd->path.mnt = vfsmount_save; 68 nd->path.dentry = dentry_save;
69 nd->path.mnt = vfsmount_save;
70 }
67 if (dentry->d_inode) { 71 if (dentry->d_inode) {
68 struct inode *lower_inode = 72 struct inode *lower_inode =
69 ecryptfs_inode_to_lower(dentry->d_inode); 73 ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed96336..bd3cafd0949d 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -233,7 +233,7 @@ ecryptfs_get_key_payload_data(struct key *key)
233 233
234struct ecryptfs_key_sig { 234struct ecryptfs_key_sig {
235 struct list_head crypt_stat_list; 235 struct list_head crypt_stat_list;
236 char keysig[ECRYPTFS_SIG_SIZE_HEX]; 236 char keysig[ECRYPTFS_SIG_SIZE_HEX + 1];
237}; 237};
238 238
239struct ecryptfs_filename { 239struct ecryptfs_filename {
@@ -257,19 +257,18 @@ struct ecryptfs_filename {
257struct ecryptfs_crypt_stat { 257struct ecryptfs_crypt_stat {
258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 258#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001
259#define ECRYPTFS_POLICY_APPLIED 0x00000002 259#define ECRYPTFS_POLICY_APPLIED 0x00000002
260#define ECRYPTFS_NEW_FILE 0x00000004 260#define ECRYPTFS_ENCRYPTED 0x00000004
261#define ECRYPTFS_ENCRYPTED 0x00000008 261#define ECRYPTFS_SECURITY_WARNING 0x00000008
262#define ECRYPTFS_SECURITY_WARNING 0x00000010 262#define ECRYPTFS_ENABLE_HMAC 0x00000010
263#define ECRYPTFS_ENABLE_HMAC 0x00000020 263#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000020
264#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 264#define ECRYPTFS_KEY_VALID 0x00000040
265#define ECRYPTFS_KEY_VALID 0x00000080 265#define ECRYPTFS_METADATA_IN_XATTR 0x00000080
266#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 266#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000100
267#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 267#define ECRYPTFS_KEY_SET 0x00000200
268#define ECRYPTFS_KEY_SET 0x00000400 268#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000400
269#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 269#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00000800
270#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 270#define ECRYPTFS_ENCFN_USE_FEK 0x00001000
271#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 271#define ECRYPTFS_UNLINK_SIGS 0x00002000
272#define ECRYPTFS_UNLINK_SIGS 0x00004000
273 u32 flags; 272 u32 flags;
274 unsigned int file_version; 273 unsigned int file_version;
275 size_t iv_bytes; 274 size_t iv_bytes;
@@ -297,7 +296,6 @@ struct ecryptfs_inode_info {
297 struct inode vfs_inode; 296 struct inode vfs_inode;
298 struct inode *wii_inode; 297 struct inode *wii_inode;
299 struct file *lower_file; 298 struct file *lower_file;
300 struct mutex lower_file_mutex;
301 struct ecryptfs_crypt_stat crypt_stat; 299 struct ecryptfs_crypt_stat crypt_stat;
302}; 300};
303 301
@@ -333,7 +331,6 @@ struct ecryptfs_global_auth_tok {
333 u32 flags; 331 u32 flags;
334 struct list_head mount_crypt_stat_list; 332 struct list_head mount_crypt_stat_list;
335 struct key *global_auth_tok_key; 333 struct key *global_auth_tok_key;
336 struct ecryptfs_auth_tok *global_auth_tok;
337 unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1]; 334 unsigned char sig[ECRYPTFS_SIG_SIZE_HEX + 1];
338}; 335};
339 336
@@ -380,7 +377,6 @@ struct ecryptfs_mount_crypt_stat {
380 u32 flags; 377 u32 flags;
381 struct list_head global_auth_tok_list; 378 struct list_head global_auth_tok_list;
382 struct mutex global_auth_tok_list_mutex; 379 struct mutex global_auth_tok_list_mutex;
383 size_t num_global_auth_toks;
384 size_t global_default_cipher_key_size; 380 size_t global_default_cipher_key_size;
385 size_t global_default_fn_cipher_key_bytes; 381 size_t global_default_fn_cipher_key_bytes;
386 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE 382 unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE
@@ -632,8 +628,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
632 u32 flags); 628 u32 flags);
633int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 629int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
634 struct dentry *lower_dentry, 630 struct dentry *lower_dentry,
635 struct inode *ecryptfs_dir_inode, 631 struct inode *ecryptfs_dir_inode);
636 struct nameidata *ecryptfs_nd);
637int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, 632int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
638 size_t *decrypted_name_size, 633 size_t *decrypted_name_size,
639 struct dentry *ecryptfs_dentry, 634 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a9443..cedc913d11ba 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -273,7 +273,14 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
273static int 273static int
274ecryptfs_fsync(struct file *file, int datasync) 274ecryptfs_fsync(struct file *file, int datasync)
275{ 275{
276 return vfs_fsync(ecryptfs_file_to_lower(file), datasync); 276 int rc = 0;
277
278 rc = generic_file_fsync(file, datasync);
279 if (rc)
280 goto out;
281 rc = vfs_fsync(ecryptfs_file_to_lower(file), datasync);
282out:
283 return rc;
277} 284}
278 285
279static int ecryptfs_fasync(int fd, struct file *file, int flag) 286static int ecryptfs_fasync(int fd, struct file *file, int flag)
@@ -317,6 +324,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
317 324
318const struct file_operations ecryptfs_dir_fops = { 325const struct file_operations ecryptfs_dir_fops = {
319 .readdir = ecryptfs_readdir, 326 .readdir = ecryptfs_readdir,
327 .read = generic_read_dir,
320 .unlocked_ioctl = ecryptfs_unlocked_ioctl, 328 .unlocked_ioctl = ecryptfs_unlocked_ioctl,
321#ifdef CONFIG_COMPAT 329#ifdef CONFIG_COMPAT
322 .compat_ioctl = ecryptfs_compat_ioctl, 330 .compat_ioctl = ecryptfs_compat_ioctl,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a1907..f99051b7adab 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
74 unsigned int flags_save; 74 unsigned int flags_save;
75 int rc; 75 int rc;
76 76
77 dentry_save = nd->path.dentry; 77 if (nd) {
78 vfsmount_save = nd->path.mnt; 78 dentry_save = nd->path.dentry;
79 flags_save = nd->flags; 79 vfsmount_save = nd->path.mnt;
80 nd->path.dentry = lower_dentry; 80 flags_save = nd->flags;
81 nd->path.mnt = lower_mnt; 81 nd->path.dentry = lower_dentry;
82 nd->flags &= ~LOOKUP_OPEN; 82 nd->path.mnt = lower_mnt;
83 nd->flags &= ~LOOKUP_OPEN;
84 }
83 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd); 85 rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
84 nd->path.dentry = dentry_save; 86 if (nd) {
85 nd->path.mnt = vfsmount_save; 87 nd->path.dentry = dentry_save;
86 nd->flags = flags_save; 88 nd->path.mnt = vfsmount_save;
89 nd->flags = flags_save;
90 }
87 return rc; 91 return rc;
88} 92}
89 93
@@ -139,26 +143,6 @@ out:
139} 143}
140 144
141/** 145/**
142 * grow_file
143 * @ecryptfs_dentry: the eCryptfs dentry
144 *
145 * This is the code which will grow the file to its correct size.
146 */
147static int grow_file(struct dentry *ecryptfs_dentry)
148{
149 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
150 char zero_virt[] = { 0x00 };
151 int rc = 0;
152
153 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
154 i_size_write(ecryptfs_inode, 0);
155 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
156 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
157 ECRYPTFS_NEW_FILE;
158 return rc;
159}
160
161/**
162 * ecryptfs_initialize_file 146 * ecryptfs_initialize_file
163 * 147 *
164 * Cause the file to be changed from a basic empty file to an ecryptfs 148 * Cause the file to be changed from a basic empty file to an ecryptfs
@@ -177,7 +161,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
177 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); 161 crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED);
178 goto out; 162 goto out;
179 } 163 }
180 crypt_stat->flags |= ECRYPTFS_NEW_FILE;
181 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n"); 164 ecryptfs_printk(KERN_DEBUG, "Initializing crypto context\n");
182 rc = ecryptfs_new_file_context(ecryptfs_dentry); 165 rc = ecryptfs_new_file_context(ecryptfs_dentry);
183 if (rc) { 166 if (rc) {
@@ -198,9 +181,6 @@ static int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry)
198 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc); 181 printk(KERN_ERR "Error writing headers; rc = [%d]\n", rc);
199 goto out; 182 goto out;
200 } 183 }
201 rc = grow_file(ecryptfs_dentry);
202 if (rc)
203 printk(KERN_ERR "Error growing file; rc = [%d]\n", rc);
204out: 184out:
205 return rc; 185 return rc;
206} 186}
@@ -241,8 +221,7 @@ out:
241 */ 221 */
242int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, 222int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
243 struct dentry *lower_dentry, 223 struct dentry *lower_dentry,
244 struct inode *ecryptfs_dir_inode, 224 struct inode *ecryptfs_dir_inode)
245 struct nameidata *ecryptfs_nd)
246{ 225{
247 struct dentry *lower_dir_dentry; 226 struct dentry *lower_dir_dentry;
248 struct vfsmount *lower_mnt; 227 struct vfsmount *lower_mnt;
@@ -290,8 +269,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
290 goto out; 269 goto out;
291 if (special_file(lower_inode->i_mode)) 270 if (special_file(lower_inode->i_mode))
292 goto out; 271 goto out;
293 if (!ecryptfs_nd)
294 goto out;
295 /* Released in this function */ 272 /* Released in this function */
296 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); 273 page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
297 if (!page_virt) { 274 if (!page_virt) {
@@ -349,75 +326,6 @@ out:
349} 326}
350 327
351/** 328/**
352 * ecryptfs_new_lower_dentry
353 * @name: The name of the new dentry.
354 * @lower_dir_dentry: Parent directory of the new dentry.
355 * @nd: nameidata from last lookup.
356 *
357 * Create a new dentry or get it from lower parent dir.
358 */
359static struct dentry *
360ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
361 struct nameidata *nd)
362{
363 struct dentry *new_dentry;
364 struct dentry *tmp;
365 struct inode *lower_dir_inode;
366
367 lower_dir_inode = lower_dir_dentry->d_inode;
368
369 tmp = d_alloc(lower_dir_dentry, name);
370 if (!tmp)
371 return ERR_PTR(-ENOMEM);
372
373 mutex_lock(&lower_dir_inode->i_mutex);
374 new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
375 mutex_unlock(&lower_dir_inode->i_mutex);
376
377 if (!new_dentry)
378 new_dentry = tmp;
379 else
380 dput(tmp);
381
382 return new_dentry;
383}
384
385
386/**
387 * ecryptfs_lookup_one_lower
388 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
389 * @lower_dir_dentry: lower parent directory
390 * @name: lower file name
391 *
392 * Get the lower dentry from vfs. If lower dentry does not exist yet,
393 * create it.
394 */
395static struct dentry *
396ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
397 struct dentry *lower_dir_dentry, struct qstr *name)
398{
399 struct nameidata nd;
400 struct vfsmount *lower_mnt;
401 int err;
402
403 lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
404 ecryptfs_dentry->d_parent));
405 err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
406 mntput(lower_mnt);
407
408 if (!err) {
409 /* we dont need the mount */
410 mntput(nd.path.mnt);
411 return nd.path.dentry;
412 }
413 if (err != -ENOENT)
414 return ERR_PTR(err);
415
416 /* create a new lower dentry */
417 return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
418}
419
420/**
421 * ecryptfs_lookup 329 * ecryptfs_lookup
422 * @ecryptfs_dir_inode: The eCryptfs directory inode 330 * @ecryptfs_dir_inode: The eCryptfs directory inode
423 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up 331 * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
@@ -434,7 +342,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
434 size_t encrypted_and_encoded_name_size; 342 size_t encrypted_and_encoded_name_size;
435 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; 343 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
436 struct dentry *lower_dir_dentry, *lower_dentry; 344 struct dentry *lower_dir_dentry, *lower_dentry;
437 struct qstr lower_name;
438 int rc = 0; 345 int rc = 0;
439 346
440 if ((ecryptfs_dentry->d_name.len == 1 347 if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +351,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
444 goto out_d_drop; 351 goto out_d_drop;
445 } 352 }
446 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); 353 lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
447 lower_name.name = ecryptfs_dentry->d_name.name; 354 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
448 lower_name.len = ecryptfs_dentry->d_name.len; 355 lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
449 lower_name.hash = ecryptfs_dentry->d_name.hash; 356 lower_dir_dentry,
450 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 357 ecryptfs_dentry->d_name.len);
451 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 358 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
452 lower_dir_dentry->d_inode, &lower_name);
453 if (rc < 0)
454 goto out_d_drop;
455 }
456 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
457 lower_dir_dentry, &lower_name);
458 if (IS_ERR(lower_dentry)) { 359 if (IS_ERR(lower_dentry)) {
459 rc = PTR_ERR(lower_dentry); 360 rc = PTR_ERR(lower_dentry);
460 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 361 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
461 "[%d] on lower_dentry = [%s]\n", __func__, rc, 362 "[%d] on lower_dentry = [%s]\n", __func__, rc,
462 encrypted_and_encoded_name); 363 encrypted_and_encoded_name);
463 goto out_d_drop; 364 goto out_d_drop;
@@ -479,28 +380,21 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
479 "filename; rc = [%d]\n", __func__, rc); 380 "filename; rc = [%d]\n", __func__, rc);
480 goto out_d_drop; 381 goto out_d_drop;
481 } 382 }
482 lower_name.name = encrypted_and_encoded_name; 383 mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
483 lower_name.len = encrypted_and_encoded_name_size; 384 lower_dentry = lookup_one_len(encrypted_and_encoded_name,
484 lower_name.hash = full_name_hash(lower_name.name, lower_name.len); 385 lower_dir_dentry,
485 if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { 386 encrypted_and_encoded_name_size);
486 rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, 387 mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
487 lower_dir_dentry->d_inode, &lower_name);
488 if (rc < 0)
489 goto out_d_drop;
490 }
491 lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
492 lower_dir_dentry, &lower_name);
493 if (IS_ERR(lower_dentry)) { 388 if (IS_ERR(lower_dentry)) {
494 rc = PTR_ERR(lower_dentry); 389 rc = PTR_ERR(lower_dentry);
495 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " 390 ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
496 "[%d] on lower_dentry = [%s]\n", __func__, rc, 391 "[%d] on lower_dentry = [%s]\n", __func__, rc,
497 encrypted_and_encoded_name); 392 encrypted_and_encoded_name);
498 goto out_d_drop; 393 goto out_d_drop;
499 } 394 }
500lookup_and_interpose: 395lookup_and_interpose:
501 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, 396 rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
502 ecryptfs_dir_inode, 397 ecryptfs_dir_inode);
503 ecryptfs_nd);
504 goto out; 398 goto out;
505out_d_drop: 399out_d_drop:
506 d_drop(ecryptfs_dentry); 400 d_drop(ecryptfs_dentry);
@@ -1092,6 +986,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1092 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), 986 rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
1093 ecryptfs_dentry_to_lower(dentry), &lower_stat); 987 ecryptfs_dentry_to_lower(dentry), &lower_stat);
1094 if (!rc) { 988 if (!rc) {
989 fsstack_copy_attr_all(dentry->d_inode,
990 ecryptfs_inode_to_lower(dentry->d_inode));
1095 generic_fillattr(dentry->d_inode, stat); 991 generic_fillattr(dentry->d_inode, stat);
1096 stat->blocks = lower_stat.blocks; 992 stat->blocks = lower_stat.blocks;
1097 } 993 }
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index c1436cff6f2d..03e609c45012 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -65,6 +65,24 @@ static int process_request_key_err(long err_code)
65 return rc; 65 return rc;
66} 66}
67 67
68static int process_find_global_auth_tok_for_sig_err(int err_code)
69{
70 int rc = err_code;
71
72 switch (err_code) {
73 case -ENOENT:
74 ecryptfs_printk(KERN_WARNING, "Missing auth tok\n");
75 break;
76 case -EINVAL:
77 ecryptfs_printk(KERN_WARNING, "Invalid auth tok\n");
78 break;
79 default:
80 rc = process_request_key_err(err_code);
81 break;
82 }
83 return rc;
84}
85
68/** 86/**
69 * ecryptfs_parse_packet_length 87 * ecryptfs_parse_packet_length
70 * @data: Pointer to memory containing length at offset 88 * @data: Pointer to memory containing length at offset
@@ -403,27 +421,120 @@ out:
403 return rc; 421 return rc;
404} 422}
405 423
424/**
425 * ecryptfs_verify_version
426 * @version: The version number to confirm
427 *
428 * Returns zero on good version; non-zero otherwise
429 */
430static int ecryptfs_verify_version(u16 version)
431{
432 int rc = 0;
433 unsigned char major;
434 unsigned char minor;
435
436 major = ((version >> 8) & 0xFF);
437 minor = (version & 0xFF);
438 if (major != ECRYPTFS_VERSION_MAJOR) {
439 ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
440 "Expected [%d]; got [%d]\n",
441 ECRYPTFS_VERSION_MAJOR, major);
442 rc = -EINVAL;
443 goto out;
444 }
445 if (minor != ECRYPTFS_VERSION_MINOR) {
446 ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
447 "Expected [%d]; got [%d]\n",
448 ECRYPTFS_VERSION_MINOR, minor);
449 rc = -EINVAL;
450 goto out;
451 }
452out:
453 return rc;
454}
455
456/**
457 * ecryptfs_verify_auth_tok_from_key
458 * @auth_tok_key: key containing the authentication token
459 * @auth_tok: authentication token
460 *
461 * Returns zero on valid auth tok; -EINVAL otherwise
462 */
463static int
464ecryptfs_verify_auth_tok_from_key(struct key *auth_tok_key,
465 struct ecryptfs_auth_tok **auth_tok)
466{
467 int rc = 0;
468
469 (*auth_tok) = ecryptfs_get_key_payload_data(auth_tok_key);
470 if (ecryptfs_verify_version((*auth_tok)->version)) {
471 printk(KERN_ERR "Data structure version mismatch. Userspace "
472 "tools must match eCryptfs kernel module with major "
473 "version [%d] and minor version [%d]\n",
474 ECRYPTFS_VERSION_MAJOR, ECRYPTFS_VERSION_MINOR);
475 rc = -EINVAL;
476 goto out;
477 }
478 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
479 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
480 printk(KERN_ERR "Invalid auth_tok structure "
481 "returned from key query\n");
482 rc = -EINVAL;
483 goto out;
484 }
485out:
486 return rc;
487}
488
406static int 489static int
407ecryptfs_find_global_auth_tok_for_sig( 490ecryptfs_find_global_auth_tok_for_sig(
408 struct ecryptfs_global_auth_tok **global_auth_tok, 491 struct key **auth_tok_key,
492 struct ecryptfs_auth_tok **auth_tok,
409 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) 493 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig)
410{ 494{
411 struct ecryptfs_global_auth_tok *walker; 495 struct ecryptfs_global_auth_tok *walker;
412 int rc = 0; 496 int rc = 0;
413 497
414 (*global_auth_tok) = NULL; 498 (*auth_tok_key) = NULL;
499 (*auth_tok) = NULL;
415 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 500 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
416 list_for_each_entry(walker, 501 list_for_each_entry(walker,
417 &mount_crypt_stat->global_auth_tok_list, 502 &mount_crypt_stat->global_auth_tok_list,
418 mount_crypt_stat_list) { 503 mount_crypt_stat_list) {
419 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { 504 if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX))
420 rc = key_validate(walker->global_auth_tok_key); 505 continue;
421 if (!rc) 506
422 (*global_auth_tok) = walker; 507 if (walker->flags & ECRYPTFS_AUTH_TOK_INVALID) {
508 rc = -EINVAL;
423 goto out; 509 goto out;
424 } 510 }
511
512 rc = key_validate(walker->global_auth_tok_key);
513 if (rc) {
514 if (rc == -EKEYEXPIRED)
515 goto out;
516 goto out_invalid_auth_tok;
517 }
518
519 down_write(&(walker->global_auth_tok_key->sem));
520 rc = ecryptfs_verify_auth_tok_from_key(
521 walker->global_auth_tok_key, auth_tok);
522 if (rc)
523 goto out_invalid_auth_tok_unlock;
524
525 (*auth_tok_key) = walker->global_auth_tok_key;
526 key_get(*auth_tok_key);
527 goto out;
425 } 528 }
426 rc = -EINVAL; 529 rc = -ENOENT;
530 goto out;
531out_invalid_auth_tok_unlock:
532 up_write(&(walker->global_auth_tok_key->sem));
533out_invalid_auth_tok:
534 printk(KERN_WARNING "Invalidating auth tok with sig = [%s]\n", sig);
535 walker->flags |= ECRYPTFS_AUTH_TOK_INVALID;
536 key_put(walker->global_auth_tok_key);
537 walker->global_auth_tok_key = NULL;
427out: 538out:
428 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 539 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
429 return rc; 540 return rc;
@@ -451,14 +562,11 @@ ecryptfs_find_auth_tok_for_sig(
451 struct ecryptfs_mount_crypt_stat *mount_crypt_stat, 562 struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
452 char *sig) 563 char *sig)
453{ 564{
454 struct ecryptfs_global_auth_tok *global_auth_tok;
455 int rc = 0; 565 int rc = 0;
456 566
457 (*auth_tok_key) = NULL; 567 rc = ecryptfs_find_global_auth_tok_for_sig(auth_tok_key, auth_tok,
458 (*auth_tok) = NULL; 568 mount_crypt_stat, sig);
459 if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 569 if (rc == -ENOENT) {
460 mount_crypt_stat, sig)) {
461
462 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the 570 /* if the flag ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY is set in the
463 * mount_crypt_stat structure, we prevent to use auth toks that 571 * mount_crypt_stat structure, we prevent to use auth toks that
464 * are not inserted through the ecryptfs_add_global_auth_tok 572 * are not inserted through the ecryptfs_add_global_auth_tok
@@ -470,8 +578,7 @@ ecryptfs_find_auth_tok_for_sig(
470 578
471 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok, 579 rc = ecryptfs_keyring_auth_tok_for_sig(auth_tok_key, auth_tok,
472 sig); 580 sig);
473 } else 581 }
474 (*auth_tok) = global_auth_tok->global_auth_tok;
475 return rc; 582 return rc;
476} 583}
477 584
@@ -531,6 +638,16 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
531 } 638 }
532 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; 639 s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
533 (*packet_size) = 0; 640 (*packet_size) = 0;
641 rc = ecryptfs_find_auth_tok_for_sig(
642 &auth_tok_key,
643 &s->auth_tok, mount_crypt_stat,
644 mount_crypt_stat->global_default_fnek_sig);
645 if (rc) {
646 printk(KERN_ERR "%s: Error attempting to find auth tok for "
647 "fnek sig [%s]; rc = [%d]\n", __func__,
648 mount_crypt_stat->global_default_fnek_sig, rc);
649 goto out;
650 }
534 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( 651 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
535 &s->desc.tfm, 652 &s->desc.tfm,
536 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); 653 &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name);
@@ -616,16 +733,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
616 goto out_free_unlock; 733 goto out_free_unlock;
617 } 734 }
618 dest[s->i++] = s->cipher_code; 735 dest[s->i++] = s->cipher_code;
619 rc = ecryptfs_find_auth_tok_for_sig(
620 &auth_tok_key,
621 &s->auth_tok, mount_crypt_stat,
622 mount_crypt_stat->global_default_fnek_sig);
623 if (rc) {
624 printk(KERN_ERR "%s: Error attempting to find auth tok for "
625 "fnek sig [%s]; rc = [%d]\n", __func__,
626 mount_crypt_stat->global_default_fnek_sig, rc);
627 goto out_free_unlock;
628 }
629 /* TODO: Support other key modules than passphrase for 736 /* TODO: Support other key modules than passphrase for
630 * filename encryption */ 737 * filename encryption */
631 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { 738 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -765,8 +872,10 @@ out_free_unlock:
765out_unlock: 872out_unlock:
766 mutex_unlock(s->tfm_mutex); 873 mutex_unlock(s->tfm_mutex);
767out: 874out:
768 if (auth_tok_key) 875 if (auth_tok_key) {
876 up_write(&(auth_tok_key->sem));
769 key_put(auth_tok_key); 877 key_put(auth_tok_key);
878 }
770 kfree(s); 879 kfree(s);
771 return rc; 880 return rc;
772} 881}
@@ -879,6 +988,15 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
879 __func__, s->cipher_code); 988 __func__, s->cipher_code);
880 goto out; 989 goto out;
881 } 990 }
991 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
992 &s->auth_tok, mount_crypt_stat,
993 s->fnek_sig_hex);
994 if (rc) {
995 printk(KERN_ERR "%s: Error attempting to find auth tok for "
996 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
997 rc);
998 goto out;
999 }
882 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, 1000 rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
883 &s->tfm_mutex, 1001 &s->tfm_mutex,
884 s->cipher_string); 1002 s->cipher_string);
@@ -925,15 +1043,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
925 * >= ECRYPTFS_MAX_IV_BYTES. */ 1043 * >= ECRYPTFS_MAX_IV_BYTES. */
926 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); 1044 memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES);
927 s->desc.info = s->iv; 1045 s->desc.info = s->iv;
928 rc = ecryptfs_find_auth_tok_for_sig(&auth_tok_key,
929 &s->auth_tok, mount_crypt_stat,
930 s->fnek_sig_hex);
931 if (rc) {
932 printk(KERN_ERR "%s: Error attempting to find auth tok for "
933 "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
934 rc);
935 goto out_free_unlock;
936 }
937 /* TODO: Support other key modules than passphrase for 1046 /* TODO: Support other key modules than passphrase for
938 * filename encryption */ 1047 * filename encryption */
939 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) { 1048 if (s->auth_tok->token_type != ECRYPTFS_PASSWORD) {
@@ -1002,8 +1111,10 @@ out:
1002 (*filename_size) = 0; 1111 (*filename_size) = 0;
1003 (*filename) = NULL; 1112 (*filename) = NULL;
1004 } 1113 }
1005 if (auth_tok_key) 1114 if (auth_tok_key) {
1115 up_write(&(auth_tok_key->sem));
1006 key_put(auth_tok_key); 1116 key_put(auth_tok_key);
1117 }
1007 kfree(s); 1118 kfree(s);
1008 return rc; 1119 return rc;
1009} 1120}
@@ -1520,38 +1631,6 @@ out:
1520 return rc; 1631 return rc;
1521} 1632}
1522 1633
1523/**
1524 * ecryptfs_verify_version
1525 * @version: The version number to confirm
1526 *
1527 * Returns zero on good version; non-zero otherwise
1528 */
1529static int ecryptfs_verify_version(u16 version)
1530{
1531 int rc = 0;
1532 unsigned char major;
1533 unsigned char minor;
1534
1535 major = ((version >> 8) & 0xFF);
1536 minor = (version & 0xFF);
1537 if (major != ECRYPTFS_VERSION_MAJOR) {
1538 ecryptfs_printk(KERN_ERR, "Major version number mismatch. "
1539 "Expected [%d]; got [%d]\n",
1540 ECRYPTFS_VERSION_MAJOR, major);
1541 rc = -EINVAL;
1542 goto out;
1543 }
1544 if (minor != ECRYPTFS_VERSION_MINOR) {
1545 ecryptfs_printk(KERN_ERR, "Minor version number mismatch. "
1546 "Expected [%d]; got [%d]\n",
1547 ECRYPTFS_VERSION_MINOR, minor);
1548 rc = -EINVAL;
1549 goto out;
1550 }
1551out:
1552 return rc;
1553}
1554
1555int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key, 1634int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1556 struct ecryptfs_auth_tok **auth_tok, 1635 struct ecryptfs_auth_tok **auth_tok,
1557 char *sig) 1636 char *sig)
@@ -1563,31 +1642,16 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
1563 printk(KERN_ERR "Could not find key with description: [%s]\n", 1642 printk(KERN_ERR "Could not find key with description: [%s]\n",
1564 sig); 1643 sig);
1565 rc = process_request_key_err(PTR_ERR(*auth_tok_key)); 1644 rc = process_request_key_err(PTR_ERR(*auth_tok_key));
1645 (*auth_tok_key) = NULL;
1566 goto out; 1646 goto out;
1567 } 1647 }
1568 (*auth_tok) = ecryptfs_get_key_payload_data(*auth_tok_key); 1648 down_write(&(*auth_tok_key)->sem);
1569 if (ecryptfs_verify_version((*auth_tok)->version)) { 1649 rc = ecryptfs_verify_auth_tok_from_key(*auth_tok_key, auth_tok);
1570 printk(KERN_ERR
1571 "Data structure version mismatch. "
1572 "Userspace tools must match eCryptfs "
1573 "kernel module with major version [%d] "
1574 "and minor version [%d]\n",
1575 ECRYPTFS_VERSION_MAJOR,
1576 ECRYPTFS_VERSION_MINOR);
1577 rc = -EINVAL;
1578 goto out_release_key;
1579 }
1580 if ((*auth_tok)->token_type != ECRYPTFS_PASSWORD
1581 && (*auth_tok)->token_type != ECRYPTFS_PRIVATE_KEY) {
1582 printk(KERN_ERR "Invalid auth_tok structure "
1583 "returned from key query\n");
1584 rc = -EINVAL;
1585 goto out_release_key;
1586 }
1587out_release_key:
1588 if (rc) { 1650 if (rc) {
1651 up_write(&(*auth_tok_key)->sem);
1589 key_put(*auth_tok_key); 1652 key_put(*auth_tok_key);
1590 (*auth_tok_key) = NULL; 1653 (*auth_tok_key) = NULL;
1654 goto out;
1591 } 1655 }
1592out: 1656out:
1593 return rc; 1657 return rc;
@@ -1809,6 +1873,7 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat,
1809find_next_matching_auth_tok: 1873find_next_matching_auth_tok:
1810 found_auth_tok = 0; 1874 found_auth_tok = 0;
1811 if (auth_tok_key) { 1875 if (auth_tok_key) {
1876 up_write(&(auth_tok_key->sem));
1812 key_put(auth_tok_key); 1877 key_put(auth_tok_key);
1813 auth_tok_key = NULL; 1878 auth_tok_key = NULL;
1814 } 1879 }
@@ -1895,8 +1960,10 @@ found_matching_auth_tok:
1895out_wipe_list: 1960out_wipe_list:
1896 wipe_auth_tok_list(&auth_tok_list); 1961 wipe_auth_tok_list(&auth_tok_list);
1897out: 1962out:
1898 if (auth_tok_key) 1963 if (auth_tok_key) {
1964 up_write(&(auth_tok_key->sem));
1899 key_put(auth_tok_key); 1965 key_put(auth_tok_key);
1966 }
1900 return rc; 1967 return rc;
1901} 1968}
1902 1969
@@ -2324,7 +2391,7 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2324 size_t max) 2391 size_t max)
2325{ 2392{
2326 struct ecryptfs_auth_tok *auth_tok; 2393 struct ecryptfs_auth_tok *auth_tok;
2327 struct ecryptfs_global_auth_tok *global_auth_tok; 2394 struct key *auth_tok_key = NULL;
2328 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 2395 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
2329 &ecryptfs_superblock_to_private( 2396 &ecryptfs_superblock_to_private(
2330 ecryptfs_dentry->d_sb)->mount_crypt_stat; 2397 ecryptfs_dentry->d_sb)->mount_crypt_stat;
@@ -2343,21 +2410,16 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2343 list_for_each_entry(key_sig, &crypt_stat->keysig_list, 2410 list_for_each_entry(key_sig, &crypt_stat->keysig_list,
2344 crypt_stat_list) { 2411 crypt_stat_list) {
2345 memset(key_rec, 0, sizeof(*key_rec)); 2412 memset(key_rec, 0, sizeof(*key_rec));
2346 rc = ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, 2413 rc = ecryptfs_find_global_auth_tok_for_sig(&auth_tok_key,
2414 &auth_tok,
2347 mount_crypt_stat, 2415 mount_crypt_stat,
2348 key_sig->keysig); 2416 key_sig->keysig);
2349 if (rc) { 2417 if (rc) {
2350 printk(KERN_ERR "Error attempting to get the global " 2418 printk(KERN_WARNING "Unable to retrieve auth tok with "
2351 "auth_tok; rc = [%d]\n", rc); 2419 "sig = [%s]\n", key_sig->keysig);
2420 rc = process_find_global_auth_tok_for_sig_err(rc);
2352 goto out_free; 2421 goto out_free;
2353 } 2422 }
2354 if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_INVALID) {
2355 printk(KERN_WARNING
2356 "Skipping invalid auth tok with sig = [%s]\n",
2357 global_auth_tok->sig);
2358 continue;
2359 }
2360 auth_tok = global_auth_tok->global_auth_tok;
2361 if (auth_tok->token_type == ECRYPTFS_PASSWORD) { 2423 if (auth_tok->token_type == ECRYPTFS_PASSWORD) {
2362 rc = write_tag_3_packet((dest_base + (*len)), 2424 rc = write_tag_3_packet((dest_base + (*len)),
2363 &max, auth_tok, 2425 &max, auth_tok,
@@ -2395,6 +2457,9 @@ ecryptfs_generate_key_packet_set(char *dest_base,
2395 rc = -EINVAL; 2457 rc = -EINVAL;
2396 goto out_free; 2458 goto out_free;
2397 } 2459 }
2460 up_write(&(auth_tok_key->sem));
2461 key_put(auth_tok_key);
2462 auth_tok_key = NULL;
2398 } 2463 }
2399 if (likely(max > 0)) { 2464 if (likely(max > 0)) {
2400 dest_base[(*len)] = 0x00; 2465 dest_base[(*len)] = 0x00;
@@ -2407,6 +2472,11 @@ out_free:
2407out: 2472out:
2408 if (rc) 2473 if (rc)
2409 (*len) = 0; 2474 (*len) = 0;
2475 if (auth_tok_key) {
2476 up_write(&(auth_tok_key->sem));
2477 key_put(auth_tok_key);
2478 }
2479
2410 mutex_unlock(&crypt_stat->keysig_list_mutex); 2480 mutex_unlock(&crypt_stat->keysig_list_mutex);
2411 return rc; 2481 return rc;
2412} 2482}
@@ -2424,6 +2494,7 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig)
2424 return -ENOMEM; 2494 return -ENOMEM;
2425 } 2495 }
2426 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); 2496 memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX);
2497 new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0';
2427 /* Caller must hold keysig_list_mutex */ 2498 /* Caller must hold keysig_list_mutex */
2428 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list); 2499 list_add(&new_key_sig->crypt_stat_list, &crypt_stat->keysig_list);
2429 2500
@@ -2453,7 +2524,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat,
2453 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); 2524 mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex);
2454 list_add(&new_auth_tok->mount_crypt_stat_list, 2525 list_add(&new_auth_tok->mount_crypt_stat_list,
2455 &mount_crypt_stat->global_auth_tok_list); 2526 &mount_crypt_stat->global_auth_tok_list);
2456 mount_crypt_stat->num_global_auth_toks++;
2457 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); 2527 mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex);
2458out: 2528out:
2459 return rc; 2529 return rc;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 758323a0f09a..fdb2eb0ad09e 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -122,7 +122,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
122 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); 122 ecryptfs_inode_to_private(ecryptfs_dentry->d_inode);
123 int rc = 0; 123 int rc = 0;
124 124
125 mutex_lock(&inode_info->lower_file_mutex);
126 if (!inode_info->lower_file) { 125 if (!inode_info->lower_file) {
127 struct dentry *lower_dentry; 126 struct dentry *lower_dentry;
128 struct vfsmount *lower_mnt = 127 struct vfsmount *lower_mnt =
@@ -138,7 +137,6 @@ int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry)
138 inode_info->lower_file = NULL; 137 inode_info->lower_file = NULL;
139 } 138 }
140 } 139 }
141 mutex_unlock(&inode_info->lower_file_mutex);
142 return rc; 140 return rc;
143} 141}
144 142
@@ -241,14 +239,14 @@ static int ecryptfs_init_global_auth_toks(
241 struct ecryptfs_mount_crypt_stat *mount_crypt_stat) 239 struct ecryptfs_mount_crypt_stat *mount_crypt_stat)
242{ 240{
243 struct ecryptfs_global_auth_tok *global_auth_tok; 241 struct ecryptfs_global_auth_tok *global_auth_tok;
242 struct ecryptfs_auth_tok *auth_tok;
244 int rc = 0; 243 int rc = 0;
245 244
246 list_for_each_entry(global_auth_tok, 245 list_for_each_entry(global_auth_tok,
247 &mount_crypt_stat->global_auth_tok_list, 246 &mount_crypt_stat->global_auth_tok_list,
248 mount_crypt_stat_list) { 247 mount_crypt_stat_list) {
249 rc = ecryptfs_keyring_auth_tok_for_sig( 248 rc = ecryptfs_keyring_auth_tok_for_sig(
250 &global_auth_tok->global_auth_tok_key, 249 &global_auth_tok->global_auth_tok_key, &auth_tok,
251 &global_auth_tok->global_auth_tok,
252 global_auth_tok->sig); 250 global_auth_tok->sig);
253 if (rc) { 251 if (rc) {
254 printk(KERN_ERR "Could not find valid key in user " 252 printk(KERN_ERR "Could not find valid key in user "
@@ -256,8 +254,10 @@ static int ecryptfs_init_global_auth_toks(
256 "option: [%s]\n", global_auth_tok->sig); 254 "option: [%s]\n", global_auth_tok->sig);
257 global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; 255 global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID;
258 goto out; 256 goto out;
259 } else 257 } else {
260 global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; 258 global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID;
259 up_write(&(global_auth_tok->global_auth_tok_key)->sem);
260 }
261 } 261 }
262out: 262out:
263 return rc; 263 return rc;
@@ -276,7 +276,7 @@ static void ecryptfs_init_mount_crypt_stat(
276/** 276/**
277 * ecryptfs_parse_options 277 * ecryptfs_parse_options
278 * @sb: The ecryptfs super block 278 * @sb: The ecryptfs super block
279 * @options: The options pased to the kernel 279 * @options: The options passed to the kernel
280 * 280 *
281 * Parse mount options: 281 * Parse mount options:
282 * debug=N - ecryptfs_verbosity level for debug output 282 * debug=N - ecryptfs_verbosity level for debug output
@@ -840,7 +840,7 @@ static int __init ecryptfs_init(void)
840 } 840 }
841 rc = ecryptfs_init_messaging(); 841 rc = ecryptfs_init_messaging();
842 if (rc) { 842 if (rc) {
843 printk(KERN_ERR "Failure occured while attempting to " 843 printk(KERN_ERR "Failure occurred while attempting to "
844 "initialize the communications channel to " 844 "initialize the communications channel to "
845 "ecryptfsd\n"); 845 "ecryptfsd\n");
846 goto out_destroy_kthread; 846 goto out_destroy_kthread;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index cc64fca89f8d..6a44148c5fb9 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -62,6 +62,18 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
62{ 62{
63 int rc; 63 int rc;
64 64
65 /*
66 * Refuse to write the page out if we are called from reclaim context
67 * since our writepage() path may potentially allocate memory when
68 * calling into the lower fs vfs_write() which may in turn invoke
69 * us again.
70 */
71 if (current->flags & PF_MEMALLOC) {
72 redirty_page_for_writepage(wbc, page);
73 rc = 0;
74 goto out;
75 }
76
65 rc = ecryptfs_encrypt_page(page); 77 rc = ecryptfs_encrypt_page(page);
66 if (rc) { 78 if (rc) {
67 ecryptfs_printk(KERN_WARNING, "Error encrypting " 79 ecryptfs_printk(KERN_WARNING, "Error encrypting "
@@ -70,8 +82,8 @@ static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
70 goto out; 82 goto out;
71 } 83 }
72 SetPageUptodate(page); 84 SetPageUptodate(page);
73 unlock_page(page);
74out: 85out:
86 unlock_page(page);
75 return rc; 87 return rc;
76} 88}
77 89
@@ -193,11 +205,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page)
193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat; 205 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
194 int rc = 0; 206 int rc = 0;
195 207
196 if (!crypt_stat 208 if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
197 || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
198 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
199 ecryptfs_printk(KERN_DEBUG,
200 "Passing through unencrypted page\n");
201 rc = ecryptfs_read_lower_page_segment(page, page->index, 0, 209 rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
202 PAGE_CACHE_SIZE, 210 PAGE_CACHE_SIZE,
203 page->mapping->host); 211 page->mapping->host);
@@ -295,8 +303,7 @@ static int ecryptfs_write_begin(struct file *file,
295 struct ecryptfs_crypt_stat *crypt_stat = 303 struct ecryptfs_crypt_stat *crypt_stat =
296 &ecryptfs_inode_to_private(mapping->host)->crypt_stat; 304 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
297 305
298 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 306 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
299 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
300 rc = ecryptfs_read_lower_page_segment( 307 rc = ecryptfs_read_lower_page_segment(
301 page, index, 0, PAGE_CACHE_SIZE, mapping->host); 308 page, index, 0, PAGE_CACHE_SIZE, mapping->host);
302 if (rc) { 309 if (rc) {
@@ -374,6 +381,11 @@ static int ecryptfs_write_begin(struct file *file,
374 && (pos != 0)) 381 && (pos != 0))
375 zero_user(page, 0, PAGE_CACHE_SIZE); 382 zero_user(page, 0, PAGE_CACHE_SIZE);
376out: 383out:
384 if (unlikely(rc)) {
385 unlock_page(page);
386 page_cache_release(page);
387 *pagep = NULL;
388 }
377 return rc; 389 return rc;
378} 390}
379 391
@@ -486,13 +498,8 @@ static int ecryptfs_write_end(struct file *file,
486 struct ecryptfs_crypt_stat *crypt_stat = 498 struct ecryptfs_crypt_stat *crypt_stat =
487 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; 499 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
488 int rc; 500 int rc;
501 int need_unlock_page = 1;
489 502
490 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
491 ecryptfs_printk(KERN_DEBUG, "ECRYPTFS_NEW_FILE flag set in "
492 "crypt_stat at memory location [%p]\n", crypt_stat);
493 crypt_stat->flags &= ~(ECRYPTFS_NEW_FILE);
494 } else
495 ecryptfs_printk(KERN_DEBUG, "Not a new file\n");
496 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" 503 ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page"
497 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to); 504 "(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
498 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 505 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
@@ -512,26 +519,26 @@ static int ecryptfs_write_end(struct file *file,
512 "zeros in page with index = [0x%.16lx]\n", index); 519 "zeros in page with index = [0x%.16lx]\n", index);
513 goto out; 520 goto out;
514 } 521 }
515 rc = ecryptfs_encrypt_page(page); 522 set_page_dirty(page);
516 if (rc) { 523 unlock_page(page);
517 ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " 524 need_unlock_page = 0;
518 "index [0x%.16lx])\n", index);
519 goto out;
520 }
521 if (pos + copied > i_size_read(ecryptfs_inode)) { 525 if (pos + copied > i_size_read(ecryptfs_inode)) {
522 i_size_write(ecryptfs_inode, pos + copied); 526 i_size_write(ecryptfs_inode, pos + copied);
523 ecryptfs_printk(KERN_DEBUG, "Expanded file size to " 527 ecryptfs_printk(KERN_DEBUG, "Expanded file size to "
524 "[0x%.16llx]\n", 528 "[0x%.16llx]\n",
525 (unsigned long long)i_size_read(ecryptfs_inode)); 529 (unsigned long long)i_size_read(ecryptfs_inode));
530 balance_dirty_pages_ratelimited(mapping);
531 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
532 if (rc) {
533 printk(KERN_ERR "Error writing inode size to metadata; "
534 "rc = [%d]\n", rc);
535 goto out;
536 }
526 } 537 }
527 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 538 rc = copied;
528 if (rc)
529 printk(KERN_ERR "Error writing inode size to metadata; "
530 "rc = [%d]\n", rc);
531 else
532 rc = copied;
533out: 539out:
534 unlock_page(page); 540 if (need_unlock_page)
541 unlock_page(page);
535 page_cache_release(page); 542 page_cache_release(page);
536 return rc; 543 return rc;
537} 544}
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index db184ef15d3d..85d430963116 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -44,15 +44,11 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
44 ssize_t rc; 44 ssize_t rc;
45 45
46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode); 46 inode_info = ecryptfs_inode_to_private(ecryptfs_inode);
47 mutex_lock(&inode_info->lower_file_mutex);
48 BUG_ON(!inode_info->lower_file); 47 BUG_ON(!inode_info->lower_file);
49 inode_info->lower_file->f_pos = offset;
50 fs_save = get_fs(); 48 fs_save = get_fs();
51 set_fs(get_ds()); 49 set_fs(get_ds());
52 rc = vfs_write(inode_info->lower_file, data, size, 50 rc = vfs_write(inode_info->lower_file, data, size, &offset);
53 &inode_info->lower_file->f_pos);
54 set_fs(fs_save); 51 set_fs(fs_save);
55 mutex_unlock(&inode_info->lower_file_mutex);
56 mark_inode_dirty_sync(ecryptfs_inode); 52 mark_inode_dirty_sync(ecryptfs_inode);
57 return rc; 53 return rc;
58} 54}
@@ -234,15 +230,11 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
234 mm_segment_t fs_save; 230 mm_segment_t fs_save;
235 ssize_t rc; 231 ssize_t rc;
236 232
237 mutex_lock(&inode_info->lower_file_mutex);
238 BUG_ON(!inode_info->lower_file); 233 BUG_ON(!inode_info->lower_file);
239 inode_info->lower_file->f_pos = offset;
240 fs_save = get_fs(); 234 fs_save = get_fs();
241 set_fs(get_ds()); 235 set_fs(get_ds());
242 rc = vfs_read(inode_info->lower_file, data, size, 236 rc = vfs_read(inode_info->lower_file, data, size, &offset);
243 &inode_info->lower_file->f_pos);
244 set_fs(fs_save); 237 set_fs(fs_save);
245 mutex_unlock(&inode_info->lower_file_mutex);
246 return rc; 238 return rc;
247} 239}
248 240
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 3042fe123a34..bacc882e1ae4 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -55,7 +55,6 @@ static struct inode *ecryptfs_alloc_inode(struct super_block *sb)
55 if (unlikely(!inode_info)) 55 if (unlikely(!inode_info))
56 goto out; 56 goto out;
57 ecryptfs_init_crypt_stat(&inode_info->crypt_stat); 57 ecryptfs_init_crypt_stat(&inode_info->crypt_stat);
58 mutex_init(&inode_info->lower_file_mutex);
59 inode_info->lower_file = NULL; 58 inode_info->lower_file = NULL;
60 inode = &inode_info->vfs_inode; 59 inode = &inode_info->vfs_inode;
61out: 60out:
@@ -198,7 +197,7 @@ static int ecryptfs_show_options(struct seq_file *m, struct vfsmount *mnt)
198const struct super_operations ecryptfs_sops = { 197const struct super_operations ecryptfs_sops = {
199 .alloc_inode = ecryptfs_alloc_inode, 198 .alloc_inode = ecryptfs_alloc_inode,
200 .destroy_inode = ecryptfs_destroy_inode, 199 .destroy_inode = ecryptfs_destroy_inode,
201 .drop_inode = generic_delete_inode, 200 .drop_inode = generic_drop_inode,
202 .statfs = ecryptfs_statfs, 201 .statfs = ecryptfs_statfs,
203 .remount_fs = NULL, 202 .remount_fs = NULL,
204 .evict_inode = ecryptfs_evict_inode, 203 .evict_inode = ecryptfs_evict_inode,
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index a8e7797b9477..9c13412e6c99 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -23,7 +23,6 @@ static sector_t _efs_bmap(struct address_space *mapping, sector_t block)
23} 23}
24static const struct address_space_operations efs_aops = { 24static const struct address_space_operations efs_aops = {
25 .readpage = efs_readpage, 25 .readpage = efs_readpage,
26 .sync_page = block_sync_page,
27 .bmap = _efs_bmap 26 .bmap = _efs_bmap
28}; 27};
29 28
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d6..d9a591773919 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
99 * @ctx: [in] Pointer to eventfd context. 99 * @ctx: [in] Pointer to eventfd context.
100 * 100 *
101 * The eventfd context reference must have been previously acquired either 101 * The eventfd context reference must have been previously acquired either
102 * with eventfd_ctx_get() or eventfd_ctx_fdget()). 102 * with eventfd_ctx_get() or eventfd_ctx_fdget().
103 */ 103 */
104void eventfd_ctx_put(struct eventfd_ctx *ctx) 104void eventfd_ctx_put(struct eventfd_ctx *ctx)
105{ 105{
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. 146 * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
147 * @ctx: [in] Pointer to eventfd context. 147 * @ctx: [in] Pointer to eventfd context.
148 * @wait: [in] Wait queue to be removed. 148 * @wait: [in] Wait queue to be removed.
149 * @cnt: [out] Pointer to the 64bit conter value. 149 * @cnt: [out] Pointer to the 64-bit counter value.
150 * 150 *
151 * Returns zero if successful, or the following error codes: 151 * Returns %0 if successful, or the following error codes:
152 * 152 *
153 * -EAGAIN : The operation would have blocked. 153 * -EAGAIN : The operation would have blocked.
154 * 154 *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. 175 * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
176 * @ctx: [in] Pointer to eventfd context. 176 * @ctx: [in] Pointer to eventfd context.
177 * @no_wait: [in] Different from zero if the operation should not block. 177 * @no_wait: [in] Different from zero if the operation should not block.
178 * @cnt: [out] Pointer to the 64bit conter value. 178 * @cnt: [out] Pointer to the 64-bit counter value.
179 * 179 *
180 * Returns zero if successful, or the following error codes: 180 * Returns %0 if successful, or the following error codes:
181 * 181 *
182 * -EAGAIN : The operation would have blocked but @no_wait was nonzero. 182 * -EAGAIN : The operation would have blocked but @no_wait was non-zero.
183 * -ERESTARTSYS : A signal interrupted the wait operation. 183 * -ERESTARTSYS : A signal interrupted the wait operation.
184 * 184 *
185 * If @no_wait is zero, the function might sleep until the eventfd internal 185 * If @no_wait is zero, the function might sleep until the eventfd internal
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index cc8a9b7d6064..f9cfd168fbe2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -62,7 +62,14 @@
62 * This mutex is acquired by ep_free() during the epoll file 62 * This mutex is acquired by ep_free() during the epoll file
63 * cleanup path and it is also acquired by eventpoll_release_file() 63 * cleanup path and it is also acquired by eventpoll_release_file()
64 * if a file has been pushed inside an epoll set and it is then 64 * if a file has been pushed inside an epoll set and it is then
65 * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL). 65 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
66 * It is also acquired when inserting an epoll fd onto another epoll
67 * fd. We do this so that we walk the epoll tree and ensure that this
68 * insertion does not create a cycle of epoll file descriptors, which
69 * could lead to deadlock. We need a global mutex to prevent two
70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is
72 * going to.
66 * It is possible to drop the "ep->mtx" and to use the global 73 * It is possible to drop the "ep->mtx" and to use the global
67 * mutex "epmutex" (together with "ep->lock") to have it working, 74 * mutex "epmutex" (together with "ep->lock") to have it working,
68 * but having "ep->mtx" will make the interface more scalable. 75 * but having "ep->mtx" will make the interface more scalable.
@@ -145,11 +152,11 @@ struct epitem {
145 152
146/* 153/*
147 * This structure is stored inside the "private_data" member of the file 154 * This structure is stored inside the "private_data" member of the file
148 * structure and rapresent the main data sructure for the eventpoll 155 * structure and represents the main data structure for the eventpoll
149 * interface. 156 * interface.
150 */ 157 */
151struct eventpoll { 158struct eventpoll {
152 /* Protect the this structure access */ 159 /* Protect the access to this structure */
153 spinlock_t lock; 160 spinlock_t lock;
154 161
155 /* 162 /*
@@ -174,7 +181,7 @@ struct eventpoll {
174 181
175 /* 182 /*
176 * This is a single linked list that chains all the "struct epitem" that 183 * This is a single linked list that chains all the "struct epitem" that
177 * happened while transfering ready events to userspace w/out 184 * happened while transferring ready events to userspace w/out
178 * holding ->lock. 185 * holding ->lock.
179 */ 186 */
180 struct epitem *ovflist; 187 struct epitem *ovflist;
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
224 */ 231 */
225static DEFINE_MUTEX(epmutex); 232static DEFINE_MUTEX(epmutex);
226 233
234/* Used to check for epoll file descriptor inclusion loops */
235static struct nested_calls poll_loop_ncalls;
236
227/* Used for safe wake up implementation */ 237/* Used for safe wake up implementation */
228static struct nested_calls poll_safewake_ncalls; 238static struct nested_calls poll_safewake_ncalls;
229 239
@@ -306,6 +316,19 @@ static void ep_nested_calls_init(struct nested_calls *ncalls)
306} 316}
307 317
308/** 318/**
319 * ep_events_available - Checks if ready events might be available.
320 *
321 * @ep: Pointer to the eventpoll context.
322 *
323 * Returns: Returns a value different than zero if ready events are available,
324 * or zero otherwise.
325 */
326static inline int ep_events_available(struct eventpoll *ep)
327{
328 return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
329}
330
331/**
309 * ep_call_nested - Perform a bound (possibly) nested call, by checking 332 * ep_call_nested - Perform a bound (possibly) nested call, by checking
310 * that the recursion limit is not exceeded, and that 333 * that the recursion limit is not exceeded, and that
311 * the same nested call (by the meaning of same cookie) is 334 * the same nested call (by the meaning of same cookie) is
@@ -583,7 +606,7 @@ static void ep_free(struct eventpoll *ep)
583 * We do not need to hold "ep->mtx" here because the epoll file 606 * We do not need to hold "ep->mtx" here because the epoll file
584 * is on the way to be removed and no one has references to it 607 * is on the way to be removed and no one has references to it
585 * anymore. The only hit might come from eventpoll_release_file() but 608 * anymore. The only hit might come from eventpoll_release_file() but
586 * holding "epmutex" is sufficent here. 609 * holding "epmutex" is sufficient here.
587 */ 610 */
588 mutex_lock(&epmutex); 611 mutex_lock(&epmutex);
589 612
@@ -697,7 +720,7 @@ void eventpoll_release_file(struct file *file)
697 /* 720 /*
698 * We don't want to get "file->f_lock" because it is not 721 * We don't want to get "file->f_lock" because it is not
699 * necessary. It is not necessary because we're in the "struct file" 722 * necessary. It is not necessary because we're in the "struct file"
700 * cleanup path, and this means that noone is using this file anymore. 723 * cleanup path, and this means that no one is using this file anymore.
701 * So, for example, epoll_ctl() cannot hit here since if we reach this 724 * So, for example, epoll_ctl() cannot hit here since if we reach this
702 * point, the file counter already went to zero and fget() would fail. 725 * point, the file counter already went to zero and fget() would fail.
703 * The only hit might come from ep_free() but by holding the mutex 726 * The only hit might come from ep_free() but by holding the mutex
@@ -783,7 +806,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
783 806
784/* 807/*
785 * This is the callback that is passed to the wait queue wakeup 808 * This is the callback that is passed to the wait queue wakeup
786 * machanism. It is called by the stored file descriptors when they 809 * mechanism. It is called by the stored file descriptors when they
787 * have events to report. 810 * have events to report.
788 */ 811 */
789static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key) 812static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
@@ -814,9 +837,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
814 goto out_unlock; 837 goto out_unlock;
815 838
816 /* 839 /*
817 * If we are trasfering events to userspace, we can hold no locks 840 * If we are transferring events to userspace, we can hold no locks
818 * (because we're accessing user memory, and because of linux f_op->poll() 841 * (because we're accessing user memory, and because of linux f_op->poll()
819 * semantics). All the events that happens during that period of time are 842 * semantics). All the events that happen during that period of time are
820 * chained in ep->ovflist and requeued later on. 843 * chained in ep->ovflist and requeued later on.
821 */ 844 */
822 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) { 845 if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
@@ -1089,7 +1112,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1089 * Trigger mode, we need to insert back inside 1112 * Trigger mode, we need to insert back inside
1090 * the ready list, so that the next call to 1113 * the ready list, so that the next call to
1091 * epoll_wait() will check again the events 1114 * epoll_wait() will check again the events
1092 * availability. At this point, noone can insert 1115 * availability. At this point, no one can insert
1093 * into ep->rdllist besides us. The epoll_ctl() 1116 * into ep->rdllist besides us. The epoll_ctl()
1094 * callers are locked out by 1117 * callers are locked out by
1095 * ep_scan_ready_list() holding "mtx" and the 1118 * ep_scan_ready_list() holding "mtx" and the
@@ -1114,31 +1137,63 @@ static int ep_send_events(struct eventpoll *ep,
1114 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1115} 1138}
1116 1139
1140static inline struct timespec ep_set_mstimeout(long ms)
1141{
1142 struct timespec now, ts = {
1143 .tv_sec = ms / MSEC_PER_SEC,
1144 .tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC),
1145 };
1146
1147 ktime_get_ts(&now);
1148 return timespec_add_safe(now, ts);
1149}
1150
1151/**
1152 * ep_poll - Retrieves ready events, and delivers them to the caller supplied
1153 * event buffer.
1154 *
1155 * @ep: Pointer to the eventpoll context.
1156 * @events: Pointer to the userspace buffer where the ready events should be
1157 * stored.
1158 * @maxevents: Size (in terms of number of events) of the caller event buffer.
1159 * @timeout: Maximum timeout for the ready events fetch operation, in
1160 * milliseconds. If the @timeout is zero, the function will not block,
1161 * while if the @timeout is less than zero, the function will block
1162 * until at least one event has been retrieved (or an error
1163 * occurred).
1164 *
1165 * Returns: Returns the number of ready events which have been fetched, or an
1166 * error code, in case of error.
1167 */
1117static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1168static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1118 int maxevents, long timeout) 1169 int maxevents, long timeout)
1119{ 1170{
1120 int res, eavail, timed_out = 0; 1171 int res = 0, eavail, timed_out = 0;
1121 unsigned long flags; 1172 unsigned long flags;
1122 long slack; 1173 long slack = 0;
1123 wait_queue_t wait; 1174 wait_queue_t wait;
1124 struct timespec end_time;
1125 ktime_t expires, *to = NULL; 1175 ktime_t expires, *to = NULL;
1126 1176
1127 if (timeout > 0) { 1177 if (timeout > 0) {
1128 ktime_get_ts(&end_time); 1178 struct timespec end_time = ep_set_mstimeout(timeout);
1129 timespec_add_ns(&end_time, (u64)timeout * NSEC_PER_MSEC); 1179
1130 slack = select_estimate_accuracy(&end_time); 1180 slack = select_estimate_accuracy(&end_time);
1131 to = &expires; 1181 to = &expires;
1132 *to = timespec_to_ktime(end_time); 1182 *to = timespec_to_ktime(end_time);
1133 } else if (timeout == 0) { 1183 } else if (timeout == 0) {
1184 /*
1185 * Avoid the unnecessary trip to the wait queue loop, if the
1186 * caller specified a non blocking operation.
1187 */
1134 timed_out = 1; 1188 timed_out = 1;
1189 spin_lock_irqsave(&ep->lock, flags);
1190 goto check_events;
1135 } 1191 }
1136 1192
1137retry: 1193fetch_events:
1138 spin_lock_irqsave(&ep->lock, flags); 1194 spin_lock_irqsave(&ep->lock, flags);
1139 1195
1140 res = 0; 1196 if (!ep_events_available(ep)) {
1141 if (list_empty(&ep->rdllist)) {
1142 /* 1197 /*
1143 * We don't have any available event to return to the caller. 1198 * We don't have any available event to return to the caller.
1144 * We need to sleep here, and we will be wake up by 1199 * We need to sleep here, and we will be wake up by
@@ -1154,7 +1209,7 @@ retry:
1154 * to TASK_INTERRUPTIBLE before doing the checks. 1209 * to TASK_INTERRUPTIBLE before doing the checks.
1155 */ 1210 */
1156 set_current_state(TASK_INTERRUPTIBLE); 1211 set_current_state(TASK_INTERRUPTIBLE);
1157 if (!list_empty(&ep->rdllist) || timed_out) 1212 if (ep_events_available(ep) || timed_out)
1158 break; 1213 break;
1159 if (signal_pending(current)) { 1214 if (signal_pending(current)) {
1160 res = -EINTR; 1215 res = -EINTR;
@@ -1171,8 +1226,9 @@ retry:
1171 1226
1172 set_current_state(TASK_RUNNING); 1227 set_current_state(TASK_RUNNING);
1173 } 1228 }
1229check_events:
1174 /* Is it worth to try to dig for events ? */ 1230 /* Is it worth to try to dig for events ? */
1175 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; 1231 eavail = ep_events_available(ep);
1176 1232
1177 spin_unlock_irqrestore(&ep->lock, flags); 1233 spin_unlock_irqrestore(&ep->lock, flags);
1178 1234
@@ -1183,11 +1239,67 @@ retry:
1183 */ 1239 */
1184 if (!res && eavail && 1240 if (!res && eavail &&
1185 !(res = ep_send_events(ep, events, maxevents)) && !timed_out) 1241 !(res = ep_send_events(ep, events, maxevents)) && !timed_out)
1186 goto retry; 1242 goto fetch_events;
1187 1243
1188 return res; 1244 return res;
1189} 1245}
1190 1246
1247/**
1248 * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
1249 * API, to verify that adding an epoll file inside another
1250 * epoll structure, does not violate the constraints, in
1251 * terms of closed loops, or too deep chains (which can
1252 * result in excessive stack usage).
1253 *
1254 * @priv: Pointer to the epoll file to be currently checked.
1255 * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
1256 * data structure pointer.
1257 * @call_nests: Current dept of the @ep_call_nested() call stack.
1258 *
1259 * Returns: Returns zero if adding the epoll @file inside current epoll
1260 * structure @ep does not violate the constraints, or -1 otherwise.
1261 */
1262static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1263{
1264 int error = 0;
1265 struct file *file = priv;
1266 struct eventpoll *ep = file->private_data;
1267 struct rb_node *rbp;
1268 struct epitem *epi;
1269
1270 mutex_lock(&ep->mtx);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) {
1274 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1275 ep_loop_check_proc, epi->ffd.file,
1276 epi->ffd.file->private_data, current);
1277 if (error != 0)
1278 break;
1279 }
1280 }
1281 mutex_unlock(&ep->mtx);
1282
1283 return error;
1284}
1285
1286/**
1287 * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
1288 * another epoll file (represented by @ep) does not create
1289 * closed loops or too deep chains.
1290 *
1291 * @ep: Pointer to the epoll private data structure.
1292 * @file: Pointer to the epoll file to be checked.
1293 *
1294 * Returns: Returns zero if adding the epoll @file inside current epoll
1295 * structure @ep does not violate the constraints, or -1 otherwise.
1296 */
1297static int ep_loop_check(struct eventpoll *ep, struct file *file)
1298{
1299 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1300 ep_loop_check_proc, file, ep, current);
1301}
1302
1191/* 1303/*
1192 * Open an eventpoll file descriptor. 1304 * Open an eventpoll file descriptor.
1193 */ 1305 */
@@ -1236,6 +1348,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1236 struct epoll_event __user *, event) 1348 struct epoll_event __user *, event)
1237{ 1349{
1238 int error; 1350 int error;
1351 int did_lock_epmutex = 0;
1239 struct file *file, *tfile; 1352 struct file *file, *tfile;
1240 struct eventpoll *ep; 1353 struct eventpoll *ep;
1241 struct epitem *epi; 1354 struct epitem *epi;
@@ -1277,6 +1390,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1277 */ 1390 */
1278 ep = file->private_data; 1391 ep = file->private_data;
1279 1392
1393 /*
1394 * When we insert an epoll file descriptor, inside another epoll file
1395 * descriptor, there is the change of creating closed loops, which are
1396 * better be handled here, than in more critical paths.
1397 *
1398 * We hold epmutex across the loop check and the insert in this case, in
1399 * order to prevent two separate inserts from racing and each doing the
1400 * insert "at the same time" such that ep_loop_check passes on both
1401 * before either one does the insert, thereby creating a cycle.
1402 */
1403 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
1404 mutex_lock(&epmutex);
1405 did_lock_epmutex = 1;
1406 error = -ELOOP;
1407 if (ep_loop_check(ep, tfile) != 0)
1408 goto error_tgt_fput;
1409 }
1410
1411
1280 mutex_lock(&ep->mtx); 1412 mutex_lock(&ep->mtx);
1281 1413
1282 /* 1414 /*
@@ -1312,6 +1444,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1312 mutex_unlock(&ep->mtx); 1444 mutex_unlock(&ep->mtx);
1313 1445
1314error_tgt_fput: 1446error_tgt_fput:
1447 if (unlikely(did_lock_epmutex))
1448 mutex_unlock(&epmutex);
1449
1315 fput(tfile); 1450 fput(tfile);
1316error_fput: 1451error_fput:
1317 fput(file); 1452 fput(file);
@@ -1431,6 +1566,12 @@ static int __init eventpoll_init(void)
1431 EP_ITEM_COST; 1566 EP_ITEM_COST;
1432 BUG_ON(max_user_watches < 0); 1567 BUG_ON(max_user_watches < 0);
1433 1568
1569 /*
1570 * Initialize the structure used to perform epoll file descriptor
1571 * inclusion loops checks.
1572 */
1573 ep_nested_calls_init(&poll_loop_ncalls);
1574
1434 /* Initialize the structure used to perform safe poll wait head wake ups */ 1575 /* Initialize the structure used to perform safe poll wait head wake ups */
1435 ep_nested_calls_init(&poll_safewake_ncalls); 1576 ep_nested_calls_init(&poll_safewake_ncalls);
1436 1577
diff --git a/fs/exec.c b/fs/exec.c
index c62efcb959c7..5e62d26a4fec 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
115 struct file *file; 115 struct file *file;
116 char *tmp = getname(library); 116 char *tmp = getname(library);
117 int error = PTR_ERR(tmp); 117 int error = PTR_ERR(tmp);
118 static const struct open_flags uselib_flags = {
119 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
120 .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
121 .intent = LOOKUP_OPEN
122 };
118 123
119 if (IS_ERR(tmp)) 124 if (IS_ERR(tmp))
120 goto out; 125 goto out;
121 126
122 file = do_filp_open(AT_FDCWD, tmp, 127 file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
123 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
124 MAY_READ | MAY_EXEC | MAY_OPEN);
125 putname(tmp); 128 putname(tmp);
126 error = PTR_ERR(file); 129 error = PTR_ERR(file);
127 if (IS_ERR(file)) 130 if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
721{ 724{
722 struct file *file; 725 struct file *file;
723 int err; 726 int err;
727 static const struct open_flags open_exec_flags = {
728 .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
729 .acc_mode = MAY_EXEC | MAY_OPEN,
730 .intent = LOOKUP_OPEN
731 };
724 732
725 file = do_filp_open(AT_FDCWD, name, 733 file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
726 O_LARGEFILE | O_RDONLY | FMODE_EXEC, 0,
727 MAY_EXEC | MAY_OPEN);
728 if (IS_ERR(file)) 734 if (IS_ERR(file))
729 goto out; 735 goto out;
730 736
@@ -1869,7 +1875,7 @@ static void wait_for_dump_helpers(struct file *file)
1869 1875
1870 1876
1871/* 1877/*
1872 * uhm_pipe_setup 1878 * umh_pipe_setup
1873 * helper function to customize the process used 1879 * helper function to customize the process used
1874 * to collect the core in userspace. Specifically 1880 * to collect the core in userspace. Specifically
1875 * it sets up a pipe and installs it as fd 0 (stdin) 1881 * it sets up a pipe and installs it as fd 0 (stdin)
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index f0d520312d8b..3bbd46956d77 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -53,10 +53,14 @@
53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
54 54
55/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
56/* Inode attrs */
56# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3) 57# define EXOFS_APAGE_FS_DATA (OSD_APAGE_APP_DEFINED_FIRST + 3)
57# define EXOFS_ATTR_INODE_DATA 1 58# define EXOFS_ATTR_INODE_DATA 1
58# define EXOFS_ATTR_INODE_FILE_LAYOUT 2 59# define EXOFS_ATTR_INODE_FILE_LAYOUT 2
59# define EXOFS_ATTR_INODE_DIR_LAYOUT 3 60# define EXOFS_ATTR_INODE_DIR_LAYOUT 3
61/* Partition attrs */
62# define EXOFS_APAGE_SB_DATA (0xF0000000U + 3)
63# define EXOFS_ATTR_SB_STATS 1
60 64
61/* 65/*
62 * The maximum number of files we can have is limited by the size of the 66 * The maximum number of files we can have is limited by the size of the
@@ -86,8 +90,8 @@ enum {
86 */ 90 */
87enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1}; 91enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
88struct exofs_fscb { 92struct exofs_fscb {
89 __le64 s_nextid; /* Highest object ID used */ 93 __le64 s_nextid; /* Only used after mkfs */
90 __le64 s_numfiles; /* Number of files on fs */ 94 __le64 s_numfiles; /* Only used after mkfs */
91 __le32 s_version; /* == EXOFS_FSCB_VER */ 95 __le32 s_version; /* == EXOFS_FSCB_VER */
92 __le16 s_magic; /* Magic signature */ 96 __le16 s_magic; /* Magic signature */
93 __le16 s_newfs; /* Non-zero if this is a new fs */ 97 __le16 s_newfs; /* Non-zero if this is a new fs */
@@ -98,10 +102,20 @@ struct exofs_fscb {
98} __packed; 102} __packed;
99 103
100/* 104/*
105 * This struct is set on the FS partition's attributes.
106 * [EXOFS_APAGE_SB_DATA, EXOFS_ATTR_SB_STATS] and is written together
107 * with the create command, to atomically persist the sb writeable information.
108 */
109struct exofs_sb_stats {
110 __le64 s_nextid; /* Highest object ID used */
111 __le64 s_numfiles; /* Number of files on fs */
112} __packed;
113
114/*
101 * Describes the raid used in the FS. It is part of the device table. 115 * Describes the raid used in the FS. It is part of the device table.
102 * This here is taken from the pNFS-objects definition. In exofs we 116 * This here is taken from the pNFS-objects definition. In exofs we
103 * use one raid policy through-out the filesystem. (NOTE: the funny 117 * use one raid policy through-out the filesystem. (NOTE: the funny
104 * alignment at begining. We take care of it at exofs_device_table. 118 * alignment at beginning. We take care of it at exofs_device_table.
105 */ 119 */
106struct exofs_dt_data_map { 120struct exofs_dt_data_map {
107 __le32 cb_num_comps; 121 __le32 cb_num_comps;
@@ -122,7 +136,7 @@ struct exofs_dt_device_info {
122 u8 systemid[OSD_SYSTEMID_LEN]; 136 u8 systemid[OSD_SYSTEMID_LEN];
123 __le64 long_name_offset; /* If !0 then offset-in-file */ 137 __le64 long_name_offset; /* If !0 then offset-in-file */
124 __le32 osdname_len; /* */ 138 __le32 osdname_len; /* */
125 u8 osdname[44]; /* Embbeded, Ususally an asci uuid */ 139 u8 osdname[44]; /* Embbeded, Usually an asci uuid */
126} __packed; 140} __packed;
127 141
128/* 142/*
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index dcc941d82d67..d0941c6a1f72 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -124,7 +124,7 @@ out:
124 124
125Ebadsize: 125Ebadsize:
126 EXOFS_ERR("ERROR [exofs_check_page]: " 126 EXOFS_ERR("ERROR [exofs_check_page]: "
127 "size of directory #%lu is not a multiple of chunk size", 127 "size of directory(0x%lx) is not a multiple of chunk size\n",
128 dir->i_ino 128 dir->i_ino
129 ); 129 );
130 goto fail; 130 goto fail;
@@ -142,8 +142,8 @@ Espan:
142 goto bad_entry; 142 goto bad_entry;
143bad_entry: 143bad_entry:
144 EXOFS_ERR( 144 EXOFS_ERR(
145 "ERROR [exofs_check_page]: bad entry in directory #%lu: %s - " 145 "ERROR [exofs_check_page]: bad entry in directory(0x%lx): %s - "
146 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d", 146 "offset=%lu, inode=0x%llu, rec_len=%d, name_len=%d\n",
147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, 147 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
148 _LLU(le64_to_cpu(p->inode_no)), 148 _LLU(le64_to_cpu(p->inode_no)),
149 rec_len, p->name_len); 149 rec_len, p->name_len);
@@ -151,8 +151,8 @@ bad_entry:
151Eend: 151Eend:
152 p = (struct exofs_dir_entry *)(kaddr + offs); 152 p = (struct exofs_dir_entry *)(kaddr + offs);
153 EXOFS_ERR("ERROR [exofs_check_page]: " 153 EXOFS_ERR("ERROR [exofs_check_page]: "
154 "entry in directory #%lu spans the page boundary" 154 "entry in directory(0x%lx) spans the page boundary"
155 "offset=%lu, inode=%llu", 155 "offset=%lu, inode=0x%llx\n",
156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, 156 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
157 _LLU(le64_to_cpu(p->inode_no))); 157 _LLU(le64_to_cpu(p->inode_no)));
158fail: 158fail:
@@ -261,9 +261,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
261 struct page *page = exofs_get_page(inode, n); 261 struct page *page = exofs_get_page(inode, n);
262 262
263 if (IS_ERR(page)) { 263 if (IS_ERR(page)) {
264 EXOFS_ERR("ERROR: " 264 EXOFS_ERR("ERROR: bad page in directory(0x%lx)\n",
265 "bad page in #%lu", 265 inode->i_ino);
266 inode->i_ino);
267 filp->f_pos += PAGE_CACHE_SIZE - offset; 266 filp->f_pos += PAGE_CACHE_SIZE - offset;
268 return PTR_ERR(page); 267 return PTR_ERR(page);
269 } 268 }
@@ -283,7 +282,8 @@ exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283 for (; (char *)de <= limit; de = exofs_next_entry(de)) { 282 for (; (char *)de <= limit; de = exofs_next_entry(de)) {
284 if (de->rec_len == 0) { 283 if (de->rec_len == 0) {
285 EXOFS_ERR("ERROR: " 284 EXOFS_ERR("ERROR: "
286 "zero-length directory entry"); 285 "zero-length entry in directory(0x%lx)\n",
286 inode->i_ino);
287 exofs_put_page(page); 287 exofs_put_page(page);
288 return -EIO; 288 return -EIO;
289 } 289 }
@@ -342,9 +342,9 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
342 kaddr += exofs_last_byte(dir, n) - reclen; 342 kaddr += exofs_last_byte(dir, n) - reclen;
343 while ((char *) de <= kaddr) { 343 while ((char *) de <= kaddr) {
344 if (de->rec_len == 0) { 344 if (de->rec_len == 0) {
345 EXOFS_ERR( 345 EXOFS_ERR("ERROR: zero-length entry in "
346 "ERROR: exofs_find_entry: " 346 "directory(0x%lx)\n",
347 "zero-length directory entry"); 347 dir->i_ino);
348 exofs_put_page(page); 348 exofs_put_page(page);
349 goto out; 349 goto out;
350 } 350 }
@@ -472,7 +472,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
472 } 472 }
473 if (de->rec_len == 0) { 473 if (de->rec_len == 0) {
474 EXOFS_ERR("ERROR: exofs_add_link: " 474 EXOFS_ERR("ERROR: exofs_add_link: "
475 "zero-length directory entry"); 475 "zero-length entry in directory(0x%lx)\n",
476 inode->i_ino);
476 err = -EIO; 477 err = -EIO;
477 goto out_unlock; 478 goto out_unlock;
478 } 479 }
@@ -491,7 +492,8 @@ int exofs_add_link(struct dentry *dentry, struct inode *inode)
491 exofs_put_page(page); 492 exofs_put_page(page);
492 } 493 }
493 494
494 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode); 495 EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=0x%lx\n",
496 dentry, inode->i_ino);
495 return -EINVAL; 497 return -EINVAL;
496 498
497got_it: 499got_it:
@@ -542,7 +544,8 @@ int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
542 while (de < dir) { 544 while (de < dir) {
543 if (de->rec_len == 0) { 545 if (de->rec_len == 0) {
544 EXOFS_ERR("ERROR: exofs_delete_entry:" 546 EXOFS_ERR("ERROR: exofs_delete_entry:"
545 "zero-length directory entry"); 547 "zero-length entry in directory(0x%lx)\n",
548 inode->i_ino);
546 err = -EIO; 549 err = -EIO;
547 goto out; 550 goto out;
548 } 551 }
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 2dc925fa1010..c965806c2821 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -77,7 +77,7 @@ struct exofs_layout {
77 * our extension to the in-memory superblock 77 * our extension to the in-memory superblock
78 */ 78 */
79struct exofs_sb_info { 79struct exofs_sb_info {
80 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/ 80 struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/
81 int s_timeout; /* timeout for OSD operations */ 81 int s_timeout; /* timeout for OSD operations */
82 uint64_t s_nextid; /* highest object ID used */ 82 uint64_t s_nextid; /* highest object ID used */
83 uint32_t s_numfiles; /* number of files on fs */ 83 uint32_t s_numfiles; /* number of files on fs */
@@ -256,6 +256,8 @@ static inline int exofs_oi_read(struct exofs_i_info *oi,
256} 256}
257 257
258/* inode.c */ 258/* inode.c */
259unsigned exofs_max_io_pages(struct exofs_layout *layout,
260 unsigned expected_pages);
259int exofs_setattr(struct dentry *, struct iattr *); 261int exofs_setattr(struct dentry *, struct iattr *);
260int exofs_write_begin(struct file *file, struct address_space *mapping, 262int exofs_write_begin(struct file *file, struct address_space *mapping,
261 loff_t pos, unsigned len, unsigned flags, 263 loff_t pos, unsigned len, unsigned flags,
@@ -279,7 +281,7 @@ int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
279 struct inode *); 281 struct inode *);
280 282
281/* super.c */ 283/* super.c */
282int exofs_sync_fs(struct super_block *sb, int wait); 284int exofs_sbi_write_stats(struct exofs_sb_info *sbi);
283 285
284/********************* 286/*********************
285 * operation vectors * 287 * operation vectors *
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index b905c79b4f0a..45ca323d8363 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -45,22 +45,8 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
45static int exofs_file_fsync(struct file *filp, int datasync) 45static int exofs_file_fsync(struct file *filp, int datasync)
46{ 46{
47 int ret; 47 int ret;
48 struct inode *inode = filp->f_mapping->host;
49 struct super_block *sb;
50
51 if (!(inode->i_state & I_DIRTY))
52 return 0;
53 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
54 return 0;
55
56 ret = sync_inode_metadata(inode, 1);
57
58 /* This is a good place to write the sb */
59 /* TODO: Sechedule an sb-sync on create */
60 sb = inode->i_sb;
61 if (sb->s_dirt)
62 exofs_sync_fs(sb, 1);
63 48
49 ret = sync_inode_metadata(filp->f_mapping->host, 1);
64 return ret; 50 return ret;
65} 51}
66 52
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 42685424817b..8472c098445d 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -43,6 +43,17 @@ enum { BIO_MAX_PAGES_KMALLOC =
43 PAGE_SIZE / sizeof(struct page *), 43 PAGE_SIZE / sizeof(struct page *),
44}; 44};
45 45
46unsigned exofs_max_io_pages(struct exofs_layout *layout,
47 unsigned expected_pages)
48{
49 unsigned pages = min_t(unsigned, expected_pages, MAX_PAGES_KMALLOC);
50
51 /* TODO: easily support bio chaining */
52 pages = min_t(unsigned, pages,
53 layout->group_width * BIO_MAX_PAGES_KMALLOC);
54 return pages;
55}
56
46struct page_collect { 57struct page_collect {
47 struct exofs_sb_info *sbi; 58 struct exofs_sb_info *sbi;
48 struct inode *inode; 59 struct inode *inode;
@@ -97,8 +108,7 @@ static void _pcol_reset(struct page_collect *pcol)
97 108
98static int pcol_try_alloc(struct page_collect *pcol) 109static int pcol_try_alloc(struct page_collect *pcol)
99{ 110{
100 unsigned pages = min_t(unsigned, pcol->expected_pages, 111 unsigned pages;
101 MAX_PAGES_KMALLOC);
102 112
103 if (!pcol->ios) { /* First time allocate io_state */ 113 if (!pcol->ios) { /* First time allocate io_state */
104 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios); 114 int ret = exofs_get_io_state(&pcol->sbi->layout, &pcol->ios);
@@ -108,8 +118,7 @@ static int pcol_try_alloc(struct page_collect *pcol)
108 } 118 }
109 119
110 /* TODO: easily support bio chaining */ 120 /* TODO: easily support bio chaining */
111 pages = min_t(unsigned, pages, 121 pages = exofs_max_io_pages(&pcol->sbi->layout, pcol->expected_pages);
112 pcol->sbi->layout.group_width * BIO_MAX_PAGES_KMALLOC);
113 122
114 for (; pages; pages >>= 1) { 123 for (; pages; pages >>= 1) {
115 pcol->pages = kmalloc(pages * sizeof(struct page *), 124 pcol->pages = kmalloc(pages * sizeof(struct page *),
@@ -350,8 +359,10 @@ static int readpage_strip(void *data, struct page *page)
350 359
351 if (!pcol->read_4_write) 360 if (!pcol->read_4_write)
352 unlock_page(page); 361 unlock_page(page);
353 EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page," 362 EXOFS_DBGMSG("readpage_strip(0x%lx) empty page len=%zx "
354 " splitting\n", inode->i_ino, page->index); 363 "read_4_write=%d index=0x%lx end_index=0x%lx "
364 "splitting\n", inode->i_ino, len,
365 pcol->read_4_write, page->index, end_index);
355 366
356 return read_exec(pcol); 367 return read_exec(pcol);
357 } 368 }
@@ -722,11 +733,28 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
722 733
723 /* read modify write */ 734 /* read modify write */
724 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) { 735 if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
736 loff_t i_size = i_size_read(mapping->host);
737 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
738 size_t rlen;
739
740 if (page->index < end_index)
741 rlen = PAGE_CACHE_SIZE;
742 else if (page->index == end_index)
743 rlen = i_size & ~PAGE_CACHE_MASK;
744 else
745 rlen = 0;
746
747 if (!rlen) {
748 clear_highpage(page);
749 SetPageUptodate(page);
750 goto out;
751 }
752
725 ret = _readpage(page, true); 753 ret = _readpage(page, true);
726 if (ret) { 754 if (ret) {
727 /*SetPageError was done by _readpage. Is it ok?*/ 755 /*SetPageError was done by _readpage. Is it ok?*/
728 unlock_page(page); 756 unlock_page(page);
729 EXOFS_DBGMSG("__readpage_filler failed\n"); 757 EXOFS_DBGMSG("__readpage failed\n");
730 } 758 }
731 } 759 }
732out: 760out:
@@ -795,7 +823,6 @@ const struct address_space_operations exofs_aops = {
795 .direct_IO = NULL, /* TODO: Should be trivial to do */ 823 .direct_IO = NULL, /* TODO: Should be trivial to do */
796 824
797 /* With these NULL has special meaning or default is not exported */ 825 /* With these NULL has special meaning or default is not exported */
798 .sync_page = NULL,
799 .get_xip_mem = NULL, 826 .get_xip_mem = NULL,
800 .migratepage = NULL, 827 .migratepage = NULL,
801 .launder_page = NULL, 828 .launder_page = NULL,
@@ -1074,6 +1101,7 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1074 } 1101 }
1075 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0; 1102 return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
1076} 1103}
1104
1077/* 1105/*
1078 * Callback function from exofs_new_inode(). The important thing is that we 1106 * Callback function from exofs_new_inode(). The important thing is that we
1079 * set the obj_created flag so that other methods know that the object exists on 1107 * set the obj_created flag so that other methods know that the object exists on
@@ -1132,7 +1160,6 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1132 sbi = sb->s_fs_info; 1160 sbi = sb->s_fs_info;
1133 1161
1134 inode->i_mapping->backing_dev_info = sb->s_bdi; 1162 inode->i_mapping->backing_dev_info = sb->s_bdi;
1135 sb->s_dirt = 1;
1136 inode_init_owner(inode, dir, mode); 1163 inode_init_owner(inode, dir, mode);
1137 inode->i_ino = sbi->s_nextid++; 1164 inode->i_ino = sbi->s_nextid++;
1138 inode->i_blkbits = EXOFS_BLKSHIFT; 1165 inode->i_blkbits = EXOFS_BLKSHIFT;
@@ -1143,6 +1170,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1143 spin_unlock(&sbi->s_next_gen_lock); 1170 spin_unlock(&sbi->s_next_gen_lock);
1144 insert_inode_hash(inode); 1171 insert_inode_hash(inode);
1145 1172
1173 exofs_sbi_write_stats(sbi); /* Make sure new sbi->s_nextid is on disk */
1174
1146 mark_inode_dirty(inode); 1175 mark_inode_dirty(inode);
1147 1176
1148 ret = exofs_get_io_state(&sbi->layout, &ios); 1177 ret = exofs_get_io_state(&sbi->layout, &ios);
@@ -1273,7 +1302,8 @@ out:
1273 1302
1274int exofs_write_inode(struct inode *inode, struct writeback_control *wbc) 1303int exofs_write_inode(struct inode *inode, struct writeback_control *wbc)
1275{ 1304{
1276 return exofs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 1305 /* FIXME: fix fsync and use wbc->sync_mode == WB_SYNC_ALL */
1306 return exofs_update_inode(inode, 1);
1277} 1307}
1278 1308
1279/* 1309/*
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page); 272 new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
273 if (!new_de) 273 if (!new_de)
274 goto out_dir; 274 goto out_dir;
275 inode_inc_link_count(old_inode);
276 err = exofs_set_link(new_dir, new_de, new_page, old_inode); 275 err = exofs_set_link(new_dir, new_de, new_page, old_inode);
277 new_inode->i_ctime = CURRENT_TIME; 276 new_inode->i_ctime = CURRENT_TIME;
278 if (dir_de) 277 if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
286 if (new_dir->i_nlink >= EXOFS_LINK_MAX) 285 if (new_dir->i_nlink >= EXOFS_LINK_MAX)
287 goto out_dir; 286 goto out_dir;
288 } 287 }
289 inode_inc_link_count(old_inode);
290 err = exofs_add_link(new_dentry, old_inode); 288 err = exofs_add_link(new_dentry, old_inode);
291 if (err) { 289 if (err)
292 inode_dec_link_count(old_inode);
293 goto out_dir; 290 goto out_dir;
294 }
295 if (dir_de) 291 if (dir_de)
296 inode_inc_link_count(new_dir); 292 inode_inc_link_count(new_dir);
297 } 293 }
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
299 old_inode->i_ctime = CURRENT_TIME; 295 old_inode->i_ctime = CURRENT_TIME;
300 296
301 exofs_delete_entry(old_de, old_page); 297 exofs_delete_entry(old_de, old_page);
302 inode_dec_link_count(old_inode); 298 mark_inode_dirty(old_inode);
303 299
304 if (dir_de) { 300 if (dir_de) {
305 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir); 301 err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 8c6c4669b381..06065bd37fc3 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -48,6 +48,7 @@
48 * struct to hold what we get from mount options 48 * struct to hold what we get from mount options
49 */ 49 */
50struct exofs_mountopt { 50struct exofs_mountopt {
51 bool is_osdname;
51 const char *dev_name; 52 const char *dev_name;
52 uint64_t pid; 53 uint64_t pid;
53 int timeout; 54 int timeout;
@@ -56,7 +57,7 @@ struct exofs_mountopt {
56/* 57/*
57 * exofs-specific mount-time options. 58 * exofs-specific mount-time options.
58 */ 59 */
59enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err }; 60enum { Opt_name, Opt_pid, Opt_to, Opt_err };
60 61
61/* 62/*
62 * Our mount-time options. These should ideally be 64-bit unsigned, but the 63 * Our mount-time options. These should ideally be 64-bit unsigned, but the
@@ -64,6 +65,7 @@ enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
64 * sufficient for most applications now. 65 * sufficient for most applications now.
65 */ 66 */
66static match_table_t tokens = { 67static match_table_t tokens = {
68 {Opt_name, "osdname=%s"},
67 {Opt_pid, "pid=%u"}, 69 {Opt_pid, "pid=%u"},
68 {Opt_to, "to=%u"}, 70 {Opt_to, "to=%u"},
69 {Opt_err, NULL} 71 {Opt_err, NULL}
@@ -94,6 +96,14 @@ static int parse_options(char *options, struct exofs_mountopt *opts)
94 96
95 token = match_token(p, tokens, args); 97 token = match_token(p, tokens, args);
96 switch (token) { 98 switch (token) {
99 case Opt_name:
100 opts->dev_name = match_strdup(&args[0]);
101 if (unlikely(!opts->dev_name)) {
102 EXOFS_ERR("Error allocating dev_name");
103 return -ENOMEM;
104 }
105 opts->is_osdname = true;
106 break;
97 case Opt_pid: 107 case Opt_pid:
98 if (0 == match_strlcpy(str, &args[0], sizeof(str))) 108 if (0 == match_strlcpy(str, &args[0], sizeof(str)))
99 return -EINVAL; 109 return -EINVAL;
@@ -203,6 +213,101 @@ static void destroy_inodecache(void)
203static const struct super_operations exofs_sops; 213static const struct super_operations exofs_sops;
204static const struct export_operations exofs_export_ops; 214static const struct export_operations exofs_export_ops;
205 215
216static const struct osd_attr g_attr_sb_stats = ATTR_DEF(
217 EXOFS_APAGE_SB_DATA,
218 EXOFS_ATTR_SB_STATS,
219 sizeof(struct exofs_sb_stats));
220
221static int __sbi_read_stats(struct exofs_sb_info *sbi)
222{
223 struct osd_attr attrs[] = {
224 [0] = g_attr_sb_stats,
225 };
226 struct exofs_io_state *ios;
227 int ret;
228
229 ret = exofs_get_io_state(&sbi->layout, &ios);
230 if (unlikely(ret)) {
231 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
232 return ret;
233 }
234
235 ios->cred = sbi->s_cred;
236
237 ios->in_attr = attrs;
238 ios->in_attr_len = ARRAY_SIZE(attrs);
239
240 ret = exofs_sbi_read(ios);
241 if (unlikely(ret)) {
242 EXOFS_ERR("Error reading super_block stats => %d\n", ret);
243 goto out;
244 }
245
246 ret = extract_attr_from_ios(ios, &attrs[0]);
247 if (ret) {
248 EXOFS_ERR("%s: extract_attr of sb_stats failed\n", __func__);
249 goto out;
250 }
251 if (attrs[0].len) {
252 struct exofs_sb_stats *ess;
253
254 if (unlikely(attrs[0].len != sizeof(*ess))) {
255 EXOFS_ERR("%s: Wrong version of exofs_sb_stats "
256 "size(%d) != expected(%zd)\n",
257 __func__, attrs[0].len, sizeof(*ess));
258 goto out;
259 }
260
261 ess = attrs[0].val_ptr;
262 sbi->s_nextid = le64_to_cpu(ess->s_nextid);
263 sbi->s_numfiles = le32_to_cpu(ess->s_numfiles);
264 }
265
266out:
267 exofs_put_io_state(ios);
268 return ret;
269}
270
271static void stats_done(struct exofs_io_state *ios, void *p)
272{
273 exofs_put_io_state(ios);
274 /* Good thanks nothing to do anymore */
275}
276
277/* Asynchronously write the stats attribute */
278int exofs_sbi_write_stats(struct exofs_sb_info *sbi)
279{
280 struct osd_attr attrs[] = {
281 [0] = g_attr_sb_stats,
282 };
283 struct exofs_io_state *ios;
284 int ret;
285
286 ret = exofs_get_io_state(&sbi->layout, &ios);
287 if (unlikely(ret)) {
288 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
289 return ret;
290 }
291
292 sbi->s_ess.s_nextid = cpu_to_le64(sbi->s_nextid);
293 sbi->s_ess.s_numfiles = cpu_to_le64(sbi->s_numfiles);
294 attrs[0].val_ptr = &sbi->s_ess;
295
296 ios->cred = sbi->s_cred;
297 ios->done = stats_done;
298 ios->private = sbi;
299 ios->out_attr = attrs;
300 ios->out_attr_len = ARRAY_SIZE(attrs);
301
302 ret = exofs_sbi_write(ios);
303 if (unlikely(ret)) {
304 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
305 exofs_put_io_state(ios);
306 }
307
308 return ret;
309}
310
206/* 311/*
207 * Write the superblock to the OSD 312 * Write the superblock to the OSD
208 */ 313 */
@@ -213,18 +318,25 @@ int exofs_sync_fs(struct super_block *sb, int wait)
213 struct exofs_io_state *ios; 318 struct exofs_io_state *ios;
214 int ret = -ENOMEM; 319 int ret = -ENOMEM;
215 320
216 lock_super(sb); 321 fscb = kmalloc(sizeof(*fscb), GFP_KERNEL);
322 if (unlikely(!fscb))
323 return -ENOMEM;
324
217 sbi = sb->s_fs_info; 325 sbi = sb->s_fs_info;
218 fscb = &sbi->s_fscb;
219 326
327 /* NOTE: We no longer dirty the super_block anywhere in exofs. The
328 * reason we write the fscb here on unmount is so we can stay backwards
329 * compatible with fscb->s_version == 1. (What we are not compatible
330 * with is if a new version FS crashed and then we try to mount an old
331 * version). Otherwise the exofs_fscb is read-only from mkfs time. All
332 * the writeable info is set in exofs_sbi_write_stats() above.
333 */
220 ret = exofs_get_io_state(&sbi->layout, &ios); 334 ret = exofs_get_io_state(&sbi->layout, &ios);
221 if (ret) 335 if (unlikely(ret))
222 goto out; 336 goto out;
223 337
224 /* Note: We only write the changing part of the fscb. .i.e upto the 338 lock_super(sb);
225 * the fscb->s_dev_table_oid member. There is no read-modify-write 339
226 * here.
227 */
228 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid); 340 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
229 memset(fscb, 0, ios->length); 341 memset(fscb, 0, ios->length);
230 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 342 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -239,16 +351,17 @@ int exofs_sync_fs(struct super_block *sb, int wait)
239 ios->cred = sbi->s_cred; 351 ios->cred = sbi->s_cred;
240 352
241 ret = exofs_sbi_write(ios); 353 ret = exofs_sbi_write(ios);
242 if (unlikely(ret)) { 354 if (unlikely(ret))
243 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__); 355 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
244 goto out; 356 else
245 } 357 sb->s_dirt = 0;
246 sb->s_dirt = 0; 358
247 359
360 unlock_super(sb);
248out: 361out:
249 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret); 362 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
250 exofs_put_io_state(ios); 363 exofs_put_io_state(ios);
251 unlock_super(sb); 364 kfree(fscb);
252 return ret; 365 return ret;
253} 366}
254 367
@@ -292,13 +405,14 @@ static void exofs_put_super(struct super_block *sb)
292 int num_pend; 405 int num_pend;
293 struct exofs_sb_info *sbi = sb->s_fs_info; 406 struct exofs_sb_info *sbi = sb->s_fs_info;
294 407
295 if (sb->s_dirt)
296 exofs_write_super(sb);
297
298 /* make sure there are no pending commands */ 408 /* make sure there are no pending commands */
299 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0; 409 for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
300 num_pend = atomic_read(&sbi->s_curr_pending)) { 410 num_pend = atomic_read(&sbi->s_curr_pending)) {
301 wait_queue_head_t wq; 411 wait_queue_head_t wq;
412
413 printk(KERN_NOTICE "%s: !!Pending operations in flight. "
414 "This is a BUG. please report to osd-dev@open-osd.org\n",
415 __func__);
302 init_waitqueue_head(&wq); 416 init_waitqueue_head(&wq);
303 wait_event_timeout(wq, 417 wait_event_timeout(wq,
304 (atomic_read(&sbi->s_curr_pending) == 0), 418 (atomic_read(&sbi->s_curr_pending) == 0),
@@ -390,6 +504,23 @@ static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
390 return 0; 504 return 0;
391} 505}
392 506
507static unsigned __ra_pages(struct exofs_layout *layout)
508{
509 const unsigned _MIN_RA = 32; /* min 128K read-ahead */
510 unsigned ra_pages = layout->group_width * layout->stripe_unit /
511 PAGE_SIZE;
512 unsigned max_io_pages = exofs_max_io_pages(layout, ~0);
513
514 ra_pages *= 2; /* two stripes */
515 if (ra_pages < _MIN_RA)
516 ra_pages = roundup(_MIN_RA, ra_pages / 2);
517
518 if (ra_pages > max_io_pages)
519 ra_pages = max_io_pages;
520
521 return ra_pages;
522}
523
393/* @odi is valid only as long as @fscb_dev is valid */ 524/* @odi is valid only as long as @fscb_dev is valid */
394static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev, 525static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
395 struct osd_dev_info *odi) 526 struct osd_dev_info *odi)
@@ -495,7 +626,7 @@ static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
495 } 626 }
496 627
497 od = osduld_info_lookup(&odi); 628 od = osduld_info_lookup(&odi);
498 if (unlikely(IS_ERR(od))) { 629 if (IS_ERR(od)) {
499 ret = PTR_ERR(od); 630 ret = PTR_ERR(od);
500 EXOFS_ERR("ERROR: device requested is not found " 631 EXOFS_ERR("ERROR: device requested is not found "
501 "osd_name-%s =>%d\n", odi.osdname, ret); 632 "osd_name-%s =>%d\n", odi.osdname, ret);
@@ -558,9 +689,17 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
558 goto free_bdi; 689 goto free_bdi;
559 690
560 /* use mount options to fill superblock */ 691 /* use mount options to fill superblock */
561 od = osduld_path_lookup(opts->dev_name); 692 if (opts->is_osdname) {
693 struct osd_dev_info odi = {.systemid_len = 0};
694
695 odi.osdname_len = strlen(opts->dev_name);
696 odi.osdname = (u8 *)opts->dev_name;
697 od = osduld_info_lookup(&odi);
698 } else {
699 od = osduld_path_lookup(opts->dev_name);
700 }
562 if (IS_ERR(od)) { 701 if (IS_ERR(od)) {
563 ret = PTR_ERR(od); 702 ret = -EINVAL;
564 goto free_sbi; 703 goto free_sbi;
565 } 704 }
566 705
@@ -594,6 +733,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
594 goto free_sbi; 733 goto free_sbi;
595 734
596 sb->s_magic = le16_to_cpu(fscb.s_magic); 735 sb->s_magic = le16_to_cpu(fscb.s_magic);
736 /* NOTE: we read below to be backward compatible with old versions */
597 sbi->s_nextid = le64_to_cpu(fscb.s_nextid); 737 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
598 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles); 738 sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
599 739
@@ -604,7 +744,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
604 ret = -EINVAL; 744 ret = -EINVAL;
605 goto free_sbi; 745 goto free_sbi;
606 } 746 }
607 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) { 747 if (le32_to_cpu(fscb.s_version) > EXOFS_FSCB_VER) {
608 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n", 748 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
609 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version)); 749 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
610 ret = -EINVAL; 750 ret = -EINVAL;
@@ -622,7 +762,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
622 goto free_sbi; 762 goto free_sbi;
623 } 763 }
624 764
765 __sbi_read_stats(sbi);
766
625 /* set up operation vectors */ 767 /* set up operation vectors */
768 sbi->bdi.ra_pages = __ra_pages(&sbi->layout);
626 sb->s_bdi = &sbi->bdi; 769 sb->s_bdi = &sbi->bdi;
627 sb->s_fs_info = sbi; 770 sb->s_fs_info = sbi;
628 sb->s_op = &exofs_sops; 771 sb->s_op = &exofs_sops;
@@ -652,6 +795,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
652 795
653 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0], 796 _exofs_print_device("Mounting", opts->dev_name, sbi->layout.s_ods[0],
654 sbi->layout.s_pid); 797 sbi->layout.s_pid);
798 if (opts->is_osdname)
799 kfree(opts->dev_name);
655 return 0; 800 return 0;
656 801
657free_sbi: 802free_sbi:
@@ -660,6 +805,8 @@ free_bdi:
660 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", 805 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
661 opts->dev_name, sbi->layout.s_pid, ret); 806 opts->dev_name, sbi->layout.s_pid, ret);
662 exofs_free_sbi(sbi); 807 exofs_free_sbi(sbi);
808 if (opts->is_osdname)
809 kfree(opts->dev_name);
663 return ret; 810 return ret;
664} 811}
665 812
@@ -677,7 +824,8 @@ static struct dentry *exofs_mount(struct file_system_type *type,
677 if (ret) 824 if (ret)
678 return ERR_PTR(ret); 825 return ERR_PTR(ret);
679 826
680 opts.dev_name = dev_name; 827 if (!opts.dev_name)
828 opts.dev_name = dev_name;
681 return mount_nodev(type, flags, &opts, exofs_fill_super); 829 return mount_nodev(type, flags, &opts, exofs_fill_super);
682} 830}
683 831
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd5..b05acb796135 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
320 struct inode * inode = dentry->d_inode; 320 struct inode * inode = dentry->d_inode;
321 int len = *max_len; 321 int len = *max_len;
322 int type = FILEID_INO32_GEN; 322 int type = FILEID_INO32_GEN;
323 323
324 if (len < 2 || (connectable && len < 4)) 324 if (connectable && (len < 4)) {
325 *max_len = 4;
326 return 255;
327 } else if (len < 2) {
328 *max_len = 2;
325 return 255; 329 return 255;
330 }
326 331
327 len = 2; 332 len = 2;
328 fid->i32.ino = inode->i_ino; 333 fid->i32.ino = inode->i_ino;
@@ -369,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
369 /* 374 /*
370 * Try to get any dentry for the given file handle from the filesystem. 375 * Try to get any dentry for the given file handle from the filesystem.
371 */ 376 */
377 if (!nop || !nop->fh_to_dentry)
378 return ERR_PTR(-ESTALE);
372 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type); 379 result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
373 if (!result) 380 if (!result)
374 result = ERR_PTR(-ESTALE); 381 result = ERR_PTR(-ESTALE);
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 7b4180554a62..abea5a17c764 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -406,7 +406,7 @@ ext2_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
406 return -EINVAL; 406 return -EINVAL;
407 if (!test_opt(dentry->d_sb, POSIX_ACL)) 407 if (!test_opt(dentry->d_sb, POSIX_ACL))
408 return -EOPNOTSUPP; 408 return -EOPNOTSUPP;
409 if (!is_owner_or_cap(dentry->d_inode)) 409 if (!inode_owner_or_capable(dentry->d_inode))
410 return -EPERM; 410 return -EPERM;
411 411
412 if (value) { 412 if (value) {
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 0d06f4e75699..8f44cef1b3ef 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -850,7 +850,7 @@ static int find_next_reservable_window(
850 rsv_window_remove(sb, my_rsv); 850 rsv_window_remove(sb, my_rsv);
851 851
852 /* 852 /*
853 * Let's book the whole avaliable window for now. We will check the 853 * Let's book the whole available window for now. We will check the
854 * disk bitmap later and then, if there are free blocks then we adjust 854 * disk bitmap later and then, if there are free blocks then we adjust
855 * the window size if it's larger than requested. 855 * the window size if it's larger than requested.
856 * Otherwise, we will remove this node from the tree next time 856 * Otherwise, we will remove this node from the tree next time
@@ -1357,9 +1357,9 @@ retry_alloc:
1357 goto allocated; 1357 goto allocated;
1358 } 1358 }
1359 /* 1359 /*
1360 * We may end up a bogus ealier ENOSPC error due to 1360 * We may end up a bogus earlier ENOSPC error due to
1361 * filesystem is "full" of reservations, but 1361 * filesystem is "full" of reservations, but
1362 * there maybe indeed free blocks avaliable on disk 1362 * there maybe indeed free blocks available on disk
1363 * In this case, we just forget about the reservations 1363 * In this case, we just forget about the reservations
1364 * just do block allocation as without reservations. 1364 * just do block allocation as without reservations.
1365 */ 1365 */
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 6346a2acf326..645be9e7ee47 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -110,7 +110,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int); 110extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *, int);
111 111
112/* ialloc.c */ 112/* ialloc.c */
113extern struct inode * ext2_new_inode (struct inode *, int); 113extern struct inode * ext2_new_inode (struct inode *, int, const struct qstr *);
114extern void ext2_free_inode (struct inode *); 114extern void ext2_free_inode (struct inode *);
115extern unsigned long ext2_count_free_inodes (struct super_block *); 115extern unsigned long ext2_count_free_inodes (struct super_block *);
116extern void ext2_check_inodes_bitmap (struct super_block *); 116extern void ext2_check_inodes_bitmap (struct super_block *);
@@ -174,3 +174,9 @@ ext2_group_first_block_no(struct super_block *sb, unsigned long group_no)
174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) + 174 return group_no * (ext2_fsblk_t)EXT2_BLOCKS_PER_GROUP(sb) +
175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block); 175 le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block);
176} 176}
177
178#define ext2_set_bit __test_and_set_bit_le
179#define ext2_clear_bit __test_and_clear_bit_le
180#define ext2_test_bit test_bit_le
181#define ext2_find_first_zero_bit find_first_zero_bit_le
182#define ext2_find_next_zero_bit find_next_zero_bit_le
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index ad70479aabff..ee9ed31948e1 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -429,7 +429,8 @@ found:
429 return group; 429 return group;
430} 430}
431 431
432struct inode *ext2_new_inode(struct inode *dir, int mode) 432struct inode *ext2_new_inode(struct inode *dir, int mode,
433 const struct qstr *qstr)
433{ 434{
434 struct super_block *sb; 435 struct super_block *sb;
435 struct buffer_head *bitmap_bh = NULL; 436 struct buffer_head *bitmap_bh = NULL;
@@ -585,7 +586,7 @@ got:
585 if (err) 586 if (err)
586 goto fail_free_drop; 587 goto fail_free_drop;
587 588
588 err = ext2_init_security(inode,dir); 589 err = ext2_init_security(inode, dir, qstr);
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 40ad210a5049..788e09a07f7e 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -305,7 +305,7 @@ static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind)
305 return ind->bh->b_blocknr; 305 return ind->bh->b_blocknr;
306 306
307 /* 307 /*
308 * It is going to be refered from inode itself? OK, just put it into 308 * It is going to be referred from inode itself? OK, just put it into
309 * the same cylinder group then. 309 * the same cylinder group then.
310 */ 310 */
311 bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group); 311 bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group);
@@ -860,7 +860,6 @@ const struct address_space_operations ext2_aops = {
860 .readpage = ext2_readpage, 860 .readpage = ext2_readpage,
861 .readpages = ext2_readpages, 861 .readpages = ext2_readpages,
862 .writepage = ext2_writepage, 862 .writepage = ext2_writepage,
863 .sync_page = block_sync_page,
864 .write_begin = ext2_write_begin, 863 .write_begin = ext2_write_begin,
865 .write_end = ext2_write_end, 864 .write_end = ext2_write_end,
866 .bmap = ext2_bmap, 865 .bmap = ext2_bmap,
@@ -880,7 +879,6 @@ const struct address_space_operations ext2_nobh_aops = {
880 .readpage = ext2_readpage, 879 .readpage = ext2_readpage,
881 .readpages = ext2_readpages, 880 .readpages = ext2_readpages,
882 .writepage = ext2_nobh_writepage, 881 .writepage = ext2_nobh_writepage,
883 .sync_page = block_sync_page,
884 .write_begin = ext2_nobh_write_begin, 882 .write_begin = ext2_nobh_write_begin,
885 .write_end = nobh_write_end, 883 .write_end = nobh_write_end,
886 .bmap = ext2_bmap, 884 .bmap = ext2_bmap,
@@ -915,7 +913,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
915 * 913 *
916 * When we do truncate() we may have to clean the ends of several indirect 914 * When we do truncate() we may have to clean the ends of several indirect
917 * blocks but leave the blocks themselves alive. Block is partially 915 * blocks but leave the blocks themselves alive. Block is partially
918 * truncated if some data below the new i_size is refered from it (and 916 * truncated if some data below the new i_size is referred from it (and
919 * it is on the path to the first completely truncated data block, indeed). 917 * it is on the path to the first completely truncated data block, indeed).
920 * We have to free the top of that path along with everything to the right 918 * We have to free the top of that path along with everything to the right
921 * of the path. Since no allocation past the truncation point is possible 919 * of the path. Since no allocation past the truncation point is possible
@@ -992,7 +990,7 @@ no_top:
992 * @p: array of block numbers 990 * @p: array of block numbers
993 * @q: points immediately past the end of array 991 * @q: points immediately past the end of array
994 * 992 *
995 * We are freeing all blocks refered from that array (numbers are 993 * We are freeing all blocks referred from that array (numbers are
996 * stored as little-endian 32-bit) and updating @inode->i_blocks 994 * stored as little-endian 32-bit) and updating @inode->i_blocks
997 * appropriately. 995 * appropriately.
998 */ 996 */
@@ -1032,7 +1030,7 @@ static inline void ext2_free_data(struct inode *inode, __le32 *p, __le32 *q)
1032 * @q: pointer immediately past the end of array 1030 * @q: pointer immediately past the end of array
1033 * @depth: depth of the branches to free 1031 * @depth: depth of the branches to free
1034 * 1032 *
1035 * We are freeing all blocks refered from these branches (numbers are 1033 * We are freeing all blocks referred from these branches (numbers are
1036 * stored as little-endian 32-bit) and updating @inode->i_blocks 1034 * stored as little-endian 32-bit) and updating @inode->i_blocks
1037 * appropriately. 1035 * appropriately.
1038 */ 1036 */
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index e7431309bdca..f81e250ac5c4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
39 if (ret) 39 if (ret)
40 return ret; 40 return ret;
41 41
42 if (!is_owner_or_cap(inode)) { 42 if (!inode_owner_or_capable(inode)) {
43 ret = -EACCES; 43 ret = -EACCES;
44 goto setflags_out; 44 goto setflags_out;
45 } 45 }
@@ -89,7 +89,7 @@ setflags_out:
89 case EXT2_IOC_GETVERSION: 89 case EXT2_IOC_GETVERSION:
90 return put_user(inode->i_generation, (int __user *) arg); 90 return put_user(inode->i_generation, (int __user *) arg);
91 case EXT2_IOC_SETVERSION: 91 case EXT2_IOC_SETVERSION:
92 if (!is_owner_or_cap(inode)) 92 if (!inode_owner_or_capable(inode))
93 return -EPERM; 93 return -EPERM;
94 ret = mnt_want_write(filp->f_path.mnt); 94 ret = mnt_want_write(filp->f_path.mnt);
95 if (ret) 95 if (ret)
@@ -115,7 +115,7 @@ setflags_out:
115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 115 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
116 return -ENOTTY; 116 return -ENOTTY;
117 117
118 if (!is_owner_or_cap(inode)) 118 if (!inode_owner_or_capable(inode))
119 return -EACCES; 119 return -EACCES;
120 120
121 if (get_user(rsv_window_size, (int __user *)arg)) 121 if (get_user(rsv_window_size, (int __user *)arg))
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..ed5c5d496ee9 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -104,7 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
104 104
105 dquot_initialize(dir); 105 dquot_initialize(dir);
106 106
107 inode = ext2_new_inode(dir, mode); 107 inode = ext2_new_inode(dir, mode, &dentry->d_name);
108 if (IS_ERR(inode)) 108 if (IS_ERR(inode))
109 return PTR_ERR(inode); 109 return PTR_ERR(inode);
110 110
@@ -133,7 +133,7 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_
133 133
134 dquot_initialize(dir); 134 dquot_initialize(dir);
135 135
136 inode = ext2_new_inode (dir, mode); 136 inode = ext2_new_inode (dir, mode, &dentry->d_name);
137 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
138 if (!IS_ERR(inode)) { 138 if (!IS_ERR(inode)) {
139 init_special_inode(inode, inode->i_mode, rdev); 139 init_special_inode(inode, inode->i_mode, rdev);
@@ -159,7 +159,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
159 159
160 dquot_initialize(dir); 160 dquot_initialize(dir);
161 161
162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO); 162 inode = ext2_new_inode (dir, S_IFLNK | S_IRWXUGO, &dentry->d_name);
163 err = PTR_ERR(inode); 163 err = PTR_ERR(inode);
164 if (IS_ERR(inode)) 164 if (IS_ERR(inode))
165 goto out; 165 goto out;
@@ -230,7 +230,7 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
230 230
231 inode_inc_link_count(dir); 231 inode_inc_link_count(dir);
232 232
233 inode = ext2_new_inode (dir, S_IFDIR | mode); 233 inode = ext2_new_inode(dir, S_IFDIR | mode, &dentry->d_name);
234 err = PTR_ERR(inode); 234 err = PTR_ERR(inode);
235 if (IS_ERR(inode)) 235 if (IS_ERR(inode))
236 goto out_dir; 236 goto out_dir;
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page); 344 new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
345 if (!new_de) 345 if (!new_de)
346 goto out_dir; 346 goto out_dir;
347 inode_inc_link_count(old_inode);
348 ext2_set_link(new_dir, new_de, new_page, old_inode, 1); 347 ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
349 new_inode->i_ctime = CURRENT_TIME_SEC; 348 new_inode->i_ctime = CURRENT_TIME_SEC;
350 if (dir_de) 349 if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
356 if (new_dir->i_nlink >= EXT2_LINK_MAX) 355 if (new_dir->i_nlink >= EXT2_LINK_MAX)
357 goto out_dir; 356 goto out_dir;
358 } 357 }
359 inode_inc_link_count(old_inode);
360 err = ext2_add_link(new_dentry, old_inode); 358 err = ext2_add_link(new_dentry, old_inode);
361 if (err) { 359 if (err)
362 inode_dec_link_count(old_inode);
363 goto out_dir; 360 goto out_dir;
364 }
365 if (dir_de) 361 if (dir_de)
366 inode_inc_link_count(new_dir); 362 inode_inc_link_count(new_dir);
367 } 363 }
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
369 /* 365 /*
370 * Like most other Unix systems, set the ctime for inodes on a 366 * Like most other Unix systems, set the ctime for inodes on a
371 * rename. 367 * rename.
372 * inode_dec_link_count() will mark the inode dirty.
373 */ 368 */
374 old_inode->i_ctime = CURRENT_TIME_SEC; 369 old_inode->i_ctime = CURRENT_TIME_SEC;
370 mark_inode_dirty(old_inode);
375 371
376 ext2_delete_entry (old_de, old_page); 372 ext2_delete_entry (old_de, old_page);
377 inode_dec_link_count(old_inode);
378 373
379 if (dir_de) { 374 if (dir_de) {
380 if (old_dir != new_dir) 375 if (old_dir != new_dir)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 7731695e65d9..0a78dae7e2cb 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1382,7 +1382,7 @@ static struct dentry *ext2_mount(struct file_system_type *fs_type,
1382 1382
1383/* Read data from quotafile - avoid pagecache and such because we cannot afford 1383/* Read data from quotafile - avoid pagecache and such because we cannot afford
1384 * acquiring the locks... As quota files are never truncated and quota code 1384 * acquiring the locks... As quota files are never truncated and quota code
1385 * itself serializes the operations (and noone else should touch the files) 1385 * itself serializes the operations (and no one else should touch the files)
1386 * we don't have to be afraid of races */ 1386 * we don't have to be afraid of races */
1387static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, 1387static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
1388 size_t len, loff_t off) 1388 size_t len, loff_t off)
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index c2e4dce984d2..529970617a21 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -35,7 +35,7 @@
35 * +------------------+ 35 * +------------------+
36 * 36 *
37 * The block header is followed by multiple entry descriptors. These entry 37 * The block header is followed by multiple entry descriptors. These entry
38 * descriptors are variable in size, and alligned to EXT2_XATTR_PAD 38 * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
39 * byte boundaries. The entry descriptors are sorted by attribute name, 39 * byte boundaries. The entry descriptors are sorted by attribute name,
40 * so that two extended attribute blocks can be compared efficiently. 40 * so that two extended attribute blocks can be compared efficiently.
41 * 41 *
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index a1a1c2184616..5e41cccff762 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -116,9 +116,11 @@ exit_ext2_xattr(void)
116# endif /* CONFIG_EXT2_FS_XATTR */ 116# endif /* CONFIG_EXT2_FS_XATTR */
117 117
118#ifdef CONFIG_EXT2_FS_SECURITY 118#ifdef CONFIG_EXT2_FS_SECURITY
119extern int ext2_init_security(struct inode *inode, struct inode *dir); 119extern int ext2_init_security(struct inode *inode, struct inode *dir,
120 const struct qstr *qstr);
120#else 121#else
121static inline int ext2_init_security(struct inode *inode, struct inode *dir) 122static inline int ext2_init_security(struct inode *inode, struct inode *dir,
123 const struct qstr *qstr)
122{ 124{
123 return 0; 125 return 0;
124} 126}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 3004e15d5da5..5d979b4347b0 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -47,14 +47,15 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name,
47} 47}
48 48
49int 49int
50ext2_init_security(struct inode *inode, struct inode *dir) 50ext2_init_security(struct inode *inode, struct inode *dir,
51 const struct qstr *qstr)
51{ 52{
52 int err; 53 int err;
53 size_t len; 54 size_t len;
54 void *value; 55 void *value;
55 char *name; 56 char *name;
56 57
57 err = security_inode_init_security(inode, dir, &name, &value, &len); 58 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
58 if (err) { 59 if (err) {
59 if (err == -EOPNOTSUPP) 60 if (err == -EOPNOTSUPP)
60 return 0; 61 return 0;
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index e4fa49e6c539..9d021c0d472a 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -435,7 +435,7 @@ ext3_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
435 return -EINVAL; 435 return -EINVAL;
436 if (!test_opt(inode->i_sb, POSIX_ACL)) 436 if (!test_opt(inode->i_sb, POSIX_ACL))
437 return -EOPNOTSUPP; 437 return -EOPNOTSUPP;
438 if (!is_owner_or_cap(inode)) 438 if (!inode_owner_or_capable(inode))
439 return -EPERM; 439 return -EPERM;
440 440
441 if (value) { 441 if (value) {
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 045995c8ce5a..fe52297e31ad 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -590,7 +590,7 @@ do_more:
590 BUFFER_TRACE(debug_bh, "Deleted!"); 590 BUFFER_TRACE(debug_bh, "Deleted!");
591 if (!bh2jh(bitmap_bh)->b_committed_data) 591 if (!bh2jh(bitmap_bh)->b_committed_data)
592 BUFFER_TRACE(debug_bh, 592 BUFFER_TRACE(debug_bh,
593 "No commited data in bitmap"); 593 "No committed data in bitmap");
594 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); 594 BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
595 __brelse(debug_bh); 595 __brelse(debug_bh);
596 } 596 }
@@ -1063,7 +1063,7 @@ static int find_next_reservable_window(
1063 rsv_window_remove(sb, my_rsv); 1063 rsv_window_remove(sb, my_rsv);
1064 1064
1065 /* 1065 /*
1066 * Let's book the whole avaliable window for now. We will check the 1066 * Let's book the whole available window for now. We will check the
1067 * disk bitmap later and then, if there are free blocks then we adjust 1067 * disk bitmap later and then, if there are free blocks then we adjust
1068 * the window size if it's larger than requested. 1068 * the window size if it's larger than requested.
1069 * Otherwise, we will remove this node from the tree next time 1069 * Otherwise, we will remove this node from the tree next time
@@ -1456,7 +1456,7 @@ static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
1456 * 1456 *
1457 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if 1457 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
1458 * it is profitable to retry the operation, this function will wait 1458 * it is profitable to retry the operation, this function will wait
1459 * for the current or commiting transaction to complete, and then 1459 * for the current or committing transaction to complete, and then
1460 * return TRUE. 1460 * return TRUE.
1461 * 1461 *
1462 * if the total number of retries exceed three times, return FALSE. 1462 * if the total number of retries exceed three times, return FALSE.
@@ -1632,9 +1632,9 @@ retry_alloc:
1632 goto allocated; 1632 goto allocated;
1633 } 1633 }
1634 /* 1634 /*
1635 * We may end up a bogus ealier ENOSPC error due to 1635 * We may end up a bogus earlier ENOSPC error due to
1636 * filesystem is "full" of reservations, but 1636 * filesystem is "full" of reservations, but
1637 * there maybe indeed free blocks avaliable on disk 1637 * there maybe indeed free blocks available on disk
1638 * In this case, we just forget about the reservations 1638 * In this case, we just forget about the reservations
1639 * just do block allocation as without reservations. 1639 * just do block allocation as without reservations.
1640 */ 1640 */
@@ -1991,6 +1991,7 @@ ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
1991 spin_unlock(sb_bgl_lock(sbi, group)); 1991 spin_unlock(sb_bgl_lock(sbi, group));
1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start); 1992 percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
1993 1993
1994 free_blocks -= next - start;
1994 /* Do not issue a TRIM on extents smaller than minblocks */ 1995 /* Do not issue a TRIM on extents smaller than minblocks */
1995 if ((next - start) < minblocks) 1996 if ((next - start) < minblocks)
1996 goto free_extent; 1997 goto free_extent;
@@ -2040,7 +2041,7 @@ free_extent:
2040 cond_resched(); 2041 cond_resched();
2041 2042
2042 /* No more suitable extents */ 2043 /* No more suitable extents */
2043 if ((free_blocks - count) < minblocks) 2044 if (free_blocks < minblocks)
2044 break; 2045 break;
2045 } 2046 }
2046 2047
@@ -2090,7 +2091,8 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2090 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count); 2091 ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
2091 int ret = 0; 2092 int ret = 0;
2092 2093
2093 start = range->start >> sb->s_blocksize_bits; 2094 start = (range->start >> sb->s_blocksize_bits) +
2095 le32_to_cpu(es->s_first_data_block);
2094 len = range->len >> sb->s_blocksize_bits; 2096 len = range->len >> sb->s_blocksize_bits;
2095 minlen = range->minlen >> sb->s_blocksize_bits; 2097 minlen = range->minlen >> sb->s_blocksize_bits;
2096 trimmed = 0; 2098 trimmed = 0;
@@ -2099,10 +2101,6 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2099 return -EINVAL; 2101 return -EINVAL;
2100 if (start >= max_blks) 2102 if (start >= max_blks)
2101 goto out; 2103 goto out;
2102 if (start < le32_to_cpu(es->s_first_data_block)) {
2103 len -= le32_to_cpu(es->s_first_data_block) - start;
2104 start = le32_to_cpu(es->s_first_data_block);
2105 }
2106 if (start + len > max_blks) 2104 if (start + len > max_blks)
2107 len = max_blks - start; 2105 len = max_blks - start;
2108 2106
@@ -2129,10 +2127,15 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
2129 if (free_blocks < minlen) 2127 if (free_blocks < minlen)
2130 continue; 2128 continue;
2131 2129
2132 if (len >= EXT3_BLOCKS_PER_GROUP(sb)) 2130 /*
2133 len -= (EXT3_BLOCKS_PER_GROUP(sb) - first_block); 2131 * For all the groups except the last one, last block will
2134 else 2132 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
2133 * change it for the last group in which case first_block +
2134 * len < EXT3_BLOCKS_PER_GROUP(sb).
2135 */
2136 if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
2135 last_block = first_block + len; 2137 last_block = first_block + len;
2138 len -= last_block - first_block;
2136 2139
2137 ret = ext3_trim_all_free(sb, group, first_block, 2140 ret = ext3_trim_all_free(sb, group, first_block,
2138 last_block, minlen); 2141 last_block, minlen);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 9724aef22460..bfc2dc43681d 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -404,7 +404,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent)
404 * For other inodes, search forward from the parent directory's block 404 * For other inodes, search forward from the parent directory's block
405 * group to find a free inode. 405 * group to find a free inode.
406 */ 406 */
407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode) 407struct inode *ext3_new_inode(handle_t *handle, struct inode * dir,
408 const struct qstr *qstr, int mode)
408{ 409{
409 struct super_block *sb; 410 struct super_block *sb;
410 struct buffer_head *bitmap_bh = NULL; 411 struct buffer_head *bitmap_bh = NULL;
@@ -589,7 +590,7 @@ got:
589 if (err) 590 if (err)
590 goto fail_free_drop; 591 goto fail_free_drop;
591 592
592 err = ext3_init_security(handle,inode, dir); 593 err = ext3_init_security(handle, inode, dir, qstr);
593 if (err) 594 if (err)
594 goto fail_free_drop; 595 goto fail_free_drop;
595 596
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index ae94f6d949f5..68b2e43d7c35 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1894,7 +1894,6 @@ static const struct address_space_operations ext3_ordered_aops = {
1894 .readpage = ext3_readpage, 1894 .readpage = ext3_readpage,
1895 .readpages = ext3_readpages, 1895 .readpages = ext3_readpages,
1896 .writepage = ext3_ordered_writepage, 1896 .writepage = ext3_ordered_writepage,
1897 .sync_page = block_sync_page,
1898 .write_begin = ext3_write_begin, 1897 .write_begin = ext3_write_begin,
1899 .write_end = ext3_ordered_write_end, 1898 .write_end = ext3_ordered_write_end,
1900 .bmap = ext3_bmap, 1899 .bmap = ext3_bmap,
@@ -1910,7 +1909,6 @@ static const struct address_space_operations ext3_writeback_aops = {
1910 .readpage = ext3_readpage, 1909 .readpage = ext3_readpage,
1911 .readpages = ext3_readpages, 1910 .readpages = ext3_readpages,
1912 .writepage = ext3_writeback_writepage, 1911 .writepage = ext3_writeback_writepage,
1913 .sync_page = block_sync_page,
1914 .write_begin = ext3_write_begin, 1912 .write_begin = ext3_write_begin,
1915 .write_end = ext3_writeback_write_end, 1913 .write_end = ext3_writeback_write_end,
1916 .bmap = ext3_bmap, 1914 .bmap = ext3_bmap,
@@ -1926,7 +1924,6 @@ static const struct address_space_operations ext3_journalled_aops = {
1926 .readpage = ext3_readpage, 1924 .readpage = ext3_readpage,
1927 .readpages = ext3_readpages, 1925 .readpages = ext3_readpages,
1928 .writepage = ext3_journalled_writepage, 1926 .writepage = ext3_journalled_writepage,
1929 .sync_page = block_sync_page,
1930 .write_begin = ext3_write_begin, 1927 .write_begin = ext3_write_begin,
1931 .write_end = ext3_journalled_write_end, 1928 .write_end = ext3_journalled_write_end,
1932 .set_page_dirty = ext3_journalled_set_page_dirty, 1929 .set_page_dirty = ext3_journalled_set_page_dirty,
@@ -2058,7 +2055,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
2058 * 2055 *
2059 * When we do truncate() we may have to clean the ends of several 2056 * When we do truncate() we may have to clean the ends of several
2060 * indirect blocks but leave the blocks themselves alive. Block is 2057 * indirect blocks but leave the blocks themselves alive. Block is
2061 * partially truncated if some data below the new i_size is refered 2058 * partially truncated if some data below the new i_size is referred
2062 * from it (and it is on the path to the first completely truncated 2059 * from it (and it is on the path to the first completely truncated
2063 * data block, indeed). We have to free the top of that path along 2060 * data block, indeed). We have to free the top of that path along
2064 * with everything to the right of the path. Since no allocation 2061 * with everything to the right of the path. Since no allocation
@@ -2187,7 +2184,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode,
2187 * @first: array of block numbers 2184 * @first: array of block numbers
2188 * @last: points immediately past the end of array 2185 * @last: points immediately past the end of array
2189 * 2186 *
2190 * We are freeing all blocks refered from that array (numbers are stored as 2187 * We are freeing all blocks referred from that array (numbers are stored as
2191 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 2188 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2192 * 2189 *
2193 * We accumulate contiguous runs of blocks to free. Conveniently, if these 2190 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -2275,7 +2272,7 @@ static void ext3_free_data(handle_t *handle, struct inode *inode,
2275 * @last: pointer immediately past the end of array 2272 * @last: pointer immediately past the end of array
2276 * @depth: depth of the branches to free 2273 * @depth: depth of the branches to free
2277 * 2274 *
2278 * We are freeing all blocks refered from these branches (numbers are 2275 * We are freeing all blocks referred from these branches (numbers are
2279 * stored as little-endian 32-bit) and updating @inode->i_blocks 2276 * stored as little-endian 32-bit) and updating @inode->i_blocks
2280 * appropriately. 2277 * appropriately.
2281 */ 2278 */
@@ -3294,7 +3291,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
3294 if (ext3_should_journal_data(inode)) 3291 if (ext3_should_journal_data(inode))
3295 ret = 3 * (bpp + indirects) + 2; 3292 ret = 3 * (bpp + indirects) + 2;
3296 else 3293 else
3297 ret = 2 * (bpp + indirects) + 2; 3294 ret = 2 * (bpp + indirects) + indirects + 2;
3298 3295
3299#ifdef CONFIG_QUOTA 3296#ifdef CONFIG_QUOTA
3300 /* We know that structure was already allocated during dquot_initialize so 3297 /* We know that structure was already allocated during dquot_initialize so
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index fc080dd561f7..f4090bd2f345 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -38,7 +38,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -123,7 +123,7 @@ flags_out:
123 __u32 generation; 123 __u32 generation;
124 int err; 124 int err;
125 125
126 if (!is_owner_or_cap(inode)) 126 if (!inode_owner_or_capable(inode))
127 return -EPERM; 127 return -EPERM;
128 128
129 err = mnt_want_write(filp->f_path.mnt); 129 err = mnt_want_write(filp->f_path.mnt);
@@ -192,7 +192,7 @@ setversion_out:
192 if (err) 192 if (err)
193 return err; 193 return err;
194 194
195 if (!is_owner_or_cap(inode)) { 195 if (!inode_owner_or_capable(inode)) {
196 err = -EACCES; 196 err = -EACCES;
197 goto setrsvsz_out; 197 goto setrsvsz_out;
198 } 198 }
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810ec..32f3b8695859 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1540,8 +1540,8 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
1540 goto cleanup; 1540 goto cleanup;
1541 node2 = (struct dx_node *)(bh2->b_data); 1541 node2 = (struct dx_node *)(bh2->b_data);
1542 entries2 = node2->entries; 1542 entries2 = node2->entries;
1543 memset(&node2->fake, 0, sizeof(struct fake_dirent));
1543 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize); 1544 node2->fake.rec_len = ext3_rec_len_to_disk(sb->s_blocksize);
1544 node2->fake.inode = 0;
1545 BUFFER_TRACE(frame->bh, "get_write_access"); 1545 BUFFER_TRACE(frame->bh, "get_write_access");
1546 err = ext3_journal_get_write_access(handle, frame->bh); 1546 err = ext3_journal_get_write_access(handle, frame->bh);
1547 if (err) 1547 if (err)
@@ -1710,7 +1710,7 @@ retry:
1710 if (IS_DIRSYNC(dir)) 1710 if (IS_DIRSYNC(dir))
1711 handle->h_sync = 1; 1711 handle->h_sync = 1;
1712 1712
1713 inode = ext3_new_inode (handle, dir, mode); 1713 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1714 err = PTR_ERR(inode); 1714 err = PTR_ERR(inode);
1715 if (!IS_ERR(inode)) { 1715 if (!IS_ERR(inode)) {
1716 inode->i_op = &ext3_file_inode_operations; 1716 inode->i_op = &ext3_file_inode_operations;
@@ -1746,7 +1746,7 @@ retry:
1746 if (IS_DIRSYNC(dir)) 1746 if (IS_DIRSYNC(dir))
1747 handle->h_sync = 1; 1747 handle->h_sync = 1;
1748 1748
1749 inode = ext3_new_inode (handle, dir, mode); 1749 inode = ext3_new_inode (handle, dir, &dentry->d_name, mode);
1750 err = PTR_ERR(inode); 1750 err = PTR_ERR(inode);
1751 if (!IS_ERR(inode)) { 1751 if (!IS_ERR(inode)) {
1752 init_special_inode(inode, inode->i_mode, rdev); 1752 init_special_inode(inode, inode->i_mode, rdev);
@@ -1784,7 +1784,7 @@ retry:
1784 if (IS_DIRSYNC(dir)) 1784 if (IS_DIRSYNC(dir))
1785 handle->h_sync = 1; 1785 handle->h_sync = 1;
1786 1786
1787 inode = ext3_new_inode (handle, dir, S_IFDIR | mode); 1787 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFDIR | mode);
1788 err = PTR_ERR(inode); 1788 err = PTR_ERR(inode);
1789 if (IS_ERR(inode)) 1789 if (IS_ERR(inode))
1790 goto out_stop; 1790 goto out_stop;
@@ -2206,7 +2206,7 @@ retry:
2206 if (IS_DIRSYNC(dir)) 2206 if (IS_DIRSYNC(dir))
2207 handle->h_sync = 1; 2207 handle->h_sync = 1;
2208 2208
2209 inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); 2209 inode = ext3_new_inode (handle, dir, &dentry->d_name, S_IFLNK|S_IRWXUGO);
2210 err = PTR_ERR(inode); 2210 err = PTR_ERR(inode);
2211 if (IS_ERR(inode)) 2211 if (IS_ERR(inode))
2212 goto out_stop; 2212 goto out_stop;
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
2253 2253
2254 dquot_initialize(dir); 2254 dquot_initialize(dir);
2255 2255
2256 /*
2257 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2258 * otherwise has the potential to corrupt the orphan inode list.
2259 */
2260 if (inode->i_nlink == 0)
2261 return -ENOENT;
2262
2263retry: 2256retry:
2264 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + 2257 handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
2265 EXT3_INDEX_EXTRA_TRANS_BLOCKS); 2258 EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 108b142e11ed..7916e4ce166a 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -1009,7 +1009,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
1009 1009
1010 if (test_opt(sb, DEBUG)) 1010 if (test_opt(sb, DEBUG))
1011 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK 1011 printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK
1012 " upto "E3FSBLK" blocks\n", 1012 " up to "E3FSBLK" blocks\n",
1013 o_blocks_count, n_blocks_count); 1013 o_blocks_count, n_blocks_count);
1014 1014
1015 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count) 1015 if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f2473..3c6a9e0eadc1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1464,6 +1464,13 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1464 return; 1464 return;
1465 } 1465 }
1466 1466
1467 /* Check if feature set allows readwrite operations */
1468 if (EXT3_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP)) {
1469 ext3_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
1470 "unknown ROCOMPAT features");
1471 return;
1472 }
1473
1467 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) { 1474 if (EXT3_SB(sb)->s_mount_state & EXT3_ERROR_FS) {
1468 if (es->s_last_orphan) 1475 if (es->s_last_orphan)
1469 jbd_debug(1, "Errors on filesystem, " 1476 jbd_debug(1, "Errors on filesystem, "
@@ -1936,6 +1943,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
1936 sb->s_qcop = &ext3_qctl_operations; 1943 sb->s_qcop = &ext3_qctl_operations;
1937 sb->dq_op = &ext3_quota_operations; 1944 sb->dq_op = &ext3_quota_operations;
1938#endif 1945#endif
1946 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
1939 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 1947 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
1940 mutex_init(&sbi->s_orphan_lock); 1948 mutex_init(&sbi->s_orphan_lock);
1941 mutex_init(&sbi->s_resize_lock); 1949 mutex_init(&sbi->s_resize_lock);
@@ -2917,7 +2925,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2917 2925
2918/* Read data from quotafile - avoid pagecache and such because we cannot afford 2926/* Read data from quotafile - avoid pagecache and such because we cannot afford
2919 * acquiring the locks... As quota files are never truncated and quota code 2927 * acquiring the locks... As quota files are never truncated and quota code
2920 * itself serializes the operations (and noone else should touch the files) 2928 * itself serializes the operations (and no one else should touch the files)
2921 * we don't have to be afraid of races */ 2929 * we don't have to be afraid of races */
2922static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 2930static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
2923 size_t len, loff_t off) 2931 size_t len, loff_t off)
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 377fe7201169..2be4f69bfa64 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -128,10 +128,10 @@ exit_ext3_xattr(void)
128 128
129#ifdef CONFIG_EXT3_FS_SECURITY 129#ifdef CONFIG_EXT3_FS_SECURITY
130extern int ext3_init_security(handle_t *handle, struct inode *inode, 130extern int ext3_init_security(handle_t *handle, struct inode *inode,
131 struct inode *dir); 131 struct inode *dir, const struct qstr *qstr);
132#else 132#else
133static inline int ext3_init_security(handle_t *handle, struct inode *inode, 133static inline int ext3_init_security(handle_t *handle, struct inode *inode,
134 struct inode *dir) 134 struct inode *dir, const struct qstr *qstr)
135{ 135{
136 return 0; 136 return 0;
137} 137}
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 03a99bfc59f9..b8d9f83aa5c5 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -49,14 +49,15 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index e0270d1f8d82..21eacd7b7d79 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -433,7 +433,7 @@ ext4_xattr_set_acl(struct dentry *dentry, const char *name, const void *value,
433 return -EINVAL; 433 return -EINVAL;
434 if (!test_opt(inode->i_sb, POSIX_ACL)) 434 if (!test_opt(inode->i_sb, POSIX_ACL))
435 return -EOPNOTSUPP; 435 return -EOPNOTSUPP;
436 if (!is_owner_or_cap(inode)) 436 if (!inode_owner_or_capable(inode))
437 return -EPERM; 437 return -EPERM;
438 438
439 if (value) { 439 if (value) {
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b822781..1c67139ad4b4 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
21#include "ext4_jbd2.h" 21#include "ext4_jbd2.h"
22#include "mballoc.h" 22#include "mballoc.h"
23 23
24#include <trace/events/ext4.h>
25
24/* 26/*
25 * balloc.c contains the blocks allocation and deallocation routines 27 * balloc.c contains the blocks allocation and deallocation routines
26 */ 28 */
@@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
342 * We do it here so the bitmap uptodate bit 344 * We do it here so the bitmap uptodate bit
343 * get set with buffer lock held. 345 * get set with buffer lock held.
344 */ 346 */
347 trace_ext4_read_block_bitmap_load(sb, block_group);
345 set_bitmap_uptodate(bh); 348 set_bitmap_uptodate(bh);
346 if (bh_submit_read(bh) < 0) { 349 if (bh_submit_read(bh) < 0) {
347 put_bh(bh); 350 put_bh(bh);
@@ -544,7 +547,7 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
544 * 547 *
545 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if 548 * ext4_should_retry_alloc() is called when ENOSPC is returned, and if
546 * it is profitable to retry the operation, this function will wait 549 * it is profitable to retry the operation, this function will wait
547 * for the current or commiting transaction to complete, and then 550 * for the current or committing transaction to complete, and then
548 * return TRUE. 551 * return TRUE.
549 * 552 *
550 * if the total number of retries exceed three times, return FALSE. 553 * if the total number of retries exceed three times, return FALSE.
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0c8d97b56f34..4daaf2b753f4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -848,6 +848,7 @@ struct ext4_inode_info {
848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */ 848 atomic_t i_ioend_count; /* Number of outstanding io_end structs */
849 /* current io_end structure for async DIO write*/ 849 /* current io_end structure for async DIO write*/
850 ext4_io_end_t *cur_aio_dio; 850 ext4_io_end_t *cur_aio_dio;
851 atomic_t i_aiodio_unwritten; /* Nr. of inflight conversions pending */
851 852
852 spinlock_t i_block_reservation_lock; 853 spinlock_t i_block_reservation_lock;
853 854
@@ -922,14 +923,14 @@ struct ext4_inode_info {
922#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ 923#define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \
923 EXT4_MOUNT2_##opt) 924 EXT4_MOUNT2_##opt)
924 925
925#define ext4_set_bit ext2_set_bit 926#define ext4_set_bit __test_and_set_bit_le
926#define ext4_set_bit_atomic ext2_set_bit_atomic 927#define ext4_set_bit_atomic ext2_set_bit_atomic
927#define ext4_clear_bit ext2_clear_bit 928#define ext4_clear_bit __test_and_clear_bit_le
928#define ext4_clear_bit_atomic ext2_clear_bit_atomic 929#define ext4_clear_bit_atomic ext2_clear_bit_atomic
929#define ext4_test_bit ext2_test_bit 930#define ext4_test_bit test_bit_le
930#define ext4_find_first_zero_bit ext2_find_first_zero_bit 931#define ext4_find_first_zero_bit find_first_zero_bit_le
931#define ext4_find_next_zero_bit ext2_find_next_zero_bit 932#define ext4_find_next_zero_bit find_next_zero_bit_le
932#define ext4_find_next_bit ext2_find_next_bit 933#define ext4_find_next_bit find_next_bit_le
933 934
934/* 935/*
935 * Maximal mount counts between two filesystem checks 936 * Maximal mount counts between two filesystem checks
@@ -2119,6 +2120,15 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
2119 2120
2120#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) 2121#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
2121 2122
2123/* For ioend & aio unwritten conversion wait queues */
2124#define EXT4_WQ_HASH_SZ 37
2125#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\
2126 EXT4_WQ_HASH_SZ])
2127#define ext4_aio_mutex(v) (&ext4__aio_mutex[((unsigned long)(v)) %\
2128 EXT4_WQ_HASH_SZ])
2129extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
2130extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
2131
2122#endif /* __KERNEL__ */ 2132#endif /* __KERNEL__ */
2123 2133
2124#endif /* _EXT4_H */ 2134#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e658c1..d0f53538a57f 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
86 86
87#ifdef CONFIG_QUOTA 87#ifdef CONFIG_QUOTA
88/* Amount of blocks needed for quota update - we know that the structure was 88/* Amount of blocks needed for quota update - we know that the structure was
89 * allocated so we need to update only inode+data */ 89 * allocated so we need to update only data block */
90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) 90#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91/* Amount of blocks needed for quota insert/delete - we do some block writes 91/* Amount of blocks needed for quota insert/delete - we do some block writes
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
@@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
202 return 1; 202 return 1;
203} 203}
204 204
205static inline void ext4_journal_release_buffer(handle_t *handle,
206 struct buffer_head *bh)
207{
208 if (ext4_handle_valid(handle))
209 jbd2_journal_release_buffer(handle, bh);
210}
211
212static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 205static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
213{ 206{
214 return ext4_journal_start_sb(inode->i_sb, nblocks); 207 return ext4_journal_start_sb(inode->i_sb, nblocks);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 63a75810b7c3..4890d6f3ad15 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,8 @@
44#include "ext4_jbd2.h" 44#include "ext4_jbd2.h"
45#include "ext4_extents.h" 45#include "ext4_extents.h"
46 46
47#include <trace/events/ext4.h>
48
47static int ext4_ext_truncate_extend_restart(handle_t *handle, 49static int ext4_ext_truncate_extend_restart(handle_t *handle,
48 struct inode *inode, 50 struct inode *inode,
49 int needed) 51 int needed)
@@ -131,7 +133,7 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
131 * fragmenting the file system's free space. Maybe we 133 * fragmenting the file system's free space. Maybe we
132 * should have some hueristics or some way to allow 134 * should have some hueristics or some way to allow
133 * userspace to pass a hint to file system, 135 * userspace to pass a hint to file system,
134 * especiially if the latter case turns out to be 136 * especially if the latter case turns out to be
135 * common. 137 * common.
136 */ 138 */
137 ex = path[depth].p_ext; 139 ex = path[depth].p_ext;
@@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
664 if (unlikely(!bh)) 666 if (unlikely(!bh))
665 goto err; 667 goto err;
666 if (!bh_uptodate_or_lock(bh)) { 668 if (!bh_uptodate_or_lock(bh)) {
669 trace_ext4_ext_load_extent(inode, block,
670 path[ppos].p_block);
667 if (bh_submit_read(bh) < 0) { 671 if (bh_submit_read(bh) < 0) {
668 put_bh(bh); 672 put_bh(bh);
669 goto err; 673 goto err;
@@ -1034,7 +1038,7 @@ cleanup:
1034 for (i = 0; i < depth; i++) { 1038 for (i = 0; i < depth; i++) {
1035 if (!ablocks[i]) 1039 if (!ablocks[i])
1036 continue; 1040 continue;
1037 ext4_free_blocks(handle, inode, 0, ablocks[i], 1, 1041 ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
1038 EXT4_FREE_BLOCKS_METADATA); 1042 EXT4_FREE_BLOCKS_METADATA);
1039 } 1043 }
1040 } 1044 }
@@ -1725,7 +1729,7 @@ repeat:
1725 BUG_ON(npath->p_depth != path->p_depth); 1729 BUG_ON(npath->p_depth != path->p_depth);
1726 eh = npath[depth].p_hdr; 1730 eh = npath[depth].p_hdr;
1727 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { 1731 if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) {
1728 ext_debug("next leaf isnt full(%d)\n", 1732 ext_debug("next leaf isn't full(%d)\n",
1729 le16_to_cpu(eh->eh_entries)); 1733 le16_to_cpu(eh->eh_entries));
1730 path = npath; 1734 path = npath;
1731 goto repeat; 1735 goto repeat;
@@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
2059 if (err) 2063 if (err)
2060 return err; 2064 return err;
2061 ext_debug("index is empty, remove it, free block %llu\n", leaf); 2065 ext_debug("index is empty, remove it, free block %llu\n", leaf);
2062 ext4_free_blocks(handle, inode, 0, leaf, 1, 2066 ext4_free_blocks(handle, inode, NULL, leaf, 1,
2063 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 2067 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
2064 return err; 2068 return err;
2065} 2069}
@@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2156 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2160 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2157 start = ext4_ext_pblock(ex) + ee_len - num; 2161 start = ext4_ext_pblock(ex) + ee_len - num;
2158 ext_debug("free last %u blocks starting %llu\n", num, start); 2162 ext_debug("free last %u blocks starting %llu\n", num, start);
2159 ext4_free_blocks(handle, inode, 0, start, num, flags); 2163 ext4_free_blocks(handle, inode, NULL, start, num, flags);
2160 } else if (from == le32_to_cpu(ex->ee_block) 2164 } else if (from == le32_to_cpu(ex->ee_block)
2161 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2165 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2162 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2166 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2529,7 +2533,7 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2529/* 2533/*
2530 * This function is called by ext4_ext_map_blocks() if someone tries to write 2534 * This function is called by ext4_ext_map_blocks() if someone tries to write
2531 * to an uninitialized extent. It may result in splitting the uninitialized 2535 * to an uninitialized extent. It may result in splitting the uninitialized
2532 * extent into multiple extents (upto three - one initialized and two 2536 * extent into multiple extents (up to three - one initialized and two
2533 * uninitialized). 2537 * uninitialized).
2534 * There are three possibilities: 2538 * There are three possibilities:
2535 * a> There is no split required: Entire extent should be initialized 2539 * a> There is no split required: Entire extent should be initialized
@@ -2844,7 +2848,7 @@ fix_extent_len:
2844 * ext4_get_blocks_dio_write() when DIO to write 2848 * ext4_get_blocks_dio_write() when DIO to write
2845 * to an uninitialized extent. 2849 * to an uninitialized extent.
2846 * 2850 *
2847 * Writing to an uninitized extent may result in splitting the uninitialized 2851 * Writing to an uninitialized extent may result in splitting the uninitialized
2848 * extent into multiple /initialized uninitialized extents (up to three) 2852 * extent into multiple /initialized uninitialized extents (up to three)
2849 * There are three possibilities: 2853 * There are three possibilities:
2850 * a> There is no split required: Entire extent should be uninitialized 2854 * a> There is no split required: Entire extent should be uninitialized
@@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
3108{ 3112{
3109 int i, depth; 3113 int i, depth;
3110 struct ext4_extent_header *eh; 3114 struct ext4_extent_header *eh;
3111 struct ext4_extent *ex, *last_ex; 3115 struct ext4_extent *last_ex;
3112 3116
3113 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)) 3117 if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
3114 return 0; 3118 return 0;
3115 3119
3116 depth = ext_depth(inode); 3120 depth = ext_depth(inode);
3117 eh = path[depth].p_hdr; 3121 eh = path[depth].p_hdr;
3118 ex = path[depth].p_ext;
3119 3122
3120 if (unlikely(!eh->eh_entries)) { 3123 if (unlikely(!eh->eh_entries)) {
3121 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and " 3124 EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3171,12 +3174,13 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3171 path, flags); 3174 path, flags);
3172 /* 3175 /*
3173 * Flag the inode(non aio case) or end_io struct (aio case) 3176 * Flag the inode(non aio case) or end_io struct (aio case)
3174 * that this IO needs to convertion to written when IO is 3177 * that this IO needs to conversion to written when IO is
3175 * completed 3178 * completed
3176 */ 3179 */
3177 if (io) 3180 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3178 io->flag = EXT4_IO_END_UNWRITTEN; 3181 io->flag = EXT4_IO_END_UNWRITTEN;
3179 else 3182 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3183 } else
3180 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3184 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3181 if (ext4_should_dioread_nolock(inode)) 3185 if (ext4_should_dioread_nolock(inode))
3182 map->m_flags |= EXT4_MAP_UNINIT; 3186 map->m_flags |= EXT4_MAP_UNINIT;
@@ -3294,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3294 struct ext4_map_blocks *map, int flags) 3298 struct ext4_map_blocks *map, int flags)
3295{ 3299{
3296 struct ext4_ext_path *path = NULL; 3300 struct ext4_ext_path *path = NULL;
3297 struct ext4_extent_header *eh;
3298 struct ext4_extent newex, *ex; 3301 struct ext4_extent newex, *ex;
3299 ext4_fsblk_t newblock; 3302 ext4_fsblk_t newblock = 0;
3300 int err = 0, depth, ret; 3303 int err = 0, depth, ret;
3301 unsigned int allocated = 0; 3304 unsigned int allocated = 0;
3302 struct ext4_allocation_request ar; 3305 struct ext4_allocation_request ar;
@@ -3304,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3304 3307
3305 ext_debug("blocks %u/%u requested for inode %lu\n", 3308 ext_debug("blocks %u/%u requested for inode %lu\n",
3306 map->m_lblk, map->m_len, inode->i_ino); 3309 map->m_lblk, map->m_len, inode->i_ino);
3310 trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
3307 3311
3308 /* check in cache */ 3312 /* check in cache */
3309 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) { 3313 if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3351,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3351 err = -EIO; 3355 err = -EIO;
3352 goto out2; 3356 goto out2;
3353 } 3357 }
3354 eh = path[depth].p_hdr;
3355 3358
3356 ex = path[depth].p_ext; 3359 ex = path[depth].p_ext;
3357 if (ex) { 3360 if (ex) {
@@ -3457,15 +3460,16 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3457 ext4_ext_mark_uninitialized(&newex); 3460 ext4_ext_mark_uninitialized(&newex);
3458 /* 3461 /*
3459 * io_end structure was created for every IO write to an 3462 * io_end structure was created for every IO write to an
3460 * uninitialized extent. To avoid unecessary conversion, 3463 * uninitialized extent. To avoid unnecessary conversion,
3461 * here we flag the IO that really needs the conversion. 3464 * here we flag the IO that really needs the conversion.
3462 * For non asycn direct IO case, flag the inode state 3465 * For non asycn direct IO case, flag the inode state
3463 * that we need to perform convertion when IO is done. 3466 * that we need to perform conversion when IO is done.
3464 */ 3467 */
3465 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3468 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3466 if (io) 3469 if (io && !(io->flag & EXT4_IO_END_UNWRITTEN)) {
3467 io->flag = EXT4_IO_END_UNWRITTEN; 3470 io->flag = EXT4_IO_END_UNWRITTEN;
3468 else 3471 atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
3472 } else
3469 ext4_set_inode_state(inode, 3473 ext4_set_inode_state(inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3474 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3475 }
@@ -3483,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3483 /* not a good idea to call discard here directly, 3487 /* not a good idea to call discard here directly,
3484 * but otherwise we'd need to call it every free() */ 3488 * but otherwise we'd need to call it every free() */
3485 ext4_discard_preallocations(inode); 3489 ext4_discard_preallocations(inode);
3486 ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex), 3490 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
3487 ext4_ext_get_actual_len(&newex), 0); 3491 ext4_ext_get_actual_len(&newex), 0);
3488 goto out2; 3492 goto out2;
3489 } 3493 }
@@ -3523,6 +3527,8 @@ out2:
3523 ext4_ext_drop_refs(path); 3527 ext4_ext_drop_refs(path);
3524 kfree(path); 3528 kfree(path);
3525 } 3529 }
3530 trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
3531 newblock, map->m_len, err ? err : allocated);
3526 return err ? err : allocated; 3532 return err ? err : allocated;
3527} 3533}
3528 3534
@@ -3656,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3656 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 3662 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3663 return -EOPNOTSUPP;
3658 3664
3665 trace_ext4_fallocate_enter(inode, offset, len, mode);
3659 map.m_lblk = offset >> blkbits; 3666 map.m_lblk = offset >> blkbits;
3660 /* 3667 /*
3661 * We can't just convert len to max_blocks because 3668 * We can't just convert len to max_blocks because
@@ -3671,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
3671 ret = inode_newsize_ok(inode, (len + offset)); 3678 ret = inode_newsize_ok(inode, (len + offset));
3672 if (ret) { 3679 if (ret) {
3673 mutex_unlock(&inode->i_mutex); 3680 mutex_unlock(&inode->i_mutex);
3681 trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
3674 return ret; 3682 return ret;
3675 } 3683 }
3676retry: 3684retry:
@@ -3715,6 +3723,8 @@ retry:
3715 goto retry; 3723 goto retry;
3716 } 3724 }
3717 mutex_unlock(&inode->i_mutex); 3725 mutex_unlock(&inode->i_mutex);
3726 trace_ext4_fallocate_exit(inode, offset, max_blocks,
3727 ret > 0 ? ret2 : ret);
3718 return ret > 0 ? ret2 : ret; 3728 return ret > 0 ? ret2 : ret;
3719} 3729}
3720 3730
@@ -3773,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3773 } 3783 }
3774 return ret > 0 ? ret2 : ret; 3784 return ret > 0 ? ret2 : ret;
3775} 3785}
3786
3776/* 3787/*
3777 * Callback function called for each extent to gather FIEMAP information. 3788 * Callback function called for each extent to gather FIEMAP information.
3778 */ 3789 */
@@ -3780,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3780 struct ext4_ext_cache *newex, struct ext4_extent *ex, 3791 struct ext4_ext_cache *newex, struct ext4_extent *ex,
3781 void *data) 3792 void *data)
3782{ 3793{
3783 struct fiemap_extent_info *fieinfo = data;
3784 unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
3785 __u64 logical; 3794 __u64 logical;
3786 __u64 physical; 3795 __u64 physical;
3787 __u64 length; 3796 __u64 length;
3797 loff_t size;
3788 __u32 flags = 0; 3798 __u32 flags = 0;
3789 int error; 3799 int ret = 0;
3800 struct fiemap_extent_info *fieinfo = data;
3801 unsigned char blksize_bits;
3790 3802
3791 logical = (__u64)newex->ec_block << blksize_bits; 3803 blksize_bits = inode->i_sb->s_blocksize_bits;
3804 logical = (__u64)newex->ec_block << blksize_bits;
3792 3805
3793 if (newex->ec_start == 0) { 3806 if (newex->ec_start == 0) {
3794 pgoff_t offset; 3807 /*
3795 struct page *page; 3808 * No extent in extent-tree contains block @newex->ec_start,
3809 * then the block may stay in 1)a hole or 2)delayed-extent.
3810 *
3811 * Holes or delayed-extents are processed as follows.
3812 * 1. lookup dirty pages with specified range in pagecache.
3813 * If no page is got, then there is no delayed-extent and
3814 * return with EXT_CONTINUE.
3815 * 2. find the 1st mapped buffer,
3816 * 3. check if the mapped buffer is both in the request range
3817 * and a delayed buffer. If not, there is no delayed-extent,
3818 * then return.
3819 * 4. a delayed-extent is found, the extent will be collected.
3820 */
3821 ext4_lblk_t end = 0;
3822 pgoff_t last_offset;
3823 pgoff_t offset;
3824 pgoff_t index;
3825 struct page **pages = NULL;
3796 struct buffer_head *bh = NULL; 3826 struct buffer_head *bh = NULL;
3827 struct buffer_head *head = NULL;
3828 unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
3829
3830 pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
3831 if (pages == NULL)
3832 return -ENOMEM;
3797 3833
3798 offset = logical >> PAGE_SHIFT; 3834 offset = logical >> PAGE_SHIFT;
3799 page = find_get_page(inode->i_mapping, offset); 3835repeat:
3800 if (!page || !page_has_buffers(page)) 3836 last_offset = offset;
3801 return EXT_CONTINUE; 3837 head = NULL;
3838 ret = find_get_pages_tag(inode->i_mapping, &offset,
3839 PAGECACHE_TAG_DIRTY, nr_pages, pages);
3840
3841 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3842 /* First time, try to find a mapped buffer. */
3843 if (ret == 0) {
3844out:
3845 for (index = 0; index < ret; index++)
3846 page_cache_release(pages[index]);
3847 /* just a hole. */
3848 kfree(pages);
3849 return EXT_CONTINUE;
3850 }
3802 3851
3803 bh = page_buffers(page); 3852 /* Try to find the 1st mapped buffer. */
3853 end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
3854 blksize_bits;
3855 if (!page_has_buffers(pages[0]))
3856 goto out;
3857 head = page_buffers(pages[0]);
3858 if (!head)
3859 goto out;
3804 3860
3805 if (!bh) 3861 bh = head;
3806 return EXT_CONTINUE; 3862 do {
3863 if (buffer_mapped(bh)) {
3864 /* get the 1st mapped buffer. */
3865 if (end > newex->ec_block +
3866 newex->ec_len)
3867 /* The buffer is out of
3868 * the request range.
3869 */
3870 goto out;
3871 goto found_mapped_buffer;
3872 }
3873 bh = bh->b_this_page;
3874 end++;
3875 } while (bh != head);
3807 3876
3808 if (buffer_delay(bh)) { 3877 /* No mapped buffer found. */
3809 flags |= FIEMAP_EXTENT_DELALLOC; 3878 goto out;
3810 page_cache_release(page);
3811 } else { 3879 } else {
3812 page_cache_release(page); 3880 /*Find contiguous delayed buffers. */
3813 return EXT_CONTINUE; 3881 if (ret > 0 && pages[0]->index == last_offset)
3882 head = page_buffers(pages[0]);
3883 bh = head;
3884 }
3885
3886found_mapped_buffer:
3887 if (bh != NULL && buffer_delay(bh)) {
3888 /* 1st or contiguous delayed buffer found. */
3889 if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
3890 /*
3891 * 1st delayed buffer found, record
3892 * the start of extent.
3893 */
3894 flags |= FIEMAP_EXTENT_DELALLOC;
3895 newex->ec_block = end;
3896 logical = (__u64)end << blksize_bits;
3897 }
3898 /* Find contiguous delayed buffers. */
3899 do {
3900 if (!buffer_delay(bh))
3901 goto found_delayed_extent;
3902 bh = bh->b_this_page;
3903 end++;
3904 } while (bh != head);
3905
3906 for (index = 1; index < ret; index++) {
3907 if (!page_has_buffers(pages[index])) {
3908 bh = NULL;
3909 break;
3910 }
3911 head = page_buffers(pages[index]);
3912 if (!head) {
3913 bh = NULL;
3914 break;
3915 }
3916 if (pages[index]->index !=
3917 pages[0]->index + index) {
3918 /* Blocks are not contiguous. */
3919 bh = NULL;
3920 break;
3921 }
3922 bh = head;
3923 do {
3924 if (!buffer_delay(bh))
3925 /* Delayed-extent ends. */
3926 goto found_delayed_extent;
3927 bh = bh->b_this_page;
3928 end++;
3929 } while (bh != head);
3930 }
3931 } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
3932 /* a hole found. */
3933 goto out;
3934
3935found_delayed_extent:
3936 newex->ec_len = min(end - newex->ec_block,
3937 (ext4_lblk_t)EXT_INIT_MAX_LEN);
3938 if (ret == nr_pages && bh != NULL &&
3939 newex->ec_len < EXT_INIT_MAX_LEN &&
3940 buffer_delay(bh)) {
3941 /* Have not collected an extent and continue. */
3942 for (index = 0; index < ret; index++)
3943 page_cache_release(pages[index]);
3944 goto repeat;
3814 } 3945 }
3946
3947 for (index = 0; index < ret; index++)
3948 page_cache_release(pages[index]);
3949 kfree(pages);
3815 } 3950 }
3816 3951
3817 physical = (__u64)newex->ec_start << blksize_bits; 3952 physical = (__u64)newex->ec_start << blksize_bits;
@@ -3820,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
3820 if (ex && ext4_ext_is_uninitialized(ex)) 3955 if (ex && ext4_ext_is_uninitialized(ex))
3821 flags |= FIEMAP_EXTENT_UNWRITTEN; 3956 flags |= FIEMAP_EXTENT_UNWRITTEN;
3822 3957
3823 /* 3958 size = i_size_read(inode);
3824 * If this extent reaches EXT_MAX_BLOCK, it must be last. 3959 if (logical + length >= size)
3825 *
3826 * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
3827 * this also indicates no more allocated blocks.
3828 *
3829 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
3830 */
3831 if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
3832 newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
3833 loff_t size = i_size_read(inode);
3834 loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
3835
3836 flags |= FIEMAP_EXTENT_LAST; 3960 flags |= FIEMAP_EXTENT_LAST;
3837 if ((flags & FIEMAP_EXTENT_DELALLOC) &&
3838 logical+length > size)
3839 length = (size - logical + bs - 1) & ~(bs-1);
3840 }
3841 3961
3842 error = fiemap_fill_next_extent(fieinfo, logical, physical, 3962 ret = fiemap_fill_next_extent(fieinfo, logical, physical,
3843 length, flags); 3963 length, flags);
3844 if (error < 0) 3964 if (ret < 0)
3845 return error; 3965 return ret;
3846 if (error == 1) 3966 if (ret == 1)
3847 return EXT_BREAK; 3967 return EXT_BREAK;
3848
3849 return EXT_CONTINUE; 3968 return EXT_CONTINUE;
3850} 3969}
3851 3970
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 2e8322c8aa88..7b80d543b89e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -55,11 +55,47 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
55 return 0; 55 return 0;
56} 56}
57 57
58static void ext4_aiodio_wait(struct inode *inode)
59{
60 wait_queue_head_t *wq = ext4_ioend_wq(inode);
61
62 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_aiodio_unwritten) == 0));
63}
64
65/*
66 * This tests whether the IO in question is block-aligned or not.
67 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
68 * are converted to written only after the IO is complete. Until they are
69 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
70 * it needs to zero out portions of the start and/or end block. If 2 AIO
71 * threads are at work on the same unwritten block, they must be synchronized
72 * or one thread will zero the other's data, causing corruption.
73 */
74static int
75ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
76 unsigned long nr_segs, loff_t pos)
77{
78 struct super_block *sb = inode->i_sb;
79 int blockmask = sb->s_blocksize - 1;
80 size_t count = iov_length(iov, nr_segs);
81 loff_t final_size = pos + count;
82
83 if (pos >= inode->i_size)
84 return 0;
85
86 if ((pos & blockmask) || (final_size & blockmask))
87 return 1;
88
89 return 0;
90}
91
58static ssize_t 92static ssize_t
59ext4_file_write(struct kiocb *iocb, const struct iovec *iov, 93ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
60 unsigned long nr_segs, loff_t pos) 94 unsigned long nr_segs, loff_t pos)
61{ 95{
62 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 96 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
97 int unaligned_aio = 0;
98 int ret;
63 99
64 /* 100 /*
65 * If we have encountered a bitmap-format file, the size limit 101 * If we have encountered a bitmap-format file, the size limit
@@ -78,9 +114,31 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
78 nr_segs = iov_shorten((struct iovec *)iov, nr_segs, 114 nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
79 sbi->s_bitmap_maxbytes - pos); 115 sbi->s_bitmap_maxbytes - pos);
80 } 116 }
117 } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
118 !is_sync_kiocb(iocb))) {
119 unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
81 } 120 }
82 121
83 return generic_file_aio_write(iocb, iov, nr_segs, pos); 122 /* Unaligned direct AIO must be serialized; see comment above */
123 if (unaligned_aio) {
124 static unsigned long unaligned_warn_time;
125
126 /* Warn about this once per day */
127 if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
128 ext4_msg(inode->i_sb, KERN_WARNING,
129 "Unaligned AIO/DIO on inode %ld by %s; "
130 "performance will be poor.",
131 inode->i_ino, current->comm);
132 mutex_lock(ext4_aio_mutex(inode));
133 ext4_aiodio_wait(inode);
134 }
135
136 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
137
138 if (unaligned_aio)
139 mutex_unlock(ext4_aio_mutex(inode));
140
141 return ret;
84} 142}
85 143
86static const struct vm_operations_struct ext4_file_vm_ops = { 144static const struct vm_operations_struct ext4_file_vm_ops = {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b287822a..e9473cbe80df 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -101,7 +101,7 @@ extern int ext4_flush_completed_IO(struct inode *inode)
101 * to the work-to-be schedule is freed. 101 * to the work-to-be schedule is freed.
102 * 102 *
103 * Thus we need to keep the io structure still valid here after 103 * Thus we need to keep the io structure still valid here after
104 * convertion finished. The io structure has a flag to 104 * conversion finished. The io structure has a flag to
105 * avoid double converting from both fsync and background work 105 * avoid double converting from both fsync and background work
106 * queue work. 106 * queue work.
107 */ 107 */
@@ -125,9 +125,11 @@ extern int ext4_flush_completed_IO(struct inode *inode)
125 * the parent directory's parent as well, and so on recursively, if 125 * the parent directory's parent as well, and so on recursively, if
126 * they are also freshly created. 126 * they are also freshly created.
127 */ 127 */
128static void ext4_sync_parent(struct inode *inode) 128static int ext4_sync_parent(struct inode *inode)
129{ 129{
130 struct writeback_control wbc;
130 struct dentry *dentry = NULL; 131 struct dentry *dentry = NULL;
132 int ret = 0;
131 133
132 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { 134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
133 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); 135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
@@ -136,8 +138,17 @@ static void ext4_sync_parent(struct inode *inode)
136 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode) 138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
137 break; 139 break;
138 inode = dentry->d_parent->d_inode; 140 inode = dentry->d_parent->d_inode;
139 sync_mapping_buffers(inode->i_mapping); 141 ret = sync_mapping_buffers(inode->i_mapping);
142 if (ret)
143 break;
144 memset(&wbc, 0, sizeof(wbc));
145 wbc.sync_mode = WB_SYNC_ALL;
146 wbc.nr_to_write = 0; /* only write out the inode */
147 ret = sync_inode(inode, &wbc);
148 if (ret)
149 break;
140 } 150 }
151 return ret;
141} 152}
142 153
143/* 154/*
@@ -164,20 +175,20 @@ int ext4_sync_file(struct file *file, int datasync)
164 175
165 J_ASSERT(ext4_journal_current_handle() == NULL); 176 J_ASSERT(ext4_journal_current_handle() == NULL);
166 177
167 trace_ext4_sync_file(file, datasync); 178 trace_ext4_sync_file_enter(file, datasync);
168 179
169 if (inode->i_sb->s_flags & MS_RDONLY) 180 if (inode->i_sb->s_flags & MS_RDONLY)
170 return 0; 181 return 0;
171 182
172 ret = ext4_flush_completed_IO(inode); 183 ret = ext4_flush_completed_IO(inode);
173 if (ret < 0) 184 if (ret < 0)
174 return ret; 185 goto out;
175 186
176 if (!journal) { 187 if (!journal) {
177 ret = generic_file_fsync(file, datasync); 188 ret = generic_file_fsync(file, datasync);
178 if (!ret && !list_empty(&inode->i_dentry)) 189 if (!ret && !list_empty(&inode->i_dentry))
179 ext4_sync_parent(inode); 190 ret = ext4_sync_parent(inode);
180 return ret; 191 goto out;
181 } 192 }
182 193
183 /* 194 /*
@@ -194,8 +205,10 @@ int ext4_sync_file(struct file *file, int datasync)
194 * (they were dirtied by commit). But that's OK - the blocks are 205 * (they were dirtied by commit). But that's OK - the blocks are
195 * safe in-journal, which is all fsync() needs to ensure. 206 * safe in-journal, which is all fsync() needs to ensure.
196 */ 207 */
197 if (ext4_should_journal_data(inode)) 208 if (ext4_should_journal_data(inode)) {
198 return ext4_force_commit(inode->i_sb); 209 ret = ext4_force_commit(inode->i_sb);
210 goto out;
211 }
199 212
200 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; 213 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
201 if (jbd2_log_start_commit(journal, commit_tid)) { 214 if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +228,7 @@ int ext4_sync_file(struct file *file, int datasync)
215 ret = jbd2_log_wait_commit(journal, commit_tid); 228 ret = jbd2_log_wait_commit(journal, commit_tid);
216 } else if (journal->j_flags & JBD2_BARRIER) 229 } else if (journal->j_flags & JBD2_BARRIER)
217 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); 230 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
231 out:
232 trace_ext4_sync_file_exit(inode, ret);
218 return ret; 233 return ret;
219} 234}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index eb9097aec6f0..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
152 * We do it here so the bitmap uptodate bit 152 * We do it here so the bitmap uptodate bit
153 * get set with buffer lock held. 153 * get set with buffer lock held.
154 */ 154 */
155 trace_ext4_load_inode_bitmap(sb, block_group);
155 set_bitmap_uptodate(bh); 156 set_bitmap_uptodate(bh);
156 if (bh_submit_read(bh) < 0) { 157 if (bh_submit_read(bh) < 0) {
157 put_bh(bh); 158 put_bh(bh);
@@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
649 *group = parent_group + flex_size; 650 *group = parent_group + flex_size;
650 if (*group > ngroups) 651 if (*group > ngroups)
651 *group = 0; 652 *group = 0;
652 return find_group_orlov(sb, parent, group, mode, 0); 653 return find_group_orlov(sb, parent, group, mode, NULL);
653 } 654 }
654 655
655 /* 656 /*
@@ -1042,7 +1043,7 @@ got:
1042 if (err) 1043 if (err)
1043 goto fail_free_drop; 1044 goto fail_free_drop;
1044 1045
1045 err = ext4_init_security(handle, inode, dir); 1046 err = ext4_init_security(handle, inode, dir, qstr);
1046 if (err) 1047 if (err)
1047 goto fail_free_drop; 1048 goto fail_free_drop;
1048 1049
@@ -1054,6 +1055,11 @@ got:
1054 } 1055 }
1055 } 1056 }
1056 1057
1058 if (ext4_handle_valid(handle)) {
1059 ei->i_sync_tid = handle->h_transaction->t_tid;
1060 ei->i_datasync_tid = handle->h_transaction->t_tid;
1061 }
1062
1057 err = ext4_mark_inode_dirty(handle, inode); 1063 err = ext4_mark_inode_dirty(handle, inode);
1058 if (err) { 1064 if (err) {
1059 ext4_std_error(sb, err); 1065 ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9f7f9e49914f..f2fa5e8a582c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
173 BUG_ON(EXT4_JOURNAL(inode) == NULL); 173 BUG_ON(EXT4_JOURNAL(inode) == NULL);
174 jbd_debug(2, "restarting handle %p\n", handle); 174 jbd_debug(2, "restarting handle %p\n", handle);
175 up_write(&EXT4_I(inode)->i_data_sem); 175 up_write(&EXT4_I(inode)->i_data_sem);
176 ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); 176 ret = ext4_journal_restart(handle, nblocks);
177 down_write(&EXT4_I(inode)->i_data_sem); 177 down_write(&EXT4_I(inode)->i_data_sem);
178 ext4_discard_preallocations(inode); 178 ext4_discard_preallocations(inode);
179 179
@@ -720,7 +720,7 @@ allocated:
720 return ret; 720 return ret;
721failed_out: 721failed_out:
722 for (i = 0; i < index; i++) 722 for (i = 0; i < index; i++)
723 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 723 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
724 return ret; 724 return ret;
725} 725}
726 726
@@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
823 return err; 823 return err;
824failed: 824failed:
825 /* Allocation failed, free what we already allocated */ 825 /* Allocation failed, free what we already allocated */
826 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 826 ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
827 for (i = 1; i <= n ; i++) { 827 for (i = 1; i <= n ; i++) {
828 /* 828 /*
829 * branch[i].bh is newly allocated, so there is no 829 * branch[i].bh is newly allocated, so there is no
830 * need to revoke the block, which is why we don't 830 * need to revoke the block, which is why we don't
831 * need to set EXT4_FREE_BLOCKS_METADATA. 831 * need to set EXT4_FREE_BLOCKS_METADATA.
832 */ 832 */
833 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 833 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
834 EXT4_FREE_BLOCKS_FORGET); 834 EXT4_FREE_BLOCKS_FORGET);
835 } 835 }
836 for (i = n+1; i < indirect_blks; i++) 836 for (i = n+1; i < indirect_blks; i++)
837 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0); 837 ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
838 838
839 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0); 839 ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
840 840
841 return err; 841 return err;
842} 842}
@@ -924,7 +924,7 @@ err_out:
924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1, 924 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
925 EXT4_FREE_BLOCKS_FORGET); 925 EXT4_FREE_BLOCKS_FORGET);
926 } 926 }
927 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key), 927 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
928 blks, 0); 928 blks, 0);
929 929
930 return err; 930 return err;
@@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
973 int count = 0; 973 int count = 0;
974 ext4_fsblk_t first_block = 0; 974 ext4_fsblk_t first_block = 0;
975 975
976 trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
976 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))); 977 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
977 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 978 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
978 depth = ext4_block_to_path(inode, map->m_lblk, offsets, 979 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@ cleanup:
1058 partial--; 1059 partial--;
1059 } 1060 }
1060out: 1061out:
1062 trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
1063 map->m_pblk, map->m_len, err);
1061 return err; 1064 return err;
1062} 1065}
1063 1066
@@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2060 if (nr_pages == 0) 2063 if (nr_pages == 0)
2061 break; 2064 break;
2062 for (i = 0; i < nr_pages; i++) { 2065 for (i = 0; i < nr_pages; i++) {
2063 int commit_write = 0, redirty_page = 0; 2066 int commit_write = 0, skip_page = 0;
2064 struct page *page = pvec.pages[i]; 2067 struct page *page = pvec.pages[i];
2065 2068
2066 index = page->index; 2069 index = page->index;
@@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2086 * If the page does not have buffers (for 2089 * If the page does not have buffers (for
2087 * whatever reason), try to create them using 2090 * whatever reason), try to create them using
2088 * __block_write_begin. If this fails, 2091 * __block_write_begin. If this fails,
2089 * redirty the page and move on. 2092 * skip the page and move on.
2090 */ 2093 */
2091 if (!page_has_buffers(page)) { 2094 if (!page_has_buffers(page)) {
2092 if (__block_write_begin(page, 0, len, 2095 if (__block_write_begin(page, 0, len,
2093 noalloc_get_block_write)) { 2096 noalloc_get_block_write)) {
2094 redirty_page: 2097 skip_page:
2095 redirty_page_for_writepage(mpd->wbc,
2096 page);
2097 unlock_page(page); 2098 unlock_page(page);
2098 continue; 2099 continue;
2099 } 2100 }
@@ -2104,7 +2105,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2104 block_start = 0; 2105 block_start = 0;
2105 do { 2106 do {
2106 if (!bh) 2107 if (!bh)
2107 goto redirty_page; 2108 goto skip_page;
2108 if (map && (cur_logical >= map->m_lblk) && 2109 if (map && (cur_logical >= map->m_lblk) &&
2109 (cur_logical <= (map->m_lblk + 2110 (cur_logical <= (map->m_lblk +
2110 (map->m_len - 1)))) { 2111 (map->m_len - 1)))) {
@@ -2120,22 +2121,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2120 clear_buffer_unwritten(bh); 2121 clear_buffer_unwritten(bh);
2121 } 2122 }
2122 2123
2123 /* redirty page if block allocation undone */ 2124 /* skip page if block allocation undone */
2124 if (buffer_delay(bh) || buffer_unwritten(bh)) 2125 if (buffer_delay(bh) || buffer_unwritten(bh))
2125 redirty_page = 1; 2126 skip_page = 1;
2126 bh = bh->b_this_page; 2127 bh = bh->b_this_page;
2127 block_start += bh->b_size; 2128 block_start += bh->b_size;
2128 cur_logical++; 2129 cur_logical++;
2129 pblock++; 2130 pblock++;
2130 } while (bh != page_bufs); 2131 } while (bh != page_bufs);
2131 2132
2132 if (redirty_page) 2133 if (skip_page)
2133 goto redirty_page; 2134 goto skip_page;
2134 2135
2135 if (commit_write) 2136 if (commit_write)
2136 /* mark the buffer_heads as dirty & uptodate */ 2137 /* mark the buffer_heads as dirty & uptodate */
2137 block_commit_write(page, 0, len); 2138 block_commit_write(page, 0, len);
2138 2139
2140 clear_page_dirty_for_io(page);
2139 /* 2141 /*
2140 * Delalloc doesn't support data journalling, 2142 * Delalloc doesn't support data journalling,
2141 * but eventually maybe we'll lift this 2143 * but eventually maybe we'll lift this
@@ -2165,8 +2167,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
2165 return ret; 2167 return ret;
2166} 2168}
2167 2169
2168static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2170static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
2169 sector_t logical, long blk_cnt)
2170{ 2171{
2171 int nr_pages, i; 2172 int nr_pages, i;
2172 pgoff_t index, end; 2173 pgoff_t index, end;
@@ -2174,9 +2175,8 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2174 struct inode *inode = mpd->inode; 2175 struct inode *inode = mpd->inode;
2175 struct address_space *mapping = inode->i_mapping; 2176 struct address_space *mapping = inode->i_mapping;
2176 2177
2177 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2178 index = mpd->first_page;
2178 end = (logical + blk_cnt - 1) >> 2179 end = mpd->next_page - 1;
2179 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2180 while (index <= end) { 2180 while (index <= end) {
2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); 2181 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2182 if (nr_pages == 0) 2182 if (nr_pages == 0)
@@ -2279,9 +2279,8 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2279 err = blks; 2279 err = blks;
2280 /* 2280 /*
2281 * If get block returns EAGAIN or ENOSPC and there 2281 * If get block returns EAGAIN or ENOSPC and there
2282 * appears to be free blocks we will call 2282 * appears to be free blocks we will just let
2283 * ext4_writepage() for all of the pages which will 2283 * mpage_da_submit_io() unlock all of the pages.
2284 * just redirty the pages.
2285 */ 2284 */
2286 if (err == -EAGAIN) 2285 if (err == -EAGAIN)
2287 goto submit_io; 2286 goto submit_io;
@@ -2312,8 +2311,10 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
2312 ext4_print_free_blocks(mpd->inode); 2311 ext4_print_free_blocks(mpd->inode);
2313 } 2312 }
2314 /* invalidate all the pages */ 2313 /* invalidate all the pages */
2315 ext4_da_block_invalidatepages(mpd, next, 2314 ext4_da_block_invalidatepages(mpd);
2316 mpd->b_size >> mpd->inode->i_blkbits); 2315
2316 /* Mark this page range as having been completed */
2317 mpd->io_done = 1;
2317 return; 2318 return;
2318 } 2319 }
2319 BUG_ON(blks == 0); 2320 BUG_ON(blks == 0);
@@ -2438,102 +2439,6 @@ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
2438} 2439}
2439 2440
2440/* 2441/*
2441 * __mpage_da_writepage - finds extent of pages and blocks
2442 *
2443 * @page: page to consider
2444 * @wbc: not used, we just follow rules
2445 * @data: context
2446 *
2447 * The function finds extents of pages and scan them for all blocks.
2448 */
2449static int __mpage_da_writepage(struct page *page,
2450 struct writeback_control *wbc,
2451 struct mpage_da_data *mpd)
2452{
2453 struct inode *inode = mpd->inode;
2454 struct buffer_head *bh, *head;
2455 sector_t logical;
2456
2457 /*
2458 * Can we merge this page to current extent?
2459 */
2460 if (mpd->next_page != page->index) {
2461 /*
2462 * Nope, we can't. So, we map non-allocated blocks
2463 * and start IO on them
2464 */
2465 if (mpd->next_page != mpd->first_page) {
2466 mpage_da_map_and_submit(mpd);
2467 /*
2468 * skip rest of the page in the page_vec
2469 */
2470 redirty_page_for_writepage(wbc, page);
2471 unlock_page(page);
2472 return MPAGE_DA_EXTENT_TAIL;
2473 }
2474
2475 /*
2476 * Start next extent of pages ...
2477 */
2478 mpd->first_page = page->index;
2479
2480 /*
2481 * ... and blocks
2482 */
2483 mpd->b_size = 0;
2484 mpd->b_state = 0;
2485 mpd->b_blocknr = 0;
2486 }
2487
2488 mpd->next_page = page->index + 1;
2489 logical = (sector_t) page->index <<
2490 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2491
2492 if (!page_has_buffers(page)) {
2493 mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
2494 (1 << BH_Dirty) | (1 << BH_Uptodate));
2495 if (mpd->io_done)
2496 return MPAGE_DA_EXTENT_TAIL;
2497 } else {
2498 /*
2499 * Page with regular buffer heads, just add all dirty ones
2500 */
2501 head = page_buffers(page);
2502 bh = head;
2503 do {
2504 BUG_ON(buffer_locked(bh));
2505 /*
2506 * We need to try to allocate
2507 * unmapped blocks in the same page.
2508 * Otherwise we won't make progress
2509 * with the page in ext4_writepage
2510 */
2511 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2512 mpage_add_bh_to_extent(mpd, logical,
2513 bh->b_size,
2514 bh->b_state);
2515 if (mpd->io_done)
2516 return MPAGE_DA_EXTENT_TAIL;
2517 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2518 /*
2519 * mapped dirty buffer. We need to update
2520 * the b_state because we look at
2521 * b_state in mpage_da_map_blocks. We don't
2522 * update b_size because if we find an
2523 * unmapped buffer_head later we need to
2524 * use the b_state flag of that buffer_head.
2525 */
2526 if (mpd->b_size == 0)
2527 mpd->b_state = bh->b_state & BH_FLAGS;
2528 }
2529 logical++;
2530 } while ((bh = bh->b_this_page) != head);
2531 }
2532
2533 return 0;
2534}
2535
2536/*
2537 * This is a special get_blocks_t callback which is used by 2442 * This is a special get_blocks_t callback which is used by
2538 * ext4_da_write_begin(). It will either return mapped block or 2443 * ext4_da_write_begin(). It will either return mapped block or
2539 * reserve space for a single block. 2444 * reserve space for a single block.
@@ -2684,7 +2589,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate);
2684 * because we should have holes filled from ext4_page_mkwrite(). We even don't 2589 * because we should have holes filled from ext4_page_mkwrite(). We even don't
2685 * need to file the inode to the transaction's list in ordered mode because if 2590 * need to file the inode to the transaction's list in ordered mode because if
2686 * we are writing back data added by write(), the inode is already there and if 2591 * we are writing back data added by write(), the inode is already there and if
2687 * we are writing back data modified via mmap(), noone guarantees in which 2592 * we are writing back data modified via mmap(), no one guarantees in which
2688 * transaction the data will hit the disk. In case we are journaling data, we 2593 * transaction the data will hit the disk. In case we are journaling data, we
2689 * cannot start transaction directly because transaction start ranks above page 2594 * cannot start transaction directly because transaction start ranks above page
2690 * lock so we have to do some magic. 2595 * lock so we have to do some magic.
@@ -2786,7 +2691,7 @@ static int ext4_writepage(struct page *page,
2786 2691
2787/* 2692/*
2788 * This is called via ext4_da_writepages() to 2693 * This is called via ext4_da_writepages() to
2789 * calulate the total number of credits to reserve to fit 2694 * calculate the total number of credits to reserve to fit
2790 * a single extent allocation into a single transaction, 2695 * a single extent allocation into a single transaction,
2791 * ext4_da_writpeages() will loop calling this before 2696 * ext4_da_writpeages() will loop calling this before
2792 * the block allocation. 2697 * the block allocation.
@@ -2811,27 +2716,27 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2811 2716
2812/* 2717/*
2813 * write_cache_pages_da - walk the list of dirty pages of the given 2718 * write_cache_pages_da - walk the list of dirty pages of the given
2814 * address space and call the callback function (which usually writes 2719 * address space and accumulate pages that need writing, and call
2815 * the pages). 2720 * mpage_da_map_and_submit to map a single contiguous memory region
2816 * 2721 * and then write them.
2817 * This is a forked version of write_cache_pages(). Differences:
2818 * Range cyclic is ignored.
2819 * no_nrwrite_index_update is always presumed true
2820 */ 2722 */
2821static int write_cache_pages_da(struct address_space *mapping, 2723static int write_cache_pages_da(struct address_space *mapping,
2822 struct writeback_control *wbc, 2724 struct writeback_control *wbc,
2823 struct mpage_da_data *mpd, 2725 struct mpage_da_data *mpd,
2824 pgoff_t *done_index) 2726 pgoff_t *done_index)
2825{ 2727{
2826 int ret = 0; 2728 struct buffer_head *bh, *head;
2827 int done = 0; 2729 struct inode *inode = mapping->host;
2828 struct pagevec pvec; 2730 struct pagevec pvec;
2829 unsigned nr_pages; 2731 unsigned int nr_pages;
2830 pgoff_t index; 2732 sector_t logical;
2831 pgoff_t end; /* Inclusive */ 2733 pgoff_t index, end;
2832 long nr_to_write = wbc->nr_to_write; 2734 long nr_to_write = wbc->nr_to_write;
2833 int tag; 2735 int i, tag, ret = 0;
2834 2736
2737 memset(mpd, 0, sizeof(struct mpage_da_data));
2738 mpd->wbc = wbc;
2739 mpd->inode = inode;
2835 pagevec_init(&pvec, 0); 2740 pagevec_init(&pvec, 0);
2836 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2741 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2837 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2742 end = wbc->range_end >> PAGE_CACHE_SHIFT;
@@ -2842,13 +2747,11 @@ static int write_cache_pages_da(struct address_space *mapping,
2842 tag = PAGECACHE_TAG_DIRTY; 2747 tag = PAGECACHE_TAG_DIRTY;
2843 2748
2844 *done_index = index; 2749 *done_index = index;
2845 while (!done && (index <= end)) { 2750 while (index <= end) {
2846 int i;
2847
2848 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2751 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2849 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2752 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2850 if (nr_pages == 0) 2753 if (nr_pages == 0)
2851 break; 2754 return 0;
2852 2755
2853 for (i = 0; i < nr_pages; i++) { 2756 for (i = 0; i < nr_pages; i++) {
2854 struct page *page = pvec.pages[i]; 2757 struct page *page = pvec.pages[i];
@@ -2860,60 +2763,100 @@ static int write_cache_pages_da(struct address_space *mapping,
2860 * mapping. However, page->index will not change 2763 * mapping. However, page->index will not change
2861 * because we have a reference on the page. 2764 * because we have a reference on the page.
2862 */ 2765 */
2863 if (page->index > end) { 2766 if (page->index > end)
2864 done = 1; 2767 goto out;
2865 break;
2866 }
2867 2768
2868 *done_index = page->index + 1; 2769 *done_index = page->index + 1;
2869 2770
2771 /*
2772 * If we can't merge this page, and we have
2773 * accumulated an contiguous region, write it
2774 */
2775 if ((mpd->next_page != page->index) &&
2776 (mpd->next_page != mpd->first_page)) {
2777 mpage_da_map_and_submit(mpd);
2778 goto ret_extent_tail;
2779 }
2780
2870 lock_page(page); 2781 lock_page(page);
2871 2782
2872 /* 2783 /*
2873 * Page truncated or invalidated. We can freely skip it 2784 * If the page is no longer dirty, or its
2874 * then, even for data integrity operations: the page 2785 * mapping no longer corresponds to inode we
2875 * has disappeared concurrently, so there could be no 2786 * are writing (which means it has been
2876 * real expectation of this data interity operation 2787 * truncated or invalidated), or the page is
2877 * even if there is now a new, dirty page at the same 2788 * already under writeback and we are not
2878 * pagecache address. 2789 * doing a data integrity writeback, skip the page
2879 */ 2790 */
2880 if (unlikely(page->mapping != mapping)) { 2791 if (!PageDirty(page) ||
2881continue_unlock: 2792 (PageWriteback(page) &&
2793 (wbc->sync_mode == WB_SYNC_NONE)) ||
2794 unlikely(page->mapping != mapping)) {
2882 unlock_page(page); 2795 unlock_page(page);
2883 continue; 2796 continue;
2884 } 2797 }
2885 2798
2886 if (!PageDirty(page)) { 2799 if (PageWriteback(page))
2887 /* someone wrote it for us */ 2800 wait_on_page_writeback(page);
2888 goto continue_unlock;
2889 }
2890
2891 if (PageWriteback(page)) {
2892 if (wbc->sync_mode != WB_SYNC_NONE)
2893 wait_on_page_writeback(page);
2894 else
2895 goto continue_unlock;
2896 }
2897 2801
2898 BUG_ON(PageWriteback(page)); 2802 BUG_ON(PageWriteback(page));
2899 if (!clear_page_dirty_for_io(page))
2900 goto continue_unlock;
2901 2803
2902 ret = __mpage_da_writepage(page, wbc, mpd); 2804 if (mpd->next_page != page->index)
2903 if (unlikely(ret)) { 2805 mpd->first_page = page->index;
2904 if (ret == AOP_WRITEPAGE_ACTIVATE) { 2806 mpd->next_page = page->index + 1;
2905 unlock_page(page); 2807 logical = (sector_t) page->index <<
2906 ret = 0; 2808 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2907 } else { 2809
2908 done = 1; 2810 if (!page_has_buffers(page)) {
2909 break; 2811 mpage_add_bh_to_extent(mpd, logical,
2910 } 2812 PAGE_CACHE_SIZE,
2813 (1 << BH_Dirty) | (1 << BH_Uptodate));
2814 if (mpd->io_done)
2815 goto ret_extent_tail;
2816 } else {
2817 /*
2818 * Page with regular buffer heads,
2819 * just add all dirty ones
2820 */
2821 head = page_buffers(page);
2822 bh = head;
2823 do {
2824 BUG_ON(buffer_locked(bh));
2825 /*
2826 * We need to try to allocate
2827 * unmapped blocks in the same page.
2828 * Otherwise we won't make progress
2829 * with the page in ext4_writepage
2830 */
2831 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2832 mpage_add_bh_to_extent(mpd, logical,
2833 bh->b_size,
2834 bh->b_state);
2835 if (mpd->io_done)
2836 goto ret_extent_tail;
2837 } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
2838 /*
2839 * mapped dirty buffer. We need
2840 * to update the b_state
2841 * because we look at b_state
2842 * in mpage_da_map_blocks. We
2843 * don't update b_size because
2844 * if we find an unmapped
2845 * buffer_head later we need to
2846 * use the b_state flag of that
2847 * buffer_head.
2848 */
2849 if (mpd->b_size == 0)
2850 mpd->b_state = bh->b_state & BH_FLAGS;
2851 }
2852 logical++;
2853 } while ((bh = bh->b_this_page) != head);
2911 } 2854 }
2912 2855
2913 if (nr_to_write > 0) { 2856 if (nr_to_write > 0) {
2914 nr_to_write--; 2857 nr_to_write--;
2915 if (nr_to_write == 0 && 2858 if (nr_to_write == 0 &&
2916 wbc->sync_mode == WB_SYNC_NONE) { 2859 wbc->sync_mode == WB_SYNC_NONE)
2917 /* 2860 /*
2918 * We stop writing back only if we are 2861 * We stop writing back only if we are
2919 * not doing integrity sync. In case of 2862 * not doing integrity sync. In case of
@@ -2924,14 +2867,18 @@ continue_unlock:
2924 * pages, but have not synced all of the 2867 * pages, but have not synced all of the
2925 * old dirty pages. 2868 * old dirty pages.
2926 */ 2869 */
2927 done = 1; 2870 goto out;
2928 break;
2929 }
2930 } 2871 }
2931 } 2872 }
2932 pagevec_release(&pvec); 2873 pagevec_release(&pvec);
2933 cond_resched(); 2874 cond_resched();
2934 } 2875 }
2876 return 0;
2877ret_extent_tail:
2878 ret = MPAGE_DA_EXTENT_TAIL;
2879out:
2880 pagevec_release(&pvec);
2881 cond_resched();
2935 return ret; 2882 return ret;
2936} 2883}
2937 2884
@@ -2945,7 +2892,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2945 struct mpage_da_data mpd; 2892 struct mpage_da_data mpd;
2946 struct inode *inode = mapping->host; 2893 struct inode *inode = mapping->host;
2947 int pages_written = 0; 2894 int pages_written = 0;
2948 long pages_skipped;
2949 unsigned int max_pages; 2895 unsigned int max_pages;
2950 int range_cyclic, cycled = 1, io_done = 0; 2896 int range_cyclic, cycled = 1, io_done = 0;
2951 int needed_blocks, ret = 0; 2897 int needed_blocks, ret = 0;
@@ -3028,11 +2974,6 @@ static int ext4_da_writepages(struct address_space *mapping,
3028 wbc->nr_to_write = desired_nr_to_write; 2974 wbc->nr_to_write = desired_nr_to_write;
3029 } 2975 }
3030 2976
3031 mpd.wbc = wbc;
3032 mpd.inode = mapping->host;
3033
3034 pages_skipped = wbc->pages_skipped;
3035
3036retry: 2977retry:
3037 if (wbc->sync_mode == WB_SYNC_ALL) 2978 if (wbc->sync_mode == WB_SYNC_ALL)
3038 tag_pages_for_writeback(mapping, index, end); 2979 tag_pages_for_writeback(mapping, index, end);
@@ -3059,22 +3000,10 @@ retry:
3059 } 3000 }
3060 3001
3061 /* 3002 /*
3062 * Now call __mpage_da_writepage to find the next 3003 * Now call write_cache_pages_da() to find the next
3063 * contiguous region of logical blocks that need 3004 * contiguous region of logical blocks that need
3064 * blocks to be allocated by ext4. We don't actually 3005 * blocks to be allocated by ext4 and submit them.
3065 * submit the blocks for I/O here, even though
3066 * write_cache_pages thinks it will, and will set the
3067 * pages as clean for write before calling
3068 * __mpage_da_writepage().
3069 */ 3006 */
3070 mpd.b_size = 0;
3071 mpd.b_state = 0;
3072 mpd.b_blocknr = 0;
3073 mpd.first_page = 0;
3074 mpd.next_page = 0;
3075 mpd.io_done = 0;
3076 mpd.pages_written = 0;
3077 mpd.retval = 0;
3078 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index); 3007 ret = write_cache_pages_da(mapping, wbc, &mpd, &done_index);
3079 /* 3008 /*
3080 * If we have a contiguous extent of pages and we 3009 * If we have a contiguous extent of pages and we
@@ -3096,7 +3025,6 @@ retry:
3096 * and try again 3025 * and try again
3097 */ 3026 */
3098 jbd2_journal_force_commit_nested(sbi->s_journal); 3027 jbd2_journal_force_commit_nested(sbi->s_journal);
3099 wbc->pages_skipped = pages_skipped;
3100 ret = 0; 3028 ret = 0;
3101 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 3029 } else if (ret == MPAGE_DA_EXTENT_TAIL) {
3102 /* 3030 /*
@@ -3104,7 +3032,6 @@ retry:
3104 * rest of the pages 3032 * rest of the pages
3105 */ 3033 */
3106 pages_written += mpd.pages_written; 3034 pages_written += mpd.pages_written;
3107 wbc->pages_skipped = pages_skipped;
3108 ret = 0; 3035 ret = 0;
3109 io_done = 1; 3036 io_done = 1;
3110 } else if (wbc->nr_to_write) 3037 } else if (wbc->nr_to_write)
@@ -3122,11 +3049,6 @@ retry:
3122 wbc->range_end = mapping->writeback_index - 1; 3049 wbc->range_end = mapping->writeback_index - 1;
3123 goto retry; 3050 goto retry;
3124 } 3051 }
3125 if (pages_skipped != wbc->pages_skipped)
3126 ext4_msg(inode->i_sb, KERN_CRIT,
3127 "This should not happen leaving %s "
3128 "with nr_to_write = %ld ret = %d",
3129 __func__, wbc->nr_to_write, ret);
3130 3052
3131 /* Update index */ 3053 /* Update index */
3132 wbc->range_cyclic = range_cyclic; 3054 wbc->range_cyclic = range_cyclic;
@@ -3383,7 +3305,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
3383 * the pages by calling redirty_page_for_writepage() but that 3305 * the pages by calling redirty_page_for_writepage() but that
3384 * would be ugly in the extreme. So instead we would need to 3306 * would be ugly in the extreme. So instead we would need to
3385 * replicate parts of the code in the above functions, 3307 * replicate parts of the code in the above functions,
3386 * simplifying them becuase we wouldn't actually intend to 3308 * simplifying them because we wouldn't actually intend to
3387 * write out the pages, but rather only collect contiguous 3309 * write out the pages, but rather only collect contiguous
3388 * logical block extents, call the multi-block allocator, and 3310 * logical block extents, call the multi-block allocator, and
3389 * then update the buffer heads with the block allocations. 3311 * then update the buffer heads with the block allocations.
@@ -3460,6 +3382,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3460 3382
3461static int ext4_readpage(struct file *file, struct page *page) 3383static int ext4_readpage(struct file *file, struct page *page)
3462{ 3384{
3385 trace_ext4_readpage(page);
3463 return mpage_readpage(page, ext4_get_block); 3386 return mpage_readpage(page, ext4_get_block);
3464} 3387}
3465 3388
@@ -3494,6 +3417,8 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset)
3494{ 3417{
3495 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3418 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3496 3419
3420 trace_ext4_invalidatepage(page, offset);
3421
3497 /* 3422 /*
3498 * free any io_end structure allocated for buffers to be discarded 3423 * free any io_end structure allocated for buffers to be discarded
3499 */ 3424 */
@@ -3515,6 +3440,8 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
3515{ 3440{
3516 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 3441 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3517 3442
3443 trace_ext4_releasepage(page);
3444
3518 WARN_ON(PageChecked(page)); 3445 WARN_ON(PageChecked(page));
3519 if (!page_has_buffers(page)) 3446 if (!page_has_buffers(page))
3520 return 0; 3447 return 0;
@@ -3768,7 +3695,7 @@ retry:
3768 * 3695 *
3769 * The unwrritten extents will be converted to written when DIO is completed. 3696 * The unwrritten extents will be converted to written when DIO is completed.
3770 * For async direct IO, since the IO may still pending when return, we 3697 * For async direct IO, since the IO may still pending when return, we
3771 * set up an end_io call back function, which will do the convertion 3698 * set up an end_io call back function, which will do the conversion
3772 * when async direct IO completed. 3699 * when async direct IO completed.
3773 * 3700 *
3774 * If the O_DIRECT write will extend the file then add this inode to the 3701 * If the O_DIRECT write will extend the file then add this inode to the
@@ -3791,7 +3718,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3791 * We could direct write to holes and fallocate. 3718 * We could direct write to holes and fallocate.
3792 * 3719 *
3793 * Allocated blocks to fill the hole are marked as uninitialized 3720 * Allocated blocks to fill the hole are marked as uninitialized
3794 * to prevent paralel buffered read to expose the stale data 3721 * to prevent parallel buffered read to expose the stale data
3795 * before DIO complete the data IO. 3722 * before DIO complete the data IO.
3796 * 3723 *
3797 * As to previously fallocated extents, ext4 get_block 3724 * As to previously fallocated extents, ext4 get_block
@@ -3852,7 +3779,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3852 int err; 3779 int err;
3853 /* 3780 /*
3854 * for non AIO case, since the IO is already 3781 * for non AIO case, since the IO is already
3855 * completed, we could do the convertion right here 3782 * completed, we could do the conversion right here
3856 */ 3783 */
3857 err = ext4_convert_unwritten_extents(inode, 3784 err = ext4_convert_unwritten_extents(inode,
3858 offset, ret); 3785 offset, ret);
@@ -3873,11 +3800,16 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3873{ 3800{
3874 struct file *file = iocb->ki_filp; 3801 struct file *file = iocb->ki_filp;
3875 struct inode *inode = file->f_mapping->host; 3802 struct inode *inode = file->f_mapping->host;
3803 ssize_t ret;
3876 3804
3805 trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
3877 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3806 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3878 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 3807 ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3879 3808 else
3880 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 3809 ret = ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
3810 trace_ext4_direct_IO_exit(inode, offset,
3811 iov_length(iov, nr_segs), rw, ret);
3812 return ret;
3881} 3813}
3882 3814
3883/* 3815/*
@@ -3903,7 +3835,6 @@ static const struct address_space_operations ext4_ordered_aops = {
3903 .readpage = ext4_readpage, 3835 .readpage = ext4_readpage,
3904 .readpages = ext4_readpages, 3836 .readpages = ext4_readpages,
3905 .writepage = ext4_writepage, 3837 .writepage = ext4_writepage,
3906 .sync_page = block_sync_page,
3907 .write_begin = ext4_write_begin, 3838 .write_begin = ext4_write_begin,
3908 .write_end = ext4_ordered_write_end, 3839 .write_end = ext4_ordered_write_end,
3909 .bmap = ext4_bmap, 3840 .bmap = ext4_bmap,
@@ -3919,7 +3850,6 @@ static const struct address_space_operations ext4_writeback_aops = {
3919 .readpage = ext4_readpage, 3850 .readpage = ext4_readpage,
3920 .readpages = ext4_readpages, 3851 .readpages = ext4_readpages,
3921 .writepage = ext4_writepage, 3852 .writepage = ext4_writepage,
3922 .sync_page = block_sync_page,
3923 .write_begin = ext4_write_begin, 3853 .write_begin = ext4_write_begin,
3924 .write_end = ext4_writeback_write_end, 3854 .write_end = ext4_writeback_write_end,
3925 .bmap = ext4_bmap, 3855 .bmap = ext4_bmap,
@@ -3935,7 +3865,6 @@ static const struct address_space_operations ext4_journalled_aops = {
3935 .readpage = ext4_readpage, 3865 .readpage = ext4_readpage,
3936 .readpages = ext4_readpages, 3866 .readpages = ext4_readpages,
3937 .writepage = ext4_writepage, 3867 .writepage = ext4_writepage,
3938 .sync_page = block_sync_page,
3939 .write_begin = ext4_write_begin, 3868 .write_begin = ext4_write_begin,
3940 .write_end = ext4_journalled_write_end, 3869 .write_end = ext4_journalled_write_end,
3941 .set_page_dirty = ext4_journalled_set_page_dirty, 3870 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3951,7 +3880,6 @@ static const struct address_space_operations ext4_da_aops = {
3951 .readpages = ext4_readpages, 3880 .readpages = ext4_readpages,
3952 .writepage = ext4_writepage, 3881 .writepage = ext4_writepage,
3953 .writepages = ext4_da_writepages, 3882 .writepages = ext4_da_writepages,
3954 .sync_page = block_sync_page,
3955 .write_begin = ext4_da_write_begin, 3883 .write_begin = ext4_da_write_begin,
3956 .write_end = ext4_da_write_end, 3884 .write_end = ext4_da_write_end,
3957 .bmap = ext4_bmap, 3885 .bmap = ext4_bmap,
@@ -4098,7 +4026,7 @@ static inline int all_zeroes(__le32 *p, __le32 *q)
4098 * 4026 *
4099 * When we do truncate() we may have to clean the ends of several 4027 * When we do truncate() we may have to clean the ends of several
4100 * indirect blocks but leave the blocks themselves alive. Block is 4028 * indirect blocks but leave the blocks themselves alive. Block is
4101 * partially truncated if some data below the new i_size is refered 4029 * partially truncated if some data below the new i_size is referred
4102 * from it (and it is on the path to the first completely truncated 4030 * from it (and it is on the path to the first completely truncated
4103 * data block, indeed). We have to free the top of that path along 4031 * data block, indeed). We have to free the top of that path along
4104 * with everything to the right of the path. Since no allocation 4032 * with everything to the right of the path. Since no allocation
@@ -4177,6 +4105,9 @@ no_top:
4177 * 4105 *
4178 * We release `count' blocks on disk, but (last - first) may be greater 4106 * We release `count' blocks on disk, but (last - first) may be greater
4179 * than `count' because there can be holes in there. 4107 * than `count' because there can be holes in there.
4108 *
4109 * Return 0 on success, 1 on invalid block range
4110 * and < 0 on fatal error.
4180 */ 4111 */
4181static int ext4_clear_blocks(handle_t *handle, struct inode *inode, 4112static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4182 struct buffer_head *bh, 4113 struct buffer_head *bh,
@@ -4203,33 +4134,32 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4203 if (bh) { 4134 if (bh) {
4204 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4135 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4205 err = ext4_handle_dirty_metadata(handle, inode, bh); 4136 err = ext4_handle_dirty_metadata(handle, inode, bh);
4206 if (unlikely(err)) { 4137 if (unlikely(err))
4207 ext4_std_error(inode->i_sb, err); 4138 goto out_err;
4208 return 1;
4209 }
4210 } 4139 }
4211 err = ext4_mark_inode_dirty(handle, inode); 4140 err = ext4_mark_inode_dirty(handle, inode);
4212 if (unlikely(err)) { 4141 if (unlikely(err))
4213 ext4_std_error(inode->i_sb, err); 4142 goto out_err;
4214 return 1;
4215 }
4216 err = ext4_truncate_restart_trans(handle, inode, 4143 err = ext4_truncate_restart_trans(handle, inode,
4217 blocks_for_truncate(inode)); 4144 blocks_for_truncate(inode));
4218 if (unlikely(err)) { 4145 if (unlikely(err))
4219 ext4_std_error(inode->i_sb, err); 4146 goto out_err;
4220 return 1;
4221 }
4222 if (bh) { 4147 if (bh) {
4223 BUFFER_TRACE(bh, "retaking write access"); 4148 BUFFER_TRACE(bh, "retaking write access");
4224 ext4_journal_get_write_access(handle, bh); 4149 err = ext4_journal_get_write_access(handle, bh);
4150 if (unlikely(err))
4151 goto out_err;
4225 } 4152 }
4226 } 4153 }
4227 4154
4228 for (p = first; p < last; p++) 4155 for (p = first; p < last; p++)
4229 *p = 0; 4156 *p = 0;
4230 4157
4231 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags); 4158 ext4_free_blocks(handle, inode, NULL, block_to_free, count, flags);
4232 return 0; 4159 return 0;
4160out_err:
4161 ext4_std_error(inode->i_sb, err);
4162 return err;
4233} 4163}
4234 4164
4235/** 4165/**
@@ -4240,7 +4170,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4240 * @first: array of block numbers 4170 * @first: array of block numbers
4241 * @last: points immediately past the end of array 4171 * @last: points immediately past the end of array
4242 * 4172 *
4243 * We are freeing all blocks refered from that array (numbers are stored as 4173 * We are freeing all blocks referred from that array (numbers are stored as
4244 * little-endian 32-bit) and updating @inode->i_blocks appropriately. 4174 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
4245 * 4175 *
4246 * We accumulate contiguous runs of blocks to free. Conveniently, if these 4176 * We accumulate contiguous runs of blocks to free. Conveniently, if these
@@ -4263,7 +4193,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4263 ext4_fsblk_t nr; /* Current block # */ 4193 ext4_fsblk_t nr; /* Current block # */
4264 __le32 *p; /* Pointer into inode/ind 4194 __le32 *p; /* Pointer into inode/ind
4265 for current block */ 4195 for current block */
4266 int err; 4196 int err = 0;
4267 4197
4268 if (this_bh) { /* For indirect block */ 4198 if (this_bh) { /* For indirect block */
4269 BUFFER_TRACE(this_bh, "get_write_access"); 4199 BUFFER_TRACE(this_bh, "get_write_access");
@@ -4285,9 +4215,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4285 } else if (nr == block_to_free + count) { 4215 } else if (nr == block_to_free + count) {
4286 count++; 4216 count++;
4287 } else { 4217 } else {
4288 if (ext4_clear_blocks(handle, inode, this_bh, 4218 err = ext4_clear_blocks(handle, inode, this_bh,
4289 block_to_free, count, 4219 block_to_free, count,
4290 block_to_free_p, p)) 4220 block_to_free_p, p);
4221 if (err)
4291 break; 4222 break;
4292 block_to_free = nr; 4223 block_to_free = nr;
4293 block_to_free_p = p; 4224 block_to_free_p = p;
@@ -4296,9 +4227,12 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4296 } 4227 }
4297 } 4228 }
4298 4229
4299 if (count > 0) 4230 if (!err && count > 0)
4300 ext4_clear_blocks(handle, inode, this_bh, block_to_free, 4231 err = ext4_clear_blocks(handle, inode, this_bh, block_to_free,
4301 count, block_to_free_p, p); 4232 count, block_to_free_p, p);
4233 if (err < 0)
4234 /* fatal error */
4235 return;
4302 4236
4303 if (this_bh) { 4237 if (this_bh) {
4304 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); 4238 BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata");
@@ -4328,7 +4262,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4328 * @last: pointer immediately past the end of array 4262 * @last: pointer immediately past the end of array
4329 * @depth: depth of the branches to free 4263 * @depth: depth of the branches to free
4330 * 4264 *
4331 * We are freeing all blocks refered from these branches (numbers are 4265 * We are freeing all blocks referred from these branches (numbers are
4332 * stored as little-endian 32-bit) and updating @inode->i_blocks 4266 * stored as little-endian 32-bit) and updating @inode->i_blocks
4333 * appropriately. 4267 * appropriately.
4334 */ 4268 */
@@ -4416,7 +4350,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4416 * transaction where the data blocks are 4350 * transaction where the data blocks are
4417 * actually freed. 4351 * actually freed.
4418 */ 4352 */
4419 ext4_free_blocks(handle, inode, 0, nr, 1, 4353 ext4_free_blocks(handle, inode, NULL, nr, 1,
4420 EXT4_FREE_BLOCKS_METADATA| 4354 EXT4_FREE_BLOCKS_METADATA|
4421 EXT4_FREE_BLOCKS_FORGET); 4355 EXT4_FREE_BLOCKS_FORGET);
4422 4356
@@ -4496,10 +4430,12 @@ void ext4_truncate(struct inode *inode)
4496 Indirect chain[4]; 4430 Indirect chain[4];
4497 Indirect *partial; 4431 Indirect *partial;
4498 __le32 nr = 0; 4432 __le32 nr = 0;
4499 int n; 4433 int n = 0;
4500 ext4_lblk_t last_block; 4434 ext4_lblk_t last_block, max_block;
4501 unsigned blocksize = inode->i_sb->s_blocksize; 4435 unsigned blocksize = inode->i_sb->s_blocksize;
4502 4436
4437 trace_ext4_truncate_enter(inode);
4438
4503 if (!ext4_can_truncate(inode)) 4439 if (!ext4_can_truncate(inode))
4504 return; 4440 return;
4505 4441
@@ -4510,6 +4446,7 @@ void ext4_truncate(struct inode *inode)
4510 4446
4511 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 4447 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4512 ext4_ext_truncate(inode); 4448 ext4_ext_truncate(inode);
4449 trace_ext4_truncate_exit(inode);
4513 return; 4450 return;
4514 } 4451 }
4515 4452
@@ -4519,14 +4456,18 @@ void ext4_truncate(struct inode *inode)
4519 4456
4520 last_block = (inode->i_size + blocksize-1) 4457 last_block = (inode->i_size + blocksize-1)
4521 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); 4458 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4459 max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
4460 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4522 4461
4523 if (inode->i_size & (blocksize - 1)) 4462 if (inode->i_size & (blocksize - 1))
4524 if (ext4_block_truncate_page(handle, mapping, inode->i_size)) 4463 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4525 goto out_stop; 4464 goto out_stop;
4526 4465
4527 n = ext4_block_to_path(inode, last_block, offsets, NULL); 4466 if (last_block != max_block) {
4528 if (n == 0) 4467 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4529 goto out_stop; /* error */ 4468 if (n == 0)
4469 goto out_stop; /* error */
4470 }
4530 4471
4531 /* 4472 /*
4532 * OK. This truncate is going to happen. We add the inode to the 4473 * OK. This truncate is going to happen. We add the inode to the
@@ -4557,7 +4498,13 @@ void ext4_truncate(struct inode *inode)
4557 */ 4498 */
4558 ei->i_disksize = inode->i_size; 4499 ei->i_disksize = inode->i_size;
4559 4500
4560 if (n == 1) { /* direct blocks */ 4501 if (last_block == max_block) {
4502 /*
4503 * It is unnecessary to free any data blocks if last_block is
4504 * equal to the indirect block limit.
4505 */
4506 goto out_unlock;
4507 } else if (n == 1) { /* direct blocks */
4561 ext4_free_data(handle, inode, NULL, i_data+offsets[0], 4508 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4562 i_data + EXT4_NDIR_BLOCKS); 4509 i_data + EXT4_NDIR_BLOCKS);
4563 goto do_indirects; 4510 goto do_indirects;
@@ -4617,6 +4564,7 @@ do_indirects:
4617 ; 4564 ;
4618 } 4565 }
4619 4566
4567out_unlock:
4620 up_write(&ei->i_data_sem); 4568 up_write(&ei->i_data_sem);
4621 inode->i_mtime = inode->i_ctime = ext4_current_time(inode); 4569 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4622 ext4_mark_inode_dirty(handle, inode); 4570 ext4_mark_inode_dirty(handle, inode);
@@ -4639,6 +4587,7 @@ out_stop:
4639 ext4_orphan_del(handle, inode); 4587 ext4_orphan_del(handle, inode);
4640 4588
4641 ext4_journal_stop(handle); 4589 ext4_journal_stop(handle);
4590 trace_ext4_truncate_exit(inode);
4642} 4591}
4643 4592
4644/* 4593/*
@@ -4770,6 +4719,7 @@ make_io:
4770 * has in-inode xattrs, or we don't have this inode in memory. 4719 * has in-inode xattrs, or we don't have this inode in memory.
4771 * Read the block from disk. 4720 * Read the block from disk.
4772 */ 4721 */
4722 trace_ext4_load_inode(inode);
4773 get_bh(bh); 4723 get_bh(bh);
4774 bh->b_end_io = end_buffer_read_sync; 4724 bh->b_end_io = end_buffer_read_sync;
4775 submit_bh(READ_META, bh); 4725 submit_bh(READ_META, bh);
@@ -4875,7 +4825,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4875 return inode; 4825 return inode;
4876 4826
4877 ei = EXT4_I(inode); 4827 ei = EXT4_I(inode);
4878 iloc.bh = 0; 4828 iloc.bh = NULL;
4879 4829
4880 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4830 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4881 if (ret < 0) 4831 if (ret < 0)
@@ -5460,13 +5410,12 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5460 /* if nrblocks are contiguous */ 5410 /* if nrblocks are contiguous */
5461 if (chunk) { 5411 if (chunk) {
5462 /* 5412 /*
5463 * With N contiguous data blocks, it need at most 5413 * With N contiguous data blocks, we need at most
5464 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks 5414 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
5465 * 2 dindirect blocks 5415 * 2 dindirect blocks, and 1 tindirect block
5466 * 1 tindirect block
5467 */ 5416 */
5468 indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); 5417 return DIV_ROUND_UP(nrblocks,
5469 return indirects + 3; 5418 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5470 } 5419 }
5471 /* 5420 /*
5472 * if nrblocks are not contiguous, worse case, each block touch 5421 * if nrblocks are not contiguous, worse case, each block touch
@@ -5540,7 +5489,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5540} 5489}
5541 5490
5542/* 5491/*
5543 * Calulate the total number of credits to reserve to fit 5492 * Calculate the total number of credits to reserve to fit
5544 * the modification of a single pages into a single transaction, 5493 * the modification of a single pages into a single transaction,
5545 * which may include multiple chunks of block allocations. 5494 * which may include multiple chunks of block allocations.
5546 * 5495 *
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index eb3bc2fe647e..808c554e773f 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
38 unsigned int oldflags; 38 unsigned int oldflags;
39 unsigned int jflag; 39 unsigned int jflag;
40 40
41 if (!is_owner_or_cap(inode)) 41 if (!inode_owner_or_capable(inode))
42 return -EACCES; 42 return -EACCES;
43 43
44 if (get_user(flags, (int __user *) arg)) 44 if (get_user(flags, (int __user *) arg))
@@ -146,7 +146,7 @@ flags_out:
146 __u32 generation; 146 __u32 generation;
147 int err; 147 int err;
148 148
149 if (!is_owner_or_cap(inode)) 149 if (!inode_owner_or_capable(inode))
150 return -EPERM; 150 return -EPERM;
151 151
152 err = mnt_want_write(filp->f_path.mnt); 152 err = mnt_want_write(filp->f_path.mnt);
@@ -298,7 +298,7 @@ mext_out:
298 case EXT4_IOC_MIGRATE: 298 case EXT4_IOC_MIGRATE:
299 { 299 {
300 int err; 300 int err;
301 if (!is_owner_or_cap(inode)) 301 if (!inode_owner_or_capable(inode))
302 return -EACCES; 302 return -EACCES;
303 303
304 err = mnt_want_write(filp->f_path.mnt); 304 err = mnt_want_write(filp->f_path.mnt);
@@ -320,7 +320,7 @@ mext_out:
320 case EXT4_IOC_ALLOC_DA_BLKS: 320 case EXT4_IOC_ALLOC_DA_BLKS:
321 { 321 {
322 int err; 322 int err;
323 if (!is_owner_or_cap(inode)) 323 if (!inode_owner_or_capable(inode))
324 return -EACCES; 324 return -EACCES;
325 325
326 err = mnt_want_write(filp->f_path.mnt); 326 err = mnt_want_write(filp->f_path.mnt);
@@ -334,16 +334,22 @@ mext_out:
334 case FITRIM: 334 case FITRIM:
335 { 335 {
336 struct super_block *sb = inode->i_sb; 336 struct super_block *sb = inode->i_sb;
337 struct request_queue *q = bdev_get_queue(sb->s_bdev);
337 struct fstrim_range range; 338 struct fstrim_range range;
338 int ret = 0; 339 int ret = 0;
339 340
340 if (!capable(CAP_SYS_ADMIN)) 341 if (!capable(CAP_SYS_ADMIN))
341 return -EPERM; 342 return -EPERM;
342 343
344 if (!blk_queue_discard(q))
345 return -EOPNOTSUPP;
346
343 if (copy_from_user(&range, (struct fstrim_range *)arg, 347 if (copy_from_user(&range, (struct fstrim_range *)arg,
344 sizeof(range))) 348 sizeof(range)))
345 return -EFAULT; 349 return -EFAULT;
346 350
351 range.minlen = max((unsigned int)range.minlen,
352 q->limits.discard_granularity);
347 ret = ext4_trim_fs(sb, &range); 353 ret = ext4_trim_fs(sb, &range);
348 if (ret < 0) 354 if (ret < 0)
349 return ret; 355 return ret;
@@ -421,6 +427,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
421 return err; 427 return err;
422 } 428 }
423 case EXT4_IOC_MOVE_EXT: 429 case EXT4_IOC_MOVE_EXT:
430 case FITRIM:
424 break; 431 break;
425 default: 432 default:
426 return -ENOIOCTLCMD; 433 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 851f49b2f9d2..d8a16eecf1d5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -92,7 +92,7 @@
92 * between CPUs. It is possible to get scheduled at this point. 92 * between CPUs. It is possible to get scheduled at this point.
93 * 93 *
94 * The locality group prealloc space is used looking at whether we have 94 * The locality group prealloc space is used looking at whether we have
95 * enough free space (pa_free) withing the prealloc space. 95 * enough free space (pa_free) within the prealloc space.
96 * 96 *
97 * If we can't allocate blocks via inode prealloc or/and locality group 97 * If we can't allocate blocks via inode prealloc or/and locality group
98 * prealloc then we look at the buddy cache. The buddy cache is represented 98 * prealloc then we look at the buddy cache. The buddy cache is represented
@@ -342,10 +342,15 @@ static struct kmem_cache *ext4_free_ext_cachep;
342/* We create slab caches for groupinfo data structures based on the 342/* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There will be one per mounted filesystem for 343 * superblock block size. There will be one per mounted filesystem for
344 * each unique s_blocksize_bits */ 344 * each unique s_blocksize_bits */
345#define NR_GRPINFO_CACHES \ 345#define NR_GRPINFO_CACHES 8
346 (EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE + 1)
347static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
348 347
348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352};
353
349static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
350 ext4_group_t group); 355 ext4_group_t group);
351static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
@@ -427,9 +432,10 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
427 } 432 }
428 433
429 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
430 *max = 1 << (e4b->bd_blkbits + 3); 435 if (order == 0) {
431 if (order == 0) 436 *max = 1 << (e4b->bd_blkbits + 3);
432 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 }
433 439
434 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
435 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
@@ -611,7 +617,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
611 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
612 618
613 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
614 buddy = mb_find_buddy(e4b, 0, &max);
615 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
616 ext4_group_t groupnr; 621 ext4_group_t groupnr;
617 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
@@ -630,7 +635,12 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
630#define mb_check_buddy(e4b) 635#define mb_check_buddy(e4b)
631#endif 636#endif
632 637
633/* FIXME!! need more doc */ 638/*
639 * Divide blocks started from @first with length @len into
640 * smaller chunks with power of 2 blocks.
641 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
642 * then increase bb_counters[] for corresponded chunk size.
643 */
634static void ext4_mb_mark_free_simple(struct super_block *sb, 644static void ext4_mb_mark_free_simple(struct super_block *sb,
635 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
636 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
@@ -2376,7 +2386,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
2376 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2386 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2377 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2387 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2378 * So a two level scheme suffices for now. */ 2388 * So a two level scheme suffices for now. */
2379 sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); 2389 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2380 if (sbi->s_group_info == NULL) { 2390 if (sbi->s_group_info == NULL) {
2381 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2391 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2382 return -ENOMEM; 2392 return -ENOMEM;
@@ -2414,6 +2424,55 @@ err_freesgi:
2414 return -ENOMEM; 2424 return -ENOMEM;
2415} 2425}
2416 2426
2427static void ext4_groupinfo_destroy_slabs(void)
2428{
2429 int i;
2430
2431 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2432 if (ext4_groupinfo_caches[i])
2433 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2434 ext4_groupinfo_caches[i] = NULL;
2435 }
2436}
2437
2438static int ext4_groupinfo_create_slab(size_t size)
2439{
2440 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2441 int slab_size;
2442 int blocksize_bits = order_base_2(size);
2443 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2444 struct kmem_cache *cachep;
2445
2446 if (cache_index >= NR_GRPINFO_CACHES)
2447 return -EINVAL;
2448
2449 if (unlikely(cache_index < 0))
2450 cache_index = 0;
2451
2452 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2453 if (ext4_groupinfo_caches[cache_index]) {
2454 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2455 return 0; /* Already created */
2456 }
2457
2458 slab_size = offsetof(struct ext4_group_info,
2459 bb_counters[blocksize_bits + 2]);
2460
2461 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2462 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2463 NULL);
2464
2465 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2466 if (!cachep) {
2467 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2468 return -ENOMEM;
2469 }
2470
2471 ext4_groupinfo_caches[cache_index] = cachep;
2472
2473 return 0;
2474}
2475
2417int ext4_mb_init(struct super_block *sb, int needs_recovery) 2476int ext4_mb_init(struct super_block *sb, int needs_recovery)
2418{ 2477{
2419 struct ext4_sb_info *sbi = EXT4_SB(sb); 2478 struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2421,9 +2480,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2421 unsigned offset; 2480 unsigned offset;
2422 unsigned max; 2481 unsigned max;
2423 int ret; 2482 int ret;
2424 int cache_index;
2425 struct kmem_cache *cachep;
2426 char *namep = NULL;
2427 2483
2428 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2484 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2429 2485
@@ -2440,30 +2496,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
2440 goto out; 2496 goto out;
2441 } 2497 }
2442 2498
2443 cache_index = sb->s_blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2499 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2444 cachep = ext4_groupinfo_caches[cache_index]; 2500 if (ret < 0)
2445 if (!cachep) { 2501 goto out;
2446 char name[32];
2447 int len = offsetof(struct ext4_group_info,
2448 bb_counters[sb->s_blocksize_bits + 2]);
2449
2450 sprintf(name, "ext4_groupinfo_%d", sb->s_blocksize_bits);
2451 namep = kstrdup(name, GFP_KERNEL);
2452 if (!namep) {
2453 ret = -ENOMEM;
2454 goto out;
2455 }
2456
2457 /* Need to free the kmem_cache_name() when we
2458 * destroy the slab */
2459 cachep = kmem_cache_create(namep, len, 0,
2460 SLAB_RECLAIM_ACCOUNT, NULL);
2461 if (!cachep) {
2462 ret = -ENOMEM;
2463 goto out;
2464 }
2465 ext4_groupinfo_caches[cache_index] = cachep;
2466 }
2467 2502
2468 /* order 0 is regular bitmap */ 2503 /* order 0 is regular bitmap */
2469 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2504 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
@@ -2520,7 +2555,6 @@ out:
2520 if (ret) { 2555 if (ret) {
2521 kfree(sbi->s_mb_offsets); 2556 kfree(sbi->s_mb_offsets);
2522 kfree(sbi->s_mb_maxs); 2557 kfree(sbi->s_mb_maxs);
2523 kfree(namep);
2524 } 2558 }
2525 return ret; 2559 return ret;
2526} 2560}
@@ -2734,7 +2768,6 @@ int __init ext4_init_mballoc(void)
2734 2768
2735void ext4_exit_mballoc(void) 2769void ext4_exit_mballoc(void)
2736{ 2770{
2737 int i;
2738 /* 2771 /*
2739 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2772 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2740 * before destroying the slab cache. 2773 * before destroying the slab cache.
@@ -2743,15 +2776,7 @@ void ext4_exit_mballoc(void)
2743 kmem_cache_destroy(ext4_pspace_cachep); 2776 kmem_cache_destroy(ext4_pspace_cachep);
2744 kmem_cache_destroy(ext4_ac_cachep); 2777 kmem_cache_destroy(ext4_ac_cachep);
2745 kmem_cache_destroy(ext4_free_ext_cachep); 2778 kmem_cache_destroy(ext4_free_ext_cachep);
2746 2779 ext4_groupinfo_destroy_slabs();
2747 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2748 struct kmem_cache *cachep = ext4_groupinfo_caches[i];
2749 if (cachep) {
2750 char *name = (char *)kmem_cache_name(cachep);
2751 kmem_cache_destroy(cachep);
2752 kfree(name);
2753 }
2754 }
2755 ext4_remove_debugfs_entry(); 2780 ext4_remove_debugfs_entry();
2756} 2781}
2757 2782
@@ -3188,7 +3213,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3188 cur_distance = abs(goal_block - cpa->pa_pstart); 3213 cur_distance = abs(goal_block - cpa->pa_pstart);
3189 new_distance = abs(goal_block - pa->pa_pstart); 3214 new_distance = abs(goal_block - pa->pa_pstart);
3190 3215
3191 if (cur_distance < new_distance) 3216 if (cur_distance <= new_distance)
3192 return cpa; 3217 return cpa;
3193 3218
3194 /* drop the previous reference */ 3219 /* drop the previous reference */
@@ -3887,7 +3912,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3887 struct super_block *sb = ac->ac_sb; 3912 struct super_block *sb = ac->ac_sb;
3888 ext4_group_t ngroups, i; 3913 ext4_group_t ngroups, i;
3889 3914
3890 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) 3915 if (!mb_enable_debug ||
3916 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3891 return; 3917 return;
3892 3918
3893 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3919 printk(KERN_ERR "EXT4-fs: Can't allocate:"
@@ -4733,7 +4759,8 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
4733 * bitmap. Then issue a TRIM command on this extent and free the extent in 4759 * bitmap. Then issue a TRIM command on this extent and free the extent in
4734 * the group buddy bitmap. This is done until whole group is scanned. 4760 * the group buddy bitmap. This is done until whole group is scanned.
4735 */ 4761 */
4736ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4762static ext4_grpblk_t
4763ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4737 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4764 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4738{ 4765{
4739 void *bitmap; 4766 void *bitmap;
@@ -4843,10 +4870,15 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4843 break; 4870 break;
4844 } 4871 }
4845 4872
4846 if (len >= EXT4_BLOCKS_PER_GROUP(sb)) 4873 /*
4847 len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block); 4874 * For all the groups except the last one, last block will
4848 else 4875 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4876 * change it for the last group in which case start +
4877 * len < EXT4_BLOCKS_PER_GROUP(sb).
4878 */
4879 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4849 last_block = first_block + len; 4880 last_block = first_block + len;
4881 len -= last_block - first_block;
4850 4882
4851 if (e4b.bd_info->bb_free >= minlen) { 4883 if (e4b.bd_info->bb_free >= minlen) {
4852 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4884 cnt = ext4_trim_all_free(sb, &e4b, first_block,
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index b619322c76f0..22bd4d7f289b 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -169,7 +169,7 @@ struct ext4_allocation_context {
169 /* original request */ 169 /* original request */
170 struct ext4_free_extent ac_o_ex; 170 struct ext4_free_extent ac_o_ex;
171 171
172 /* goal request (after normalization) */ 172 /* goal request (normalized ac_o_ex) */
173 struct ext4_free_extent ac_g_ex; 173 struct ext4_free_extent ac_g_ex;
174 174
175 /* the best found extent */ 175 /* the best found extent */
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b0a126f23c20..92816b4e0f16 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -263,7 +263,7 @@ static int free_dind_blocks(handle_t *handle,
263 for (i = 0; i < max_entries; i++) { 263 for (i = 0; i < max_entries; i++) {
264 if (tmp_idata[i]) { 264 if (tmp_idata[i]) {
265 extend_credit_for_blkdel(handle, inode); 265 extend_credit_for_blkdel(handle, inode);
266 ext4_free_blocks(handle, inode, 0, 266 ext4_free_blocks(handle, inode, NULL,
267 le32_to_cpu(tmp_idata[i]), 1, 267 le32_to_cpu(tmp_idata[i]), 1,
268 EXT4_FREE_BLOCKS_METADATA | 268 EXT4_FREE_BLOCKS_METADATA |
269 EXT4_FREE_BLOCKS_FORGET); 269 EXT4_FREE_BLOCKS_FORGET);
@@ -271,7 +271,7 @@ static int free_dind_blocks(handle_t *handle,
271 } 271 }
272 put_bh(bh); 272 put_bh(bh);
273 extend_credit_for_blkdel(handle, inode); 273 extend_credit_for_blkdel(handle, inode);
274 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 274 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
275 EXT4_FREE_BLOCKS_METADATA | 275 EXT4_FREE_BLOCKS_METADATA |
276 EXT4_FREE_BLOCKS_FORGET); 276 EXT4_FREE_BLOCKS_FORGET);
277 return 0; 277 return 0;
@@ -302,7 +302,7 @@ static int free_tind_blocks(handle_t *handle,
302 } 302 }
303 put_bh(bh); 303 put_bh(bh);
304 extend_credit_for_blkdel(handle, inode); 304 extend_credit_for_blkdel(handle, inode);
305 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1, 305 ext4_free_blocks(handle, inode, NULL, le32_to_cpu(i_data), 1,
306 EXT4_FREE_BLOCKS_METADATA | 306 EXT4_FREE_BLOCKS_METADATA |
307 EXT4_FREE_BLOCKS_FORGET); 307 EXT4_FREE_BLOCKS_FORGET);
308 return 0; 308 return 0;
@@ -315,7 +315,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
315 /* ei->i_data[EXT4_IND_BLOCK] */ 315 /* ei->i_data[EXT4_IND_BLOCK] */
316 if (i_data[0]) { 316 if (i_data[0]) {
317 extend_credit_for_blkdel(handle, inode); 317 extend_credit_for_blkdel(handle, inode);
318 ext4_free_blocks(handle, inode, 0, 318 ext4_free_blocks(handle, inode, NULL,
319 le32_to_cpu(i_data[0]), 1, 319 le32_to_cpu(i_data[0]), 1,
320 EXT4_FREE_BLOCKS_METADATA | 320 EXT4_FREE_BLOCKS_METADATA |
321 EXT4_FREE_BLOCKS_FORGET); 321 EXT4_FREE_BLOCKS_FORGET);
@@ -428,7 +428,7 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
428 } 428 }
429 put_bh(bh); 429 put_bh(bh);
430 extend_credit_for_blkdel(handle, inode); 430 extend_credit_for_blkdel(handle, inode);
431 ext4_free_blocks(handle, inode, 0, block, 1, 431 ext4_free_blocks(handle, inode, NULL, block, 1,
432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); 432 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
433 return retval; 433 return retval;
434} 434}
@@ -517,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
517 * start with one credit accounted for 517 * start with one credit accounted for
518 * superblock modification. 518 * superblock modification.
519 * 519 *
520 * For the tmp_inode we already have commited the 520 * For the tmp_inode we already have committed the
521 * trascation that created the inode. Later as and 521 * trascation that created the inode. Later as and
522 * when we add extents we extent the journal 522 * when we add extents we extent the journal
523 */ 523 */
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c5..67fd0b025858 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -40,6 +40,7 @@
40#include "xattr.h" 40#include "xattr.h"
41#include "acl.h" 41#include "acl.h"
42 42
43#include <trace/events/ext4.h>
43/* 44/*
44 * define how far ahead to read directories while searching them. 45 * define how far ahead to read directories while searching them.
45 */ 46 */
@@ -2183,6 +2184,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2183 struct ext4_dir_entry_2 *de; 2184 struct ext4_dir_entry_2 *de;
2184 handle_t *handle; 2185 handle_t *handle;
2185 2186
2187 trace_ext4_unlink_enter(dir, dentry);
2186 /* Initialize quotas before so that eventual writes go 2188 /* Initialize quotas before so that eventual writes go
2187 * in separate transaction */ 2189 * in separate transaction */
2188 dquot_initialize(dir); 2190 dquot_initialize(dir);
@@ -2228,6 +2230,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
2228end_unlink: 2230end_unlink:
2229 ext4_journal_stop(handle); 2231 ext4_journal_stop(handle);
2230 brelse(bh); 2232 brelse(bh);
2233 trace_ext4_unlink_exit(dentry, retval);
2231 return retval; 2234 return retval;
2232} 2235}
2233 2236
@@ -2304,13 +2307,6 @@ static int ext4_link(struct dentry *old_dentry,
2304 2307
2305 dquot_initialize(dir); 2308 dquot_initialize(dir);
2306 2309
2307 /*
2308 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
2309 * otherwise has the potential to corrupt the orphan inode list.
2310 */
2311 if (inode->i_nlink == 0)
2312 return -ENOENT;
2313
2314retry: 2310retry:
2315 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2311 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2316 EXT4_INDEX_EXTRA_TRANS_BLOCKS); 2312 EXT4_INDEX_EXTRA_TRANS_BLOCKS);
@@ -2409,6 +2405,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2409 if (!new_inode && new_dir != old_dir && 2405 if (!new_inode && new_dir != old_dir &&
2410 EXT4_DIR_LINK_MAX(new_dir)) 2406 EXT4_DIR_LINK_MAX(new_dir))
2411 goto end_rename; 2407 goto end_rename;
2408 BUFFER_TRACE(dir_bh, "get_write_access");
2409 retval = ext4_journal_get_write_access(handle, dir_bh);
2410 if (retval)
2411 goto end_rename;
2412 } 2412 }
2413 if (!new_bh) { 2413 if (!new_bh) {
2414 retval = ext4_add_entry(handle, new_dentry, old_inode); 2414 retval = ext4_add_entry(handle, new_dentry, old_inode);
@@ -2416,7 +2416,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2416 goto end_rename; 2416 goto end_rename;
2417 } else { 2417 } else {
2418 BUFFER_TRACE(new_bh, "get write access"); 2418 BUFFER_TRACE(new_bh, "get write access");
2419 ext4_journal_get_write_access(handle, new_bh); 2419 retval = ext4_journal_get_write_access(handle, new_bh);
2420 if (retval)
2421 goto end_rename;
2420 new_de->inode = cpu_to_le32(old_inode->i_ino); 2422 new_de->inode = cpu_to_le32(old_inode->i_ino);
2421 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb, 2423 if (EXT4_HAS_INCOMPAT_FEATURE(new_dir->i_sb,
2422 EXT4_FEATURE_INCOMPAT_FILETYPE)) 2424 EXT4_FEATURE_INCOMPAT_FILETYPE))
@@ -2477,8 +2479,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
2477 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir); 2479 old_dir->i_ctime = old_dir->i_mtime = ext4_current_time(old_dir);
2478 ext4_update_dx_flag(old_dir); 2480 ext4_update_dx_flag(old_dir);
2479 if (dir_bh) { 2481 if (dir_bh) {
2480 BUFFER_TRACE(dir_bh, "get_write_access");
2481 ext4_journal_get_write_access(handle, dir_bh);
2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) = 2482 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
2483 cpu_to_le32(new_dir->i_ino); 2483 cpu_to_le32(new_dir->i_ino);
2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); 2484 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 7270dcfca92a..b6dbd056fcb1 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -32,14 +32,8 @@
32 32
33static struct kmem_cache *io_page_cachep, *io_end_cachep; 33static struct kmem_cache *io_page_cachep, *io_end_cachep;
34 34
35#define WQ_HASH_SZ 37
36#define to_ioend_wq(v) (&ioend_wq[((unsigned long)v) % WQ_HASH_SZ])
37static wait_queue_head_t ioend_wq[WQ_HASH_SZ];
38
39int __init ext4_init_pageio(void) 35int __init ext4_init_pageio(void)
40{ 36{
41 int i;
42
43 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT); 37 io_page_cachep = KMEM_CACHE(ext4_io_page, SLAB_RECLAIM_ACCOUNT);
44 if (io_page_cachep == NULL) 38 if (io_page_cachep == NULL)
45 return -ENOMEM; 39 return -ENOMEM;
@@ -48,9 +42,6 @@ int __init ext4_init_pageio(void)
48 kmem_cache_destroy(io_page_cachep); 42 kmem_cache_destroy(io_page_cachep);
49 return -ENOMEM; 43 return -ENOMEM;
50 } 44 }
51 for (i = 0; i < WQ_HASH_SZ; i++)
52 init_waitqueue_head(&ioend_wq[i]);
53
54 return 0; 45 return 0;
55} 46}
56 47
@@ -62,7 +53,7 @@ void ext4_exit_pageio(void)
62 53
63void ext4_ioend_wait(struct inode *inode) 54void ext4_ioend_wait(struct inode *inode)
64{ 55{
65 wait_queue_head_t *wq = to_ioend_wq(inode); 56 wait_queue_head_t *wq = ext4_ioend_wq(inode);
66 57
67 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 58 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0));
68} 59}
@@ -87,7 +78,7 @@ void ext4_free_io_end(ext4_io_end_t *io)
87 for (i = 0; i < io->num_io_pages; i++) 78 for (i = 0; i < io->num_io_pages; i++)
88 put_io_page(io->pages[i]); 79 put_io_page(io->pages[i]);
89 io->num_io_pages = 0; 80 io->num_io_pages = 0;
90 wq = to_ioend_wq(io->inode); 81 wq = ext4_ioend_wq(io->inode);
91 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) && 82 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count) &&
92 waitqueue_active(wq)) 83 waitqueue_active(wq))
93 wake_up_all(wq); 84 wake_up_all(wq);
@@ -102,6 +93,7 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
102 struct inode *inode = io->inode; 93 struct inode *inode = io->inode;
103 loff_t offset = io->offset; 94 loff_t offset = io->offset;
104 ssize_t size = io->size; 95 ssize_t size = io->size;
96 wait_queue_head_t *wq;
105 int ret = 0; 97 int ret = 0;
106 98
107 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 99 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
@@ -126,7 +118,16 @@ int ext4_end_io_nolock(ext4_io_end_t *io)
126 if (io->iocb) 118 if (io->iocb)
127 aio_complete(io->iocb, io->result, 0); 119 aio_complete(io->iocb, io->result, 0);
128 /* clear the DIO AIO unwritten flag */ 120 /* clear the DIO AIO unwritten flag */
129 io->flag &= ~EXT4_IO_END_UNWRITTEN; 121 if (io->flag & EXT4_IO_END_UNWRITTEN) {
122 io->flag &= ~EXT4_IO_END_UNWRITTEN;
123 /* Wake up anyone waiting on unwritten extent conversion */
124 wq = ext4_ioend_wq(io->inode);
125 if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten) &&
126 waitqueue_active(wq)) {
127 wake_up_all(wq);
128 }
129 }
130
130 return ret; 131 return ret;
131} 132}
132 133
@@ -190,6 +191,7 @@ static void ext4_end_bio(struct bio *bio, int error)
190 struct inode *inode; 191 struct inode *inode;
191 unsigned long flags; 192 unsigned long flags;
192 int i; 193 int i;
194 sector_t bi_sector = bio->bi_sector;
193 195
194 BUG_ON(!io_end); 196 BUG_ON(!io_end);
195 bio->bi_private = NULL; 197 bio->bi_private = NULL;
@@ -207,9 +209,7 @@ static void ext4_end_bio(struct bio *bio, int error)
207 if (error) 209 if (error)
208 SetPageError(page); 210 SetPageError(page);
209 BUG_ON(!head); 211 BUG_ON(!head);
210 if (head->b_size == PAGE_CACHE_SIZE) 212 if (head->b_size != PAGE_CACHE_SIZE) {
211 clear_buffer_dirty(head);
212 else {
213 loff_t offset; 213 loff_t offset;
214 loff_t io_end_offset = io_end->offset + io_end->size; 214 loff_t io_end_offset = io_end->offset + io_end->size;
215 215
@@ -221,7 +221,6 @@ static void ext4_end_bio(struct bio *bio, int error)
221 if (error) 221 if (error)
222 buffer_io_error(bh); 222 buffer_io_error(bh);
223 223
224 clear_buffer_dirty(bh);
225 } 224 }
226 if (buffer_delay(bh)) 225 if (buffer_delay(bh))
227 partial_write = 1; 226 partial_write = 1;
@@ -257,7 +256,12 @@ static void ext4_end_bio(struct bio *bio, int error)
257 (unsigned long long) io_end->offset, 256 (unsigned long long) io_end->offset,
258 (long) io_end->size, 257 (long) io_end->size,
259 (unsigned long long) 258 (unsigned long long)
260 bio->bi_sector >> (inode->i_blkbits - 9)); 259 bi_sector >> (inode->i_blkbits - 9));
260 }
261
262 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
263 ext4_free_io_end(io_end);
264 return;
261 } 265 }
262 266
263 /* Add the io_end to per-inode completed io list*/ 267 /* Add the io_end to per-inode completed io list*/
@@ -280,9 +284,9 @@ void ext4_io_submit(struct ext4_io_submit *io)
280 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); 284 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP));
281 bio_put(io->io_bio); 285 bio_put(io->io_bio);
282 } 286 }
283 io->io_bio = 0; 287 io->io_bio = NULL;
284 io->io_op = 0; 288 io->io_op = 0;
285 io->io_end = 0; 289 io->io_end = NULL;
286} 290}
287 291
288static int io_submit_init(struct ext4_io_submit *io, 292static int io_submit_init(struct ext4_io_submit *io,
@@ -311,8 +315,7 @@ static int io_submit_init(struct ext4_io_submit *io,
311 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 315 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
312 316
313 io->io_bio = bio; 317 io->io_bio = bio;
314 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? 318 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
315 WRITE_SYNC_PLUG : WRITE);
316 io->io_next_block = bh->b_blocknr; 319 io->io_next_block = bh->b_blocknr;
317 return 0; 320 return 0;
318} 321}
@@ -380,9 +383,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
380 383
381 blocksize = 1 << inode->i_blkbits; 384 blocksize = 1 << inode->i_blkbits;
382 385
386 BUG_ON(!PageLocked(page));
383 BUG_ON(PageWriteback(page)); 387 BUG_ON(PageWriteback(page));
384 set_page_writeback(page);
385 ClearPageError(page);
386 388
387 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS); 389 io_page = kmem_cache_alloc(io_page_cachep, GFP_NOFS);
388 if (!io_page) { 390 if (!io_page) {
@@ -393,16 +395,20 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
393 io_page->p_page = page; 395 io_page->p_page = page;
394 atomic_set(&io_page->p_count, 1); 396 atomic_set(&io_page->p_count, 1);
395 get_page(page); 397 get_page(page);
398 set_page_writeback(page);
399 ClearPageError(page);
396 400
397 for (bh = head = page_buffers(page), block_start = 0; 401 for (bh = head = page_buffers(page), block_start = 0;
398 bh != head || !block_start; 402 bh != head || !block_start;
399 block_start = block_end, bh = bh->b_this_page) { 403 block_start = block_end, bh = bh->b_this_page) {
404
400 block_end = block_start + blocksize; 405 block_end = block_start + blocksize;
401 if (block_start >= len) { 406 if (block_start >= len) {
402 clear_buffer_dirty(bh); 407 clear_buffer_dirty(bh);
403 set_buffer_uptodate(bh); 408 set_buffer_uptodate(bh);
404 continue; 409 continue;
405 } 410 }
411 clear_buffer_dirty(bh);
406 ret = io_submit_add_bh(io, io_page, inode, wbc, bh); 412 ret = io_submit_add_bh(io, io_page, inode, wbc, bh);
407 if (ret) { 413 if (ret) {
408 /* 414 /*
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3ecc6e45d2f9..80bbc9c60c24 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -230,7 +230,7 @@ static int setup_new_group_blocks(struct super_block *sb,
230 } 230 }
231 231
232 /* Zero out all of the reserved backup group descriptor table blocks */ 232 /* Zero out all of the reserved backup group descriptor table blocks */
233 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 233 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
234 block, sbi->s_itb_per_group); 234 block, sbi->s_itb_per_group);
235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb, 235 err = sb_issue_zeroout(sb, gdblocks + start + 1, reserved_gdb,
236 GFP_NOFS); 236 GFP_NOFS);
@@ -248,7 +248,7 @@ static int setup_new_group_blocks(struct super_block *sb,
248 248
249 /* Zero out all of the inode table blocks */ 249 /* Zero out all of the inode table blocks */
250 block = input->inode_table; 250 block = input->inode_table;
251 ext4_debug("clear inode table blocks %#04llx -> %#04llx\n", 251 ext4_debug("clear inode table blocks %#04llx -> %#04lx\n",
252 block, sbi->s_itb_per_group); 252 block, sbi->s_itb_per_group);
253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS); 253 err = sb_issue_zeroout(sb, block, sbi->s_itb_per_group, GFP_NOFS);
254 if (err) 254 if (err)
@@ -499,12 +499,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
499 return err; 499 return err;
500 500
501exit_inode: 501exit_inode:
502 /* ext4_journal_release_buffer(handle, iloc.bh); */ 502 /* ext4_handle_release_buffer(handle, iloc.bh); */
503 brelse(iloc.bh); 503 brelse(iloc.bh);
504exit_dindj: 504exit_dindj:
505 /* ext4_journal_release_buffer(handle, dind); */ 505 /* ext4_handle_release_buffer(handle, dind); */
506exit_sbh: 506exit_sbh:
507 /* ext4_journal_release_buffer(handle, EXT4_SB(sb)->s_sbh); */ 507 /* ext4_handle_release_buffer(handle, EXT4_SB(sb)->s_sbh); */
508exit_dind: 508exit_dind:
509 brelse(dind); 509 brelse(dind);
510exit_bh: 510exit_bh:
@@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
586 /* 586 /*
587 int j; 587 int j;
588 for (j = 0; j < i; j++) 588 for (j = 0; j < i; j++)
589 ext4_journal_release_buffer(handle, primary[j]); 589 ext4_handle_release_buffer(handle, primary[j]);
590 */ 590 */
591 goto exit_bh; 591 goto exit_bh;
592 } 592 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 48ce561fafac..8553dfb310af 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -54,9 +54,9 @@
54 54
55static struct proc_dir_entry *ext4_proc_root; 55static struct proc_dir_entry *ext4_proc_root;
56static struct kset *ext4_kset; 56static struct kset *ext4_kset;
57struct ext4_lazy_init *ext4_li_info; 57static struct ext4_lazy_init *ext4_li_info;
58struct mutex ext4_li_mtx; 58static struct mutex ext4_li_mtx;
59struct ext4_features *ext4_feat; 59static struct ext4_features *ext4_feat;
60 60
61static int ext4_load_journal(struct super_block *, struct ext4_super_block *, 61static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
62 unsigned long journal_devnum); 62 unsigned long journal_devnum);
@@ -75,8 +75,10 @@ static void ext4_write_super(struct super_block *sb);
75static int ext4_freeze(struct super_block *sb); 75static int ext4_freeze(struct super_block *sb);
76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, 76static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
77 const char *dev_name, void *data); 77 const char *dev_name, void *data);
78static int ext4_feature_set_ok(struct super_block *sb, int readonly);
78static void ext4_destroy_lazyinit_thread(void); 79static void ext4_destroy_lazyinit_thread(void);
79static void ext4_unregister_li_request(struct super_block *sb); 80static void ext4_unregister_li_request(struct super_block *sb);
81static void ext4_clear_request_list(void);
80 82
81#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23) 83#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
82static struct file_system_type ext3_fs_type = { 84static struct file_system_type ext3_fs_type = {
@@ -240,27 +242,44 @@ static void ext4_put_nojournal(handle_t *handle)
240 * journal_end calls result in the superblock being marked dirty, so 242 * journal_end calls result in the superblock being marked dirty, so
241 * that sync() will call the filesystem's write_super callback if 243 * that sync() will call the filesystem's write_super callback if
242 * appropriate. 244 * appropriate.
245 *
246 * To avoid j_barrier hold in userspace when a user calls freeze(),
247 * ext4 prevents a new handle from being started by s_frozen, which
248 * is in an upper layer.
243 */ 249 */
244handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) 250handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
245{ 251{
246 journal_t *journal; 252 journal_t *journal;
253 handle_t *handle;
247 254
248 if (sb->s_flags & MS_RDONLY) 255 if (sb->s_flags & MS_RDONLY)
249 return ERR_PTR(-EROFS); 256 return ERR_PTR(-EROFS);
250 257
251 vfs_check_frozen(sb, SB_FREEZE_TRANS);
252 /* Special case here: if the journal has aborted behind our
253 * backs (eg. EIO in the commit thread), then we still need to
254 * take the FS itself readonly cleanly. */
255 journal = EXT4_SB(sb)->s_journal; 258 journal = EXT4_SB(sb)->s_journal;
256 if (journal) { 259 handle = ext4_journal_current_handle();
257 if (is_journal_aborted(journal)) { 260
258 ext4_abort(sb, "Detected aborted journal"); 261 /*
259 return ERR_PTR(-EROFS); 262 * If a handle has been started, it should be allowed to
260 } 263 * finish, otherwise deadlock could happen between freeze
261 return jbd2_journal_start(journal, nblocks); 264 * and others(e.g. truncate) due to the restart of the
265 * journal handle if the filesystem is forzen and active
266 * handles are not stopped.
267 */
268 if (!handle)
269 vfs_check_frozen(sb, SB_FREEZE_TRANS);
270
271 if (!journal)
272 return ext4_get_nojournal();
273 /*
274 * Special case here: if the journal has aborted behind our
275 * backs (eg. EIO in the commit thread), then we still need to
276 * take the FS itself readonly cleanly.
277 */
278 if (is_journal_aborted(journal)) {
279 ext4_abort(sb, "Detected aborted journal");
280 return ERR_PTR(-EROFS);
262 } 281 }
263 return ext4_get_nojournal(); 282 return jbd2_journal_start(journal, nblocks);
264} 283}
265 284
266/* 285/*
@@ -593,7 +612,7 @@ __acquires(bitlock)
593 612
594 vaf.fmt = fmt; 613 vaf.fmt = fmt;
595 vaf.va = &args; 614 vaf.va = &args;
596 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", 615 printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u, ",
597 sb->s_id, function, line, grp); 616 sb->s_id, function, line, grp);
598 if (ino) 617 if (ino)
599 printk(KERN_CONT "inode %lu: ", ino); 618 printk(KERN_CONT "inode %lu: ", ino);
@@ -615,7 +634,7 @@ __acquires(bitlock)
615 * filesystem will have already been marked read/only and the 634 * filesystem will have already been marked read/only and the
616 * journal has been aborted. We return 1 as a hint to callers 635 * journal has been aborted. We return 1 as a hint to callers
617 * who might what to use the return value from 636 * who might what to use the return value from
618 * ext4_grp_locked_error() to distinguish beween the 637 * ext4_grp_locked_error() to distinguish between the
619 * ERRORS_CONT and ERRORS_RO case, and perhaps return more 638 * ERRORS_CONT and ERRORS_RO case, and perhaps return more
620 * aggressively from the ext4 function in question, with a 639 * aggressively from the ext4 function in question, with a
621 * more appropriate error code. 640 * more appropriate error code.
@@ -832,6 +851,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
832 ei->i_sync_tid = 0; 851 ei->i_sync_tid = 0;
833 ei->i_datasync_tid = 0; 852 ei->i_datasync_tid = 0;
834 atomic_set(&ei->i_ioend_count, 0); 853 atomic_set(&ei->i_ioend_count, 0);
854 atomic_set(&ei->i_aiodio_unwritten, 0);
835 855
836 return &ei->vfs_inode; 856 return &ei->vfs_inode;
837} 857}
@@ -995,13 +1015,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
995 if (test_opt(sb, OLDALLOC)) 1015 if (test_opt(sb, OLDALLOC))
996 seq_puts(seq, ",oldalloc"); 1016 seq_puts(seq, ",oldalloc");
997#ifdef CONFIG_EXT4_FS_XATTR 1017#ifdef CONFIG_EXT4_FS_XATTR
998 if (test_opt(sb, XATTR_USER) && 1018 if (test_opt(sb, XATTR_USER))
999 !(def_mount_opts & EXT4_DEFM_XATTR_USER))
1000 seq_puts(seq, ",user_xattr"); 1019 seq_puts(seq, ",user_xattr");
1001 if (!test_opt(sb, XATTR_USER) && 1020 if (!test_opt(sb, XATTR_USER))
1002 (def_mount_opts & EXT4_DEFM_XATTR_USER)) {
1003 seq_puts(seq, ",nouser_xattr"); 1021 seq_puts(seq, ",nouser_xattr");
1004 }
1005#endif 1022#endif
1006#ifdef CONFIG_EXT4_FS_POSIX_ACL 1023#ifdef CONFIG_EXT4_FS_POSIX_ACL
1007 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) 1024 if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
@@ -1039,8 +1056,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
1039 !(def_mount_opts & EXT4_DEFM_NODELALLOC)) 1056 !(def_mount_opts & EXT4_DEFM_NODELALLOC))
1040 seq_puts(seq, ",nodelalloc"); 1057 seq_puts(seq, ",nodelalloc");
1041 1058
1042 if (test_opt(sb, MBLK_IO_SUBMIT)) 1059 if (!test_opt(sb, MBLK_IO_SUBMIT))
1043 seq_puts(seq, ",mblk_io_submit"); 1060 seq_puts(seq, ",nomblk_io_submit");
1044 if (sbi->s_stripe) 1061 if (sbi->s_stripe)
1045 seq_printf(seq, ",stripe=%lu", sbi->s_stripe); 1062 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
1046 /* 1063 /*
@@ -1449,7 +1466,7 @@ static int parse_options(char *options, struct super_block *sb,
1449 * Initialize args struct so we know whether arg was 1466 * Initialize args struct so we know whether arg was
1450 * found; some options take optional arguments. 1467 * found; some options take optional arguments.
1451 */ 1468 */
1452 args[0].to = args[0].from = 0; 1469 args[0].to = args[0].from = NULL;
1453 token = match_token(p, tokens, args); 1470 token = match_token(p, tokens, args);
1454 switch (token) { 1471 switch (token) {
1455 case Opt_bsd_df: 1472 case Opt_bsd_df:
@@ -1769,7 +1786,7 @@ set_qf_format:
1769 return 0; 1786 return 0;
1770 if (option < 0 || option > (1 << 30)) 1787 if (option < 0 || option > (1 << 30))
1771 return 0; 1788 return 0;
1772 if (!is_power_of_2(option)) { 1789 if (option && !is_power_of_2(option)) {
1773 ext4_msg(sb, KERN_ERR, 1790 ext4_msg(sb, KERN_ERR,
1774 "EXT4-fs: inode_readahead_blks" 1791 "EXT4-fs: inode_readahead_blks"
1775 " must be a power of 2"); 1792 " must be a power of 2");
@@ -2118,6 +2135,13 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2118 return; 2135 return;
2119 } 2136 }
2120 2137
2138 /* Check if feature set would not allow a r/w mount */
2139 if (!ext4_feature_set_ok(sb, 0)) {
2140 ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to "
2141 "unknown ROCOMPAT features");
2142 return;
2143 }
2144
2121 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 2145 if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
2122 if (es->s_last_orphan) 2146 if (es->s_last_orphan)
2123 jbd_debug(1, "Errors on filesystem, " 2147 jbd_debug(1, "Errors on filesystem, "
@@ -2410,7 +2434,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2410 if (parse_strtoul(buf, 0x40000000, &t)) 2434 if (parse_strtoul(buf, 0x40000000, &t))
2411 return -EINVAL; 2435 return -EINVAL;
2412 2436
2413 if (!is_power_of_2(t)) 2437 if (t && !is_power_of_2(t))
2414 return -EINVAL; 2438 return -EINVAL;
2415 2439
2416 sbi->s_inode_readahead_blks = t; 2440 sbi->s_inode_readahead_blks = t;
@@ -2716,6 +2740,8 @@ static void ext4_unregister_li_request(struct super_block *sb)
2716 mutex_unlock(&ext4_li_info->li_list_mtx); 2740 mutex_unlock(&ext4_li_info->li_list_mtx);
2717} 2741}
2718 2742
2743static struct task_struct *ext4_lazyinit_task;
2744
2719/* 2745/*
2720 * This is the function where ext4lazyinit thread lives. It walks 2746 * This is the function where ext4lazyinit thread lives. It walks
2721 * through the request list searching for next scheduled filesystem. 2747 * through the request list searching for next scheduled filesystem.
@@ -2784,6 +2810,10 @@ cont_thread:
2784 if (time_before(jiffies, next_wakeup)) 2810 if (time_before(jiffies, next_wakeup))
2785 schedule(); 2811 schedule();
2786 finish_wait(&eli->li_wait_daemon, &wait); 2812 finish_wait(&eli->li_wait_daemon, &wait);
2813 if (kthread_should_stop()) {
2814 ext4_clear_request_list();
2815 goto exit_thread;
2816 }
2787 } 2817 }
2788 2818
2789exit_thread: 2819exit_thread:
@@ -2808,6 +2838,7 @@ exit_thread:
2808 wake_up(&eli->li_wait_task); 2838 wake_up(&eli->li_wait_task);
2809 2839
2810 kfree(ext4_li_info); 2840 kfree(ext4_li_info);
2841 ext4_lazyinit_task = NULL;
2811 ext4_li_info = NULL; 2842 ext4_li_info = NULL;
2812 mutex_unlock(&ext4_li_mtx); 2843 mutex_unlock(&ext4_li_mtx);
2813 2844
@@ -2830,11 +2861,10 @@ static void ext4_clear_request_list(void)
2830 2861
2831static int ext4_run_lazyinit_thread(void) 2862static int ext4_run_lazyinit_thread(void)
2832{ 2863{
2833 struct task_struct *t; 2864 ext4_lazyinit_task = kthread_run(ext4_lazyinit_thread,
2834 2865 ext4_li_info, "ext4lazyinit");
2835 t = kthread_run(ext4_lazyinit_thread, ext4_li_info, "ext4lazyinit"); 2866 if (IS_ERR(ext4_lazyinit_task)) {
2836 if (IS_ERR(t)) { 2867 int err = PTR_ERR(ext4_lazyinit_task);
2837 int err = PTR_ERR(t);
2838 ext4_clear_request_list(); 2868 ext4_clear_request_list();
2839 del_timer_sync(&ext4_li_info->li_timer); 2869 del_timer_sync(&ext4_li_info->li_timer);
2840 kfree(ext4_li_info); 2870 kfree(ext4_li_info);
@@ -2962,6 +2992,12 @@ static int ext4_register_li_request(struct super_block *sb,
2962 mutex_unlock(&ext4_li_info->li_list_mtx); 2992 mutex_unlock(&ext4_li_info->li_list_mtx);
2963 2993
2964 sbi->s_li_request = elr; 2994 sbi->s_li_request = elr;
2995 /*
2996 * set elr to NULL here since it has been inserted to
2997 * the request_list and the removal and free of it is
2998 * handled by ext4_clear_request_list from now on.
2999 */
3000 elr = NULL;
2965 3001
2966 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) { 3002 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2967 ret = ext4_run_lazyinit_thread(); 3003 ret = ext4_run_lazyinit_thread();
@@ -2985,16 +3021,10 @@ static void ext4_destroy_lazyinit_thread(void)
2985 * If thread exited earlier 3021 * If thread exited earlier
2986 * there's nothing to be done. 3022 * there's nothing to be done.
2987 */ 3023 */
2988 if (!ext4_li_info) 3024 if (!ext4_li_info || !ext4_lazyinit_task)
2989 return; 3025 return;
2990 3026
2991 ext4_clear_request_list(); 3027 kthread_stop(ext4_lazyinit_task);
2992
2993 while (ext4_li_info->li_task) {
2994 wake_up(&ext4_li_info->li_wait_daemon);
2995 wait_event(ext4_li_info->li_wait_task,
2996 ext4_li_info->li_task == NULL);
2997 }
2998} 3028}
2999 3029
3000static int ext4_fill_super(struct super_block *sb, void *data, int silent) 3030static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -3093,14 +3123,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3093 } 3123 }
3094 if (def_mount_opts & EXT4_DEFM_UID16) 3124 if (def_mount_opts & EXT4_DEFM_UID16)
3095 set_opt(sb, NO_UID32); 3125 set_opt(sb, NO_UID32);
3126 /* xattr user namespace & acls are now defaulted on */
3096#ifdef CONFIG_EXT4_FS_XATTR 3127#ifdef CONFIG_EXT4_FS_XATTR
3097 if (def_mount_opts & EXT4_DEFM_XATTR_USER) 3128 set_opt(sb, XATTR_USER);
3098 set_opt(sb, XATTR_USER);
3099#endif 3129#endif
3100#ifdef CONFIG_EXT4_FS_POSIX_ACL 3130#ifdef CONFIG_EXT4_FS_POSIX_ACL
3101 if (def_mount_opts & EXT4_DEFM_ACL) 3131 set_opt(sb, POSIX_ACL);
3102 set_opt(sb, POSIX_ACL);
3103#endif 3132#endif
3133 set_opt(sb, MBLK_IO_SUBMIT);
3104 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) 3134 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA)
3105 set_opt(sb, JOURNAL_DATA); 3135 set_opt(sb, JOURNAL_DATA);
3106 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED) 3136 else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_ORDERED)
@@ -3378,6 +3408,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3378 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 3408 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3379 spin_lock_init(&sbi->s_next_gen_lock); 3409 spin_lock_init(&sbi->s_next_gen_lock);
3380 3410
3411 init_timer(&sbi->s_err_report);
3412 sbi->s_err_report.function = print_daily_error_info;
3413 sbi->s_err_report.data = (unsigned long) sb;
3414
3381 err = percpu_counter_init(&sbi->s_freeblocks_counter, 3415 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3382 ext4_count_free_blocks(sb)); 3416 ext4_count_free_blocks(sb));
3383 if (!err) { 3417 if (!err) {
@@ -3413,6 +3447,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3413 sb->s_qcop = &ext4_qctl_operations; 3447 sb->s_qcop = &ext4_qctl_operations;
3414 sb->dq_op = &ext4_quota_operations; 3448 sb->dq_op = &ext4_quota_operations;
3415#endif 3449#endif
3450 memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
3451
3416 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ 3452 INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
3417 mutex_init(&sbi->s_orphan_lock); 3453 mutex_init(&sbi->s_orphan_lock);
3418 mutex_init(&sbi->s_resize_lock); 3454 mutex_init(&sbi->s_resize_lock);
@@ -3507,7 +3543,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3507 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0); 3543 percpu_counter_set(&sbi->s_dirtyblocks_counter, 0);
3508 3544
3509no_journal: 3545no_journal:
3510 EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); 3546 /*
3547 * The maximum number of concurrent works can be high and
3548 * concurrency isn't really necessary. Limit it to 1.
3549 */
3550 EXT4_SB(sb)->dio_unwritten_wq =
3551 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3511 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3552 if (!EXT4_SB(sb)->dio_unwritten_wq) {
3512 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3553 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
3513 goto failed_mount_wq; 3554 goto failed_mount_wq;
@@ -3522,17 +3563,16 @@ no_journal:
3522 if (IS_ERR(root)) { 3563 if (IS_ERR(root)) {
3523 ext4_msg(sb, KERN_ERR, "get root inode failed"); 3564 ext4_msg(sb, KERN_ERR, "get root inode failed");
3524 ret = PTR_ERR(root); 3565 ret = PTR_ERR(root);
3566 root = NULL;
3525 goto failed_mount4; 3567 goto failed_mount4;
3526 } 3568 }
3527 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) { 3569 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
3528 iput(root);
3529 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck"); 3570 ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
3530 goto failed_mount4; 3571 goto failed_mount4;
3531 } 3572 }
3532 sb->s_root = d_alloc_root(root); 3573 sb->s_root = d_alloc_root(root);
3533 if (!sb->s_root) { 3574 if (!sb->s_root) {
3534 ext4_msg(sb, KERN_ERR, "get root dentry failed"); 3575 ext4_msg(sb, KERN_ERR, "get root dentry failed");
3535 iput(root);
3536 ret = -ENOMEM; 3576 ret = -ENOMEM;
3537 goto failed_mount4; 3577 goto failed_mount4;
3538 } 3578 }
@@ -3633,9 +3673,6 @@ no_journal:
3633 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, 3673 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3634 *sbi->s_es->s_mount_opts ? "; " : "", orig_data); 3674 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3635 3675
3636 init_timer(&sbi->s_err_report);
3637 sbi->s_err_report.function = print_daily_error_info;
3638 sbi->s_err_report.data = (unsigned long) sb;
3639 if (es->s_error_count) 3676 if (es->s_error_count)
3640 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ 3677 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3641 3678
@@ -3648,6 +3685,8 @@ cantfind_ext4:
3648 goto failed_mount; 3685 goto failed_mount;
3649 3686
3650failed_mount4: 3687failed_mount4:
3688 iput(root);
3689 sb->s_root = NULL;
3651 ext4_msg(sb, KERN_ERR, "mount failed"); 3690 ext4_msg(sb, KERN_ERR, "mount failed");
3652 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 3691 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
3653failed_mount_wq: 3692failed_mount_wq:
@@ -3657,6 +3696,7 @@ failed_mount_wq:
3657 sbi->s_journal = NULL; 3696 sbi->s_journal = NULL;
3658 } 3697 }
3659failed_mount3: 3698failed_mount3:
3699 del_timer(&sbi->s_err_report);
3660 if (sbi->s_flex_groups) { 3700 if (sbi->s_flex_groups) {
3661 if (is_vmalloc_addr(sbi->s_flex_groups)) 3701 if (is_vmalloc_addr(sbi->s_flex_groups))
3662 vfree(sbi->s_flex_groups); 3702 vfree(sbi->s_flex_groups);
@@ -4123,6 +4163,11 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4123/* 4163/*
4124 * LVM calls this function before a (read-only) snapshot is created. This 4164 * LVM calls this function before a (read-only) snapshot is created. This
4125 * gives us a chance to flush the journal completely and mark the fs clean. 4165 * gives us a chance to flush the journal completely and mark the fs clean.
4166 *
4167 * Note that only this function cannot bring a filesystem to be in a clean
4168 * state independently, because ext4 prevents a new handle from being started
4169 * by @sb->s_frozen, which stays in an upper layer. It thus needs help from
4170 * the upper layer.
4126 */ 4171 */
4127static int ext4_freeze(struct super_block *sb) 4172static int ext4_freeze(struct super_block *sb)
4128{ 4173{
@@ -4599,17 +4644,30 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
4599 4644
4600static int ext4_quota_off(struct super_block *sb, int type) 4645static int ext4_quota_off(struct super_block *sb, int type)
4601{ 4646{
4647 struct inode *inode = sb_dqopt(sb)->files[type];
4648 handle_t *handle;
4649
4602 /* Force all delayed allocation blocks to be allocated. 4650 /* Force all delayed allocation blocks to be allocated.
4603 * Caller already holds s_umount sem */ 4651 * Caller already holds s_umount sem */
4604 if (test_opt(sb, DELALLOC)) 4652 if (test_opt(sb, DELALLOC))
4605 sync_filesystem(sb); 4653 sync_filesystem(sb);
4606 4654
4655 /* Update modification times of quota files when userspace can
4656 * start looking at them */
4657 handle = ext4_journal_start(inode, 1);
4658 if (IS_ERR(handle))
4659 goto out;
4660 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4661 ext4_mark_inode_dirty(handle, inode);
4662 ext4_journal_stop(handle);
4663
4664out:
4607 return dquot_quota_off(sb, type); 4665 return dquot_quota_off(sb, type);
4608} 4666}
4609 4667
4610/* Read data from quotafile - avoid pagecache and such because we cannot afford 4668/* Read data from quotafile - avoid pagecache and such because we cannot afford
4611 * acquiring the locks... As quota files are never truncated and quota code 4669 * acquiring the locks... As quota files are never truncated and quota code
4612 * itself serializes the operations (and noone else should touch the files) 4670 * itself serializes the operations (and no one else should touch the files)
4613 * we don't have to be afraid of races */ 4671 * we don't have to be afraid of races */
4614static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 4672static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
4615 size_t len, loff_t off) 4673 size_t len, loff_t off)
@@ -4699,9 +4757,8 @@ out:
4699 if (inode->i_size < off + len) { 4757 if (inode->i_size < off + len) {
4700 i_size_write(inode, off + len); 4758 i_size_write(inode, off + len);
4701 EXT4_I(inode)->i_disksize = inode->i_size; 4759 EXT4_I(inode)->i_disksize = inode->i_size;
4760 ext4_mark_inode_dirty(handle, inode);
4702 } 4761 }
4703 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4704 ext4_mark_inode_dirty(handle, inode);
4705 mutex_unlock(&inode->i_mutex); 4762 mutex_unlock(&inode->i_mutex);
4706 return len; 4763 return len;
4707} 4764}
@@ -4768,7 +4825,7 @@ static struct file_system_type ext4_fs_type = {
4768 .fs_flags = FS_REQUIRES_DEV, 4825 .fs_flags = FS_REQUIRES_DEV,
4769}; 4826};
4770 4827
4771int __init ext4_init_feat_adverts(void) 4828static int __init ext4_init_feat_adverts(void)
4772{ 4829{
4773 struct ext4_features *ef; 4830 struct ext4_features *ef;
4774 int ret = -ENOMEM; 4831 int ret = -ENOMEM;
@@ -4792,23 +4849,44 @@ out:
4792 return ret; 4849 return ret;
4793} 4850}
4794 4851
4852static void ext4_exit_feat_adverts(void)
4853{
4854 kobject_put(&ext4_feat->f_kobj);
4855 wait_for_completion(&ext4_feat->f_kobj_unregister);
4856 kfree(ext4_feat);
4857}
4858
4859/* Shared across all ext4 file systems */
4860wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
4861struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
4862
4795static int __init ext4_init_fs(void) 4863static int __init ext4_init_fs(void)
4796{ 4864{
4797 int err; 4865 int i, err;
4798 4866
4799 ext4_check_flag_values(); 4867 ext4_check_flag_values();
4868
4869 for (i = 0; i < EXT4_WQ_HASH_SZ; i++) {
4870 mutex_init(&ext4__aio_mutex[i]);
4871 init_waitqueue_head(&ext4__ioend_wq[i]);
4872 }
4873
4800 err = ext4_init_pageio(); 4874 err = ext4_init_pageio();
4801 if (err) 4875 if (err)
4802 return err; 4876 return err;
4803 err = ext4_init_system_zone(); 4877 err = ext4_init_system_zone();
4804 if (err) 4878 if (err)
4805 goto out5; 4879 goto out7;
4806 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); 4880 ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
4807 if (!ext4_kset) 4881 if (!ext4_kset)
4808 goto out4; 4882 goto out6;
4809 ext4_proc_root = proc_mkdir("fs/ext4", NULL); 4883 ext4_proc_root = proc_mkdir("fs/ext4", NULL);
4884 if (!ext4_proc_root)
4885 goto out5;
4810 4886
4811 err = ext4_init_feat_adverts(); 4887 err = ext4_init_feat_adverts();
4888 if (err)
4889 goto out4;
4812 4890
4813 err = ext4_init_mballoc(); 4891 err = ext4_init_mballoc();
4814 if (err) 4892 if (err)
@@ -4838,12 +4916,14 @@ out1:
4838out2: 4916out2:
4839 ext4_exit_mballoc(); 4917 ext4_exit_mballoc();
4840out3: 4918out3:
4841 kfree(ext4_feat); 4919 ext4_exit_feat_adverts();
4920out4:
4842 remove_proc_entry("fs/ext4", NULL); 4921 remove_proc_entry("fs/ext4", NULL);
4922out5:
4843 kset_unregister(ext4_kset); 4923 kset_unregister(ext4_kset);
4844out4: 4924out6:
4845 ext4_exit_system_zone(); 4925 ext4_exit_system_zone();
4846out5: 4926out7:
4847 ext4_exit_pageio(); 4927 ext4_exit_pageio();
4848 return err; 4928 return err;
4849} 4929}
@@ -4857,6 +4937,7 @@ static void __exit ext4_exit_fs(void)
4857 destroy_inodecache(); 4937 destroy_inodecache();
4858 ext4_exit_xattr(); 4938 ext4_exit_xattr();
4859 ext4_exit_mballoc(); 4939 ext4_exit_mballoc();
4940 ext4_exit_feat_adverts();
4860 remove_proc_entry("fs/ext4", NULL); 4941 remove_proc_entry("fs/ext4", NULL);
4861 kset_unregister(ext4_kset); 4942 kset_unregister(ext4_kset);
4862 ext4_exit_system_zone(); 4943 ext4_exit_system_zone();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fc32176eee39..b545ca1c459c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -735,7 +735,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
735 int offset = (char *)s->here - bs->bh->b_data; 735 int offset = (char *)s->here - bs->bh->b_data;
736 736
737 unlock_buffer(bs->bh); 737 unlock_buffer(bs->bh);
738 jbd2_journal_release_buffer(handle, bs->bh); 738 ext4_handle_release_buffer(handle, bs->bh);
739 if (ce) { 739 if (ce) {
740 mb_cache_entry_release(ce); 740 mb_cache_entry_release(ce);
741 ce = NULL; 741 ce = NULL;
@@ -833,7 +833,7 @@ inserted:
833 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
834 if (!new_bh) { 834 if (!new_bh) {
835getblk_failed: 835getblk_failed:
836 ext4_free_blocks(handle, inode, 0, block, 1, 836 ext4_free_blocks(handle, inode, NULL, block, 1,
837 EXT4_FREE_BLOCKS_METADATA); 837 EXT4_FREE_BLOCKS_METADATA);
838 error = -EIO; 838 error = -EIO;
839 goto cleanup; 839 goto cleanup;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1ef16520b950..25b7387ff183 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -145,10 +145,10 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
145 145
146#ifdef CONFIG_EXT4_FS_SECURITY 146#ifdef CONFIG_EXT4_FS_SECURITY
147extern int ext4_init_security(handle_t *handle, struct inode *inode, 147extern int ext4_init_security(handle_t *handle, struct inode *inode,
148 struct inode *dir); 148 struct inode *dir, const struct qstr *qstr);
149#else 149#else
150static inline int ext4_init_security(handle_t *handle, struct inode *inode, 150static inline int ext4_init_security(handle_t *handle, struct inode *inode,
151 struct inode *dir) 151 struct inode *dir, const struct qstr *qstr)
152{ 152{
153 return 0; 153 return 0;
154} 154}
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 9b21268e121c..007c3bfbf094 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -49,14 +49,15 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name,
49} 49}
50 50
51int 51int
52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir) 52ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
53 const struct qstr *qstr)
53{ 54{
54 int err; 55 int err;
55 size_t len; 56 size_t len;
56 void *value; 57 void *value;
57 char *name; 58 char *name;
58 59
59 err = security_inode_init_security(inode, dir, &name, &value, &len); 60 err = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
60 if (err) { 61 if (err) {
61 if (err == -EOPNOTSUPP) 62 if (err == -EOPNOTSUPP)
62 return 0; 63 return 0;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..8d68690bdcf1 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -236,7 +236,6 @@ static const struct address_space_operations fat_aops = {
236 .readpages = fat_readpages, 236 .readpages = fat_readpages,
237 .writepage = fat_writepage, 237 .writepage = fat_writepage,
238 .writepages = fat_writepages, 238 .writepages = fat_writepages,
239 .sync_page = block_sync_page,
240 .write_begin = fat_write_begin, 239 .write_begin = fat_write_begin,
241 .write_end = fat_write_end, 240 .write_end = fat_write_end,
242 .direct_IO = fat_direct_IO, 241 .direct_IO = fat_direct_IO,
@@ -757,8 +756,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
757 struct inode *inode = de->d_inode; 756 struct inode *inode = de->d_inode;
758 u32 ipos_h, ipos_m, ipos_l; 757 u32 ipos_h, ipos_m, ipos_l;
759 758
760 if (len < 5) 759 if (len < 5) {
760 *lenp = 5;
761 return 255; /* no room */ 761 return 255; /* no room */
762 }
762 763
763 ipos_h = MSDOS_I(inode)->i_pos >> 8; 764 ipos_h = MSDOS_I(inode)->i_pos >> 8;
764 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; 765 ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
43 43
44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd) 44static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
45{ 45{
46 if (nd->flags & LOOKUP_RCU) 46 if (nd && nd->flags & LOOKUP_RCU)
47 return -ECHILD; 47 return -ECHILD;
48 48
49 /* This is not negative dentry. Always valid. */ 49 /* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
54 54
55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd) 55static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
56{ 56{
57 if (nd->flags & LOOKUP_RCU) 57 if (nd && nd->flags & LOOKUP_RCU)
58 return -ECHILD; 58 return -ECHILD;
59 59
60 /* 60 /*
diff --git a/fs/fcntl.c b/fs/fcntl.c
index ecc8b3954ed6..22764c7c8382 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
131SYSCALL_DEFINE1(dup, unsigned int, fildes) 131SYSCALL_DEFINE1(dup, unsigned int, fildes)
132{ 132{
133 int ret = -EBADF; 133 int ret = -EBADF;
134 struct file *file = fget(fildes); 134 struct file *file = fget_raw(fildes);
135 135
136 if (file) { 136 if (file) {
137 ret = get_unused_fd(); 137 ret = get_unused_fd();
@@ -159,7 +159,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
159 159
160 /* O_NOATIME can only be set by the owner or superuser */ 160 /* O_NOATIME can only be set by the owner or superuser */
161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) 161 if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME))
162 if (!is_owner_or_cap(inode)) 162 if (!inode_owner_or_capable(inode))
163 return -EPERM; 163 return -EPERM;
164 164
165 /* required for strict SunOS emulation */ 165 /* required for strict SunOS emulation */
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
426 return err; 426 return err;
427} 427}
428 428
429static int check_fcntl_cmd(unsigned cmd)
430{
431 switch (cmd) {
432 case F_DUPFD:
433 case F_DUPFD_CLOEXEC:
434 case F_GETFD:
435 case F_SETFD:
436 case F_GETFL:
437 return 1;
438 }
439 return 0;
440}
441
429SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) 442SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
430{ 443{
431 struct file *filp; 444 struct file *filp;
432 long err = -EBADF; 445 long err = -EBADF;
433 446
434 filp = fget(fd); 447 filp = fget_raw(fd);
435 if (!filp) 448 if (!filp)
436 goto out; 449 goto out;
437 450
451 if (unlikely(filp->f_mode & FMODE_PATH)) {
452 if (!check_fcntl_cmd(cmd)) {
453 fput(filp);
454 goto out;
455 }
456 }
457
438 err = security_file_fcntl(filp, cmd, arg); 458 err = security_file_fcntl(filp, cmd, arg);
439 if (err) { 459 if (err) {
440 fput(filp); 460 fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
456 long err; 476 long err;
457 477
458 err = -EBADF; 478 err = -EBADF;
459 filp = fget(fd); 479 filp = fget_raw(fd);
460 if (!filp) 480 if (!filp)
461 goto out; 481 goto out;
462 482
483 if (unlikely(filp->f_mode & FMODE_PATH)) {
484 if (!check_fcntl_cmd(cmd)) {
485 fput(filp);
486 goto out;
487 }
488 }
489
463 err = security_file_fcntl(filp, cmd, arg); 490 err = security_file_fcntl(filp, cmd, arg);
464 if (err) { 491 if (err) {
465 fput(filp); 492 fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
808 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 835 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
809 * is defined as O_NONBLOCK on some platforms and not on others. 836 * is defined as O_NONBLOCK on some platforms and not on others.
810 */ 837 */
811 BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 838 BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
812 O_RDONLY | O_WRONLY | O_RDWR | 839 O_RDONLY | O_WRONLY | O_RDWR |
813 O_CREAT | O_EXCL | O_NOCTTY | 840 O_CREAT | O_EXCL | O_NOCTTY |
814 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 841 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
815 __O_SYNC | O_DSYNC | FASYNC | 842 __O_SYNC | O_DSYNC | FASYNC |
816 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 843 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
817 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 844 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
818 FMODE_EXEC 845 __FMODE_EXEC | O_PATH
819 )); 846 ));
820 847
821 fasync_cache = kmem_cache_create("fasync_cache", 848 fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 000000000000..6b088641f5bf
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,266 @@
1#include <linux/syscalls.h>
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/mount.h>
6#include <linux/namei.h>
7#include <linux/exportfs.h>
8#include <linux/fs_struct.h>
9#include <linux/fsnotify.h>
10#include <linux/personality.h>
11#include <asm/uaccess.h>
12#include "internal.h"
13
14static long do_sys_name_to_handle(struct path *path,
15 struct file_handle __user *ufh,
16 int __user *mnt_id)
17{
18 long retval;
19 struct file_handle f_handle;
20 int handle_dwords, handle_bytes;
21 struct file_handle *handle = NULL;
22
23 /*
24 * We need t make sure wether the file system
25 * support decoding of the file handle
26 */
27 if (!path->mnt->mnt_sb->s_export_op ||
28 !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
29 return -EOPNOTSUPP;
30
31 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
32 return -EFAULT;
33
34 if (f_handle.handle_bytes > MAX_HANDLE_SZ)
35 return -EINVAL;
36
37 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
38 GFP_KERNEL);
39 if (!handle)
40 return -ENOMEM;
41
42 /* convert handle size to multiple of sizeof(u32) */
43 handle_dwords = f_handle.handle_bytes >> 2;
44
45 /* we ask for a non connected handle */
46 retval = exportfs_encode_fh(path->dentry,
47 (struct fid *)handle->f_handle,
48 &handle_dwords, 0);
49 handle->handle_type = retval;
50 /* convert handle size to bytes */
51 handle_bytes = handle_dwords * sizeof(u32);
52 handle->handle_bytes = handle_bytes;
53 if ((handle->handle_bytes > f_handle.handle_bytes) ||
54 (retval == 255) || (retval == -ENOSPC)) {
55 /* As per old exportfs_encode_fh documentation
56 * we could return ENOSPC to indicate overflow
57 * But file system returned 255 always. So handle
58 * both the values
59 */
60 /*
61 * set the handle size to zero so we copy only
62 * non variable part of the file_handle
63 */
64 handle_bytes = 0;
65 retval = -EOVERFLOW;
66 } else
67 retval = 0;
68 /* copy the mount id */
69 if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
70 copy_to_user(ufh, handle,
71 sizeof(struct file_handle) + handle_bytes))
72 retval = -EFAULT;
73 kfree(handle);
74 return retval;
75}
76
77/**
78 * sys_name_to_handle_at: convert name to handle
79 * @dfd: directory relative to which name is interpreted if not absolute
80 * @name: name that should be converted to handle.
81 * @handle: resulting file handle
82 * @mnt_id: mount id of the file system containing the file
83 * @flag: flag value to indicate whether to follow symlink or not
84 *
85 * @handle->handle_size indicate the space available to store the
86 * variable part of the file handle in bytes. If there is not
87 * enough space, the field is updated to return the minimum
88 * value required.
89 */
90SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
91 struct file_handle __user *, handle, int __user *, mnt_id,
92 int, flag)
93{
94 struct path path;
95 int lookup_flags;
96 int err;
97
98 if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
99 return -EINVAL;
100
101 lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
102 if (flag & AT_EMPTY_PATH)
103 lookup_flags |= LOOKUP_EMPTY;
104 err = user_path_at(dfd, name, lookup_flags, &path);
105 if (!err) {
106 err = do_sys_name_to_handle(&path, handle, mnt_id);
107 path_put(&path);
108 }
109 return err;
110}
111
112static struct vfsmount *get_vfsmount_from_fd(int fd)
113{
114 struct path path;
115
116 if (fd == AT_FDCWD) {
117 struct fs_struct *fs = current->fs;
118 spin_lock(&fs->lock);
119 path = fs->pwd;
120 mntget(path.mnt);
121 spin_unlock(&fs->lock);
122 } else {
123 int fput_needed;
124 struct file *file = fget_light(fd, &fput_needed);
125 if (!file)
126 return ERR_PTR(-EBADF);
127 path = file->f_path;
128 mntget(path.mnt);
129 fput_light(file, fput_needed);
130 }
131 return path.mnt;
132}
133
134static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
135{
136 return 1;
137}
138
139static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
140 struct path *path)
141{
142 int retval = 0;
143 int handle_dwords;
144
145 path->mnt = get_vfsmount_from_fd(mountdirfd);
146 if (IS_ERR(path->mnt)) {
147 retval = PTR_ERR(path->mnt);
148 goto out_err;
149 }
150 /* change the handle size to multiple of sizeof(u32) */
151 handle_dwords = handle->handle_bytes >> 2;
152 path->dentry = exportfs_decode_fh(path->mnt,
153 (struct fid *)handle->f_handle,
154 handle_dwords, handle->handle_type,
155 vfs_dentry_acceptable, NULL);
156 if (IS_ERR(path->dentry)) {
157 retval = PTR_ERR(path->dentry);
158 goto out_mnt;
159 }
160 return 0;
161out_mnt:
162 mntput(path->mnt);
163out_err:
164 return retval;
165}
166
167static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
168 struct path *path)
169{
170 int retval = 0;
171 struct file_handle f_handle;
172 struct file_handle *handle = NULL;
173
174 /*
175 * With handle we don't look at the execute bit on the
176 * the directory. Ideally we would like CAP_DAC_SEARCH.
177 * But we don't have that
178 */
179 if (!capable(CAP_DAC_READ_SEARCH)) {
180 retval = -EPERM;
181 goto out_err;
182 }
183 if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
184 retval = -EFAULT;
185 goto out_err;
186 }
187 if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
188 (f_handle.handle_bytes == 0)) {
189 retval = -EINVAL;
190 goto out_err;
191 }
192 handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
193 GFP_KERNEL);
194 if (!handle) {
195 retval = -ENOMEM;
196 goto out_err;
197 }
198 /* copy the full handle */
199 if (copy_from_user(handle, ufh,
200 sizeof(struct file_handle) +
201 f_handle.handle_bytes)) {
202 retval = -EFAULT;
203 goto out_handle;
204 }
205
206 retval = do_handle_to_path(mountdirfd, handle, path);
207
208out_handle:
209 kfree(handle);
210out_err:
211 return retval;
212}
213
214long do_handle_open(int mountdirfd,
215 struct file_handle __user *ufh, int open_flag)
216{
217 long retval = 0;
218 struct path path;
219 struct file *file;
220 int fd;
221
222 retval = handle_to_path(mountdirfd, ufh, &path);
223 if (retval)
224 return retval;
225
226 fd = get_unused_fd_flags(open_flag);
227 if (fd < 0) {
228 path_put(&path);
229 return fd;
230 }
231 file = file_open_root(path.dentry, path.mnt, "", open_flag);
232 if (IS_ERR(file)) {
233 put_unused_fd(fd);
234 retval = PTR_ERR(file);
235 } else {
236 retval = fd;
237 fsnotify_open(file);
238 fd_install(fd, file);
239 }
240 path_put(&path);
241 return retval;
242}
243
244/**
245 * sys_open_by_handle_at: Open the file handle
246 * @mountdirfd: directory file descriptor
247 * @handle: file handle to be opened
248 * @flag: open flags.
249 *
250 * @mountdirfd indicate the directory file descriptor
251 * of the mount point. file handle is decoded relative
252 * to the vfsmount pointed by the @mountdirfd. @flags
253 * value is same as the open(2) flags.
254 */
255SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
256 struct file_handle __user *, handle,
257 int, flags)
258{
259 long ret;
260
261 if (force_o_largefile())
262 flags |= O_LARGEFILE;
263
264 ret = do_handle_open(mountdirfd, handle, flags);
265 return ret;
266}
diff --git a/fs/fifo.c b/fs/fifo.c
index 4e303c22d5ee..b1a524d798e7 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -66,8 +66,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
66 /* suppress POLLHUP until we have 66 /* suppress POLLHUP until we have
67 * seen a writer */ 67 * seen a writer */
68 filp->f_version = pipe->w_counter; 68 filp->f_version = pipe->w_counter;
69 } else 69 } else {
70 {
71 wait_for_partner(inode, &pipe->w_counter); 70 wait_for_partner(inode, &pipe->w_counter);
72 if(signal_pending(current)) 71 if(signal_pending(current))
73 goto err_rd; 72 goto err_rd;
diff --git a/fs/file_table.c b/fs/file_table.c
index c3e89adf53c0..01e4c1e8e6b6 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -125,13 +125,13 @@ struct file *get_empty_filp(void)
125 goto fail; 125 goto fail;
126 126
127 percpu_counter_inc(&nr_files); 127 percpu_counter_inc(&nr_files);
128 f->f_cred = get_cred(cred);
128 if (security_file_alloc(f)) 129 if (security_file_alloc(f))
129 goto fail_sec; 130 goto fail_sec;
130 131
131 INIT_LIST_HEAD(&f->f_u.fu_list); 132 INIT_LIST_HEAD(&f->f_u.fu_list);
132 atomic_long_set(&f->f_count, 1); 133 atomic_long_set(&f->f_count, 1);
133 rwlock_init(&f->f_owner.lock); 134 rwlock_init(&f->f_owner.lock);
134 f->f_cred = get_cred(cred);
135 spin_lock_init(&f->f_lock); 135 spin_lock_init(&f->f_lock);
136 eventpoll_init_file(f); 136 eventpoll_init_file(f);
137 /* f->f_version: 0 */ 137 /* f->f_version: 0 */
@@ -190,7 +190,8 @@ struct file *alloc_file(struct path *path, fmode_t mode,
190 file_take_write(file); 190 file_take_write(file);
191 WARN_ON(mnt_clone_write(path->mnt)); 191 WARN_ON(mnt_clone_write(path->mnt));
192 } 192 }
193 ima_counts_get(file); 193 if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
194 i_readcount_inc(path->dentry->d_inode);
194 return file; 195 return file;
195} 196}
196EXPORT_SYMBOL(alloc_file); 197EXPORT_SYMBOL(alloc_file);
@@ -246,11 +247,15 @@ static void __fput(struct file *file)
246 file->f_op->release(inode, file); 247 file->f_op->release(inode, file);
247 security_file_free(file); 248 security_file_free(file);
248 ima_file_free(file); 249 ima_file_free(file);
249 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 250 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
251 !(file->f_mode & FMODE_PATH))) {
250 cdev_put(inode->i_cdev); 252 cdev_put(inode->i_cdev);
253 }
251 fops_put(file->f_op); 254 fops_put(file->f_op);
252 put_pid(file->f_owner.pid); 255 put_pid(file->f_owner.pid);
253 file_sb_list_del(file); 256 file_sb_list_del(file);
257 if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
258 i_readcount_dec(inode);
254 if (file->f_mode & FMODE_WRITE) 259 if (file->f_mode & FMODE_WRITE)
255 drop_file_write_access(file); 260 drop_file_write_access(file);
256 file->f_path.dentry = NULL; 261 file->f_path.dentry = NULL;
@@ -276,11 +281,10 @@ struct file *fget(unsigned int fd)
276 rcu_read_lock(); 281 rcu_read_lock();
277 file = fcheck_files(files, fd); 282 file = fcheck_files(files, fd);
278 if (file) { 283 if (file) {
279 if (!atomic_long_inc_not_zero(&file->f_count)) { 284 /* File object ref couldn't be taken */
280 /* File object ref couldn't be taken */ 285 if (file->f_mode & FMODE_PATH ||
281 rcu_read_unlock(); 286 !atomic_long_inc_not_zero(&file->f_count))
282 return NULL; 287 file = NULL;
283 }
284 } 288 }
285 rcu_read_unlock(); 289 rcu_read_unlock();
286 290
@@ -289,6 +293,25 @@ struct file *fget(unsigned int fd)
289 293
290EXPORT_SYMBOL(fget); 294EXPORT_SYMBOL(fget);
291 295
296struct file *fget_raw(unsigned int fd)
297{
298 struct file *file;
299 struct files_struct *files = current->files;
300
301 rcu_read_lock();
302 file = fcheck_files(files, fd);
303 if (file) {
304 /* File object ref couldn't be taken */
305 if (!atomic_long_inc_not_zero(&file->f_count))
306 file = NULL;
307 }
308 rcu_read_unlock();
309
310 return file;
311}
312
313EXPORT_SYMBOL(fget_raw);
314
292/* 315/*
293 * Lightweight file lookup - no refcnt increment if fd table isn't shared. 316 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
294 * 317 *
@@ -313,6 +336,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
313 *fput_needed = 0; 336 *fput_needed = 0;
314 if (atomic_read(&files->count) == 1) { 337 if (atomic_read(&files->count) == 1) {
315 file = fcheck_files(files, fd); 338 file = fcheck_files(files, fd);
339 if (file && (file->f_mode & FMODE_PATH))
340 file = NULL;
341 } else {
342 rcu_read_lock();
343 file = fcheck_files(files, fd);
344 if (file) {
345 if (!(file->f_mode & FMODE_PATH) &&
346 atomic_long_inc_not_zero(&file->f_count))
347 *fput_needed = 1;
348 else
349 /* Didn't get the reference, someone's freed */
350 file = NULL;
351 }
352 rcu_read_unlock();
353 }
354
355 return file;
356}
357
358struct file *fget_raw_light(unsigned int fd, int *fput_needed)
359{
360 struct file *file;
361 struct files_struct *files = current->files;
362
363 *fput_needed = 0;
364 if (atomic_read(&files->count) == 1) {
365 file = fcheck_files(files, fd);
316 } else { 366 } else {
317 rcu_read_lock(); 367 rcu_read_lock();
318 file = fcheck_files(files, fd); 368 file = fcheck_files(files, fd);
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 751d6b255a12..0845f84f2a5f 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -110,14 +110,13 @@ int unregister_filesystem(struct file_system_type * fs)
110 *tmp = fs->next; 110 *tmp = fs->next;
111 fs->next = NULL; 111 fs->next = NULL;
112 write_unlock(&file_systems_lock); 112 write_unlock(&file_systems_lock);
113 synchronize_rcu();
113 return 0; 114 return 0;
114 } 115 }
115 tmp = &(*tmp)->next; 116 tmp = &(*tmp)->next;
116 } 117 }
117 write_unlock(&file_systems_lock); 118 write_unlock(&file_systems_lock);
118 119
119 synchronize_rcu();
120
121 return -EINVAL; 120 return -EINVAL;
122} 121}
123 122
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index 78948b4b1894..c9a6a94e58e9 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -164,7 +164,7 @@ vxfs_read_fshead(struct super_block *sbp)
164 goto out_free_pfp; 164 goto out_free_pfp;
165 } 165 }
166 if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) { 166 if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) {
167 printk(KERN_ERR "vxfs: structual list inode is of wrong type (%x)\n", 167 printk(KERN_ERR "vxfs: structural list inode is of wrong type (%x)\n",
168 VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK); 168 VXFS_INO(infp->vsi_stilist)->vii_mode & VXFS_TYPE_MASK);
169 goto out_iput_stilist; 169 goto out_iput_stilist;
170 } 170 }
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 6c5131d592f0..3360f1e678ad 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -162,7 +162,7 @@ vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp)
162/** 162/**
163 * vxfs_inode_by_name - find inode number for dentry 163 * vxfs_inode_by_name - find inode number for dentry
164 * @dip: directory to search in 164 * @dip: directory to search in
165 * @dp: dentry we seach for 165 * @dp: dentry we search for
166 * 166 *
167 * Description: 167 * Description:
168 * vxfs_inode_by_name finds out the inode number of 168 * vxfs_inode_by_name finds out the inode number of
diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h
index d8324296486f..b7b3af502615 100644
--- a/fs/freevxfs/vxfs_olt.h
+++ b/fs/freevxfs/vxfs_olt.h
@@ -60,7 +60,7 @@ enum {
60 * 60 *
61 * The Object Location Table header is placed at the beginning of each 61 * The Object Location Table header is placed at the beginning of each
62 * OLT extent. It is used to fing certain filesystem-wide metadata, e.g. 62 * OLT extent. It is used to fing certain filesystem-wide metadata, e.g.
63 * the inital inode list, the fileset header or the device configuration. 63 * the initial inode list, the fileset header or the device configuration.
64 */ 64 */
65struct vxfs_olt { 65struct vxfs_olt {
66 u_int32_t olt_magic; /* magic number */ 66 u_int32_t olt_magic; /* magic number */
diff --git a/fs/freevxfs/vxfs_subr.c b/fs/freevxfs/vxfs_subr.c
index 1429f3ae1e86..5d318c44f855 100644
--- a/fs/freevxfs/vxfs_subr.c
+++ b/fs/freevxfs/vxfs_subr.c
@@ -44,7 +44,6 @@ static sector_t vxfs_bmap(struct address_space *, sector_t);
44const struct address_space_operations vxfs_aops = { 44const struct address_space_operations vxfs_aops = {
45 .readpage = vxfs_readpage, 45 .readpage = vxfs_readpage,
46 .bmap = vxfs_bmap, 46 .bmap = vxfs_bmap,
47 .sync_page = block_sync_page,
48}; 47};
49 48
50inline void 49inline void
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 59c6e4956786..34591ee804b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -144,7 +144,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
144 * 144 *
145 * Description: 145 * Description:
146 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 146 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
147 * started when this function returns, we make no guarentees on 147 * started when this function returns, we make no guarantees on
148 * completion. Caller need not hold sb s_umount semaphore. 148 * completion. Caller need not hold sb s_umount semaphore.
149 * 149 *
150 */ 150 */
@@ -176,6 +176,17 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
176} 176}
177 177
178/* 178/*
179 * Remove the inode from the writeback list it is on.
180 */
181void inode_wb_list_del(struct inode *inode)
182{
183 spin_lock(&inode_wb_list_lock);
184 list_del_init(&inode->i_wb_list);
185 spin_unlock(&inode_wb_list_lock);
186}
187
188
189/*
179 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 190 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
180 * furthest end of its superblock's dirty-inode list. 191 * furthest end of its superblock's dirty-inode list.
181 * 192 *
@@ -188,6 +199,7 @@ static void redirty_tail(struct inode *inode)
188{ 199{
189 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 200 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
190 201
202 assert_spin_locked(&inode_wb_list_lock);
191 if (!list_empty(&wb->b_dirty)) { 203 if (!list_empty(&wb->b_dirty)) {
192 struct inode *tail; 204 struct inode *tail;
193 205
@@ -205,14 +217,17 @@ static void requeue_io(struct inode *inode)
205{ 217{
206 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; 218 struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
207 219
220 assert_spin_locked(&inode_wb_list_lock);
208 list_move(&inode->i_wb_list, &wb->b_more_io); 221 list_move(&inode->i_wb_list, &wb->b_more_io);
209} 222}
210 223
211static void inode_sync_complete(struct inode *inode) 224static void inode_sync_complete(struct inode *inode)
212{ 225{
213 /* 226 /*
214 * Prevent speculative execution through spin_unlock(&inode_lock); 227 * Prevent speculative execution through
228 * spin_unlock(&inode_wb_list_lock);
215 */ 229 */
230
216 smp_mb(); 231 smp_mb();
217 wake_up_bit(&inode->i_state, __I_SYNC); 232 wake_up_bit(&inode->i_state, __I_SYNC);
218} 233}
@@ -286,6 +301,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
286 */ 301 */
287static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) 302static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
288{ 303{
304 assert_spin_locked(&inode_wb_list_lock);
289 list_splice_init(&wb->b_more_io, &wb->b_io); 305 list_splice_init(&wb->b_more_io, &wb->b_io);
290 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); 306 move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
291} 307}
@@ -306,25 +322,25 @@ static void inode_wait_for_writeback(struct inode *inode)
306 wait_queue_head_t *wqh; 322 wait_queue_head_t *wqh;
307 323
308 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 324 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
309 while (inode->i_state & I_SYNC) { 325 while (inode->i_state & I_SYNC) {
310 spin_unlock(&inode_lock); 326 spin_unlock(&inode->i_lock);
327 spin_unlock(&inode_wb_list_lock);
311 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 328 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
312 spin_lock(&inode_lock); 329 spin_lock(&inode_wb_list_lock);
330 spin_lock(&inode->i_lock);
313 } 331 }
314} 332}
315 333
316/* 334/*
317 * Write out an inode's dirty pages. Called under inode_lock. Either the 335 * Write out an inode's dirty pages. Called under inode_wb_list_lock and
318 * caller has ref on the inode (either via __iget or via syscall against an fd) 336 * inode->i_lock. Either the caller has an active reference on the inode or
319 * or the inode has I_WILL_FREE set (via generic_forget_inode) 337 * the inode has I_WILL_FREE set.
320 * 338 *
321 * If `wait' is set, wait on the writeout. 339 * If `wait' is set, wait on the writeout.
322 * 340 *
323 * The whole writeout design is quite complex and fragile. We want to avoid 341 * The whole writeout design is quite complex and fragile. We want to avoid
324 * starvation of particular inodes when others are being redirtied, prevent 342 * starvation of particular inodes when others are being redirtied, prevent
325 * livelocks, etc. 343 * livelocks, etc.
326 *
327 * Called under inode_lock.
328 */ 344 */
329static int 345static int
330writeback_single_inode(struct inode *inode, struct writeback_control *wbc) 346writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
@@ -333,6 +349,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
333 unsigned dirty; 349 unsigned dirty;
334 int ret; 350 int ret;
335 351
352 assert_spin_locked(&inode_wb_list_lock);
353 assert_spin_locked(&inode->i_lock);
354
336 if (!atomic_read(&inode->i_count)) 355 if (!atomic_read(&inode->i_count))
337 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); 356 WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
338 else 357 else
@@ -363,7 +382,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
363 /* Set I_SYNC, reset I_DIRTY_PAGES */ 382 /* Set I_SYNC, reset I_DIRTY_PAGES */
364 inode->i_state |= I_SYNC; 383 inode->i_state |= I_SYNC;
365 inode->i_state &= ~I_DIRTY_PAGES; 384 inode->i_state &= ~I_DIRTY_PAGES;
366 spin_unlock(&inode_lock); 385 spin_unlock(&inode->i_lock);
386 spin_unlock(&inode_wb_list_lock);
367 387
368 ret = do_writepages(mapping, wbc); 388 ret = do_writepages(mapping, wbc);
369 389
@@ -383,10 +403,10 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
383 * due to delalloc, clear dirty metadata flags right before 403 * due to delalloc, clear dirty metadata flags right before
384 * write_inode() 404 * write_inode()
385 */ 405 */
386 spin_lock(&inode_lock); 406 spin_lock(&inode->i_lock);
387 dirty = inode->i_state & I_DIRTY; 407 dirty = inode->i_state & I_DIRTY;
388 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); 408 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
389 spin_unlock(&inode_lock); 409 spin_unlock(&inode->i_lock);
390 /* Don't write the inode if only I_DIRTY_PAGES was set */ 410 /* Don't write the inode if only I_DIRTY_PAGES was set */
391 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 411 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
392 int err = write_inode(inode, wbc); 412 int err = write_inode(inode, wbc);
@@ -394,7 +414,8 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
394 ret = err; 414 ret = err;
395 } 415 }
396 416
397 spin_lock(&inode_lock); 417 spin_lock(&inode_wb_list_lock);
418 spin_lock(&inode->i_lock);
398 inode->i_state &= ~I_SYNC; 419 inode->i_state &= ~I_SYNC;
399 if (!(inode->i_state & I_FREEING)) { 420 if (!(inode->i_state & I_FREEING)) {
400 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 421 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
@@ -506,7 +527,9 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
506 * kind does not need peridic writeout yet, and for the latter 527 * kind does not need peridic writeout yet, and for the latter
507 * kind writeout is handled by the freer. 528 * kind writeout is handled by the freer.
508 */ 529 */
530 spin_lock(&inode->i_lock);
509 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 531 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
532 spin_unlock(&inode->i_lock);
510 requeue_io(inode); 533 requeue_io(inode);
511 continue; 534 continue;
512 } 535 }
@@ -515,10 +538,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
515 * Was this inode dirtied after sync_sb_inodes was called? 538 * Was this inode dirtied after sync_sb_inodes was called?
516 * This keeps sync from extra jobs and livelock. 539 * This keeps sync from extra jobs and livelock.
517 */ 540 */
518 if (inode_dirtied_after(inode, wbc->wb_start)) 541 if (inode_dirtied_after(inode, wbc->wb_start)) {
542 spin_unlock(&inode->i_lock);
519 return 1; 543 return 1;
544 }
520 545
521 __iget(inode); 546 __iget(inode);
547
522 pages_skipped = wbc->pages_skipped; 548 pages_skipped = wbc->pages_skipped;
523 writeback_single_inode(inode, wbc); 549 writeback_single_inode(inode, wbc);
524 if (wbc->pages_skipped != pages_skipped) { 550 if (wbc->pages_skipped != pages_skipped) {
@@ -528,10 +554,11 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
528 */ 554 */
529 redirty_tail(inode); 555 redirty_tail(inode);
530 } 556 }
531 spin_unlock(&inode_lock); 557 spin_unlock(&inode->i_lock);
558 spin_unlock(&inode_wb_list_lock);
532 iput(inode); 559 iput(inode);
533 cond_resched(); 560 cond_resched();
534 spin_lock(&inode_lock); 561 spin_lock(&inode_wb_list_lock);
535 if (wbc->nr_to_write <= 0) { 562 if (wbc->nr_to_write <= 0) {
536 wbc->more_io = 1; 563 wbc->more_io = 1;
537 return 1; 564 return 1;
@@ -550,7 +577,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
550 577
551 if (!wbc->wb_start) 578 if (!wbc->wb_start)
552 wbc->wb_start = jiffies; /* livelock avoidance */ 579 wbc->wb_start = jiffies; /* livelock avoidance */
553 spin_lock(&inode_lock); 580 spin_lock(&inode_wb_list_lock);
554 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 581 if (!wbc->for_kupdate || list_empty(&wb->b_io))
555 queue_io(wb, wbc->older_than_this); 582 queue_io(wb, wbc->older_than_this);
556 583
@@ -568,7 +595,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
568 if (ret) 595 if (ret)
569 break; 596 break;
570 } 597 }
571 spin_unlock(&inode_lock); 598 spin_unlock(&inode_wb_list_lock);
572 /* Leave any unwritten inodes on b_io */ 599 /* Leave any unwritten inodes on b_io */
573} 600}
574 601
@@ -577,11 +604,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
577{ 604{
578 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 605 WARN_ON(!rwsem_is_locked(&sb->s_umount));
579 606
580 spin_lock(&inode_lock); 607 spin_lock(&inode_wb_list_lock);
581 if (!wbc->for_kupdate || list_empty(&wb->b_io)) 608 if (!wbc->for_kupdate || list_empty(&wb->b_io))
582 queue_io(wb, wbc->older_than_this); 609 queue_io(wb, wbc->older_than_this);
583 writeback_sb_inodes(sb, wb, wbc, true); 610 writeback_sb_inodes(sb, wb, wbc, true);
584 spin_unlock(&inode_lock); 611 spin_unlock(&inode_wb_list_lock);
585} 612}
586 613
587/* 614/*
@@ -720,13 +747,15 @@ static long wb_writeback(struct bdi_writeback *wb,
720 * become available for writeback. Otherwise 747 * become available for writeback. Otherwise
721 * we'll just busyloop. 748 * we'll just busyloop.
722 */ 749 */
723 spin_lock(&inode_lock); 750 spin_lock(&inode_wb_list_lock);
724 if (!list_empty(&wb->b_more_io)) { 751 if (!list_empty(&wb->b_more_io)) {
725 inode = wb_inode(wb->b_more_io.prev); 752 inode = wb_inode(wb->b_more_io.prev);
726 trace_wbc_writeback_wait(&wbc, wb->bdi); 753 trace_wbc_writeback_wait(&wbc, wb->bdi);
754 spin_lock(&inode->i_lock);
727 inode_wait_for_writeback(inode); 755 inode_wait_for_writeback(inode);
756 spin_unlock(&inode->i_lock);
728 } 757 }
729 spin_unlock(&inode_lock); 758 spin_unlock(&inode_wb_list_lock);
730 } 759 }
731 760
732 return wrote; 761 return wrote;
@@ -992,7 +1021,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
992{ 1021{
993 struct super_block *sb = inode->i_sb; 1022 struct super_block *sb = inode->i_sb;
994 struct backing_dev_info *bdi = NULL; 1023 struct backing_dev_info *bdi = NULL;
995 bool wakeup_bdi = false;
996 1024
997 /* 1025 /*
998 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1026 * Don't do this for I_DIRTY_PAGES - that doesn't actually
@@ -1016,7 +1044,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1016 if (unlikely(block_dump)) 1044 if (unlikely(block_dump))
1017 block_dump___mark_inode_dirty(inode); 1045 block_dump___mark_inode_dirty(inode);
1018 1046
1019 spin_lock(&inode_lock); 1047 spin_lock(&inode->i_lock);
1020 if ((inode->i_state & flags) != flags) { 1048 if ((inode->i_state & flags) != flags) {
1021 const int was_dirty = inode->i_state & I_DIRTY; 1049 const int was_dirty = inode->i_state & I_DIRTY;
1022 1050
@@ -1028,7 +1056,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1028 * superblock list, based upon its state. 1056 * superblock list, based upon its state.
1029 */ 1057 */
1030 if (inode->i_state & I_SYNC) 1058 if (inode->i_state & I_SYNC)
1031 goto out; 1059 goto out_unlock_inode;
1032 1060
1033 /* 1061 /*
1034 * Only add valid (hashed) inodes to the superblock's 1062 * Only add valid (hashed) inodes to the superblock's
@@ -1036,16 +1064,17 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1036 */ 1064 */
1037 if (!S_ISBLK(inode->i_mode)) { 1065 if (!S_ISBLK(inode->i_mode)) {
1038 if (inode_unhashed(inode)) 1066 if (inode_unhashed(inode))
1039 goto out; 1067 goto out_unlock_inode;
1040 } 1068 }
1041 if (inode->i_state & I_FREEING) 1069 if (inode->i_state & I_FREEING)
1042 goto out; 1070 goto out_unlock_inode;
1043 1071
1044 /* 1072 /*
1045 * If the inode was already on b_dirty/b_io/b_more_io, don't 1073 * If the inode was already on b_dirty/b_io/b_more_io, don't
1046 * reposition it (that would break b_dirty time-ordering). 1074 * reposition it (that would break b_dirty time-ordering).
1047 */ 1075 */
1048 if (!was_dirty) { 1076 if (!was_dirty) {
1077 bool wakeup_bdi = false;
1049 bdi = inode_to_bdi(inode); 1078 bdi = inode_to_bdi(inode);
1050 1079
1051 if (bdi_cap_writeback_dirty(bdi)) { 1080 if (bdi_cap_writeback_dirty(bdi)) {
@@ -1062,15 +1091,20 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1062 wakeup_bdi = true; 1091 wakeup_bdi = true;
1063 } 1092 }
1064 1093
1094 spin_unlock(&inode->i_lock);
1095 spin_lock(&inode_wb_list_lock);
1065 inode->dirtied_when = jiffies; 1096 inode->dirtied_when = jiffies;
1066 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1097 list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
1098 spin_unlock(&inode_wb_list_lock);
1099
1100 if (wakeup_bdi)
1101 bdi_wakeup_thread_delayed(bdi);
1102 return;
1067 } 1103 }
1068 } 1104 }
1069out: 1105out_unlock_inode:
1070 spin_unlock(&inode_lock); 1106 spin_unlock(&inode->i_lock);
1071 1107
1072 if (wakeup_bdi)
1073 bdi_wakeup_thread_delayed(bdi);
1074} 1108}
1075EXPORT_SYMBOL(__mark_inode_dirty); 1109EXPORT_SYMBOL(__mark_inode_dirty);
1076 1110
@@ -1101,7 +1135,7 @@ static void wait_sb_inodes(struct super_block *sb)
1101 */ 1135 */
1102 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 1136 WARN_ON(!rwsem_is_locked(&sb->s_umount));
1103 1137
1104 spin_lock(&inode_lock); 1138 spin_lock(&inode_sb_list_lock);
1105 1139
1106 /* 1140 /*
1107 * Data integrity sync. Must wait for all pages under writeback, 1141 * Data integrity sync. Must wait for all pages under writeback,
@@ -1111,22 +1145,25 @@ static void wait_sb_inodes(struct super_block *sb)
1111 * we still have to wait for that writeout. 1145 * we still have to wait for that writeout.
1112 */ 1146 */
1113 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1147 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1114 struct address_space *mapping; 1148 struct address_space *mapping = inode->i_mapping;
1115 1149
1116 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 1150 spin_lock(&inode->i_lock);
1117 continue; 1151 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
1118 mapping = inode->i_mapping; 1152 (mapping->nrpages == 0)) {
1119 if (mapping->nrpages == 0) 1153 spin_unlock(&inode->i_lock);
1120 continue; 1154 continue;
1155 }
1121 __iget(inode); 1156 __iget(inode);
1122 spin_unlock(&inode_lock); 1157 spin_unlock(&inode->i_lock);
1158 spin_unlock(&inode_sb_list_lock);
1159
1123 /* 1160 /*
1124 * We hold a reference to 'inode' so it couldn't have 1161 * We hold a reference to 'inode' so it couldn't have been
1125 * been removed from s_inodes list while we dropped the 1162 * removed from s_inodes list while we dropped the
1126 * inode_lock. We cannot iput the inode now as we can 1163 * inode_sb_list_lock. We cannot iput the inode now as we can
1127 * be holding the last reference and we cannot iput it 1164 * be holding the last reference and we cannot iput it under
1128 * under inode_lock. So we keep the reference and iput 1165 * inode_sb_list_lock. So we keep the reference and iput it
1129 * it later. 1166 * later.
1130 */ 1167 */
1131 iput(old_inode); 1168 iput(old_inode);
1132 old_inode = inode; 1169 old_inode = inode;
@@ -1135,9 +1172,9 @@ static void wait_sb_inodes(struct super_block *sb)
1135 1172
1136 cond_resched(); 1173 cond_resched();
1137 1174
1138 spin_lock(&inode_lock); 1175 spin_lock(&inode_sb_list_lock);
1139 } 1176 }
1140 spin_unlock(&inode_lock); 1177 spin_unlock(&inode_sb_list_lock);
1141 iput(old_inode); 1178 iput(old_inode);
1142} 1179}
1143 1180
@@ -1271,9 +1308,11 @@ int write_inode_now(struct inode *inode, int sync)
1271 wbc.nr_to_write = 0; 1308 wbc.nr_to_write = 0;
1272 1309
1273 might_sleep(); 1310 might_sleep();
1274 spin_lock(&inode_lock); 1311 spin_lock(&inode_wb_list_lock);
1312 spin_lock(&inode->i_lock);
1275 ret = writeback_single_inode(inode, &wbc); 1313 ret = writeback_single_inode(inode, &wbc);
1276 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_wb_list_lock);
1277 if (sync) 1316 if (sync)
1278 inode_sync_wait(inode); 1317 inode_sync_wait(inode);
1279 return ret; 1318 return ret;
@@ -1295,9 +1334,11 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc)
1295{ 1334{
1296 int ret; 1335 int ret;
1297 1336
1298 spin_lock(&inode_lock); 1337 spin_lock(&inode_wb_list_lock);
1338 spin_lock(&inode->i_lock);
1299 ret = writeback_single_inode(inode, wbc); 1339 ret = writeback_single_inode(inode, wbc);
1300 spin_unlock(&inode_lock); 1340 spin_unlock(&inode->i_lock);
1341 spin_unlock(&inode_wb_list_lock);
1301 return ret; 1342 return ret;
1302} 1343}
1303EXPORT_SYMBOL(sync_inode); 1344EXPORT_SYMBOL(sync_inode);
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 3e87cce5837d..b6cca47f7b07 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -305,7 +305,7 @@ static void cuse_gendev_release(struct device *dev)
305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 305static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
306{ 306{
307 struct cuse_conn *cc = fc_to_cc(fc); 307 struct cuse_conn *cc = fc_to_cc(fc);
308 struct cuse_init_out *arg = &req->misc.cuse_init_out; 308 struct cuse_init_out *arg = req->out.args[0].value;
309 struct page *page = req->pages[0]; 309 struct page *page = req->pages[0];
310 struct cuse_devinfo devinfo = { }; 310 struct cuse_devinfo devinfo = { };
311 struct device *dev; 311 struct device *dev;
@@ -384,6 +384,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
384 dev_set_uevent_suppress(dev, 0); 384 dev_set_uevent_suppress(dev, 0);
385 kobject_uevent(&dev->kobj, KOBJ_ADD); 385 kobject_uevent(&dev->kobj, KOBJ_ADD);
386out: 386out:
387 kfree(arg);
387 __free_page(page); 388 __free_page(page);
388 return; 389 return;
389 390
@@ -405,6 +406,7 @@ static int cuse_send_init(struct cuse_conn *cc)
405 struct page *page; 406 struct page *page;
406 struct fuse_conn *fc = &cc->fc; 407 struct fuse_conn *fc = &cc->fc;
407 struct cuse_init_in *arg; 408 struct cuse_init_in *arg;
409 void *outarg;
408 410
409 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); 411 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
410 412
@@ -419,6 +421,10 @@ static int cuse_send_init(struct cuse_conn *cc)
419 if (!page) 421 if (!page)
420 goto err_put_req; 422 goto err_put_req;
421 423
424 outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
425 if (!outarg)
426 goto err_free_page;
427
422 arg = &req->misc.cuse_init_in; 428 arg = &req->misc.cuse_init_in;
423 arg->major = FUSE_KERNEL_VERSION; 429 arg->major = FUSE_KERNEL_VERSION;
424 arg->minor = FUSE_KERNEL_MINOR_VERSION; 430 arg->minor = FUSE_KERNEL_MINOR_VERSION;
@@ -429,7 +435,7 @@ static int cuse_send_init(struct cuse_conn *cc)
429 req->in.args[0].value = arg; 435 req->in.args[0].value = arg;
430 req->out.numargs = 2; 436 req->out.numargs = 2;
431 req->out.args[0].size = sizeof(struct cuse_init_out); 437 req->out.args[0].size = sizeof(struct cuse_init_out);
432 req->out.args[0].value = &req->misc.cuse_init_out; 438 req->out.args[0].value = outarg;
433 req->out.args[1].size = CUSE_INIT_INFO_MAX; 439 req->out.args[1].size = CUSE_INIT_INFO_MAX;
434 req->out.argvar = 1; 440 req->out.argvar = 1;
435 req->out.argpages = 1; 441 req->out.argpages = 1;
@@ -440,6 +446,8 @@ static int cuse_send_init(struct cuse_conn *cc)
440 446
441 return 0; 447 return 0;
442 448
449err_free_page:
450 __free_page(page);
443err_put_req: 451err_put_req:
444 fuse_put_request(fc, req); 452 fuse_put_request(fc, req);
445err: 453err:
@@ -458,7 +466,7 @@ static void cuse_fc_release(struct fuse_conn *fc)
458 * @file: file struct being opened 466 * @file: file struct being opened
459 * 467 *
460 * Userland CUSE server can create a CUSE device by opening /dev/cuse 468 * Userland CUSE server can create a CUSE device by opening /dev/cuse
461 * and replying to the initilaization request kernel sends. This 469 * and replying to the initialization request kernel sends. This
462 * function is responsible for handling CUSE device initialization. 470 * function is responsible for handling CUSE device initialization.
463 * Because the fd opened by this function is used during 471 * Because the fd opened by this function is used during
464 * initialization, this function only creates cuse_conn and sends 472 * initialization, this function only creates cuse_conn and sends
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cf8d28d1fbad..640fc229df10 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -737,14 +737,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
737 if (WARN_ON(PageMlocked(oldpage))) 737 if (WARN_ON(PageMlocked(oldpage)))
738 goto out_fallback_unlock; 738 goto out_fallback_unlock;
739 739
740 remove_from_page_cache(oldpage); 740 err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
741 page_cache_release(oldpage);
742
743 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
744 if (err) { 741 if (err) {
745 printk(KERN_WARNING "fuse_try_move_page: failed to add page"); 742 unlock_page(newpage);
746 goto out_fallback_unlock; 743 return err;
747 } 744 }
745
748 page_cache_get(newpage); 746 page_cache_get(newpage);
749 747
750 if (!(buf->flags & PIPE_BUF_FLAG_LRU)) 748 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
@@ -1910,6 +1908,21 @@ __acquires(fc->lock)
1910 kfree(dequeue_forget(fc, 1, NULL)); 1908 kfree(dequeue_forget(fc, 1, NULL));
1911} 1909}
1912 1910
1911static void end_polls(struct fuse_conn *fc)
1912{
1913 struct rb_node *p;
1914
1915 p = rb_first(&fc->polled_files);
1916
1917 while (p) {
1918 struct fuse_file *ff;
1919 ff = rb_entry(p, struct fuse_file, polled_node);
1920 wake_up_interruptible_all(&ff->poll_wait);
1921
1922 p = rb_next(p);
1923 }
1924}
1925
1913/* 1926/*
1914 * Abort all requests. 1927 * Abort all requests.
1915 * 1928 *
@@ -1937,6 +1950,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
1937 fc->blocked = 0; 1950 fc->blocked = 0;
1938 end_io_requests(fc); 1951 end_io_requests(fc);
1939 end_queued_requests(fc); 1952 end_queued_requests(fc);
1953 end_polls(fc);
1940 wake_up_all(&fc->waitq); 1954 wake_up_all(&fc->waitq);
1941 wake_up_all(&fc->blocked_waitq); 1955 wake_up_all(&fc->blocked_waitq);
1942 kill_fasync(&fc->fasync, SIGIO, POLL_IN); 1956 kill_fasync(&fc->fasync, SIGIO, POLL_IN);
@@ -1953,6 +1967,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
1953 fc->connected = 0; 1967 fc->connected = 0;
1954 fc->blocked = 0; 1968 fc->blocked = 0;
1955 end_queued_requests(fc); 1969 end_queued_requests(fc);
1970 end_polls(fc);
1956 wake_up_all(&fc->blocked_waitq); 1971 wake_up_all(&fc->blocked_waitq);
1957 spin_unlock(&fc->lock); 1972 spin_unlock(&fc->lock);
1958 fuse_conn_put(fc); 1973 fuse_conn_put(fc);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..c6ba49bd95b3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,10 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
158{ 158{
159 struct inode *inode; 159 struct inode *inode;
160 160
161 if (nd->flags & LOOKUP_RCU) 161 inode = ACCESS_ONCE(entry->d_inode);
162 return -ECHILD;
163
164 inode = entry->d_inode;
165 if (inode && is_bad_inode(inode)) 162 if (inode && is_bad_inode(inode))
166 return 0; 163 return 0;
167 else if (fuse_dentry_time(entry) < get_jiffies_64()) { 164 else if (fuse_dentry_time(entry) < get_jiffies_64()) {
@@ -177,6 +174,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
177 if (!inode) 174 if (!inode)
178 return 0; 175 return 0;
179 176
177 if (nd->flags & LOOKUP_RCU)
178 return -ECHILD;
179
180 fc = get_fuse_conn(inode); 180 fc = get_fuse_conn(inode);
181 req = fuse_get_req(fc); 181 req = fuse_get_req(fc);
182 if (IS_ERR(req)) 182 if (IS_ERR(req))
@@ -970,6 +970,14 @@ static int fuse_access(struct inode *inode, int mask)
970 return err; 970 return err;
971} 971}
972 972
973static int fuse_perm_getattr(struct inode *inode, int flags)
974{
975 if (flags & IPERM_FLAG_RCU)
976 return -ECHILD;
977
978 return fuse_do_getattr(inode, NULL, NULL);
979}
980
973/* 981/*
974 * Check permission. The two basic access models of FUSE are: 982 * Check permission. The two basic access models of FUSE are:
975 * 983 *
@@ -989,9 +997,6 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
989 bool refreshed = false; 997 bool refreshed = false;
990 int err = 0; 998 int err = 0;
991 999
992 if (flags & IPERM_FLAG_RCU)
993 return -ECHILD;
994
995 if (!fuse_allow_task(fc, current)) 1000 if (!fuse_allow_task(fc, current))
996 return -EACCES; 1001 return -EACCES;
997 1002
@@ -1000,9 +1005,15 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1000 */ 1005 */
1001 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) || 1006 if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
1002 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { 1007 ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
1003 err = fuse_update_attributes(inode, NULL, NULL, &refreshed); 1008 struct fuse_inode *fi = get_fuse_inode(inode);
1004 if (err) 1009
1005 return err; 1010 if (fi->i_time < get_jiffies_64()) {
1011 refreshed = true;
1012
1013 err = fuse_perm_getattr(inode, flags);
1014 if (err)
1015 return err;
1016 }
1006 } 1017 }
1007 1018
1008 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) { 1019 if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
@@ -1012,7 +1023,7 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1012 attributes. This is also needed, because the root 1023 attributes. This is also needed, because the root
1013 node will at first have no permissions */ 1024 node will at first have no permissions */
1014 if (err == -EACCES && !refreshed) { 1025 if (err == -EACCES && !refreshed) {
1015 err = fuse_do_getattr(inode, NULL, NULL); 1026 err = fuse_perm_getattr(inode, flags);
1016 if (!err) 1027 if (!err)
1017 err = generic_permission(inode, mask, 1028 err = generic_permission(inode, mask,
1018 flags, NULL); 1029 flags, NULL);
@@ -1023,13 +1034,16 @@ static int fuse_permission(struct inode *inode, int mask, unsigned int flags)
1023 noticed immediately, only after the attribute 1034 noticed immediately, only after the attribute
1024 timeout has expired */ 1035 timeout has expired */
1025 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { 1036 } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
1037 if (flags & IPERM_FLAG_RCU)
1038 return -ECHILD;
1039
1026 err = fuse_access(inode, mask); 1040 err = fuse_access(inode, mask);
1027 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { 1041 } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
1028 if (!(inode->i_mode & S_IXUGO)) { 1042 if (!(inode->i_mode & S_IXUGO)) {
1029 if (refreshed) 1043 if (refreshed)
1030 return -EACCES; 1044 return -EACCES;
1031 1045
1032 err = fuse_do_getattr(inode, NULL, NULL); 1046 err = fuse_perm_getattr(inode, flags);
1033 if (!err && !(inode->i_mode & S_IXUGO)) 1047 if (!err && !(inode->i_mode & S_IXUGO))
1034 return -EACCES; 1048 return -EACCES;
1035 } 1049 }
@@ -1283,8 +1297,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
1283 if (err) 1297 if (err)
1284 return err; 1298 return err;
1285 1299
1286 if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc) 1300 if (attr->ia_valid & ATTR_OPEN) {
1287 return 0; 1301 if (fc->atomic_o_trunc)
1302 return 0;
1303 file = NULL;
1304 }
1288 1305
1289 if (attr->ia_valid & ATTR_SIZE) 1306 if (attr->ia_valid & ATTR_SIZE)
1290 is_truncate = true; 1307 is_truncate = true;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..82a66466a24c 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
86 return ff; 86 return ff;
87} 87}
88 88
89static void fuse_release_async(struct work_struct *work)
90{
91 struct fuse_req *req;
92 struct fuse_conn *fc;
93 struct path path;
94
95 req = container_of(work, struct fuse_req, misc.release.work);
96 path = req->misc.release.path;
97 fc = get_fuse_conn(path.dentry->d_inode);
98
99 fuse_put_request(fc, req);
100 path_put(&path);
101}
102
89static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 103static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
90{ 104{
91 path_put(&req->misc.release.path); 105 if (fc->destroy_req) {
106 /*
107 * If this is a fuseblk mount, then it's possible that
108 * releasing the path will result in releasing the
109 * super block and sending the DESTROY request. If
110 * the server is single threaded, this would hang.
111 * For this reason do the path_put() in a separate
112 * thread.
113 */
114 atomic_inc(&req->count);
115 INIT_WORK(&req->misc.release.work, fuse_release_async);
116 schedule_work(&req->misc.release.work);
117 } else {
118 path_put(&req->misc.release.path);
119 }
92} 120}
93 121
94static void fuse_file_put(struct fuse_file *ff) 122static void fuse_file_put(struct fuse_file *ff, bool sync)
95{ 123{
96 if (atomic_dec_and_test(&ff->count)) { 124 if (atomic_dec_and_test(&ff->count)) {
97 struct fuse_req *req = ff->reserved_req; 125 struct fuse_req *req = ff->reserved_req;
98 126
99 req->end = fuse_release_end; 127 if (sync) {
100 fuse_request_send_background(ff->fc, req); 128 fuse_request_send(ff->fc, req);
129 path_put(&req->misc.release.path);
130 fuse_put_request(ff->fc, req);
131 } else {
132 req->end = fuse_release_end;
133 fuse_request_send_background(ff->fc, req);
134 }
101 kfree(ff); 135 kfree(ff);
102 } 136 }
103} 137}
@@ -188,7 +222,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
188 rb_erase(&ff->polled_node, &fc->polled_files); 222 rb_erase(&ff->polled_node, &fc->polled_files);
189 spin_unlock(&fc->lock); 223 spin_unlock(&fc->lock);
190 224
191 wake_up_interruptible_sync(&ff->poll_wait); 225 wake_up_interruptible_all(&ff->poll_wait);
192 226
193 inarg->fh = ff->fh; 227 inarg->fh = ff->fh;
194 inarg->flags = flags; 228 inarg->flags = flags;
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
219 * Normally this will send the RELEASE request, however if 253 * Normally this will send the RELEASE request, however if
220 * some asynchronous READ or WRITE requests are outstanding, 254 * some asynchronous READ or WRITE requests are outstanding,
221 * the sending will be delayed. 255 * the sending will be delayed.
256 *
257 * Make the release synchronous if this is a fuseblk mount,
258 * synchronous RELEASE is allowed (and desirable) in this case
259 * because the server can be trusted not to screw up.
222 */ 260 */
223 fuse_file_put(ff); 261 fuse_file_put(ff, ff->fc->destroy_req != NULL);
224} 262}
225 263
226static int fuse_open(struct inode *inode, struct file *file) 264static int fuse_open(struct inode *inode, struct file *file)
@@ -485,7 +523,7 @@ static int fuse_readpage(struct file *file, struct page *page)
485 goto out; 523 goto out;
486 524
487 /* 525 /*
488 * Page writeback can extend beyond the liftime of the 526 * Page writeback can extend beyond the lifetime of the
489 * page-cache page, so make sure we read a properly synced 527 * page-cache page, so make sure we read a properly synced
490 * page. 528 * page.
491 */ 529 */
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
558 page_cache_release(page); 596 page_cache_release(page);
559 } 597 }
560 if (req->ff) 598 if (req->ff)
561 fuse_file_put(req->ff); 599 fuse_file_put(req->ff, false);
562} 600}
563 601
564static void fuse_send_readpages(struct fuse_req *req, struct file *file) 602static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
1137static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1175static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
1138{ 1176{
1139 __free_page(req->pages[0]); 1177 __free_page(req->pages[0]);
1140 fuse_file_put(req->ff); 1178 fuse_file_put(req->ff, false);
1141} 1179}
1142 1180
1143static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1181static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..b788becada76 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
21#include <linux/rwsem.h> 21#include <linux/rwsem.h>
22#include <linux/rbtree.h> 22#include <linux/rbtree.h>
23#include <linux/poll.h> 23#include <linux/poll.h>
24#include <linux/workqueue.h>
24 25
25/** Max number of pages that can be used in a single read request */ 26/** Max number of pages that can be used in a single read request */
26#define FUSE_MAX_PAGES_PER_REQ 32 27#define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,13 +263,15 @@ struct fuse_req {
262 /** Data for asynchronous requests */ 263 /** Data for asynchronous requests */
263 union { 264 union {
264 struct { 265 struct {
265 struct fuse_release_in in; 266 union {
267 struct fuse_release_in in;
268 struct work_struct work;
269 };
266 struct path path; 270 struct path path;
267 } release; 271 } release;
268 struct fuse_init_in init_in; 272 struct fuse_init_in init_in;
269 struct fuse_init_out init_out; 273 struct fuse_init_out init_out;
270 struct cuse_init_in cuse_init_in; 274 struct cuse_init_in cuse_init_in;
271 struct cuse_init_out cuse_init_out;
272 struct { 275 struct {
273 struct fuse_read_in in; 276 struct fuse_read_in in;
274 u64 attr_ver; 277 u64 attr_ver;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..cc6ec4b2f0ff 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
637 u64 nodeid; 637 u64 nodeid;
638 u32 generation; 638 u32 generation;
639 639
640 if (*max_len < len) 640 if (*max_len < len) {
641 *max_len = len;
641 return 255; 642 return 255;
643 }
642 644
643 nodeid = get_fuse_inode(inode)->nodeid; 645 nodeid = get_fuse_inode(inode)->nodeid;
644 generation = inode->i_generation; 646 generation = inode->i_generation;
@@ -868,7 +870,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
868 870
869 fc->bdi.name = "fuse"; 871 fc->bdi.name = "fuse";
870 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 872 fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
871 fc->bdi.unplug_io_fn = default_unplug_io_fn;
872 /* fuse does it's own writeback accounting */ 873 /* fuse does it's own writeback accounting */
873 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; 874 fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB;
874 875
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 06c48a891832..8f26d1a58912 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -74,7 +74,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
74 return -EINVAL; 74 return -EINVAL;
75 if (S_ISLNK(inode->i_mode)) 75 if (S_ISLNK(inode->i_mode))
76 return -EOPNOTSUPP; 76 return -EOPNOTSUPP;
77 if (!is_owner_or_cap(inode)) 77 if (!inode_owner_or_capable(inode))
78 return -EPERM; 78 return -EPERM;
79 if (value) { 79 if (value) {
80 acl = posix_acl_from_xattr(value, size); 80 acl = posix_acl_from_xattr(value, size);
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 21f7e46da4c0..f3d23ef4e876 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS := -I$(src) 1ccflags-y := -I$(src)
2obj-$(CONFIG_GFS2_FS) += gfs2.o 2obj-$(CONFIG_GFS2_FS) += gfs2.o
3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ 3gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \
4 glops.o inode.o log.o lops.o main.o meta_io.o \ 4 glops.o inode.o log.o lops.o main.o meta_io.o \
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 7118f1a780a9..cbc07155b1a0 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -80,8 +80,11 @@ int gfs2_check_acl(struct inode *inode, int mask, unsigned int flags)
80 struct posix_acl *acl; 80 struct posix_acl *acl;
81 int error; 81 int error;
82 82
83 if (flags & IPERM_FLAG_RCU) 83 if (flags & IPERM_FLAG_RCU) {
84 return -ECHILD; 84 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
85 return -ECHILD;
86 return -EAGAIN;
87 }
85 88
86 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS); 89 acl = gfs2_acl_get(GFS2_I(inode), ACL_TYPE_ACCESS);
87 if (IS_ERR(acl)) 90 if (IS_ERR(acl))
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 4f36f8832b9b..c71995b111bf 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -695,6 +695,7 @@ out:
695 if (error == 0) 695 if (error == 0)
696 return 0; 696 return 0;
697 697
698 unlock_page(page);
698 page_cache_release(page); 699 page_cache_release(page);
699 700
700 gfs2_trans_end(sdp); 701 gfs2_trans_end(sdp);
@@ -1116,7 +1117,6 @@ static const struct address_space_operations gfs2_writeback_aops = {
1116 .writepages = gfs2_writeback_writepages, 1117 .writepages = gfs2_writeback_writepages,
1117 .readpage = gfs2_readpage, 1118 .readpage = gfs2_readpage,
1118 .readpages = gfs2_readpages, 1119 .readpages = gfs2_readpages,
1119 .sync_page = block_sync_page,
1120 .write_begin = gfs2_write_begin, 1120 .write_begin = gfs2_write_begin,
1121 .write_end = gfs2_write_end, 1121 .write_end = gfs2_write_end,
1122 .bmap = gfs2_bmap, 1122 .bmap = gfs2_bmap,
@@ -1132,7 +1132,6 @@ static const struct address_space_operations gfs2_ordered_aops = {
1132 .writepage = gfs2_ordered_writepage, 1132 .writepage = gfs2_ordered_writepage,
1133 .readpage = gfs2_readpage, 1133 .readpage = gfs2_readpage,
1134 .readpages = gfs2_readpages, 1134 .readpages = gfs2_readpages,
1135 .sync_page = block_sync_page,
1136 .write_begin = gfs2_write_begin, 1135 .write_begin = gfs2_write_begin,
1137 .write_end = gfs2_write_end, 1136 .write_end = gfs2_write_end,
1138 .set_page_dirty = gfs2_set_page_dirty, 1137 .set_page_dirty = gfs2_set_page_dirty,
@@ -1150,7 +1149,6 @@ static const struct address_space_operations gfs2_jdata_aops = {
1150 .writepages = gfs2_jdata_writepages, 1149 .writepages = gfs2_jdata_writepages,
1151 .readpage = gfs2_readpage, 1150 .readpage = gfs2_readpage,
1152 .readpages = gfs2_readpages, 1151 .readpages = gfs2_readpages,
1153 .sync_page = block_sync_page,
1154 .write_begin = gfs2_write_begin, 1152 .write_begin = gfs2_write_begin,
1155 .write_end = gfs2_write_end, 1153 .write_end = gfs2_write_end,
1156 .set_page_dirty = gfs2_set_page_dirty, 1154 .set_page_dirty = gfs2_set_page_dirty,
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3c4039d5eef1..74add2ddcc3f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -21,6 +21,7 @@
21#include "meta_io.h" 21#include "meta_io.h"
22#include "quota.h" 22#include "quota.h"
23#include "rgrp.h" 23#include "rgrp.h"
24#include "super.h"
24#include "trans.h" 25#include "trans.h"
25#include "dir.h" 26#include "dir.h"
26#include "util.h" 27#include "util.h"
@@ -757,7 +758,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
757 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 758 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
758 struct gfs2_rgrp_list rlist; 759 struct gfs2_rgrp_list rlist;
759 u64 bn, bstart; 760 u64 bn, bstart;
760 u32 blen; 761 u32 blen, btotal;
761 __be64 *p; 762 __be64 *p;
762 unsigned int rg_blocks = 0; 763 unsigned int rg_blocks = 0;
763 int metadata; 764 int metadata;
@@ -839,6 +840,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
839 840
840 bstart = 0; 841 bstart = 0;
841 blen = 0; 842 blen = 0;
843 btotal = 0;
842 844
843 for (p = top; p < bottom; p++) { 845 for (p = top; p < bottom; p++) {
844 if (!*p) 846 if (!*p)
@@ -851,9 +853,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
851 else { 853 else {
852 if (bstart) { 854 if (bstart) {
853 if (metadata) 855 if (metadata)
854 gfs2_free_meta(ip, bstart, blen); 856 __gfs2_free_meta(ip, bstart, blen);
855 else 857 else
856 gfs2_free_data(ip, bstart, blen); 858 __gfs2_free_data(ip, bstart, blen);
859
860 btotal += blen;
857 } 861 }
858 862
859 bstart = bn; 863 bstart = bn;
@@ -865,11 +869,17 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
865 } 869 }
866 if (bstart) { 870 if (bstart) {
867 if (metadata) 871 if (metadata)
868 gfs2_free_meta(ip, bstart, blen); 872 __gfs2_free_meta(ip, bstart, blen);
869 else 873 else
870 gfs2_free_data(ip, bstart, blen); 874 __gfs2_free_data(ip, bstart, blen);
875
876 btotal += blen;
871 } 877 }
872 878
879 gfs2_statfs_change(sdp, 0, +btotal, 0);
880 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
881 ip->i_inode.i_gid);
882
873 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 883 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
874 884
875 gfs2_dinode_out(ip, dibh->b_data); 885 gfs2_dinode_out(ip, dibh->b_data);
@@ -1126,7 +1136,7 @@ void gfs2_trim_blocks(struct inode *inode)
1126 * earlier versions of GFS2 have a bug in the stuffed file reading 1136 * earlier versions of GFS2 have a bug in the stuffed file reading
1127 * code which will result in a buffer overrun if the size is larger 1137 * code which will result in a buffer overrun if the size is larger
1128 * than the max stuffed file size. In order to prevent this from 1138 * than the max stuffed file size. In order to prevent this from
1129 * occuring, such files are unstuffed, but in other cases we can 1139 * occurring, such files are unstuffed, but in other cases we can
1130 * just update the inode size directly. 1140 * just update the inode size directly.
1131 * 1141 *
1132 * Returns: 0 on success, or -ve on error 1142 * Returns: 0 on success, or -ve on error
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
44 int error; 44 int error;
45 int had_lock = 0; 45 int had_lock = 0;
46 46
47 if (nd->flags & LOOKUP_RCU) 47 if (nd && nd->flags & LOOKUP_RCU)
48 return -ECHILD; 48 return -ECHILD;
49 49
50 parent = dget_parent(dentry); 50 parent = dget_parent(dentry);
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f9..b5a5e60df0d5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
36 struct super_block *sb = inode->i_sb; 36 struct super_block *sb = inode->i_sb;
37 struct gfs2_inode *ip = GFS2_I(inode); 37 struct gfs2_inode *ip = GFS2_I(inode);
38 38
39 if (*len < GFS2_SMALL_FH_SIZE || 39 if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
40 (connectable && *len < GFS2_LARGE_FH_SIZE)) 40 *len = GFS2_LARGE_FH_SIZE;
41 return 255; 41 return 255;
42 } else if (*len < GFS2_SMALL_FH_SIZE) {
43 *len = GFS2_SMALL_FH_SIZE;
44 return 255;
45 }
42 46
43 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); 47 fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
44 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); 48 fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7cfdcb913363..b2682e073eee 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -221,7 +221,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
221 goto out_drop_write; 221 goto out_drop_write;
222 222
223 error = -EACCES; 223 error = -EACCES;
224 if (!is_owner_or_cap(inode)) 224 if (!inode_owner_or_capable(inode))
225 goto out; 225 goto out;
226 226
227 error = 0; 227 error = 0;
@@ -448,15 +448,20 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
448{ 448{
449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); 449 struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
450 450
451 if (!(file->f_flags & O_NOATIME)) { 451 if (!(file->f_flags & O_NOATIME) &&
452 !IS_NOATIME(&ip->i_inode)) {
452 struct gfs2_holder i_gh; 453 struct gfs2_holder i_gh;
453 int error; 454 int error;
454 455
455 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); 456 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
456 error = gfs2_glock_nq(&i_gh); 457 error = gfs2_glock_nq(&i_gh);
457 file_accessed(file); 458 if (error == 0) {
458 if (error == 0) 459 file_accessed(file);
459 gfs2_glock_dq_uninit(&i_gh); 460 gfs2_glock_dq(&i_gh);
461 }
462 gfs2_holder_uninit(&i_gh);
463 if (error)
464 return error;
460 } 465 }
461 vma->vm_ops = &gfs2_vm_ops; 466 vma->vm_ops = &gfs2_vm_ops;
462 vma->vm_flags |= VM_CAN_NONLINEAR; 467 vma->vm_flags |= VM_CAN_NONLINEAR;
@@ -617,8 +622,7 @@ static void empty_write_end(struct page *page, unsigned from,
617{ 622{
618 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 623 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
619 624
620 page_zero_new_buffers(page, from, to); 625 zero_user(page, from, to-from);
621 flush_dcache_page(page);
622 mark_page_accessed(page); 626 mark_page_accessed(page);
623 627
624 if (!gfs2_is_writeback(ip)) 628 if (!gfs2_is_writeback(ip))
@@ -627,36 +631,43 @@ static void empty_write_end(struct page *page, unsigned from,
627 block_commit_write(page, from, to); 631 block_commit_write(page, from, to);
628} 632}
629 633
630static int write_empty_blocks(struct page *page, unsigned from, unsigned to) 634static int needs_empty_write(sector_t block, struct inode *inode)
631{ 635{
632 unsigned start, end, next;
633 struct buffer_head *bh, *head;
634 int error; 636 int error;
637 struct buffer_head bh_map = { .b_state = 0, .b_blocknr = 0 };
635 638
636 if (!page_has_buffers(page)) { 639 bh_map.b_size = 1 << inode->i_blkbits;
637 error = __block_write_begin(page, from, to - from, gfs2_block_map); 640 error = gfs2_block_map(inode, block, &bh_map, 0);
638 if (unlikely(error)) 641 if (unlikely(error))
639 return error; 642 return error;
643 return !buffer_mapped(&bh_map);
644}
640 645
641 empty_write_end(page, from, to); 646static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
642 return 0; 647{
643 } 648 struct inode *inode = page->mapping->host;
649 unsigned start, end, next, blksize;
650 sector_t block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
651 int ret;
644 652
645 bh = head = page_buffers(page); 653 blksize = 1 << inode->i_blkbits;
646 next = end = 0; 654 next = end = 0;
647 while (next < from) { 655 while (next < from) {
648 next += bh->b_size; 656 next += blksize;
649 bh = bh->b_this_page; 657 block++;
650 } 658 }
651 start = next; 659 start = next;
652 do { 660 do {
653 next += bh->b_size; 661 next += blksize;
654 if (buffer_mapped(bh)) { 662 ret = needs_empty_write(block, inode);
663 if (unlikely(ret < 0))
664 return ret;
665 if (ret == 0) {
655 if (end) { 666 if (end) {
656 error = __block_write_begin(page, start, end - start, 667 ret = __block_write_begin(page, start, end - start,
657 gfs2_block_map); 668 gfs2_block_map);
658 if (unlikely(error)) 669 if (unlikely(ret))
659 return error; 670 return ret;
660 empty_write_end(page, start, end); 671 empty_write_end(page, start, end);
661 end = 0; 672 end = 0;
662 } 673 }
@@ -664,13 +675,13 @@ static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
664 } 675 }
665 else 676 else
666 end = next; 677 end = next;
667 bh = bh->b_this_page; 678 block++;
668 } while (next < to); 679 } while (next < to);
669 680
670 if (end) { 681 if (end) {
671 error = __block_write_begin(page, start, end - start, gfs2_block_map); 682 ret = __block_write_begin(page, start, end - start, gfs2_block_map);
672 if (unlikely(error)) 683 if (unlikely(ret))
673 return error; 684 return ret;
674 empty_write_end(page, start, end); 685 empty_write_end(page, start, end);
675 } 686 }
676 687
@@ -976,8 +987,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
976 987
977 mutex_lock(&fp->f_fl_mutex); 988 mutex_lock(&fp->f_fl_mutex);
978 flock_lock_file_wait(file, fl); 989 flock_lock_file_wait(file, fl);
979 if (fl_gh->gh_gl) 990 if (fl_gh->gh_gl) {
980 gfs2_glock_dq_uninit(fl_gh); 991 gfs2_glock_dq_wait(fl_gh);
992 gfs2_holder_uninit(fl_gh);
993 }
981 mutex_unlock(&fp->f_fl_mutex); 994 mutex_unlock(&fp->f_fl_mutex);
982} 995}
983 996
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 08a8beb152e6..f07643e21bfa 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -26,6 +26,9 @@
26#include <linux/freezer.h> 26#include <linux/freezer.h>
27#include <linux/workqueue.h> 27#include <linux/workqueue.h>
28#include <linux/jiffies.h> 28#include <linux/jiffies.h>
29#include <linux/rcupdate.h>
30#include <linux/rculist_bl.h>
31#include <linux/bit_spinlock.h>
29 32
30#include "gfs2.h" 33#include "gfs2.h"
31#include "incore.h" 34#include "incore.h"
@@ -41,10 +44,6 @@
41#define CREATE_TRACE_POINTS 44#define CREATE_TRACE_POINTS
42#include "trace_gfs2.h" 45#include "trace_gfs2.h"
43 46
44struct gfs2_gl_hash_bucket {
45 struct hlist_head hb_list;
46};
47
48struct gfs2_glock_iter { 47struct gfs2_glock_iter {
49 int hash; /* hash bucket index */ 48 int hash; /* hash bucket index */
50 struct gfs2_sbd *sdp; /* incore superblock */ 49 struct gfs2_sbd *sdp; /* incore superblock */
@@ -54,7 +53,6 @@ struct gfs2_glock_iter {
54 53
55typedef void (*glock_examiner) (struct gfs2_glock * gl); 54typedef void (*glock_examiner) (struct gfs2_glock * gl);
56 55
57static int gfs2_dump_lockstate(struct gfs2_sbd *sdp);
58static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); 56static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
59#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) 57#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0)
60static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); 58static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
@@ -70,57 +68,9 @@ static DEFINE_SPINLOCK(lru_lock);
70#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) 68#define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT)
71#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1) 69#define GFS2_GL_HASH_MASK (GFS2_GL_HASH_SIZE - 1)
72 70
73static struct gfs2_gl_hash_bucket gl_hash_table[GFS2_GL_HASH_SIZE]; 71static struct hlist_bl_head gl_hash_table[GFS2_GL_HASH_SIZE];
74static struct dentry *gfs2_root; 72static struct dentry *gfs2_root;
75 73
76/*
77 * Despite what you might think, the numbers below are not arbitrary :-)
78 * They are taken from the ipv4 routing hash code, which is well tested
79 * and thus should be nearly optimal. Later on we might tweek the numbers
80 * but for now this should be fine.
81 *
82 * The reason for putting the locks in a separate array from the list heads
83 * is that we can have fewer locks than list heads and save memory. We use
84 * the same hash function for both, but with a different hash mask.
85 */
86#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
87 defined(CONFIG_PROVE_LOCKING)
88
89#ifdef CONFIG_LOCKDEP
90# define GL_HASH_LOCK_SZ 256
91#else
92# if NR_CPUS >= 32
93# define GL_HASH_LOCK_SZ 4096
94# elif NR_CPUS >= 16
95# define GL_HASH_LOCK_SZ 2048
96# elif NR_CPUS >= 8
97# define GL_HASH_LOCK_SZ 1024
98# elif NR_CPUS >= 4
99# define GL_HASH_LOCK_SZ 512
100# else
101# define GL_HASH_LOCK_SZ 256
102# endif
103#endif
104
105/* We never want more locks than chains */
106#if GFS2_GL_HASH_SIZE < GL_HASH_LOCK_SZ
107# undef GL_HASH_LOCK_SZ
108# define GL_HASH_LOCK_SZ GFS2_GL_HASH_SIZE
109#endif
110
111static rwlock_t gl_hash_locks[GL_HASH_LOCK_SZ];
112
113static inline rwlock_t *gl_lock_addr(unsigned int x)
114{
115 return &gl_hash_locks[x & (GL_HASH_LOCK_SZ-1)];
116}
117#else /* not SMP, so no spinlocks required */
118static inline rwlock_t *gl_lock_addr(unsigned int x)
119{
120 return NULL;
121}
122#endif
123
124/** 74/**
125 * gl_hash() - Turn glock number into hash bucket number 75 * gl_hash() - Turn glock number into hash bucket number
126 * @lock: The glock number 76 * @lock: The glock number
@@ -141,25 +91,35 @@ static unsigned int gl_hash(const struct gfs2_sbd *sdp,
141 return h; 91 return h;
142} 92}
143 93
144/** 94static inline void spin_lock_bucket(unsigned int hash)
145 * glock_free() - Perform a few checks and then release struct gfs2_glock 95{
146 * @gl: The glock to release 96 struct hlist_bl_head *bl = &gl_hash_table[hash];
147 * 97 bit_spin_lock(0, (unsigned long *)bl);
148 * Also calls lock module to release its internal structure for this glock. 98}
149 *
150 */
151 99
152static void glock_free(struct gfs2_glock *gl) 100static inline void spin_unlock_bucket(unsigned int hash)
101{
102 struct hlist_bl_head *bl = &gl_hash_table[hash];
103 __bit_spin_unlock(0, (unsigned long *)bl);
104}
105
106static void gfs2_glock_dealloc(struct rcu_head *rcu)
107{
108 struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu);
109
110 if (gl->gl_ops->go_flags & GLOF_ASPACE)
111 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
112 else
113 kmem_cache_free(gfs2_glock_cachep, gl);
114}
115
116void gfs2_glock_free(struct gfs2_glock *gl)
153{ 117{
154 struct gfs2_sbd *sdp = gl->gl_sbd; 118 struct gfs2_sbd *sdp = gl->gl_sbd;
155 struct address_space *mapping = gfs2_glock2aspace(gl);
156 struct kmem_cache *cachep = gfs2_glock_cachep;
157 119
158 GLOCK_BUG_ON(gl, mapping && mapping->nrpages); 120 call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);
159 trace_gfs2_glock_put(gl); 121 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
160 if (mapping) 122 wake_up(&sdp->sd_glock_wait);
161 cachep = gfs2_glock_aspace_cachep;
162 sdp->sd_lockstruct.ls_ops->lm_put_lock(cachep, gl);
163} 123}
164 124
165/** 125/**
@@ -185,34 +145,49 @@ static int demote_ok(const struct gfs2_glock *gl)
185{ 145{
186 const struct gfs2_glock_operations *glops = gl->gl_ops; 146 const struct gfs2_glock_operations *glops = gl->gl_ops;
187 147
148 /* assert_spin_locked(&gl->gl_spin); */
149
188 if (gl->gl_state == LM_ST_UNLOCKED) 150 if (gl->gl_state == LM_ST_UNLOCKED)
189 return 0; 151 return 0;
190 if (!list_empty(&gl->gl_holders)) 152 if (test_bit(GLF_LFLUSH, &gl->gl_flags))
153 return 0;
154 if ((gl->gl_name.ln_type != LM_TYPE_INODE) &&
155 !list_empty(&gl->gl_holders))
191 return 0; 156 return 0;
192 if (glops->go_demote_ok) 157 if (glops->go_demote_ok)
193 return glops->go_demote_ok(gl); 158 return glops->go_demote_ok(gl);
194 return 1; 159 return 1;
195} 160}
196 161
162
197/** 163/**
198 * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list 164 * __gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list
199 * @gl: the glock 165 * @gl: the glock
200 * 166 *
167 * If the glock is demotable, then we add it (or move it) to the end
168 * of the glock LRU list.
201 */ 169 */
202 170
203static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) 171static void __gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
204{ 172{
205 int may_reclaim; 173 if (demote_ok(gl)) {
206 may_reclaim = (demote_ok(gl) && 174 spin_lock(&lru_lock);
207 (atomic_read(&gl->gl_ref) == 1 || 175
208 (gl->gl_name.ln_type == LM_TYPE_INODE && 176 if (!list_empty(&gl->gl_lru))
209 atomic_read(&gl->gl_ref) <= 2))); 177 list_del_init(&gl->gl_lru);
210 spin_lock(&lru_lock); 178 else
211 if (list_empty(&gl->gl_lru) && may_reclaim) { 179 atomic_inc(&lru_count);
180
212 list_add_tail(&gl->gl_lru, &lru_list); 181 list_add_tail(&gl->gl_lru, &lru_list);
213 atomic_inc(&lru_count); 182 spin_unlock(&lru_lock);
214 } 183 }
215 spin_unlock(&lru_lock); 184}
185
186void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
187{
188 spin_lock(&gl->gl_spin);
189 __gfs2_glock_schedule_for_reclaim(gl);
190 spin_unlock(&gl->gl_spin);
216} 191}
217 192
218/** 193/**
@@ -227,7 +202,6 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
227{ 202{
228 if (atomic_dec_and_test(&gl->gl_ref)) 203 if (atomic_dec_and_test(&gl->gl_ref))
229 GLOCK_BUG_ON(gl, 1); 204 GLOCK_BUG_ON(gl, 1);
230 gfs2_glock_schedule_for_reclaim(gl);
231} 205}
232 206
233/** 207/**
@@ -236,30 +210,26 @@ void gfs2_glock_put_nolock(struct gfs2_glock *gl)
236 * 210 *
237 */ 211 */
238 212
239int gfs2_glock_put(struct gfs2_glock *gl) 213void gfs2_glock_put(struct gfs2_glock *gl)
240{ 214{
241 int rv = 0; 215 struct gfs2_sbd *sdp = gl->gl_sbd;
216 struct address_space *mapping = gfs2_glock2aspace(gl);
242 217
243 write_lock(gl_lock_addr(gl->gl_hash)); 218 if (atomic_dec_and_test(&gl->gl_ref)) {
244 if (atomic_dec_and_lock(&gl->gl_ref, &lru_lock)) { 219 spin_lock_bucket(gl->gl_hash);
245 hlist_del(&gl->gl_list); 220 hlist_bl_del_rcu(&gl->gl_list);
221 spin_unlock_bucket(gl->gl_hash);
222 spin_lock(&lru_lock);
246 if (!list_empty(&gl->gl_lru)) { 223 if (!list_empty(&gl->gl_lru)) {
247 list_del_init(&gl->gl_lru); 224 list_del_init(&gl->gl_lru);
248 atomic_dec(&lru_count); 225 atomic_dec(&lru_count);
249 } 226 }
250 spin_unlock(&lru_lock); 227 spin_unlock(&lru_lock);
251 write_unlock(gl_lock_addr(gl->gl_hash));
252 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); 228 GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
253 glock_free(gl); 229 GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
254 rv = 1; 230 trace_gfs2_glock_put(gl);
255 goto out; 231 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
256 } 232 }
257 spin_lock(&gl->gl_spin);
258 gfs2_glock_schedule_for_reclaim(gl);
259 spin_unlock(&gl->gl_spin);
260 write_unlock(gl_lock_addr(gl->gl_hash));
261out:
262 return rv;
263} 233}
264 234
265/** 235/**
@@ -275,17 +245,15 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
275 const struct lm_lockname *name) 245 const struct lm_lockname *name)
276{ 246{
277 struct gfs2_glock *gl; 247 struct gfs2_glock *gl;
278 struct hlist_node *h; 248 struct hlist_bl_node *h;
279 249
280 hlist_for_each_entry(gl, h, &gl_hash_table[hash].hb_list, gl_list) { 250 hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
281 if (!lm_name_equal(&gl->gl_name, name)) 251 if (!lm_name_equal(&gl->gl_name, name))
282 continue; 252 continue;
283 if (gl->gl_sbd != sdp) 253 if (gl->gl_sbd != sdp)
284 continue; 254 continue;
285 255 if (atomic_inc_not_zero(&gl->gl_ref))
286 atomic_inc(&gl->gl_ref); 256 return gl;
287
288 return gl;
289 } 257 }
290 258
291 return NULL; 259 return NULL;
@@ -743,10 +711,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
743 struct gfs2_glock *gl, *tmp; 711 struct gfs2_glock *gl, *tmp;
744 unsigned int hash = gl_hash(sdp, &name); 712 unsigned int hash = gl_hash(sdp, &name);
745 struct address_space *mapping; 713 struct address_space *mapping;
714 struct kmem_cache *cachep;
746 715
747 read_lock(gl_lock_addr(hash)); 716 rcu_read_lock();
748 gl = search_bucket(hash, sdp, &name); 717 gl = search_bucket(hash, sdp, &name);
749 read_unlock(gl_lock_addr(hash)); 718 rcu_read_unlock();
750 719
751 *glp = gl; 720 *glp = gl;
752 if (gl) 721 if (gl)
@@ -755,9 +724,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
755 return -ENOENT; 724 return -ENOENT;
756 725
757 if (glops->go_flags & GLOF_ASPACE) 726 if (glops->go_flags & GLOF_ASPACE)
758 gl = kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_KERNEL); 727 cachep = gfs2_glock_aspace_cachep;
759 else 728 else
760 gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_KERNEL); 729 cachep = gfs2_glock_cachep;
730 gl = kmem_cache_alloc(cachep, GFP_KERNEL);
761 if (!gl) 731 if (!gl)
762 return -ENOMEM; 732 return -ENOMEM;
763 733
@@ -790,15 +760,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
790 mapping->writeback_index = 0; 760 mapping->writeback_index = 0;
791 } 761 }
792 762
793 write_lock(gl_lock_addr(hash)); 763 spin_lock_bucket(hash);
794 tmp = search_bucket(hash, sdp, &name); 764 tmp = search_bucket(hash, sdp, &name);
795 if (tmp) { 765 if (tmp) {
796 write_unlock(gl_lock_addr(hash)); 766 spin_unlock_bucket(hash);
797 glock_free(gl); 767 kmem_cache_free(cachep, gl);
768 atomic_dec(&sdp->sd_glock_disposal);
798 gl = tmp; 769 gl = tmp;
799 } else { 770 } else {
800 hlist_add_head(&gl->gl_list, &gl_hash_table[hash].hb_list); 771 hlist_bl_add_head_rcu(&gl->gl_list, &gl_hash_table[hash]);
801 write_unlock(gl_lock_addr(hash)); 772 spin_unlock_bucket(hash);
802 } 773 }
803 774
804 *glp = gl; 775 *glp = gl;
@@ -1007,13 +978,13 @@ fail:
1007 insert_pt = &gh2->gh_list; 978 insert_pt = &gh2->gh_list;
1008 } 979 }
1009 set_bit(GLF_QUEUED, &gl->gl_flags); 980 set_bit(GLF_QUEUED, &gl->gl_flags);
981 trace_gfs2_glock_queue(gh, 1);
1010 if (likely(insert_pt == NULL)) { 982 if (likely(insert_pt == NULL)) {
1011 list_add_tail(&gh->gh_list, &gl->gl_holders); 983 list_add_tail(&gh->gh_list, &gl->gl_holders);
1012 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 984 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
1013 goto do_cancel; 985 goto do_cancel;
1014 return; 986 return;
1015 } 987 }
1016 trace_gfs2_glock_queue(gh, 1);
1017 list_add_tail(&gh->gh_list, insert_pt); 988 list_add_tail(&gh->gh_list, insert_pt);
1018do_cancel: 989do_cancel:
1019 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list); 990 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
@@ -1113,6 +1084,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
1113 !test_bit(GLF_DEMOTE, &gl->gl_flags)) 1084 !test_bit(GLF_DEMOTE, &gl->gl_flags))
1114 fast_path = 1; 1085 fast_path = 1;
1115 } 1086 }
1087 __gfs2_glock_schedule_for_reclaim(gl);
1116 trace_gfs2_glock_queue(gh, 0); 1088 trace_gfs2_glock_queue(gh, 0);
1117 spin_unlock(&gl->gl_spin); 1089 spin_unlock(&gl->gl_spin);
1118 if (likely(fast_path)) 1090 if (likely(fast_path))
@@ -1151,7 +1123,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
1151 * @number: the lock number 1123 * @number: the lock number
1152 * @glops: the glock operations for the type of glock 1124 * @glops: the glock operations for the type of glock
1153 * @state: the state to acquire the glock in 1125 * @state: the state to acquire the glock in
1154 * @flags: modifier flags for the aquisition 1126 * @flags: modifier flags for the acquisition
1155 * @gh: the struct gfs2_holder 1127 * @gh: the struct gfs2_holder
1156 * 1128 *
1157 * Returns: errno 1129 * Returns: errno
@@ -1276,10 +1248,8 @@ int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1276 1248
1277void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) 1249void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1278{ 1250{
1279 unsigned int x; 1251 while (num_gh--)
1280 1252 gfs2_glock_dq(&ghs[num_gh]);
1281 for (x = 0; x < num_gh; x++)
1282 gfs2_glock_dq(&ghs[x]);
1283} 1253}
1284 1254
1285/** 1255/**
@@ -1291,10 +1261,8 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
1291 1261
1292void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs) 1262void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1293{ 1263{
1294 unsigned int x; 1264 while (num_gh--)
1295 1265 gfs2_glock_dq_uninit(&ghs[num_gh]);
1296 for (x = 0; x < num_gh; x++)
1297 gfs2_glock_dq_uninit(&ghs[x]);
1298} 1266}
1299 1267
1300void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) 1268void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
@@ -1440,42 +1408,30 @@ static struct shrinker glock_shrinker = {
1440 * @sdp: the filesystem 1408 * @sdp: the filesystem
1441 * @bucket: the bucket 1409 * @bucket: the bucket
1442 * 1410 *
1443 * Returns: 1 if the bucket has entries
1444 */ 1411 */
1445 1412
1446static int examine_bucket(glock_examiner examiner, struct gfs2_sbd *sdp, 1413static void examine_bucket(glock_examiner examiner, const struct gfs2_sbd *sdp,
1447 unsigned int hash) 1414 unsigned int hash)
1448{ 1415{
1449 struct gfs2_glock *gl, *prev = NULL; 1416 struct gfs2_glock *gl;
1450 int has_entries = 0; 1417 struct hlist_bl_head *head = &gl_hash_table[hash];
1451 struct hlist_head *head = &gl_hash_table[hash].hb_list; 1418 struct hlist_bl_node *pos;
1452 1419
1453 read_lock(gl_lock_addr(hash)); 1420 rcu_read_lock();
1454 /* Can't use hlist_for_each_entry - don't want prefetch here */ 1421 hlist_bl_for_each_entry_rcu(gl, pos, head, gl_list) {
1455 if (hlist_empty(head)) 1422 if ((gl->gl_sbd == sdp) && atomic_read(&gl->gl_ref))
1456 goto out;
1457 gl = list_entry(head->first, struct gfs2_glock, gl_list);
1458 while(1) {
1459 if (!sdp || gl->gl_sbd == sdp) {
1460 gfs2_glock_hold(gl);
1461 read_unlock(gl_lock_addr(hash));
1462 if (prev)
1463 gfs2_glock_put(prev);
1464 prev = gl;
1465 examiner(gl); 1423 examiner(gl);
1466 has_entries = 1;
1467 read_lock(gl_lock_addr(hash));
1468 }
1469 if (gl->gl_list.next == NULL)
1470 break;
1471 gl = list_entry(gl->gl_list.next, struct gfs2_glock, gl_list);
1472 } 1424 }
1473out: 1425 rcu_read_unlock();
1474 read_unlock(gl_lock_addr(hash));
1475 if (prev)
1476 gfs2_glock_put(prev);
1477 cond_resched(); 1426 cond_resched();
1478 return has_entries; 1427}
1428
1429static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
1430{
1431 unsigned x;
1432
1433 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1434 examine_bucket(examiner, sdp, x);
1479} 1435}
1480 1436
1481 1437
@@ -1529,10 +1485,21 @@ static void clear_glock(struct gfs2_glock *gl)
1529 1485
1530void gfs2_glock_thaw(struct gfs2_sbd *sdp) 1486void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1531{ 1487{
1532 unsigned x; 1488 glock_hash_walk(thaw_glock, sdp);
1489}
1533 1490
1534 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) 1491static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1535 examine_bucket(thaw_glock, sdp, x); 1492{
1493 int ret;
1494 spin_lock(&gl->gl_spin);
1495 ret = __dump_glock(seq, gl);
1496 spin_unlock(&gl->gl_spin);
1497 return ret;
1498}
1499
1500static void dump_glock_func(struct gfs2_glock *gl)
1501{
1502 dump_glock(NULL, gl);
1536} 1503}
1537 1504
1538/** 1505/**
@@ -1545,13 +1512,10 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
1545 1512
1546void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) 1513void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
1547{ 1514{
1548 unsigned int x; 1515 glock_hash_walk(clear_glock, sdp);
1549
1550 for (x = 0; x < GFS2_GL_HASH_SIZE; x++)
1551 examine_bucket(clear_glock, sdp, x);
1552 flush_workqueue(glock_workqueue); 1516 flush_workqueue(glock_workqueue);
1553 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); 1517 wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
1554 gfs2_dump_lockstate(sdp); 1518 glock_hash_walk(dump_glock_func, sdp);
1555} 1519}
1556 1520
1557void gfs2_glock_finish_truncate(struct gfs2_inode *ip) 1521void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
@@ -1717,73 +1681,22 @@ out:
1717 return error; 1681 return error;
1718} 1682}
1719 1683
1720static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
1721{
1722 int ret;
1723 spin_lock(&gl->gl_spin);
1724 ret = __dump_glock(seq, gl);
1725 spin_unlock(&gl->gl_spin);
1726 return ret;
1727}
1728 1684
1729/**
1730 * gfs2_dump_lockstate - print out the current lockstate
1731 * @sdp: the filesystem
1732 * @ub: the buffer to copy the information into
1733 *
1734 * If @ub is NULL, dump the lockstate to the console.
1735 *
1736 */
1737
1738static int gfs2_dump_lockstate(struct gfs2_sbd *sdp)
1739{
1740 struct gfs2_glock *gl;
1741 struct hlist_node *h;
1742 unsigned int x;
1743 int error = 0;
1744
1745 for (x = 0; x < GFS2_GL_HASH_SIZE; x++) {
1746
1747 read_lock(gl_lock_addr(x));
1748
1749 hlist_for_each_entry(gl, h, &gl_hash_table[x].hb_list, gl_list) {
1750 if (gl->gl_sbd != sdp)
1751 continue;
1752
1753 error = dump_glock(NULL, gl);
1754 if (error)
1755 break;
1756 }
1757
1758 read_unlock(gl_lock_addr(x));
1759
1760 if (error)
1761 break;
1762 }
1763
1764
1765 return error;
1766}
1767 1685
1768 1686
1769int __init gfs2_glock_init(void) 1687int __init gfs2_glock_init(void)
1770{ 1688{
1771 unsigned i; 1689 unsigned i;
1772 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) { 1690 for(i = 0; i < GFS2_GL_HASH_SIZE; i++) {
1773 INIT_HLIST_HEAD(&gl_hash_table[i].hb_list); 1691 INIT_HLIST_BL_HEAD(&gl_hash_table[i]);
1774 }
1775#ifdef GL_HASH_LOCK_SZ
1776 for(i = 0; i < GL_HASH_LOCK_SZ; i++) {
1777 rwlock_init(&gl_hash_locks[i]);
1778 } 1692 }
1779#endif
1780 1693
1781 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | 1694 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM |
1782 WQ_HIGHPRI | WQ_FREEZEABLE, 0); 1695 WQ_HIGHPRI | WQ_FREEZABLE, 0);
1783 if (IS_ERR(glock_workqueue)) 1696 if (IS_ERR(glock_workqueue))
1784 return PTR_ERR(glock_workqueue); 1697 return PTR_ERR(glock_workqueue);
1785 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", 1698 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue",
1786 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 1699 WQ_MEM_RECLAIM | WQ_FREEZABLE,
1787 0); 1700 0);
1788 if (IS_ERR(gfs2_delete_workqueue)) { 1701 if (IS_ERR(gfs2_delete_workqueue)) {
1789 destroy_workqueue(glock_workqueue); 1702 destroy_workqueue(glock_workqueue);
@@ -1802,62 +1715,54 @@ void gfs2_glock_exit(void)
1802 destroy_workqueue(gfs2_delete_workqueue); 1715 destroy_workqueue(gfs2_delete_workqueue);
1803} 1716}
1804 1717
1718static inline struct gfs2_glock *glock_hash_chain(unsigned hash)
1719{
1720 return hlist_bl_entry(hlist_bl_first_rcu(&gl_hash_table[hash]),
1721 struct gfs2_glock, gl_list);
1722}
1723
1724static inline struct gfs2_glock *glock_hash_next(struct gfs2_glock *gl)
1725{
1726 return hlist_bl_entry(rcu_dereference(gl->gl_list.next),
1727 struct gfs2_glock, gl_list);
1728}
1729
1805static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) 1730static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1806{ 1731{
1807 struct gfs2_glock *gl; 1732 struct gfs2_glock *gl;
1808 1733
1809restart: 1734 do {
1810 read_lock(gl_lock_addr(gi->hash)); 1735 gl = gi->gl;
1811 gl = gi->gl; 1736 if (gl) {
1812 if (gl) { 1737 gi->gl = glock_hash_next(gl);
1813 gi->gl = hlist_entry(gl->gl_list.next, 1738 } else {
1814 struct gfs2_glock, gl_list); 1739 gi->gl = glock_hash_chain(gi->hash);
1815 } else { 1740 }
1816 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, 1741 while (gi->gl == NULL) {
1817 struct gfs2_glock, gl_list); 1742 gi->hash++;
1818 } 1743 if (gi->hash >= GFS2_GL_HASH_SIZE) {
1819 if (gi->gl) 1744 rcu_read_unlock();
1820 gfs2_glock_hold(gi->gl); 1745 return 1;
1821 read_unlock(gl_lock_addr(gi->hash)); 1746 }
1822 if (gl) 1747 gi->gl = glock_hash_chain(gi->hash);
1823 gfs2_glock_put(gl); 1748 }
1824 while (gi->gl == NULL) { 1749 /* Skip entries for other sb and dead entries */
1825 gi->hash++; 1750 } while (gi->sdp != gi->gl->gl_sbd || atomic_read(&gi->gl->gl_ref) == 0);
1826 if (gi->hash >= GFS2_GL_HASH_SIZE)
1827 return 1;
1828 read_lock(gl_lock_addr(gi->hash));
1829 gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first,
1830 struct gfs2_glock, gl_list);
1831 if (gi->gl)
1832 gfs2_glock_hold(gi->gl);
1833 read_unlock(gl_lock_addr(gi->hash));
1834 }
1835
1836 if (gi->sdp != gi->gl->gl_sbd)
1837 goto restart;
1838 1751
1839 return 0; 1752 return 0;
1840} 1753}
1841 1754
1842static void gfs2_glock_iter_free(struct gfs2_glock_iter *gi)
1843{
1844 if (gi->gl)
1845 gfs2_glock_put(gi->gl);
1846 gi->gl = NULL;
1847}
1848
1849static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) 1755static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos)
1850{ 1756{
1851 struct gfs2_glock_iter *gi = seq->private; 1757 struct gfs2_glock_iter *gi = seq->private;
1852 loff_t n = *pos; 1758 loff_t n = *pos;
1853 1759
1854 gi->hash = 0; 1760 gi->hash = 0;
1761 rcu_read_lock();
1855 1762
1856 do { 1763 do {
1857 if (gfs2_glock_iter_next(gi)) { 1764 if (gfs2_glock_iter_next(gi))
1858 gfs2_glock_iter_free(gi);
1859 return NULL; 1765 return NULL;
1860 }
1861 } while (n--); 1766 } while (n--);
1862 1767
1863 return gi->gl; 1768 return gi->gl;
@@ -1870,10 +1775,8 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1870 1775
1871 (*pos)++; 1776 (*pos)++;
1872 1777
1873 if (gfs2_glock_iter_next(gi)) { 1778 if (gfs2_glock_iter_next(gi))
1874 gfs2_glock_iter_free(gi);
1875 return NULL; 1779 return NULL;
1876 }
1877 1780
1878 return gi->gl; 1781 return gi->gl;
1879} 1782}
@@ -1881,7 +1784,10 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr,
1881static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) 1784static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
1882{ 1785{
1883 struct gfs2_glock_iter *gi = seq->private; 1786 struct gfs2_glock_iter *gi = seq->private;
1884 gfs2_glock_iter_free(gi); 1787
1788 if (gi->gl)
1789 rcu_read_unlock();
1790 gi->gl = NULL;
1885} 1791}
1886 1792
1887static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) 1793static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 691851ceb615..aea160690e94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -118,7 +118,7 @@ struct lm_lockops {
118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); 118 int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
119 void (*lm_unmount) (struct gfs2_sbd *sdp); 119 void (*lm_unmount) (struct gfs2_sbd *sdp);
120 void (*lm_withdraw) (struct gfs2_sbd *sdp); 120 void (*lm_withdraw) (struct gfs2_sbd *sdp);
121 void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); 121 void (*lm_put_lock) (struct gfs2_glock *gl);
122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, 122 int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state,
123 unsigned int flags); 123 unsigned int flags);
124 void (*lm_cancel) (struct gfs2_glock *gl); 124 void (*lm_cancel) (struct gfs2_glock *gl);
@@ -174,7 +174,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp,
174 int create, struct gfs2_glock **glp); 174 int create, struct gfs2_glock **glp);
175void gfs2_glock_hold(struct gfs2_glock *gl); 175void gfs2_glock_hold(struct gfs2_glock *gl);
176void gfs2_glock_put_nolock(struct gfs2_glock *gl); 176void gfs2_glock_put_nolock(struct gfs2_glock *gl);
177int gfs2_glock_put(struct gfs2_glock *gl); 177void gfs2_glock_put(struct gfs2_glock *gl);
178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 178void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
179 struct gfs2_holder *gh); 179 struct gfs2_holder *gh);
180void gfs2_holder_reinit(unsigned int state, unsigned flags, 180void gfs2_holder_reinit(unsigned int state, unsigned flags,
@@ -223,25 +223,22 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
223 return error; 223 return error;
224} 224}
225 225
226/* Lock Value Block functions */ 226extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state);
227 227extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret);
228int gfs2_lvb_hold(struct gfs2_glock *gl); 228extern void gfs2_reclaim_glock(struct gfs2_sbd *sdp);
229void gfs2_lvb_unhold(struct gfs2_glock *gl); 229extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp);
230 230extern void gfs2_glock_finish_truncate(struct gfs2_inode *ip);
231void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); 231extern void gfs2_glock_thaw(struct gfs2_sbd *sdp);
232void gfs2_glock_complete(struct gfs2_glock *gl, int ret); 232extern void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl);
233void gfs2_reclaim_glock(struct gfs2_sbd *sdp); 233extern void gfs2_glock_free(struct gfs2_glock *gl);
234void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); 234
235void gfs2_glock_finish_truncate(struct gfs2_inode *ip); 235extern int __init gfs2_glock_init(void);
236void gfs2_glock_thaw(struct gfs2_sbd *sdp); 236extern void gfs2_glock_exit(void);
237 237
238int __init gfs2_glock_init(void); 238extern int gfs2_create_debugfs_file(struct gfs2_sbd *sdp);
239void gfs2_glock_exit(void); 239extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
240 240extern int gfs2_register_debugfs(void);
241int gfs2_create_debugfs_file(struct gfs2_sbd *sdp); 241extern void gfs2_unregister_debugfs(void);
242void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp);
243int gfs2_register_debugfs(void);
244void gfs2_unregister_debugfs(void);
245 242
246extern const struct lm_lockops gfs2_dlm_ops; 243extern const struct lm_lockops gfs2_dlm_ops;
247 244
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 263561bf1a50..3754e3cbf02b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -56,20 +56,26 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
56 BUG_ON(current->journal_info); 56 BUG_ON(current->journal_info);
57 current->journal_info = &tr; 57 current->journal_info = &tr;
58 58
59 gfs2_log_lock(sdp); 59 spin_lock(&sdp->sd_ail_lock);
60 while (!list_empty(head)) { 60 while (!list_empty(head)) {
61 bd = list_entry(head->next, struct gfs2_bufdata, 61 bd = list_entry(head->next, struct gfs2_bufdata,
62 bd_ail_gl_list); 62 bd_ail_gl_list);
63 bh = bd->bd_bh; 63 bh = bd->bd_bh;
64 gfs2_remove_from_ail(bd); 64 gfs2_remove_from_ail(bd);
65 spin_unlock(&sdp->sd_ail_lock);
66
65 bd->bd_bh = NULL; 67 bd->bd_bh = NULL;
66 bh->b_private = NULL; 68 bh->b_private = NULL;
67 bd->bd_blkno = bh->b_blocknr; 69 bd->bd_blkno = bh->b_blocknr;
70 gfs2_log_lock(sdp);
68 gfs2_assert_withdraw(sdp, !buffer_busy(bh)); 71 gfs2_assert_withdraw(sdp, !buffer_busy(bh));
69 gfs2_trans_add_revoke(sdp, bd); 72 gfs2_trans_add_revoke(sdp, bd);
73 gfs2_log_unlock(sdp);
74
75 spin_lock(&sdp->sd_ail_lock);
70 } 76 }
71 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); 77 gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
72 gfs2_log_unlock(sdp); 78 spin_unlock(&sdp->sd_ail_lock);
73 79
74 gfs2_trans_end(sdp); 80 gfs2_trans_end(sdp);
75 gfs2_log_flush(sdp, NULL); 81 gfs2_log_flush(sdp, NULL);
@@ -206,8 +212,17 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
206static int inode_go_demote_ok(const struct gfs2_glock *gl) 212static int inode_go_demote_ok(const struct gfs2_glock *gl)
207{ 213{
208 struct gfs2_sbd *sdp = gl->gl_sbd; 214 struct gfs2_sbd *sdp = gl->gl_sbd;
215 struct gfs2_holder *gh;
216
209 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) 217 if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object)
210 return 0; 218 return 0;
219
220 if (!list_empty(&gl->gl_holders)) {
221 gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
222 if (gh->gh_list.next != &gl->gl_holders)
223 return 0;
224 }
225
211 return 1; 226 return 1;
212} 227}
213 228
@@ -272,19 +287,6 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
272} 287}
273 288
274/** 289/**
275 * rgrp_go_demote_ok - Check to see if it's ok to unlock a RG's glock
276 * @gl: the glock
277 *
278 * Returns: 1 if it's ok
279 */
280
281static int rgrp_go_demote_ok(const struct gfs2_glock *gl)
282{
283 const struct address_space *mapping = (const struct address_space *)(gl + 1);
284 return !mapping->nrpages;
285}
286
287/**
288 * rgrp_go_lock - operation done after an rgrp lock is locked by 290 * rgrp_go_lock - operation done after an rgrp lock is locked by
289 * a first holder on this node. 291 * a first holder on this node.
290 * @gl: the glock 292 * @gl: the glock
@@ -410,7 +412,6 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
410const struct gfs2_glock_operations gfs2_rgrp_glops = { 412const struct gfs2_glock_operations gfs2_rgrp_glops = {
411 .go_xmote_th = rgrp_go_sync, 413 .go_xmote_th = rgrp_go_sync,
412 .go_inval = rgrp_go_inval, 414 .go_inval = rgrp_go_inval,
413 .go_demote_ok = rgrp_go_demote_ok,
414 .go_lock = rgrp_go_lock, 415 .go_lock = rgrp_go_lock,
415 .go_unlock = rgrp_go_unlock, 416 .go_unlock = rgrp_go_unlock,
416 .go_dump = gfs2_rgrp_dump, 417 .go_dump = gfs2_rgrp_dump,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a79790c06275..870a89d6d4dc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -15,6 +15,8 @@
15#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/dlm.h> 16#include <linux/dlm.h>
17#include <linux/buffer_head.h> 17#include <linux/buffer_head.h>
18#include <linux/rcupdate.h>
19#include <linux/rculist_bl.h>
18 20
19#define DIO_WAIT 0x00000010 21#define DIO_WAIT 0x00000010
20#define DIO_METADATA 0x00000020 22#define DIO_METADATA 0x00000020
@@ -201,7 +203,7 @@ enum {
201}; 203};
202 204
203struct gfs2_glock { 205struct gfs2_glock {
204 struct hlist_node gl_list; 206 struct hlist_bl_node gl_list;
205 unsigned long gl_flags; /* GLF_... */ 207 unsigned long gl_flags; /* GLF_... */
206 struct lm_lockname gl_name; 208 struct lm_lockname gl_name;
207 atomic_t gl_ref; 209 atomic_t gl_ref;
@@ -234,6 +236,7 @@ struct gfs2_glock {
234 atomic_t gl_ail_count; 236 atomic_t gl_ail_count;
235 struct delayed_work gl_work; 237 struct delayed_work gl_work;
236 struct work_struct gl_delete; 238 struct work_struct gl_delete;
239 struct rcu_head gl_rcu;
237}; 240};
238 241
239#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 242#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
@@ -314,6 +317,7 @@ enum {
314 QDF_USER = 0, 317 QDF_USER = 0,
315 QDF_CHANGE = 1, 318 QDF_CHANGE = 1,
316 QDF_LOCKED = 2, 319 QDF_LOCKED = 2,
320 QDF_REFRESH = 3,
317}; 321};
318 322
319struct gfs2_quota_data { 323struct gfs2_quota_data {
@@ -647,6 +651,7 @@ struct gfs2_sbd {
647 unsigned int sd_log_flush_head; 651 unsigned int sd_log_flush_head;
648 u64 sd_log_flush_wrapped; 652 u64 sd_log_flush_wrapped;
649 653
654 spinlock_t sd_ail_lock;
650 struct list_head sd_ail1_list; 655 struct list_head sd_ail1_list;
651 struct list_head sd_ail2_list; 656 struct list_head sd_ail2_list;
652 u64 sd_ail_sync_gen; 657 u64 sd_ail_sync_gen;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7aa7d4f8984a..97d54a28776a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -763,14 +763,15 @@ fail:
763 return error; 763 return error;
764} 764}
765 765
766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) 766static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
767 const struct qstr *qstr)
767{ 768{
768 int err; 769 int err;
769 size_t len; 770 size_t len;
770 void *value; 771 void *value;
771 char *name; 772 char *name;
772 773
773 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, 774 err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
774 &name, &value, &len); 775 &name, &value, &len);
775 776
776 if (err) { 777 if (err) {
@@ -854,7 +855,7 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
854 if (error) 855 if (error)
855 goto fail_gunlock2; 856 goto fail_gunlock2;
856 857
857 error = gfs2_security_init(dip, GFS2_I(inode)); 858 error = gfs2_security_init(dip, GFS2_I(inode), name);
858 if (error) 859 if (error)
859 goto fail_gunlock2; 860 goto fail_gunlock2;
860 861
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6e493aee28f8..98c80d8c2a62 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -22,7 +22,6 @@ static void gdlm_ast(void *arg)
22{ 22{
23 struct gfs2_glock *gl = arg; 23 struct gfs2_glock *gl = arg;
24 unsigned ret = gl->gl_state; 24 unsigned ret = gl->gl_state;
25 struct gfs2_sbd *sdp = gl->gl_sbd;
26 25
27 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); 26 BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED);
28 27
@@ -31,12 +30,7 @@ static void gdlm_ast(void *arg)
31 30
32 switch (gl->gl_lksb.sb_status) { 31 switch (gl->gl_lksb.sb_status) {
33 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ 32 case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
34 if (gl->gl_ops->go_flags & GLOF_ASPACE) 33 gfs2_glock_free(gl);
35 kmem_cache_free(gfs2_glock_aspace_cachep, gl);
36 else
37 kmem_cache_free(gfs2_glock_cachep, gl);
38 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
39 wake_up(&sdp->sd_glock_wait);
40 return; 34 return;
41 case -DLM_ECANCEL: /* Cancel while getting lock */ 35 case -DLM_ECANCEL: /* Cancel while getting lock */
42 ret |= LM_OUT_CANCELED; 36 ret |= LM_OUT_CANCELED;
@@ -164,16 +158,14 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
164 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); 158 GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast);
165} 159}
166 160
167static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) 161static void gdlm_put_lock(struct gfs2_glock *gl)
168{ 162{
169 struct gfs2_sbd *sdp = gl->gl_sbd; 163 struct gfs2_sbd *sdp = gl->gl_sbd;
170 struct lm_lockstruct *ls = &sdp->sd_lockstruct; 164 struct lm_lockstruct *ls = &sdp->sd_lockstruct;
171 int error; 165 int error;
172 166
173 if (gl->gl_lksb.sb_lkid == 0) { 167 if (gl->gl_lksb.sb_lkid == 0) {
174 kmem_cache_free(cachep, gl); 168 gfs2_glock_free(gl);
175 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
176 wake_up(&sdp->sd_glock_wait);
177 return; 169 return;
178 } 170 }
179 171
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index eb01f3575e10..5b102c1887fd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -67,7 +67,7 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
67 * @mapping: The associated mapping (maybe NULL) 67 * @mapping: The associated mapping (maybe NULL)
68 * @bd: The gfs2_bufdata to remove 68 * @bd: The gfs2_bufdata to remove
69 * 69 *
70 * The log lock _must_ be held when calling this function 70 * The ail lock _must_ be held when calling this function
71 * 71 *
72 */ 72 */
73 73
@@ -88,8 +88,8 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
88 */ 88 */
89 89
90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai) 90static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
91__releases(&sdp->sd_log_lock) 91__releases(&sdp->sd_ail_lock)
92__acquires(&sdp->sd_log_lock) 92__acquires(&sdp->sd_ail_lock)
93{ 93{
94 struct gfs2_bufdata *bd, *s; 94 struct gfs2_bufdata *bd, *s;
95 struct buffer_head *bh; 95 struct buffer_head *bh;
@@ -117,16 +117,16 @@ __acquires(&sdp->sd_log_lock)
117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list); 117 list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);
118 118
119 get_bh(bh); 119 get_bh(bh);
120 gfs2_log_unlock(sdp); 120 spin_unlock(&sdp->sd_ail_lock);
121 lock_buffer(bh); 121 lock_buffer(bh);
122 if (test_clear_buffer_dirty(bh)) { 122 if (test_clear_buffer_dirty(bh)) {
123 bh->b_end_io = end_buffer_write_sync; 123 bh->b_end_io = end_buffer_write_sync;
124 submit_bh(WRITE_SYNC_PLUG, bh); 124 submit_bh(WRITE_SYNC, bh);
125 } else { 125 } else {
126 unlock_buffer(bh); 126 unlock_buffer(bh);
127 brelse(bh); 127 brelse(bh);
128 } 128 }
129 gfs2_log_lock(sdp); 129 spin_lock(&sdp->sd_ail_lock);
130 130
131 retry = 1; 131 retry = 1;
132 break; 132 break;
@@ -175,10 +175,10 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
175 struct gfs2_ail *ai; 175 struct gfs2_ail *ai;
176 int done = 0; 176 int done = 0;
177 177
178 gfs2_log_lock(sdp); 178 spin_lock(&sdp->sd_ail_lock);
179 head = &sdp->sd_ail1_list; 179 head = &sdp->sd_ail1_list;
180 if (list_empty(head)) { 180 if (list_empty(head)) {
181 gfs2_log_unlock(sdp); 181 spin_unlock(&sdp->sd_ail_lock);
182 return; 182 return;
183 } 183 }
184 sync_gen = sdp->sd_ail_sync_gen++; 184 sync_gen = sdp->sd_ail_sync_gen++;
@@ -189,13 +189,13 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp)
189 if (ai->ai_sync_gen >= sync_gen) 189 if (ai->ai_sync_gen >= sync_gen)
190 continue; 190 continue;
191 ai->ai_sync_gen = sync_gen; 191 ai->ai_sync_gen = sync_gen;
192 gfs2_ail1_start_one(sdp, ai); /* This may drop log lock */ 192 gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
193 done = 0; 193 done = 0;
194 break; 194 break;
195 } 195 }
196 } 196 }
197 197
198 gfs2_log_unlock(sdp); 198 spin_unlock(&sdp->sd_ail_lock);
199} 199}
200 200
201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags) 201static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
@@ -203,7 +203,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
203 struct gfs2_ail *ai, *s; 203 struct gfs2_ail *ai, *s;
204 int ret; 204 int ret;
205 205
206 gfs2_log_lock(sdp); 206 spin_lock(&sdp->sd_ail_lock);
207 207
208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) { 208 list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
209 if (gfs2_ail1_empty_one(sdp, ai, flags)) 209 if (gfs2_ail1_empty_one(sdp, ai, flags))
@@ -214,7 +214,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
214 214
215 ret = list_empty(&sdp->sd_ail1_list); 215 ret = list_empty(&sdp->sd_ail1_list);
216 216
217 gfs2_log_unlock(sdp); 217 spin_unlock(&sdp->sd_ail_lock);
218 218
219 return ret; 219 return ret;
220} 220}
@@ -247,7 +247,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
247 int wrap = (new_tail < old_tail); 247 int wrap = (new_tail < old_tail);
248 int a, b, rm; 248 int a, b, rm;
249 249
250 gfs2_log_lock(sdp); 250 spin_lock(&sdp->sd_ail_lock);
251 251
252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) { 252 list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
253 a = (old_tail <= ai->ai_first); 253 a = (old_tail <= ai->ai_first);
@@ -263,7 +263,7 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
263 kfree(ai); 263 kfree(ai);
264 } 264 }
265 265
266 gfs2_log_unlock(sdp); 266 spin_unlock(&sdp->sd_ail_lock);
267} 267}
268 268
269/** 269/**
@@ -421,7 +421,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
421 struct gfs2_ail *ai; 421 struct gfs2_ail *ai;
422 unsigned int tail; 422 unsigned int tail;
423 423
424 gfs2_log_lock(sdp); 424 spin_lock(&sdp->sd_ail_lock);
425 425
426 if (list_empty(&sdp->sd_ail1_list)) { 426 if (list_empty(&sdp->sd_ail1_list)) {
427 tail = sdp->sd_log_head; 427 tail = sdp->sd_log_head;
@@ -430,7 +430,7 @@ static unsigned int current_tail(struct gfs2_sbd *sdp)
430 tail = ai->ai_first; 430 tail = ai->ai_first;
431 } 431 }
432 432
433 gfs2_log_unlock(sdp); 433 spin_unlock(&sdp->sd_ail_lock);
434 434
435 return tail; 435 return tail;
436} 436}
@@ -647,7 +647,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
647 lock_buffer(bh); 647 lock_buffer(bh);
648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { 648 if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
649 bh->b_end_io = end_buffer_write_sync; 649 bh->b_end_io = end_buffer_write_sync;
650 submit_bh(WRITE_SYNC_PLUG, bh); 650 submit_bh(WRITE_SYNC, bh);
651 } else { 651 } else {
652 unlock_buffer(bh); 652 unlock_buffer(bh);
653 brelse(bh); 653 brelse(bh);
@@ -743,10 +743,12 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
743 sdp->sd_log_commited_databuf = 0; 743 sdp->sd_log_commited_databuf = 0;
744 sdp->sd_log_commited_revoke = 0; 744 sdp->sd_log_commited_revoke = 0;
745 745
746 spin_lock(&sdp->sd_ail_lock);
746 if (!list_empty(&ai->ai_ail1_list)) { 747 if (!list_empty(&ai->ai_ail1_list)) {
747 list_add(&ai->ai_list, &sdp->sd_ail1_list); 748 list_add(&ai->ai_list, &sdp->sd_ail1_list);
748 ai = NULL; 749 ai = NULL;
749 } 750 }
751 spin_unlock(&sdp->sd_ail_lock);
750 gfs2_log_unlock(sdp); 752 gfs2_log_unlock(sdp);
751 trace_gfs2_log_flush(sdp, 0); 753 trace_gfs2_log_flush(sdp, 0);
752 up_write(&sdp->sd_log_flush_lock); 754 up_write(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index bf33f822058d..51d27f00ebb4 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -51,8 +51,10 @@ static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh)
51 /* If this buffer is in the AIL and it has already been written 51 /* If this buffer is in the AIL and it has already been written
52 * to in-place disk block, remove it from the AIL. 52 * to in-place disk block, remove it from the AIL.
53 */ 53 */
54 spin_lock(&sdp->sd_ail_lock);
54 if (bd->bd_ail) 55 if (bd->bd_ail)
55 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list); 56 list_move(&bd->bd_ail_st_list, &bd->bd_ail->ai_ail2_list);
57 spin_unlock(&sdp->sd_ail_lock);
56 get_bh(bh); 58 get_bh(bh);
57 atomic_inc(&sdp->sd_log_pinned); 59 atomic_inc(&sdp->sd_log_pinned);
58 trace_gfs2_pin(bd, 1); 60 trace_gfs2_pin(bd, 1);
@@ -80,7 +82,7 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
80 mark_buffer_dirty(bh); 82 mark_buffer_dirty(bh);
81 clear_buffer_pinned(bh); 83 clear_buffer_pinned(bh);
82 84
83 gfs2_log_lock(sdp); 85 spin_lock(&sdp->sd_ail_lock);
84 if (bd->bd_ail) { 86 if (bd->bd_ail) {
85 list_del(&bd->bd_ail_st_list); 87 list_del(&bd->bd_ail_st_list);
86 brelse(bh); 88 brelse(bh);
@@ -91,9 +93,11 @@ static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh,
91 } 93 }
92 bd->bd_ail = ai; 94 bd->bd_ail = ai;
93 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list); 95 list_add(&bd->bd_ail_st_list, &ai->ai_ail1_list);
94 clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); 96 spin_unlock(&sdp->sd_ail_lock);
97
98 if (test_and_clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags))
99 gfs2_glock_schedule_for_reclaim(bd->bd_gl);
95 trace_gfs2_pin(bd, 0); 100 trace_gfs2_pin(bd, 0);
96 gfs2_log_unlock(sdp);
97 unlock_buffer(bh); 101 unlock_buffer(bh);
98 atomic_dec(&sdp->sd_log_pinned); 102 atomic_dec(&sdp->sd_log_pinned);
99} 103}
@@ -200,7 +204,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
200 } 204 }
201 205
202 gfs2_log_unlock(sdp); 206 gfs2_log_unlock(sdp);
203 submit_bh(WRITE_SYNC_PLUG, bh); 207 submit_bh(WRITE_SYNC, bh);
204 gfs2_log_lock(sdp); 208 gfs2_log_lock(sdp);
205 209
206 n = 0; 210 n = 0;
@@ -210,7 +214,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
210 gfs2_log_unlock(sdp); 214 gfs2_log_unlock(sdp);
211 lock_buffer(bd2->bd_bh); 215 lock_buffer(bd2->bd_bh);
212 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); 216 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
213 submit_bh(WRITE_SYNC_PLUG, bh); 217 submit_bh(WRITE_SYNC, bh);
214 gfs2_log_lock(sdp); 218 gfs2_log_lock(sdp);
215 if (++n >= num) 219 if (++n >= num)
216 break; 220 break;
@@ -352,7 +356,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
352 sdp->sd_log_num_revoke--; 356 sdp->sd_log_num_revoke--;
353 357
354 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { 358 if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
355 submit_bh(WRITE_SYNC_PLUG, bh); 359 submit_bh(WRITE_SYNC, bh);
356 360
357 bh = gfs2_log_get_buf(sdp); 361 bh = gfs2_log_get_buf(sdp);
358 mh = (struct gfs2_meta_header *)bh->b_data; 362 mh = (struct gfs2_meta_header *)bh->b_data;
@@ -369,7 +373,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
369 } 373 }
370 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); 374 gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
371 375
372 submit_bh(WRITE_SYNC_PLUG, bh); 376 submit_bh(WRITE_SYNC, bh);
373} 377}
374 378
375static void revoke_lo_before_scan(struct gfs2_jdesc *jd, 379static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -571,7 +575,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
571 ptr = bh_log_ptr(bh); 575 ptr = bh_log_ptr(bh);
572 576
573 get_bh(bh); 577 get_bh(bh);
574 submit_bh(WRITE_SYNC_PLUG, bh); 578 submit_bh(WRITE_SYNC, bh);
575 gfs2_log_lock(sdp); 579 gfs2_log_lock(sdp);
576 while(!list_empty(list)) { 580 while(!list_empty(list)) {
577 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); 581 bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -597,7 +601,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
597 } else { 601 } else {
598 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); 602 bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
599 } 603 }
600 submit_bh(WRITE_SYNC_PLUG, bh1); 604 submit_bh(WRITE_SYNC, bh1);
601 gfs2_log_lock(sdp); 605 gfs2_log_lock(sdp);
602 ptr += 2; 606 ptr += 2;
603 } 607 }
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index ebef7ab6e17e..888a5f5a1a58 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -14,6 +14,8 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/gfs2_ondisk.h> 16#include <linux/gfs2_ondisk.h>
17#include <linux/rcupdate.h>
18#include <linux/rculist_bl.h>
17#include <asm/atomic.h> 19#include <asm/atomic.h>
18 20
19#include "gfs2.h" 21#include "gfs2.h"
@@ -45,7 +47,7 @@ static void gfs2_init_glock_once(void *foo)
45{ 47{
46 struct gfs2_glock *gl = foo; 48 struct gfs2_glock *gl = foo;
47 49
48 INIT_HLIST_NODE(&gl->gl_list); 50 INIT_HLIST_BL_NODE(&gl->gl_list);
49 spin_lock_init(&gl->gl_spin); 51 spin_lock_init(&gl->gl_spin);
50 INIT_LIST_HEAD(&gl->gl_holders); 52 INIT_LIST_HEAD(&gl->gl_holders);
51 INIT_LIST_HEAD(&gl->gl_lru); 53 INIT_LIST_HEAD(&gl->gl_lru);
@@ -59,14 +61,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
59 struct address_space *mapping = (struct address_space *)(gl + 1); 61 struct address_space *mapping = (struct address_space *)(gl + 1);
60 62
61 gfs2_init_glock_once(gl); 63 gfs2_init_glock_once(gl);
62 memset(mapping, 0, sizeof(*mapping)); 64 address_space_init_once(mapping);
63 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
64 spin_lock_init(&mapping->tree_lock);
65 spin_lock_init(&mapping->i_mmap_lock);
66 INIT_LIST_HEAD(&mapping->private_list);
67 spin_lock_init(&mapping->private_lock);
68 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
69 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
70} 65}
71 66
72/** 67/**
@@ -144,7 +139,7 @@ static int __init init_gfs2_fs(void)
144 139
145 error = -ENOMEM; 140 error = -ENOMEM;
146 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 141 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
147 WQ_MEM_RECLAIM | WQ_FREEZEABLE, 0); 142 WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
148 if (!gfs_recovery_wq) 143 if (!gfs_recovery_wq)
149 goto fail_wq; 144 goto fail_wq;
150 145
@@ -198,6 +193,8 @@ static void __exit exit_gfs2_fs(void)
198 unregister_filesystem(&gfs2meta_fs_type); 193 unregister_filesystem(&gfs2meta_fs_type);
199 destroy_workqueue(gfs_recovery_wq); 194 destroy_workqueue(gfs_recovery_wq);
200 195
196 rcu_barrier();
197
201 kmem_cache_destroy(gfs2_quotad_cachep); 198 kmem_cache_destroy(gfs2_quotad_cachep);
202 kmem_cache_destroy(gfs2_rgrpd_cachep); 199 kmem_cache_destroy(gfs2_rgrpd_cachep);
203 kmem_cache_destroy(gfs2_bufdata_cachep); 200 kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 939739c7b3f9..675349b5a133 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37 struct buffer_head *bh, *head; 37 struct buffer_head *bh, *head;
38 int nr_underway = 0; 38 int nr_underway = 0;
39 int write_op = REQ_META | 39 int write_op = REQ_META |
40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC_PLUG : WRITE); 40 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
41 41
42 BUG_ON(!PageLocked(page)); 42 BUG_ON(!PageLocked(page));
43 BUG_ON(!page_has_buffers(page)); 43 BUG_ON(!page_has_buffers(page));
@@ -94,7 +94,6 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
94const struct address_space_operations gfs2_meta_aops = { 94const struct address_space_operations gfs2_meta_aops = {
95 .writepage = gfs2_aspace_writepage, 95 .writepage = gfs2_aspace_writepage,
96 .releasepage = gfs2_releasepage, 96 .releasepage = gfs2_releasepage,
97 .sync_page = block_sync_page,
98}; 97};
99 98
100/** 99/**
@@ -326,6 +325,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
326 brelse(bh); 325 brelse(bh);
327 } 326 }
328 if (bd) { 327 if (bd) {
328 spin_lock(&sdp->sd_ail_lock);
329 if (bd->bd_ail) { 329 if (bd->bd_ail) {
330 gfs2_remove_from_ail(bd); 330 gfs2_remove_from_ail(bd);
331 bh->b_private = NULL; 331 bh->b_private = NULL;
@@ -333,6 +333,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
333 bd->bd_blkno = bh->b_blocknr; 333 bd->bd_blkno = bh->b_blocknr;
334 gfs2_trans_add_revoke(sdp, bd); 334 gfs2_trans_add_revoke(sdp, bd);
335 } 335 }
336 spin_unlock(&sdp->sd_ail_lock);
336 } 337 }
337 clear_buffer_dirty(bh); 338 clear_buffer_dirty(bh);
338 clear_buffer_uptodate(bh); 339 clear_buffer_uptodate(bh);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 777927ce6f79..42ef24355afb 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -99,6 +99,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
99 99
100 init_waitqueue_head(&sdp->sd_log_waitq); 100 init_waitqueue_head(&sdp->sd_log_waitq);
101 init_waitqueue_head(&sdp->sd_logd_waitq); 101 init_waitqueue_head(&sdp->sd_logd_waitq);
102 spin_lock_init(&sdp->sd_ail_lock);
102 INIT_LIST_HEAD(&sdp->sd_ail1_list); 103 INIT_LIST_HEAD(&sdp->sd_ail1_list);
103 INIT_LIST_HEAD(&sdp->sd_ail2_list); 104 INIT_LIST_HEAD(&sdp->sd_ail2_list);
104 105
@@ -928,17 +929,9 @@ static const match_table_t nolock_tokens = {
928 { Opt_err, NULL }, 929 { Opt_err, NULL },
929}; 930};
930 931
931static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
932{
933 struct gfs2_sbd *sdp = gl->gl_sbd;
934 kmem_cache_free(cachep, gl);
935 if (atomic_dec_and_test(&sdp->sd_glock_disposal))
936 wake_up(&sdp->sd_glock_wait);
937}
938
939static const struct lm_lockops nolock_ops = { 932static const struct lm_lockops nolock_ops = {
940 .lm_proto_name = "lock_nolock", 933 .lm_proto_name = "lock_nolock",
941 .lm_put_lock = nolock_put_lock, 934 .lm_put_lock = gfs2_glock_free,
942 .lm_tokens = &nolock_tokens, 935 .lm_tokens = &nolock_tokens,
943}; 936};
944 937
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index d8b26ac2e20b..09e436a50723 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1026,9 +1026,9 @@ static void gfs2_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
1026 1026
1027/** 1027/**
1028 * gfs2_permission - 1028 * gfs2_permission -
1029 * @inode: 1029 * @inode: The inode
1030 * @mask: 1030 * @mask: The mask to be tested
1031 * @nd: passed from Linux VFS, ignored by us 1031 * @flags: Indicates whether this is an RCU path walk or not
1032 * 1032 *
1033 * This may be called from the VFS directly, or from within GFS2 with the 1033 * This may be called from the VFS directly, or from within GFS2 with the
1034 * inode locked, so we look to see if the glock is already locked and only 1034 * inode locked, so we look to see if the glock is already locked and only
@@ -1044,11 +1044,11 @@ int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
1044 int error; 1044 int error;
1045 int unlock = 0; 1045 int unlock = 0;
1046 1046
1047 if (flags & IPERM_FLAG_RCU)
1048 return -ECHILD;
1049 1047
1050 ip = GFS2_I(inode); 1048 ip = GFS2_I(inode);
1051 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { 1049 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1050 if (flags & IPERM_FLAG_RCU)
1051 return -ECHILD;
1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 1052 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
1053 if (error) 1053 if (error)
1054 return error; 1054 return error;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a689901963de..e23d9864c418 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -834,6 +834,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
834 goto out_end_trans; 834 goto out_end_trans;
835 835
836 do_qc(qd, -qd->qd_change_sync); 836 do_qc(qd, -qd->qd_change_sync);
837 set_bit(QDF_REFRESH, &qd->qd_flags);
837 } 838 }
838 839
839 error = 0; 840 error = 0;
@@ -929,6 +930,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
929{ 930{
930 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 931 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
931 struct gfs2_alloc *al = ip->i_alloc; 932 struct gfs2_alloc *al = ip->i_alloc;
933 struct gfs2_quota_data *qd;
932 unsigned int x; 934 unsigned int x;
933 int error = 0; 935 int error = 0;
934 936
@@ -942,7 +944,11 @@ int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid)
942 sort_qd, NULL); 944 sort_qd, NULL);
943 945
944 for (x = 0; x < al->al_qd_num; x++) { 946 for (x = 0; x < al->al_qd_num; x++) {
945 error = do_glock(al->al_qd[x], NO_FORCE, &al->al_qd_ghs[x]); 947 int force = NO_FORCE;
948 qd = al->al_qd[x];
949 if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
950 force = FORCE;
951 error = do_glock(qd, force, &al->al_qd_ghs[x]);
946 if (error) 952 if (error)
947 break; 953 break;
948 } 954 }
@@ -1587,6 +1593,8 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1587 1593
1588 offset = qd2offset(qd); 1594 offset = qd2offset(qd);
1589 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); 1595 alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota));
1596 if (gfs2_is_stuffed(ip))
1597 alloc_required = 1;
1590 if (alloc_required) { 1598 if (alloc_required) {
1591 al = gfs2_alloc_get(ip); 1599 al = gfs2_alloc_get(ip);
1592 if (al == NULL) 1600 if (al == NULL)
@@ -1600,7 +1608,9 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1600 blocks += gfs2_rg_blocks(al); 1608 blocks += gfs2_rg_blocks(al);
1601 } 1609 }
1602 1610
1603 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1611 /* Some quotas span block boundaries and can update two blocks,
1612 adding an extra block to the transaction to handle such quotas */
1613 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0);
1604 if (error) 1614 if (error)
1605 goto out_release; 1615 goto out_release;
1606 1616
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7293ea27020c..cf930cd9664a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1602,7 +1602,7 @@ rgrp_error:
1602 * 1602 *
1603 */ 1603 */
1604 1604
1605void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) 1605void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1606{ 1606{
1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1607 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1608 struct gfs2_rgrpd *rgd; 1608 struct gfs2_rgrpd *rgd;
@@ -1617,7 +1617,21 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1617 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1618 1618
1619 gfs2_trans_add_rg(rgd); 1619 gfs2_trans_add_rg(rgd);
1620}
1620 1621
1622/**
1623 * gfs2_free_data - free a contiguous run of data block(s)
1624 * @ip: the inode these blocks are being freed from
1625 * @bstart: first block of a run of contiguous blocks
1626 * @blen: the length of the block run
1627 *
1628 */
1629
1630void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1631{
1632 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1633
1634 __gfs2_free_data(ip, bstart, blen);
1621 gfs2_statfs_change(sdp, 0, +blen, 0); 1635 gfs2_statfs_change(sdp, 0, +blen, 0);
1622 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1636 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1623} 1637}
@@ -1630,7 +1644,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1630 * 1644 *
1631 */ 1645 */
1632 1646
1633void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) 1647void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1634{ 1648{
1635 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1649 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1636 struct gfs2_rgrpd *rgd; 1650 struct gfs2_rgrpd *rgd;
@@ -1645,10 +1659,24 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1645 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); 1659 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1646 1660
1647 gfs2_trans_add_rg(rgd); 1661 gfs2_trans_add_rg(rgd);
1662 gfs2_meta_wipe(ip, bstart, blen);
1663}
1648 1664
1665/**
1666 * gfs2_free_meta - free a contiguous run of metadata block(s)
1667 * @ip: the inode these blocks are being freed from
1668 * @bstart: first block of a run of contiguous blocks
1669 * @blen: the length of the block run
1670 *
1671 */
1672
1673void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1674{
1675 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1676
1677 __gfs2_free_meta(ip, bstart, blen);
1649 gfs2_statfs_change(sdp, 0, +blen, 0); 1678 gfs2_statfs_change(sdp, 0, +blen, 0);
1650 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid); 1679 gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
1651 gfs2_meta_wipe(ip, bstart, blen);
1652} 1680}
1653 1681
1654void gfs2_unlink_di(struct inode *inode) 1682void gfs2_unlink_di(struct inode *inode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 50c2bb04369c..a80e3034ac47 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -52,7 +52,9 @@ extern int gfs2_ri_update(struct gfs2_inode *ip);
52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); 52extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); 53extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation);
54 54
55extern void __gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
55extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 56extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void __gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
56extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 58extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
57extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); 59extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
58extern void gfs2_unlink_di(struct inode *inode); 60extern void gfs2_unlink_di(struct inode *inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ec73ed70bae1..a4e23d68a398 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -657,7 +657,7 @@ out:
657 * @sdp: the file system 657 * @sdp: the file system
658 * 658 *
659 * This function flushes data and meta data for all machines by 659 * This function flushes data and meta data for all machines by
660 * aquiring the transaction log exclusively. All journals are 660 * acquiring the transaction log exclusively. All journals are
661 * ensured to be in a clean state as well. 661 * ensured to be in a clean state as well.
662 * 662 *
663 * Returns: errno 663 * Returns: errno
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
238} 238}
239 239
240/* 240/*
241 * hfs_unlink() 241 * hfs_remove()
242 * 242 *
243 * This is the unlink() entry in the inode_operations structure for 243 * This serves as both unlink() and rmdir() in the inode_operations
244 * regular HFS directories. The purpose is to delete an existing 244 * structure for regular HFS directories. The purpose is to delete
245 * file, given the inode for the parent directory and the name 245 * an existing child, given the inode for the parent directory and
246 * (and its length) of the existing file. 246 * the name (and its length) of the existing directory.
247 */
248static int hfs_unlink(struct inode *dir, struct dentry *dentry)
249{
250 struct inode *inode;
251 int res;
252
253 inode = dentry->d_inode;
254 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
255 if (res)
256 return res;
257
258 drop_nlink(inode);
259 hfs_delete_inode(inode);
260 inode->i_ctime = CURRENT_TIME_SEC;
261 mark_inode_dirty(inode);
262
263 return res;
264}
265
266/*
267 * hfs_rmdir()
268 * 247 *
269 * This is the rmdir() entry in the inode_operations structure for 248 * HFS does not have hardlinks, so both rmdir and unlink set the
270 * regular HFS directories. The purpose is to delete an existing 249 * link count to 0. The only difference is the emptiness check.
271 * directory, given the inode for the parent directory and the name
272 * (and its length) of the existing directory.
273 */ 250 */
274static int hfs_rmdir(struct inode *dir, struct dentry *dentry) 251static int hfs_remove(struct inode *dir, struct dentry *dentry)
275{ 252{
276 struct inode *inode; 253 struct inode *inode = dentry->d_inode;
277 int res; 254 int res;
278 255
279 inode = dentry->d_inode; 256 if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
280 if (inode->i_size != 2)
281 return -ENOTEMPTY; 257 return -ENOTEMPTY;
282 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); 258 res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
283 if (res) 259 if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
307 283
308 /* Unlink destination if it already exists */ 284 /* Unlink destination if it already exists */
309 if (new_dentry->d_inode) { 285 if (new_dentry->d_inode) {
310 res = hfs_unlink(new_dir, new_dentry); 286 res = hfs_remove(new_dir, new_dentry);
311 if (res) 287 if (res)
312 return res; 288 return res;
313 } 289 }
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
332const struct inode_operations hfs_dir_inode_operations = { 308const struct inode_operations hfs_dir_inode_operations = {
333 .create = hfs_create, 309 .create = hfs_create,
334 .lookup = hfs_lookup, 310 .lookup = hfs_lookup,
335 .unlink = hfs_unlink, 311 .unlink = hfs_remove,
336 .mkdir = hfs_mkdir, 312 .mkdir = hfs_mkdir,
337 .rmdir = hfs_rmdir, 313 .rmdir = hfs_remove,
338 .rename = hfs_rename, 314 .rename = hfs_rename,
339 .setattr = hfs_inode_setattr, 315 .setattr = hfs_inode_setattr,
340}; 316};
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index dffb4e996643..fff16c968e67 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -150,7 +150,6 @@ static int hfs_writepages(struct address_space *mapping,
150const struct address_space_operations hfs_btree_aops = { 150const struct address_space_operations hfs_btree_aops = {
151 .readpage = hfs_readpage, 151 .readpage = hfs_readpage,
152 .writepage = hfs_writepage, 152 .writepage = hfs_writepage,
153 .sync_page = block_sync_page,
154 .write_begin = hfs_write_begin, 153 .write_begin = hfs_write_begin,
155 .write_end = generic_write_end, 154 .write_end = generic_write_end,
156 .bmap = hfs_bmap, 155 .bmap = hfs_bmap,
@@ -160,7 +159,6 @@ const struct address_space_operations hfs_btree_aops = {
160const struct address_space_operations hfs_aops = { 159const struct address_space_operations hfs_aops = {
161 .readpage = hfs_readpage, 160 .readpage = hfs_readpage,
162 .writepage = hfs_writepage, 161 .writepage = hfs_writepage,
163 .sync_page = block_sync_page,
164 .write_begin = hfs_write_begin, 162 .write_begin = hfs_write_begin,
165 .write_end = generic_write_end, 163 .write_end = generic_write_end,
166 .bmap = hfs_bmap, 164 .bmap = hfs_bmap,
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 52a0bcaa7b6d..b1991a2a08e0 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -397,8 +397,8 @@ int hfsplus_file_extend(struct inode *inode)
397 u32 start, len, goal; 397 u32 start, len, goal;
398 int res; 398 int res;
399 399
400 if (sbi->total_blocks - sbi->free_blocks + 8 > 400 if (sbi->alloc_file->i_size * 8 <
401 sbi->alloc_file->i_size * 8) { 401 sbi->total_blocks - sbi->free_blocks + 8) {
402 /* extend alloc file */ 402 /* extend alloc file */
403 printk(KERN_ERR "hfs: extend alloc file! " 403 printk(KERN_ERR "hfs: extend alloc file! "
404 "(%llu,%u,%u)\n", 404 "(%llu,%u,%u)\n",
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index a8df651747f0..b248a6cfcad9 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -146,7 +146,6 @@ static int hfsplus_writepages(struct address_space *mapping,
146const struct address_space_operations hfsplus_btree_aops = { 146const struct address_space_operations hfsplus_btree_aops = {
147 .readpage = hfsplus_readpage, 147 .readpage = hfsplus_readpage,
148 .writepage = hfsplus_writepage, 148 .writepage = hfsplus_writepage,
149 .sync_page = block_sync_page,
150 .write_begin = hfsplus_write_begin, 149 .write_begin = hfsplus_write_begin,
151 .write_end = generic_write_end, 150 .write_end = generic_write_end,
152 .bmap = hfsplus_bmap, 151 .bmap = hfsplus_bmap,
@@ -156,7 +155,6 @@ const struct address_space_operations hfsplus_btree_aops = {
156const struct address_space_operations hfsplus_aops = { 155const struct address_space_operations hfsplus_aops = {
157 .readpage = hfsplus_readpage, 156 .readpage = hfsplus_readpage,
158 .writepage = hfsplus_writepage, 157 .writepage = hfsplus_writepage,
159 .sync_page = block_sync_page,
160 .write_begin = hfsplus_write_begin, 158 .write_begin = hfsplus_write_begin,
161 .write_end = generic_write_end, 159 .write_end = generic_write_end,
162 .bmap = hfsplus_bmap, 160 .bmap = hfsplus_bmap,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 508ce662ce12..fbaa6690c8e0 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -47,7 +47,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
47 if (err) 47 if (err)
48 goto out; 48 goto out;
49 49
50 if (!is_owner_or_cap(inode)) { 50 if (!inode_owner_or_capable(inode)) {
51 err = -EACCES; 51 err = -EACCES;
52 goto out_drop_write; 52 goto out_drop_write;
53 } 53 }
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index d66ad113b1cc..40ad88c12c64 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -134,7 +134,7 @@ int hfs_part_find(struct super_block *sb,
134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK, 134 res = hfsplus_submit_bio(sb->s_bdev, *part_start + HFS_PMAP_BLK,
135 data, READ); 135 data, READ);
136 if (res) 136 if (res)
137 return res; 137 goto out;
138 138
139 switch (be16_to_cpu(*((__be16 *)data))) { 139 switch (be16_to_cpu(*((__be16 *)data))) {
140 case HFS_OLD_PMAP_MAGIC: 140 case HFS_OLD_PMAP_MAGIC:
@@ -147,7 +147,7 @@ int hfs_part_find(struct super_block *sb,
147 res = -ENOENT; 147 res = -ENOENT;
148 break; 148 break;
149 } 149 }
150 150out:
151 kfree(data); 151 kfree(data);
152 return res; 152 return res;
153} 153}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9a3b4795f43c..b49b55584c84 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -338,20 +338,22 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
338 struct inode *root, *inode; 338 struct inode *root, *inode;
339 struct qstr str; 339 struct qstr str;
340 struct nls_table *nls = NULL; 340 struct nls_table *nls = NULL;
341 int err = -EINVAL; 341 int err;
342 342
343 err = -EINVAL;
343 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 344 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
344 if (!sbi) 345 if (!sbi)
345 return -ENOMEM; 346 goto out;
346 347
347 sb->s_fs_info = sbi; 348 sb->s_fs_info = sbi;
348 mutex_init(&sbi->alloc_mutex); 349 mutex_init(&sbi->alloc_mutex);
349 mutex_init(&sbi->vh_mutex); 350 mutex_init(&sbi->vh_mutex);
350 hfsplus_fill_defaults(sbi); 351 hfsplus_fill_defaults(sbi);
352
353 err = -EINVAL;
351 if (!hfsplus_parse_options(data, sbi)) { 354 if (!hfsplus_parse_options(data, sbi)) {
352 printk(KERN_ERR "hfs: unable to parse mount options\n"); 355 printk(KERN_ERR "hfs: unable to parse mount options\n");
353 err = -EINVAL; 356 goto out_unload_nls;
354 goto cleanup;
355 } 357 }
356 358
357 /* temporarily use utf8 to correctly find the hidden dir below */ 359 /* temporarily use utf8 to correctly find the hidden dir below */
@@ -359,16 +361,14 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
359 sbi->nls = load_nls("utf8"); 361 sbi->nls = load_nls("utf8");
360 if (!sbi->nls) { 362 if (!sbi->nls) {
361 printk(KERN_ERR "hfs: unable to load nls for utf8\n"); 363 printk(KERN_ERR "hfs: unable to load nls for utf8\n");
362 err = -EINVAL; 364 goto out_unload_nls;
363 goto cleanup;
364 } 365 }
365 366
366 /* Grab the volume header */ 367 /* Grab the volume header */
367 if (hfsplus_read_wrapper(sb)) { 368 if (hfsplus_read_wrapper(sb)) {
368 if (!silent) 369 if (!silent)
369 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n"); 370 printk(KERN_WARNING "hfs: unable to find HFS+ superblock\n");
370 err = -EINVAL; 371 goto out_unload_nls;
371 goto cleanup;
372 } 372 }
373 vhdr = sbi->s_vhdr; 373 vhdr = sbi->s_vhdr;
374 374
@@ -377,7 +377,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION || 377 if (be16_to_cpu(vhdr->version) < HFSPLUS_MIN_VERSION ||
378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) { 378 be16_to_cpu(vhdr->version) > HFSPLUS_CURRENT_VERSION) {
379 printk(KERN_ERR "hfs: wrong filesystem version\n"); 379 printk(KERN_ERR "hfs: wrong filesystem version\n");
380 goto cleanup; 380 goto out_free_vhdr;
381 } 381 }
382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks); 382 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks); 383 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
@@ -421,19 +421,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 421 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
422 if (!sbi->ext_tree) { 422 if (!sbi->ext_tree) {
423 printk(KERN_ERR "hfs: failed to load extents file\n"); 423 printk(KERN_ERR "hfs: failed to load extents file\n");
424 goto cleanup; 424 goto out_free_vhdr;
425 } 425 }
426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 426 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
427 if (!sbi->cat_tree) { 427 if (!sbi->cat_tree) {
428 printk(KERN_ERR "hfs: failed to load catalog file\n"); 428 printk(KERN_ERR "hfs: failed to load catalog file\n");
429 goto cleanup; 429 goto out_close_ext_tree;
430 } 430 }
431 431
432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID); 432 inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
433 if (IS_ERR(inode)) { 433 if (IS_ERR(inode)) {
434 printk(KERN_ERR "hfs: failed to load allocation file\n"); 434 printk(KERN_ERR "hfs: failed to load allocation file\n");
435 err = PTR_ERR(inode); 435 err = PTR_ERR(inode);
436 goto cleanup; 436 goto out_close_cat_tree;
437 } 437 }
438 sbi->alloc_file = inode; 438 sbi->alloc_file = inode;
439 439
@@ -442,14 +442,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
442 if (IS_ERR(root)) { 442 if (IS_ERR(root)) {
443 printk(KERN_ERR "hfs: failed to load root directory\n"); 443 printk(KERN_ERR "hfs: failed to load root directory\n");
444 err = PTR_ERR(root); 444 err = PTR_ERR(root);
445 goto cleanup; 445 goto out_put_alloc_file;
446 }
447 sb->s_d_op = &hfsplus_dentry_operations;
448 sb->s_root = d_alloc_root(root);
449 if (!sb->s_root) {
450 iput(root);
451 err = -ENOMEM;
452 goto cleanup;
453 } 446 }
454 447
455 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 448 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
@@ -459,46 +452,69 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
459 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 452 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
460 hfs_find_exit(&fd); 453 hfs_find_exit(&fd);
461 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) 454 if (entry.type != cpu_to_be16(HFSPLUS_FOLDER))
462 goto cleanup; 455 goto out_put_root;
463 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); 456 inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id));
464 if (IS_ERR(inode)) { 457 if (IS_ERR(inode)) {
465 err = PTR_ERR(inode); 458 err = PTR_ERR(inode);
466 goto cleanup; 459 goto out_put_root;
467 } 460 }
468 sbi->hidden_dir = inode; 461 sbi->hidden_dir = inode;
469 } else 462 } else
470 hfs_find_exit(&fd); 463 hfs_find_exit(&fd);
471 464
472 if (sb->s_flags & MS_RDONLY) 465 if (!(sb->s_flags & MS_RDONLY)) {
473 goto out; 466 /*
467 * H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused
468 * all three are registered with Apple for our use
469 */
470 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION);
471 vhdr->modify_date = hfsp_now2mt();
472 be32_add_cpu(&vhdr->write_count, 1);
473 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
474 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
475 hfsplus_sync_fs(sb, 1);
474 476
475 /* H+LX == hfsplusutils, H+Lx == this driver, H+lx is unused 477 if (!sbi->hidden_dir) {
476 * all three are registered with Apple for our use 478 mutex_lock(&sbi->vh_mutex);
477 */ 479 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
478 vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); 480 hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
479 vhdr->modify_date = hfsp_now2mt(); 481 sbi->hidden_dir);
480 be32_add_cpu(&vhdr->write_count, 1); 482 mutex_unlock(&sbi->vh_mutex);
481 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 483
482 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 484 hfsplus_mark_inode_dirty(sbi->hidden_dir,
483 hfsplus_sync_fs(sb, 1); 485 HFSPLUS_I_CAT_DIRTY);
484 486 }
485 if (!sbi->hidden_dir) {
486 mutex_lock(&sbi->vh_mutex);
487 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
488 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
489 &str, sbi->hidden_dir);
490 mutex_unlock(&sbi->vh_mutex);
491
492 hfsplus_mark_inode_dirty(sbi->hidden_dir, HFSPLUS_I_CAT_DIRTY);
493 } 487 }
494out: 488
489 sb->s_d_op = &hfsplus_dentry_operations;
490 sb->s_root = d_alloc_root(root);
491 if (!sb->s_root) {
492 err = -ENOMEM;
493 goto out_put_hidden_dir;
494 }
495
495 unload_nls(sbi->nls); 496 unload_nls(sbi->nls);
496 sbi->nls = nls; 497 sbi->nls = nls;
497 return 0; 498 return 0;
498 499
499cleanup: 500out_put_hidden_dir:
500 hfsplus_put_super(sb); 501 iput(sbi->hidden_dir);
502out_put_root:
503 iput(root);
504out_put_alloc_file:
505 iput(sbi->alloc_file);
506out_close_cat_tree:
507 hfs_btree_close(sbi->cat_tree);
508out_close_ext_tree:
509 hfs_btree_close(sbi->ext_tree);
510out_free_vhdr:
511 kfree(sbi->s_vhdr);
512 kfree(sbi->s_backup_vhdr);
513out_unload_nls:
514 unload_nls(sbi->nls);
501 unload_nls(nls); 515 unload_nls(nls);
516 kfree(sbi);
517out:
502 return err; 518 return err;
503} 519}
504 520
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 196231794f64..3031d81f5f0f 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -167,7 +167,7 @@ reread:
167 break; 167 break;
168 case cpu_to_be16(HFSP_WRAP_MAGIC): 168 case cpu_to_be16(HFSP_WRAP_MAGIC):
169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd)) 169 if (!hfsplus_read_mdb(sbi->s_vhdr, &wd))
170 goto out; 170 goto out_free_backup_vhdr;
171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT; 171 wd.ablk_size >>= HFSPLUS_SECTOR_SHIFT;
172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size; 172 part_start += wd.ablk_start + wd.embed_start * wd.ablk_size;
173 part_size = wd.embed_count * wd.ablk_size; 173 part_size = wd.embed_count * wd.ablk_size;
@@ -179,7 +179,7 @@ reread:
179 * (should do this only for cdrom/loop though) 179 * (should do this only for cdrom/loop though)
180 */ 180 */
181 if (hfs_part_find(sb, &part_start, &part_size)) 181 if (hfs_part_find(sb, &part_start, &part_size))
182 goto out; 182 goto out_free_backup_vhdr;
183 goto reread; 183 goto reread;
184 } 184 }
185 185
diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig
index 63b6f5632318..0c39dc3ef7d7 100644
--- a/fs/hpfs/Kconfig
+++ b/fs/hpfs/Kconfig
@@ -1,7 +1,7 @@
1config HPFS_FS 1config HPFS_FS
2 tristate "OS/2 HPFS file system support" 2 tristate "OS/2 HPFS file system support"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # nontrivial to fix 4 depends on BROKEN || !PREEMPT
5 help 5 help
6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS 6 OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS
7 is the file system used for organizing files on OS/2 hard disk 7 is the file system used for organizing files on OS/2 hard disk
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index d32f63a569f7..b3d7c0ddb609 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -6,16 +6,15 @@
6 * directory VFS functions 6 * directory VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
13static int hpfs_dir_release(struct inode *inode, struct file *filp) 12static int hpfs_dir_release(struct inode *inode, struct file *filp)
14{ 13{
15 lock_kernel(); 14 hpfs_lock(inode->i_sb);
16 hpfs_del_pos(inode, &filp->f_pos); 15 hpfs_del_pos(inode, &filp->f_pos);
17 /*hpfs_write_if_changed(inode);*/ 16 /*hpfs_write_if_changed(inode);*/
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -30,7 +29,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
30 struct hpfs_inode_info *hpfs_inode = hpfs_i(i); 29 struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
31 struct super_block *s = i->i_sb; 30 struct super_block *s = i->i_sb;
32 31
33 lock_kernel(); 32 hpfs_lock(s);
34 33
35 /*printk("dir lseek\n");*/ 34 /*printk("dir lseek\n");*/
36 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; 35 if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok;
@@ -43,12 +42,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
43 } 42 }
44 mutex_unlock(&i->i_mutex); 43 mutex_unlock(&i->i_mutex);
45ok: 44ok:
46 unlock_kernel(); 45 hpfs_unlock(s);
47 return filp->f_pos = new_off; 46 return filp->f_pos = new_off;
48fail: 47fail:
49 mutex_unlock(&i->i_mutex); 48 mutex_unlock(&i->i_mutex);
50 /*printk("illegal lseek: %016llx\n", new_off);*/ 49 /*printk("illegal lseek: %016llx\n", new_off);*/
51 unlock_kernel(); 50 hpfs_unlock(s);
52 return -ESPIPE; 51 return -ESPIPE;
53} 52}
54 53
@@ -64,7 +63,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
64 int c1, c2 = 0; 63 int c1, c2 = 0;
65 int ret = 0; 64 int ret = 0;
66 65
67 lock_kernel(); 66 hpfs_lock(inode->i_sb);
68 67
69 if (hpfs_sb(inode->i_sb)->sb_chk) { 68 if (hpfs_sb(inode->i_sb)->sb_chk) {
70 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { 69 if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) {
@@ -167,7 +166,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
167 hpfs_brelse4(&qbh); 166 hpfs_brelse4(&qbh);
168 } 167 }
169out: 168out:
170 unlock_kernel(); 169 hpfs_unlock(inode->i_sb);
171 return ret; 170 return ret;
172} 171}
173 172
@@ -197,10 +196,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
197 struct inode *result = NULL; 196 struct inode *result = NULL;
198 struct hpfs_inode_info *hpfs_result; 197 struct hpfs_inode_info *hpfs_result;
199 198
200 lock_kernel(); 199 hpfs_lock(dir->i_sb);
201 if ((err = hpfs_chk_name(name, &len))) { 200 if ((err = hpfs_chk_name(name, &len))) {
202 if (err == -ENAMETOOLONG) { 201 if (err == -ENAMETOOLONG) {
203 unlock_kernel(); 202 hpfs_unlock(dir->i_sb);
204 return ERR_PTR(-ENAMETOOLONG); 203 return ERR_PTR(-ENAMETOOLONG);
205 } 204 }
206 goto end_add; 205 goto end_add;
@@ -298,7 +297,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
298 297
299 end: 298 end:
300 end_add: 299 end_add:
301 unlock_kernel(); 300 hpfs_unlock(dir->i_sb);
302 d_add(dentry, result); 301 d_add(dentry, result);
303 return NULL; 302 return NULL;
304 303
@@ -311,7 +310,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, struct name
311 310
312 /*bail:*/ 311 /*bail:*/
313 312
314 unlock_kernel(); 313 hpfs_unlock(dir->i_sb);
315 return ERR_PTR(-ENOENT); 314 return ERR_PTR(-ENOENT);
316} 315}
317 316
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index c0340887c7ea..9b9eb6933e43 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -6,16 +6,15 @@
6 * file VFS functions 6 * file VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12#define BLOCKS(size) (((size) + 511) >> 9) 11#define BLOCKS(size) (((size) + 511) >> 9)
13 12
14static int hpfs_file_release(struct inode *inode, struct file *file) 13static int hpfs_file_release(struct inode *inode, struct file *file)
15{ 14{
16 lock_kernel(); 15 hpfs_lock(inode->i_sb);
17 hpfs_write_if_changed(inode); 16 hpfs_write_if_changed(inode);
18 unlock_kernel(); 17 hpfs_unlock(inode->i_sb);
19 return 0; 18 return 0;
20} 19}
21 20
@@ -49,14 +48,14 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno)
49static void hpfs_truncate(struct inode *i) 48static void hpfs_truncate(struct inode *i)
50{ 49{
51 if (IS_IMMUTABLE(i)) return /*-EPERM*/; 50 if (IS_IMMUTABLE(i)) return /*-EPERM*/;
52 lock_kernel(); 51 hpfs_lock(i->i_sb);
53 hpfs_i(i)->i_n_secs = 0; 52 hpfs_i(i)->i_n_secs = 0;
54 i->i_blocks = 1 + ((i->i_size + 511) >> 9); 53 i->i_blocks = 1 + ((i->i_size + 511) >> 9);
55 hpfs_i(i)->mmu_private = i->i_size; 54 hpfs_i(i)->mmu_private = i->i_size;
56 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9)); 55 hpfs_truncate_btree(i->i_sb, i->i_ino, 1, ((i->i_size + 511) >> 9));
57 hpfs_write_inode(i); 56 hpfs_write_inode(i);
58 hpfs_i(i)->i_n_secs = 0; 57 hpfs_i(i)->i_n_secs = 0;
59 unlock_kernel(); 58 hpfs_unlock(i->i_sb);
60} 59}
61 60
62static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) 61static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create)
@@ -120,7 +119,6 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block)
120const struct address_space_operations hpfs_aops = { 119const struct address_space_operations hpfs_aops = {
121 .readpage = hpfs_readpage, 120 .readpage = hpfs_readpage,
122 .writepage = hpfs_writepage, 121 .writepage = hpfs_writepage,
123 .sync_page = block_sync_page,
124 .write_begin = hpfs_write_begin, 122 .write_begin = hpfs_write_begin,
125 .write_end = generic_write_end, 123 .write_end = generic_write_end,
126 .bmap = _hpfs_bmap 124 .bmap = _hpfs_bmap
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 1c43dbea55e8..c15adbca07ff 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -342,3 +342,25 @@ static inline time32_t gmt_to_local(struct super_block *s, time_t t)
342 extern struct timezone sys_tz; 342 extern struct timezone sys_tz;
343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; 343 return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift;
344} 344}
345
346/*
347 * Locking:
348 *
349 * hpfs_lock() is a leftover from the big kernel lock.
350 * Right now, these functions are empty and only left
351 * for documentation purposes. The file system no longer
352 * works on SMP systems, so the lock is not needed
353 * any more.
354 *
355 * If someone is interested in making it work again, this
356 * would be the place to start by adding a per-superblock
357 * mutex and fixing all the bugs and performance issues
358 * caused by that.
359 */
360static inline void hpfs_lock(struct super_block *s)
361{
362}
363
364static inline void hpfs_unlock(struct super_block *s)
365{
366}
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 1ae35baa539e..87f1f787e767 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -6,7 +6,6 @@
6 * inode VFS functions 6 * inode VFS functions
7 */ 7 */
8 8
9#include <linux/smp_lock.h>
10#include <linux/slab.h> 9#include <linux/slab.h>
11#include "hpfs_fn.h" 10#include "hpfs_fn.h"
12 11
@@ -267,7 +266,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
267 struct inode *inode = dentry->d_inode; 266 struct inode *inode = dentry->d_inode;
268 int error = -EINVAL; 267 int error = -EINVAL;
269 268
270 lock_kernel(); 269 hpfs_lock(inode->i_sb);
271 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root) 270 if (inode->i_ino == hpfs_sb(inode->i_sb)->sb_root)
272 goto out_unlock; 271 goto out_unlock;
273 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) 272 if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size)
@@ -290,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr)
290 hpfs_write_inode(inode); 289 hpfs_write_inode(inode);
291 290
292 out_unlock: 291 out_unlock:
293 unlock_kernel(); 292 hpfs_unlock(inode->i_sb);
294 return error; 293 return error;
295} 294}
296 295
@@ -307,8 +306,8 @@ void hpfs_evict_inode(struct inode *inode)
307 truncate_inode_pages(&inode->i_data, 0); 306 truncate_inode_pages(&inode->i_data, 0);
308 end_writeback(inode); 307 end_writeback(inode);
309 if (!inode->i_nlink) { 308 if (!inode->i_nlink) {
310 lock_kernel(); 309 hpfs_lock(inode->i_sb);
311 hpfs_remove_fnode(inode->i_sb, inode->i_ino); 310 hpfs_remove_fnode(inode->i_sb, inode->i_ino);
312 unlock_kernel(); 311 hpfs_unlock(inode->i_sb);
313 } 312 }
314} 313}
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index f4ad9e31ddc4..d5f8c8a19023 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -6,7 +6,6 @@
6 * adding & removing files & directories 6 * adding & removing files & directories
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/smp_lock.h>
10#include "hpfs_fn.h" 9#include "hpfs_fn.h"
11 10
12static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) 11static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
@@ -25,7 +24,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
25 struct hpfs_dirent dee; 24 struct hpfs_dirent dee;
26 int err; 25 int err;
27 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 26 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
28 lock_kernel(); 27 hpfs_lock(dir->i_sb);
29 err = -ENOSPC; 28 err = -ENOSPC;
30 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 29 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
31 if (!fnode) 30 if (!fnode)
@@ -103,7 +102,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
103 } 102 }
104 d_instantiate(dentry, result); 103 d_instantiate(dentry, result);
105 mutex_unlock(&hpfs_i(dir)->i_mutex); 104 mutex_unlock(&hpfs_i(dir)->i_mutex);
106 unlock_kernel(); 105 hpfs_unlock(dir->i_sb);
107 return 0; 106 return 0;
108bail3: 107bail3:
109 mutex_unlock(&hpfs_i(dir)->i_mutex); 108 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -115,7 +114,7 @@ bail1:
115 brelse(bh); 114 brelse(bh);
116 hpfs_free_sectors(dir->i_sb, fno, 1); 115 hpfs_free_sectors(dir->i_sb, fno, 1);
117bail: 116bail:
118 unlock_kernel(); 117 hpfs_unlock(dir->i_sb);
119 return err; 118 return err;
120} 119}
121 120
@@ -132,7 +131,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
132 int err; 131 int err;
133 if ((err = hpfs_chk_name(name, &len))) 132 if ((err = hpfs_chk_name(name, &len)))
134 return err==-ENOENT ? -EINVAL : err; 133 return err==-ENOENT ? -EINVAL : err;
135 lock_kernel(); 134 hpfs_lock(dir->i_sb);
136 err = -ENOSPC; 135 err = -ENOSPC;
137 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 136 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
138 if (!fnode) 137 if (!fnode)
@@ -195,7 +194,7 @@ static int hpfs_create(struct inode *dir, struct dentry *dentry, int mode, struc
195 } 194 }
196 d_instantiate(dentry, result); 195 d_instantiate(dentry, result);
197 mutex_unlock(&hpfs_i(dir)->i_mutex); 196 mutex_unlock(&hpfs_i(dir)->i_mutex);
198 unlock_kernel(); 197 hpfs_unlock(dir->i_sb);
199 return 0; 198 return 0;
200 199
201bail2: 200bail2:
@@ -205,7 +204,7 @@ bail1:
205 brelse(bh); 204 brelse(bh);
206 hpfs_free_sectors(dir->i_sb, fno, 1); 205 hpfs_free_sectors(dir->i_sb, fno, 1);
207bail: 206bail:
208 unlock_kernel(); 207 hpfs_unlock(dir->i_sb);
209 return err; 208 return err;
210} 209}
211 210
@@ -224,7 +223,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
224 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; 223 if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
225 if (!new_valid_dev(rdev)) 224 if (!new_valid_dev(rdev))
226 return -EINVAL; 225 return -EINVAL;
227 lock_kernel(); 226 hpfs_lock(dir->i_sb);
228 err = -ENOSPC; 227 err = -ENOSPC;
229 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); 228 fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
230 if (!fnode) 229 if (!fnode)
@@ -274,7 +273,7 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t
274 d_instantiate(dentry, result); 273 d_instantiate(dentry, result);
275 mutex_unlock(&hpfs_i(dir)->i_mutex); 274 mutex_unlock(&hpfs_i(dir)->i_mutex);
276 brelse(bh); 275 brelse(bh);
277 unlock_kernel(); 276 hpfs_unlock(dir->i_sb);
278 return 0; 277 return 0;
279bail2: 278bail2:
280 mutex_unlock(&hpfs_i(dir)->i_mutex); 279 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -283,7 +282,7 @@ bail1:
283 brelse(bh); 282 brelse(bh);
284 hpfs_free_sectors(dir->i_sb, fno, 1); 283 hpfs_free_sectors(dir->i_sb, fno, 1);
285bail: 284bail:
286 unlock_kernel(); 285 hpfs_unlock(dir->i_sb);
287 return err; 286 return err;
288} 287}
289 288
@@ -299,9 +298,9 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
299 struct inode *result; 298 struct inode *result;
300 int err; 299 int err;
301 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; 300 if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
302 lock_kernel(); 301 hpfs_lock(dir->i_sb);
303 if (hpfs_sb(dir->i_sb)->sb_eas < 2) { 302 if (hpfs_sb(dir->i_sb)->sb_eas < 2) {
304 unlock_kernel(); 303 hpfs_unlock(dir->i_sb);
305 return -EPERM; 304 return -EPERM;
306 } 305 }
307 err = -ENOSPC; 306 err = -ENOSPC;
@@ -354,7 +353,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
354 hpfs_write_inode_nolock(result); 353 hpfs_write_inode_nolock(result);
355 d_instantiate(dentry, result); 354 d_instantiate(dentry, result);
356 mutex_unlock(&hpfs_i(dir)->i_mutex); 355 mutex_unlock(&hpfs_i(dir)->i_mutex);
357 unlock_kernel(); 356 hpfs_unlock(dir->i_sb);
358 return 0; 357 return 0;
359bail2: 358bail2:
360 mutex_unlock(&hpfs_i(dir)->i_mutex); 359 mutex_unlock(&hpfs_i(dir)->i_mutex);
@@ -363,7 +362,7 @@ bail1:
363 brelse(bh); 362 brelse(bh);
364 hpfs_free_sectors(dir->i_sb, fno, 1); 363 hpfs_free_sectors(dir->i_sb, fno, 1);
365bail: 364bail:
366 unlock_kernel(); 365 hpfs_unlock(dir->i_sb);
367 return err; 366 return err;
368} 367}
369 368
@@ -380,7 +379,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry)
380 int rep = 0; 379 int rep = 0;
381 int err; 380 int err;
382 381
383 lock_kernel(); 382 hpfs_lock(dir->i_sb);
384 hpfs_adjust_length(name, &len); 383 hpfs_adjust_length(name, &len);
385again: 384again:
386 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 385 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
@@ -416,7 +415,7 @@ again:
416 dentry_unhash(dentry); 415 dentry_unhash(dentry);
417 if (!d_unhashed(dentry)) { 416 if (!d_unhashed(dentry)) {
418 dput(dentry); 417 dput(dentry);
419 unlock_kernel(); 418 hpfs_unlock(dir->i_sb);
420 return -ENOSPC; 419 return -ENOSPC;
421 } 420 }
422 if (generic_permission(inode, MAY_WRITE, 0, NULL) || 421 if (generic_permission(inode, MAY_WRITE, 0, NULL) ||
@@ -435,7 +434,7 @@ again:
435 if (!err) 434 if (!err)
436 goto again; 435 goto again;
437 } 436 }
438 unlock_kernel(); 437 hpfs_unlock(dir->i_sb);
439 return -ENOSPC; 438 return -ENOSPC;
440 default: 439 default:
441 drop_nlink(inode); 440 drop_nlink(inode);
@@ -448,7 +447,7 @@ out1:
448out: 447out:
449 mutex_unlock(&hpfs_i(dir)->i_mutex); 448 mutex_unlock(&hpfs_i(dir)->i_mutex);
450 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 449 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
451 unlock_kernel(); 450 hpfs_unlock(dir->i_sb);
452 return err; 451 return err;
453} 452}
454 453
@@ -466,7 +465,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry)
466 int r; 465 int r;
467 466
468 hpfs_adjust_length(name, &len); 467 hpfs_adjust_length(name, &len);
469 lock_kernel(); 468 hpfs_lock(dir->i_sb);
470 mutex_lock(&hpfs_i(inode)->i_parent_mutex); 469 mutex_lock(&hpfs_i(inode)->i_parent_mutex);
471 mutex_lock(&hpfs_i(dir)->i_mutex); 470 mutex_lock(&hpfs_i(dir)->i_mutex);
472 err = -ENOENT; 471 err = -ENOENT;
@@ -508,7 +507,7 @@ out1:
508out: 507out:
509 mutex_unlock(&hpfs_i(dir)->i_mutex); 508 mutex_unlock(&hpfs_i(dir)->i_mutex);
510 mutex_unlock(&hpfs_i(inode)->i_parent_mutex); 509 mutex_unlock(&hpfs_i(inode)->i_parent_mutex);
511 unlock_kernel(); 510 hpfs_unlock(dir->i_sb);
512 return err; 511 return err;
513} 512}
514 513
@@ -521,21 +520,21 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
521 int err; 520 int err;
522 521
523 err = -EIO; 522 err = -EIO;
524 lock_kernel(); 523 hpfs_lock(i->i_sb);
525 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) 524 if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh)))
526 goto fail; 525 goto fail;
527 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE); 526 err = hpfs_read_ea(i->i_sb, fnode, "SYMLINK", link, PAGE_SIZE);
528 brelse(bh); 527 brelse(bh);
529 if (err) 528 if (err)
530 goto fail; 529 goto fail;
531 unlock_kernel(); 530 hpfs_unlock(i->i_sb);
532 SetPageUptodate(page); 531 SetPageUptodate(page);
533 kunmap(page); 532 kunmap(page);
534 unlock_page(page); 533 unlock_page(page);
535 return 0; 534 return 0;
536 535
537fail: 536fail:
538 unlock_kernel(); 537 hpfs_unlock(i->i_sb);
539 SetPageError(page); 538 SetPageError(page);
540 kunmap(page); 539 kunmap(page);
541 unlock_page(page); 540 unlock_page(page);
@@ -567,7 +566,7 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry,
567 err = 0; 566 err = 0;
568 hpfs_adjust_length(old_name, &old_len); 567 hpfs_adjust_length(old_name, &old_len);
569 568
570 lock_kernel(); 569 hpfs_lock(i->i_sb);
571 /* order doesn't matter, due to VFS exclusion */ 570 /* order doesn't matter, due to VFS exclusion */
572 mutex_lock(&hpfs_i(i)->i_parent_mutex); 571 mutex_lock(&hpfs_i(i)->i_parent_mutex);
573 if (new_inode) 572 if (new_inode)
@@ -659,7 +658,7 @@ end1:
659 mutex_unlock(&hpfs_i(i)->i_parent_mutex); 658 mutex_unlock(&hpfs_i(i)->i_parent_mutex);
660 if (new_inode) 659 if (new_inode)
661 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex); 660 mutex_unlock(&hpfs_i(new_inode)->i_parent_mutex);
662 unlock_kernel(); 661 hpfs_unlock(i->i_sb);
663 return err; 662 return err;
664} 663}
665 664
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index b30426b1fc97..c89b40808587 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,7 +13,6 @@
13#include <linux/statfs.h> 13#include <linux/statfs.h>
14#include <linux/magic.h> 14#include <linux/magic.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/smp_lock.h>
17#include <linux/bitmap.h> 16#include <linux/bitmap.h>
18#include <linux/slab.h> 17#include <linux/slab.h>
19 18
@@ -103,15 +102,11 @@ static void hpfs_put_super(struct super_block *s)
103{ 102{
104 struct hpfs_sb_info *sbi = hpfs_sb(s); 103 struct hpfs_sb_info *sbi = hpfs_sb(s);
105 104
106 lock_kernel();
107
108 kfree(sbi->sb_cp_table); 105 kfree(sbi->sb_cp_table);
109 kfree(sbi->sb_bmp_dir); 106 kfree(sbi->sb_bmp_dir);
110 unmark_dirty(s); 107 unmark_dirty(s);
111 s->s_fs_info = NULL; 108 s->s_fs_info = NULL;
112 kfree(sbi); 109 kfree(sbi);
113
114 unlock_kernel();
115} 110}
116 111
117unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) 112unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
@@ -143,7 +138,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
143 struct super_block *s = dentry->d_sb; 138 struct super_block *s = dentry->d_sb;
144 struct hpfs_sb_info *sbi = hpfs_sb(s); 139 struct hpfs_sb_info *sbi = hpfs_sb(s);
145 u64 id = huge_encode_dev(s->s_bdev->bd_dev); 140 u64 id = huge_encode_dev(s->s_bdev->bd_dev);
146 lock_kernel(); 141 hpfs_lock(s);
147 142
148 /*if (sbi->sb_n_free == -1) {*/ 143 /*if (sbi->sb_n_free == -1) {*/
149 sbi->sb_n_free = count_bitmaps(s); 144 sbi->sb_n_free = count_bitmaps(s);
@@ -160,7 +155,7 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf)
160 buf->f_fsid.val[1] = (u32)(id >> 32); 155 buf->f_fsid.val[1] = (u32)(id >> 32);
161 buf->f_namelen = 254; 156 buf->f_namelen = 254;
162 157
163 unlock_kernel(); 158 hpfs_unlock(s);
164 159
165 return 0; 160 return 0;
166} 161}
@@ -406,7 +401,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
406 401
407 *flags |= MS_NOATIME; 402 *flags |= MS_NOATIME;
408 403
409 lock_kernel(); 404 hpfs_lock(s);
410 lock_super(s); 405 lock_super(s);
411 uid = sbi->sb_uid; gid = sbi->sb_gid; 406 uid = sbi->sb_uid; gid = sbi->sb_gid;
412 umask = 0777 & ~sbi->sb_mode; 407 umask = 0777 & ~sbi->sb_mode;
@@ -441,12 +436,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
441 replace_mount_options(s, new_opts); 436 replace_mount_options(s, new_opts);
442 437
443 unlock_super(s); 438 unlock_super(s);
444 unlock_kernel(); 439 hpfs_unlock(s);
445 return 0; 440 return 0;
446 441
447out_err: 442out_err:
448 unlock_super(s); 443 unlock_super(s);
449 unlock_kernel(); 444 hpfs_unlock(s);
450 kfree(new_opts); 445 kfree(new_opts);
451 return -EINVAL; 446 return -EINVAL;
452} 447}
@@ -484,13 +479,15 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
484 479
485 int o; 480 int o;
486 481
487 lock_kernel(); 482 if (num_possible_cpus() > 1) {
483 printk(KERN_ERR "HPFS is not SMP safe\n");
484 return -EINVAL;
485 }
488 486
489 save_mount_options(s, options); 487 save_mount_options(s, options);
490 488
491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 489 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
492 if (!sbi) { 490 if (!sbi) {
493 unlock_kernel();
494 return -ENOMEM; 491 return -ENOMEM;
495 } 492 }
496 s->s_fs_info = sbi; 493 s->s_fs_info = sbi;
@@ -677,7 +674,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
677 root->i_blocks = 5; 674 root->i_blocks = 5;
678 hpfs_brelse4(&qbh); 675 hpfs_brelse4(&qbh);
679 } 676 }
680 unlock_kernel();
681 return 0; 677 return 0;
682 678
683bail4: brelse(bh2); 679bail4: brelse(bh2);
@@ -689,7 +685,6 @@ bail0:
689 kfree(sbi->sb_cp_table); 685 kfree(sbi->sb_cp_table);
690 s->s_fs_info = NULL; 686 s->s_fs_info = NULL;
691 kfree(sbi); 687 kfree(sbi);
692 unlock_kernel();
693 return -EINVAL; 688 return -EINVAL;
694} 689}
695 690
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9885082b470f..b9eeb1cd03ff 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -332,8 +332,7 @@ static void truncate_huge_page(struct page *page)
332{ 332{
333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0); 333 cancel_dirty_page(page, /* No IO accounting for huge pages? */0);
334 ClearPageUptodate(page); 334 ClearPageUptodate(page);
335 remove_from_page_cache(page); 335 delete_from_page_cache(page);
336 put_page(page);
337} 336}
338 337
339static void truncate_hugepages(struct inode *inode, loff_t lstart) 338static void truncate_hugepages(struct inode *inode, loff_t lstart)
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..33c963d08ab4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -25,6 +25,39 @@
25#include <linux/async.h> 25#include <linux/async.h>
26#include <linux/posix_acl.h> 26#include <linux/posix_acl.h>
27#include <linux/ima.h> 27#include <linux/ima.h>
28#include <linux/cred.h>
29#include "internal.h"
30
31/*
32 * inode locking rules.
33 *
34 * inode->i_lock protects:
35 * inode->i_state, inode->i_hash, __iget()
36 * inode_lru_lock protects:
37 * inode_lru, inode->i_lru
38 * inode_sb_list_lock protects:
39 * sb->s_inodes, inode->i_sb_list
40 * inode_wb_list_lock protects:
41 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
42 * inode_hash_lock protects:
43 * inode_hashtable, inode->i_hash
44 *
45 * Lock ordering:
46 *
47 * inode_sb_list_lock
48 * inode->i_lock
49 * inode_lru_lock
50 *
51 * inode_wb_list_lock
52 * inode->i_lock
53 *
54 * inode_hash_lock
55 * inode_sb_list_lock
56 * inode->i_lock
57 *
58 * iunique_lock
59 * inode_hash_lock
60 */
28 61
29/* 62/*
30 * This is needed for the following functions: 63 * This is needed for the following functions:
@@ -59,6 +92,8 @@
59 92
60static unsigned int i_hash_mask __read_mostly; 93static unsigned int i_hash_mask __read_mostly;
61static unsigned int i_hash_shift __read_mostly; 94static unsigned int i_hash_shift __read_mostly;
95static struct hlist_head *inode_hashtable __read_mostly;
96static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
62 97
63/* 98/*
64 * Each inode can be on two separate lists. One is 99 * Each inode can be on two separate lists. One is
@@ -73,29 +108,29 @@ static unsigned int i_hash_shift __read_mostly;
73 */ 108 */
74 109
75static LIST_HEAD(inode_lru); 110static LIST_HEAD(inode_lru);
76static struct hlist_head *inode_hashtable __read_mostly; 111static DEFINE_SPINLOCK(inode_lru_lock);
112
113__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
114__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
77 115
78/* 116/*
79 * A simple spinlock to protect the list manipulations. 117 * iprune_sem provides exclusion between the icache shrinking and the
118 * umount path.
80 * 119 *
81 * NOTE! You also have to own the lock if you change 120 * We don't actually need it to protect anything in the umount path,
82 * the i_state of an inode while it is in use.. 121 * but only need to cycle through it to make sure any inode that
122 * prune_icache took off the LRU list has been fully torn down by the
123 * time we are past evict_inodes.
83 */ 124 */
84DEFINE_SPINLOCK(inode_lock); 125static DECLARE_RWSEM(iprune_sem);
85 126
86/* 127/*
87 * iprune_sem provides exclusion between the kswapd or try_to_free_pages 128 * Empty aops. Can be used for the cases where the user does not
88 * icache shrinking path, and the umount path. Without this exclusion, 129 * define any of the address_space operations.
89 * by the time prune_icache calls iput for the inode whose pages it has
90 * been invalidating, or by the time it calls clear_inode & destroy_inode
91 * from its final dispose_list, the struct super_block they refer to
92 * (for inode->i_sb->s_op) may already have been freed and reused.
93 *
94 * We make this an rwsem because the fastpath is icache shrinking. In
95 * some cases a filesystem may be doing a significant amount of work in
96 * its inode reclaim code, so this should improve parallelism.
97 */ 130 */
98static DECLARE_RWSEM(iprune_sem); 131const struct address_space_operations empty_aops = {
132};
133EXPORT_SYMBOL(empty_aops);
99 134
100/* 135/*
101 * Statistics gathering.. 136 * Statistics gathering..
@@ -139,15 +174,6 @@ int proc_nr_inodes(ctl_table *table, int write,
139} 174}
140#endif 175#endif
141 176
142static void wake_up_inode(struct inode *inode)
143{
144 /*
145 * Prevent speculative execution through spin_unlock(&inode_lock);
146 */
147 smp_mb();
148 wake_up_bit(&inode->i_state, __I_NEW);
149}
150
151/** 177/**
152 * inode_init_always - perform inode structure intialisation 178 * inode_init_always - perform inode structure intialisation
153 * @sb: superblock inode belongs to 179 * @sb: superblock inode belongs to
@@ -158,7 +184,6 @@ static void wake_up_inode(struct inode *inode)
158 */ 184 */
159int inode_init_always(struct super_block *sb, struct inode *inode) 185int inode_init_always(struct super_block *sb, struct inode *inode)
160{ 186{
161 static const struct address_space_operations empty_aops;
162 static const struct inode_operations empty_iops; 187 static const struct inode_operations empty_iops;
163 static const struct file_operations empty_fops; 188 static const struct file_operations empty_fops;
164 struct address_space *const mapping = &inode->i_data; 189 struct address_space *const mapping = &inode->i_data;
@@ -295,6 +320,20 @@ static void destroy_inode(struct inode *inode)
295 call_rcu(&inode->i_rcu, i_callback); 320 call_rcu(&inode->i_rcu, i_callback);
296} 321}
297 322
323void address_space_init_once(struct address_space *mapping)
324{
325 memset(mapping, 0, sizeof(*mapping));
326 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
327 spin_lock_init(&mapping->tree_lock);
328 spin_lock_init(&mapping->i_mmap_lock);
329 INIT_LIST_HEAD(&mapping->private_list);
330 spin_lock_init(&mapping->private_lock);
331 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
332 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
333 mutex_init(&mapping->unmap_mutex);
334}
335EXPORT_SYMBOL(address_space_init_once);
336
298/* 337/*
299 * These are initializations that only need to be done 338 * These are initializations that only need to be done
300 * once, because the fields are idempotent across use 339 * once, because the fields are idempotent across use
@@ -308,13 +347,7 @@ void inode_init_once(struct inode *inode)
308 INIT_LIST_HEAD(&inode->i_devices); 347 INIT_LIST_HEAD(&inode->i_devices);
309 INIT_LIST_HEAD(&inode->i_wb_list); 348 INIT_LIST_HEAD(&inode->i_wb_list);
310 INIT_LIST_HEAD(&inode->i_lru); 349 INIT_LIST_HEAD(&inode->i_lru);
311 INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); 350 address_space_init_once(&inode->i_data);
312 spin_lock_init(&inode->i_data.tree_lock);
313 spin_lock_init(&inode->i_data.i_mmap_lock);
314 INIT_LIST_HEAD(&inode->i_data.private_list);
315 spin_lock_init(&inode->i_data.private_lock);
316 INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
317 INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
318 i_size_ordered_init(inode); 351 i_size_ordered_init(inode);
319#ifdef CONFIG_FSNOTIFY 352#ifdef CONFIG_FSNOTIFY
320 INIT_HLIST_HEAD(&inode->i_fsnotify_marks); 353 INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
@@ -330,7 +363,7 @@ static void init_once(void *foo)
330} 363}
331 364
332/* 365/*
333 * inode_lock must be held 366 * inode->i_lock must be held
334 */ 367 */
335void __iget(struct inode *inode) 368void __iget(struct inode *inode)
336{ 369{
@@ -348,23 +381,22 @@ EXPORT_SYMBOL(ihold);
348 381
349static void inode_lru_list_add(struct inode *inode) 382static void inode_lru_list_add(struct inode *inode)
350{ 383{
384 spin_lock(&inode_lru_lock);
351 if (list_empty(&inode->i_lru)) { 385 if (list_empty(&inode->i_lru)) {
352 list_add(&inode->i_lru, &inode_lru); 386 list_add(&inode->i_lru, &inode_lru);
353 inodes_stat.nr_unused++; 387 inodes_stat.nr_unused++;
354 } 388 }
389 spin_unlock(&inode_lru_lock);
355} 390}
356 391
357static void inode_lru_list_del(struct inode *inode) 392static void inode_lru_list_del(struct inode *inode)
358{ 393{
394 spin_lock(&inode_lru_lock);
359 if (!list_empty(&inode->i_lru)) { 395 if (!list_empty(&inode->i_lru)) {
360 list_del_init(&inode->i_lru); 396 list_del_init(&inode->i_lru);
361 inodes_stat.nr_unused--; 397 inodes_stat.nr_unused--;
362 } 398 }
363} 399 spin_unlock(&inode_lru_lock);
364
365static inline void __inode_sb_list_add(struct inode *inode)
366{
367 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
368} 400}
369 401
370/** 402/**
@@ -373,15 +405,17 @@ static inline void __inode_sb_list_add(struct inode *inode)
373 */ 405 */
374void inode_sb_list_add(struct inode *inode) 406void inode_sb_list_add(struct inode *inode)
375{ 407{
376 spin_lock(&inode_lock); 408 spin_lock(&inode_sb_list_lock);
377 __inode_sb_list_add(inode); 409 list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
378 spin_unlock(&inode_lock); 410 spin_unlock(&inode_sb_list_lock);
379} 411}
380EXPORT_SYMBOL_GPL(inode_sb_list_add); 412EXPORT_SYMBOL_GPL(inode_sb_list_add);
381 413
382static inline void __inode_sb_list_del(struct inode *inode) 414static inline void inode_sb_list_del(struct inode *inode)
383{ 415{
416 spin_lock(&inode_sb_list_lock);
384 list_del_init(&inode->i_sb_list); 417 list_del_init(&inode->i_sb_list);
418 spin_unlock(&inode_sb_list_lock);
385} 419}
386 420
387static unsigned long hash(struct super_block *sb, unsigned long hashval) 421static unsigned long hash(struct super_block *sb, unsigned long hashval)
@@ -406,24 +440,15 @@ void __insert_inode_hash(struct inode *inode, unsigned long hashval)
406{ 440{
407 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); 441 struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
408 442
409 spin_lock(&inode_lock); 443 spin_lock(&inode_hash_lock);
444 spin_lock(&inode->i_lock);
410 hlist_add_head(&inode->i_hash, b); 445 hlist_add_head(&inode->i_hash, b);
411 spin_unlock(&inode_lock); 446 spin_unlock(&inode->i_lock);
447 spin_unlock(&inode_hash_lock);
412} 448}
413EXPORT_SYMBOL(__insert_inode_hash); 449EXPORT_SYMBOL(__insert_inode_hash);
414 450
415/** 451/**
416 * __remove_inode_hash - remove an inode from the hash
417 * @inode: inode to unhash
418 *
419 * Remove an inode from the superblock.
420 */
421static void __remove_inode_hash(struct inode *inode)
422{
423 hlist_del_init(&inode->i_hash);
424}
425
426/**
427 * remove_inode_hash - remove an inode from the hash 452 * remove_inode_hash - remove an inode from the hash
428 * @inode: inode to unhash 453 * @inode: inode to unhash
429 * 454 *
@@ -431,9 +456,11 @@ static void __remove_inode_hash(struct inode *inode)
431 */ 456 */
432void remove_inode_hash(struct inode *inode) 457void remove_inode_hash(struct inode *inode)
433{ 458{
434 spin_lock(&inode_lock); 459 spin_lock(&inode_hash_lock);
460 spin_lock(&inode->i_lock);
435 hlist_del_init(&inode->i_hash); 461 hlist_del_init(&inode->i_hash);
436 spin_unlock(&inode_lock); 462 spin_unlock(&inode->i_lock);
463 spin_unlock(&inode_hash_lock);
437} 464}
438EXPORT_SYMBOL(remove_inode_hash); 465EXPORT_SYMBOL(remove_inode_hash);
439 466
@@ -450,10 +477,29 @@ void end_writeback(struct inode *inode)
450} 477}
451EXPORT_SYMBOL(end_writeback); 478EXPORT_SYMBOL(end_writeback);
452 479
480/*
481 * Free the inode passed in, removing it from the lists it is still connected
482 * to. We remove any pages still attached to the inode and wait for any IO that
483 * is still in progress before finally destroying the inode.
484 *
485 * An inode must already be marked I_FREEING so that we avoid the inode being
486 * moved back onto lists if we race with other code that manipulates the lists
487 * (e.g. writeback_single_inode). The caller is responsible for setting this.
488 *
489 * An inode must already be removed from the LRU list before being evicted from
490 * the cache. This should occur atomically with setting the I_FREEING state
491 * flag, so no inodes here should ever be on the LRU when being evicted.
492 */
453static void evict(struct inode *inode) 493static void evict(struct inode *inode)
454{ 494{
455 const struct super_operations *op = inode->i_sb->s_op; 495 const struct super_operations *op = inode->i_sb->s_op;
456 496
497 BUG_ON(!(inode->i_state & I_FREEING));
498 BUG_ON(!list_empty(&inode->i_lru));
499
500 inode_wb_list_del(inode);
501 inode_sb_list_del(inode);
502
457 if (op->evict_inode) { 503 if (op->evict_inode) {
458 op->evict_inode(inode); 504 op->evict_inode(inode);
459 } else { 505 } else {
@@ -465,6 +511,15 @@ static void evict(struct inode *inode)
465 bd_forget(inode); 511 bd_forget(inode);
466 if (S_ISCHR(inode->i_mode) && inode->i_cdev) 512 if (S_ISCHR(inode->i_mode) && inode->i_cdev)
467 cd_forget(inode); 513 cd_forget(inode);
514
515 remove_inode_hash(inode);
516
517 spin_lock(&inode->i_lock);
518 wake_up_bit(&inode->i_state, __I_NEW);
519 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
520 spin_unlock(&inode->i_lock);
521
522 destroy_inode(inode);
468} 523}
469 524
470/* 525/*
@@ -483,14 +538,6 @@ static void dispose_list(struct list_head *head)
483 list_del_init(&inode->i_lru); 538 list_del_init(&inode->i_lru);
484 539
485 evict(inode); 540 evict(inode);
486
487 spin_lock(&inode_lock);
488 __remove_inode_hash(inode);
489 __inode_sb_list_del(inode);
490 spin_unlock(&inode_lock);
491
492 wake_up_inode(inode);
493 destroy_inode(inode);
494 } 541 }
495} 542}
496 543
@@ -508,74 +555,77 @@ void evict_inodes(struct super_block *sb)
508 struct inode *inode, *next; 555 struct inode *inode, *next;
509 LIST_HEAD(dispose); 556 LIST_HEAD(dispose);
510 557
511 down_write(&iprune_sem); 558 spin_lock(&inode_sb_list_lock);
512
513 spin_lock(&inode_lock);
514 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 559 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
515 if (atomic_read(&inode->i_count)) 560 if (atomic_read(&inode->i_count))
516 continue; 561 continue;
517 562
563 spin_lock(&inode->i_lock);
518 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { 564 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
519 WARN_ON(1); 565 spin_unlock(&inode->i_lock);
520 continue; 566 continue;
521 } 567 }
522 568
523 inode->i_state |= I_FREEING; 569 inode->i_state |= I_FREEING;
524 570 inode_lru_list_del(inode);
525 /* 571 spin_unlock(&inode->i_lock);
526 * Move the inode off the IO lists and LRU once I_FREEING is 572 list_add(&inode->i_lru, &dispose);
527 * set so that it won't get moved back on there if it is dirty.
528 */
529 list_move(&inode->i_lru, &dispose);
530 list_del_init(&inode->i_wb_list);
531 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
532 inodes_stat.nr_unused--;
533 } 573 }
534 spin_unlock(&inode_lock); 574 spin_unlock(&inode_sb_list_lock);
535 575
536 dispose_list(&dispose); 576 dispose_list(&dispose);
577
578 /*
579 * Cycle through iprune_sem to make sure any inode that prune_icache
580 * moved off the list before we took the lock has been fully torn
581 * down.
582 */
583 down_write(&iprune_sem);
537 up_write(&iprune_sem); 584 up_write(&iprune_sem);
538} 585}
539 586
540/** 587/**
541 * invalidate_inodes - attempt to free all inodes on a superblock 588 * invalidate_inodes - attempt to free all inodes on a superblock
542 * @sb: superblock to operate on 589 * @sb: superblock to operate on
590 * @kill_dirty: flag to guide handling of dirty inodes
543 * 591 *
544 * Attempts to free all inodes for a given superblock. If there were any 592 * Attempts to free all inodes for a given superblock. If there were any
545 * busy inodes return a non-zero value, else zero. 593 * busy inodes return a non-zero value, else zero.
594 * If @kill_dirty is set, discard dirty inodes too, otherwise treat
595 * them as busy.
546 */ 596 */
547int invalidate_inodes(struct super_block *sb) 597int invalidate_inodes(struct super_block *sb, bool kill_dirty)
548{ 598{
549 int busy = 0; 599 int busy = 0;
550 struct inode *inode, *next; 600 struct inode *inode, *next;
551 LIST_HEAD(dispose); 601 LIST_HEAD(dispose);
552 602
553 down_write(&iprune_sem); 603 spin_lock(&inode_sb_list_lock);
554
555 spin_lock(&inode_lock);
556 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { 604 list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
557 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) 605 spin_lock(&inode->i_lock);
606 if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
607 spin_unlock(&inode->i_lock);
558 continue; 608 continue;
609 }
610 if (inode->i_state & I_DIRTY && !kill_dirty) {
611 spin_unlock(&inode->i_lock);
612 busy = 1;
613 continue;
614 }
559 if (atomic_read(&inode->i_count)) { 615 if (atomic_read(&inode->i_count)) {
616 spin_unlock(&inode->i_lock);
560 busy = 1; 617 busy = 1;
561 continue; 618 continue;
562 } 619 }
563 620
564 inode->i_state |= I_FREEING; 621 inode->i_state |= I_FREEING;
565 622 inode_lru_list_del(inode);
566 /* 623 spin_unlock(&inode->i_lock);
567 * Move the inode off the IO lists and LRU once I_FREEING is 624 list_add(&inode->i_lru, &dispose);
568 * set so that it won't get moved back on there if it is dirty.
569 */
570 list_move(&inode->i_lru, &dispose);
571 list_del_init(&inode->i_wb_list);
572 if (!(inode->i_state & (I_DIRTY | I_SYNC)))
573 inodes_stat.nr_unused--;
574 } 625 }
575 spin_unlock(&inode_lock); 626 spin_unlock(&inode_sb_list_lock);
576 627
577 dispose_list(&dispose); 628 dispose_list(&dispose);
578 up_write(&iprune_sem);
579 629
580 return busy; 630 return busy;
581} 631}
@@ -595,7 +645,7 @@ static int can_unuse(struct inode *inode)
595 645
596/* 646/*
597 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a 647 * Scan `goal' inodes on the unused list for freeable ones. They are moved to a
598 * temporary list and then are freed outside inode_lock by dispose_list(). 648 * temporary list and then are freed outside inode_lru_lock by dispose_list().
599 * 649 *
600 * Any inodes which are pinned purely because of attached pagecache have their 650 * Any inodes which are pinned purely because of attached pagecache have their
601 * pagecache removed. If the inode has metadata buffers attached to 651 * pagecache removed. If the inode has metadata buffers attached to
@@ -616,7 +666,7 @@ static void prune_icache(int nr_to_scan)
616 unsigned long reap = 0; 666 unsigned long reap = 0;
617 667
618 down_read(&iprune_sem); 668 down_read(&iprune_sem);
619 spin_lock(&inode_lock); 669 spin_lock(&inode_lru_lock);
620 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { 670 for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) {
621 struct inode *inode; 671 struct inode *inode;
622 672
@@ -626,53 +676,67 @@ static void prune_icache(int nr_to_scan)
626 inode = list_entry(inode_lru.prev, struct inode, i_lru); 676 inode = list_entry(inode_lru.prev, struct inode, i_lru);
627 677
628 /* 678 /*
679 * we are inverting the inode_lru_lock/inode->i_lock here,
680 * so use a trylock. If we fail to get the lock, just move the
681 * inode to the back of the list so we don't spin on it.
682 */
683 if (!spin_trylock(&inode->i_lock)) {
684 list_move(&inode->i_lru, &inode_lru);
685 continue;
686 }
687
688 /*
629 * Referenced or dirty inodes are still in use. Give them 689 * Referenced or dirty inodes are still in use. Give them
630 * another pass through the LRU as we canot reclaim them now. 690 * another pass through the LRU as we canot reclaim them now.
631 */ 691 */
632 if (atomic_read(&inode->i_count) || 692 if (atomic_read(&inode->i_count) ||
633 (inode->i_state & ~I_REFERENCED)) { 693 (inode->i_state & ~I_REFERENCED)) {
634 list_del_init(&inode->i_lru); 694 list_del_init(&inode->i_lru);
695 spin_unlock(&inode->i_lock);
635 inodes_stat.nr_unused--; 696 inodes_stat.nr_unused--;
636 continue; 697 continue;
637 } 698 }
638 699
639 /* recently referenced inodes get one more pass */ 700 /* recently referenced inodes get one more pass */
640 if (inode->i_state & I_REFERENCED) { 701 if (inode->i_state & I_REFERENCED) {
641 list_move(&inode->i_lru, &inode_lru);
642 inode->i_state &= ~I_REFERENCED; 702 inode->i_state &= ~I_REFERENCED;
703 list_move(&inode->i_lru, &inode_lru);
704 spin_unlock(&inode->i_lock);
643 continue; 705 continue;
644 } 706 }
645 if (inode_has_buffers(inode) || inode->i_data.nrpages) { 707 if (inode_has_buffers(inode) || inode->i_data.nrpages) {
646 __iget(inode); 708 __iget(inode);
647 spin_unlock(&inode_lock); 709 spin_unlock(&inode->i_lock);
710 spin_unlock(&inode_lru_lock);
648 if (remove_inode_buffers(inode)) 711 if (remove_inode_buffers(inode))
649 reap += invalidate_mapping_pages(&inode->i_data, 712 reap += invalidate_mapping_pages(&inode->i_data,
650 0, -1); 713 0, -1);
651 iput(inode); 714 iput(inode);
652 spin_lock(&inode_lock); 715 spin_lock(&inode_lru_lock);
653 716
654 if (inode != list_entry(inode_lru.next, 717 if (inode != list_entry(inode_lru.next,
655 struct inode, i_lru)) 718 struct inode, i_lru))
656 continue; /* wrong inode or list_empty */ 719 continue; /* wrong inode or list_empty */
657 if (!can_unuse(inode)) 720 /* avoid lock inversions with trylock */
721 if (!spin_trylock(&inode->i_lock))
722 continue;
723 if (!can_unuse(inode)) {
724 spin_unlock(&inode->i_lock);
658 continue; 725 continue;
726 }
659 } 727 }
660 WARN_ON(inode->i_state & I_NEW); 728 WARN_ON(inode->i_state & I_NEW);
661 inode->i_state |= I_FREEING; 729 inode->i_state |= I_FREEING;
730 spin_unlock(&inode->i_lock);
662 731
663 /*
664 * Move the inode off the IO lists and LRU once I_FREEING is
665 * set so that it won't get moved back on there if it is dirty.
666 */
667 list_move(&inode->i_lru, &freeable); 732 list_move(&inode->i_lru, &freeable);
668 list_del_init(&inode->i_wb_list);
669 inodes_stat.nr_unused--; 733 inodes_stat.nr_unused--;
670 } 734 }
671 if (current_is_kswapd()) 735 if (current_is_kswapd())
672 __count_vm_events(KSWAPD_INODESTEAL, reap); 736 __count_vm_events(KSWAPD_INODESTEAL, reap);
673 else 737 else
674 __count_vm_events(PGINODESTEAL, reap); 738 __count_vm_events(PGINODESTEAL, reap);
675 spin_unlock(&inode_lock); 739 spin_unlock(&inode_lru_lock);
676 740
677 dispose_list(&freeable); 741 dispose_list(&freeable);
678 up_read(&iprune_sem); 742 up_read(&iprune_sem);
@@ -721,15 +785,21 @@ static struct inode *find_inode(struct super_block *sb,
721 785
722repeat: 786repeat:
723 hlist_for_each_entry(inode, node, head, i_hash) { 787 hlist_for_each_entry(inode, node, head, i_hash) {
724 if (inode->i_sb != sb) 788 spin_lock(&inode->i_lock);
789 if (inode->i_sb != sb) {
790 spin_unlock(&inode->i_lock);
725 continue; 791 continue;
726 if (!test(inode, data)) 792 }
793 if (!test(inode, data)) {
794 spin_unlock(&inode->i_lock);
727 continue; 795 continue;
796 }
728 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 797 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
729 __wait_on_freeing_inode(inode); 798 __wait_on_freeing_inode(inode);
730 goto repeat; 799 goto repeat;
731 } 800 }
732 __iget(inode); 801 __iget(inode);
802 spin_unlock(&inode->i_lock);
733 return inode; 803 return inode;
734 } 804 }
735 return NULL; 805 return NULL;
@@ -747,15 +817,21 @@ static struct inode *find_inode_fast(struct super_block *sb,
747 817
748repeat: 818repeat:
749 hlist_for_each_entry(inode, node, head, i_hash) { 819 hlist_for_each_entry(inode, node, head, i_hash) {
750 if (inode->i_ino != ino) 820 spin_lock(&inode->i_lock);
821 if (inode->i_ino != ino) {
822 spin_unlock(&inode->i_lock);
751 continue; 823 continue;
752 if (inode->i_sb != sb) 824 }
825 if (inode->i_sb != sb) {
826 spin_unlock(&inode->i_lock);
753 continue; 827 continue;
828 }
754 if (inode->i_state & (I_FREEING|I_WILL_FREE)) { 829 if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
755 __wait_on_freeing_inode(inode); 830 __wait_on_freeing_inode(inode);
756 goto repeat; 831 goto repeat;
757 } 832 }
758 __iget(inode); 833 __iget(inode);
834 spin_unlock(&inode->i_lock);
759 return inode; 835 return inode;
760 } 836 }
761 return NULL; 837 return NULL;
@@ -815,19 +891,26 @@ struct inode *new_inode(struct super_block *sb)
815{ 891{
816 struct inode *inode; 892 struct inode *inode;
817 893
818 spin_lock_prefetch(&inode_lock); 894 spin_lock_prefetch(&inode_sb_list_lock);
819 895
820 inode = alloc_inode(sb); 896 inode = alloc_inode(sb);
821 if (inode) { 897 if (inode) {
822 spin_lock(&inode_lock); 898 spin_lock(&inode->i_lock);
823 __inode_sb_list_add(inode);
824 inode->i_state = 0; 899 inode->i_state = 0;
825 spin_unlock(&inode_lock); 900 spin_unlock(&inode->i_lock);
901 inode_sb_list_add(inode);
826 } 902 }
827 return inode; 903 return inode;
828} 904}
829EXPORT_SYMBOL(new_inode); 905EXPORT_SYMBOL(new_inode);
830 906
907/**
908 * unlock_new_inode - clear the I_NEW state and wake up any waiters
909 * @inode: new inode to unlock
910 *
911 * Called when the inode is fully initialised to clear the new state of the
912 * inode and wake up anyone waiting for the inode to finish initialisation.
913 */
831void unlock_new_inode(struct inode *inode) 914void unlock_new_inode(struct inode *inode)
832{ 915{
833#ifdef CONFIG_DEBUG_LOCK_ALLOC 916#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -847,51 +930,67 @@ void unlock_new_inode(struct inode *inode)
847 } 930 }
848 } 931 }
849#endif 932#endif
850 /* 933 spin_lock(&inode->i_lock);
851 * This is special! We do not need the spinlock when clearing I_NEW,
852 * because we're guaranteed that nobody else tries to do anything about
853 * the state of the inode when it is locked, as we just created it (so
854 * there can be no old holders that haven't tested I_NEW).
855 * However we must emit the memory barrier so that other CPUs reliably
856 * see the clearing of I_NEW after the other inode initialisation has
857 * completed.
858 */
859 smp_mb();
860 WARN_ON(!(inode->i_state & I_NEW)); 934 WARN_ON(!(inode->i_state & I_NEW));
861 inode->i_state &= ~I_NEW; 935 inode->i_state &= ~I_NEW;
862 wake_up_inode(inode); 936 wake_up_bit(&inode->i_state, __I_NEW);
937 spin_unlock(&inode->i_lock);
863} 938}
864EXPORT_SYMBOL(unlock_new_inode); 939EXPORT_SYMBOL(unlock_new_inode);
865 940
866/* 941/**
867 * This is called without the inode lock held.. Be careful. 942 * iget5_locked - obtain an inode from a mounted file system
943 * @sb: super block of file system
944 * @hashval: hash value (usually inode number) to get
945 * @test: callback used for comparisons between inodes
946 * @set: callback used to initialize a new struct inode
947 * @data: opaque data pointer to pass to @test and @set
948 *
949 * Search for the inode specified by @hashval and @data in the inode cache,
950 * and if present it is return it with an increased reference count. This is
951 * a generalized version of iget_locked() for file systems where the inode
952 * number is not sufficient for unique identification of an inode.
868 * 953 *
869 * We no longer cache the sb_flags in i_flags - see fs.h 954 * If the inode is not in cache, allocate a new inode and return it locked,
870 * -- rmk@arm.uk.linux.org 955 * hashed, and with the I_NEW flag set. The file system gets to fill it in
956 * before unlocking it via unlock_new_inode().
957 *
958 * Note both @test and @set are called with the inode_hash_lock held, so can't
959 * sleep.
871 */ 960 */
872static struct inode *get_new_inode(struct super_block *sb, 961struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
873 struct hlist_head *head, 962 int (*test)(struct inode *, void *),
874 int (*test)(struct inode *, void *), 963 int (*set)(struct inode *, void *), void *data)
875 int (*set)(struct inode *, void *),
876 void *data)
877{ 964{
965 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
878 struct inode *inode; 966 struct inode *inode;
879 967
968 spin_lock(&inode_hash_lock);
969 inode = find_inode(sb, head, test, data);
970 spin_unlock(&inode_hash_lock);
971
972 if (inode) {
973 wait_on_inode(inode);
974 return inode;
975 }
976
880 inode = alloc_inode(sb); 977 inode = alloc_inode(sb);
881 if (inode) { 978 if (inode) {
882 struct inode *old; 979 struct inode *old;
883 980
884 spin_lock(&inode_lock); 981 spin_lock(&inode_hash_lock);
885 /* We released the lock, so.. */ 982 /* We released the lock, so.. */
886 old = find_inode(sb, head, test, data); 983 old = find_inode(sb, head, test, data);
887 if (!old) { 984 if (!old) {
888 if (set(inode, data)) 985 if (set(inode, data))
889 goto set_failed; 986 goto set_failed;
890 987
891 hlist_add_head(&inode->i_hash, head); 988 spin_lock(&inode->i_lock);
892 __inode_sb_list_add(inode);
893 inode->i_state = I_NEW; 989 inode->i_state = I_NEW;
894 spin_unlock(&inode_lock); 990 hlist_add_head(&inode->i_hash, head);
991 spin_unlock(&inode->i_lock);
992 inode_sb_list_add(inode);
993 spin_unlock(&inode_hash_lock);
895 994
896 /* Return the locked inode with I_NEW set, the 995 /* Return the locked inode with I_NEW set, the
897 * caller is responsible for filling in the contents 996 * caller is responsible for filling in the contents
@@ -904,7 +1003,7 @@ static struct inode *get_new_inode(struct super_block *sb,
904 * us. Use the old inode instead of the one we just 1003 * us. Use the old inode instead of the one we just
905 * allocated. 1004 * allocated.
906 */ 1005 */
907 spin_unlock(&inode_lock); 1006 spin_unlock(&inode_hash_lock);
908 destroy_inode(inode); 1007 destroy_inode(inode);
909 inode = old; 1008 inode = old;
910 wait_on_inode(inode); 1009 wait_on_inode(inode);
@@ -912,33 +1011,53 @@ static struct inode *get_new_inode(struct super_block *sb,
912 return inode; 1011 return inode;
913 1012
914set_failed: 1013set_failed:
915 spin_unlock(&inode_lock); 1014 spin_unlock(&inode_hash_lock);
916 destroy_inode(inode); 1015 destroy_inode(inode);
917 return NULL; 1016 return NULL;
918} 1017}
1018EXPORT_SYMBOL(iget5_locked);
919 1019
920/* 1020/**
921 * get_new_inode_fast is the fast path version of get_new_inode, see the 1021 * iget_locked - obtain an inode from a mounted file system
922 * comment at iget_locked for details. 1022 * @sb: super block of file system
1023 * @ino: inode number to get
1024 *
1025 * Search for the inode specified by @ino in the inode cache and if present
1026 * return it with an increased reference count. This is for file systems
1027 * where the inode number is sufficient for unique identification of an inode.
1028 *
1029 * If the inode is not in cache, allocate a new inode and return it locked,
1030 * hashed, and with the I_NEW flag set. The file system gets to fill it in
1031 * before unlocking it via unlock_new_inode().
923 */ 1032 */
924static struct inode *get_new_inode_fast(struct super_block *sb, 1033struct inode *iget_locked(struct super_block *sb, unsigned long ino)
925 struct hlist_head *head, unsigned long ino)
926{ 1034{
1035 struct hlist_head *head = inode_hashtable + hash(sb, ino);
927 struct inode *inode; 1036 struct inode *inode;
928 1037
1038 spin_lock(&inode_hash_lock);
1039 inode = find_inode_fast(sb, head, ino);
1040 spin_unlock(&inode_hash_lock);
1041 if (inode) {
1042 wait_on_inode(inode);
1043 return inode;
1044 }
1045
929 inode = alloc_inode(sb); 1046 inode = alloc_inode(sb);
930 if (inode) { 1047 if (inode) {
931 struct inode *old; 1048 struct inode *old;
932 1049
933 spin_lock(&inode_lock); 1050 spin_lock(&inode_hash_lock);
934 /* We released the lock, so.. */ 1051 /* We released the lock, so.. */
935 old = find_inode_fast(sb, head, ino); 1052 old = find_inode_fast(sb, head, ino);
936 if (!old) { 1053 if (!old) {
937 inode->i_ino = ino; 1054 inode->i_ino = ino;
938 hlist_add_head(&inode->i_hash, head); 1055 spin_lock(&inode->i_lock);
939 __inode_sb_list_add(inode);
940 inode->i_state = I_NEW; 1056 inode->i_state = I_NEW;
941 spin_unlock(&inode_lock); 1057 hlist_add_head(&inode->i_hash, head);
1058 spin_unlock(&inode->i_lock);
1059 inode_sb_list_add(inode);
1060 spin_unlock(&inode_hash_lock);
942 1061
943 /* Return the locked inode with I_NEW set, the 1062 /* Return the locked inode with I_NEW set, the
944 * caller is responsible for filling in the contents 1063 * caller is responsible for filling in the contents
@@ -951,13 +1070,14 @@ static struct inode *get_new_inode_fast(struct super_block *sb,
951 * us. Use the old inode instead of the one we just 1070 * us. Use the old inode instead of the one we just
952 * allocated. 1071 * allocated.
953 */ 1072 */
954 spin_unlock(&inode_lock); 1073 spin_unlock(&inode_hash_lock);
955 destroy_inode(inode); 1074 destroy_inode(inode);
956 inode = old; 1075 inode = old;
957 wait_on_inode(inode); 1076 wait_on_inode(inode);
958 } 1077 }
959 return inode; 1078 return inode;
960} 1079}
1080EXPORT_SYMBOL(iget_locked);
961 1081
962/* 1082/*
963 * search the inode cache for a matching inode number. 1083 * search the inode cache for a matching inode number.
@@ -972,10 +1092,14 @@ static int test_inode_iunique(struct super_block *sb, unsigned long ino)
972 struct hlist_node *node; 1092 struct hlist_node *node;
973 struct inode *inode; 1093 struct inode *inode;
974 1094
1095 spin_lock(&inode_hash_lock);
975 hlist_for_each_entry(inode, node, b, i_hash) { 1096 hlist_for_each_entry(inode, node, b, i_hash) {
976 if (inode->i_ino == ino && inode->i_sb == sb) 1097 if (inode->i_ino == ino && inode->i_sb == sb) {
1098 spin_unlock(&inode_hash_lock);
977 return 0; 1099 return 0;
1100 }
978 } 1101 }
1102 spin_unlock(&inode_hash_lock);
979 1103
980 return 1; 1104 return 1;
981} 1105}
@@ -1005,7 +1129,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1005 static unsigned int counter; 1129 static unsigned int counter;
1006 ino_t res; 1130 ino_t res;
1007 1131
1008 spin_lock(&inode_lock);
1009 spin_lock(&iunique_lock); 1132 spin_lock(&iunique_lock);
1010 do { 1133 do {
1011 if (counter <= max_reserved) 1134 if (counter <= max_reserved)
@@ -1013,7 +1136,6 @@ ino_t iunique(struct super_block *sb, ino_t max_reserved)
1013 res = counter++; 1136 res = counter++;
1014 } while (!test_inode_iunique(sb, res)); 1137 } while (!test_inode_iunique(sb, res));
1015 spin_unlock(&iunique_lock); 1138 spin_unlock(&iunique_lock);
1016 spin_unlock(&inode_lock);
1017 1139
1018 return res; 1140 return res;
1019} 1141}
@@ -1021,116 +1143,50 @@ EXPORT_SYMBOL(iunique);
1021 1143
1022struct inode *igrab(struct inode *inode) 1144struct inode *igrab(struct inode *inode)
1023{ 1145{
1024 spin_lock(&inode_lock); 1146 spin_lock(&inode->i_lock);
1025 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) 1147 if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
1026 __iget(inode); 1148 __iget(inode);
1027 else 1149 spin_unlock(&inode->i_lock);
1150 } else {
1151 spin_unlock(&inode->i_lock);
1028 /* 1152 /*
1029 * Handle the case where s_op->clear_inode is not been 1153 * Handle the case where s_op->clear_inode is not been
1030 * called yet, and somebody is calling igrab 1154 * called yet, and somebody is calling igrab
1031 * while the inode is getting freed. 1155 * while the inode is getting freed.
1032 */ 1156 */
1033 inode = NULL; 1157 inode = NULL;
1034 spin_unlock(&inode_lock); 1158 }
1035 return inode; 1159 return inode;
1036} 1160}
1037EXPORT_SYMBOL(igrab); 1161EXPORT_SYMBOL(igrab);
1038 1162
1039/** 1163/**
1040 * ifind - internal function, you want ilookup5() or iget5().
1041 * @sb: super block of file system to search
1042 * @head: the head of the list to search
1043 * @test: callback used for comparisons between inodes
1044 * @data: opaque data pointer to pass to @test
1045 * @wait: if true wait for the inode to be unlocked, if false do not
1046 *
1047 * ifind() searches for the inode specified by @data in the inode
1048 * cache. This is a generalized version of ifind_fast() for file systems where
1049 * the inode number is not sufficient for unique identification of an inode.
1050 *
1051 * If the inode is in the cache, the inode is returned with an incremented
1052 * reference count.
1053 *
1054 * Otherwise NULL is returned.
1055 *
1056 * Note, @test is called with the inode_lock held, so can't sleep.
1057 */
1058static struct inode *ifind(struct super_block *sb,
1059 struct hlist_head *head, int (*test)(struct inode *, void *),
1060 void *data, const int wait)
1061{
1062 struct inode *inode;
1063
1064 spin_lock(&inode_lock);
1065 inode = find_inode(sb, head, test, data);
1066 if (inode) {
1067 spin_unlock(&inode_lock);
1068 if (likely(wait))
1069 wait_on_inode(inode);
1070 return inode;
1071 }
1072 spin_unlock(&inode_lock);
1073 return NULL;
1074}
1075
1076/**
1077 * ifind_fast - internal function, you want ilookup() or iget().
1078 * @sb: super block of file system to search
1079 * @head: head of the list to search
1080 * @ino: inode number to search for
1081 *
1082 * ifind_fast() searches for the inode @ino in the inode cache. This is for
1083 * file systems where the inode number is sufficient for unique identification
1084 * of an inode.
1085 *
1086 * If the inode is in the cache, the inode is returned with an incremented
1087 * reference count.
1088 *
1089 * Otherwise NULL is returned.
1090 */
1091static struct inode *ifind_fast(struct super_block *sb,
1092 struct hlist_head *head, unsigned long ino)
1093{
1094 struct inode *inode;
1095
1096 spin_lock(&inode_lock);
1097 inode = find_inode_fast(sb, head, ino);
1098 if (inode) {
1099 spin_unlock(&inode_lock);
1100 wait_on_inode(inode);
1101 return inode;
1102 }
1103 spin_unlock(&inode_lock);
1104 return NULL;
1105}
1106
1107/**
1108 * ilookup5_nowait - search for an inode in the inode cache 1164 * ilookup5_nowait - search for an inode in the inode cache
1109 * @sb: super block of file system to search 1165 * @sb: super block of file system to search
1110 * @hashval: hash value (usually inode number) to search for 1166 * @hashval: hash value (usually inode number) to search for
1111 * @test: callback used for comparisons between inodes 1167 * @test: callback used for comparisons between inodes
1112 * @data: opaque data pointer to pass to @test 1168 * @data: opaque data pointer to pass to @test
1113 * 1169 *
1114 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1170 * Search for the inode specified by @hashval and @data in the inode cache.
1115 * @data in the inode cache. This is a generalized version of ilookup() for
1116 * file systems where the inode number is not sufficient for unique
1117 * identification of an inode.
1118 *
1119 * If the inode is in the cache, the inode is returned with an incremented 1171 * If the inode is in the cache, the inode is returned with an incremented
1120 * reference count. Note, the inode lock is not waited upon so you have to be 1172 * reference count.
1121 * very careful what you do with the returned inode. You probably should be
1122 * using ilookup5() instead.
1123 * 1173 *
1124 * Otherwise NULL is returned. 1174 * Note: I_NEW is not waited upon so you have to be very careful what you do
1175 * with the returned inode. You probably should be using ilookup5() instead.
1125 * 1176 *
1126 * Note, @test is called with the inode_lock held, so can't sleep. 1177 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
1127 */ 1178 */
1128struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, 1179struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
1129 int (*test)(struct inode *, void *), void *data) 1180 int (*test)(struct inode *, void *), void *data)
1130{ 1181{
1131 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1182 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1183 struct inode *inode;
1132 1184
1133 return ifind(sb, head, test, data, 0); 1185 spin_lock(&inode_hash_lock);
1186 inode = find_inode(sb, head, test, data);
1187 spin_unlock(&inode_hash_lock);
1188
1189 return inode;
1134} 1190}
1135EXPORT_SYMBOL(ilookup5_nowait); 1191EXPORT_SYMBOL(ilookup5_nowait);
1136 1192
@@ -1141,24 +1197,24 @@ EXPORT_SYMBOL(ilookup5_nowait);
1141 * @test: callback used for comparisons between inodes 1197 * @test: callback used for comparisons between inodes
1142 * @data: opaque data pointer to pass to @test 1198 * @data: opaque data pointer to pass to @test
1143 * 1199 *
1144 * ilookup5() uses ifind() to search for the inode specified by @hashval and 1200 * Search for the inode specified by @hashval and @data in the inode cache,
1145 * @data in the inode cache. This is a generalized version of ilookup() for 1201 * and if the inode is in the cache, return the inode with an incremented
1146 * file systems where the inode number is not sufficient for unique 1202 * reference count. Waits on I_NEW before returning the inode.
1147 * identification of an inode.
1148 *
1149 * If the inode is in the cache, the inode lock is waited upon and the inode is
1150 * returned with an incremented reference count. 1203 * returned with an incremented reference count.
1151 * 1204 *
1152 * Otherwise NULL is returned. 1205 * This is a generalized version of ilookup() for file systems where the
1206 * inode number is not sufficient for unique identification of an inode.
1153 * 1207 *
1154 * Note, @test is called with the inode_lock held, so can't sleep. 1208 * Note: @test is called with the inode_hash_lock held, so can't sleep.
1155 */ 1209 */
1156struct inode *ilookup5(struct super_block *sb, unsigned long hashval, 1210struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
1157 int (*test)(struct inode *, void *), void *data) 1211 int (*test)(struct inode *, void *), void *data)
1158{ 1212{
1159 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1213 struct inode *inode = ilookup5_nowait(sb, hashval, test, data);
1160 1214
1161 return ifind(sb, head, test, data, 1); 1215 if (inode)
1216 wait_on_inode(inode);
1217 return inode;
1162} 1218}
1163EXPORT_SYMBOL(ilookup5); 1219EXPORT_SYMBOL(ilookup5);
1164 1220
@@ -1167,91 +1223,23 @@ EXPORT_SYMBOL(ilookup5);
1167 * @sb: super block of file system to search 1223 * @sb: super block of file system to search
1168 * @ino: inode number to search for 1224 * @ino: inode number to search for
1169 * 1225 *
1170 * ilookup() uses ifind_fast() to search for the inode @ino in the inode cache. 1226 * Search for the inode @ino in the inode cache, and if the inode is in the
1171 * This is for file systems where the inode number is sufficient for unique 1227 * cache, the inode is returned with an incremented reference count.
1172 * identification of an inode.
1173 *
1174 * If the inode is in the cache, the inode is returned with an incremented
1175 * reference count.
1176 *
1177 * Otherwise NULL is returned.
1178 */ 1228 */
1179struct inode *ilookup(struct super_block *sb, unsigned long ino) 1229struct inode *ilookup(struct super_block *sb, unsigned long ino)
1180{ 1230{
1181 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1231 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1182
1183 return ifind_fast(sb, head, ino);
1184}
1185EXPORT_SYMBOL(ilookup);
1186
1187/**
1188 * iget5_locked - obtain an inode from a mounted file system
1189 * @sb: super block of file system
1190 * @hashval: hash value (usually inode number) to get
1191 * @test: callback used for comparisons between inodes
1192 * @set: callback used to initialize a new struct inode
1193 * @data: opaque data pointer to pass to @test and @set
1194 *
1195 * iget5_locked() uses ifind() to search for the inode specified by @hashval
1196 * and @data in the inode cache and if present it is returned with an increased
1197 * reference count. This is a generalized version of iget_locked() for file
1198 * systems where the inode number is not sufficient for unique identification
1199 * of an inode.
1200 *
1201 * If the inode is not in cache, get_new_inode() is called to allocate a new
1202 * inode and this is returned locked, hashed, and with the I_NEW flag set. The
1203 * file system gets to fill it in before unlocking it via unlock_new_inode().
1204 *
1205 * Note both @test and @set are called with the inode_lock held, so can't sleep.
1206 */
1207struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
1208 int (*test)(struct inode *, void *),
1209 int (*set)(struct inode *, void *), void *data)
1210{
1211 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1212 struct inode *inode; 1232 struct inode *inode;
1213 1233
1214 inode = ifind(sb, head, test, data, 1); 1234 spin_lock(&inode_hash_lock);
1215 if (inode) 1235 inode = find_inode_fast(sb, head, ino);
1216 return inode; 1236 spin_unlock(&inode_hash_lock);
1217 /*
1218 * get_new_inode() will do the right thing, re-trying the search
1219 * in case it had to block at any point.
1220 */
1221 return get_new_inode(sb, head, test, set, data);
1222}
1223EXPORT_SYMBOL(iget5_locked);
1224
1225/**
1226 * iget_locked - obtain an inode from a mounted file system
1227 * @sb: super block of file system
1228 * @ino: inode number to get
1229 *
1230 * iget_locked() uses ifind_fast() to search for the inode specified by @ino in
1231 * the inode cache and if present it is returned with an increased reference
1232 * count. This is for file systems where the inode number is sufficient for
1233 * unique identification of an inode.
1234 *
1235 * If the inode is not in cache, get_new_inode_fast() is called to allocate a
1236 * new inode and this is returned locked, hashed, and with the I_NEW flag set.
1237 * The file system gets to fill it in before unlocking it via
1238 * unlock_new_inode().
1239 */
1240struct inode *iget_locked(struct super_block *sb, unsigned long ino)
1241{
1242 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1243 struct inode *inode;
1244 1237
1245 inode = ifind_fast(sb, head, ino);
1246 if (inode) 1238 if (inode)
1247 return inode; 1239 wait_on_inode(inode);
1248 /* 1240 return inode;
1249 * get_new_inode_fast() will do the right thing, re-trying the search
1250 * in case it had to block at any point.
1251 */
1252 return get_new_inode_fast(sb, head, ino);
1253} 1241}
1254EXPORT_SYMBOL(iget_locked); 1242EXPORT_SYMBOL(ilookup);
1255 1243
1256int insert_inode_locked(struct inode *inode) 1244int insert_inode_locked(struct inode *inode)
1257{ 1245{
@@ -1259,27 +1247,33 @@ int insert_inode_locked(struct inode *inode)
1259 ino_t ino = inode->i_ino; 1247 ino_t ino = inode->i_ino;
1260 struct hlist_head *head = inode_hashtable + hash(sb, ino); 1248 struct hlist_head *head = inode_hashtable + hash(sb, ino);
1261 1249
1262 inode->i_state |= I_NEW;
1263 while (1) { 1250 while (1) {
1264 struct hlist_node *node; 1251 struct hlist_node *node;
1265 struct inode *old = NULL; 1252 struct inode *old = NULL;
1266 spin_lock(&inode_lock); 1253 spin_lock(&inode_hash_lock);
1267 hlist_for_each_entry(old, node, head, i_hash) { 1254 hlist_for_each_entry(old, node, head, i_hash) {
1268 if (old->i_ino != ino) 1255 if (old->i_ino != ino)
1269 continue; 1256 continue;
1270 if (old->i_sb != sb) 1257 if (old->i_sb != sb)
1271 continue; 1258 continue;
1272 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1259 spin_lock(&old->i_lock);
1260 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1261 spin_unlock(&old->i_lock);
1273 continue; 1262 continue;
1263 }
1274 break; 1264 break;
1275 } 1265 }
1276 if (likely(!node)) { 1266 if (likely(!node)) {
1267 spin_lock(&inode->i_lock);
1268 inode->i_state |= I_NEW;
1277 hlist_add_head(&inode->i_hash, head); 1269 hlist_add_head(&inode->i_hash, head);
1278 spin_unlock(&inode_lock); 1270 spin_unlock(&inode->i_lock);
1271 spin_unlock(&inode_hash_lock);
1279 return 0; 1272 return 0;
1280 } 1273 }
1281 __iget(old); 1274 __iget(old);
1282 spin_unlock(&inode_lock); 1275 spin_unlock(&old->i_lock);
1276 spin_unlock(&inode_hash_lock);
1283 wait_on_inode(old); 1277 wait_on_inode(old);
1284 if (unlikely(!inode_unhashed(old))) { 1278 if (unlikely(!inode_unhashed(old))) {
1285 iput(old); 1279 iput(old);
@@ -1296,29 +1290,34 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
1296 struct super_block *sb = inode->i_sb; 1290 struct super_block *sb = inode->i_sb;
1297 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 1291 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1298 1292
1299 inode->i_state |= I_NEW;
1300
1301 while (1) { 1293 while (1) {
1302 struct hlist_node *node; 1294 struct hlist_node *node;
1303 struct inode *old = NULL; 1295 struct inode *old = NULL;
1304 1296
1305 spin_lock(&inode_lock); 1297 spin_lock(&inode_hash_lock);
1306 hlist_for_each_entry(old, node, head, i_hash) { 1298 hlist_for_each_entry(old, node, head, i_hash) {
1307 if (old->i_sb != sb) 1299 if (old->i_sb != sb)
1308 continue; 1300 continue;
1309 if (!test(old, data)) 1301 if (!test(old, data))
1310 continue; 1302 continue;
1311 if (old->i_state & (I_FREEING|I_WILL_FREE)) 1303 spin_lock(&old->i_lock);
1304 if (old->i_state & (I_FREEING|I_WILL_FREE)) {
1305 spin_unlock(&old->i_lock);
1312 continue; 1306 continue;
1307 }
1313 break; 1308 break;
1314 } 1309 }
1315 if (likely(!node)) { 1310 if (likely(!node)) {
1311 spin_lock(&inode->i_lock);
1312 inode->i_state |= I_NEW;
1316 hlist_add_head(&inode->i_hash, head); 1313 hlist_add_head(&inode->i_hash, head);
1317 spin_unlock(&inode_lock); 1314 spin_unlock(&inode->i_lock);
1315 spin_unlock(&inode_hash_lock);
1318 return 0; 1316 return 0;
1319 } 1317 }
1320 __iget(old); 1318 __iget(old);
1321 spin_unlock(&inode_lock); 1319 spin_unlock(&old->i_lock);
1320 spin_unlock(&inode_hash_lock);
1322 wait_on_inode(old); 1321 wait_on_inode(old);
1323 if (unlikely(!inode_unhashed(old))) { 1322 if (unlikely(!inode_unhashed(old))) {
1324 iput(old); 1323 iput(old);
@@ -1363,47 +1362,35 @@ static void iput_final(struct inode *inode)
1363 const struct super_operations *op = inode->i_sb->s_op; 1362 const struct super_operations *op = inode->i_sb->s_op;
1364 int drop; 1363 int drop;
1365 1364
1365 WARN_ON(inode->i_state & I_NEW);
1366
1366 if (op && op->drop_inode) 1367 if (op && op->drop_inode)
1367 drop = op->drop_inode(inode); 1368 drop = op->drop_inode(inode);
1368 else 1369 else
1369 drop = generic_drop_inode(inode); 1370 drop = generic_drop_inode(inode);
1370 1371
1372 if (!drop && (sb->s_flags & MS_ACTIVE)) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
1375 inode_lru_list_add(inode);
1376 spin_unlock(&inode->i_lock);
1377 return;
1378 }
1379
1371 if (!drop) { 1380 if (!drop) {
1372 if (sb->s_flags & MS_ACTIVE) {
1373 inode->i_state |= I_REFERENCED;
1374 if (!(inode->i_state & (I_DIRTY|I_SYNC))) {
1375 inode_lru_list_add(inode);
1376 }
1377 spin_unlock(&inode_lock);
1378 return;
1379 }
1380 WARN_ON(inode->i_state & I_NEW);
1381 inode->i_state |= I_WILL_FREE; 1381 inode->i_state |= I_WILL_FREE;
1382 spin_unlock(&inode_lock); 1382 spin_unlock(&inode->i_lock);
1383 write_inode_now(inode, 1); 1383 write_inode_now(inode, 1);
1384 spin_lock(&inode_lock); 1384 spin_lock(&inode->i_lock);
1385 WARN_ON(inode->i_state & I_NEW); 1385 WARN_ON(inode->i_state & I_NEW);
1386 inode->i_state &= ~I_WILL_FREE; 1386 inode->i_state &= ~I_WILL_FREE;
1387 __remove_inode_hash(inode);
1388 } 1387 }
1389 1388
1390 WARN_ON(inode->i_state & I_NEW);
1391 inode->i_state |= I_FREEING; 1389 inode->i_state |= I_FREEING;
1392
1393 /*
1394 * Move the inode off the IO lists and LRU once I_FREEING is
1395 * set so that it won't get moved back on there if it is dirty.
1396 */
1397 inode_lru_list_del(inode); 1390 inode_lru_list_del(inode);
1398 list_del_init(&inode->i_wb_list); 1391 spin_unlock(&inode->i_lock);
1399 1392
1400 __inode_sb_list_del(inode);
1401 spin_unlock(&inode_lock);
1402 evict(inode); 1393 evict(inode);
1403 remove_inode_hash(inode);
1404 wake_up_inode(inode);
1405 BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
1406 destroy_inode(inode);
1407} 1394}
1408 1395
1409/** 1396/**
@@ -1420,7 +1407,7 @@ void iput(struct inode *inode)
1420 if (inode) { 1407 if (inode) {
1421 BUG_ON(inode->i_state & I_CLEAR); 1408 BUG_ON(inode->i_state & I_CLEAR);
1422 1409
1423 if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) 1410 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
1424 iput_final(inode); 1411 iput_final(inode);
1425 } 1412 }
1426} 1413}
@@ -1599,9 +1586,8 @@ EXPORT_SYMBOL(inode_wait);
1599 * to recheck inode state. 1586 * to recheck inode state.
1600 * 1587 *
1601 * It doesn't matter if I_NEW is not set initially, a call to 1588 * It doesn't matter if I_NEW is not set initially, a call to
1602 * wake_up_inode() after removing from the hash list will DTRT. 1589 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
1603 * 1590 * will DTRT.
1604 * This is called with inode_lock held.
1605 */ 1591 */
1606static void __wait_on_freeing_inode(struct inode *inode) 1592static void __wait_on_freeing_inode(struct inode *inode)
1607{ 1593{
@@ -1609,10 +1595,11 @@ static void __wait_on_freeing_inode(struct inode *inode)
1609 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); 1595 DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
1610 wq = bit_waitqueue(&inode->i_state, __I_NEW); 1596 wq = bit_waitqueue(&inode->i_state, __I_NEW);
1611 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); 1597 prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
1612 spin_unlock(&inode_lock); 1598 spin_unlock(&inode->i_lock);
1599 spin_unlock(&inode_hash_lock);
1613 schedule(); 1600 schedule();
1614 finish_wait(wq, &wait.wait); 1601 finish_wait(wq, &wait.wait);
1615 spin_lock(&inode_lock); 1602 spin_lock(&inode_hash_lock);
1616} 1603}
1617 1604
1618static __initdata unsigned long ihash_entries; 1605static __initdata unsigned long ihash_entries;
@@ -1704,7 +1691,7 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1704EXPORT_SYMBOL(init_special_inode); 1691EXPORT_SYMBOL(init_special_inode);
1705 1692
1706/** 1693/**
1707 * Init uid,gid,mode for new inode according to posix standards 1694 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
1708 * @inode: New inode 1695 * @inode: New inode
1709 * @dir: Directory inode 1696 * @dir: Directory inode
1710 * @mode: mode of the new inode 1697 * @mode: mode of the new inode
@@ -1722,3 +1709,22 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
1722 inode->i_mode = mode; 1709 inode->i_mode = mode;
1723} 1710}
1724EXPORT_SYMBOL(inode_init_owner); 1711EXPORT_SYMBOL(inode_init_owner);
1712
1713/**
1714 * inode_owner_or_capable - check current task permissions to inode
1715 * @inode: inode being checked
1716 *
1717 * Return true if current either has CAP_FOWNER to the inode, or
1718 * owns the file.
1719 */
1720bool inode_owner_or_capable(const struct inode *inode)
1721{
1722 struct user_namespace *ns = inode_userns(inode);
1723
1724 if (current_user_ns() == ns && current_fsuid() == inode->i_uid)
1725 return true;
1726 if (ns_capable(ns, CAP_FOWNER))
1727 return true;
1728 return false;
1729}
1730EXPORT_SYMBOL(inode_owner_or_capable);
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..b29c46e4e32f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -12,6 +12,7 @@
12#include <linux/lglock.h> 12#include <linux/lglock.h>
13 13
14struct super_block; 14struct super_block;
15struct file_system_type;
15struct linux_binprm; 16struct linux_binprm;
16struct path; 17struct path;
17 18
@@ -61,10 +62,9 @@ extern int check_unsafe_exec(struct linux_binprm *);
61extern int copy_mount_options(const void __user *, unsigned long *); 62extern int copy_mount_options(const void __user *, unsigned long *);
62extern int copy_mount_string(const void __user *, char **); 63extern int copy_mount_string(const void __user *, char **);
63 64
64extern void free_vfsmnt(struct vfsmount *);
65extern struct vfsmount *alloc_vfsmnt(const char *);
66extern unsigned int mnt_get_count(struct vfsmount *mnt); 65extern unsigned int mnt_get_count(struct vfsmount *mnt);
67extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int); 66extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
67extern struct vfsmount *lookup_mnt(struct path *);
68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, 68extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
69 struct vfsmount *); 69 struct vfsmount *);
70extern void release_mounts(struct list_head *); 70extern void release_mounts(struct list_head *);
@@ -99,6 +99,8 @@ extern struct file *get_empty_filp(void);
99extern int do_remount_sb(struct super_block *, int, void *, int); 99extern int do_remount_sb(struct super_block *, int, void *, int);
100extern void __put_super(struct super_block *sb); 100extern void __put_super(struct super_block *sb);
101extern void put_super(struct super_block *sb); 101extern void put_super(struct super_block *sb);
102extern struct dentry *mount_fs(struct file_system_type *,
103 int, const char *, void *);
102 104
103/* 105/*
104 * open.c 106 * open.c
@@ -106,10 +108,30 @@ extern void put_super(struct super_block *sb);
106struct nameidata; 108struct nameidata;
107extern struct file *nameidata_to_filp(struct nameidata *); 109extern struct file *nameidata_to_filp(struct nameidata *);
108extern void release_open_intent(struct nameidata *); 110extern void release_open_intent(struct nameidata *);
111struct open_flags {
112 int open_flag;
113 int mode;
114 int acc_mode;
115 int intent;
116};
117extern struct file *do_filp_open(int dfd, const char *pathname,
118 const struct open_flags *op, int lookup_flags);
119extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
120 const char *, const struct open_flags *, int lookup_flags);
121
122extern long do_handle_open(int mountdirfd,
123 struct file_handle __user *ufh, int open_flag);
109 124
110/* 125/*
111 * inode.c 126 * inode.c
112 */ 127 */
128extern spinlock_t inode_sb_list_lock;
129
130/*
131 * fs-writeback.c
132 */
133extern void inode_wb_list_del(struct inode *inode);
134
113extern int get_nr_dirty_inodes(void); 135extern int get_nr_dirty_inodes(void);
114extern void evict_inodes(struct super_block *); 136extern void evict_inodes(struct super_block *);
115extern int invalidate_inodes(struct super_block *); 137extern int invalidate_inodes(struct super_block *, bool);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index a59635e295fa..1d9b9fcb2db4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -273,6 +273,13 @@ int __generic_block_fiemap(struct inode *inode,
273 len = isize; 273 len = isize;
274 } 274 }
275 275
276 /*
277 * Some filesystems can't deal with being asked to map less than
278 * blocksize, so make sure our len is at least block length.
279 */
280 if (logical_to_blk(inode, len) == 0)
281 len = blk_to_logical(inode, 1);
282
276 start_blk = logical_to_blk(inode, start); 283 start_blk = logical_to_blk(inode, start);
277 last_blk = logical_to_blk(inode, start + len - 1); 284 last_blk = logical_to_blk(inode, start + len - 1);
278 285
@@ -541,6 +548,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
541{ 548{
542 int error = 0; 549 int error = 0;
543 int __user *argp = (int __user *)arg; 550 int __user *argp = (int __user *)arg;
551 struct inode *inode = filp->f_path.dentry->d_inode;
544 552
545 switch (cmd) { 553 switch (cmd) {
546 case FIOCLEX: 554 case FIOCLEX:
@@ -560,13 +568,11 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
560 break; 568 break;
561 569
562 case FIOQSIZE: 570 case FIOQSIZE:
563 if (S_ISDIR(filp->f_path.dentry->d_inode->i_mode) || 571 if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) ||
564 S_ISREG(filp->f_path.dentry->d_inode->i_mode) || 572 S_ISLNK(inode->i_mode)) {
565 S_ISLNK(filp->f_path.dentry->d_inode->i_mode)) { 573 loff_t res = inode_get_bytes(inode);
566 loff_t res = 574 error = copy_to_user(argp, &res, sizeof(res)) ?
567 inode_get_bytes(filp->f_path.dentry->d_inode); 575 -EFAULT : 0;
568 error = copy_to_user((loff_t __user *)arg, &res,
569 sizeof(res)) ? -EFAULT : 0;
570 } else 576 } else
571 error = -ENOTTY; 577 error = -ENOTTY;
572 break; 578 break;
@@ -583,14 +589,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
583 return ioctl_fiemap(filp, arg); 589 return ioctl_fiemap(filp, arg);
584 590
585 case FIGETBSZ: 591 case FIGETBSZ:
586 { 592 return put_user(inode->i_sb->s_blocksize, argp);
587 struct inode *inode = filp->f_path.dentry->d_inode;
588 int __user *p = (int __user *)arg;
589 return put_user(inode->i_sb->s_blocksize, p);
590 }
591 593
592 default: 594 default:
593 if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) 595 if (S_ISREG(inode->i_mode))
594 error = file_ioctl(filp, cmd, arg); 596 error = file_ioctl(filp, cmd, arg);
595 else 597 else
596 error = vfs_ioctl(filp, cmd, arg); 598 error = vfs_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb38474..dd4687ff30d0 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
124 * offset of the inode and the upper 16 bits of fh32[1] to 124 * offset of the inode and the upper 16 bits of fh32[1] to
125 * hold the offset of the parent. 125 * hold the offset of the parent.
126 */ 126 */
127 127 if (connectable && (len < 5)) {
128 if (len < 3 || (connectable && len < 5)) 128 *max_len = 5;
129 return 255;
130 } else if (len < 3) {
131 *max_len = 3;
129 return 255; 132 return 255;
133 }
130 134
131 len = 3; 135 len = 3;
132 fh32[0] = ei->i_iget5_block; 136 fh32[0] = ei->i_iget5_block;
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index a0f3833c0dbf..3db5ba4568fc 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1158,7 +1158,6 @@ static sector_t _isofs_bmap(struct address_space *mapping, sector_t block)
1158 1158
1159static const struct address_space_operations isofs_aops = { 1159static const struct address_space_operations isofs_aops = {
1160 .readpage = isofs_readpage, 1160 .readpage = isofs_readpage,
1161 .sync_page = block_sync_page,
1162 .bmap = _isofs_bmap 1161 .bmap = _isofs_bmap
1163}; 1162};
1164 1163
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 34a4861c14b8..69b180459463 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -20,6 +20,7 @@
20#include <linux/mm.h> 20#include <linux/mm.h>
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/blkdev.h>
23 24
24/* 25/*
25 * Default IO end handler for temporary BJ_IO buffer_heads. 26 * Default IO end handler for temporary BJ_IO buffer_heads.
@@ -294,7 +295,7 @@ void journal_commit_transaction(journal_t *journal)
294 int first_tag = 0; 295 int first_tag = 0;
295 int tag_flag; 296 int tag_flag;
296 int i; 297 int i;
297 int write_op = WRITE_SYNC; 298 struct blk_plug plug;
298 299
299 /* 300 /*
300 * First job: lock down the current transaction and wait for 301 * First job: lock down the current transaction and wait for
@@ -327,13 +328,6 @@ void journal_commit_transaction(journal_t *journal)
327 spin_lock(&journal->j_state_lock); 328 spin_lock(&journal->j_state_lock);
328 commit_transaction->t_state = T_LOCKED; 329 commit_transaction->t_state = T_LOCKED;
329 330
330 /*
331 * Use plugged writes here, since we want to submit several before
332 * we unplug the device. We don't do explicit unplugging in here,
333 * instead we rely on sync_buffer() doing the unplug for us.
334 */
335 if (commit_transaction->t_synchronous_commit)
336 write_op = WRITE_SYNC_PLUG;
337 spin_lock(&commit_transaction->t_handle_lock); 331 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) { 332 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait); 333 DEFINE_WAIT(wait);
@@ -368,7 +362,7 @@ void journal_commit_transaction(journal_t *journal)
368 * we do not require it to remember exactly which old buffers it 362 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour 363 * has reserved. This is consistent with the existing behaviour
370 * that multiple journal_get_write_access() calls to the same 364 * that multiple journal_get_write_access() calls to the same
371 * buffer are perfectly permissable. 365 * buffer are perfectly permissible.
372 */ 366 */
373 while (commit_transaction->t_reserved_list) { 367 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list; 368 jh = commit_transaction->t_reserved_list;
@@ -418,8 +412,10 @@ void journal_commit_transaction(journal_t *journal)
418 * Now start flushing things to disk, in the order they appear 412 * Now start flushing things to disk, in the order they appear
419 * on the transaction lists. Data blocks go first. 413 * on the transaction lists. Data blocks go first.
420 */ 414 */
415 blk_start_plug(&plug);
421 err = journal_submit_data_buffers(journal, commit_transaction, 416 err = journal_submit_data_buffers(journal, commit_transaction,
422 write_op); 417 WRITE_SYNC);
418 blk_finish_plug(&plug);
423 419
424 /* 420 /*
425 * Wait for all previously submitted IO to complete. 421 * Wait for all previously submitted IO to complete.
@@ -480,7 +476,9 @@ void journal_commit_transaction(journal_t *journal)
480 err = 0; 476 err = 0;
481 } 477 }
482 478
483 journal_write_revoke_records(journal, commit_transaction, write_op); 479 blk_start_plug(&plug);
480
481 journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);
484 482
485 /* 483 /*
486 * If we found any dirty or locked buffers, then we should have 484 * If we found any dirty or locked buffers, then we should have
@@ -650,7 +648,7 @@ start_journal_io:
650 clear_buffer_dirty(bh); 648 clear_buffer_dirty(bh);
651 set_buffer_uptodate(bh); 649 set_buffer_uptodate(bh);
652 bh->b_end_io = journal_end_buffer_io_sync; 650 bh->b_end_io = journal_end_buffer_io_sync;
653 submit_bh(write_op, bh); 651 submit_bh(WRITE_SYNC, bh);
654 } 652 }
655 cond_resched(); 653 cond_resched();
656 654
@@ -661,6 +659,8 @@ start_journal_io:
661 } 659 }
662 } 660 }
663 661
662 blk_finish_plug(&plug);
663
664 /* Lo and behold: we have just managed to send a transaction to 664 /* Lo and behold: we have just managed to send a transaction to
665 the log. Before we can commit it, wait for the IO so far to 665 the log. Before we can commit it, wait for the IO so far to
666 complete. Control buffers being written are on the 666 complete. Control buffers being written are on the
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index da1b5e4ffce1..b3713afaaa9e 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -770,7 +770,7 @@ journal_t * journal_init_dev(struct block_device *bdev,
770 journal->j_wbufsize = n; 770 journal->j_wbufsize = n;
771 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 771 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
772 if (!journal->j_wbuf) { 772 if (!journal->j_wbuf) {
773 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 773 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
774 __func__); 774 __func__);
775 goto out_err; 775 goto out_err;
776 } 776 }
@@ -831,7 +831,7 @@ journal_t * journal_init_inode (struct inode *inode)
831 journal->j_wbufsize = n; 831 journal->j_wbufsize = n;
832 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 832 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
833 if (!journal->j_wbuf) { 833 if (!journal->j_wbuf) {
834 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 834 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
835 __func__); 835 __func__);
836 goto out_err; 836 goto out_err;
837 } 837 }
@@ -839,7 +839,7 @@ journal_t * journal_init_inode (struct inode *inode)
839 err = journal_bmap(journal, 0, &blocknr); 839 err = journal_bmap(journal, 0, &blocknr);
840 /* If that failed, give up */ 840 /* If that failed, give up */
841 if (err) { 841 if (err) {
842 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 842 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
843 __func__); 843 __func__);
844 goto out_err; 844 goto out_err;
845 } 845 }
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index d29018307e2e..305a90763154 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -71,7 +71,7 @@
71 * switching hash tables under them. For operations on the lists of entries in 71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used. 72 * the hash table j_revoke_lock is used.
73 * 73 *
74 * Finally, also replay code uses the hash tables but at this moment noone else 74 * Finally, also replay code uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is 75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed. 76 * needed.
77 */ 77 */
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 5b2e4c30a2a1..60d2319651b2 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1392,7 +1392,7 @@ int journal_stop(handle_t *handle)
1392 * by 30x or more... 1392 * by 30x or more...
1393 * 1393 *
1394 * We try and optimize the sleep time against what the underlying disk 1394 * We try and optimize the sleep time against what the underlying disk
1395 * can do, instead of having a static sleep time. This is usefull for 1395 * can do, instead of having a static sleep time. This is useful for
1396 * the case where our storage is so fast that it is more optimal to go 1396 * the case where our storage is so fast that it is more optimal to go
1397 * ahead and force a flush and wait for the transaction to be committed 1397 * ahead and force a flush and wait for the transaction to be committed
1398 * than it is to wait for an arbitrary amount of time for new writers to 1398 * than it is to wait for an arbitrary amount of time for new writers to
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f3ad1598b201..6e28000a4b21 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -105,6 +105,8 @@ static int journal_submit_commit_record(journal_t *journal,
105 int ret; 105 int ret;
106 struct timespec now = current_kernel_time(); 106 struct timespec now = current_kernel_time();
107 107
108 *cbh = NULL;
109
108 if (is_journal_aborted(journal)) 110 if (is_journal_aborted(journal))
109 return 0; 111 return 0;
110 112
@@ -137,9 +139,9 @@ static int journal_submit_commit_record(journal_t *journal,
137 if (journal->j_flags & JBD2_BARRIER && 139 if (journal->j_flags & JBD2_BARRIER &&
138 !JBD2_HAS_INCOMPAT_FEATURE(journal, 140 !JBD2_HAS_INCOMPAT_FEATURE(journal,
139 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) 141 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
140 ret = submit_bh(WRITE_SYNC_PLUG | WRITE_FLUSH_FUA, bh); 142 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
141 else 143 else
142 ret = submit_bh(WRITE_SYNC_PLUG, bh); 144 ret = submit_bh(WRITE_SYNC, bh);
143 145
144 *cbh = bh; 146 *cbh = bh;
145 return ret; 147 return ret;
@@ -329,7 +331,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
329 int tag_bytes = journal_tag_bytes(journal); 331 int tag_bytes = journal_tag_bytes(journal);
330 struct buffer_head *cbh = NULL; /* For transactional checksums */ 332 struct buffer_head *cbh = NULL; /* For transactional checksums */
331 __u32 crc32_sum = ~0; 333 __u32 crc32_sum = ~0;
332 int write_op = WRITE_SYNC; 334 struct blk_plug plug;
333 335
334 /* 336 /*
335 * First job: lock down the current transaction and wait for 337 * First job: lock down the current transaction and wait for
@@ -363,13 +365,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
363 write_lock(&journal->j_state_lock); 365 write_lock(&journal->j_state_lock);
364 commit_transaction->t_state = T_LOCKED; 366 commit_transaction->t_state = T_LOCKED;
365 367
366 /*
367 * Use plugged writes here, since we want to submit several before
368 * we unplug the device. We don't do explicit unplugging in here,
369 * instead we rely on sync_buffer() doing the unplug for us.
370 */
371 if (commit_transaction->t_synchronous_commit)
372 write_op = WRITE_SYNC_PLUG;
373 trace_jbd2_commit_locking(journal, commit_transaction); 368 trace_jbd2_commit_locking(journal, commit_transaction);
374 stats.run.rs_wait = commit_transaction->t_max_wait; 369 stats.run.rs_wait = commit_transaction->t_max_wait;
375 stats.run.rs_locked = jiffies; 370 stats.run.rs_locked = jiffies;
@@ -410,7 +405,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
410 * we do not require it to remember exactly which old buffers it 405 * we do not require it to remember exactly which old buffers it
411 * has reserved. This is consistent with the existing behaviour 406 * has reserved. This is consistent with the existing behaviour
412 * that multiple jbd2_journal_get_write_access() calls to the same 407 * that multiple jbd2_journal_get_write_access() calls to the same
413 * buffer are perfectly permissable. 408 * buffer are perfectly permissible.
414 */ 409 */
415 while (commit_transaction->t_reserved_list) { 410 while (commit_transaction->t_reserved_list) {
416 jh = commit_transaction->t_reserved_list; 411 jh = commit_transaction->t_reserved_list;
@@ -469,8 +464,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
469 if (err) 464 if (err)
470 jbd2_journal_abort(journal, err); 465 jbd2_journal_abort(journal, err);
471 466
467 blk_start_plug(&plug);
472 jbd2_journal_write_revoke_records(journal, commit_transaction, 468 jbd2_journal_write_revoke_records(journal, commit_transaction,
473 write_op); 469 WRITE_SYNC);
470 blk_finish_plug(&plug);
474 471
475 jbd_debug(3, "JBD: commit phase 2\n"); 472 jbd_debug(3, "JBD: commit phase 2\n");
476 473
@@ -497,6 +494,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
497 err = 0; 494 err = 0;
498 descriptor = NULL; 495 descriptor = NULL;
499 bufs = 0; 496 bufs = 0;
497 blk_start_plug(&plug);
500 while (commit_transaction->t_buffers) { 498 while (commit_transaction->t_buffers) {
501 499
502 /* Find the next buffer to be journaled... */ 500 /* Find the next buffer to be journaled... */
@@ -658,7 +656,7 @@ start_journal_io:
658 clear_buffer_dirty(bh); 656 clear_buffer_dirty(bh);
659 set_buffer_uptodate(bh); 657 set_buffer_uptodate(bh);
660 bh->b_end_io = journal_end_buffer_io_sync; 658 bh->b_end_io = journal_end_buffer_io_sync;
661 submit_bh(write_op, bh); 659 submit_bh(WRITE_SYNC, bh);
662 } 660 }
663 cond_resched(); 661 cond_resched();
664 stats.run.rs_blocks_logged += bufs; 662 stats.run.rs_blocks_logged += bufs;
@@ -699,6 +697,8 @@ start_journal_io:
699 __jbd2_journal_abort_hard(journal); 697 __jbd2_journal_abort_hard(journal);
700 } 698 }
701 699
700 blk_finish_plug(&plug);
701
702 /* Lo and behold: we have just managed to send a transaction to 702 /* Lo and behold: we have just managed to send a transaction to
703 the log. Before we can commit it, wait for the IO so far to 703 the log. Before we can commit it, wait for the IO so far to
704 complete. Control buffers being written are on the 704 complete. Control buffers being written are on the
@@ -808,7 +808,7 @@ wait_for_iobuf:
808 if (err) 808 if (err)
809 __jbd2_journal_abort_hard(journal); 809 __jbd2_journal_abort_hard(journal);
810 } 810 }
811 if (!err && !is_journal_aborted(journal)) 811 if (cbh)
812 err = journal_wait_on_commit_record(journal, cbh); 812 err = journal_wait_on_commit_record(journal, cbh);
813 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && 814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 9e4686900f18..e0ec3db1c395 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -473,7 +473,8 @@ int __jbd2_log_space_left(journal_t *journal)
473} 473}
474 474
475/* 475/*
476 * Called under j_state_lock. Returns true if a transaction commit was started. 476 * Called with j_state_lock locked for writing.
477 * Returns true if a transaction commit was started.
477 */ 478 */
478int __jbd2_log_start_commit(journal_t *journal, tid_t target) 479int __jbd2_log_start_commit(journal_t *journal, tid_t target)
479{ 480{
@@ -520,11 +521,13 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
520{ 521{
521 transaction_t *transaction = NULL; 522 transaction_t *transaction = NULL;
522 tid_t tid; 523 tid_t tid;
524 int need_to_start = 0;
523 525
524 read_lock(&journal->j_state_lock); 526 read_lock(&journal->j_state_lock);
525 if (journal->j_running_transaction && !current->journal_info) { 527 if (journal->j_running_transaction && !current->journal_info) {
526 transaction = journal->j_running_transaction; 528 transaction = journal->j_running_transaction;
527 __jbd2_log_start_commit(journal, transaction->t_tid); 529 if (!tid_geq(journal->j_commit_request, transaction->t_tid))
530 need_to_start = 1;
528 } else if (journal->j_committing_transaction) 531 } else if (journal->j_committing_transaction)
529 transaction = journal->j_committing_transaction; 532 transaction = journal->j_committing_transaction;
530 533
@@ -535,6 +538,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
535 538
536 tid = transaction->t_tid; 539 tid = transaction->t_tid;
537 read_unlock(&journal->j_state_lock); 540 read_unlock(&journal->j_state_lock);
541 if (need_to_start)
542 jbd2_log_start_commit(journal, tid);
538 jbd2_log_wait_commit(journal, tid); 543 jbd2_log_wait_commit(journal, tid);
539 return 1; 544 return 1;
540} 545}
@@ -912,7 +917,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev,
912 journal->j_wbufsize = n; 917 journal->j_wbufsize = n;
913 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 918 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
914 if (!journal->j_wbuf) { 919 if (!journal->j_wbuf) {
915 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 920 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
916 __func__); 921 __func__);
917 goto out_err; 922 goto out_err;
918 } 923 }
@@ -978,7 +983,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
978 journal->j_wbufsize = n; 983 journal->j_wbufsize = n;
979 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); 984 journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL);
980 if (!journal->j_wbuf) { 985 if (!journal->j_wbuf) {
981 printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", 986 printk(KERN_ERR "%s: Can't allocate bhs for commit thread\n",
982 __func__); 987 __func__);
983 goto out_err; 988 goto out_err;
984 } 989 }
@@ -986,7 +991,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode)
986 err = jbd2_journal_bmap(journal, 0, &blocknr); 991 err = jbd2_journal_bmap(journal, 0, &blocknr);
987 /* If that failed, give up */ 992 /* If that failed, give up */
988 if (err) { 993 if (err) {
989 printk(KERN_ERR "%s: Cannnot locate journal superblock\n", 994 printk(KERN_ERR "%s: Cannot locate journal superblock\n",
990 __func__); 995 __func__);
991 goto out_err; 996 goto out_err;
992 } 997 }
@@ -2408,10 +2413,12 @@ const char *jbd2_dev_to_name(dev_t device)
2408 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); 2413 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2409 if (!new_dev) 2414 if (!new_dev)
2410 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ 2415 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
2416 bd = bdget(device);
2411 spin_lock(&devname_cache_lock); 2417 spin_lock(&devname_cache_lock);
2412 if (devcache[i]) { 2418 if (devcache[i]) {
2413 if (devcache[i]->device == device) { 2419 if (devcache[i]->device == device) {
2414 kfree(new_dev); 2420 kfree(new_dev);
2421 bdput(bd);
2415 ret = devcache[i]->devname; 2422 ret = devcache[i]->devname;
2416 spin_unlock(&devname_cache_lock); 2423 spin_unlock(&devname_cache_lock);
2417 return ret; 2424 return ret;
@@ -2420,7 +2427,6 @@ const char *jbd2_dev_to_name(dev_t device)
2420 } 2427 }
2421 devcache[i] = new_dev; 2428 devcache[i] = new_dev;
2422 devcache[i]->device = device; 2429 devcache[i]->device = device;
2423 bd = bdget(device);
2424 if (bd) { 2430 if (bd) {
2425 bdevname(bd, devcache[i]->devname); 2431 bdevname(bd, devcache[i]->devname);
2426 bdput(bd); 2432 bdput(bd);
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9ad321fd63fd..69fd93588118 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -71,7 +71,7 @@
71 * switching hash tables under them. For operations on the lists of entries in 71 * switching hash tables under them. For operations on the lists of entries in
72 * the hash table j_revoke_lock is used. 72 * the hash table j_revoke_lock is used.
73 * 73 *
74 * Finally, also replay code uses the hash tables but at this moment noone else 74 * Finally, also replay code uses the hash tables but at this moment no one else
75 * can touch them (filesystem isn't mounted yet) and hence no locking is 75 * can touch them (filesystem isn't mounted yet) and hence no locking is
76 * needed. 76 * needed.
77 */ 77 */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index faad2bd787c7..05fa77a23711 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -117,10 +117,10 @@ static inline void update_t_max_wait(transaction_t *transaction)
117static int start_this_handle(journal_t *journal, handle_t *handle, 117static int start_this_handle(journal_t *journal, handle_t *handle,
118 int gfp_mask) 118 int gfp_mask)
119{ 119{
120 transaction_t *transaction; 120 transaction_t *transaction, *new_transaction = NULL;
121 int needed; 121 tid_t tid;
122 int nblocks = handle->h_buffer_credits; 122 int needed, need_to_start;
123 transaction_t *new_transaction = NULL; 123 int nblocks = handle->h_buffer_credits;
124 124
125 if (nblocks > journal->j_max_transaction_buffers) { 125 if (nblocks > journal->j_max_transaction_buffers) {
126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", 126 printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n",
@@ -222,8 +222,11 @@ repeat:
222 atomic_sub(nblocks, &transaction->t_outstanding_credits); 222 atomic_sub(nblocks, &transaction->t_outstanding_credits);
223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait, 223 prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
224 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
225 __jbd2_log_start_commit(journal, transaction->t_tid); 225 tid = transaction->t_tid;
226 need_to_start = !tid_geq(journal->j_commit_request, tid);
226 read_unlock(&journal->j_state_lock); 227 read_unlock(&journal->j_state_lock);
228 if (need_to_start)
229 jbd2_log_start_commit(journal, tid);
227 schedule(); 230 schedule();
228 finish_wait(&journal->j_wait_transaction_locked, &wait); 231 finish_wait(&journal->j_wait_transaction_locked, &wait);
229 goto repeat; 232 goto repeat;
@@ -442,7 +445,8 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
442{ 445{
443 transaction_t *transaction = handle->h_transaction; 446 transaction_t *transaction = handle->h_transaction;
444 journal_t *journal = transaction->t_journal; 447 journal_t *journal = transaction->t_journal;
445 int ret; 448 tid_t tid;
449 int need_to_start, ret;
446 450
447 /* If we've had an abort of any type, don't even think about 451 /* If we've had an abort of any type, don't even think about
448 * actually doing the restart! */ 452 * actually doing the restart! */
@@ -465,8 +469,11 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask)
465 spin_unlock(&transaction->t_handle_lock); 469 spin_unlock(&transaction->t_handle_lock);
466 470
467 jbd_debug(2, "restarting handle %p\n", handle); 471 jbd_debug(2, "restarting handle %p\n", handle);
468 __jbd2_log_start_commit(journal, transaction->t_tid); 472 tid = transaction->t_tid;
473 need_to_start = !tid_geq(journal->j_commit_request, tid);
469 read_unlock(&journal->j_state_lock); 474 read_unlock(&journal->j_state_lock);
475 if (need_to_start)
476 jbd2_log_start_commit(journal, tid);
470 477
471 lock_map_release(&handle->h_lockdep_map); 478 lock_map_release(&handle->h_lockdep_map);
472 handle->h_buffer_credits = nblocks; 479 handle->h_buffer_credits = nblocks;
@@ -1396,7 +1403,7 @@ int jbd2_journal_stop(handle_t *handle)
1396 1403
1397 /* 1404 /*
1398 * Once we drop t_updates, if it goes to zero the transaction 1405 * Once we drop t_updates, if it goes to zero the transaction
1399 * could start commiting on us and eventually disappear. So 1406 * could start committing on us and eventually disappear. So
1400 * once we do this, we must not dereference transaction 1407 * once we do this, we must not dereference transaction
1401 * pointer again. 1408 * pointer again.
1402 */ 1409 */
diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO
index 5d3ea4070f01..ca28964abd4b 100644
--- a/fs/jffs2/TODO
+++ b/fs/jffs2/TODO
@@ -11,7 +11,7 @@
11 - checkpointing (do we need this? scan is quite fast) 11 - checkpointing (do we need this? scan is quite fast)
12 - make the scan code populate real inodes so read_inode just after 12 - make the scan code populate real inodes so read_inode just after
13 mount doesn't have to read the flash twice for large files. 13 mount doesn't have to read the flash twice for large files.
14 Make this a per-inode option, changable with chattr, so you can 14 Make this a per-inode option, changeable with chattr, so you can
15 decide which inodes should be in-core immediately after mount. 15 decide which inodes should be in-core immediately after mount.
16 - test, test, test 16 - test, test, test
17 17
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 95b79672150a..828a0e1ea438 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -402,7 +402,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
402 402
403 if (name[0] != '\0') 403 if (name[0] != '\0')
404 return -EINVAL; 404 return -EINVAL;
405 if (!is_owner_or_cap(dentry->d_inode)) 405 if (!inode_owner_or_capable(dentry->d_inode))
406 return -EPERM; 406 return -EPERM;
407 407
408 if (value) { 408 if (value) {
diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c
index fd05a0b9431d..5a001020c542 100644
--- a/fs/jffs2/compr_zlib.c
+++ b/fs/jffs2/compr_zlib.c
@@ -40,12 +40,13 @@ static z_stream inf_strm, def_strm;
40 40
41static int __init alloc_workspaces(void) 41static int __init alloc_workspaces(void)
42{ 42{
43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 43 def_strm.workspace = vmalloc(zlib_deflate_workspacesize(MAX_WBITS,
44 MAX_MEM_LEVEL));
44 if (!def_strm.workspace) { 45 if (!def_strm.workspace) {
45 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize()); 46 printk(KERN_WARNING "Failed to allocate %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL));
46 return -ENOMEM; 47 return -ENOMEM;
47 } 48 }
48 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize())); 49 D1(printk(KERN_DEBUG "Allocated %d bytes for deflate workspace\n", zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL)));
49 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 50 inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
50 if (!inf_strm.workspace) { 51 if (!inf_strm.workspace) {
51 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize()); 52 printk(KERN_WARNING "Failed to allocate %d bytes for inflate workspace\n", zlib_inflate_workspacesize());
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 92978658ed18..82faddd1f321 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -215,8 +215,7 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
215 no chance of AB-BA deadlock involving its f->sem). */ 215 no chance of AB-BA deadlock involving its f->sem). */
216 mutex_unlock(&f->sem); 216 mutex_unlock(&f->sem);
217 217
218 ret = jffs2_do_create(c, dir_f, f, ri, 218 ret = jffs2_do_create(c, dir_f, f, ri, &dentry->d_name);
219 dentry->d_name.name, dentry->d_name.len);
220 if (ret) 219 if (ret)
221 goto fail; 220 goto fail;
222 221
@@ -386,7 +385,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 385
387 jffs2_complete_reservation(c); 386 jffs2_complete_reservation(c);
388 387
389 ret = jffs2_init_security(inode, dir_i); 388 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
390 if (ret) 389 if (ret)
391 goto fail; 390 goto fail;
392 391
@@ -530,7 +529,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
530 529
531 jffs2_complete_reservation(c); 530 jffs2_complete_reservation(c);
532 531
533 ret = jffs2_init_security(inode, dir_i); 532 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
534 if (ret) 533 if (ret)
535 goto fail; 534 goto fail;
536 535
@@ -703,7 +702,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
703 702
704 jffs2_complete_reservation(c); 703 jffs2_complete_reservation(c);
705 704
706 ret = jffs2_init_security(inode, dir_i); 705 ret = jffs2_init_security(inode, dir_i, &dentry->d_name);
707 if (ret) 706 if (ret)
708 goto fail; 707 goto fail;
709 708
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index 5a53d9bdb2b5..e4619b00f7c5 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -401,7 +401,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
401 struct jffs2_raw_inode *ri, unsigned char *buf, 401 struct jffs2_raw_inode *ri, unsigned char *buf,
402 uint32_t offset, uint32_t writelen, uint32_t *retlen); 402 uint32_t offset, uint32_t writelen, uint32_t *retlen);
403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, 403int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f,
404 struct jffs2_raw_inode *ri, const char *name, int namelen); 404 struct jffs2_raw_inode *ri, const struct qstr *qstr);
405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name, 405int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, const char *name,
406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time); 406 int namelen, struct jffs2_inode_info *dead_f, uint32_t time);
407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino, 407int jffs2_do_link(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint32_t ino,
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index d32ee9412cb9..2ab1a0d91210 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -24,7 +24,7 @@
24 * 24 *
25 * Returns: 0 if the data CRC is correct; 25 * Returns: 0 if the data CRC is correct;
26 * 1 - if incorrect; 26 * 1 - if incorrect;
27 * error code if an error occured. 27 * error code if an error occurred.
28 */ 28 */
29static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) 29static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn)
30{ 30{
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 239f51216a68..cfeb7164b085 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -23,14 +23,15 @@
23#include "nodelist.h" 23#include "nodelist.h"
24 24
25/* ---- Initial Security Label Attachment -------------- */ 25/* ---- Initial Security Label Attachment -------------- */
26int jffs2_init_security(struct inode *inode, struct inode *dir) 26int jffs2_init_security(struct inode *inode, struct inode *dir,
27 const struct qstr *qstr)
27{ 28{
28 int rc; 29 int rc;
29 size_t len; 30 size_t len;
30 void *value; 31 void *value;
31 char *name; 32 char *name;
32 33
33 rc = security_inode_init_security(inode, dir, &name, &value, &len); 34 rc = security_inode_init_security(inode, dir, qstr, &name, &value, &len);
34 if (rc) { 35 if (rc) {
35 if (rc == -EOPNOTSUPP) 36 if (rc == -EOPNOTSUPP)
36 return 0; 37 return 0;
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 800171dca53b..e537fb0e0184 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -121,7 +121,7 @@ int jffs2_sum_add_inode_mem(struct jffs2_summary *s, struct jffs2_raw_inode *ri,
121 temp->nodetype = ri->nodetype; 121 temp->nodetype = ri->nodetype;
122 temp->inode = ri->ino; 122 temp->inode = ri->ino;
123 temp->version = ri->version; 123 temp->version = ri->version;
124 temp->offset = cpu_to_je32(ofs); /* relative offset from the begining of the jeb */ 124 temp->offset = cpu_to_je32(ofs); /* relative offset from the beginning of the jeb */
125 temp->totlen = ri->totlen; 125 temp->totlen = ri->totlen;
126 temp->next = NULL; 126 temp->next = NULL;
127 127
@@ -139,7 +139,7 @@ int jffs2_sum_add_dirent_mem(struct jffs2_summary *s, struct jffs2_raw_dirent *r
139 139
140 temp->nodetype = rd->nodetype; 140 temp->nodetype = rd->nodetype;
141 temp->totlen = rd->totlen; 141 temp->totlen = rd->totlen;
142 temp->offset = cpu_to_je32(ofs); /* relative from the begining of the jeb */ 142 temp->offset = cpu_to_je32(ofs); /* relative from the beginning of the jeb */
143 temp->pino = rd->pino; 143 temp->pino = rd->pino;
144 temp->version = rd->version; 144 temp->version = rd->version;
145 temp->ino = rd->ino; 145 temp->ino = rd->ino;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 07ee1546b2fa..4515bea0268f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1116,7 +1116,7 @@ int jffs2_write_nand_cleanmarker(struct jffs2_sb_info *c,
1116 1116
1117/* 1117/*
1118 * On NAND we try to mark this block bad. If the block was erased more 1118 * On NAND we try to mark this block bad. If the block was erased more
1119 * than MAX_ERASE_FAILURES we mark it finaly bad. 1119 * than MAX_ERASE_FAILURES we mark it finally bad.
1120 * Don't care about failures. This block remains on the erase-pending 1120 * Don't care about failures. This block remains on the erase-pending
1121 * or badblock list as long as nobody manipulates the flash with 1121 * or badblock list as long as nobody manipulates the flash with
1122 * a bootloader or something like that. 1122 * a bootloader or something like that.
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index c819eb0e982d..30d175b6d290 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -424,7 +424,9 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
424 return ret; 424 return ret;
425} 425}
426 426
427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, struct jffs2_inode_info *f, struct jffs2_raw_inode *ri, const char *name, int namelen) 427int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
428 struct jffs2_inode_info *f, struct jffs2_raw_inode *ri,
429 const struct qstr *qstr)
428{ 430{
429 struct jffs2_raw_dirent *rd; 431 struct jffs2_raw_dirent *rd;
430 struct jffs2_full_dnode *fn; 432 struct jffs2_full_dnode *fn;
@@ -466,15 +468,15 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
466 mutex_unlock(&f->sem); 468 mutex_unlock(&f->sem);
467 jffs2_complete_reservation(c); 469 jffs2_complete_reservation(c);
468 470
469 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); 471 ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode, qstr);
470 if (ret) 472 if (ret)
471 return ret; 473 return ret;
472 ret = jffs2_init_acl_post(&f->vfs_inode); 474 ret = jffs2_init_acl_post(&f->vfs_inode);
473 if (ret) 475 if (ret)
474 return ret; 476 return ret;
475 477
476 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 478 ret = jffs2_reserve_space(c, sizeof(*rd)+qstr->len, &alloclen,
477 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 479 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(qstr->len));
478 480
479 if (ret) { 481 if (ret) {
480 /* Eep. */ 482 /* Eep. */
@@ -493,19 +495,19 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str
493 495
494 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); 496 rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
495 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); 497 rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT);
496 rd->totlen = cpu_to_je32(sizeof(*rd) + namelen); 498 rd->totlen = cpu_to_je32(sizeof(*rd) + qstr->len);
497 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4)); 499 rd->hdr_crc = cpu_to_je32(crc32(0, rd, sizeof(struct jffs2_unknown_node)-4));
498 500
499 rd->pino = cpu_to_je32(dir_f->inocache->ino); 501 rd->pino = cpu_to_je32(dir_f->inocache->ino);
500 rd->version = cpu_to_je32(++dir_f->highest_version); 502 rd->version = cpu_to_je32(++dir_f->highest_version);
501 rd->ino = ri->ino; 503 rd->ino = ri->ino;
502 rd->mctime = ri->ctime; 504 rd->mctime = ri->ctime;
503 rd->nsize = namelen; 505 rd->nsize = qstr->len;
504 rd->type = DT_REG; 506 rd->type = DT_REG;
505 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8)); 507 rd->node_crc = cpu_to_je32(crc32(0, rd, sizeof(*rd)-8));
506 rd->name_crc = cpu_to_je32(crc32(0, name, namelen)); 508 rd->name_crc = cpu_to_je32(crc32(0, qstr->name, qstr->len));
507 509
508 fd = jffs2_write_dirent(c, dir_f, rd, name, namelen, ALLOC_NORMAL); 510 fd = jffs2_write_dirent(c, dir_f, rd, qstr->name, qstr->len, ALLOC_NORMAL);
509 511
510 jffs2_free_raw_dirent(rd); 512 jffs2_free_raw_dirent(rd);
511 513
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4f9cc0482949..3e93cdd19005 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
31 * is used to release xattr name/value pair and detach from c->xattrindex. 31 * is used to release xattr name/value pair and detach from c->xattrindex.
32 * reclaim_xattr_datum(c) 32 * reclaim_xattr_datum(c)
33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when 33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold 34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
35 * is hard coded as 32KiB. 35 * is hard coded as 32KiB.
36 * do_verify_xattr_datum(c, xd) 36 * do_verify_xattr_datum(c, xd)
37 * is used to load the xdatum informations without name/value pair from the medium. 37 * is used to load the xdatum informations without name/value pair from the medium.
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index cf4f5759b42b..7be4beb306f3 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -121,10 +121,11 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
121#endif /* CONFIG_JFFS2_FS_XATTR */ 121#endif /* CONFIG_JFFS2_FS_XATTR */
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir,
125 const struct qstr *qstr);
125extern const struct xattr_handler jffs2_security_xattr_handler; 126extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 127#else
127#define jffs2_init_security(inode,dir) (0) 128#define jffs2_init_security(inode,dir,qstr) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 129#endif /* CONFIG_JFFS2_FS_SECURITY */
129 130
130#endif /* _JFFS2_FS_XATTR_H_ */ 131#endif /* _JFFS2_FS_XATTR_H_ */
diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile
index 3adb6395e42d..a58fa72d7e59 100644
--- a/fs/jfs/Makefile
+++ b/fs/jfs/Makefile
@@ -13,4 +13,4 @@ jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \
13 13
14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o 14jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o
15 15
16EXTRA_CFLAGS += -D_JFS_4K 16ccflags-y := -D_JFS_4K
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9978803ceedc..eddbb373209e 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -352,7 +352,6 @@ const struct address_space_operations jfs_aops = {
352 .readpages = jfs_readpages, 352 .readpages = jfs_readpages,
353 .writepage = jfs_writepage, 353 .writepage = jfs_writepage,
354 .writepages = jfs_writepages, 354 .writepages = jfs_writepages,
355 .sync_page = block_sync_page,
356 .write_begin = jfs_write_begin, 355 .write_begin = jfs_write_begin,
357 .write_end = nobh_write_end, 356 .write_end = nobh_write_end,
358 .bmap = jfs_bmap, 357 .bmap = jfs_bmap,
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index afe222bf300f..6f98a1866776 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -72,7 +72,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
72 if (err) 72 if (err)
73 return err; 73 return err;
74 74
75 if (!is_owner_or_cap(inode)) { 75 if (!inode_owner_or_capable(inode)) {
76 err = -EACCES; 76 err = -EACCES;
77 goto setflags_out; 77 goto setflags_out;
78 } 78 }
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index c92ea3b3ea5e..4496872cf4e7 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1649,7 +1649,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1649 } 1649 }
1650 1650
1651 /* search the tree within the dmap control page for 1651 /* search the tree within the dmap control page for
1652 * sufficent free space. if sufficient free space is found, 1652 * sufficient free space. if sufficient free space is found,
1653 * dbFindLeaf() returns the index of the leaf at which 1653 * dbFindLeaf() returns the index of the leaf at which
1654 * free space was found. 1654 * free space was found.
1655 */ 1655 */
@@ -2744,7 +2744,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
2744 /* check which (leafno or buddy) is the left buddy. 2744 /* check which (leafno or buddy) is the left buddy.
2745 * the left buddy gets to claim the blocks resulting 2745 * the left buddy gets to claim the blocks resulting
2746 * from the join while the right gets to claim none. 2746 * from the join while the right gets to claim none.
2747 * the left buddy is also eligable to participate in 2747 * the left buddy is also eligible to participate in
2748 * a join at the next higher level while the right 2748 * a join at the next higher level while the right
2749 * is not. 2749 * is not.
2750 * 2750 *
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5d3bbd10f8db..e5fe8506ed16 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr)
126 126
127 /* allocate the disk blocks for the extent. initially, extBalloc() 127 /* allocate the disk blocks for the extent. initially, extBalloc()
128 * will try to allocate disk blocks for the requested size (xlen). 128 * will try to allocate disk blocks for the requested size (xlen).
129 * if this fails (xlen contiguous free blocks not avaliable), it'll 129 * if this fails (xlen contiguous free blocks not available), it'll
130 * try to allocate a smaller number of blocks (producing a smaller 130 * try to allocate a smaller number of blocks (producing a smaller
131 * extent), with this smaller number of blocks consisting of the 131 * extent), with this smaller number of blocks consisting of the
132 * requested number of blocks rounded down to the next smaller 132 * requested number of blocks rounded down to the next smaller
@@ -481,7 +481,7 @@ int extFill(struct inode *ip, xad_t * xp)
481 * 481 *
482 * initially, we will try to allocate disk blocks for the 482 * initially, we will try to allocate disk blocks for the
483 * requested size (nblocks). if this fails (nblocks 483 * requested size (nblocks). if this fails (nblocks
484 * contiguous free blocks not avaliable), we'll try to allocate 484 * contiguous free blocks not available), we'll try to allocate
485 * a smaller number of blocks (producing a smaller extent), with 485 * a smaller number of blocks (producing a smaller extent), with
486 * this smaller number of blocks consisting of the requested 486 * this smaller number of blocks consisting of the requested
487 * number of blocks rounded down to the next smaller power of 2 487 * number of blocks rounded down to the next smaller power of 2
@@ -575,7 +575,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
575 * to a new set of blocks. If moving the extent, we initially 575 * to a new set of blocks. If moving the extent, we initially
576 * will try to allocate disk blocks for the requested size 576 * will try to allocate disk blocks for the requested size
577 * (newnblks). if this fails (new contiguous free blocks not 577 * (newnblks). if this fails (new contiguous free blocks not
578 * avaliable), we'll try to allocate a smaller number of 578 * available), we'll try to allocate a smaller number of
579 * blocks (producing a smaller extent), with this smaller 579 * blocks (producing a smaller extent), with this smaller
580 * number of blocks consisting of the requested number of 580 * number of blocks consisting of the requested number of
581 * blocks rounded down to the next smaller power of 2 581 * blocks rounded down to the next smaller power of 2
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 3a09423b6c22..ed53a4740168 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1069,7 +1069,7 @@ int diFree(struct inode *ip)
1069 */ 1069 */
1070 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { 1070 if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) {
1071 /* in preparation for removing the iag from the 1071 /* in preparation for removing the iag from the
1072 * ag extent free list, read the iags preceeding 1072 * ag extent free list, read the iags preceding
1073 * and following the iag on the ag extent free 1073 * and following the iag on the ag extent free
1074 * list. 1074 * list.
1075 */ 1075 */
@@ -1095,7 +1095,7 @@ int diFree(struct inode *ip)
1095 int inofreefwd = le32_to_cpu(iagp->inofreefwd); 1095 int inofreefwd = le32_to_cpu(iagp->inofreefwd);
1096 1096
1097 /* in preparation for removing the iag from the 1097 /* in preparation for removing the iag from the
1098 * ag inode free list, read the iags preceeding 1098 * ag inode free list, read the iags preceding
1099 * and following the iag on the ag inode free 1099 * and following the iag on the ag inode free
1100 * list. before reading these iags, we must make 1100 * list. before reading these iags, we must make
1101 * sure that we already don't have them in hand 1101 * sure that we already don't have them in hand
@@ -1681,7 +1681,7 @@ diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip)
1681 * try to allocate a new extent of free inodes. 1681 * try to allocate a new extent of free inodes.
1682 */ 1682 */
1683 if (addext) { 1683 if (addext) {
1684 /* if free space is not avaliable for this new extent, try 1684 /* if free space is not available for this new extent, try
1685 * below to allocate a free and existing (already backed) 1685 * below to allocate a free and existing (already backed)
1686 * inode from the ag. 1686 * inode from the ag.
1687 */ 1687 */
@@ -2036,7 +2036,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2036 2036
2037 /* check if this is the last free inode within the iag. 2037 /* check if this is the last free inode within the iag.
2038 * if so, it will have to be removed from the ag free 2038 * if so, it will have to be removed from the ag free
2039 * inode list, so get the iags preceeding and following 2039 * inode list, so get the iags preceding and following
2040 * it on the list. 2040 * it on the list.
2041 */ 2041 */
2042 if (iagp->nfreeinos == cpu_to_le32(1)) { 2042 if (iagp->nfreeinos == cpu_to_le32(1)) {
@@ -2208,7 +2208,7 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2208 2208
2209 /* check if this is the last free extent within the 2209 /* check if this is the last free extent within the
2210 * iag. if so, the iag must be removed from the ag 2210 * iag. if so, the iag must be removed from the ag
2211 * free extent list, so get the iags preceeding and 2211 * free extent list, so get the iags preceding and
2212 * following the iag on this list. 2212 * following the iag on this list.
2213 */ 2213 */
2214 if (iagp->nfreeexts == cpu_to_le32(1)) { 2214 if (iagp->nfreeexts == cpu_to_le32(1)) {
@@ -2504,7 +2504,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2504 } 2504 }
2505 2505
2506 2506
2507 /* get the next avaliable iag number */ 2507 /* get the next available iag number */
2508 iagno = imap->im_nextiag; 2508 iagno = imap->im_nextiag;
2509 2509
2510 /* make sure that we have not exceeded the maximum inode 2510 /* make sure that we have not exceeded the maximum inode
@@ -2615,7 +2615,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
2615 2615
2616 duplicateIXtree(sb, blkno, xlen, &xaddr); 2616 duplicateIXtree(sb, blkno, xlen, &xaddr);
2617 2617
2618 /* update the next avaliable iag number */ 2618 /* update the next available iag number */
2619 imap->im_nextiag += 1; 2619 imap->im_nextiag += 1;
2620 2620
2621 /* Add the iag to the iag free list so we don't lose the iag 2621 /* Add the iag to the iag free list so we don't lose the iag
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 9236bc49ae7f..e38c21598850 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -288,7 +288,7 @@ struct lrd {
288 /* 288 /*
289 * SYNCPT: log sync point 289 * SYNCPT: log sync point
290 * 290 *
291 * replay log upto syncpt address specified; 291 * replay log up to syncpt address specified;
292 */ 292 */
293 struct { 293 struct {
294 __le32 sync; /* 4: syncpt address (0 = here) */ 294 __le32 sync; /* 4: syncpt address (0 = here) */
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 48b44bd8267b..6740d34cd82b 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -583,7 +583,6 @@ static void metapage_invalidatepage(struct page *page, unsigned long offset)
583const struct address_space_operations jfs_metapage_aops = { 583const struct address_space_operations jfs_metapage_aops = {
584 .readpage = metapage_readpage, 584 .readpage = metapage_readpage,
585 .writepage = metapage_writepage, 585 .writepage = metapage_writepage,
586 .sync_page = block_sync_page,
587 .releasepage = metapage_releasepage, 586 .releasepage = metapage_releasepage,
588 .invalidatepage = metapage_invalidatepage, 587 .invalidatepage = metapage_invalidatepage,
589 .set_page_dirty = __set_page_dirty_nobuffers, 588 .set_page_dirty = __set_page_dirty_nobuffers,
diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h
index d94f8d9e87d7..a78beda85f68 100644
--- a/fs/jfs/jfs_metapage.h
+++ b/fs/jfs/jfs_metapage.h
@@ -75,7 +75,7 @@ extern void grab_metapage(struct metapage *);
75extern void force_metapage(struct metapage *); 75extern void force_metapage(struct metapage *);
76 76
77/* 77/*
78 * hold_metapage and put_metapage are used in conjuction. The page lock 78 * hold_metapage and put_metapage are used in conjunction. The page lock
79 * is not dropped between the two, so no other threads can get or release 79 * is not dropped between the two, so no other threads can get or release
80 * the metapage 80 * the metapage
81 */ 81 */
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 9466957ec841..f6cc0c09ec63 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -636,7 +636,7 @@ struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
636 * the inode of the page and available to all anonymous 636 * the inode of the page and available to all anonymous
637 * transactions until txCommit() time at which point 637 * transactions until txCommit() time at which point
638 * they are transferred to the transaction tlock list of 638 * they are transferred to the transaction tlock list of
639 * the commiting transaction of the inode) 639 * the committing transaction of the inode)
640 */ 640 */
641 if (xtid == 0) { 641 if (xtid == 0) {
642 tlck->tid = tid; 642 tlck->tid = tid;
diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h
index 88b6cc535bf2..e9e100fd7c09 100644
--- a/fs/jfs/jfs_xattr.h
+++ b/fs/jfs/jfs_xattr.h
@@ -62,10 +62,11 @@ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
62extern int jfs_removexattr(struct dentry *, const char *); 62extern int jfs_removexattr(struct dentry *, const char *);
63 63
64#ifdef CONFIG_JFS_SECURITY 64#ifdef CONFIG_JFS_SECURITY
65extern int jfs_init_security(tid_t, struct inode *, struct inode *); 65extern int jfs_init_security(tid_t, struct inode *, struct inode *,
66 const struct qstr *);
66#else 67#else
67static inline int jfs_init_security(tid_t tid, struct inode *inode, 68static inline int jfs_init_security(tid_t tid, struct inode *inode,
68 struct inode *dir) 69 struct inode *dir, const struct qstr *qstr)
69{ 70{
70 return 0; 71 return 0;
71} 72}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..eaaf2b511e89 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -115,7 +115,7 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
115 if (rc) 115 if (rc)
116 goto out3; 116 goto out3;
117 117
118 rc = jfs_init_security(tid, ip, dip); 118 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
119 if (rc) { 119 if (rc) {
120 txAbort(tid, 0); 120 txAbort(tid, 0);
121 goto out3; 121 goto out3;
@@ -253,7 +253,7 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
253 if (rc) 253 if (rc)
254 goto out3; 254 goto out3;
255 255
256 rc = jfs_init_security(tid, ip, dip); 256 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
257 if (rc) { 257 if (rc) {
258 txAbort(tid, 0); 258 txAbort(tid, 0);
259 goto out3; 259 goto out3;
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
809 if (ip->i_nlink == JFS_LINK_MAX) 809 if (ip->i_nlink == JFS_LINK_MAX)
810 return -EMLINK; 810 return -EMLINK;
811 811
812 if (ip->i_nlink == 0)
813 return -ENOENT;
814
815 dquot_initialize(dir); 812 dquot_initialize(dir);
816 813
817 tid = txBegin(ip->i_sb, 0); 814 tid = txBegin(ip->i_sb, 0);
@@ -932,7 +929,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
932 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); 929 mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT);
933 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); 930 mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD);
934 931
935 rc = jfs_init_security(tid, ip, dip); 932 rc = jfs_init_security(tid, ip, dip, &dentry->d_name);
936 if (rc) 933 if (rc)
937 goto out3; 934 goto out3;
938 935
@@ -1395,7 +1392,7 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
1395 if (rc) 1392 if (rc)
1396 goto out3; 1393 goto out3;
1397 1394
1398 rc = jfs_init_security(tid, ip, dir); 1395 rc = jfs_init_security(tid, ip, dir, &dentry->d_name);
1399 if (rc) { 1396 if (rc) {
1400 txAbort(tid, 0); 1397 txAbort(tid, 0);
1401 goto out3; 1398 goto out3;
@@ -1600,7 +1597,7 @@ out:
1600 1597
1601static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd) 1598static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
1602{ 1599{
1603 if (nd->flags & LOOKUP_RCU) 1600 if (nd && nd->flags & LOOKUP_RCU)
1604 return -ECHILD; 1601 return -ECHILD;
1605 /* 1602 /*
1606 * This is not negative dentry. Always valid. 1603 * This is not negative dentry. Always valid.
diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c
index 1aba0039f1c9..8ea5efb5a34e 100644
--- a/fs/jfs/resize.c
+++ b/fs/jfs/resize.c
@@ -57,7 +57,7 @@
57 * 2. compute new FSCKSize from new LVSize; 57 * 2. compute new FSCKSize from new LVSize;
58 * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where 58 * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where
59 * assert(new FSSize >= old FSSize), 59 * assert(new FSSize >= old FSSize),
60 * i.e., file system must not be shrinked; 60 * i.e., file system must not be shrunk;
61 */ 61 */
62int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) 62int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
63{ 63{
@@ -182,7 +182,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize)
182 */ 182 */
183 newFSSize = newLVSize - newLogSize - newFSCKSize; 183 newFSSize = newLVSize - newLogSize - newFSCKSize;
184 184
185 /* file system cannot be shrinked */ 185 /* file system cannot be shrunk */
186 if (newFSSize < bmp->db_mapsize) { 186 if (newFSSize < bmp->db_mapsize) {
187 rc = -EINVAL; 187 rc = -EINVAL;
188 goto out; 188 goto out;
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index eeca48a031ab..06c8a67cbe76 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -644,7 +644,7 @@ static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
644 644
645/* Read data from quotafile - avoid pagecache and such because we cannot afford 645/* Read data from quotafile - avoid pagecache and such because we cannot afford
646 * acquiring the locks... As quota files are never truncated and quota code 646 * acquiring the locks... As quota files are never truncated and quota code
647 * itself serializes the operations (and noone else should touch the files) 647 * itself serializes the operations (and no one else should touch the files)
648 * we don't have to be afraid of races */ 648 * we don't have to be afraid of races */
649static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, 649static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data,
650 size_t len, loff_t off) 650 size_t len, loff_t off)
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 2d7f165d0f1d..24838f1eeee5 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -678,7 +678,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
678 struct posix_acl *acl; 678 struct posix_acl *acl;
679 int rc; 679 int rc;
680 680
681 if (!is_owner_or_cap(inode)) 681 if (!inode_owner_or_capable(inode))
682 return -EPERM; 682 return -EPERM;
683 683
684 /* 684 /*
@@ -1091,7 +1091,8 @@ int jfs_removexattr(struct dentry *dentry, const char *name)
1091} 1091}
1092 1092
1093#ifdef CONFIG_JFS_SECURITY 1093#ifdef CONFIG_JFS_SECURITY
1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir) 1094int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
1095 const struct qstr *qstr)
1095{ 1096{
1096 int rc; 1097 int rc;
1097 size_t len; 1098 size_t len;
@@ -1099,7 +1100,8 @@ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir)
1099 char *suffix; 1100 char *suffix;
1100 char *name; 1101 char *name;
1101 1102
1102 rc = security_inode_init_security(inode, dir, &suffix, &value, &len); 1103 rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
1104 &len);
1103 if (rc) { 1105 if (rc) {
1104 if (rc == -EOPNOTSUPP) 1106 if (rc == -EOPNOTSUPP)
1105 return 0; 1107 return 0;
diff --git a/fs/locks.c b/fs/locks.c
index 0f3998291f78..0a4f50dfadfb 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -145,7 +145,6 @@ static DEFINE_SPINLOCK(file_lock_lock);
145 145
146/* 146/*
147 * Protects the two list heads above, plus the inode->i_flock list 147 * Protects the two list heads above, plus the inode->i_flock list
148 * FIXME: should use a spinlock, once lockd and ceph are ready.
149 */ 148 */
150void lock_flocks(void) 149void lock_flocks(void)
151{ 150{
@@ -415,17 +414,7 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
415 fl->fl_ops = NULL; 414 fl->fl_ops = NULL;
416 fl->fl_lmops = NULL; 415 fl->fl_lmops = NULL;
417 416
418 switch (l->l_type) { 417 return assign_type(fl, l->l_type);
419 case F_RDLCK:
420 case F_WRLCK:
421 case F_UNLCK:
422 fl->fl_type = l->l_type;
423 break;
424 default:
425 return -EINVAL;
426 }
427
428 return (0);
429} 418}
430#endif 419#endif
431 420
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
index 44bbfd249abc..961f02b86d97 100644
--- a/fs/logfs/compr.c
+++ b/fs/logfs/compr.c
@@ -81,7 +81,7 @@ error:
81 81
82int __init logfs_compr_init(void) 82int __init logfs_compr_init(void)
83{ 83{
84 size_t size = max(zlib_deflate_workspacesize(), 84 size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
85 zlib_inflate_workspacesize()); 85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size); 86 stream.workspace = vmalloc(size);
87 if (!stream.workspace) 87 if (!stream.workspace)
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 723bc5bca09a..1adc8d455f0e 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -39,7 +39,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
39 bio.bi_end_io = request_complete; 39 bio.bi_end_io = request_complete;
40 40
41 submit_bio(rw, &bio); 41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete); 42 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO; 43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45} 44}
@@ -168,7 +167,6 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
168 } 167 }
169 len = PAGE_ALIGN(len); 168 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); 169 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172} 170}
173 171
174 172
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
index 7466e9dcc8c5..339e17e9133d 100644
--- a/fs/logfs/dev_mtd.c
+++ b/fs/logfs/dev_mtd.c
@@ -60,7 +60,7 @@ static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
60 * asynchronous properties. So just to prevent the first implementor of such 60 * asynchronous properties. So just to prevent the first implementor of such
61 * a thing from breaking logfs in 2350, we do the usual pointless dance to 61 * a thing from breaking logfs in 2350, we do the usual pointless dance to
62 * declare a completion variable and wait for completion before returning 62 * declare a completion variable and wait for completion before returning
63 * from mtd_erase(). What an excercise in futility! 63 * from mtd_erase(). What an exercise in futility!
64 */ 64 */
65static void logfs_erase_callback(struct erase_info *ei) 65static void logfs_erase_callback(struct erase_info *ei)
66{ 66{
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f9ddf0c388c8..9ed89d1663f8 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -92,7 +92,7 @@ static int beyond_eof(struct inode *inode, loff_t bix)
92 * so short names (len <= 9) don't even occupy the complete 32bit name 92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit 93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information 94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferrably a bit-sparse 95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one. 96 * one.
97 */ 97 */
98static u32 hash_32(const char *s, int len, u32 seed) 98static u32 hash_32(const char *s, int len, u32 seed)
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index e86376b87af1..c2ad7028def4 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -196,7 +196,7 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
196 if (IS_RDONLY(inode)) 196 if (IS_RDONLY(inode))
197 return -EROFS; 197 return -EROFS;
198 198
199 if (!is_owner_or_cap(inode)) 199 if (!inode_owner_or_capable(inode))
200 return -EACCES; 200 return -EACCES;
201 201
202 err = get_user(flags, (int __user *)arg); 202 err = get_user(flags, (int __user *)arg);
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 03b8c240aeda..edfea7a3a747 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -293,7 +293,7 @@ static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
293 return ret; 293 return ret;
294} 294}
295 295
296/* called with inode_lock held */ 296/* called with inode->i_lock held */
297static int logfs_drop_inode(struct inode *inode) 297static int logfs_drop_inode(struct inode *inode)
298{ 298{
299 struct logfs_super *super = logfs_super(inode->i_sb); 299 struct logfs_super *super = logfs_super(inode->i_sb);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index ee99a9f5dfd3..9e22085231b3 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -1616,7 +1616,7 @@ int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1616 err = logfs_write_buf(inode, page, flags); 1616 err = logfs_write_buf(inode, page, flags);
1617 if (!err && shrink_level(gc_level) == 0) { 1617 if (!err && shrink_level(gc_level) == 0) {
1618 /* Rewrite cannot mark the inode dirty but has to 1618 /* Rewrite cannot mark the inode dirty but has to
1619 * write it immediatly. 1619 * write it immediately.
1620 * Q: Can't we just create an alias for the inode 1620 * Q: Can't we just create an alias for the inode
1621 * instead? And if not, why not? 1621 * instead? And if not, why not?
1622 */ 1622 */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index a25444ab2baf..2f174be06555 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -542,7 +542,7 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head,
542 * mb_cache_entry_find_first() 542 * mb_cache_entry_find_first()
543 * 543 *
544 * Find the first cache entry on a given device with a certain key in 544 * Find the first cache entry on a given device with a certain key in
545 * an additional index. Additonal matches can be found with 545 * an additional index. Additional matches can be found with
546 * mb_cache_entry_find_next(). Returns NULL if no match was found. The 546 * mb_cache_entry_find_next(). Returns NULL if no match was found. The
547 * returned cache entry is locked for shared access ("multiple readers"). 547 * returned cache entry is locked for shared access ("multiple readers").
548 * 548 *
diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig
index 0fd7ca994264..6624684dd5de 100644
--- a/fs/minix/Kconfig
+++ b/fs/minix/Kconfig
@@ -15,3 +15,11 @@ config MINIX_FS
15 module will be called minix. Note that the file system of your root 15 module will be called minix. Note that the file system of your root
16 partition (the one containing the directory /) cannot be compiled as 16 partition (the one containing the directory /) cannot be compiled as
17 a module. 17 a module.
18
19config MINIX_FS_NATIVE_ENDIAN
20 def_bool MINIX_FS
21 depends on H8300 || M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU)
22
23config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED
24 def_bool MINIX_FS
25 depends on M68K && MMU
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index ae0b83f476a6..adcdc0a4e182 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -399,7 +399,6 @@ static sector_t minix_bmap(struct address_space *mapping, sector_t block)
399static const struct address_space_operations minix_aops = { 399static const struct address_space_operations minix_aops = {
400 .readpage = minix_readpage, 400 .readpage = minix_readpage,
401 .writepage = minix_writepage, 401 .writepage = minix_writepage,
402 .sync_page = block_sync_page,
403 .write_begin = minix_write_begin, 402 .write_begin = minix_write_begin,
404 .write_end = generic_write_end, 403 .write_end = generic_write_end,
405 .bmap = minix_bmap 404 .bmap = minix_bmap
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 407b1c84911e..341e2122879a 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -88,4 +88,78 @@ static inline struct minix_inode_info *minix_i(struct inode *inode)
88 return list_entry(inode, struct minix_inode_info, vfs_inode); 88 return list_entry(inode, struct minix_inode_info, vfs_inode);
89} 89}
90 90
91#if defined(CONFIG_MINIX_FS_NATIVE_ENDIAN) && \
92 defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
93
94#error Minix file system byte order broken
95
96#elif defined(CONFIG_MINIX_FS_NATIVE_ENDIAN)
97
98/*
99 * big-endian 32 or 64 bit indexed bitmaps on big-endian system or
100 * little-endian bitmaps on little-endian system
101 */
102
103#define minix_test_and_set_bit(nr, addr) \
104 __test_and_set_bit((nr), (unsigned long *)(addr))
105#define minix_set_bit(nr, addr) \
106 __set_bit((nr), (unsigned long *)(addr))
107#define minix_test_and_clear_bit(nr, addr) \
108 __test_and_clear_bit((nr), (unsigned long *)(addr))
109#define minix_test_bit(nr, addr) \
110 test_bit((nr), (unsigned long *)(addr))
111#define minix_find_first_zero_bit(addr, size) \
112 find_first_zero_bit((unsigned long *)(addr), (size))
113
114#elif defined(CONFIG_MINIX_FS_BIG_ENDIAN_16BIT_INDEXED)
115
116/*
117 * big-endian 16bit indexed bitmaps
118 */
119
120static inline int minix_find_first_zero_bit(const void *vaddr, unsigned size)
121{
122 const unsigned short *p = vaddr, *addr = vaddr;
123 unsigned short num;
124
125 if (!size)
126 return 0;
127
128 size = (size >> 4) + ((size & 15) > 0);
129 while (*p++ == 0xffff) {
130 if (--size == 0)
131 return (p - addr) << 4;
132 }
133
134 num = *--p;
135 return ((p - addr) << 4) + ffz(num);
136}
137
138#define minix_test_and_set_bit(nr, addr) \
139 __test_and_set_bit((nr) ^ 16, (unsigned long *)(addr))
140#define minix_set_bit(nr, addr) \
141 __set_bit((nr) ^ 16, (unsigned long *)(addr))
142#define minix_test_and_clear_bit(nr, addr) \
143 __test_and_clear_bit((nr) ^ 16, (unsigned long *)(addr))
144
145static inline int minix_test_bit(int nr, const void *vaddr)
146{
147 const unsigned short *p = vaddr;
148 return (p[nr >> 4] & (1U << (nr & 15))) != 0;
149}
150
151#else
152
153/*
154 * little-endian bitmaps
155 */
156
157#define minix_test_and_set_bit __test_and_set_bit_le
158#define minix_set_bit __set_bit_le
159#define minix_test_and_clear_bit __test_and_clear_bit_le
160#define minix_test_bit test_bit_le
161#define minix_find_first_zero_bit find_first_zero_bit_le
162
163#endif
164
91#endif /* FS_MINIX_H */ 165#endif /* FS_MINIX_H */
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
213 new_de = minix_find_entry(new_dentry, &new_page); 213 new_de = minix_find_entry(new_dentry, &new_page);
214 if (!new_de) 214 if (!new_de)
215 goto out_dir; 215 goto out_dir;
216 inode_inc_link_count(old_inode);
217 minix_set_link(new_de, new_page, old_inode); 216 minix_set_link(new_de, new_page, old_inode);
218 new_inode->i_ctime = CURRENT_TIME_SEC; 217 new_inode->i_ctime = CURRENT_TIME_SEC;
219 if (dir_de) 218 if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
225 if (new_dir->i_nlink >= info->s_link_max) 224 if (new_dir->i_nlink >= info->s_link_max)
226 goto out_dir; 225 goto out_dir;
227 } 226 }
228 inode_inc_link_count(old_inode);
229 err = minix_add_link(new_dentry, old_inode); 227 err = minix_add_link(new_dentry, old_inode);
230 if (err) { 228 if (err)
231 inode_dec_link_count(old_inode);
232 goto out_dir; 229 goto out_dir;
233 }
234 if (dir_de) 230 if (dir_de)
235 inode_inc_link_count(new_dir); 231 inode_inc_link_count(new_dir);
236 } 232 }
237 233
238 minix_delete_entry(old_de, old_page); 234 minix_delete_entry(old_de, old_page);
239 inode_dec_link_count(old_inode); 235 mark_inode_dirty(old_inode);
240 236
241 if (dir_de) { 237 if (dir_de) {
242 minix_set_link(dir_de, dir_page, new_dir); 238 minix_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/mpage.c b/fs/mpage.c
index d78455a81ec9..0afc809e46e0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -364,6 +364,9 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
364 sector_t last_block_in_bio = 0; 364 sector_t last_block_in_bio = 0;
365 struct buffer_head map_bh; 365 struct buffer_head map_bh;
366 unsigned long first_logical_block = 0; 366 unsigned long first_logical_block = 0;
367 struct blk_plug plug;
368
369 blk_start_plug(&plug);
367 370
368 map_bh.b_state = 0; 371 map_bh.b_state = 0;
369 map_bh.b_size = 0; 372 map_bh.b_size = 0;
@@ -385,6 +388,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
385 BUG_ON(!list_empty(pages)); 388 BUG_ON(!list_empty(pages));
386 if (bio) 389 if (bio)
387 mpage_bio_submit(READ, bio); 390 mpage_bio_submit(READ, bio);
391 blk_finish_plug(&plug);
388 return 0; 392 return 0;
389} 393}
390EXPORT_SYMBOL(mpage_readpages); 394EXPORT_SYMBOL(mpage_readpages);
@@ -666,8 +670,11 @@ int
666mpage_writepages(struct address_space *mapping, 670mpage_writepages(struct address_space *mapping,
667 struct writeback_control *wbc, get_block_t get_block) 671 struct writeback_control *wbc, get_block_t get_block)
668{ 672{
673 struct blk_plug plug;
669 int ret; 674 int ret;
670 675
676 blk_start_plug(&plug);
677
671 if (!get_block) 678 if (!get_block)
672 ret = generic_writepages(mapping, wbc); 679 ret = generic_writepages(mapping, wbc);
673 else { 680 else {
@@ -682,6 +689,7 @@ mpage_writepages(struct address_space *mapping,
682 if (mpd.bio) 689 if (mpd.bio)
683 mpage_bio_submit(WRITE, mpd.bio); 690 mpage_bio_submit(WRITE, mpd.bio);
684 } 691 }
692 blk_finish_plug(&plug);
685 return ret; 693 return ret;
686} 694}
687EXPORT_SYMBOL(mpage_writepages); 695EXPORT_SYMBOL(mpage_writepages);
diff --git a/fs/namei.c b/fs/namei.c
index 7d77f24d32a9..54fc993e3027 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -70,7 +70,7 @@
70 * name indicated by the symlink. The old code always complained that the 70 * name indicated by the symlink. The old code always complained that the
71 * name already exists, due to not following the symlink even if its target 71 * name already exists, due to not following the symlink even if its target
72 * is nonexistent. The new semantics affects also mknod() and link() when 72 * is nonexistent. The new semantics affects also mknod() and link() when
73 * the name is a symlink pointing to a non-existant name. 73 * the name is a symlink pointing to a non-existent name.
74 * 74 *
75 * I don't know which semantics is the right one, since I have no access 75 * I don't know which semantics is the right one, since I have no access
76 * to standards. But I found by trial that HP-UX 9.0 has the full "new" 76 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
136 return retval; 136 return retval;
137} 137}
138 138
139char * getname(const char __user * filename) 139static char *getname_flags(const char __user * filename, int flags)
140{ 140{
141 char *tmp, *result; 141 char *tmp, *result;
142 142
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
147 147
148 result = tmp; 148 result = tmp;
149 if (retval < 0) { 149 if (retval < 0) {
150 __putname(tmp); 150 if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
151 result = ERR_PTR(retval); 151 __putname(tmp);
152 result = ERR_PTR(retval);
153 }
152 } 154 }
153 } 155 }
154 audit_getname(result); 156 audit_getname(result);
155 return result; 157 return result;
156} 158}
157 159
160char *getname(const char __user * filename)
161{
162 return getname_flags(filename, 0);
163}
164
158#ifdef CONFIG_AUDITSYSCALL 165#ifdef CONFIG_AUDITSYSCALL
159void putname(const char *name) 166void putname(const char *name)
160{ 167{
@@ -176,6 +183,9 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
176 183
177 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 184 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
178 185
186 if (current_user_ns() != inode_userns(inode))
187 goto other_perms;
188
179 if (current_fsuid() == inode->i_uid) 189 if (current_fsuid() == inode->i_uid)
180 mode >>= 6; 190 mode >>= 6;
181 else { 191 else {
@@ -189,6 +199,7 @@ static int acl_permission_check(struct inode *inode, int mask, unsigned int flag
189 mode >>= 3; 199 mode >>= 3;
190 } 200 }
191 201
202other_perms:
192 /* 203 /*
193 * If the DACs are ok we don't need any capability check. 204 * If the DACs are ok we don't need any capability check.
194 */ 205 */
@@ -230,7 +241,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
230 * Executable DACs are overridable if at least one exec bit is set. 241 * Executable DACs are overridable if at least one exec bit is set.
231 */ 242 */
232 if (!(mask & MAY_EXEC) || execute_ok(inode)) 243 if (!(mask & MAY_EXEC) || execute_ok(inode))
233 if (capable(CAP_DAC_OVERRIDE)) 244 if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
234 return 0; 245 return 0;
235 246
236 /* 247 /*
@@ -238,7 +249,7 @@ int generic_permission(struct inode *inode, int mask, unsigned int flags,
238 */ 249 */
239 mask &= MAY_READ | MAY_WRITE | MAY_EXEC; 250 mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
240 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) 251 if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE)))
241 if (capable(CAP_DAC_READ_SEARCH)) 252 if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
242 return 0; 253 return 0;
243 254
244 return -EACCES; 255 return -EACCES;
@@ -401,9 +412,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
401{ 412{
402 struct fs_struct *fs = current->fs; 413 struct fs_struct *fs = current->fs;
403 struct dentry *dentry = nd->path.dentry; 414 struct dentry *dentry = nd->path.dentry;
415 int want_root = 0;
404 416
405 BUG_ON(!(nd->flags & LOOKUP_RCU)); 417 BUG_ON(!(nd->flags & LOOKUP_RCU));
406 if (nd->root.mnt) { 418 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
419 want_root = 1;
407 spin_lock(&fs->lock); 420 spin_lock(&fs->lock);
408 if (nd->root.mnt != fs->root.mnt || 421 if (nd->root.mnt != fs->root.mnt ||
409 nd->root.dentry != fs->root.dentry) 422 nd->root.dentry != fs->root.dentry)
@@ -414,7 +427,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
414 goto err; 427 goto err;
415 BUG_ON(nd->inode != dentry->d_inode); 428 BUG_ON(nd->inode != dentry->d_inode);
416 spin_unlock(&dentry->d_lock); 429 spin_unlock(&dentry->d_lock);
417 if (nd->root.mnt) { 430 if (want_root) {
418 path_get(&nd->root); 431 path_get(&nd->root);
419 spin_unlock(&fs->lock); 432 spin_unlock(&fs->lock);
420 } 433 }
@@ -427,7 +440,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
427err: 440err:
428 spin_unlock(&dentry->d_lock); 441 spin_unlock(&dentry->d_lock);
429err_root: 442err_root:
430 if (nd->root.mnt) 443 if (want_root)
431 spin_unlock(&fs->lock); 444 spin_unlock(&fs->lock);
432 return -ECHILD; 445 return -ECHILD;
433} 446}
@@ -454,17 +467,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
454{ 467{
455 struct fs_struct *fs = current->fs; 468 struct fs_struct *fs = current->fs;
456 struct dentry *parent = nd->path.dentry; 469 struct dentry *parent = nd->path.dentry;
457 470 int want_root = 0;
458 /*
459 * It can be possible to revalidate the dentry that we started
460 * the path walk with. force_reval_path may also revalidate the
461 * dentry already committed to the nameidata.
462 */
463 if (unlikely(parent == dentry))
464 return nameidata_drop_rcu(nd);
465 471
466 BUG_ON(!(nd->flags & LOOKUP_RCU)); 472 BUG_ON(!(nd->flags & LOOKUP_RCU));
467 if (nd->root.mnt) { 473 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
474 want_root = 1;
468 spin_lock(&fs->lock); 475 spin_lock(&fs->lock);
469 if (nd->root.mnt != fs->root.mnt || 476 if (nd->root.mnt != fs->root.mnt ||
470 nd->root.dentry != fs->root.dentry) 477 nd->root.dentry != fs->root.dentry)
@@ -484,7 +491,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
484 parent->d_count++; 491 parent->d_count++;
485 spin_unlock(&dentry->d_lock); 492 spin_unlock(&dentry->d_lock);
486 spin_unlock(&parent->d_lock); 493 spin_unlock(&parent->d_lock);
487 if (nd->root.mnt) { 494 if (want_root) {
488 path_get(&nd->root); 495 path_get(&nd->root);
489 spin_unlock(&fs->lock); 496 spin_unlock(&fs->lock);
490 } 497 }
@@ -498,7 +505,7 @@ err:
498 spin_unlock(&dentry->d_lock); 505 spin_unlock(&dentry->d_lock);
499 spin_unlock(&parent->d_lock); 506 spin_unlock(&parent->d_lock);
500err_root: 507err_root:
501 if (nd->root.mnt) 508 if (want_root)
502 spin_unlock(&fs->lock); 509 spin_unlock(&fs->lock);
503 return -ECHILD; 510 return -ECHILD;
504} 511}
@@ -506,8 +513,16 @@ err_root:
506/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */ 513/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
507static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry) 514static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
508{ 515{
509 if (nd->flags & LOOKUP_RCU) 516 if (nd->flags & LOOKUP_RCU) {
510 return nameidata_dentry_drop_rcu(nd, dentry); 517 if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
518 nd->flags &= ~LOOKUP_RCU;
519 if (!(nd->flags & LOOKUP_ROOT))
520 nd->root.mnt = NULL;
521 rcu_read_unlock();
522 br_read_unlock(vfsmount_lock);
523 return -ECHILD;
524 }
525 }
511 return 0; 526 return 0;
512} 527}
513 528
@@ -526,7 +541,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
526 541
527 BUG_ON(!(nd->flags & LOOKUP_RCU)); 542 BUG_ON(!(nd->flags & LOOKUP_RCU));
528 nd->flags &= ~LOOKUP_RCU; 543 nd->flags &= ~LOOKUP_RCU;
529 nd->root.mnt = NULL; 544 if (!(nd->flags & LOOKUP_ROOT))
545 nd->root.mnt = NULL;
530 spin_lock(&dentry->d_lock); 546 spin_lock(&dentry->d_lock);
531 if (!__d_rcu_to_refcount(dentry, nd->seq)) 547 if (!__d_rcu_to_refcount(dentry, nd->seq))
532 goto err_unlock; 548 goto err_unlock;
@@ -547,53 +563,31 @@ err_unlock:
547 return -ECHILD; 563 return -ECHILD;
548} 564}
549 565
550/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing. */
551static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
552{
553 if (likely(nd->flags & LOOKUP_RCU))
554 return nameidata_drop_rcu_last(nd);
555 return 0;
556}
557
558/** 566/**
559 * release_open_intent - free up open intent resources 567 * release_open_intent - free up open intent resources
560 * @nd: pointer to nameidata 568 * @nd: pointer to nameidata
561 */ 569 */
562void release_open_intent(struct nameidata *nd) 570void release_open_intent(struct nameidata *nd)
563{ 571{
564 if (nd->intent.open.file->f_path.dentry == NULL) 572 struct file *file = nd->intent.open.file;
565 put_filp(nd->intent.open.file);
566 else
567 fput(nd->intent.open.file);
568}
569
570/*
571 * Call d_revalidate and handle filesystems that request rcu-walk
572 * to be dropped. This may be called and return in rcu-walk mode,
573 * regardless of success or error. If -ECHILD is returned, the caller
574 * must return -ECHILD back up the path walk stack so path walk may
575 * be restarted in ref-walk mode.
576 */
577static int d_revalidate(struct dentry *dentry, struct nameidata *nd)
578{
579 int status;
580 573
581 status = dentry->d_op->d_revalidate(dentry, nd); 574 if (file && !IS_ERR(file)) {
582 if (status == -ECHILD) { 575 if (file->f_path.dentry == NULL)
583 if (nameidata_dentry_drop_rcu(nd, dentry)) 576 put_filp(file);
584 return status; 577 else
585 status = dentry->d_op->d_revalidate(dentry, nd); 578 fput(file);
586 } 579 }
580}
587 581
588 return status; 582static inline int d_revalidate(struct dentry *dentry, struct nameidata *nd)
583{
584 return dentry->d_op->d_revalidate(dentry, nd);
589} 585}
590 586
591static inline struct dentry * 587static struct dentry *
592do_revalidate(struct dentry *dentry, struct nameidata *nd) 588do_revalidate(struct dentry *dentry, struct nameidata *nd)
593{ 589{
594 int status; 590 int status = d_revalidate(dentry, nd);
595
596 status = d_revalidate(dentry, nd);
597 if (unlikely(status <= 0)) { 591 if (unlikely(status <= 0)) {
598 /* 592 /*
599 * The dentry failed validation. 593 * The dentry failed validation.
@@ -602,37 +596,18 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
602 * to return a fail status. 596 * to return a fail status.
603 */ 597 */
604 if (status < 0) { 598 if (status < 0) {
605 /* If we're in rcu-walk, we don't have a ref */ 599 dput(dentry);
606 if (!(nd->flags & LOOKUP_RCU))
607 dput(dentry);
608 dentry = ERR_PTR(status); 600 dentry = ERR_PTR(status);
609 601 } else if (!d_invalidate(dentry)) {
610 } else { 602 dput(dentry);
611 /* Don't d_invalidate in rcu-walk mode */ 603 dentry = NULL;
612 if (nameidata_dentry_drop_rcu_maybe(nd, dentry))
613 return ERR_PTR(-ECHILD);
614 if (!d_invalidate(dentry)) {
615 dput(dentry);
616 dentry = NULL;
617 }
618 } 604 }
619 } 605 }
620 return dentry; 606 return dentry;
621} 607}
622 608
623static inline int need_reval_dot(struct dentry *dentry)
624{
625 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
626 return 0;
627
628 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
629 return 0;
630
631 return 1;
632}
633
634/* 609/*
635 * force_reval_path - force revalidation of a dentry 610 * handle_reval_path - force revalidation of a dentry
636 * 611 *
637 * In some situations the path walking code will trust dentries without 612 * In some situations the path walking code will trust dentries without
638 * revalidating them. This causes problems for filesystems that depend on 613 * revalidating them. This causes problems for filesystems that depend on
@@ -646,30 +621,28 @@ static inline int need_reval_dot(struct dentry *dentry)
646 * invalidate the dentry. It's up to the caller to handle putting references 621 * invalidate the dentry. It's up to the caller to handle putting references
647 * to the path if necessary. 622 * to the path if necessary.
648 */ 623 */
649static int 624static inline int handle_reval_path(struct nameidata *nd)
650force_reval_path(struct path *path, struct nameidata *nd)
651{ 625{
626 struct dentry *dentry = nd->path.dentry;
652 int status; 627 int status;
653 struct dentry *dentry = path->dentry;
654 628
655 /* 629 if (likely(!(nd->flags & LOOKUP_JUMPED)))
656 * only check on filesystems where it's possible for the dentry to 630 return 0;
657 * become stale. 631
658 */ 632 if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
659 if (!need_reval_dot(dentry)) 633 return 0;
634
635 if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
660 return 0; 636 return 0;
661 637
638 /* Note: we do not d_invalidate() */
662 status = d_revalidate(dentry, nd); 639 status = d_revalidate(dentry, nd);
663 if (status > 0) 640 if (status > 0)
664 return 0; 641 return 0;
665 642
666 if (!status) { 643 if (!status)
667 /* Don't d_invalidate in rcu-walk mode */
668 if (nameidata_drop_rcu(nd))
669 return -ECHILD;
670 d_invalidate(dentry);
671 status = -ESTALE; 644 status = -ESTALE;
672 } 645
673 return status; 646 return status;
674} 647}
675 648
@@ -685,6 +658,7 @@ force_reval_path(struct path *path, struct nameidata *nd)
685static inline int exec_permission(struct inode *inode, unsigned int flags) 658static inline int exec_permission(struct inode *inode, unsigned int flags)
686{ 659{
687 int ret; 660 int ret;
661 struct user_namespace *ns = inode_userns(inode);
688 662
689 if (inode->i_op->permission) { 663 if (inode->i_op->permission) {
690 ret = inode->i_op->permission(inode, MAY_EXEC, flags); 664 ret = inode->i_op->permission(inode, MAY_EXEC, flags);
@@ -697,7 +671,8 @@ static inline int exec_permission(struct inode *inode, unsigned int flags)
697 if (ret == -ECHILD) 671 if (ret == -ECHILD)
698 return ret; 672 return ret;
699 673
700 if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) 674 if (ns_capable(ns, CAP_DAC_OVERRIDE) ||
675 ns_capable(ns, CAP_DAC_READ_SEARCH))
701 goto ok; 676 goto ok;
702 677
703 return ret; 678 return ret;
@@ -722,6 +697,7 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
722 do { 697 do {
723 seq = read_seqcount_begin(&fs->seq); 698 seq = read_seqcount_begin(&fs->seq);
724 nd->root = fs->root; 699 nd->root = fs->root;
700 nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
725 } while (read_seqcount_retry(&fs->seq, seq)); 701 } while (read_seqcount_retry(&fs->seq, seq));
726 } 702 }
727} 703}
@@ -738,6 +714,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
738 path_put(&nd->path); 714 path_put(&nd->path);
739 nd->path = nd->root; 715 nd->path = nd->root;
740 path_get(&nd->root); 716 path_get(&nd->root);
717 nd->flags |= LOOKUP_JUMPED;
741 } 718 }
742 nd->inode = nd->path.dentry->d_inode; 719 nd->inode = nd->path.dentry->d_inode;
743 720
@@ -767,18 +744,43 @@ static inline void path_to_nameidata(const struct path *path,
767 nd->path.dentry = path->dentry; 744 nd->path.dentry = path->dentry;
768} 745}
769 746
747static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
748{
749 struct inode *inode = link->dentry->d_inode;
750 if (!IS_ERR(cookie) && inode->i_op->put_link)
751 inode->i_op->put_link(link->dentry, nd, cookie);
752 path_put(link);
753}
754
770static __always_inline int 755static __always_inline int
771__do_follow_link(const struct path *link, struct nameidata *nd, void **p) 756follow_link(struct path *link, struct nameidata *nd, void **p)
772{ 757{
773 int error; 758 int error;
774 struct dentry *dentry = link->dentry; 759 struct dentry *dentry = link->dentry;
775 760
776 touch_atime(link->mnt, dentry); 761 BUG_ON(nd->flags & LOOKUP_RCU);
777 nd_set_link(nd, NULL);
778 762
779 if (link->mnt == nd->path.mnt) 763 if (link->mnt == nd->path.mnt)
780 mntget(link->mnt); 764 mntget(link->mnt);
781 765
766 if (unlikely(current->total_link_count >= 40)) {
767 *p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
768 path_put(&nd->path);
769 return -ELOOP;
770 }
771 cond_resched();
772 current->total_link_count++;
773
774 touch_atime(link->mnt, dentry);
775 nd_set_link(nd, NULL);
776
777 error = security_inode_follow_link(link->dentry, nd);
778 if (error) {
779 *p = ERR_PTR(error); /* no ->put_link(), please */
780 path_put(&nd->path);
781 return error;
782 }
783
782 nd->last_type = LAST_BIND; 784 nd->last_type = LAST_BIND;
783 *p = dentry->d_inode->i_op->follow_link(dentry, nd); 785 *p = dentry->d_inode->i_op->follow_link(dentry, nd);
784 error = PTR_ERR(*p); 786 error = PTR_ERR(*p);
@@ -788,50 +790,18 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
788 if (s) 790 if (s)
789 error = __vfs_follow_link(nd, s); 791 error = __vfs_follow_link(nd, s);
790 else if (nd->last_type == LAST_BIND) { 792 else if (nd->last_type == LAST_BIND) {
791 error = force_reval_path(&nd->path, nd); 793 nd->flags |= LOOKUP_JUMPED;
792 if (error) 794 nd->inode = nd->path.dentry->d_inode;
795 if (nd->inode->i_op->follow_link) {
796 /* stepped on a _really_ weird one */
793 path_put(&nd->path); 797 path_put(&nd->path);
798 error = -ELOOP;
799 }
794 } 800 }
795 } 801 }
796 return error; 802 return error;
797} 803}
798 804
799/*
800 * This limits recursive symlink follows to 8, while
801 * limiting consecutive symlinks to 40.
802 *
803 * Without that kind of total limit, nasty chains of consecutive
804 * symlinks can cause almost arbitrarily long lookups.
805 */
806static inline int do_follow_link(struct path *path, struct nameidata *nd)
807{
808 void *cookie;
809 int err = -ELOOP;
810 if (current->link_count >= MAX_NESTED_LINKS)
811 goto loop;
812 if (current->total_link_count >= 40)
813 goto loop;
814 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
815 cond_resched();
816 err = security_inode_follow_link(path->dentry, nd);
817 if (err)
818 goto loop;
819 current->link_count++;
820 current->total_link_count++;
821 nd->depth++;
822 err = __do_follow_link(path, nd, &cookie);
823 if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
824 path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
825 path_put(path);
826 current->link_count--;
827 nd->depth--;
828 return err;
829loop:
830 path_put_conditional(path, nd);
831 path_put(&nd->path);
832 return err;
833}
834
835static int follow_up_rcu(struct path *path) 805static int follow_up_rcu(struct path *path)
836{ 806{
837 struct vfsmount *parent; 807 struct vfsmount *parent;
@@ -970,8 +940,7 @@ static int follow_managed(struct path *path, unsigned flags)
970 if (managed & DCACHE_MANAGE_TRANSIT) { 940 if (managed & DCACHE_MANAGE_TRANSIT) {
971 BUG_ON(!path->dentry->d_op); 941 BUG_ON(!path->dentry->d_op);
972 BUG_ON(!path->dentry->d_op->d_manage); 942 BUG_ON(!path->dentry->d_op->d_manage);
973 ret = path->dentry->d_op->d_manage(path->dentry, 943 ret = path->dentry->d_op->d_manage(path->dentry, false);
974 false, false);
975 if (ret < 0) 944 if (ret < 0)
976 return ret == -EISDIR ? 0 : ret; 945 return ret == -EISDIR ? 0 : ret;
977 } 946 }
@@ -1024,6 +993,12 @@ int follow_down_one(struct path *path)
1024 return 0; 993 return 0;
1025} 994}
1026 995
996static inline bool managed_dentry_might_block(struct dentry *dentry)
997{
998 return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
999 dentry->d_op->d_manage(dentry, true) < 0);
1000}
1001
1027/* 1002/*
1028 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we 1003 * Skip to top of mountpoint pile in rcuwalk mode. We abort the rcu-walk if we
1029 * meet a managed dentry and we're not walking to "..". True is returned to 1004 * meet a managed dentry and we're not walking to "..". True is returned to
@@ -1032,19 +1007,26 @@ int follow_down_one(struct path *path)
1032static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, 1007static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
1033 struct inode **inode, bool reverse_transit) 1008 struct inode **inode, bool reverse_transit)
1034{ 1009{
1035 while (d_mountpoint(path->dentry)) { 1010 for (;;) {
1036 struct vfsmount *mounted; 1011 struct vfsmount *mounted;
1037 if (unlikely(path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) && 1012 /*
1038 !reverse_transit && 1013 * Don't forget we might have a non-mountpoint managed dentry
1039 path->dentry->d_op->d_manage(path->dentry, false, true) < 0) 1014 * that wants to block transit.
1015 */
1016 *inode = path->dentry->d_inode;
1017 if (!reverse_transit &&
1018 unlikely(managed_dentry_might_block(path->dentry)))
1040 return false; 1019 return false;
1020
1021 if (!d_mountpoint(path->dentry))
1022 break;
1023
1041 mounted = __lookup_mnt(path->mnt, path->dentry, 1); 1024 mounted = __lookup_mnt(path->mnt, path->dentry, 1);
1042 if (!mounted) 1025 if (!mounted)
1043 break; 1026 break;
1044 path->mnt = mounted; 1027 path->mnt = mounted;
1045 path->dentry = mounted->mnt_root; 1028 path->dentry = mounted->mnt_root;
1046 nd->seq = read_seqcount_begin(&path->dentry->d_seq); 1029 nd->seq = read_seqcount_begin(&path->dentry->d_seq);
1047 *inode = path->dentry->d_inode;
1048 } 1030 }
1049 1031
1050 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT)) 1032 if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
@@ -1070,7 +1052,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1070 1052
1071 seq = read_seqcount_begin(&parent->d_seq); 1053 seq = read_seqcount_begin(&parent->d_seq);
1072 if (read_seqcount_retry(&old->d_seq, nd->seq)) 1054 if (read_seqcount_retry(&old->d_seq, nd->seq))
1073 return -ECHILD; 1055 goto failed;
1074 inode = parent->d_inode; 1056 inode = parent->d_inode;
1075 nd->path.dentry = parent; 1057 nd->path.dentry = parent;
1076 nd->seq = seq; 1058 nd->seq = seq;
@@ -1083,8 +1065,15 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1083 } 1065 }
1084 __follow_mount_rcu(nd, &nd->path, &inode, true); 1066 __follow_mount_rcu(nd, &nd->path, &inode, true);
1085 nd->inode = inode; 1067 nd->inode = inode;
1086
1087 return 0; 1068 return 0;
1069
1070failed:
1071 nd->flags &= ~LOOKUP_RCU;
1072 if (!(nd->flags & LOOKUP_ROOT))
1073 nd->root.mnt = NULL;
1074 rcu_read_unlock();
1075 br_read_unlock(vfsmount_lock);
1076 return -ECHILD;
1088} 1077}
1089 1078
1090/* 1079/*
@@ -1095,7 +1084,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
1095 * Care must be taken as namespace_sem may be held (indicated by mounting_here 1084 * Care must be taken as namespace_sem may be held (indicated by mounting_here
1096 * being true). 1085 * being true).
1097 */ 1086 */
1098int follow_down(struct path *path, bool mounting_here) 1087int follow_down(struct path *path)
1099{ 1088{
1100 unsigned managed; 1089 unsigned managed;
1101 int ret; 1090 int ret;
@@ -1116,7 +1105,7 @@ int follow_down(struct path *path, bool mounting_here)
1116 BUG_ON(!path->dentry->d_op); 1105 BUG_ON(!path->dentry->d_op);
1117 BUG_ON(!path->dentry->d_op->d_manage); 1106 BUG_ON(!path->dentry->d_op->d_manage);
1118 ret = path->dentry->d_op->d_manage( 1107 ret = path->dentry->d_op->d_manage(
1119 path->dentry, mounting_here, false); 1108 path->dentry, false);
1120 if (ret < 0) 1109 if (ret < 0)
1121 return ret == -EISDIR ? 0 : ret; 1110 return ret == -EISDIR ? 0 : ret;
1122 } 1111 }
@@ -1218,57 +1207,85 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
1218{ 1207{
1219 struct vfsmount *mnt = nd->path.mnt; 1208 struct vfsmount *mnt = nd->path.mnt;
1220 struct dentry *dentry, *parent = nd->path.dentry; 1209 struct dentry *dentry, *parent = nd->path.dentry;
1221 struct inode *dir; 1210 int need_reval = 1;
1211 int status = 1;
1222 int err; 1212 int err;
1223 1213
1224 /* 1214 /*
1225 * See if the low-level filesystem might want
1226 * to use its own hash..
1227 */
1228 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1229 err = parent->d_op->d_hash(parent, nd->inode, name);
1230 if (err < 0)
1231 return err;
1232 }
1233
1234 /*
1235 * Rename seqlock is not required here because in the off chance 1215 * Rename seqlock is not required here because in the off chance
1236 * of a false negative due to a concurrent rename, we're going to 1216 * of a false negative due to a concurrent rename, we're going to
1237 * do the non-racy lookup, below. 1217 * do the non-racy lookup, below.
1238 */ 1218 */
1239 if (nd->flags & LOOKUP_RCU) { 1219 if (nd->flags & LOOKUP_RCU) {
1240 unsigned seq; 1220 unsigned seq;
1241
1242 *inode = nd->inode; 1221 *inode = nd->inode;
1243 dentry = __d_lookup_rcu(parent, name, &seq, inode); 1222 dentry = __d_lookup_rcu(parent, name, &seq, inode);
1244 if (!dentry) { 1223 if (!dentry)
1245 if (nameidata_drop_rcu(nd)) 1224 goto unlazy;
1246 return -ECHILD; 1225
1247 goto need_lookup;
1248 }
1249 /* Memory barrier in read_seqcount_begin of child is enough */ 1226 /* Memory barrier in read_seqcount_begin of child is enough */
1250 if (__read_seqcount_retry(&parent->d_seq, nd->seq)) 1227 if (__read_seqcount_retry(&parent->d_seq, nd->seq))
1251 return -ECHILD; 1228 return -ECHILD;
1252
1253 nd->seq = seq; 1229 nd->seq = seq;
1254 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1230
1255 goto need_revalidate; 1231 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
1256done2: 1232 status = d_revalidate(dentry, nd);
1233 if (unlikely(status <= 0)) {
1234 if (status != -ECHILD)
1235 need_reval = 0;
1236 goto unlazy;
1237 }
1238 }
1257 path->mnt = mnt; 1239 path->mnt = mnt;
1258 path->dentry = dentry; 1240 path->dentry = dentry;
1259 if (likely(__follow_mount_rcu(nd, path, inode, false))) 1241 if (likely(__follow_mount_rcu(nd, path, inode, false)))
1260 return 0; 1242 return 0;
1261 if (nameidata_drop_rcu(nd)) 1243unlazy:
1262 return -ECHILD; 1244 if (dentry) {
1263 /* fallthru */ 1245 if (nameidata_dentry_drop_rcu(nd, dentry))
1246 return -ECHILD;
1247 } else {
1248 if (nameidata_drop_rcu(nd))
1249 return -ECHILD;
1250 }
1251 } else {
1252 dentry = __d_lookup(parent, name);
1264 } 1253 }
1265 dentry = __d_lookup(parent, name); 1254
1266 if (!dentry) 1255retry:
1267 goto need_lookup; 1256 if (unlikely(!dentry)) {
1268found: 1257 struct inode *dir = parent->d_inode;
1269 if (dentry->d_flags & DCACHE_OP_REVALIDATE) 1258 BUG_ON(nd->inode != dir);
1270 goto need_revalidate; 1259
1271done: 1260 mutex_lock(&dir->i_mutex);
1261 dentry = d_lookup(parent, name);
1262 if (likely(!dentry)) {
1263 dentry = d_alloc_and_lookup(parent, name, nd);
1264 if (IS_ERR(dentry)) {
1265 mutex_unlock(&dir->i_mutex);
1266 return PTR_ERR(dentry);
1267 }
1268 /* known good */
1269 need_reval = 0;
1270 status = 1;
1271 }
1272 mutex_unlock(&dir->i_mutex);
1273 }
1274 if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
1275 status = d_revalidate(dentry, nd);
1276 if (unlikely(status <= 0)) {
1277 if (status < 0) {
1278 dput(dentry);
1279 return status;
1280 }
1281 if (!d_invalidate(dentry)) {
1282 dput(dentry);
1283 dentry = NULL;
1284 need_reval = 1;
1285 goto retry;
1286 }
1287 }
1288
1272 path->mnt = mnt; 1289 path->mnt = mnt;
1273 path->dentry = dentry; 1290 path->dentry = dentry;
1274 err = follow_managed(path, nd->flags); 1291 err = follow_managed(path, nd->flags);
@@ -1278,49 +1295,113 @@ done:
1278 } 1295 }
1279 *inode = path->dentry->d_inode; 1296 *inode = path->dentry->d_inode;
1280 return 0; 1297 return 0;
1298}
1281 1299
1282need_lookup: 1300static inline int may_lookup(struct nameidata *nd)
1283 dir = parent->d_inode; 1301{
1284 BUG_ON(nd->inode != dir); 1302 if (nd->flags & LOOKUP_RCU) {
1303 int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
1304 if (err != -ECHILD)
1305 return err;
1306 if (nameidata_drop_rcu(nd))
1307 return -ECHILD;
1308 }
1309 return exec_permission(nd->inode, 0);
1310}
1285 1311
1286 mutex_lock(&dir->i_mutex); 1312static inline int handle_dots(struct nameidata *nd, int type)
1287 /* 1313{
1288 * First re-do the cached lookup just in case it was created 1314 if (type == LAST_DOTDOT) {
1289 * while we waited for the directory semaphore, or the first 1315 if (nd->flags & LOOKUP_RCU) {
1290 * lookup failed due to an unrelated rename. 1316 if (follow_dotdot_rcu(nd))
1291 * 1317 return -ECHILD;
1292 * This could use version numbering or similar to avoid unnecessary 1318 } else
1293 * cache lookups, but then we'd have to do the first lookup in the 1319 follow_dotdot(nd);
1294 * non-racy way. However in the common case here, everything should 1320 }
1295 * be hot in cache, so would it be a big win? 1321 return 0;
1296 */ 1322}
1297 dentry = d_lookup(parent, name); 1323
1298 if (likely(!dentry)) { 1324static void terminate_walk(struct nameidata *nd)
1299 dentry = d_alloc_and_lookup(parent, name, nd); 1325{
1300 mutex_unlock(&dir->i_mutex); 1326 if (!(nd->flags & LOOKUP_RCU)) {
1301 if (IS_ERR(dentry)) 1327 path_put(&nd->path);
1302 goto fail; 1328 } else {
1303 goto done; 1329 nd->flags &= ~LOOKUP_RCU;
1330 if (!(nd->flags & LOOKUP_ROOT))
1331 nd->root.mnt = NULL;
1332 rcu_read_unlock();
1333 br_read_unlock(vfsmount_lock);
1304 } 1334 }
1335}
1336
1337static inline int walk_component(struct nameidata *nd, struct path *path,
1338 struct qstr *name, int type, int follow)
1339{
1340 struct inode *inode;
1341 int err;
1305 /* 1342 /*
1306 * Uhhuh! Nasty case: the cache was re-populated while 1343 * "." and ".." are special - ".." especially so because it has
1307 * we waited on the semaphore. Need to revalidate. 1344 * to be able to know about the current root directory and
1345 * parent relationships.
1308 */ 1346 */
1309 mutex_unlock(&dir->i_mutex); 1347 if (unlikely(type != LAST_NORM))
1310 goto found; 1348 return handle_dots(nd, type);
1349 err = do_lookup(nd, name, path, &inode);
1350 if (unlikely(err)) {
1351 terminate_walk(nd);
1352 return err;
1353 }
1354 if (!inode) {
1355 path_to_nameidata(path, nd);
1356 terminate_walk(nd);
1357 return -ENOENT;
1358 }
1359 if (unlikely(inode->i_op->follow_link) && follow) {
1360 if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
1361 return -ECHILD;
1362 BUG_ON(inode != path->dentry->d_inode);
1363 return 1;
1364 }
1365 path_to_nameidata(path, nd);
1366 nd->inode = inode;
1367 return 0;
1368}
1311 1369
1312need_revalidate: 1370/*
1313 dentry = do_revalidate(dentry, nd); 1371 * This limits recursive symlink follows to 8, while
1314 if (!dentry) 1372 * limiting consecutive symlinks to 40.
1315 goto need_lookup; 1373 *
1316 if (IS_ERR(dentry)) 1374 * Without that kind of total limit, nasty chains of consecutive
1317 goto fail; 1375 * symlinks can cause almost arbitrarily long lookups.
1318 if (nd->flags & LOOKUP_RCU) 1376 */
1319 goto done2; 1377static inline int nested_symlink(struct path *path, struct nameidata *nd)
1320 goto done; 1378{
1379 int res;
1321 1380
1322fail: 1381 BUG_ON(nd->depth >= MAX_NESTED_LINKS);
1323 return PTR_ERR(dentry); 1382 if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
1383 path_put_conditional(path, nd);
1384 path_put(&nd->path);
1385 return -ELOOP;
1386 }
1387
1388 nd->depth++;
1389 current->link_count++;
1390
1391 do {
1392 struct path link = *path;
1393 void *cookie;
1394
1395 res = follow_link(&link, nd, &cookie);
1396 if (!res)
1397 res = walk_component(nd, path, &nd->last,
1398 nd->last_type, LOOKUP_FOLLOW);
1399 put_link(nd, &link, cookie);
1400 } while (res > 0);
1401
1402 current->link_count--;
1403 nd->depth--;
1404 return res;
1324} 1405}
1325 1406
1326/* 1407/*
@@ -1340,30 +1421,18 @@ static int link_path_walk(const char *name, struct nameidata *nd)
1340 while (*name=='/') 1421 while (*name=='/')
1341 name++; 1422 name++;
1342 if (!*name) 1423 if (!*name)
1343 goto return_reval; 1424 return 0;
1344
1345 if (nd->depth)
1346 lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
1347 1425
1348 /* At this point we know we have a real path component. */ 1426 /* At this point we know we have a real path component. */
1349 for(;;) { 1427 for(;;) {
1350 struct inode *inode;
1351 unsigned long hash; 1428 unsigned long hash;
1352 struct qstr this; 1429 struct qstr this;
1353 unsigned int c; 1430 unsigned int c;
1431 int type;
1354 1432
1355 nd->flags |= LOOKUP_CONTINUE; 1433 nd->flags |= LOOKUP_CONTINUE;
1356 if (nd->flags & LOOKUP_RCU) { 1434
1357 err = exec_permission(nd->inode, IPERM_FLAG_RCU); 1435 err = may_lookup(nd);
1358 if (err == -ECHILD) {
1359 if (nameidata_drop_rcu(nd))
1360 return -ECHILD;
1361 goto exec_again;
1362 }
1363 } else {
1364exec_again:
1365 err = exec_permission(nd->inode, 0);
1366 }
1367 if (err) 1436 if (err)
1368 break; 1437 break;
1369 1438
@@ -1379,56 +1448,43 @@ exec_again:
1379 this.len = name - (const char *) this.name; 1448 this.len = name - (const char *) this.name;
1380 this.hash = end_name_hash(hash); 1449 this.hash = end_name_hash(hash);
1381 1450
1451 type = LAST_NORM;
1452 if (this.name[0] == '.') switch (this.len) {
1453 case 2:
1454 if (this.name[1] == '.') {
1455 type = LAST_DOTDOT;
1456 nd->flags |= LOOKUP_JUMPED;
1457 }
1458 break;
1459 case 1:
1460 type = LAST_DOT;
1461 }
1462 if (likely(type == LAST_NORM)) {
1463 struct dentry *parent = nd->path.dentry;
1464 nd->flags &= ~LOOKUP_JUMPED;
1465 if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
1466 err = parent->d_op->d_hash(parent, nd->inode,
1467 &this);
1468 if (err < 0)
1469 break;
1470 }
1471 }
1472
1382 /* remove trailing slashes? */ 1473 /* remove trailing slashes? */
1383 if (!c) 1474 if (!c)
1384 goto last_component; 1475 goto last_component;
1385 while (*++name == '/'); 1476 while (*++name == '/');
1386 if (!*name) 1477 if (!*name)
1387 goto last_with_slashes; 1478 goto last_component;
1388 1479
1389 /* 1480 err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
1390 * "." and ".." are special - ".." especially so because it has 1481 if (err < 0)
1391 * to be able to know about the current root directory and 1482 return err;
1392 * parent relationships.
1393 */
1394 if (this.name[0] == '.') switch (this.len) {
1395 default:
1396 break;
1397 case 2:
1398 if (this.name[1] != '.')
1399 break;
1400 if (nd->flags & LOOKUP_RCU) {
1401 if (follow_dotdot_rcu(nd))
1402 return -ECHILD;
1403 } else
1404 follow_dotdot(nd);
1405 /* fallthrough */
1406 case 1:
1407 continue;
1408 }
1409 /* This does the actual lookups.. */
1410 err = do_lookup(nd, &this, &next, &inode);
1411 if (err)
1412 break;
1413 err = -ENOENT;
1414 if (!inode)
1415 goto out_dput;
1416 1483
1417 if (inode->i_op->follow_link) { 1484 if (err) {
1418 /* We commonly drop rcu-walk here */ 1485 err = nested_symlink(&next, nd);
1419 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1420 return -ECHILD;
1421 BUG_ON(inode != next.dentry->d_inode);
1422 err = do_follow_link(&next, nd);
1423 if (err) 1486 if (err)
1424 goto return_err; 1487 return err;
1425 nd->inode = nd->path.dentry->d_inode;
1426 err = -ENOENT;
1427 if (!nd->inode)
1428 break;
1429 } else {
1430 path_to_nameidata(&next, nd);
1431 nd->inode = inode;
1432 } 1488 }
1433 err = -ENOTDIR; 1489 err = -ENOTDIR;
1434 if (!nd->inode->i_op->lookup) 1490 if (!nd->inode->i_op->lookup)
@@ -1436,209 +1492,109 @@ exec_again:
1436 continue; 1492 continue;
1437 /* here ends the main loop */ 1493 /* here ends the main loop */
1438 1494
1439last_with_slashes:
1440 lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1441last_component: 1495last_component:
1442 /* Clear LOOKUP_CONTINUE iff it was previously unset */ 1496 /* Clear LOOKUP_CONTINUE iff it was previously unset */
1443 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; 1497 nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
1444 if (lookup_flags & LOOKUP_PARENT)
1445 goto lookup_parent;
1446 if (this.name[0] == '.') switch (this.len) {
1447 default:
1448 break;
1449 case 2:
1450 if (this.name[1] != '.')
1451 break;
1452 if (nd->flags & LOOKUP_RCU) {
1453 if (follow_dotdot_rcu(nd))
1454 return -ECHILD;
1455 } else
1456 follow_dotdot(nd);
1457 /* fallthrough */
1458 case 1:
1459 goto return_reval;
1460 }
1461 err = do_lookup(nd, &this, &next, &inode);
1462 if (err)
1463 break;
1464 if (inode && unlikely(inode->i_op->follow_link) &&
1465 (lookup_flags & LOOKUP_FOLLOW)) {
1466 if (nameidata_dentry_drop_rcu_maybe(nd, next.dentry))
1467 return -ECHILD;
1468 BUG_ON(inode != next.dentry->d_inode);
1469 err = do_follow_link(&next, nd);
1470 if (err)
1471 goto return_err;
1472 nd->inode = nd->path.dentry->d_inode;
1473 } else {
1474 path_to_nameidata(&next, nd);
1475 nd->inode = inode;
1476 }
1477 err = -ENOENT;
1478 if (!nd->inode)
1479 break;
1480 if (lookup_flags & LOOKUP_DIRECTORY) {
1481 err = -ENOTDIR;
1482 if (!nd->inode->i_op->lookup)
1483 break;
1484 }
1485 goto return_base;
1486lookup_parent:
1487 nd->last = this; 1498 nd->last = this;
1488 nd->last_type = LAST_NORM; 1499 nd->last_type = type;
1489 if (this.name[0] != '.')
1490 goto return_base;
1491 if (this.len == 1)
1492 nd->last_type = LAST_DOT;
1493 else if (this.len == 2 && this.name[1] == '.')
1494 nd->last_type = LAST_DOTDOT;
1495 else
1496 goto return_base;
1497return_reval:
1498 /*
1499 * We bypassed the ordinary revalidation routines.
1500 * We may need to check the cached dentry for staleness.
1501 */
1502 if (need_reval_dot(nd->path.dentry)) {
1503 /* Note: we do not d_invalidate() */
1504 err = d_revalidate(nd->path.dentry, nd);
1505 if (!err)
1506 err = -ESTALE;
1507 if (err < 0)
1508 break;
1509 }
1510return_base:
1511 if (nameidata_drop_rcu_last_maybe(nd))
1512 return -ECHILD;
1513 return 0; 1500 return 0;
1514out_dput:
1515 if (!(nd->flags & LOOKUP_RCU))
1516 path_put_conditional(&next, nd);
1517 break;
1518 } 1501 }
1519 if (!(nd->flags & LOOKUP_RCU)) 1502 terminate_walk(nd);
1520 path_put(&nd->path);
1521return_err:
1522 return err; 1503 return err;
1523} 1504}
1524 1505
1525static inline int path_walk_rcu(const char *name, struct nameidata *nd) 1506static int path_init(int dfd, const char *name, unsigned int flags,
1526{ 1507 struct nameidata *nd, struct file **fp)
1527 current->total_link_count = 0;
1528
1529 return link_path_walk(name, nd);
1530}
1531
1532static inline int path_walk_simple(const char *name, struct nameidata *nd)
1533{
1534 current->total_link_count = 0;
1535
1536 return link_path_walk(name, nd);
1537}
1538
1539static int path_walk(const char *name, struct nameidata *nd)
1540{
1541 struct path save = nd->path;
1542 int result;
1543
1544 current->total_link_count = 0;
1545
1546 /* make sure the stuff we saved doesn't go away */
1547 path_get(&save);
1548
1549 result = link_path_walk(name, nd);
1550 if (result == -ESTALE) {
1551 /* nd->path had been dropped */
1552 current->total_link_count = 0;
1553 nd->path = save;
1554 path_get(&nd->path);
1555 nd->flags |= LOOKUP_REVAL;
1556 result = link_path_walk(name, nd);
1557 }
1558
1559 path_put(&save);
1560
1561 return result;
1562}
1563
1564static void path_finish_rcu(struct nameidata *nd)
1565{
1566 if (nd->flags & LOOKUP_RCU) {
1567 /* RCU dangling. Cancel it. */
1568 nd->flags &= ~LOOKUP_RCU;
1569 nd->root.mnt = NULL;
1570 rcu_read_unlock();
1571 br_read_unlock(vfsmount_lock);
1572 }
1573 if (nd->file)
1574 fput(nd->file);
1575}
1576
1577static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
1578{ 1508{
1579 int retval = 0; 1509 int retval = 0;
1580 int fput_needed; 1510 int fput_needed;
1581 struct file *file; 1511 struct file *file;
1582 1512
1583 nd->last_type = LAST_ROOT; /* if there are only slashes... */ 1513 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1584 nd->flags = flags | LOOKUP_RCU; 1514 nd->flags = flags | LOOKUP_JUMPED;
1585 nd->depth = 0; 1515 nd->depth = 0;
1516 if (flags & LOOKUP_ROOT) {
1517 struct inode *inode = nd->root.dentry->d_inode;
1518 if (*name) {
1519 if (!inode->i_op->lookup)
1520 return -ENOTDIR;
1521 retval = inode_permission(inode, MAY_EXEC);
1522 if (retval)
1523 return retval;
1524 }
1525 nd->path = nd->root;
1526 nd->inode = inode;
1527 if (flags & LOOKUP_RCU) {
1528 br_read_lock(vfsmount_lock);
1529 rcu_read_lock();
1530 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1531 } else {
1532 path_get(&nd->path);
1533 }
1534 return 0;
1535 }
1536
1586 nd->root.mnt = NULL; 1537 nd->root.mnt = NULL;
1587 nd->file = NULL;
1588 1538
1589 if (*name=='/') { 1539 if (*name=='/') {
1590 struct fs_struct *fs = current->fs; 1540 if (flags & LOOKUP_RCU) {
1591 unsigned seq; 1541 br_read_lock(vfsmount_lock);
1592 1542 rcu_read_lock();
1593 br_read_lock(vfsmount_lock); 1543 set_root_rcu(nd);
1594 rcu_read_lock(); 1544 } else {
1595 1545 set_root(nd);
1596 do { 1546 path_get(&nd->root);
1597 seq = read_seqcount_begin(&fs->seq); 1547 }
1598 nd->root = fs->root; 1548 nd->path = nd->root;
1599 nd->path = nd->root;
1600 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1601 } while (read_seqcount_retry(&fs->seq, seq));
1602
1603 } else if (dfd == AT_FDCWD) { 1549 } else if (dfd == AT_FDCWD) {
1604 struct fs_struct *fs = current->fs; 1550 if (flags & LOOKUP_RCU) {
1605 unsigned seq; 1551 struct fs_struct *fs = current->fs;
1552 unsigned seq;
1606 1553
1607 br_read_lock(vfsmount_lock); 1554 br_read_lock(vfsmount_lock);
1608 rcu_read_lock(); 1555 rcu_read_lock();
1609
1610 do {
1611 seq = read_seqcount_begin(&fs->seq);
1612 nd->path = fs->pwd;
1613 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1614 } while (read_seqcount_retry(&fs->seq, seq));
1615 1556
1557 do {
1558 seq = read_seqcount_begin(&fs->seq);
1559 nd->path = fs->pwd;
1560 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1561 } while (read_seqcount_retry(&fs->seq, seq));
1562 } else {
1563 get_fs_pwd(current->fs, &nd->path);
1564 }
1616 } else { 1565 } else {
1617 struct dentry *dentry; 1566 struct dentry *dentry;
1618 1567
1619 file = fget_light(dfd, &fput_needed); 1568 file = fget_raw_light(dfd, &fput_needed);
1620 retval = -EBADF; 1569 retval = -EBADF;
1621 if (!file) 1570 if (!file)
1622 goto out_fail; 1571 goto out_fail;
1623 1572
1624 dentry = file->f_path.dentry; 1573 dentry = file->f_path.dentry;
1625 1574
1626 retval = -ENOTDIR; 1575 if (*name) {
1627 if (!S_ISDIR(dentry->d_inode->i_mode)) 1576 retval = -ENOTDIR;
1628 goto fput_fail; 1577 if (!S_ISDIR(dentry->d_inode->i_mode))
1578 goto fput_fail;
1629 1579
1630 retval = file_permission(file, MAY_EXEC); 1580 retval = file_permission(file, MAY_EXEC);
1631 if (retval) 1581 if (retval)
1632 goto fput_fail; 1582 goto fput_fail;
1583 }
1633 1584
1634 nd->path = file->f_path; 1585 nd->path = file->f_path;
1635 if (fput_needed) 1586 if (flags & LOOKUP_RCU) {
1636 nd->file = file; 1587 if (fput_needed)
1637 1588 *fp = file;
1638 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); 1589 nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
1639 br_read_lock(vfsmount_lock); 1590 br_read_lock(vfsmount_lock);
1640 rcu_read_lock(); 1591 rcu_read_lock();
1592 } else {
1593 path_get(&file->f_path);
1594 fput_light(file, fput_needed);
1595 }
1641 } 1596 }
1597
1642 nd->inode = nd->path.dentry->d_inode; 1598 nd->inode = nd->path.dentry->d_inode;
1643 return 0; 1599 return 0;
1644 1600
@@ -1648,60 +1604,23 @@ out_fail:
1648 return retval; 1604 return retval;
1649} 1605}
1650 1606
1651static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd) 1607static inline int lookup_last(struct nameidata *nd, struct path *path)
1652{ 1608{
1653 int retval = 0; 1609 if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
1654 int fput_needed; 1610 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
1655 struct file *file;
1656
1657 nd->last_type = LAST_ROOT; /* if there are only slashes... */
1658 nd->flags = flags;
1659 nd->depth = 0;
1660 nd->root.mnt = NULL;
1661
1662 if (*name=='/') {
1663 set_root(nd);
1664 nd->path = nd->root;
1665 path_get(&nd->root);
1666 } else if (dfd == AT_FDCWD) {
1667 get_fs_pwd(current->fs, &nd->path);
1668 } else {
1669 struct dentry *dentry;
1670
1671 file = fget_light(dfd, &fput_needed);
1672 retval = -EBADF;
1673 if (!file)
1674 goto out_fail;
1675
1676 dentry = file->f_path.dentry;
1677 1611
1678 retval = -ENOTDIR; 1612 nd->flags &= ~LOOKUP_PARENT;
1679 if (!S_ISDIR(dentry->d_inode->i_mode)) 1613 return walk_component(nd, path, &nd->last, nd->last_type,
1680 goto fput_fail; 1614 nd->flags & LOOKUP_FOLLOW);
1681
1682 retval = file_permission(file, MAY_EXEC);
1683 if (retval)
1684 goto fput_fail;
1685
1686 nd->path = file->f_path;
1687 path_get(&file->f_path);
1688
1689 fput_light(file, fput_needed);
1690 }
1691 nd->inode = nd->path.dentry->d_inode;
1692 return 0;
1693
1694fput_fail:
1695 fput_light(file, fput_needed);
1696out_fail:
1697 return retval;
1698} 1615}
1699 1616
1700/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ 1617/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
1701static int do_path_lookup(int dfd, const char *name, 1618static int path_lookupat(int dfd, const char *name,
1702 unsigned int flags, struct nameidata *nd) 1619 unsigned int flags, struct nameidata *nd)
1703{ 1620{
1704 int retval; 1621 struct file *base = NULL;
1622 struct path path;
1623 int err;
1705 1624
1706 /* 1625 /*
1707 * Path walking is largely split up into 2 different synchronisation 1626 * Path walking is largely split up into 2 different synchronisation
@@ -1717,44 +1636,78 @@ static int do_path_lookup(int dfd, const char *name,
1717 * be handled by restarting a traditional ref-walk (which will always 1636 * be handled by restarting a traditional ref-walk (which will always
1718 * be able to complete). 1637 * be able to complete).
1719 */ 1638 */
1720 retval = path_init_rcu(dfd, name, flags, nd); 1639 err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
1721 if (unlikely(retval)) 1640
1722 return retval; 1641 if (unlikely(err))
1723 retval = path_walk_rcu(name, nd); 1642 return err;
1724 path_finish_rcu(nd); 1643
1725 if (nd->root.mnt) { 1644 current->total_link_count = 0;
1726 path_put(&nd->root); 1645 err = link_path_walk(name, nd);
1727 nd->root.mnt = NULL; 1646
1647 if (!err && !(flags & LOOKUP_PARENT)) {
1648 err = lookup_last(nd, &path);
1649 while (err > 0) {
1650 void *cookie;
1651 struct path link = path;
1652 nd->flags |= LOOKUP_PARENT;
1653 err = follow_link(&link, nd, &cookie);
1654 if (!err)
1655 err = lookup_last(nd, &path);
1656 put_link(nd, &link, cookie);
1657 }
1728 } 1658 }
1729 1659
1730 if (unlikely(retval == -ECHILD || retval == -ESTALE)) { 1660 if (nd->flags & LOOKUP_RCU) {
1731 /* slower, locked walk */ 1661 /* went all way through without dropping RCU */
1732 if (retval == -ESTALE) 1662 BUG_ON(err);
1733 flags |= LOOKUP_REVAL; 1663 if (nameidata_drop_rcu_last(nd))
1734 retval = path_init(dfd, name, flags, nd); 1664 err = -ECHILD;
1735 if (unlikely(retval)) 1665 }
1736 return retval; 1666
1737 retval = path_walk(name, nd); 1667 if (!err) {
1738 if (nd->root.mnt) { 1668 err = handle_reval_path(nd);
1739 path_put(&nd->root); 1669 if (err)
1740 nd->root.mnt = NULL; 1670 path_put(&nd->path);
1671 }
1672
1673 if (!err && nd->flags & LOOKUP_DIRECTORY) {
1674 if (!nd->inode->i_op->lookup) {
1675 path_put(&nd->path);
1676 err = -ENOTDIR;
1741 } 1677 }
1742 } 1678 }
1743 1679
1680 if (base)
1681 fput(base);
1682
1683 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
1684 path_put(&nd->root);
1685 nd->root.mnt = NULL;
1686 }
1687 return err;
1688}
1689
1690static int do_path_lookup(int dfd, const char *name,
1691 unsigned int flags, struct nameidata *nd)
1692{
1693 int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
1694 if (unlikely(retval == -ECHILD))
1695 retval = path_lookupat(dfd, name, flags, nd);
1696 if (unlikely(retval == -ESTALE))
1697 retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
1698
1744 if (likely(!retval)) { 1699 if (likely(!retval)) {
1745 if (unlikely(!audit_dummy_context())) { 1700 if (unlikely(!audit_dummy_context())) {
1746 if (nd->path.dentry && nd->inode) 1701 if (nd->path.dentry && nd->inode)
1747 audit_inode(name, nd->path.dentry); 1702 audit_inode(name, nd->path.dentry);
1748 } 1703 }
1749 } 1704 }
1750
1751 return retval; 1705 return retval;
1752} 1706}
1753 1707
1754int path_lookup(const char *name, unsigned int flags, 1708int kern_path_parent(const char *name, struct nameidata *nd)
1755 struct nameidata *nd)
1756{ 1709{
1757 return do_path_lookup(AT_FDCWD, name, flags, nd); 1710 return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
1758} 1711}
1759 1712
1760int kern_path(const char *name, unsigned int flags, struct path *path) 1713int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -1778,29 +1731,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
1778 const char *name, unsigned int flags, 1731 const char *name, unsigned int flags,
1779 struct nameidata *nd) 1732 struct nameidata *nd)
1780{ 1733{
1781 int retval; 1734 nd->root.dentry = dentry;
1782 1735 nd->root.mnt = mnt;
1783 /* same as do_path_lookup */ 1736 /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
1784 nd->last_type = LAST_ROOT; 1737 return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
1785 nd->flags = flags;
1786 nd->depth = 0;
1787
1788 nd->path.dentry = dentry;
1789 nd->path.mnt = mnt;
1790 path_get(&nd->path);
1791 nd->root = nd->path;
1792 path_get(&nd->root);
1793 nd->inode = nd->path.dentry->d_inode;
1794
1795 retval = path_walk(name, nd);
1796 if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
1797 nd->inode))
1798 audit_inode(name, nd->path.dentry);
1799
1800 path_put(&nd->root);
1801 nd->root.mnt = NULL;
1802
1803 return retval;
1804} 1738}
1805 1739
1806static struct dentry *__lookup_hash(struct qstr *name, 1740static struct dentry *__lookup_hash(struct qstr *name,
@@ -1815,17 +1749,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
1815 return ERR_PTR(err); 1749 return ERR_PTR(err);
1816 1750
1817 /* 1751 /*
1818 * See if the low-level filesystem might want
1819 * to use its own hash..
1820 */
1821 if (base->d_flags & DCACHE_OP_HASH) {
1822 err = base->d_op->d_hash(base, inode, name);
1823 dentry = ERR_PTR(err);
1824 if (err < 0)
1825 goto out;
1826 }
1827
1828 /*
1829 * Don't bother with __d_lookup: callers are for creat as 1752 * Don't bother with __d_lookup: callers are for creat as
1830 * well as unlink, so a lot of the time it would cost 1753 * well as unlink, so a lot of the time it would cost
1831 * a double lookup. 1754 * a double lookup.
@@ -1837,7 +1760,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
1837 1760
1838 if (!dentry) 1761 if (!dentry)
1839 dentry = d_alloc_and_lookup(base, name, nd); 1762 dentry = d_alloc_and_lookup(base, name, nd);
1840out: 1763
1841 return dentry; 1764 return dentry;
1842} 1765}
1843 1766
@@ -1851,28 +1774,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
1851 return __lookup_hash(&nd->last, nd->path.dentry, nd); 1774 return __lookup_hash(&nd->last, nd->path.dentry, nd);
1852} 1775}
1853 1776
1854static int __lookup_one_len(const char *name, struct qstr *this,
1855 struct dentry *base, int len)
1856{
1857 unsigned long hash;
1858 unsigned int c;
1859
1860 this->name = name;
1861 this->len = len;
1862 if (!len)
1863 return -EACCES;
1864
1865 hash = init_name_hash();
1866 while (len--) {
1867 c = *(const unsigned char *)name++;
1868 if (c == '/' || c == '\0')
1869 return -EACCES;
1870 hash = partial_name_hash(c, hash);
1871 }
1872 this->hash = end_name_hash(hash);
1873 return 0;
1874}
1875
1876/** 1777/**
1877 * lookup_one_len - filesystem helper to lookup single pathname component 1778 * lookup_one_len - filesystem helper to lookup single pathname component
1878 * @name: pathname component to lookup 1779 * @name: pathname component to lookup
@@ -1886,14 +1787,34 @@ static int __lookup_one_len(const char *name, struct qstr *this,
1886 */ 1787 */
1887struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) 1788struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
1888{ 1789{
1889 int err;
1890 struct qstr this; 1790 struct qstr this;
1791 unsigned long hash;
1792 unsigned int c;
1891 1793
1892 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); 1794 WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
1893 1795
1894 err = __lookup_one_len(name, &this, base, len); 1796 this.name = name;
1895 if (err) 1797 this.len = len;
1896 return ERR_PTR(err); 1798 if (!len)
1799 return ERR_PTR(-EACCES);
1800
1801 hash = init_name_hash();
1802 while (len--) {
1803 c = *(const unsigned char *)name++;
1804 if (c == '/' || c == '\0')
1805 return ERR_PTR(-EACCES);
1806 hash = partial_name_hash(c, hash);
1807 }
1808 this.hash = end_name_hash(hash);
1809 /*
1810 * See if the low-level filesystem might want
1811 * to use its own hash..
1812 */
1813 if (base->d_flags & DCACHE_OP_HASH) {
1814 int err = base->d_op->d_hash(base, base->d_inode, &this);
1815 if (err < 0)
1816 return ERR_PTR(err);
1817 }
1897 1818
1898 return __lookup_hash(&this, base, NULL); 1819 return __lookup_hash(&this, base, NULL);
1899} 1820}
@@ -1902,7 +1823,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
1902 struct path *path) 1823 struct path *path)
1903{ 1824{
1904 struct nameidata nd; 1825 struct nameidata nd;
1905 char *tmp = getname(name); 1826 char *tmp = getname_flags(name, flags);
1906 int err = PTR_ERR(tmp); 1827 int err = PTR_ERR(tmp);
1907 if (!IS_ERR(tmp)) { 1828 if (!IS_ERR(tmp)) {
1908 1829
@@ -1944,11 +1865,15 @@ static inline int check_sticky(struct inode *dir, struct inode *inode)
1944 1865
1945 if (!(dir->i_mode & S_ISVTX)) 1866 if (!(dir->i_mode & S_ISVTX))
1946 return 0; 1867 return 0;
1868 if (current_user_ns() != inode_userns(inode))
1869 goto other_userns;
1947 if (inode->i_uid == fsuid) 1870 if (inode->i_uid == fsuid)
1948 return 0; 1871 return 0;
1949 if (dir->i_uid == fsuid) 1872 if (dir->i_uid == fsuid)
1950 return 0; 1873 return 0;
1951 return !capable(CAP_FOWNER); 1874
1875other_userns:
1876 return !ns_capable(inode_userns(inode), CAP_FOWNER);
1952} 1877}
1953 1878
1954/* 1879/*
@@ -2082,12 +2007,16 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
2082 return error; 2007 return error;
2083} 2008}
2084 2009
2085int may_open(struct path *path, int acc_mode, int flag) 2010static int may_open(struct path *path, int acc_mode, int flag)
2086{ 2011{
2087 struct dentry *dentry = path->dentry; 2012 struct dentry *dentry = path->dentry;
2088 struct inode *inode = dentry->d_inode; 2013 struct inode *inode = dentry->d_inode;
2089 int error; 2014 int error;
2090 2015
2016 /* O_PATH? */
2017 if (!acc_mode)
2018 return 0;
2019
2091 if (!inode) 2020 if (!inode)
2092 return -ENOENT; 2021 return -ENOENT;
2093 2022
@@ -2124,7 +2053,7 @@ int may_open(struct path *path, int acc_mode, int flag)
2124 } 2053 }
2125 2054
2126 /* O_NOATIME can only be set by the owner or superuser */ 2055 /* O_NOATIME can only be set by the owner or superuser */
2127 if (flag & O_NOATIME && !is_owner_or_cap(inode)) 2056 if (flag & O_NOATIME && !inode_owner_or_capable(inode))
2128 return -EPERM; 2057 return -EPERM;
2129 2058
2130 /* 2059 /*
@@ -2156,34 +2085,6 @@ static int handle_truncate(struct file *filp)
2156} 2085}
2157 2086
2158/* 2087/*
2159 * Be careful about ever adding any more callers of this
2160 * function. Its flags must be in the namei format, not
2161 * what get passed to sys_open().
2162 */
2163static int __open_namei_create(struct nameidata *nd, struct path *path,
2164 int open_flag, int mode)
2165{
2166 int error;
2167 struct dentry *dir = nd->path.dentry;
2168
2169 if (!IS_POSIXACL(dir->d_inode))
2170 mode &= ~current_umask();
2171 error = security_path_mknod(&nd->path, path->dentry, mode, 0);
2172 if (error)
2173 goto out_unlock;
2174 error = vfs_create(dir->d_inode, path->dentry, mode, nd);
2175out_unlock:
2176 mutex_unlock(&dir->d_inode->i_mutex);
2177 dput(nd->path.dentry);
2178 nd->path.dentry = path->dentry;
2179
2180 if (error)
2181 return error;
2182 /* Don't check for write permission, don't truncate */
2183 return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
2184}
2185
2186/*
2187 * Note that while the flag value (low two bits) for sys_open means: 2088 * Note that while the flag value (low two bits) for sys_open means:
2188 * 00 - read-only 2089 * 00 - read-only
2189 * 01 - write-only 2090 * 01 - write-only
@@ -2207,128 +2108,115 @@ static inline int open_to_namei_flags(int flag)
2207 return flag; 2108 return flag;
2208} 2109}
2209 2110
2210static int open_will_truncate(int flag, struct inode *inode)
2211{
2212 /*
2213 * We'll never write to the fs underlying
2214 * a device file.
2215 */
2216 if (special_file(inode->i_mode))
2217 return 0;
2218 return (flag & O_TRUNC);
2219}
2220
2221static struct file *finish_open(struct nameidata *nd,
2222 int open_flag, int acc_mode)
2223{
2224 struct file *filp;
2225 int will_truncate;
2226 int error;
2227
2228 will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
2229 if (will_truncate) {
2230 error = mnt_want_write(nd->path.mnt);
2231 if (error)
2232 goto exit;
2233 }
2234 error = may_open(&nd->path, acc_mode, open_flag);
2235 if (error) {
2236 if (will_truncate)
2237 mnt_drop_write(nd->path.mnt);
2238 goto exit;
2239 }
2240 filp = nameidata_to_filp(nd);
2241 if (!IS_ERR(filp)) {
2242 error = ima_file_check(filp, acc_mode);
2243 if (error) {
2244 fput(filp);
2245 filp = ERR_PTR(error);
2246 }
2247 }
2248 if (!IS_ERR(filp)) {
2249 if (will_truncate) {
2250 error = handle_truncate(filp);
2251 if (error) {
2252 fput(filp);
2253 filp = ERR_PTR(error);
2254 }
2255 }
2256 }
2257 /*
2258 * It is now safe to drop the mnt write
2259 * because the filp has had a write taken
2260 * on its behalf.
2261 */
2262 if (will_truncate)
2263 mnt_drop_write(nd->path.mnt);
2264 path_put(&nd->path);
2265 return filp;
2266
2267exit:
2268 if (!IS_ERR(nd->intent.open.file))
2269 release_open_intent(nd);
2270 path_put(&nd->path);
2271 return ERR_PTR(error);
2272}
2273
2274/* 2111/*
2275 * Handle O_CREAT case for do_filp_open 2112 * Handle the last step of open()
2276 */ 2113 */
2277static struct file *do_last(struct nameidata *nd, struct path *path, 2114static struct file *do_last(struct nameidata *nd, struct path *path,
2278 int open_flag, int acc_mode, 2115 const struct open_flags *op, const char *pathname)
2279 int mode, const char *pathname)
2280{ 2116{
2281 struct dentry *dir = nd->path.dentry; 2117 struct dentry *dir = nd->path.dentry;
2118 struct dentry *dentry;
2119 int open_flag = op->open_flag;
2120 int will_truncate = open_flag & O_TRUNC;
2121 int want_write = 0;
2122 int acc_mode = op->acc_mode;
2282 struct file *filp; 2123 struct file *filp;
2283 int error = -EISDIR; 2124 int error;
2125
2126 nd->flags &= ~LOOKUP_PARENT;
2127 nd->flags |= op->intent;
2284 2128
2285 switch (nd->last_type) { 2129 switch (nd->last_type) {
2286 case LAST_DOTDOT: 2130 case LAST_DOTDOT:
2287 follow_dotdot(nd);
2288 dir = nd->path.dentry;
2289 case LAST_DOT: 2131 case LAST_DOT:
2290 if (need_reval_dot(dir)) { 2132 error = handle_dots(nd, nd->last_type);
2291 int status = d_revalidate(nd->path.dentry, nd); 2133 if (error)
2292 if (!status) 2134 return ERR_PTR(error);
2293 status = -ESTALE;
2294 if (status < 0) {
2295 error = status;
2296 goto exit;
2297 }
2298 }
2299 /* fallthrough */ 2135 /* fallthrough */
2300 case LAST_ROOT: 2136 case LAST_ROOT:
2301 goto exit; 2137 if (nd->flags & LOOKUP_RCU) {
2138 if (nameidata_drop_rcu_last(nd))
2139 return ERR_PTR(-ECHILD);
2140 }
2141 error = handle_reval_path(nd);
2142 if (error)
2143 goto exit;
2144 audit_inode(pathname, nd->path.dentry);
2145 if (open_flag & O_CREAT) {
2146 error = -EISDIR;
2147 goto exit;
2148 }
2149 goto ok;
2302 case LAST_BIND: 2150 case LAST_BIND:
2151 /* can't be RCU mode here */
2152 error = handle_reval_path(nd);
2153 if (error)
2154 goto exit;
2303 audit_inode(pathname, dir); 2155 audit_inode(pathname, dir);
2304 goto ok; 2156 goto ok;
2305 } 2157 }
2306 2158
2159 if (!(open_flag & O_CREAT)) {
2160 int symlink_ok = 0;
2161 if (nd->last.name[nd->last.len])
2162 nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
2163 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
2164 symlink_ok = 1;
2165 /* we _can_ be in RCU mode here */
2166 error = walk_component(nd, path, &nd->last, LAST_NORM,
2167 !symlink_ok);
2168 if (error < 0)
2169 return ERR_PTR(error);
2170 if (error) /* symlink */
2171 return NULL;
2172 /* sayonara */
2173 if (nd->flags & LOOKUP_RCU) {
2174 if (nameidata_drop_rcu_last(nd))
2175 return ERR_PTR(-ECHILD);
2176 }
2177
2178 error = -ENOTDIR;
2179 if (nd->flags & LOOKUP_DIRECTORY) {
2180 if (!nd->inode->i_op->lookup)
2181 goto exit;
2182 }
2183 audit_inode(pathname, nd->path.dentry);
2184 goto ok;
2185 }
2186
2187 /* create side of things */
2188
2189 if (nd->flags & LOOKUP_RCU) {
2190 if (nameidata_drop_rcu_last(nd))
2191 return ERR_PTR(-ECHILD);
2192 }
2193
2194 audit_inode(pathname, dir);
2195 error = -EISDIR;
2307 /* trailing slashes? */ 2196 /* trailing slashes? */
2308 if (nd->last.name[nd->last.len]) 2197 if (nd->last.name[nd->last.len])
2309 goto exit; 2198 goto exit;
2310 2199
2311 mutex_lock(&dir->d_inode->i_mutex); 2200 mutex_lock(&dir->d_inode->i_mutex);
2312 2201
2313 path->dentry = lookup_hash(nd); 2202 dentry = lookup_hash(nd);
2314 path->mnt = nd->path.mnt; 2203 error = PTR_ERR(dentry);
2315 2204 if (IS_ERR(dentry)) {
2316 error = PTR_ERR(path->dentry);
2317 if (IS_ERR(path->dentry)) {
2318 mutex_unlock(&dir->d_inode->i_mutex); 2205 mutex_unlock(&dir->d_inode->i_mutex);
2319 goto exit; 2206 goto exit;
2320 } 2207 }
2321 2208
2322 if (IS_ERR(nd->intent.open.file)) { 2209 path->dentry = dentry;
2323 error = PTR_ERR(nd->intent.open.file); 2210 path->mnt = nd->path.mnt;
2324 goto exit_mutex_unlock;
2325 }
2326 2211
2327 /* Negative dentry, just create the file */ 2212 /* Negative dentry, just create the file */
2328 if (!path->dentry->d_inode) { 2213 if (!dentry->d_inode) {
2214 int mode = op->mode;
2215 if (!IS_POSIXACL(dir->d_inode))
2216 mode &= ~current_umask();
2329 /* 2217 /*
2330 * This write is needed to ensure that a 2218 * This write is needed to ensure that a
2331 * ro->rw transition does not occur between 2219 * rw->ro transition does not occur between
2332 * the time when the file is created and when 2220 * the time when the file is created and when
2333 * a permanent write count is taken through 2221 * a permanent write count is taken through
2334 * the 'struct file' in nameidata_to_filp(). 2222 * the 'struct file' in nameidata_to_filp().
@@ -2336,22 +2224,21 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2336 error = mnt_want_write(nd->path.mnt); 2224 error = mnt_want_write(nd->path.mnt);
2337 if (error) 2225 if (error)
2338 goto exit_mutex_unlock; 2226 goto exit_mutex_unlock;
2339 error = __open_namei_create(nd, path, open_flag, mode); 2227 want_write = 1;
2340 if (error) { 2228 /* Don't check for write permission, don't truncate */
2341 mnt_drop_write(nd->path.mnt); 2229 open_flag &= ~O_TRUNC;
2342 goto exit; 2230 will_truncate = 0;
2343 } 2231 acc_mode = MAY_OPEN;
2344 filp = nameidata_to_filp(nd); 2232 error = security_path_mknod(&nd->path, dentry, mode, 0);
2345 mnt_drop_write(nd->path.mnt); 2233 if (error)
2346 path_put(&nd->path); 2234 goto exit_mutex_unlock;
2347 if (!IS_ERR(filp)) { 2235 error = vfs_create(dir->d_inode, dentry, mode, nd);
2348 error = ima_file_check(filp, acc_mode); 2236 if (error)
2349 if (error) { 2237 goto exit_mutex_unlock;
2350 fput(filp); 2238 mutex_unlock(&dir->d_inode->i_mutex);
2351 filp = ERR_PTR(error); 2239 dput(nd->path.dentry);
2352 } 2240 nd->path.dentry = dentry;
2353 } 2241 goto common;
2354 return filp;
2355 } 2242 }
2356 2243
2357 /* 2244 /*
@@ -2381,7 +2268,40 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
2381 if (S_ISDIR(nd->inode->i_mode)) 2268 if (S_ISDIR(nd->inode->i_mode))
2382 goto exit; 2269 goto exit;
2383ok: 2270ok:
2384 filp = finish_open(nd, open_flag, acc_mode); 2271 if (!S_ISREG(nd->inode->i_mode))
2272 will_truncate = 0;
2273
2274 if (will_truncate) {
2275 error = mnt_want_write(nd->path.mnt);
2276 if (error)
2277 goto exit;
2278 want_write = 1;
2279 }
2280common:
2281 error = may_open(&nd->path, acc_mode, open_flag);
2282 if (error)
2283 goto exit;
2284 filp = nameidata_to_filp(nd);
2285 if (!IS_ERR(filp)) {
2286 error = ima_file_check(filp, op->acc_mode);
2287 if (error) {
2288 fput(filp);
2289 filp = ERR_PTR(error);
2290 }
2291 }
2292 if (!IS_ERR(filp)) {
2293 if (will_truncate) {
2294 error = handle_truncate(filp);
2295 if (error) {
2296 fput(filp);
2297 filp = ERR_PTR(error);
2298 }
2299 }
2300 }
2301out:
2302 if (want_write)
2303 mnt_drop_write(nd->path.mnt);
2304 path_put(&nd->path);
2385 return filp; 2305 return filp;
2386 2306
2387exit_mutex_unlock: 2307exit_mutex_unlock:
@@ -2389,199 +2309,103 @@ exit_mutex_unlock:
2389exit_dput: 2309exit_dput:
2390 path_put_conditional(path, nd); 2310 path_put_conditional(path, nd);
2391exit: 2311exit:
2392 if (!IS_ERR(nd->intent.open.file)) 2312 filp = ERR_PTR(error);
2393 release_open_intent(nd); 2313 goto out;
2394 path_put(&nd->path);
2395 return ERR_PTR(error);
2396} 2314}
2397 2315
2398/* 2316static struct file *path_openat(int dfd, const char *pathname,
2399 * Note that the low bits of the passed in "open_flag" 2317 struct nameidata *nd, const struct open_flags *op, int flags)
2400 * are not the same as in the local variable "flag". See
2401 * open_to_namei_flags() for more details.
2402 */
2403struct file *do_filp_open(int dfd, const char *pathname,
2404 int open_flag, int mode, int acc_mode)
2405{ 2318{
2319 struct file *base = NULL;
2406 struct file *filp; 2320 struct file *filp;
2407 struct nameidata nd;
2408 int error;
2409 struct path path; 2321 struct path path;
2410 int count = 0; 2322 int error;
2411 int flag = open_to_namei_flags(open_flag);
2412 int flags;
2413
2414 if (!(open_flag & O_CREAT))
2415 mode = 0;
2416
2417 /* Must never be set by userspace */
2418 open_flag &= ~FMODE_NONOTIFY;
2419
2420 /*
2421 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
2422 * check for O_DSYNC if the need any syncing at all we enforce it's
2423 * always set instead of having to deal with possibly weird behaviour
2424 * for malicious applications setting only __O_SYNC.
2425 */
2426 if (open_flag & __O_SYNC)
2427 open_flag |= O_DSYNC;
2428
2429 if (!acc_mode)
2430 acc_mode = MAY_OPEN | ACC_MODE(open_flag);
2431
2432 /* O_TRUNC implies we need access checks for write permissions */
2433 if (open_flag & O_TRUNC)
2434 acc_mode |= MAY_WRITE;
2435
2436 /* Allow the LSM permission hook to distinguish append
2437 access from general write access. */
2438 if (open_flag & O_APPEND)
2439 acc_mode |= MAY_APPEND;
2440
2441 flags = LOOKUP_OPEN;
2442 if (open_flag & O_CREAT) {
2443 flags |= LOOKUP_CREATE;
2444 if (open_flag & O_EXCL)
2445 flags |= LOOKUP_EXCL;
2446 }
2447 if (open_flag & O_DIRECTORY)
2448 flags |= LOOKUP_DIRECTORY;
2449 if (!(open_flag & O_NOFOLLOW))
2450 flags |= LOOKUP_FOLLOW;
2451 2323
2452 filp = get_empty_filp(); 2324 filp = get_empty_filp();
2453 if (!filp) 2325 if (!filp)
2454 return ERR_PTR(-ENFILE); 2326 return ERR_PTR(-ENFILE);
2455 2327
2456 filp->f_flags = open_flag; 2328 filp->f_flags = op->open_flag;
2457 nd.intent.open.file = filp; 2329 nd->intent.open.file = filp;
2458 nd.intent.open.flags = flag; 2330 nd->intent.open.flags = open_to_namei_flags(op->open_flag);
2459 nd.intent.open.create_mode = mode; 2331 nd->intent.open.create_mode = op->mode;
2460
2461 if (open_flag & O_CREAT)
2462 goto creat;
2463 2332
2464 /* !O_CREAT, simple open */ 2333 error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
2465 error = do_path_lookup(dfd, pathname, flags, &nd);
2466 if (unlikely(error)) 2334 if (unlikely(error))
2467 goto out_filp; 2335 goto out_filp;
2468 error = -ELOOP;
2469 if (!(nd.flags & LOOKUP_FOLLOW)) {
2470 if (nd.inode->i_op->follow_link)
2471 goto out_path;
2472 }
2473 error = -ENOTDIR;
2474 if (nd.flags & LOOKUP_DIRECTORY) {
2475 if (!nd.inode->i_op->lookup)
2476 goto out_path;
2477 }
2478 audit_inode(pathname, nd.path.dentry);
2479 filp = finish_open(&nd, open_flag, acc_mode);
2480 return filp;
2481
2482creat:
2483 /* OK, have to create the file. Find the parent. */
2484 error = path_init_rcu(dfd, pathname,
2485 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2486 if (error)
2487 goto out_filp;
2488 error = path_walk_rcu(pathname, &nd);
2489 path_finish_rcu(&nd);
2490 if (unlikely(error == -ECHILD || error == -ESTALE)) {
2491 /* slower, locked walk */
2492 if (error == -ESTALE) {
2493reval:
2494 flags |= LOOKUP_REVAL;
2495 }
2496 error = path_init(dfd, pathname,
2497 LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
2498 if (error)
2499 goto out_filp;
2500 2336
2501 error = path_walk_simple(pathname, &nd); 2337 current->total_link_count = 0;
2502 } 2338 error = link_path_walk(pathname, nd);
2503 if (unlikely(error)) 2339 if (unlikely(error))
2504 goto out_filp; 2340 goto out_filp;
2505 if (unlikely(!audit_dummy_context()))
2506 audit_inode(pathname, nd.path.dentry);
2507 2341
2508 /* 2342 filp = do_last(nd, &path, op, pathname);
2509 * We have the parent and last component.
2510 */
2511 nd.flags = flags;
2512 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
2513 while (unlikely(!filp)) { /* trailing symlink */ 2343 while (unlikely(!filp)) { /* trailing symlink */
2514 struct path link = path; 2344 struct path link = path;
2515 struct inode *linki = link.dentry->d_inode;
2516 void *cookie; 2345 void *cookie;
2517 error = -ELOOP; 2346 if (!(nd->flags & LOOKUP_FOLLOW)) {
2518 if (!(nd.flags & LOOKUP_FOLLOW)) 2347 path_put_conditional(&path, nd);
2519 goto exit_dput; 2348 path_put(&nd->path);
2520 if (count++ == 32) 2349 filp = ERR_PTR(-ELOOP);
2521 goto exit_dput; 2350 break;
2522 /*
2523 * This is subtle. Instead of calling do_follow_link() we do
2524 * the thing by hands. The reason is that this way we have zero
2525 * link_count and path_walk() (called from ->follow_link)
2526 * honoring LOOKUP_PARENT. After that we have the parent and
2527 * last component, i.e. we are in the same situation as after
2528 * the first path_walk(). Well, almost - if the last component
2529 * is normal we get its copy stored in nd->last.name and we will
2530 * have to putname() it when we are done. Procfs-like symlinks
2531 * just set LAST_BIND.
2532 */
2533 nd.flags |= LOOKUP_PARENT;
2534 error = security_inode_follow_link(link.dentry, &nd);
2535 if (error)
2536 goto exit_dput;
2537 error = __do_follow_link(&link, &nd, &cookie);
2538 if (unlikely(error)) {
2539 if (!IS_ERR(cookie) && linki->i_op->put_link)
2540 linki->i_op->put_link(link.dentry, &nd, cookie);
2541 /* nd.path had been dropped */
2542 nd.path = link;
2543 goto out_path;
2544 } 2351 }
2545 nd.flags &= ~LOOKUP_PARENT; 2352 nd->flags |= LOOKUP_PARENT;
2546 filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname); 2353 nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
2547 if (linki->i_op->put_link) 2354 error = follow_link(&link, nd, &cookie);
2548 linki->i_op->put_link(link.dentry, &nd, cookie); 2355 if (unlikely(error))
2549 path_put(&link); 2356 filp = ERR_PTR(error);
2357 else
2358 filp = do_last(nd, &path, op, pathname);
2359 put_link(nd, &link, cookie);
2550 } 2360 }
2551out: 2361out:
2552 if (nd.root.mnt) 2362 if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
2553 path_put(&nd.root); 2363 path_put(&nd->root);
2554 if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL)) 2364 if (base)
2555 goto reval; 2365 fput(base);
2366 release_open_intent(nd);
2556 return filp; 2367 return filp;
2557 2368
2558exit_dput:
2559 path_put_conditional(&path, &nd);
2560out_path:
2561 path_put(&nd.path);
2562out_filp: 2369out_filp:
2563 if (!IS_ERR(nd.intent.open.file))
2564 release_open_intent(&nd);
2565 filp = ERR_PTR(error); 2370 filp = ERR_PTR(error);
2566 goto out; 2371 goto out;
2567} 2372}
2568 2373
2569/** 2374struct file *do_filp_open(int dfd, const char *pathname,
2570 * filp_open - open file and return file pointer 2375 const struct open_flags *op, int flags)
2571 *
2572 * @filename: path to open
2573 * @flags: open flags as per the open(2) second argument
2574 * @mode: mode for the new file if O_CREAT is set, else ignored
2575 *
2576 * This is the helper to open a file from kernelspace if you really
2577 * have to. But in generally you should not do this, so please move
2578 * along, nothing to see here..
2579 */
2580struct file *filp_open(const char *filename, int flags, int mode)
2581{ 2376{
2582 return do_filp_open(AT_FDCWD, filename, flags, mode, 0); 2377 struct nameidata nd;
2378 struct file *filp;
2379
2380 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
2381 if (unlikely(filp == ERR_PTR(-ECHILD)))
2382 filp = path_openat(dfd, pathname, &nd, op, flags);
2383 if (unlikely(filp == ERR_PTR(-ESTALE)))
2384 filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
2385 return filp;
2386}
2387
2388struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
2389 const char *name, const struct open_flags *op, int flags)
2390{
2391 struct nameidata nd;
2392 struct file *file;
2393
2394 nd.root.mnt = mnt;
2395 nd.root.dentry = dentry;
2396
2397 flags |= LOOKUP_ROOT;
2398
2399 if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
2400 return ERR_PTR(-ELOOP);
2401
2402 file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
2403 if (unlikely(file == ERR_PTR(-ECHILD)))
2404 file = path_openat(-1, name, &nd, op, flags);
2405 if (unlikely(file == ERR_PTR(-ESTALE)))
2406 file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
2407 return file;
2583} 2408}
2584EXPORT_SYMBOL(filp_open);
2585 2409
2586/** 2410/**
2587 * lookup_create - lookup a dentry, creating it if it doesn't exist 2411 * lookup_create - lookup a dentry, creating it if it doesn't exist
@@ -2643,7 +2467,8 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
2643 if (error) 2467 if (error)
2644 return error; 2468 return error;
2645 2469
2646 if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) 2470 if ((S_ISCHR(mode) || S_ISBLK(mode)) &&
2471 !ns_capable(inode_userns(dir), CAP_MKNOD))
2647 return -EPERM; 2472 return -EPERM;
2648 2473
2649 if (!dir->i_op->mknod) 2474 if (!dir->i_op->mknod)
@@ -3120,7 +2945,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
3120 return error; 2945 return error;
3121 2946
3122 mutex_lock(&inode->i_mutex); 2947 mutex_lock(&inode->i_mutex);
3123 error = dir->i_op->link(old_dentry, dir, new_dentry); 2948 /* Make sure we don't allow creating hardlink to an unlinked file */
2949 if (inode->i_nlink == 0)
2950 error = -ENOENT;
2951 else
2952 error = dir->i_op->link(old_dentry, dir, new_dentry);
3124 mutex_unlock(&inode->i_mutex); 2953 mutex_unlock(&inode->i_mutex);
3125 if (!error) 2954 if (!error)
3126 fsnotify_link(dir, inode, new_dentry); 2955 fsnotify_link(dir, inode, new_dentry);
@@ -3142,15 +2971,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
3142 struct dentry *new_dentry; 2971 struct dentry *new_dentry;
3143 struct nameidata nd; 2972 struct nameidata nd;
3144 struct path old_path; 2973 struct path old_path;
2974 int how = 0;
3145 int error; 2975 int error;
3146 char *to; 2976 char *to;
3147 2977
3148 if ((flags & ~AT_SYMLINK_FOLLOW) != 0) 2978 if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
3149 return -EINVAL; 2979 return -EINVAL;
2980 /*
2981 * To use null names we require CAP_DAC_READ_SEARCH
2982 * This ensures that not everyone will be able to create
2983 * handlink using the passed filedescriptor.
2984 */
2985 if (flags & AT_EMPTY_PATH) {
2986 if (!capable(CAP_DAC_READ_SEARCH))
2987 return -ENOENT;
2988 how = LOOKUP_EMPTY;
2989 }
2990
2991 if (flags & AT_SYMLINK_FOLLOW)
2992 how |= LOOKUP_FOLLOW;
3150 2993
3151 error = user_path_at(olddfd, oldname, 2994 error = user_path_at(olddfd, oldname, how, &old_path);
3152 flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
3153 &old_path);
3154 if (error) 2995 if (error)
3155 return error; 2996 return error;
3156 2997
@@ -3587,7 +3428,7 @@ EXPORT_SYMBOL(page_readlink);
3587EXPORT_SYMBOL(__page_symlink); 3428EXPORT_SYMBOL(__page_symlink);
3588EXPORT_SYMBOL(page_symlink); 3429EXPORT_SYMBOL(page_symlink);
3589EXPORT_SYMBOL(page_symlink_inode_operations); 3430EXPORT_SYMBOL(page_symlink_inode_operations);
3590EXPORT_SYMBOL(path_lookup); 3431EXPORT_SYMBOL(kern_path_parent);
3591EXPORT_SYMBOL(kern_path); 3432EXPORT_SYMBOL(kern_path);
3592EXPORT_SYMBOL(vfs_path_lookup); 3433EXPORT_SYMBOL(vfs_path_lookup);
3593EXPORT_SYMBOL(inode_permission); 3434EXPORT_SYMBOL(inode_permission);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..d99bcf59e4c2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -196,7 +196,7 @@ unsigned int mnt_get_count(struct vfsmount *mnt)
196#endif 196#endif
197} 197}
198 198
199struct vfsmount *alloc_vfsmnt(const char *name) 199static struct vfsmount *alloc_vfsmnt(const char *name)
200{ 200{
201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 201 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
202 if (mnt) { 202 if (mnt) {
@@ -466,15 +466,7 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
466 br_write_unlock(vfsmount_lock); 466 br_write_unlock(vfsmount_lock);
467} 467}
468 468
469void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 469static void free_vfsmnt(struct vfsmount *mnt)
470{
471 mnt->mnt_sb = sb;
472 mnt->mnt_root = dget(sb->s_root);
473}
474
475EXPORT_SYMBOL(simple_set_mnt);
476
477void free_vfsmnt(struct vfsmount *mnt)
478{ 470{
479 kfree(mnt->mnt_devname); 471 kfree(mnt->mnt_devname);
480 mnt_free_id(mnt); 472 mnt_free_id(mnt);
@@ -678,6 +670,36 @@ static struct vfsmount *skip_mnt_tree(struct vfsmount *p)
678 return p; 670 return p;
679} 671}
680 672
673struct vfsmount *
674vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
675{
676 struct vfsmount *mnt;
677 struct dentry *root;
678
679 if (!type)
680 return ERR_PTR(-ENODEV);
681
682 mnt = alloc_vfsmnt(name);
683 if (!mnt)
684 return ERR_PTR(-ENOMEM);
685
686 if (flags & MS_KERNMOUNT)
687 mnt->mnt_flags = MNT_INTERNAL;
688
689 root = mount_fs(type, flags, name, data);
690 if (IS_ERR(root)) {
691 free_vfsmnt(mnt);
692 return ERR_CAST(root);
693 }
694
695 mnt->mnt_root = root;
696 mnt->mnt_sb = root->d_sb;
697 mnt->mnt_mountpoint = mnt->mnt_root;
698 mnt->mnt_parent = mnt;
699 return mnt;
700}
701EXPORT_SYMBOL_GPL(vfs_kern_mount);
702
681static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, 703static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
682 int flag) 704 int flag)
683{ 705{
@@ -978,7 +1000,13 @@ static int show_vfsmnt(struct seq_file *m, void *v)
978 int err = 0; 1000 int err = 0;
979 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 1001 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
980 1002
981 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1003 if (mnt->mnt_sb->s_op->show_devname) {
1004 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1005 if (err)
1006 goto out;
1007 } else {
1008 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1009 }
982 seq_putc(m, ' '); 1010 seq_putc(m, ' ');
983 seq_path(m, &mnt_path, " \t\n\\"); 1011 seq_path(m, &mnt_path, " \t\n\\");
984 seq_putc(m, ' '); 1012 seq_putc(m, ' ');
@@ -1013,7 +1041,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1013 1041
1014 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, 1042 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
1015 MAJOR(sb->s_dev), MINOR(sb->s_dev)); 1043 MAJOR(sb->s_dev), MINOR(sb->s_dev));
1016 seq_dentry(m, mnt->mnt_root, " \t\n\\"); 1044 if (sb->s_op->show_path)
1045 err = sb->s_op->show_path(m, mnt);
1046 else
1047 seq_dentry(m, mnt->mnt_root, " \t\n\\");
1048 if (err)
1049 goto out;
1017 seq_putc(m, ' '); 1050 seq_putc(m, ' ');
1018 seq_path_root(m, &mnt_path, &root, " \t\n\\"); 1051 seq_path_root(m, &mnt_path, &root, " \t\n\\");
1019 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { 1052 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
@@ -1044,7 +1077,12 @@ static int show_mountinfo(struct seq_file *m, void *v)
1044 seq_puts(m, " - "); 1077 seq_puts(m, " - ");
1045 show_type(m, sb); 1078 show_type(m, sb);
1046 seq_putc(m, ' '); 1079 seq_putc(m, ' ');
1047 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 1080 if (sb->s_op->show_devname)
1081 err = sb->s_op->show_devname(m, mnt);
1082 else
1083 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
1084 if (err)
1085 goto out;
1048 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); 1086 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
1049 err = show_sb_opts(m, sb); 1087 err = show_sb_opts(m, sb);
1050 if (err) 1088 if (err)
@@ -1070,11 +1108,15 @@ static int show_vfsstat(struct seq_file *m, void *v)
1070 int err = 0; 1108 int err = 0;
1071 1109
1072 /* device */ 1110 /* device */
1073 if (mnt->mnt_devname) { 1111 if (mnt->mnt_sb->s_op->show_devname) {
1074 seq_puts(m, "device "); 1112 err = mnt->mnt_sb->s_op->show_devname(m, mnt);
1075 mangle(m, mnt->mnt_devname); 1113 } else {
1076 } else 1114 if (mnt->mnt_devname) {
1077 seq_puts(m, "no device"); 1115 seq_puts(m, "device ");
1116 mangle(m, mnt->mnt_devname);
1117 } else
1118 seq_puts(m, "no device");
1119 }
1078 1120
1079 /* mount point */ 1121 /* mount point */
1080 seq_puts(m, " mounted on "); 1122 seq_puts(m, " mounted on ");
@@ -1088,7 +1130,8 @@ static int show_vfsstat(struct seq_file *m, void *v)
1088 /* optional statistics */ 1130 /* optional statistics */
1089 if (mnt->mnt_sb->s_op->show_stats) { 1131 if (mnt->mnt_sb->s_op->show_stats) {
1090 seq_putc(m, ' '); 1132 seq_putc(m, ' ');
1091 err = mnt->mnt_sb->s_op->show_stats(m, mnt); 1133 if (!err)
1134 err = mnt->mnt_sb->s_op->show_stats(m, mnt);
1092 } 1135 }
1093 1136
1094 seq_putc(m, '\n'); 1137 seq_putc(m, '\n');
@@ -1244,7 +1287,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
1244 */ 1287 */
1245 br_write_lock(vfsmount_lock); 1288 br_write_lock(vfsmount_lock);
1246 if (mnt_get_count(mnt) != 2) { 1289 if (mnt_get_count(mnt) != 2) {
1247 br_write_lock(vfsmount_lock); 1290 br_write_unlock(vfsmount_lock);
1248 return -EBUSY; 1291 return -EBUSY;
1249 } 1292 }
1250 br_write_unlock(vfsmount_lock); 1293 br_write_unlock(vfsmount_lock);
@@ -1604,9 +1647,35 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
1604 return err; 1647 return err;
1605} 1648}
1606 1649
1650static int lock_mount(struct path *path)
1651{
1652 struct vfsmount *mnt;
1653retry:
1654 mutex_lock(&path->dentry->d_inode->i_mutex);
1655 if (unlikely(cant_mount(path->dentry))) {
1656 mutex_unlock(&path->dentry->d_inode->i_mutex);
1657 return -ENOENT;
1658 }
1659 down_write(&namespace_sem);
1660 mnt = lookup_mnt(path);
1661 if (likely(!mnt))
1662 return 0;
1663 up_write(&namespace_sem);
1664 mutex_unlock(&path->dentry->d_inode->i_mutex);
1665 path_put(path);
1666 path->mnt = mnt;
1667 path->dentry = dget(mnt->mnt_root);
1668 goto retry;
1669}
1670
1671static void unlock_mount(struct path *path)
1672{
1673 up_write(&namespace_sem);
1674 mutex_unlock(&path->dentry->d_inode->i_mutex);
1675}
1676
1607static int graft_tree(struct vfsmount *mnt, struct path *path) 1677static int graft_tree(struct vfsmount *mnt, struct path *path)
1608{ 1678{
1609 int err;
1610 if (mnt->mnt_sb->s_flags & MS_NOUSER) 1679 if (mnt->mnt_sb->s_flags & MS_NOUSER)
1611 return -EINVAL; 1680 return -EINVAL;
1612 1681
@@ -1614,16 +1683,10 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
1614 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 1683 S_ISDIR(mnt->mnt_root->d_inode->i_mode))
1615 return -ENOTDIR; 1684 return -ENOTDIR;
1616 1685
1617 err = -ENOENT; 1686 if (d_unlinked(path->dentry))
1618 mutex_lock(&path->dentry->d_inode->i_mutex); 1687 return -ENOENT;
1619 if (cant_mount(path->dentry))
1620 goto out_unlock;
1621 1688
1622 if (!d_unlinked(path->dentry)) 1689 return attach_recursive_mnt(mnt, path, NULL);
1623 err = attach_recursive_mnt(mnt, path, NULL);
1624out_unlock:
1625 mutex_unlock(&path->dentry->d_inode->i_mutex);
1626 return err;
1627} 1690}
1628 1691
1629/* 1692/*
@@ -1686,6 +1749,7 @@ static int do_change_type(struct path *path, int flag)
1686static int do_loopback(struct path *path, char *old_name, 1749static int do_loopback(struct path *path, char *old_name,
1687 int recurse) 1750 int recurse)
1688{ 1751{
1752 LIST_HEAD(umount_list);
1689 struct path old_path; 1753 struct path old_path;
1690 struct vfsmount *mnt = NULL; 1754 struct vfsmount *mnt = NULL;
1691 int err = mount_is_safe(path); 1755 int err = mount_is_safe(path);
@@ -1697,13 +1761,16 @@ static int do_loopback(struct path *path, char *old_name,
1697 if (err) 1761 if (err)
1698 return err; 1762 return err;
1699 1763
1700 down_write(&namespace_sem); 1764 err = lock_mount(path);
1765 if (err)
1766 goto out;
1767
1701 err = -EINVAL; 1768 err = -EINVAL;
1702 if (IS_MNT_UNBINDABLE(old_path.mnt)) 1769 if (IS_MNT_UNBINDABLE(old_path.mnt))
1703 goto out; 1770 goto out2;
1704 1771
1705 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1772 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1706 goto out; 1773 goto out2;
1707 1774
1708 err = -ENOMEM; 1775 err = -ENOMEM;
1709 if (recurse) 1776 if (recurse)
@@ -1712,20 +1779,18 @@ static int do_loopback(struct path *path, char *old_name,
1712 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0); 1779 mnt = clone_mnt(old_path.mnt, old_path.dentry, 0);
1713 1780
1714 if (!mnt) 1781 if (!mnt)
1715 goto out; 1782 goto out2;
1716 1783
1717 err = graft_tree(mnt, path); 1784 err = graft_tree(mnt, path);
1718 if (err) { 1785 if (err) {
1719 LIST_HEAD(umount_list);
1720
1721 br_write_lock(vfsmount_lock); 1786 br_write_lock(vfsmount_lock);
1722 umount_tree(mnt, 0, &umount_list); 1787 umount_tree(mnt, 0, &umount_list);
1723 br_write_unlock(vfsmount_lock); 1788 br_write_unlock(vfsmount_lock);
1724 release_mounts(&umount_list);
1725 } 1789 }
1726 1790out2:
1791 unlock_mount(path);
1792 release_mounts(&umount_list);
1727out: 1793out:
1728 up_write(&namespace_sem);
1729 path_put(&old_path); 1794 path_put(&old_path);
1730 return err; 1795 return err;
1731} 1796}
@@ -1767,6 +1832,10 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
1767 if (path->dentry != path->mnt->mnt_root) 1832 if (path->dentry != path->mnt->mnt_root)
1768 return -EINVAL; 1833 return -EINVAL;
1769 1834
1835 err = security_sb_remount(sb, data);
1836 if (err)
1837 return err;
1838
1770 down_write(&sb->s_umount); 1839 down_write(&sb->s_umount);
1771 if (flags & MS_BIND) 1840 if (flags & MS_BIND)
1772 err = change_mount_flags(path->mnt, flags); 1841 err = change_mount_flags(path->mnt, flags);
@@ -1810,18 +1879,12 @@ static int do_move_mount(struct path *path, char *old_name)
1810 if (err) 1879 if (err)
1811 return err; 1880 return err;
1812 1881
1813 down_write(&namespace_sem); 1882 err = lock_mount(path);
1814 err = follow_down(path, true);
1815 if (err < 0) 1883 if (err < 0)
1816 goto out; 1884 goto out;
1817 1885
1818 err = -EINVAL; 1886 err = -EINVAL;
1819 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt)) 1887 if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
1820 goto out;
1821
1822 err = -ENOENT;
1823 mutex_lock(&path->dentry->d_inode->i_mutex);
1824 if (cant_mount(path->dentry))
1825 goto out1; 1888 goto out1;
1826 1889
1827 if (d_unlinked(path->dentry)) 1890 if (d_unlinked(path->dentry))
@@ -1863,16 +1926,87 @@ static int do_move_mount(struct path *path, char *old_name)
1863 * automatically */ 1926 * automatically */
1864 list_del_init(&old_path.mnt->mnt_expire); 1927 list_del_init(&old_path.mnt->mnt_expire);
1865out1: 1928out1:
1866 mutex_unlock(&path->dentry->d_inode->i_mutex); 1929 unlock_mount(path);
1867out: 1930out:
1868 up_write(&namespace_sem);
1869 if (!err) 1931 if (!err)
1870 path_put(&parent_path); 1932 path_put(&parent_path);
1871 path_put(&old_path); 1933 path_put(&old_path);
1872 return err; 1934 return err;
1873} 1935}
1874 1936
1875static int do_add_mount(struct vfsmount *, struct path *, int); 1937static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1938{
1939 int err;
1940 const char *subtype = strchr(fstype, '.');
1941 if (subtype) {
1942 subtype++;
1943 err = -EINVAL;
1944 if (!subtype[0])
1945 goto err;
1946 } else
1947 subtype = "";
1948
1949 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1950 err = -ENOMEM;
1951 if (!mnt->mnt_sb->s_subtype)
1952 goto err;
1953 return mnt;
1954
1955 err:
1956 mntput(mnt);
1957 return ERR_PTR(err);
1958}
1959
1960struct vfsmount *
1961do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1962{
1963 struct file_system_type *type = get_fs_type(fstype);
1964 struct vfsmount *mnt;
1965 if (!type)
1966 return ERR_PTR(-ENODEV);
1967 mnt = vfs_kern_mount(type, flags, name, data);
1968 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1969 !mnt->mnt_sb->s_subtype)
1970 mnt = fs_set_subtype(mnt, fstype);
1971 put_filesystem(type);
1972 return mnt;
1973}
1974EXPORT_SYMBOL_GPL(do_kern_mount);
1975
1976/*
1977 * add a mount into a namespace's mount tree
1978 */
1979static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1980{
1981 int err;
1982
1983 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1984
1985 err = lock_mount(path);
1986 if (err)
1987 return err;
1988
1989 err = -EINVAL;
1990 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1991 goto unlock;
1992
1993 /* Refuse the same filesystem on the same mount point */
1994 err = -EBUSY;
1995 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
1996 path->mnt->mnt_root == path->dentry)
1997 goto unlock;
1998
1999 err = -EINVAL;
2000 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
2001 goto unlock;
2002
2003 newmnt->mnt_flags = mnt_flags;
2004 err = graft_tree(newmnt, path);
2005
2006unlock:
2007 unlock_mount(path);
2008 return err;
2009}
1876 2010
1877/* 2011/*
1878 * create a new mount for userspace and request it to be added into the 2012 * create a new mount for userspace and request it to be added into the
@@ -1932,43 +2066,6 @@ fail:
1932 return err; 2066 return err;
1933} 2067}
1934 2068
1935/*
1936 * add a mount into a namespace's mount tree
1937 */
1938static int do_add_mount(struct vfsmount *newmnt, struct path *path, int mnt_flags)
1939{
1940 int err;
1941
1942 mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL);
1943
1944 down_write(&namespace_sem);
1945 /* Something was mounted here while we slept */
1946 err = follow_down(path, true);
1947 if (err < 0)
1948 goto unlock;
1949
1950 err = -EINVAL;
1951 if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
1952 goto unlock;
1953
1954 /* Refuse the same filesystem on the same mount point */
1955 err = -EBUSY;
1956 if (path->mnt->mnt_sb == newmnt->mnt_sb &&
1957 path->mnt->mnt_root == path->dentry)
1958 goto unlock;
1959
1960 err = -EINVAL;
1961 if (S_ISLNK(newmnt->mnt_root->d_inode->i_mode))
1962 goto unlock;
1963
1964 newmnt->mnt_flags = mnt_flags;
1965 err = graft_tree(newmnt, path);
1966
1967unlock:
1968 up_write(&namespace_sem);
1969 return err;
1970}
1971
1972/** 2069/**
1973 * mnt_set_expiry - Put a mount on an expiration list 2070 * mnt_set_expiry - Put a mount on an expiration list
1974 * @mnt: The mount to list. 2071 * @mnt: The mount to list.
@@ -2469,65 +2566,60 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2469 error = user_path_dir(new_root, &new); 2566 error = user_path_dir(new_root, &new);
2470 if (error) 2567 if (error)
2471 goto out0; 2568 goto out0;
2472 error = -EINVAL;
2473 if (!check_mnt(new.mnt))
2474 goto out1;
2475 2569
2476 error = user_path_dir(put_old, &old); 2570 error = user_path_dir(put_old, &old);
2477 if (error) 2571 if (error)
2478 goto out1; 2572 goto out1;
2479 2573
2480 error = security_sb_pivotroot(&old, &new); 2574 error = security_sb_pivotroot(&old, &new);
2481 if (error) { 2575 if (error)
2482 path_put(&old); 2576 goto out2;
2483 goto out1;
2484 }
2485 2577
2486 get_fs_root(current->fs, &root); 2578 get_fs_root(current->fs, &root);
2487 down_write(&namespace_sem); 2579 error = lock_mount(&old);
2488 mutex_lock(&old.dentry->d_inode->i_mutex); 2580 if (error)
2581 goto out3;
2582
2489 error = -EINVAL; 2583 error = -EINVAL;
2490 if (IS_MNT_SHARED(old.mnt) || 2584 if (IS_MNT_SHARED(old.mnt) ||
2491 IS_MNT_SHARED(new.mnt->mnt_parent) || 2585 IS_MNT_SHARED(new.mnt->mnt_parent) ||
2492 IS_MNT_SHARED(root.mnt->mnt_parent)) 2586 IS_MNT_SHARED(root.mnt->mnt_parent))
2493 goto out2; 2587 goto out4;
2494 if (!check_mnt(root.mnt)) 2588 if (!check_mnt(root.mnt) || !check_mnt(new.mnt))
2495 goto out2; 2589 goto out4;
2496 error = -ENOENT; 2590 error = -ENOENT;
2497 if (cant_mount(old.dentry))
2498 goto out2;
2499 if (d_unlinked(new.dentry)) 2591 if (d_unlinked(new.dentry))
2500 goto out2; 2592 goto out4;
2501 if (d_unlinked(old.dentry)) 2593 if (d_unlinked(old.dentry))
2502 goto out2; 2594 goto out4;
2503 error = -EBUSY; 2595 error = -EBUSY;
2504 if (new.mnt == root.mnt || 2596 if (new.mnt == root.mnt ||
2505 old.mnt == root.mnt) 2597 old.mnt == root.mnt)
2506 goto out2; /* loop, on the same file system */ 2598 goto out4; /* loop, on the same file system */
2507 error = -EINVAL; 2599 error = -EINVAL;
2508 if (root.mnt->mnt_root != root.dentry) 2600 if (root.mnt->mnt_root != root.dentry)
2509 goto out2; /* not a mountpoint */ 2601 goto out4; /* not a mountpoint */
2510 if (root.mnt->mnt_parent == root.mnt) 2602 if (root.mnt->mnt_parent == root.mnt)
2511 goto out2; /* not attached */ 2603 goto out4; /* not attached */
2512 if (new.mnt->mnt_root != new.dentry) 2604 if (new.mnt->mnt_root != new.dentry)
2513 goto out2; /* not a mountpoint */ 2605 goto out4; /* not a mountpoint */
2514 if (new.mnt->mnt_parent == new.mnt) 2606 if (new.mnt->mnt_parent == new.mnt)
2515 goto out2; /* not attached */ 2607 goto out4; /* not attached */
2516 /* make sure we can reach put_old from new_root */ 2608 /* make sure we can reach put_old from new_root */
2517 tmp = old.mnt; 2609 tmp = old.mnt;
2518 br_write_lock(vfsmount_lock);
2519 if (tmp != new.mnt) { 2610 if (tmp != new.mnt) {
2520 for (;;) { 2611 for (;;) {
2521 if (tmp->mnt_parent == tmp) 2612 if (tmp->mnt_parent == tmp)
2522 goto out3; /* already mounted on put_old */ 2613 goto out4; /* already mounted on put_old */
2523 if (tmp->mnt_parent == new.mnt) 2614 if (tmp->mnt_parent == new.mnt)
2524 break; 2615 break;
2525 tmp = tmp->mnt_parent; 2616 tmp = tmp->mnt_parent;
2526 } 2617 }
2527 if (!is_subdir(tmp->mnt_mountpoint, new.dentry)) 2618 if (!is_subdir(tmp->mnt_mountpoint, new.dentry))
2528 goto out3; 2619 goto out4;
2529 } else if (!is_subdir(old.dentry, new.dentry)) 2620 } else if (!is_subdir(old.dentry, new.dentry))
2530 goto out3; 2621 goto out4;
2622 br_write_lock(vfsmount_lock);
2531 detach_mnt(new.mnt, &parent_path); 2623 detach_mnt(new.mnt, &parent_path);
2532 detach_mnt(root.mnt, &root_parent); 2624 detach_mnt(root.mnt, &root_parent);
2533 /* mount old root on put_old */ 2625 /* mount old root on put_old */
@@ -2537,22 +2629,21 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2537 touch_mnt_namespace(current->nsproxy->mnt_ns); 2629 touch_mnt_namespace(current->nsproxy->mnt_ns);
2538 br_write_unlock(vfsmount_lock); 2630 br_write_unlock(vfsmount_lock);
2539 chroot_fs_refs(&root, &new); 2631 chroot_fs_refs(&root, &new);
2540
2541 error = 0; 2632 error = 0;
2542 path_put(&root_parent); 2633out4:
2543 path_put(&parent_path); 2634 unlock_mount(&old);
2544out2: 2635 if (!error) {
2545 mutex_unlock(&old.dentry->d_inode->i_mutex); 2636 path_put(&root_parent);
2546 up_write(&namespace_sem); 2637 path_put(&parent_path);
2638 }
2639out3:
2547 path_put(&root); 2640 path_put(&root);
2641out2:
2548 path_put(&old); 2642 path_put(&old);
2549out1: 2643out1:
2550 path_put(&new); 2644 path_put(&new);
2551out0: 2645out0:
2552 return error; 2646 return error;
2553out3:
2554 br_write_unlock(vfsmount_lock);
2555 goto out2;
2556} 2647}
2557 2648
2558static void __init init_mount_tree(void) 2649static void __init init_mount_tree(void)
@@ -2594,7 +2685,7 @@ void __init mnt_init(void)
2594 if (!mount_hashtable) 2685 if (!mount_hashtable)
2595 panic("Failed to allocate mount hash table\n"); 2686 panic("Failed to allocate mount hash table\n");
2596 2687
2597 printk("Mount-cache hash table entries: %lu\n", HASH_SIZE); 2688 printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
2598 2689
2599 for (u = 0; u < HASH_SIZE; u++) 2690 for (u = 0; u < HASH_SIZE; u++)
2600 INIT_LIST_HEAD(&mount_hashtable[u]); 2691 INIT_LIST_HEAD(&mount_hashtable[u]);
@@ -2627,3 +2718,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
2627 kfree(ns); 2718 kfree(ns);
2628} 2719}
2629EXPORT_SYMBOL(put_mnt_ns); 2720EXPORT_SYMBOL(put_mnt_ns);
2721
2722struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
2723{
2724 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
2725}
2726EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/ncpfs/Makefile b/fs/ncpfs/Makefile
index 68ea095100a8..c66af563f2ce 100644
--- a/fs/ncpfs/Makefile
+++ b/fs/ncpfs/Makefile
@@ -11,6 +11,6 @@ ncpfs-$(CONFIG_NCPFS_EXTRAS) += symlink.o
11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o 11ncpfs-$(CONFIG_NCPFS_NFS_NS) += symlink.o
12 12
13# If you want debugging output, please uncomment the following line 13# If you want debugging output, please uncomment the following line
14# EXTRA_CFLAGS += -DDEBUG_NCP=1 14# ccflags-y := -DDEBUG_NCP=1
15 15
16CFLAGS_ncplib_kernel.o := -finline-functions 16CFLAGS_ncplib_kernel.o := -finline-functions
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 00a1d1c3d3a4..0250e4ce4893 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -596,7 +596,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
596/* server->priv.data = NULL; */ 596/* server->priv.data = NULL; */
597 597
598 server->m = data; 598 server->m = data;
599 /* Althought anything producing this is buggy, it happens 599 /* Although anything producing this is buggy, it happens
600 now because of PATH_MAX changes.. */ 600 now because of PATH_MAX changes.. */
601 if (server->m.time_out < 1) { 601 if (server->m.time_out < 1) {
602 server->m.time_out = 10; 602 server->m.time_out = 10;
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 89587573fe50..2f41dccea18e 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -188,10 +188,10 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
188 rv = NFS4ERR_DELAY; 188 rv = NFS4ERR_DELAY;
189 list_del_init(&lo->plh_bulk_recall); 189 list_del_init(&lo->plh_bulk_recall);
190 spin_unlock(&ino->i_lock); 190 spin_unlock(&ino->i_lock);
191 pnfs_free_lseg_list(&free_me_list);
191 put_layout_hdr(lo); 192 put_layout_hdr(lo);
192 iput(ino); 193 iput(ino);
193 } 194 }
194 pnfs_free_lseg_list(&free_me_list);
195 return rv; 195 return rv;
196} 196}
197 197
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 14e0f9371d14..00ecf62ce7c1 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -241,7 +241,7 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
241 241
242 args->cbl_layout_type = ntohl(*p++); 242 args->cbl_layout_type = ntohl(*p++);
243 /* Depite the spec's xdr, iomode really belongs in the FILE switch, 243 /* Depite the spec's xdr, iomode really belongs in the FILE switch,
244 * as it is unuseable and ignored with the other types. 244 * as it is unusable and ignored with the other types.
245 */ 245 */
246 iomode = ntohl(*p++); 246 iomode = ntohl(*p++);
247 args->cbl_layoutchanged = ntohl(*p++); 247 args->cbl_layoutchanged = ntohl(*p++);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index bd3ca32879e7..139be9647d80 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -82,6 +82,11 @@ retry:
82#endif /* CONFIG_NFS_V4 */ 82#endif /* CONFIG_NFS_V4 */
83 83
84/* 84/*
85 * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
86 */
87static int nfs4_disable_idmapping = 0;
88
89/*
85 * RPC cruft for NFS 90 * RPC cruft for NFS
86 */ 91 */
87static struct rpc_version *nfs_version[5] = { 92static struct rpc_version *nfs_version[5] = {
@@ -481,7 +486,12 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
481 * Look up a client by IP address and protocol version 486 * Look up a client by IP address and protocol version
482 * - creates a new record if one doesn't yet exist 487 * - creates a new record if one doesn't yet exist
483 */ 488 */
484static struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) 489static struct nfs_client *
490nfs_get_client(const struct nfs_client_initdata *cl_init,
491 const struct rpc_timeout *timeparms,
492 const char *ip_addr,
493 rpc_authflavor_t authflavour,
494 int noresvport)
485{ 495{
486 struct nfs_client *clp, *new = NULL; 496 struct nfs_client *clp, *new = NULL;
487 int error; 497 int error;
@@ -512,6 +522,13 @@ install_client:
512 clp = new; 522 clp = new;
513 list_add(&clp->cl_share_link, &nfs_client_list); 523 list_add(&clp->cl_share_link, &nfs_client_list);
514 spin_unlock(&nfs_client_lock); 524 spin_unlock(&nfs_client_lock);
525
526 error = cl_init->rpc_ops->init_client(clp, timeparms, ip_addr,
527 authflavour, noresvport);
528 if (error < 0) {
529 nfs_put_client(clp);
530 return ERR_PTR(error);
531 }
515 dprintk("--> nfs_get_client() = %p [new]\n", clp); 532 dprintk("--> nfs_get_client() = %p [new]\n", clp);
516 return clp; 533 return clp;
517 534
@@ -767,9 +784,9 @@ static int nfs_init_server_rpcclient(struct nfs_server *server,
767/* 784/*
768 * Initialise an NFS2 or NFS3 client 785 * Initialise an NFS2 or NFS3 client
769 */ 786 */
770static int nfs_init_client(struct nfs_client *clp, 787int nfs_init_client(struct nfs_client *clp, const struct rpc_timeout *timeparms,
771 const struct rpc_timeout *timeparms, 788 const char *ip_addr, rpc_authflavor_t authflavour,
772 const struct nfs_parsed_mount_data *data) 789 int noresvport)
773{ 790{
774 int error; 791 int error;
775 792
@@ -784,7 +801,7 @@ static int nfs_init_client(struct nfs_client *clp,
784 * - RFC 2623, sec 2.3.2 801 * - RFC 2623, sec 2.3.2
785 */ 802 */
786 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 803 error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
787 0, data->flags & NFS_MOUNT_NORESVPORT); 804 0, noresvport);
788 if (error < 0) 805 if (error < 0)
789 goto error; 806 goto error;
790 nfs_mark_client_ready(clp, NFS_CS_READY); 807 nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -820,19 +837,17 @@ static int nfs_init_server(struct nfs_server *server,
820 cl_init.rpc_ops = &nfs_v3_clientops; 837 cl_init.rpc_ops = &nfs_v3_clientops;
821#endif 838#endif
822 839
840 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
841 data->timeo, data->retrans);
842
823 /* Allocate or find a client reference we can use */ 843 /* Allocate or find a client reference we can use */
824 clp = nfs_get_client(&cl_init); 844 clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX,
845 data->flags & NFS_MOUNT_NORESVPORT);
825 if (IS_ERR(clp)) { 846 if (IS_ERR(clp)) {
826 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); 847 dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
827 return PTR_ERR(clp); 848 return PTR_ERR(clp);
828 } 849 }
829 850
830 nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
831 data->timeo, data->retrans);
832 error = nfs_init_client(clp, &timeparms, data);
833 if (error < 0)
834 goto error;
835
836 server->nfs_client = clp; 851 server->nfs_client = clp;
837 852
838 /* Initialise the client representation from the mount data */ 853 /* Initialise the client representation from the mount data */
@@ -1009,14 +1024,19 @@ static void nfs_server_insert_lists(struct nfs_server *server)
1009 spin_lock(&nfs_client_lock); 1024 spin_lock(&nfs_client_lock);
1010 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks); 1025 list_add_tail_rcu(&server->client_link, &clp->cl_superblocks);
1011 list_add_tail(&server->master_link, &nfs_volume_list); 1026 list_add_tail(&server->master_link, &nfs_volume_list);
1027 clear_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1012 spin_unlock(&nfs_client_lock); 1028 spin_unlock(&nfs_client_lock);
1013 1029
1014} 1030}
1015 1031
1016static void nfs_server_remove_lists(struct nfs_server *server) 1032static void nfs_server_remove_lists(struct nfs_server *server)
1017{ 1033{
1034 struct nfs_client *clp = server->nfs_client;
1035
1018 spin_lock(&nfs_client_lock); 1036 spin_lock(&nfs_client_lock);
1019 list_del_rcu(&server->client_link); 1037 list_del_rcu(&server->client_link);
1038 if (clp && list_empty(&clp->cl_superblocks))
1039 set_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state);
1020 list_del(&server->master_link); 1040 list_del(&server->master_link);
1021 spin_unlock(&nfs_client_lock); 1041 spin_unlock(&nfs_client_lock);
1022 1042
@@ -1307,11 +1327,11 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
1307/* 1327/*
1308 * Initialise an NFS4 client record 1328 * Initialise an NFS4 client record
1309 */ 1329 */
1310static int nfs4_init_client(struct nfs_client *clp, 1330int nfs4_init_client(struct nfs_client *clp,
1311 const struct rpc_timeout *timeparms, 1331 const struct rpc_timeout *timeparms,
1312 const char *ip_addr, 1332 const char *ip_addr,
1313 rpc_authflavor_t authflavour, 1333 rpc_authflavor_t authflavour,
1314 int flags) 1334 int noresvport)
1315{ 1335{
1316 int error; 1336 int error;
1317 1337
@@ -1325,7 +1345,7 @@ static int nfs4_init_client(struct nfs_client *clp,
1325 clp->rpc_ops = &nfs_v4_clientops; 1345 clp->rpc_ops = &nfs_v4_clientops;
1326 1346
1327 error = nfs_create_rpc_client(clp, timeparms, authflavour, 1347 error = nfs_create_rpc_client(clp, timeparms, authflavour,
1328 1, flags & NFS_MOUNT_NORESVPORT); 1348 1, noresvport);
1329 if (error < 0) 1349 if (error < 0)
1330 goto error; 1350 goto error;
1331 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); 1351 strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1378,27 +1398,71 @@ static int nfs4_set_client(struct nfs_server *server,
1378 dprintk("--> nfs4_set_client()\n"); 1398 dprintk("--> nfs4_set_client()\n");
1379 1399
1380 /* Allocate or find a client reference we can use */ 1400 /* Allocate or find a client reference we can use */
1381 clp = nfs_get_client(&cl_init); 1401 clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour,
1402 server->flags & NFS_MOUNT_NORESVPORT);
1382 if (IS_ERR(clp)) { 1403 if (IS_ERR(clp)) {
1383 error = PTR_ERR(clp); 1404 error = PTR_ERR(clp);
1384 goto error; 1405 goto error;
1385 } 1406 }
1386 error = nfs4_init_client(clp, timeparms, ip_addr, authflavour, 1407
1387 server->flags); 1408 /*
1388 if (error < 0) 1409 * Query for the lease time on clientid setup or renewal
1389 goto error_put; 1410 *
1411 * Note that this will be set on nfs_clients that were created
1412 * only for the DS role and did not set this bit, but now will
1413 * serve a dual role.
1414 */
1415 set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state);
1390 1416
1391 server->nfs_client = clp; 1417 server->nfs_client = clp;
1392 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); 1418 dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp);
1393 return 0; 1419 return 0;
1394
1395error_put:
1396 nfs_put_client(clp);
1397error: 1420error:
1398 dprintk("<-- nfs4_set_client() = xerror %d\n", error); 1421 dprintk("<-- nfs4_set_client() = xerror %d\n", error);
1399 return error; 1422 return error;
1400} 1423}
1401 1424
1425/*
1426 * Set up a pNFS Data Server client.
1427 *
1428 * Return any existing nfs_client that matches server address,port,version
1429 * and minorversion.
1430 *
1431 * For a new nfs_client, use a soft mount (default), a low retrans and a
1432 * low timeout interval so that if a connection is lost, we retry through
1433 * the MDS.
1434 */
1435struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
1436 const struct sockaddr *ds_addr,
1437 int ds_addrlen, int ds_proto)
1438{
1439 struct nfs_client_initdata cl_init = {
1440 .addr = ds_addr,
1441 .addrlen = ds_addrlen,
1442 .rpc_ops = &nfs_v4_clientops,
1443 .proto = ds_proto,
1444 .minorversion = mds_clp->cl_minorversion,
1445 };
1446 struct rpc_timeout ds_timeout = {
1447 .to_initval = 15 * HZ,
1448 .to_maxval = 15 * HZ,
1449 .to_retries = 1,
1450 .to_exponential = 1,
1451 };
1452 struct nfs_client *clp;
1453
1454 /*
1455 * Set an authflavor equual to the MDS value. Use the MDS nfs_client
1456 * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
1457 * (section 13.1 RFC 5661).
1458 */
1459 clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
1460 mds_clp->cl_rpcclient->cl_auth->au_flavor, 0);
1461
1462 dprintk("<-- %s %p\n", __func__, clp);
1463 return clp;
1464}
1465EXPORT_SYMBOL(nfs4_set_ds_client);
1402 1466
1403/* 1467/*
1404 * Session has been established, and the client marked ready. 1468 * Session has been established, and the client marked ready.
@@ -1435,6 +1499,10 @@ static int nfs4_server_common_setup(struct nfs_server *server,
1435 BUG_ON(!server->nfs_client->rpc_ops); 1499 BUG_ON(!server->nfs_client->rpc_ops);
1436 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); 1500 BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops);
1437 1501
1502 /* data servers support only a subset of NFSv4.1 */
1503 if (is_ds_only_client(server->nfs_client))
1504 return -EPROTONOSUPPORT;
1505
1438 fattr = nfs_alloc_fattr(); 1506 fattr = nfs_alloc_fattr();
1439 if (fattr == NULL) 1507 if (fattr == NULL)
1440 return -ENOMEM; 1508 return -ENOMEM;
@@ -1504,6 +1572,13 @@ static int nfs4_init_server(struct nfs_server *server,
1504 if (error < 0) 1572 if (error < 0)
1505 goto error; 1573 goto error;
1506 1574
1575 /*
1576 * Don't use NFS uid/gid mapping if we're using AUTH_SYS or lower
1577 * authentication.
1578 */
1579 if (nfs4_disable_idmapping && data->auth_flavors[0] == RPC_AUTH_UNIX)
1580 server->caps |= NFS_CAP_UIDGID_NOMAP;
1581
1507 if (data->rsize) 1582 if (data->rsize)
1508 server->rsize = nfs_block_size(data->rsize, NULL); 1583 server->rsize = nfs_block_size(data->rsize, NULL);
1509 if (data->wsize) 1584 if (data->wsize)
@@ -1921,3 +1996,7 @@ void nfs_fs_proc_exit(void)
1921} 1996}
1922 1997
1923#endif /* CONFIG_PROC_FS */ 1998#endif /* CONFIG_PROC_FS */
1999
2000module_param(nfs4_disable_idmapping, bool, 0644);
2001MODULE_PARM_DESC(nfs4_disable_idmapping,
2002 "Turn off NFSv4 idmapping when using 'sec=sys'");
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c3eb33b904d..7237672216c8 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -44,6 +44,7 @@
44/* #define NFS_DEBUG_VERBOSE 1 */ 44/* #define NFS_DEBUG_VERBOSE 1 */
45 45
46static int nfs_opendir(struct inode *, struct file *); 46static int nfs_opendir(struct inode *, struct file *);
47static int nfs_closedir(struct inode *, struct file *);
47static int nfs_readdir(struct file *, void *, filldir_t); 48static int nfs_readdir(struct file *, void *, filldir_t);
48static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *); 49static struct dentry *nfs_lookup(struct inode *, struct dentry *, struct nameidata *);
49static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *); 50static int nfs_create(struct inode *, struct dentry *, int, struct nameidata *);
@@ -64,7 +65,7 @@ const struct file_operations nfs_dir_operations = {
64 .read = generic_read_dir, 65 .read = generic_read_dir,
65 .readdir = nfs_readdir, 66 .readdir = nfs_readdir,
66 .open = nfs_opendir, 67 .open = nfs_opendir,
67 .release = nfs_release, 68 .release = nfs_closedir,
68 .fsync = nfs_fsync_dir, 69 .fsync = nfs_fsync_dir,
69}; 70};
70 71
@@ -133,13 +134,35 @@ const struct inode_operations nfs4_dir_inode_operations = {
133 134
134#endif /* CONFIG_NFS_V4 */ 135#endif /* CONFIG_NFS_V4 */
135 136
137static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct rpc_cred *cred)
138{
139 struct nfs_open_dir_context *ctx;
140 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
141 if (ctx != NULL) {
142 ctx->duped = 0;
143 ctx->dir_cookie = 0;
144 ctx->dup_cookie = 0;
145 ctx->cred = get_rpccred(cred);
146 } else
147 ctx = ERR_PTR(-ENOMEM);
148 return ctx;
149}
150
151static void put_nfs_open_dir_context(struct nfs_open_dir_context *ctx)
152{
153 put_rpccred(ctx->cred);
154 kfree(ctx);
155}
156
136/* 157/*
137 * Open file 158 * Open file
138 */ 159 */
139static int 160static int
140nfs_opendir(struct inode *inode, struct file *filp) 161nfs_opendir(struct inode *inode, struct file *filp)
141{ 162{
142 int res; 163 int res = 0;
164 struct nfs_open_dir_context *ctx;
165 struct rpc_cred *cred;
143 166
144 dfprintk(FILE, "NFS: open dir(%s/%s)\n", 167 dfprintk(FILE, "NFS: open dir(%s/%s)\n",
145 filp->f_path.dentry->d_parent->d_name.name, 168 filp->f_path.dentry->d_parent->d_name.name,
@@ -147,8 +170,15 @@ nfs_opendir(struct inode *inode, struct file *filp)
147 170
148 nfs_inc_stats(inode, NFSIOS_VFSOPEN); 171 nfs_inc_stats(inode, NFSIOS_VFSOPEN);
149 172
150 /* Call generic open code in order to cache credentials */ 173 cred = rpc_lookup_cred();
151 res = nfs_open(inode, filp); 174 if (IS_ERR(cred))
175 return PTR_ERR(cred);
176 ctx = alloc_nfs_open_dir_context(cred);
177 if (IS_ERR(ctx)) {
178 res = PTR_ERR(ctx);
179 goto out;
180 }
181 filp->private_data = ctx;
152 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { 182 if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
153 /* This is a mountpoint, so d_revalidate will never 183 /* This is a mountpoint, so d_revalidate will never
154 * have been called, so we need to refresh the 184 * have been called, so we need to refresh the
@@ -156,9 +186,18 @@ nfs_opendir(struct inode *inode, struct file *filp)
156 */ 186 */
157 __nfs_revalidate_inode(NFS_SERVER(inode), inode); 187 __nfs_revalidate_inode(NFS_SERVER(inode), inode);
158 } 188 }
189out:
190 put_rpccred(cred);
159 return res; 191 return res;
160} 192}
161 193
194static int
195nfs_closedir(struct inode *inode, struct file *filp)
196{
197 put_nfs_open_dir_context(filp->private_data);
198 return 0;
199}
200
162struct nfs_cache_array_entry { 201struct nfs_cache_array_entry {
163 u64 cookie; 202 u64 cookie;
164 u64 ino; 203 u64 ino;
@@ -284,19 +323,20 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri
284{ 323{
285 loff_t diff = desc->file->f_pos - desc->current_index; 324 loff_t diff = desc->file->f_pos - desc->current_index;
286 unsigned int index; 325 unsigned int index;
326 struct nfs_open_dir_context *ctx = desc->file->private_data;
287 327
288 if (diff < 0) 328 if (diff < 0)
289 goto out_eof; 329 goto out_eof;
290 if (diff >= array->size) { 330 if (diff >= array->size) {
291 if (array->eof_index >= 0) 331 if (array->eof_index >= 0)
292 goto out_eof; 332 goto out_eof;
293 desc->current_index += array->size;
294 return -EAGAIN; 333 return -EAGAIN;
295 } 334 }
296 335
297 index = (unsigned int)diff; 336 index = (unsigned int)diff;
298 *desc->dir_cookie = array->array[index].cookie; 337 *desc->dir_cookie = array->array[index].cookie;
299 desc->cache_entry_index = index; 338 desc->cache_entry_index = index;
339 ctx->duped = 0;
300 return 0; 340 return 0;
301out_eof: 341out_eof:
302 desc->eof = 1; 342 desc->eof = 1;
@@ -307,10 +347,18 @@ static
307int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc) 347int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
308{ 348{
309 int i; 349 int i;
350 loff_t new_pos;
310 int status = -EAGAIN; 351 int status = -EAGAIN;
352 struct nfs_open_dir_context *ctx = desc->file->private_data;
311 353
312 for (i = 0; i < array->size; i++) { 354 for (i = 0; i < array->size; i++) {
313 if (array->array[i].cookie == *desc->dir_cookie) { 355 if (array->array[i].cookie == *desc->dir_cookie) {
356 new_pos = desc->current_index + i;
357 if (new_pos < desc->file->f_pos) {
358 ctx->dup_cookie = *desc->dir_cookie;
359 ctx->duped = 1;
360 }
361 desc->file->f_pos = new_pos;
314 desc->cache_entry_index = i; 362 desc->cache_entry_index = i;
315 return 0; 363 return 0;
316 } 364 }
@@ -342,6 +390,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
342 390
343 if (status == -EAGAIN) { 391 if (status == -EAGAIN) {
344 desc->last_cookie = array->last_cookie; 392 desc->last_cookie = array->last_cookie;
393 desc->current_index += array->size;
345 desc->page_index++; 394 desc->page_index++;
346 } 395 }
347 nfs_readdir_release_array(desc->page); 396 nfs_readdir_release_array(desc->page);
@@ -354,7 +403,8 @@ static
354int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc, 403int nfs_readdir_xdr_filler(struct page **pages, nfs_readdir_descriptor_t *desc,
355 struct nfs_entry *entry, struct file *file, struct inode *inode) 404 struct nfs_entry *entry, struct file *file, struct inode *inode)
356{ 405{
357 struct rpc_cred *cred = nfs_file_cred(file); 406 struct nfs_open_dir_context *ctx = file->private_data;
407 struct rpc_cred *cred = ctx->cred;
358 unsigned long timestamp, gencount; 408 unsigned long timestamp, gencount;
359 int error; 409 int error;
360 410
@@ -693,6 +743,20 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
693 int i = 0; 743 int i = 0;
694 int res = 0; 744 int res = 0;
695 struct nfs_cache_array *array = NULL; 745 struct nfs_cache_array *array = NULL;
746 struct nfs_open_dir_context *ctx = file->private_data;
747
748 if (ctx->duped != 0 && ctx->dup_cookie == *desc->dir_cookie) {
749 if (printk_ratelimit()) {
750 pr_notice("NFS: directory %s/%s contains a readdir loop. "
751 "Please contact your server vendor. "
752 "Offending cookie: %llu\n",
753 file->f_dentry->d_parent->d_name.name,
754 file->f_dentry->d_name.name,
755 *desc->dir_cookie);
756 }
757 res = -ELOOP;
758 goto out;
759 }
696 760
697 array = nfs_readdir_get_array(desc->page); 761 array = nfs_readdir_get_array(desc->page);
698 if (IS_ERR(array)) { 762 if (IS_ERR(array)) {
@@ -785,6 +849,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
785 struct inode *inode = dentry->d_inode; 849 struct inode *inode = dentry->d_inode;
786 nfs_readdir_descriptor_t my_desc, 850 nfs_readdir_descriptor_t my_desc,
787 *desc = &my_desc; 851 *desc = &my_desc;
852 struct nfs_open_dir_context *dir_ctx = filp->private_data;
788 int res; 853 int res;
789 854
790 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n", 855 dfprintk(FILE, "NFS: readdir(%s/%s) starting at cookie %llu\n",
@@ -801,7 +866,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
801 memset(desc, 0, sizeof(*desc)); 866 memset(desc, 0, sizeof(*desc));
802 867
803 desc->file = filp; 868 desc->file = filp;
804 desc->dir_cookie = &nfs_file_open_context(filp)->dir_cookie; 869 desc->dir_cookie = &dir_ctx->dir_cookie;
805 desc->decode = NFS_PROTO(inode)->decode_dirent; 870 desc->decode = NFS_PROTO(inode)->decode_dirent;
806 desc->plus = NFS_USE_READDIRPLUS(inode); 871 desc->plus = NFS_USE_READDIRPLUS(inode);
807 872
@@ -853,6 +918,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
853{ 918{
854 struct dentry *dentry = filp->f_path.dentry; 919 struct dentry *dentry = filp->f_path.dentry;
855 struct inode *inode = dentry->d_inode; 920 struct inode *inode = dentry->d_inode;
921 struct nfs_open_dir_context *dir_ctx = filp->private_data;
856 922
857 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n", 923 dfprintk(FILE, "NFS: llseek dir(%s/%s, %lld, %d)\n",
858 dentry->d_parent->d_name.name, 924 dentry->d_parent->d_name.name,
@@ -872,7 +938,8 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
872 } 938 }
873 if (offset != filp->f_pos) { 939 if (offset != filp->f_pos) {
874 filp->f_pos = offset; 940 filp->f_pos = offset;
875 nfs_file_open_context(filp)->dir_cookie = 0; 941 dir_ctx->dir_cookie = 0;
942 dir_ctx->duped = 0;
876 } 943 }
877out: 944out:
878 mutex_unlock(&inode->i_mutex); 945 mutex_unlock(&inode->i_mutex);
@@ -1068,7 +1135,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
1068 if (fhandle == NULL || fattr == NULL) 1135 if (fhandle == NULL || fattr == NULL)
1069 goto out_error; 1136 goto out_error;
1070 1137
1071 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1138 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1072 if (error) 1139 if (error)
1073 goto out_bad; 1140 goto out_bad;
1074 if (nfs_compare_fh(NFS_FH(inode), fhandle)) 1141 if (nfs_compare_fh(NFS_FH(inode), fhandle))
@@ -1169,11 +1236,23 @@ static void nfs_dentry_iput(struct dentry *dentry, struct inode *inode)
1169 iput(inode); 1236 iput(inode);
1170} 1237}
1171 1238
1239static void nfs_d_release(struct dentry *dentry)
1240{
1241 /* free cached devname value, if it survived that far */
1242 if (unlikely(dentry->d_fsdata)) {
1243 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
1244 WARN_ON(1);
1245 else
1246 kfree(dentry->d_fsdata);
1247 }
1248}
1249
1172const struct dentry_operations nfs_dentry_operations = { 1250const struct dentry_operations nfs_dentry_operations = {
1173 .d_revalidate = nfs_lookup_revalidate, 1251 .d_revalidate = nfs_lookup_revalidate,
1174 .d_delete = nfs_dentry_delete, 1252 .d_delete = nfs_dentry_delete,
1175 .d_iput = nfs_dentry_iput, 1253 .d_iput = nfs_dentry_iput,
1176 .d_automount = nfs_d_automount, 1254 .d_automount = nfs_d_automount,
1255 .d_release = nfs_d_release,
1177}; 1256};
1178 1257
1179static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 1258static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
@@ -1212,7 +1291,7 @@ static struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, stru
1212 parent = dentry->d_parent; 1291 parent = dentry->d_parent;
1213 /* Protect against concurrent sillydeletes */ 1292 /* Protect against concurrent sillydeletes */
1214 nfs_block_sillyrename(parent); 1293 nfs_block_sillyrename(parent);
1215 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1294 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1216 if (error == -ENOENT) 1295 if (error == -ENOENT)
1217 goto no_entry; 1296 goto no_entry;
1218 if (error < 0) { 1297 if (error < 0) {
@@ -1248,6 +1327,7 @@ const struct dentry_operations nfs4_dentry_operations = {
1248 .d_delete = nfs_dentry_delete, 1327 .d_delete = nfs_dentry_delete,
1249 .d_iput = nfs_dentry_iput, 1328 .d_iput = nfs_dentry_iput,
1250 .d_automount = nfs_d_automount, 1329 .d_automount = nfs_d_automount,
1330 .d_release = nfs_d_release,
1251}; 1331};
1252 1332
1253/* 1333/*
@@ -1549,7 +1629,7 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle,
1549 if (dentry->d_inode) 1629 if (dentry->d_inode)
1550 goto out; 1630 goto out;
1551 if (fhandle->size == 0) { 1631 if (fhandle->size == 0) {
1552 error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr); 1632 error = NFS_PROTO(dir)->lookup(NFS_SERVER(dir)->client, dir, &dentry->d_name, fhandle, fattr);
1553 if (error) 1633 if (error)
1554 goto out_error; 1634 goto out_error;
1555 } 1635 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 9943a75bb6d1..8eea25366717 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -45,6 +45,7 @@
45#include <linux/pagemap.h> 45#include <linux/pagemap.h>
46#include <linux/kref.h> 46#include <linux/kref.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/task_io_accounting_ops.h>
48 49
49#include <linux/nfs_fs.h> 50#include <linux/nfs_fs.h>
50#include <linux/nfs_page.h> 51#include <linux/nfs_page.h>
@@ -649,8 +650,7 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
649{ 650{
650 struct nfs_write_data *data = calldata; 651 struct nfs_write_data *data = calldata;
651 652
652 if (nfs_writeback_done(task, data) != 0) 653 nfs_writeback_done(task, data);
653 return;
654} 654}
655 655
656/* 656/*
@@ -938,6 +938,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
938 if (retval) 938 if (retval)
939 goto out; 939 goto out;
940 940
941 task_io_account_read(count);
942
941 retval = nfs_direct_read(iocb, iov, nr_segs, pos); 943 retval = nfs_direct_read(iocb, iov, nr_segs, pos);
942 if (retval > 0) 944 if (retval > 0)
943 iocb->ki_pos = pos + retval; 945 iocb->ki_pos = pos + retval;
@@ -999,6 +1001,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
999 if (retval) 1001 if (retval)
1000 goto out; 1002 goto out;
1001 1003
1004 task_io_account_write(count);
1005
1002 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); 1006 retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
1003 1007
1004 if (retval > 0) 1008 if (retval > 0)
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 7bf029ef4084..2f093ed16980 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -301,7 +301,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
301 * disk, but it retrieves and clears ctx->error after synching, despite 301 * disk, but it retrieves and clears ctx->error after synching, despite
302 * the two being set at the same time in nfs_context_set_write_error(). 302 * the two being set at the same time in nfs_context_set_write_error().
303 * This is because the former is used to notify the _next_ call to 303 * This is because the former is used to notify the _next_ call to
304 * nfs_file_write() that a write error occured, and hence cause it to 304 * nfs_file_write() that a write error occurred, and hence cause it to
305 * fall back to doing a synchronous write. 305 * fall back to doing a synchronous write.
306 */ 306 */
307static int 307static int
@@ -326,6 +326,9 @@ nfs_file_fsync(struct file *file, int datasync)
326 ret = xchg(&ctx->error, 0); 326 ret = xchg(&ctx->error, 0);
327 if (!ret && status < 0) 327 if (!ret && status < 0)
328 ret = status; 328 ret = status;
329 if (!ret && !datasync)
330 /* application has asked for meta-data sync */
331 ret = pnfs_layoutcommit_inode(inode, true);
329 return ret; 332 return ret;
330} 333}
331 334
@@ -387,10 +390,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
387 file->f_path.dentry->d_name.name, 390 file->f_path.dentry->d_name.name,
388 mapping->host->i_ino, len, (long long) pos); 391 mapping->host->i_ino, len, (long long) pos);
389 392
390 pnfs_update_layout(mapping->host,
391 nfs_file_open_context(file),
392 IOMODE_RW);
393
394start: 393start:
395 /* 394 /*
396 * Prevent starvation issues if someone is doing a consistency 395 * Prevent starvation issues if someone is doing a consistency
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b5ffe8fa291f..dcb61548887f 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -75,18 +75,25 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i
75/* 75/*
76 * get an NFS2/NFS3 root dentry from the root filehandle 76 * get an NFS2/NFS3 root dentry from the root filehandle
77 */ 77 */
78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh) 78struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh,
79 const char *devname)
79{ 80{
80 struct nfs_server *server = NFS_SB(sb); 81 struct nfs_server *server = NFS_SB(sb);
81 struct nfs_fsinfo fsinfo; 82 struct nfs_fsinfo fsinfo;
82 struct dentry *ret; 83 struct dentry *ret;
83 struct inode *inode; 84 struct inode *inode;
85 void *name = kstrdup(devname, GFP_KERNEL);
84 int error; 86 int error;
85 87
88 if (!name)
89 return ERR_PTR(-ENOMEM);
90
86 /* get the actual root for this mount */ 91 /* get the actual root for this mount */
87 fsinfo.fattr = nfs_alloc_fattr(); 92 fsinfo.fattr = nfs_alloc_fattr();
88 if (fsinfo.fattr == NULL) 93 if (fsinfo.fattr == NULL) {
94 kfree(name);
89 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 }
90 97
91 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo); 98 error = server->nfs_client->rpc_ops->getroot(server, mntfh, &fsinfo);
92 if (error < 0) { 99 if (error < 0) {
@@ -119,7 +126,15 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh)
119 } 126 }
120 127
121 security_d_instantiate(ret, inode); 128 security_d_instantiate(ret, inode);
129 spin_lock(&ret->d_lock);
130 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
131 ret->d_fsdata = name;
132 name = NULL;
133 }
134 spin_unlock(&ret->d_lock);
122out: 135out:
136 if (name)
137 kfree(name);
123 nfs_free_fattr(fsinfo.fattr); 138 nfs_free_fattr(fsinfo.fattr);
124 return ret; 139 return ret;
125} 140}
@@ -169,27 +184,35 @@ out:
169/* 184/*
170 * get an NFS4 root dentry from the root filehandle 185 * get an NFS4 root dentry from the root filehandle
171 */ 186 */
172struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh) 187struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh,
188 const char *devname)
173{ 189{
174 struct nfs_server *server = NFS_SB(sb); 190 struct nfs_server *server = NFS_SB(sb);
175 struct nfs_fattr *fattr = NULL; 191 struct nfs_fattr *fattr = NULL;
176 struct dentry *ret; 192 struct dentry *ret;
177 struct inode *inode; 193 struct inode *inode;
194 void *name = kstrdup(devname, GFP_KERNEL);
178 int error; 195 int error;
179 196
180 dprintk("--> nfs4_get_root()\n"); 197 dprintk("--> nfs4_get_root()\n");
181 198
199 if (!name)
200 return ERR_PTR(-ENOMEM);
201
182 /* get the info about the server and filesystem */ 202 /* get the info about the server and filesystem */
183 error = nfs4_server_capabilities(server, mntfh); 203 error = nfs4_server_capabilities(server, mntfh);
184 if (error < 0) { 204 if (error < 0) {
185 dprintk("nfs_get_root: getcaps error = %d\n", 205 dprintk("nfs_get_root: getcaps error = %d\n",
186 -error); 206 -error);
207 kfree(name);
187 return ERR_PTR(error); 208 return ERR_PTR(error);
188 } 209 }
189 210
190 fattr = nfs_alloc_fattr(); 211 fattr = nfs_alloc_fattr();
191 if (fattr == NULL) 212 if (fattr == NULL) {
192 return ERR_PTR(-ENOMEM);; 213 kfree(name);
214 return ERR_PTR(-ENOMEM);
215 }
193 216
194 /* get the actual root for this mount */ 217 /* get the actual root for this mount */
195 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr); 218 error = server->nfs_client->rpc_ops->getattr(server, mntfh, fattr);
@@ -199,6 +222,10 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
199 goto out; 222 goto out;
200 } 223 }
201 224
225 if (fattr->valid & NFS_ATTR_FATTR_FSID &&
226 !nfs_fsid_equal(&server->fsid, &fattr->fsid))
227 memcpy(&server->fsid, &fattr->fsid, sizeof(server->fsid));
228
202 inode = nfs_fhget(sb, mntfh, fattr); 229 inode = nfs_fhget(sb, mntfh, fattr);
203 if (IS_ERR(inode)) { 230 if (IS_ERR(inode)) {
204 dprintk("nfs_get_root: get root inode failed\n"); 231 dprintk("nfs_get_root: get root inode failed\n");
@@ -223,8 +250,15 @@ struct dentry *nfs4_get_root(struct super_block *sb, struct nfs_fh *mntfh)
223 } 250 }
224 251
225 security_d_instantiate(ret, inode); 252 security_d_instantiate(ret, inode);
226 253 spin_lock(&ret->d_lock);
254 if (IS_ROOT(ret) && !(ret->d_flags & DCACHE_NFSFS_RENAMED)) {
255 ret->d_fsdata = name;
256 name = NULL;
257 }
258 spin_unlock(&ret->d_lock);
227out: 259out:
260 if (name)
261 kfree(name);
228 nfs_free_fattr(fattr); 262 nfs_free_fattr(fattr);
229 dprintk("<-- nfs4_get_root()\n"); 263 dprintk("<-- nfs4_get_root()\n");
230 return ret; 264 return ret;
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 18696882f1c6..79664a1025af 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -33,16 +33,41 @@
33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 */ 35 */
36#include <linux/types.h>
37#include <linux/string.h>
38#include <linux/kernel.h>
39
40static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
41{
42 unsigned long val;
43 char buf[16];
44
45 if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf))
46 return 0;
47 memcpy(buf, name, namelen);
48 buf[namelen] = '\0';
49 if (strict_strtoul(buf, 0, &val) != 0)
50 return 0;
51 *res = val;
52 return 1;
53}
54
55static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
56{
57 return snprintf(buf, buflen, "%u", id);
58}
36 59
37#ifdef CONFIG_NFS_USE_NEW_IDMAPPER 60#ifdef CONFIG_NFS_USE_NEW_IDMAPPER
38 61
39#include <linux/slab.h> 62#include <linux/slab.h>
40#include <linux/cred.h> 63#include <linux/cred.h>
64#include <linux/sunrpc/sched.h>
65#include <linux/nfs4.h>
66#include <linux/nfs_fs_sb.h>
41#include <linux/nfs_idmap.h> 67#include <linux/nfs_idmap.h>
42#include <linux/keyctl.h> 68#include <linux/keyctl.h>
43#include <linux/key-type.h> 69#include <linux/key-type.h>
44#include <linux/rcupdate.h> 70#include <linux/rcupdate.h>
45#include <linux/kernel.h>
46#include <linux/err.h> 71#include <linux/err.h>
47 72
48#include <keys/user-type.h> 73#include <keys/user-type.h>
@@ -219,23 +244,39 @@ static int nfs_idmap_lookup_id(const char *name, size_t namelen,
219 return ret; 244 return ret;
220} 245}
221 246
222int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 247int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
223{ 248{
249 if (nfs_map_string_to_numeric(name, namelen, uid))
250 return 0;
224 return nfs_idmap_lookup_id(name, namelen, "uid", uid); 251 return nfs_idmap_lookup_id(name, namelen, "uid", uid);
225} 252}
226 253
227int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *gid) 254int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *gid)
228{ 255{
256 if (nfs_map_string_to_numeric(name, namelen, gid))
257 return 0;
229 return nfs_idmap_lookup_id(name, namelen, "gid", gid); 258 return nfs_idmap_lookup_id(name, namelen, "gid", gid);
230} 259}
231 260
232int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 261int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
233{ 262{
234 return nfs_idmap_lookup_name(uid, "user", buf, buflen); 263 int ret = -EINVAL;
264
265 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
266 ret = nfs_idmap_lookup_name(uid, "user", buf, buflen);
267 if (ret < 0)
268 ret = nfs_map_numeric_to_string(uid, buf, buflen);
269 return ret;
235} 270}
236int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t buflen) 271int nfs_map_gid_to_group(const struct nfs_server *server, __u32 gid, char *buf, size_t buflen)
237{ 272{
238 return nfs_idmap_lookup_name(gid, "group", buf, buflen); 273 int ret = -EINVAL;
274
275 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
276 ret = nfs_idmap_lookup_name(gid, "group", buf, buflen);
277 if (ret < 0)
278 ret = nfs_map_numeric_to_string(gid, buf, buflen);
279 return ret;
239} 280}
240 281
241#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */ 282#else /* CONFIG_NFS_USE_NEW_IDMAPPER not defined */
@@ -243,7 +284,6 @@ int nfs_map_gid_to_group(struct nfs_client *clp, __u32 gid, char *buf, size_t bu
243#include <linux/module.h> 284#include <linux/module.h>
244#include <linux/mutex.h> 285#include <linux/mutex.h>
245#include <linux/init.h> 286#include <linux/init.h>
246#include <linux/types.h>
247#include <linux/slab.h> 287#include <linux/slab.h>
248#include <linux/socket.h> 288#include <linux/socket.h>
249#include <linux/in.h> 289#include <linux/in.h>
@@ -695,31 +735,45 @@ static unsigned int fnvhash32(const void *buf, size_t buflen)
695 return hash; 735 return hash;
696} 736}
697 737
698int nfs_map_name_to_uid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 738int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
699{ 739{
700 struct idmap *idmap = clp->cl_idmap; 740 struct idmap *idmap = server->nfs_client->cl_idmap;
701 741
742 if (nfs_map_string_to_numeric(name, namelen, uid))
743 return 0;
702 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid); 744 return nfs_idmap_id(idmap, &idmap->idmap_user_hash, name, namelen, uid);
703} 745}
704 746
705int nfs_map_group_to_gid(struct nfs_client *clp, const char *name, size_t namelen, __u32 *uid) 747int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, __u32 *uid)
706{ 748{
707 struct idmap *idmap = clp->cl_idmap; 749 struct idmap *idmap = server->nfs_client->cl_idmap;
708 750
751 if (nfs_map_string_to_numeric(name, namelen, uid))
752 return 0;
709 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid); 753 return nfs_idmap_id(idmap, &idmap->idmap_group_hash, name, namelen, uid);
710} 754}
711 755
712int nfs_map_uid_to_name(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 756int nfs_map_uid_to_name(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
713{ 757{
714 struct idmap *idmap = clp->cl_idmap; 758 struct idmap *idmap = server->nfs_client->cl_idmap;
759 int ret = -EINVAL;
715 760
716 return nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf); 761 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
762 ret = nfs_idmap_name(idmap, &idmap->idmap_user_hash, uid, buf);
763 if (ret < 0)
764 ret = nfs_map_numeric_to_string(uid, buf, buflen);
765 return ret;
717} 766}
718int nfs_map_gid_to_group(struct nfs_client *clp, __u32 uid, char *buf, size_t buflen) 767int nfs_map_gid_to_group(const struct nfs_server *server, __u32 uid, char *buf, size_t buflen)
719{ 768{
720 struct idmap *idmap = clp->cl_idmap; 769 struct idmap *idmap = server->nfs_client->cl_idmap;
770 int ret = -EINVAL;
721 771
722 return nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf); 772 if (!(server->caps & NFS_CAP_UIDGID_NOMAP))
773 ret = nfs_idmap_name(idmap, &idmap->idmap_group_hash, uid, buf);
774 if (ret < 0)
775 ret = nfs_map_numeric_to_string(uid, buf, buflen);
776 return ret;
723} 777}
724 778
725#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */ 779#endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cc600e77bb4..57bb31ad7a5e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/inet.h> 37#include <linux/inet.h>
38#include <linux/nfs_xdr.h> 38#include <linux/nfs_xdr.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/compat.h>
40 41
41#include <asm/system.h> 42#include <asm/system.h>
42#include <asm/uaccess.h> 43#include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
89 */ 90 */
90u64 nfs_compat_user_ino64(u64 fileid) 91u64 nfs_compat_user_ino64(u64 fileid)
91{ 92{
92 int ino; 93#ifdef CONFIG_COMPAT
94 compat_ulong_t ino;
95#else
96 unsigned long ino;
97#endif
93 98
94 if (enable_ino64) 99 if (enable_ino64)
95 return fileid; 100 return fileid;
@@ -249,7 +254,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
249 struct inode *inode = ERR_PTR(-ENOENT); 254 struct inode *inode = ERR_PTR(-ENOENT);
250 unsigned long hash; 255 unsigned long hash;
251 256
252 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) 257 nfs_attr_check_mountpoint(sb, fattr);
258
259 if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0 && (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0)
253 goto out_no_inode; 260 goto out_no_inode;
254 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) 261 if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
255 goto out_no_inode; 262 goto out_no_inode;
@@ -293,8 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
293 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)) 300 if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS))
294 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); 301 set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
295 /* Deal with crossing mountpoints */ 302 /* Deal with crossing mountpoints */
296 if ((fattr->valid & NFS_ATTR_FATTR_FSID) 303 if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
297 && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) { 304 fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
298 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 305 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
299 inode->i_op = &nfs_referral_inode_operations; 306 inode->i_op = &nfs_referral_inode_operations;
300 else 307 else
@@ -634,7 +641,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct rpc_cr
634 ctx->mode = f_mode; 641 ctx->mode = f_mode;
635 ctx->flags = 0; 642 ctx->flags = 0;
636 ctx->error = 0; 643 ctx->error = 0;
637 ctx->dir_cookie = 0;
638 nfs_init_lock_context(&ctx->lock_context); 644 nfs_init_lock_context(&ctx->lock_context);
639 ctx->lock_context.open_context = ctx; 645 ctx->lock_context.open_context = ctx;
640 INIT_LIST_HEAD(&ctx->list); 646 INIT_LIST_HEAD(&ctx->list);
@@ -1466,6 +1472,7 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
1466 nfsi->delegation_state = 0; 1472 nfsi->delegation_state = 0;
1467 init_rwsem(&nfsi->rwsem); 1473 init_rwsem(&nfsi->rwsem);
1468 nfsi->layout = NULL; 1474 nfsi->layout = NULL;
1475 atomic_set(&nfsi->commits_outstanding, 0);
1469#endif 1476#endif
1470} 1477}
1471 1478
@@ -1513,7 +1520,7 @@ static int nfsiod_start(void)
1513{ 1520{
1514 struct workqueue_struct *wq; 1521 struct workqueue_struct *wq;
1515 dprintk("RPC: creating workqueue nfsiod\n"); 1522 dprintk("RPC: creating workqueue nfsiod\n");
1516 wq = alloc_workqueue("nfsiod", WQ_RESCUER, 0); 1523 wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM, 0);
1517 if (wq == NULL) 1524 if (wq == NULL)
1518 return -ENOMEM; 1525 return -ENOMEM;
1519 nfsiod_workqueue = wq; 1526 nfsiod_workqueue = wq;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index cf9fdbdabc67..ce118ce885dd 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -39,6 +39,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
39 return 0; 39 return 0;
40} 40}
41 41
42static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr)
43{
44 if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid))
45 fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT;
46}
47
42struct nfs_clone_mount { 48struct nfs_clone_mount {
43 const struct super_block *sb; 49 const struct super_block *sb;
44 const struct dentry *dentry; 50 const struct dentry *dentry;
@@ -148,6 +154,9 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
148 struct nfs_fattr *); 154 struct nfs_fattr *);
149extern void nfs_mark_client_ready(struct nfs_client *clp, int state); 155extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
150extern int nfs4_check_client_ready(struct nfs_client *clp); 156extern int nfs4_check_client_ready(struct nfs_client *clp);
157extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
158 const struct sockaddr *ds_addr,
159 int ds_addrlen, int ds_proto);
151#ifdef CONFIG_PROC_FS 160#ifdef CONFIG_PROC_FS
152extern int __init nfs_fs_proc_init(void); 161extern int __init nfs_fs_proc_init(void);
153extern void nfs_fs_proc_exit(void); 162extern void nfs_fs_proc_exit(void);
@@ -163,10 +172,10 @@ static inline void nfs_fs_proc_exit(void)
163 172
164/* nfs4namespace.c */ 173/* nfs4namespace.c */
165#ifdef CONFIG_NFS_V4 174#ifdef CONFIG_NFS_V4
166extern struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry); 175extern struct vfsmount *nfs_do_refmount(struct dentry *dentry);
167#else 176#else
168static inline 177static inline
169struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 178struct vfsmount *nfs_do_refmount(struct dentry *dentry)
170{ 179{
171 return ERR_PTR(-ENOENT); 180 return ERR_PTR(-ENOENT);
172} 181}
@@ -211,10 +220,17 @@ extern const u32 nfs41_maxwrite_overhead;
211/* nfs4proc.c */ 220/* nfs4proc.c */
212#ifdef CONFIG_NFS_V4 221#ifdef CONFIG_NFS_V4
213extern struct rpc_procinfo nfs4_procedures[]; 222extern struct rpc_procinfo nfs4_procedures[];
223void nfs_fixup_secinfo_attributes(struct nfs_fattr *, struct nfs_fh *);
214#endif 224#endif
215 225
226extern int nfs4_init_ds_session(struct nfs_client *clp);
227
216/* proc.c */ 228/* proc.c */
217void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 229void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
230extern int nfs_init_client(struct nfs_client *clp,
231 const struct rpc_timeout *timeparms,
232 const char *ip_addr, rpc_authflavor_t authflavour,
233 int noresvport);
218 234
219/* dir.c */ 235/* dir.c */
220extern int nfs_access_cache_shrinker(struct shrinker *shrink, 236extern int nfs_access_cache_shrinker(struct shrinker *shrink,
@@ -247,25 +263,45 @@ extern void nfs_sb_active(struct super_block *sb);
247extern void nfs_sb_deactive(struct super_block *sb); 263extern void nfs_sb_deactive(struct super_block *sb);
248 264
249/* namespace.c */ 265/* namespace.c */
250extern char *nfs_path(const char *base, 266extern char *nfs_path(char **p, struct dentry *dentry,
251 const struct dentry *droot,
252 const struct dentry *dentry,
253 char *buffer, ssize_t buflen); 267 char *buffer, ssize_t buflen);
254extern struct vfsmount *nfs_d_automount(struct path *path); 268extern struct vfsmount *nfs_d_automount(struct path *path);
255 269
256/* getroot.c */ 270/* getroot.c */
257extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *); 271extern struct dentry *nfs_get_root(struct super_block *, struct nfs_fh *,
272 const char *);
258#ifdef CONFIG_NFS_V4 273#ifdef CONFIG_NFS_V4
259extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *); 274extern struct dentry *nfs4_get_root(struct super_block *, struct nfs_fh *,
275 const char *);
260 276
261extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh); 277extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh);
262#endif 278#endif
263 279
264/* read.c */ 280/* read.c */
281extern int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
282 const struct rpc_call_ops *call_ops);
265extern void nfs_read_prepare(struct rpc_task *task, void *calldata); 283extern void nfs_read_prepare(struct rpc_task *task, void *calldata);
266 284
267/* write.c */ 285/* write.c */
286extern void nfs_commit_free(struct nfs_write_data *p);
287extern int nfs_initiate_write(struct nfs_write_data *data,
288 struct rpc_clnt *clnt,
289 const struct rpc_call_ops *call_ops,
290 int how);
268extern void nfs_write_prepare(struct rpc_task *task, void *calldata); 291extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
292extern int nfs_initiate_commit(struct nfs_write_data *data,
293 struct rpc_clnt *clnt,
294 const struct rpc_call_ops *call_ops,
295 int how);
296extern void nfs_init_commit(struct nfs_write_data *data,
297 struct list_head *head,
298 struct pnfs_layout_segment *lseg);
299void nfs_retry_commit(struct list_head *page_list,
300 struct pnfs_layout_segment *lseg);
301void nfs_commit_clear_lock(struct nfs_inode *nfsi);
302void nfs_commitdata_release(void *data);
303void nfs_commit_release_pages(struct nfs_write_data *data);
304
269#ifdef CONFIG_MIGRATION 305#ifdef CONFIG_MIGRATION
270extern int nfs_migrate_page(struct address_space *, 306extern int nfs_migrate_page(struct address_space *,
271 struct page *, struct page *); 307 struct page *, struct page *);
@@ -274,12 +310,21 @@ extern int nfs_migrate_page(struct address_space *,
274#endif 310#endif
275 311
276/* nfs4proc.c */ 312/* nfs4proc.c */
277extern int _nfs4_call_sync(struct nfs_server *server, 313extern void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data);
314extern int nfs4_init_client(struct nfs_client *clp,
315 const struct rpc_timeout *timeparms,
316 const char *ip_addr,
317 rpc_authflavor_t authflavour,
318 int noresvport);
319extern void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data);
320extern int _nfs4_call_sync(struct rpc_clnt *clnt,
321 struct nfs_server *server,
278 struct rpc_message *msg, 322 struct rpc_message *msg,
279 struct nfs4_sequence_args *args, 323 struct nfs4_sequence_args *args,
280 struct nfs4_sequence_res *res, 324 struct nfs4_sequence_res *res,
281 int cache_reply); 325 int cache_reply);
282extern int _nfs4_call_sync_session(struct nfs_server *server, 326extern int _nfs4_call_sync_session(struct rpc_clnt *clnt,
327 struct nfs_server *server,
283 struct rpc_message *msg, 328 struct rpc_message *msg,
284 struct nfs4_sequence_args *args, 329 struct nfs4_sequence_args *args,
285 struct nfs4_sequence_res *res, 330 struct nfs4_sequence_res *res,
@@ -288,12 +333,11 @@ extern int _nfs4_call_sync_session(struct nfs_server *server,
288/* 333/*
289 * Determine the device name as a string 334 * Determine the device name as a string
290 */ 335 */
291static inline char *nfs_devname(const struct vfsmount *mnt_parent, 336static inline char *nfs_devname(struct dentry *dentry,
292 const struct dentry *dentry,
293 char *buffer, ssize_t buflen) 337 char *buffer, ssize_t buflen)
294{ 338{
295 return nfs_path(mnt_parent->mnt_devname, mnt_parent->mnt_root, 339 char *dummy;
296 dentry, buffer, buflen); 340 return nfs_path(&dummy, dentry, buffer, buflen);
297} 341}
298 342
299/* 343/*
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f32b8603dca8..89fc160fd5b0 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -15,6 +15,7 @@
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/sunrpc/clnt.h> 16#include <linux/sunrpc/clnt.h>
17#include <linux/vfs.h> 17#include <linux/vfs.h>
18#include <linux/sunrpc/gss_api.h>
18#include "internal.h" 19#include "internal.h"
19 20
20#define NFSDBG_FACILITY NFSDBG_VFS 21#define NFSDBG_FACILITY NFSDBG_VFS
@@ -25,33 +26,31 @@ static LIST_HEAD(nfs_automount_list);
25static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); 26static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts);
26int nfs_mountpoint_expiry_timeout = 500 * HZ; 27int nfs_mountpoint_expiry_timeout = 500 * HZ;
27 28
28static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 29static struct vfsmount *nfs_do_submount(struct dentry *dentry,
29 const struct dentry *dentry,
30 struct nfs_fh *fh, 30 struct nfs_fh *fh,
31 struct nfs_fattr *fattr); 31 struct nfs_fattr *fattr,
32 rpc_authflavor_t authflavor);
32 33
33/* 34/*
34 * nfs_path - reconstruct the path given an arbitrary dentry 35 * nfs_path - reconstruct the path given an arbitrary dentry
35 * @base - arbitrary string to prepend to the path 36 * @base - used to return pointer to the end of devname part of path
36 * @droot - pointer to root dentry for mountpoint
37 * @dentry - pointer to dentry 37 * @dentry - pointer to dentry
38 * @buffer - result buffer 38 * @buffer - result buffer
39 * @buflen - length of buffer 39 * @buflen - length of buffer
40 * 40 *
41 * Helper function for constructing the path from the 41 * Helper function for constructing the server pathname
42 * root dentry to an arbitrary hashed dentry. 42 * by arbitrary hashed dentry.
43 * 43 *
44 * This is mainly for use in figuring out the path on the 44 * This is mainly for use in figuring out the path on the
45 * server side when automounting on top of an existing partition. 45 * server side when automounting on top of an existing partition
46 * and in generating /proc/mounts and friends.
46 */ 47 */
47char *nfs_path(const char *base, 48char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
48 const struct dentry *droot,
49 const struct dentry *dentry,
50 char *buffer, ssize_t buflen)
51{ 49{
52 char *end; 50 char *end;
53 int namelen; 51 int namelen;
54 unsigned seq; 52 unsigned seq;
53 const char *base;
55 54
56rename_retry: 55rename_retry:
57 end = buffer+buflen; 56 end = buffer+buflen;
@@ -60,7 +59,10 @@ rename_retry:
60 59
61 seq = read_seqbegin(&rename_lock); 60 seq = read_seqbegin(&rename_lock);
62 rcu_read_lock(); 61 rcu_read_lock();
63 while (!IS_ROOT(dentry) && dentry != droot) { 62 while (1) {
63 spin_lock(&dentry->d_lock);
64 if (IS_ROOT(dentry))
65 break;
64 namelen = dentry->d_name.len; 66 namelen = dentry->d_name.len;
65 buflen -= namelen + 1; 67 buflen -= namelen + 1;
66 if (buflen < 0) 68 if (buflen < 0)
@@ -68,27 +70,47 @@ rename_retry:
68 end -= namelen; 70 end -= namelen;
69 memcpy(end, dentry->d_name.name, namelen); 71 memcpy(end, dentry->d_name.name, namelen);
70 *--end = '/'; 72 *--end = '/';
73 spin_unlock(&dentry->d_lock);
71 dentry = dentry->d_parent; 74 dentry = dentry->d_parent;
72 } 75 }
73 rcu_read_unlock(); 76 if (read_seqretry(&rename_lock, seq)) {
74 if (read_seqretry(&rename_lock, seq)) 77 spin_unlock(&dentry->d_lock);
78 rcu_read_unlock();
75 goto rename_retry; 79 goto rename_retry;
80 }
76 if (*end != '/') { 81 if (*end != '/') {
77 if (--buflen < 0) 82 if (--buflen < 0) {
83 spin_unlock(&dentry->d_lock);
84 rcu_read_unlock();
78 goto Elong; 85 goto Elong;
86 }
79 *--end = '/'; 87 *--end = '/';
80 } 88 }
89 *p = end;
90 base = dentry->d_fsdata;
91 if (!base) {
92 spin_unlock(&dentry->d_lock);
93 rcu_read_unlock();
94 WARN_ON(1);
95 return end;
96 }
81 namelen = strlen(base); 97 namelen = strlen(base);
82 /* Strip off excess slashes in base string */ 98 /* Strip off excess slashes in base string */
83 while (namelen > 0 && base[namelen - 1] == '/') 99 while (namelen > 0 && base[namelen - 1] == '/')
84 namelen--; 100 namelen--;
85 buflen -= namelen; 101 buflen -= namelen;
86 if (buflen < 0) 102 if (buflen < 0) {
103 spin_unlock(&dentry->d_lock);
104 rcu_read_unlock();
87 goto Elong; 105 goto Elong;
106 }
88 end -= namelen; 107 end -= namelen;
89 memcpy(end, base, namelen); 108 memcpy(end, base, namelen);
109 spin_unlock(&dentry->d_lock);
110 rcu_read_unlock();
90 return end; 111 return end;
91Elong_unlock: 112Elong_unlock:
113 spin_unlock(&dentry->d_lock);
92 rcu_read_unlock(); 114 rcu_read_unlock();
93 if (read_seqretry(&rename_lock, seq)) 115 if (read_seqretry(&rename_lock, seq))
94 goto rename_retry; 116 goto rename_retry;
@@ -96,6 +118,99 @@ Elong:
96 return ERR_PTR(-ENAMETOOLONG); 118 return ERR_PTR(-ENAMETOOLONG);
97} 119}
98 120
121#ifdef CONFIG_NFS_V4
122static rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors, struct inode *inode)
123{
124 struct gss_api_mech *mech;
125 struct xdr_netobj oid;
126 int i;
127 rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX;
128
129 for (i = 0; i < flavors->num_flavors; i++) {
130 struct nfs4_secinfo_flavor *flavor;
131 flavor = &flavors->flavors[i];
132
133 if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) {
134 pseudoflavor = flavor->flavor;
135 break;
136 } else if (flavor->flavor == RPC_AUTH_GSS) {
137 oid.len = flavor->gss.sec_oid4.len;
138 oid.data = flavor->gss.sec_oid4.data;
139 mech = gss_mech_get_by_OID(&oid);
140 if (!mech)
141 continue;
142 pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service);
143 gss_mech_put(mech);
144 break;
145 }
146 }
147
148 return pseudoflavor;
149}
150
151static int nfs_negotiate_security(const struct dentry *parent,
152 const struct dentry *dentry,
153 rpc_authflavor_t *flavor)
154{
155 struct page *page;
156 struct nfs4_secinfo_flavors *flavors;
157 int (*secinfo)(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *);
158 int ret = -EPERM;
159
160 secinfo = NFS_PROTO(parent->d_inode)->secinfo;
161 if (secinfo != NULL) {
162 page = alloc_page(GFP_KERNEL);
163 if (!page) {
164 ret = -ENOMEM;
165 goto out;
166 }
167 flavors = page_address(page);
168 ret = secinfo(parent->d_inode, &dentry->d_name, flavors);
169 *flavor = nfs_find_best_sec(flavors, dentry->d_inode);
170 put_page(page);
171 }
172
173out:
174 return ret;
175}
176
177static int nfs_lookup_with_sec(struct nfs_server *server, struct dentry *parent,
178 struct dentry *dentry, struct path *path,
179 struct nfs_fh *fh, struct nfs_fattr *fattr,
180 rpc_authflavor_t *flavor)
181{
182 struct rpc_clnt *clone;
183 struct rpc_auth *auth;
184 int err;
185
186 err = nfs_negotiate_security(parent, path->dentry, flavor);
187 if (err < 0)
188 goto out;
189 clone = rpc_clone_client(server->client);
190 auth = rpcauth_create(*flavor, clone);
191 if (!auth) {
192 err = -EIO;
193 goto out_shutdown;
194 }
195 err = server->nfs_client->rpc_ops->lookup(clone, parent->d_inode,
196 &path->dentry->d_name,
197 fh, fattr);
198out_shutdown:
199 rpc_shutdown_client(clone);
200out:
201 return err;
202}
203#else /* CONFIG_NFS_V4 */
204static inline int nfs_lookup_with_sec(struct nfs_server *server,
205 struct dentry *parent, struct dentry *dentry,
206 struct path *path, struct nfs_fh *fh,
207 struct nfs_fattr *fattr,
208 rpc_authflavor_t *flavor)
209{
210 return -EPERM;
211}
212#endif /* CONFIG_NFS_V4 */
213
99/* 214/*
100 * nfs_d_automount - Handle crossing a mountpoint on the server 215 * nfs_d_automount - Handle crossing a mountpoint on the server
101 * @path - The mountpoint 216 * @path - The mountpoint
@@ -116,6 +231,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
116 struct nfs_fh *fh = NULL; 231 struct nfs_fh *fh = NULL;
117 struct nfs_fattr *fattr = NULL; 232 struct nfs_fattr *fattr = NULL;
118 int err; 233 int err;
234 rpc_authflavor_t flavor = RPC_AUTH_UNIX;
119 235
120 dprintk("--> nfs_d_automount()\n"); 236 dprintk("--> nfs_d_automount()\n");
121 237
@@ -133,9 +249,11 @@ struct vfsmount *nfs_d_automount(struct path *path)
133 249
134 /* Look it up again to get its attributes */ 250 /* Look it up again to get its attributes */
135 parent = dget_parent(path->dentry); 251 parent = dget_parent(path->dentry);
136 err = server->nfs_client->rpc_ops->lookup(parent->d_inode, 252 err = server->nfs_client->rpc_ops->lookup(server->client, parent->d_inode,
137 &path->dentry->d_name, 253 &path->dentry->d_name,
138 fh, fattr); 254 fh, fattr);
255 if (err == -EPERM && NFS_PROTO(parent->d_inode)->secinfo != NULL)
256 err = nfs_lookup_with_sec(server, parent, path->dentry, path, fh, fattr, &flavor);
139 dput(parent); 257 dput(parent);
140 if (err != 0) { 258 if (err != 0) {
141 mnt = ERR_PTR(err); 259 mnt = ERR_PTR(err);
@@ -143,9 +261,9 @@ struct vfsmount *nfs_d_automount(struct path *path)
143 } 261 }
144 262
145 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) 263 if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
146 mnt = nfs_do_refmount(path->mnt, path->dentry); 264 mnt = nfs_do_refmount(path->dentry);
147 else 265 else
148 mnt = nfs_do_submount(path->mnt, path->dentry, fh, fattr); 266 mnt = nfs_do_submount(path->dentry, fh, fattr, flavor);
149 if (IS_ERR(mnt)) 267 if (IS_ERR(mnt))
150 goto out; 268 goto out;
151 269
@@ -209,22 +327,23 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server,
209 327
210/** 328/**
211 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary 329 * nfs_do_submount - set up mountpoint when crossing a filesystem boundary
212 * @mnt_parent - mountpoint of parent directory
213 * @dentry - parent directory 330 * @dentry - parent directory
214 * @fh - filehandle for new root dentry 331 * @fh - filehandle for new root dentry
215 * @fattr - attributes for new root inode 332 * @fattr - attributes for new root inode
333 * @authflavor - security flavor to use when performing the mount
216 * 334 *
217 */ 335 */
218static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent, 336static struct vfsmount *nfs_do_submount(struct dentry *dentry,
219 const struct dentry *dentry,
220 struct nfs_fh *fh, 337 struct nfs_fh *fh,
221 struct nfs_fattr *fattr) 338 struct nfs_fattr *fattr,
339 rpc_authflavor_t authflavor)
222{ 340{
223 struct nfs_clone_mount mountdata = { 341 struct nfs_clone_mount mountdata = {
224 .sb = mnt_parent->mnt_sb, 342 .sb = dentry->d_sb,
225 .dentry = dentry, 343 .dentry = dentry,
226 .fh = fh, 344 .fh = fh,
227 .fattr = fattr, 345 .fattr = fattr,
346 .authflavor = authflavor,
228 }; 347 };
229 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 348 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
230 char *page = (char *) __get_free_page(GFP_USER); 349 char *page = (char *) __get_free_page(GFP_USER);
@@ -237,11 +356,11 @@ static struct vfsmount *nfs_do_submount(const struct vfsmount *mnt_parent,
237 dentry->d_name.name); 356 dentry->d_name.name);
238 if (page == NULL) 357 if (page == NULL)
239 goto out; 358 goto out;
240 devname = nfs_devname(mnt_parent, dentry, page, PAGE_SIZE); 359 devname = nfs_devname(dentry, page, PAGE_SIZE);
241 mnt = (struct vfsmount *)devname; 360 mnt = (struct vfsmount *)devname;
242 if (IS_ERR(devname)) 361 if (IS_ERR(devname))
243 goto free_page; 362 goto free_page;
244 mnt = nfs_do_clone_mount(NFS_SB(mnt_parent->mnt_sb), devname, &mountdata); 363 mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata);
245free_page: 364free_page:
246 free_page((unsigned long)page); 365 free_page((unsigned long)page);
247out: 366out:
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index ce939c062a52..38053d823eb0 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -141,7 +141,7 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
141} 141}
142 142
143static int 143static int
144nfs3_proc_lookup(struct inode *dir, struct qstr *name, 144nfs3_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
145 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 145 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
146{ 146{
147 struct nfs3_diropargs arg = { 147 struct nfs3_diropargs arg = {
@@ -885,4 +885,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
885 .lock = nfs3_proc_lock, 885 .lock = nfs3_proc_lock,
886 .clear_acl_cache = nfs3_forget_cached_acls, 886 .clear_acl_cache = nfs3_forget_cached_acls,
887 .close_context = nfs_close_context, 887 .close_context = nfs_close_context,
888 .init_client = nfs_init_client,
888}; 889};
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..e1c261ddd65d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -57,7 +57,8 @@ enum nfs4_session_state {
57struct nfs4_minor_version_ops { 57struct nfs4_minor_version_ops {
58 u32 minor_version; 58 u32 minor_version;
59 59
60 int (*call_sync)(struct nfs_server *server, 60 int (*call_sync)(struct rpc_clnt *clnt,
61 struct nfs_server *server,
61 struct rpc_message *msg, 62 struct rpc_message *msg,
62 struct nfs4_sequence_args *args, 63 struct nfs4_sequence_args *args,
63 struct nfs4_sequence_res *res, 64 struct nfs4_sequence_res *res,
@@ -252,6 +253,9 @@ static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *ser
252extern int nfs4_setup_sequence(const struct nfs_server *server, 253extern int nfs4_setup_sequence(const struct nfs_server *server,
253 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, 254 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
254 int cache_reply, struct rpc_task *task); 255 int cache_reply, struct rpc_task *task);
256extern int nfs41_setup_sequence(struct nfs4_session *session,
257 struct nfs4_sequence_args *args, struct nfs4_sequence_res *res,
258 int cache_reply, struct rpc_task *task);
255extern void nfs4_destroy_session(struct nfs4_session *session); 259extern void nfs4_destroy_session(struct nfs4_session *session);
256extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp); 260extern struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp);
257extern int nfs4_proc_create_session(struct nfs_client *); 261extern int nfs4_proc_create_session(struct nfs_client *);
@@ -259,6 +263,21 @@ extern int nfs4_proc_destroy_session(struct nfs4_session *);
259extern int nfs4_init_session(struct nfs_server *server); 263extern int nfs4_init_session(struct nfs_server *server);
260extern int nfs4_proc_get_lease_time(struct nfs_client *clp, 264extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
261 struct nfs_fsinfo *fsinfo); 265 struct nfs_fsinfo *fsinfo);
266extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
267 bool sync);
268
269static inline bool
270is_ds_only_client(struct nfs_client *clp)
271{
272 return (clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) ==
273 EXCHGID4_FLAG_USE_PNFS_DS;
274}
275
276static inline bool
277is_ds_client(struct nfs_client *clp)
278{
279 return clp->cl_exchange_flags & EXCHGID4_FLAG_USE_PNFS_DS;
280}
262#else /* CONFIG_NFS_v4_1 */ 281#else /* CONFIG_NFS_v4_1 */
263static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) 282static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server)
264{ 283{
@@ -276,6 +295,18 @@ static inline int nfs4_init_session(struct nfs_server *server)
276{ 295{
277 return 0; 296 return 0;
278} 297}
298
299static inline bool
300is_ds_only_client(struct nfs_client *clp)
301{
302 return false;
303}
304
305static inline bool
306is_ds_client(struct nfs_client *clp)
307{
308 return false;
309}
279#endif /* CONFIG_NFS_V4_1 */ 310#endif /* CONFIG_NFS_V4_1 */
280 311
281extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; 312extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[];
@@ -298,6 +329,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
298#if defined(CONFIG_NFS_V4_1) 329#if defined(CONFIG_NFS_V4_1)
299struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp); 330struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
300struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp); 331struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
332extern void nfs4_schedule_session_recovery(struct nfs4_session *);
333#else
334static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
335{
336}
301#endif /* CONFIG_NFS_V4_1 */ 337#endif /* CONFIG_NFS_V4_1 */
302 338
303extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *); 339extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,10 +343,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
307extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t); 343extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
308extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t); 344extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
309extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 345extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
310extern void nfs4_schedule_state_recovery(struct nfs_client *); 346extern void nfs4_schedule_lease_recovery(struct nfs_client *);
311extern void nfs4_schedule_state_manager(struct nfs_client *); 347extern void nfs4_schedule_state_manager(struct nfs_client *);
312extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); 348extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
313extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
314extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 349extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
315extern void nfs41_handle_recall_slot(struct nfs_client *clp); 350extern void nfs41_handle_recall_slot(struct nfs_client *clp);
316extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); 351extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index 23f930caf1e2..6f8192f4cfc7 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -40,32 +40,370 @@ MODULE_LICENSE("GPL");
40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>"); 40MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
41MODULE_DESCRIPTION("The NFSv4 file layout driver"); 41MODULE_DESCRIPTION("The NFSv4 file layout driver");
42 42
43static int 43#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
44filelayout_set_layoutdriver(struct nfs_server *nfss) 44
45{ 45static loff_t
46 int status = pnfs_alloc_init_deviceid_cache(nfss->nfs_client, 46filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
47 nfs4_fl_free_deviceid_callback); 47 loff_t offset)
48 if (status) { 48{
49 printk(KERN_WARNING "%s: deviceid cache could not be " 49 u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
50 "initialized\n", __func__); 50 u64 tmp;
51 return status; 51
52 offset -= flseg->pattern_offset;
53 tmp = offset;
54 do_div(tmp, stripe_width);
55
56 return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
57}
58
59/* This function is used by the layout driver to calculate the
60 * offset of the file on the dserver based on whether the
61 * layout type is STRIPE_DENSE or STRIPE_SPARSE
62 */
63static loff_t
64filelayout_get_dserver_offset(struct pnfs_layout_segment *lseg, loff_t offset)
65{
66 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
67
68 switch (flseg->stripe_type) {
69 case STRIPE_SPARSE:
70 return offset;
71
72 case STRIPE_DENSE:
73 return filelayout_get_dense_offset(flseg, offset);
74 }
75
76 BUG();
77}
78
79/* For data server errors we don't recover from */
80static void
81filelayout_set_lo_fail(struct pnfs_layout_segment *lseg)
82{
83 if (lseg->pls_range.iomode == IOMODE_RW) {
84 dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
85 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
86 } else {
87 dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
88 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
89 }
90}
91
92static int filelayout_async_handle_error(struct rpc_task *task,
93 struct nfs4_state *state,
94 struct nfs_client *clp,
95 int *reset)
96{
97 if (task->tk_status >= 0)
98 return 0;
99
100 *reset = 0;
101
102 switch (task->tk_status) {
103 case -NFS4ERR_BADSESSION:
104 case -NFS4ERR_BADSLOT:
105 case -NFS4ERR_BAD_HIGH_SLOT:
106 case -NFS4ERR_DEADSESSION:
107 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
108 case -NFS4ERR_SEQ_FALSE_RETRY:
109 case -NFS4ERR_SEQ_MISORDERED:
110 dprintk("%s ERROR %d, Reset session. Exchangeid "
111 "flags 0x%x\n", __func__, task->tk_status,
112 clp->cl_exchange_flags);
113 nfs4_schedule_session_recovery(clp->cl_session);
114 break;
115 case -NFS4ERR_DELAY:
116 case -NFS4ERR_GRACE:
117 case -EKEYEXPIRED:
118 rpc_delay(task, FILELAYOUT_POLL_RETRY_MAX);
119 break;
120 default:
121 dprintk("%s DS error. Retry through MDS %d\n", __func__,
122 task->tk_status);
123 *reset = 1;
124 break;
125 }
126 task->tk_status = 0;
127 return -EAGAIN;
128}
129
130/* NFS_PROTO call done callback routines */
131
132static int filelayout_read_done_cb(struct rpc_task *task,
133 struct nfs_read_data *data)
134{
135 struct nfs_client *clp = data->ds_clp;
136 int reset = 0;
137
138 dprintk("%s DS read\n", __func__);
139
140 if (filelayout_async_handle_error(task, data->args.context->state,
141 data->ds_clp, &reset) == -EAGAIN) {
142 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
143 __func__, data->ds_clp, data->ds_clp->cl_session);
144 if (reset) {
145 filelayout_set_lo_fail(data->lseg);
146 nfs4_reset_read(task, data);
147 clp = NFS_SERVER(data->inode)->nfs_client;
148 }
149 nfs_restart_rpc(task, clp);
150 return -EAGAIN;
52 } 151 }
53 dprintk("%s: deviceid cache has been initialized successfully\n", 152
54 __func__);
55 return 0; 153 return 0;
56} 154}
57 155
58/* Clear out the layout by destroying its device list */ 156/*
59static int 157 * We reference the rpc_cred of the first WRITE that triggers the need for
60filelayout_clear_layoutdriver(struct nfs_server *nfss) 158 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
159 * rfc5661 is not clear about which credential should be used.
160 */
161static void
162filelayout_set_layoutcommit(struct nfs_write_data *wdata)
61{ 163{
62 dprintk("--> %s\n", __func__); 164 if (FILELAYOUT_LSEG(wdata->lseg)->commit_through_mds ||
165 wdata->res.verf->committed == NFS_FILE_SYNC)
166 return;
167
168 pnfs_set_layoutcommit(wdata);
169 dprintk("%s ionde %lu pls_end_pos %lu\n", __func__, wdata->inode->i_ino,
170 (unsigned long) wdata->lseg->pls_end_pos);
171}
172
173/*
174 * Call ops for the async read/write cases
175 * In the case of dense layouts, the offset needs to be reset to its
176 * original value.
177 */
178static void filelayout_read_prepare(struct rpc_task *task, void *data)
179{
180 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
181
182 rdata->read_done_cb = filelayout_read_done_cb;
183
184 if (nfs41_setup_sequence(rdata->ds_clp->cl_session,
185 &rdata->args.seq_args, &rdata->res.seq_res,
186 0, task))
187 return;
188
189 rpc_call_start(task);
190}
191
192static void filelayout_read_call_done(struct rpc_task *task, void *data)
193{
194 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
195
196 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
197
198 /* Note this may cause RPC to be resent */
199 rdata->mds_ops->rpc_call_done(task, data);
200}
201
202static void filelayout_read_release(void *data)
203{
204 struct nfs_read_data *rdata = (struct nfs_read_data *)data;
205
206 rdata->mds_ops->rpc_release(data);
207}
208
209static int filelayout_write_done_cb(struct rpc_task *task,
210 struct nfs_write_data *data)
211{
212 int reset = 0;
213
214 if (filelayout_async_handle_error(task, data->args.context->state,
215 data->ds_clp, &reset) == -EAGAIN) {
216 struct nfs_client *clp;
217
218 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
219 __func__, data->ds_clp, data->ds_clp->cl_session);
220 if (reset) {
221 filelayout_set_lo_fail(data->lseg);
222 nfs4_reset_write(task, data);
223 clp = NFS_SERVER(data->inode)->nfs_client;
224 } else
225 clp = data->ds_clp;
226 nfs_restart_rpc(task, clp);
227 return -EAGAIN;
228 }
63 229
64 if (nfss->nfs_client->cl_devid_cache) 230 filelayout_set_layoutcommit(data);
65 pnfs_put_deviceid_cache(nfss->nfs_client);
66 return 0; 231 return 0;
67} 232}
68 233
234/* Fake up some data that will cause nfs_commit_release to retry the writes. */
235static void prepare_to_resend_writes(struct nfs_write_data *data)
236{
237 struct nfs_page *first = nfs_list_entry(data->pages.next);
238
239 data->task.tk_status = 0;
240 memcpy(data->verf.verifier, first->wb_verf.verifier,
241 sizeof(first->wb_verf.verifier));
242 data->verf.verifier[0]++; /* ensure verifier mismatch */
243}
244
245static int filelayout_commit_done_cb(struct rpc_task *task,
246 struct nfs_write_data *data)
247{
248 int reset = 0;
249
250 if (filelayout_async_handle_error(task, data->args.context->state,
251 data->ds_clp, &reset) == -EAGAIN) {
252 dprintk("%s calling restart ds_clp %p ds_clp->cl_session %p\n",
253 __func__, data->ds_clp, data->ds_clp->cl_session);
254 if (reset) {
255 prepare_to_resend_writes(data);
256 filelayout_set_lo_fail(data->lseg);
257 } else
258 nfs_restart_rpc(task, data->ds_clp);
259 return -EAGAIN;
260 }
261
262 return 0;
263}
264
265static void filelayout_write_prepare(struct rpc_task *task, void *data)
266{
267 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
268
269 if (nfs41_setup_sequence(wdata->ds_clp->cl_session,
270 &wdata->args.seq_args, &wdata->res.seq_res,
271 0, task))
272 return;
273
274 rpc_call_start(task);
275}
276
277static void filelayout_write_call_done(struct rpc_task *task, void *data)
278{
279 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
280
281 /* Note this may cause RPC to be resent */
282 wdata->mds_ops->rpc_call_done(task, data);
283}
284
285static void filelayout_write_release(void *data)
286{
287 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
288
289 wdata->mds_ops->rpc_release(data);
290}
291
292static void filelayout_commit_release(void *data)
293{
294 struct nfs_write_data *wdata = (struct nfs_write_data *)data;
295
296 nfs_commit_release_pages(wdata);
297 if (atomic_dec_and_test(&NFS_I(wdata->inode)->commits_outstanding))
298 nfs_commit_clear_lock(NFS_I(wdata->inode));
299 nfs_commitdata_release(wdata);
300}
301
302struct rpc_call_ops filelayout_read_call_ops = {
303 .rpc_call_prepare = filelayout_read_prepare,
304 .rpc_call_done = filelayout_read_call_done,
305 .rpc_release = filelayout_read_release,
306};
307
308struct rpc_call_ops filelayout_write_call_ops = {
309 .rpc_call_prepare = filelayout_write_prepare,
310 .rpc_call_done = filelayout_write_call_done,
311 .rpc_release = filelayout_write_release,
312};
313
314struct rpc_call_ops filelayout_commit_call_ops = {
315 .rpc_call_prepare = filelayout_write_prepare,
316 .rpc_call_done = filelayout_write_call_done,
317 .rpc_release = filelayout_commit_release,
318};
319
320static enum pnfs_try_status
321filelayout_read_pagelist(struct nfs_read_data *data)
322{
323 struct pnfs_layout_segment *lseg = data->lseg;
324 struct nfs4_pnfs_ds *ds;
325 loff_t offset = data->args.offset;
326 u32 j, idx;
327 struct nfs_fh *fh;
328 int status;
329
330 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
331 __func__, data->inode->i_ino,
332 data->args.pgbase, (size_t)data->args.count, offset);
333
334 /* Retrieve the correct rpc_client for the byte range */
335 j = nfs4_fl_calc_j_index(lseg, offset);
336 idx = nfs4_fl_calc_ds_index(lseg, j);
337 ds = nfs4_fl_prepare_ds(lseg, idx);
338 if (!ds) {
339 /* Either layout fh index faulty, or ds connect failed */
340 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
341 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
342 return PNFS_NOT_ATTEMPTED;
343 }
344 dprintk("%s USE DS:ip %x %hu\n", __func__,
345 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
346
347 /* No multipath support. Use first DS */
348 data->ds_clp = ds->ds_clp;
349 fh = nfs4_fl_select_ds_fh(lseg, j);
350 if (fh)
351 data->args.fh = fh;
352
353 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
354 data->mds_offset = offset;
355
356 /* Perform an asynchronous read to ds */
357 status = nfs_initiate_read(data, ds->ds_clp->cl_rpcclient,
358 &filelayout_read_call_ops);
359 BUG_ON(status != 0);
360 return PNFS_ATTEMPTED;
361}
362
363/* Perform async writes. */
364static enum pnfs_try_status
365filelayout_write_pagelist(struct nfs_write_data *data, int sync)
366{
367 struct pnfs_layout_segment *lseg = data->lseg;
368 struct nfs4_pnfs_ds *ds;
369 loff_t offset = data->args.offset;
370 u32 j, idx;
371 struct nfs_fh *fh;
372 int status;
373
374 /* Retrieve the correct rpc_client for the byte range */
375 j = nfs4_fl_calc_j_index(lseg, offset);
376 idx = nfs4_fl_calc_ds_index(lseg, j);
377 ds = nfs4_fl_prepare_ds(lseg, idx);
378 if (!ds) {
379 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
380 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
381 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
382 return PNFS_NOT_ATTEMPTED;
383 }
384 dprintk("%s ino %lu sync %d req %Zu@%llu DS:%x:%hu\n", __func__,
385 data->inode->i_ino, sync, (size_t) data->args.count, offset,
386 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
387
388 data->write_done_cb = filelayout_write_done_cb;
389 data->ds_clp = ds->ds_clp;
390 fh = nfs4_fl_select_ds_fh(lseg, j);
391 if (fh)
392 data->args.fh = fh;
393 /*
394 * Get the file offset on the dserver. Set the write offset to
395 * this offset and save the original offset.
396 */
397 data->args.offset = filelayout_get_dserver_offset(lseg, offset);
398 data->mds_offset = offset;
399
400 /* Perform an asynchronous write */
401 status = nfs_initiate_write(data, ds->ds_clp->cl_rpcclient,
402 &filelayout_write_call_ops, sync);
403 BUG_ON(status != 0);
404 return PNFS_ATTEMPTED;
405}
406
69/* 407/*
70 * filelayout_check_layout() 408 * filelayout_check_layout()
71 * 409 *
@@ -92,14 +430,14 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo,
92 goto out; 430 goto out;
93 } 431 }
94 432
95 if (fl->stripe_unit % PAGE_SIZE) { 433 if (!fl->stripe_unit || fl->stripe_unit % PAGE_SIZE) {
96 dprintk("%s Stripe unit (%u) not page aligned\n", 434 dprintk("%s Invalid stripe unit (%u)\n",
97 __func__, fl->stripe_unit); 435 __func__, fl->stripe_unit);
98 goto out; 436 goto out;
99 } 437 }
100 438
101 /* find and reference the deviceid */ 439 /* find and reference the deviceid */
102 dsaddr = nfs4_fl_find_get_deviceid(nfss->nfs_client, id); 440 dsaddr = nfs4_fl_find_get_deviceid(id);
103 if (dsaddr == NULL) { 441 if (dsaddr == NULL) {
104 dsaddr = get_device_info(lo->plh_inode, id); 442 dsaddr = get_device_info(lo->plh_inode, id);
105 if (dsaddr == NULL) 443 if (dsaddr == NULL)
@@ -134,7 +472,7 @@ out:
134 dprintk("--> %s returns %d\n", __func__, status); 472 dprintk("--> %s returns %d\n", __func__, status);
135 return status; 473 return status;
136out_put: 474out_put:
137 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, &dsaddr->deviceid); 475 nfs4_fl_put_deviceid(dsaddr);
138 goto out; 476 goto out;
139} 477}
140 478
@@ -164,12 +502,33 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
164 struct nfs4_layoutget_res *lgr, 502 struct nfs4_layoutget_res *lgr,
165 struct nfs4_deviceid *id) 503 struct nfs4_deviceid *id)
166{ 504{
167 uint32_t *p = (uint32_t *)lgr->layout.buf; 505 struct xdr_stream stream;
506 struct xdr_buf buf = {
507 .pages = lgr->layoutp->pages,
508 .page_len = lgr->layoutp->len,
509 .buflen = lgr->layoutp->len,
510 .len = lgr->layoutp->len,
511 };
512 struct page *scratch;
513 __be32 *p;
168 uint32_t nfl_util; 514 uint32_t nfl_util;
169 int i; 515 int i;
170 516
171 dprintk("%s: set_layout_map Begin\n", __func__); 517 dprintk("%s: set_layout_map Begin\n", __func__);
172 518
519 scratch = alloc_page(GFP_KERNEL);
520 if (!scratch)
521 return -ENOMEM;
522
523 xdr_init_decode(&stream, &buf, NULL);
524 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
525
526 /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
527 * num_fh (4) */
528 p = xdr_inline_decode(&stream, NFS4_DEVICEID4_SIZE + 20);
529 if (unlikely(!p))
530 goto out_err;
531
173 memcpy(id, p, sizeof(*id)); 532 memcpy(id, p, sizeof(*id));
174 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); 533 p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);
175 print_deviceid(id); 534 print_deviceid(id);
@@ -191,32 +550,57 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
191 __func__, nfl_util, fl->num_fh, fl->first_stripe_index, 550 __func__, nfl_util, fl->num_fh, fl->first_stripe_index,
192 fl->pattern_offset); 551 fl->pattern_offset);
193 552
553 if (!fl->num_fh)
554 goto out_err;
555
194 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *), 556 fl->fh_array = kzalloc(fl->num_fh * sizeof(struct nfs_fh *),
195 GFP_KERNEL); 557 GFP_KERNEL);
196 if (!fl->fh_array) 558 if (!fl->fh_array)
197 return -ENOMEM; 559 goto out_err;
198 560
199 for (i = 0; i < fl->num_fh; i++) { 561 for (i = 0; i < fl->num_fh; i++) {
200 /* Do we want to use a mempool here? */ 562 /* Do we want to use a mempool here? */
201 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL); 563 fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), GFP_KERNEL);
202 if (!fl->fh_array[i]) { 564 if (!fl->fh_array[i])
203 filelayout_free_fh_array(fl); 565 goto out_err_free;
204 return -ENOMEM; 566
205 } 567 p = xdr_inline_decode(&stream, 4);
568 if (unlikely(!p))
569 goto out_err_free;
206 fl->fh_array[i]->size = be32_to_cpup(p++); 570 fl->fh_array[i]->size = be32_to_cpup(p++);
207 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { 571 if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
208 printk(KERN_ERR "Too big fh %d received %d\n", 572 printk(KERN_ERR "Too big fh %d received %d\n",
209 i, fl->fh_array[i]->size); 573 i, fl->fh_array[i]->size);
210 filelayout_free_fh_array(fl); 574 goto out_err_free;
211 return -EIO;
212 } 575 }
576
577 p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
578 if (unlikely(!p))
579 goto out_err_free;
213 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size); 580 memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
214 p += XDR_QUADLEN(fl->fh_array[i]->size);
215 dprintk("DEBUG: %s: fh len %d\n", __func__, 581 dprintk("DEBUG: %s: fh len %d\n", __func__,
216 fl->fh_array[i]->size); 582 fl->fh_array[i]->size);
217 } 583 }
218 584
585 __free_page(scratch);
219 return 0; 586 return 0;
587
588out_err_free:
589 filelayout_free_fh_array(fl);
590out_err:
591 __free_page(scratch);
592 return -EIO;
593}
594
595static void
596filelayout_free_lseg(struct pnfs_layout_segment *lseg)
597{
598 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
599
600 dprintk("--> %s\n", __func__);
601 nfs4_fl_put_deviceid(fl->dsaddr);
602 kfree(fl->commit_buckets);
603 _filelayout_free_lseg(fl);
220} 604}
221 605
222static struct pnfs_layout_segment * 606static struct pnfs_layout_segment *
@@ -237,29 +621,252 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
237 _filelayout_free_lseg(fl); 621 _filelayout_free_lseg(fl);
238 return NULL; 622 return NULL;
239 } 623 }
624
625 /* This assumes there is only one IOMODE_RW lseg. What
626 * we really want to do is have a layout_hdr level
627 * dictionary of <multipath_list4, fh> keys, each
628 * associated with a struct list_head, populated by calls
629 * to filelayout_write_pagelist().
630 * */
631 if ((!fl->commit_through_mds) && (lgr->range.iomode == IOMODE_RW)) {
632 int i;
633 int size = (fl->stripe_type == STRIPE_SPARSE) ?
634 fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
635
636 fl->commit_buckets = kcalloc(size, sizeof(struct list_head), GFP_KERNEL);
637 if (!fl->commit_buckets) {
638 filelayout_free_lseg(&fl->generic_hdr);
639 return NULL;
640 }
641 fl->number_of_buckets = size;
642 for (i = 0; i < size; i++)
643 INIT_LIST_HEAD(&fl->commit_buckets[i]);
644 }
240 return &fl->generic_hdr; 645 return &fl->generic_hdr;
241} 646}
242 647
243static void 648/*
244filelayout_free_lseg(struct pnfs_layout_segment *lseg) 649 * filelayout_pg_test(). Called by nfs_can_coalesce_requests()
650 *
651 * return 1 : coalesce page
652 * return 0 : don't coalesce page
653 */
654int
655filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
656 struct nfs_page *req)
657{
658 u64 p_stripe, r_stripe;
659 u32 stripe_unit;
660
661 if (!pgio->pg_lseg)
662 return 1;
663 p_stripe = (u64)prev->wb_index << PAGE_CACHE_SHIFT;
664 r_stripe = (u64)req->wb_index << PAGE_CACHE_SHIFT;
665 stripe_unit = FILELAYOUT_LSEG(pgio->pg_lseg)->stripe_unit;
666
667 do_div(p_stripe, stripe_unit);
668 do_div(r_stripe, stripe_unit);
669
670 return (p_stripe == r_stripe);
671}
672
673static bool filelayout_mark_pnfs_commit(struct pnfs_layout_segment *lseg)
674{
675 return !FILELAYOUT_LSEG(lseg)->commit_through_mds;
676}
677
678static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
679{
680 if (fl->stripe_type == STRIPE_SPARSE)
681 return nfs4_fl_calc_ds_index(&fl->generic_hdr, j);
682 else
683 return j;
684}
685
686struct list_head *filelayout_choose_commit_list(struct nfs_page *req)
245{ 687{
246 struct nfs_server *nfss = NFS_SERVER(lseg->pls_layout->plh_inode); 688 struct pnfs_layout_segment *lseg = req->wb_commit_lseg;
247 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 689 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
690 u32 i, j;
691 struct list_head *list;
692
693 /* Note that we are calling nfs4_fl_calc_j_index on each page
694 * that ends up being committed to a data server. An attractive
695 * alternative is to add a field to nfs_write_data and nfs_page
696 * to store the value calculated in filelayout_write_pagelist
697 * and just use that here.
698 */
699 j = nfs4_fl_calc_j_index(lseg,
700 (loff_t)req->wb_index << PAGE_CACHE_SHIFT);
701 i = select_bucket_index(fl, j);
702 list = &fl->commit_buckets[i];
703 if (list_empty(list)) {
704 /* Non-empty buckets hold a reference on the lseg */
705 get_lseg(lseg);
706 }
707 return list;
708}
248 709
249 dprintk("--> %s\n", __func__); 710static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
250 pnfs_put_deviceid(nfss->nfs_client->cl_devid_cache, 711{
251 &fl->dsaddr->deviceid); 712 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
252 _filelayout_free_lseg(fl); 713
714 if (flseg->stripe_type == STRIPE_SPARSE)
715 return i;
716 else
717 return nfs4_fl_calc_ds_index(lseg, i);
718}
719
720static struct nfs_fh *
721select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
722{
723 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
724
725 if (flseg->stripe_type == STRIPE_SPARSE) {
726 if (flseg->num_fh == 1)
727 i = 0;
728 else if (flseg->num_fh == 0)
729 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
730 return NULL;
731 }
732 return flseg->fh_array[i];
733}
734
735static int filelayout_initiate_commit(struct nfs_write_data *data, int how)
736{
737 struct pnfs_layout_segment *lseg = data->lseg;
738 struct nfs4_pnfs_ds *ds;
739 u32 idx;
740 struct nfs_fh *fh;
741
742 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
743 ds = nfs4_fl_prepare_ds(lseg, idx);
744 if (!ds) {
745 printk(KERN_ERR "%s: prepare_ds failed, use MDS\n", __func__);
746 set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
747 set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
748 prepare_to_resend_writes(data);
749 data->mds_ops->rpc_release(data);
750 return -EAGAIN;
751 }
752 dprintk("%s ino %lu, how %d\n", __func__, data->inode->i_ino, how);
753 data->write_done_cb = filelayout_commit_done_cb;
754 data->ds_clp = ds->ds_clp;
755 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
756 if (fh)
757 data->args.fh = fh;
758 return nfs_initiate_commit(data, ds->ds_clp->cl_rpcclient,
759 &filelayout_commit_call_ops, how);
760}
761
762/*
763 * This is only useful while we are using whole file layouts.
764 */
765static struct pnfs_layout_segment *find_only_write_lseg(struct inode *inode)
766{
767 struct pnfs_layout_segment *lseg, *rv = NULL;
768
769 spin_lock(&inode->i_lock);
770 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
771 if (lseg->pls_range.iomode == IOMODE_RW)
772 rv = get_lseg(lseg);
773 spin_unlock(&inode->i_lock);
774 return rv;
775}
776
777static int alloc_ds_commits(struct inode *inode, struct list_head *list)
778{
779 struct pnfs_layout_segment *lseg;
780 struct nfs4_filelayout_segment *fl;
781 struct nfs_write_data *data;
782 int i, j;
783
784 /* Won't need this when non-whole file layout segments are supported
785 * instead we will use a pnfs_layout_hdr structure */
786 lseg = find_only_write_lseg(inode);
787 if (!lseg)
788 return 0;
789 fl = FILELAYOUT_LSEG(lseg);
790 for (i = 0; i < fl->number_of_buckets; i++) {
791 if (list_empty(&fl->commit_buckets[i]))
792 continue;
793 data = nfs_commitdata_alloc();
794 if (!data)
795 goto out_bad;
796 data->ds_commit_index = i;
797 data->lseg = lseg;
798 list_add(&data->pages, list);
799 }
800 put_lseg(lseg);
801 return 0;
802
803out_bad:
804 for (j = i; j < fl->number_of_buckets; j++) {
805 if (list_empty(&fl->commit_buckets[i]))
806 continue;
807 nfs_retry_commit(&fl->commit_buckets[i], lseg);
808 put_lseg(lseg); /* associated with emptying bucket */
809 }
810 put_lseg(lseg);
811 /* Caller will clean up entries put on list */
812 return -ENOMEM;
813}
814
815/* This follows nfs_commit_list pretty closely */
816static int
817filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
818 int how)
819{
820 struct nfs_write_data *data, *tmp;
821 LIST_HEAD(list);
822
823 if (!list_empty(mds_pages)) {
824 data = nfs_commitdata_alloc();
825 if (!data)
826 goto out_bad;
827 data->lseg = NULL;
828 list_add(&data->pages, &list);
829 }
830
831 if (alloc_ds_commits(inode, &list))
832 goto out_bad;
833
834 list_for_each_entry_safe(data, tmp, &list, pages) {
835 list_del_init(&data->pages);
836 atomic_inc(&NFS_I(inode)->commits_outstanding);
837 if (!data->lseg) {
838 nfs_init_commit(data, mds_pages, NULL);
839 nfs_initiate_commit(data, NFS_CLIENT(inode),
840 data->mds_ops, how);
841 } else {
842 nfs_init_commit(data, &FILELAYOUT_LSEG(data->lseg)->commit_buckets[data->ds_commit_index], data->lseg);
843 filelayout_initiate_commit(data, how);
844 }
845 }
846 return 0;
847 out_bad:
848 list_for_each_entry_safe(data, tmp, &list, pages) {
849 nfs_retry_commit(&data->pages, data->lseg);
850 list_del_init(&data->pages);
851 nfs_commit_free(data);
852 }
853 nfs_retry_commit(mds_pages, NULL);
854 nfs_commit_clear_lock(NFS_I(inode));
855 return -ENOMEM;
253} 856}
254 857
255static struct pnfs_layoutdriver_type filelayout_type = { 858static struct pnfs_layoutdriver_type filelayout_type = {
256 .id = LAYOUT_NFSV4_1_FILES, 859 .id = LAYOUT_NFSV4_1_FILES,
257 .name = "LAYOUT_NFSV4_1_FILES", 860 .name = "LAYOUT_NFSV4_1_FILES",
258 .owner = THIS_MODULE, 861 .owner = THIS_MODULE,
259 .set_layoutdriver = filelayout_set_layoutdriver, 862 .alloc_lseg = filelayout_alloc_lseg,
260 .clear_layoutdriver = filelayout_clear_layoutdriver, 863 .free_lseg = filelayout_free_lseg,
261 .alloc_lseg = filelayout_alloc_lseg, 864 .pg_test = filelayout_pg_test,
262 .free_lseg = filelayout_free_lseg, 865 .mark_pnfs_commit = filelayout_mark_pnfs_commit,
866 .choose_commit_list = filelayout_choose_commit_list,
867 .commit_pagelist = filelayout_commit_pagelist,
868 .read_pagelist = filelayout_read_pagelist,
869 .write_pagelist = filelayout_write_pagelist,
263}; 870};
264 871
265static int __init nfs4filelayout_init(void) 872static int __init nfs4filelayout_init(void)
diff --git a/fs/nfs/nfs4filelayout.h b/fs/nfs/nfs4filelayout.h
index bbf60dd2ab9d..7c44579f5832 100644
--- a/fs/nfs/nfs4filelayout.h
+++ b/fs/nfs/nfs4filelayout.h
@@ -33,7 +33,7 @@
33#include "pnfs.h" 33#include "pnfs.h"
34 34
35/* 35/*
36 * Field testing shows we need to support upto 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
38 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
39 * RFC 5661 multipath_list4 structures. 39 * RFC 5661 multipath_list4 structures.
@@ -55,8 +55,14 @@ struct nfs4_pnfs_ds {
55 atomic_t ds_count; 55 atomic_t ds_count;
56}; 56};
57 57
58/* nfs4_file_layout_dsaddr flags */
59#define NFS4_DEVICE_ID_NEG_ENTRY 0x00000001
60
58struct nfs4_file_layout_dsaddr { 61struct nfs4_file_layout_dsaddr {
59 struct pnfs_deviceid_node deviceid; 62 struct hlist_node node;
63 struct nfs4_deviceid deviceid;
64 atomic_t ref;
65 unsigned long flags;
60 u32 stripe_count; 66 u32 stripe_count;
61 u8 *stripe_indices; 67 u8 *stripe_indices;
62 u32 ds_num; 68 u32 ds_num;
@@ -73,6 +79,8 @@ struct nfs4_filelayout_segment {
73 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */ 79 struct nfs4_file_layout_dsaddr *dsaddr; /* Point to GETDEVINFO data */
74 unsigned int num_fh; 80 unsigned int num_fh;
75 struct nfs_fh **fh_array; 81 struct nfs_fh **fh_array;
82 struct list_head *commit_buckets; /* Sort commits to ds */
83 int number_of_buckets;
76}; 84};
77 85
78static inline struct nfs4_filelayout_segment * 86static inline struct nfs4_filelayout_segment *
@@ -83,11 +91,18 @@ FILELAYOUT_LSEG(struct pnfs_layout_segment *lseg)
83 generic_hdr); 91 generic_hdr);
84} 92}
85 93
86extern void nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *); 94extern struct nfs_fh *
95nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
96
87extern void print_ds(struct nfs4_pnfs_ds *ds); 97extern void print_ds(struct nfs4_pnfs_ds *ds);
88extern void print_deviceid(struct nfs4_deviceid *dev_id); 98extern void print_deviceid(struct nfs4_deviceid *dev_id);
99u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
100u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
101struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
102 u32 ds_idx);
89extern struct nfs4_file_layout_dsaddr * 103extern struct nfs4_file_layout_dsaddr *
90nfs4_fl_find_get_deviceid(struct nfs_client *, struct nfs4_deviceid *dev_id); 104nfs4_fl_find_get_deviceid(struct nfs4_deviceid *dev_id);
105extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr);
91struct nfs4_file_layout_dsaddr * 106struct nfs4_file_layout_dsaddr *
92get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id); 107get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id);
93 108
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b125e8cc..de5350f2b249 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -37,6 +37,30 @@
37#define NFSDBG_FACILITY NFSDBG_PNFS_LD 37#define NFSDBG_FACILITY NFSDBG_PNFS_LD
38 38
39/* 39/*
40 * Device ID RCU cache. A device ID is unique per client ID and layout type.
41 */
42#define NFS4_FL_DEVICE_ID_HASH_BITS 5
43#define NFS4_FL_DEVICE_ID_HASH_SIZE (1 << NFS4_FL_DEVICE_ID_HASH_BITS)
44#define NFS4_FL_DEVICE_ID_HASH_MASK (NFS4_FL_DEVICE_ID_HASH_SIZE - 1)
45
46static inline u32
47nfs4_fl_deviceid_hash(struct nfs4_deviceid *id)
48{
49 unsigned char *cptr = (unsigned char *)id->data;
50 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
51 u32 x = 0;
52
53 while (nbytes--) {
54 x *= 37;
55 x += *cptr++;
56 }
57 return x & NFS4_FL_DEVICE_ID_HASH_MASK;
58}
59
60static struct hlist_head filelayout_deviceid_cache[NFS4_FL_DEVICE_ID_HASH_SIZE];
61static DEFINE_SPINLOCK(filelayout_deviceid_lock);
62
63/*
40 * Data server cache 64 * Data server cache
41 * 65 *
42 * Data servers can be mapped to different device ids. 66 * Data servers can be mapped to different device ids.
@@ -104,6 +128,67 @@ _data_server_lookup_locked(u32 ip_addr, u32 port)
104 return NULL; 128 return NULL;
105} 129}
106 130
131/*
132 * Create an rpc connection to the nfs4_pnfs_ds data server
133 * Currently only support IPv4
134 */
135static int
136nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
137{
138 struct nfs_client *clp;
139 struct sockaddr_in sin;
140 int status = 0;
141
142 dprintk("--> %s ip:port %x:%hu au_flavor %d\n", __func__,
143 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port),
144 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
145
146 sin.sin_family = AF_INET;
147 sin.sin_addr.s_addr = ds->ds_ip_addr;
148 sin.sin_port = ds->ds_port;
149
150 clp = nfs4_set_ds_client(mds_srv->nfs_client, (struct sockaddr *)&sin,
151 sizeof(sin), IPPROTO_TCP);
152 if (IS_ERR(clp)) {
153 status = PTR_ERR(clp);
154 goto out;
155 }
156
157 if ((clp->cl_exchange_flags & EXCHGID4_FLAG_MASK_PNFS) != 0) {
158 if (!is_ds_client(clp)) {
159 status = -ENODEV;
160 goto out_put;
161 }
162 ds->ds_clp = clp;
163 dprintk("%s [existing] ip=%x, port=%hu\n", __func__,
164 ntohl(ds->ds_ip_addr), ntohs(ds->ds_port));
165 goto out;
166 }
167
168 /*
169 * Do not set NFS_CS_CHECK_LEASE_TIME instead set the DS lease to
170 * be equal to the MDS lease. Renewal is scheduled in create_session.
171 */
172 spin_lock(&mds_srv->nfs_client->cl_lock);
173 clp->cl_lease_time = mds_srv->nfs_client->cl_lease_time;
174 spin_unlock(&mds_srv->nfs_client->cl_lock);
175 clp->cl_last_renewal = jiffies;
176
177 /* New nfs_client */
178 status = nfs4_init_ds_session(clp);
179 if (status)
180 goto out_put;
181
182 ds->ds_clp = clp;
183 dprintk("%s [new] ip=%x, port=%hu\n", __func__, ntohl(ds->ds_ip_addr),
184 ntohs(ds->ds_port));
185out:
186 return status;
187out_put:
188 nfs_put_client(clp);
189 goto out;
190}
191
107static void 192static void
108destroy_ds(struct nfs4_pnfs_ds *ds) 193destroy_ds(struct nfs4_pnfs_ds *ds)
109{ 194{
@@ -122,7 +207,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
122 struct nfs4_pnfs_ds *ds; 207 struct nfs4_pnfs_ds *ds;
123 int i; 208 int i;
124 209
125 print_deviceid(&dsaddr->deviceid.de_id); 210 print_deviceid(&dsaddr->deviceid);
126 211
127 for (i = 0; i < dsaddr->ds_num; i++) { 212 for (i = 0; i < dsaddr->ds_num; i++) {
128 ds = dsaddr->ds_list[i]; 213 ds = dsaddr->ds_list[i];
@@ -139,15 +224,6 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
139 kfree(dsaddr); 224 kfree(dsaddr);
140} 225}
141 226
142void
143nfs4_fl_free_deviceid_callback(struct pnfs_deviceid_node *device)
144{
145 struct nfs4_file_layout_dsaddr *dsaddr =
146 container_of(device, struct nfs4_file_layout_dsaddr, deviceid);
147
148 nfs4_fl_free_deviceid(dsaddr);
149}
150
151static struct nfs4_pnfs_ds * 227static struct nfs4_pnfs_ds *
152nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port) 228nfs4_pnfs_ds_add(struct inode *inode, u32 ip_addr, u32 port)
153{ 229{
@@ -185,7 +261,7 @@ out:
185 * Currently only support ipv4, and one multi-path address. 261 * Currently only support ipv4, and one multi-path address.
186 */ 262 */
187static struct nfs4_pnfs_ds * 263static struct nfs4_pnfs_ds *
188decode_and_add_ds(__be32 **pp, struct inode *inode) 264decode_and_add_ds(struct xdr_stream *streamp, struct inode *inode)
189{ 265{
190 struct nfs4_pnfs_ds *ds = NULL; 266 struct nfs4_pnfs_ds *ds = NULL;
191 char *buf; 267 char *buf;
@@ -193,25 +269,34 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
193 u32 ip_addr, port; 269 u32 ip_addr, port;
194 int nlen, rlen, i; 270 int nlen, rlen, i;
195 int tmp[2]; 271 int tmp[2];
196 __be32 *r_netid, *r_addr, *p = *pp; 272 __be32 *p;
197 273
198 /* r_netid */ 274 /* r_netid */
275 p = xdr_inline_decode(streamp, 4);
276 if (unlikely(!p))
277 goto out_err;
199 nlen = be32_to_cpup(p++); 278 nlen = be32_to_cpup(p++);
200 r_netid = p;
201 p += XDR_QUADLEN(nlen);
202 279
203 /* r_addr */ 280 p = xdr_inline_decode(streamp, nlen);
204 rlen = be32_to_cpup(p++); 281 if (unlikely(!p))
205 r_addr = p; 282 goto out_err;
206 p += XDR_QUADLEN(rlen);
207 *pp = p;
208 283
209 /* Check that netid is "tcp" */ 284 /* Check that netid is "tcp" */
210 if (nlen != 3 || memcmp((char *)r_netid, "tcp", 3)) { 285 if (nlen != 3 || memcmp((char *)p, "tcp", 3)) {
211 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__); 286 dprintk("%s: ERROR: non ipv4 TCP r_netid\n", __func__);
212 goto out_err; 287 goto out_err;
213 } 288 }
214 289
290 /* r_addr */
291 p = xdr_inline_decode(streamp, 4);
292 if (unlikely(!p))
293 goto out_err;
294 rlen = be32_to_cpup(p);
295
296 p = xdr_inline_decode(streamp, rlen);
297 if (unlikely(!p))
298 goto out_err;
299
215 /* ipv6 length plus port is legal */ 300 /* ipv6 length plus port is legal */
216 if (rlen > INET6_ADDRSTRLEN + 8) { 301 if (rlen > INET6_ADDRSTRLEN + 8) {
217 dprintk("%s: Invalid address, length %d\n", __func__, 302 dprintk("%s: Invalid address, length %d\n", __func__,
@@ -219,8 +304,12 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
219 goto out_err; 304 goto out_err;
220 } 305 }
221 buf = kmalloc(rlen + 1, GFP_KERNEL); 306 buf = kmalloc(rlen + 1, GFP_KERNEL);
307 if (!buf) {
308 dprintk("%s: Not enough memory\n", __func__);
309 goto out_err;
310 }
222 buf[rlen] = '\0'; 311 buf[rlen] = '\0';
223 memcpy(buf, r_addr, rlen); 312 memcpy(buf, p, rlen);
224 313
225 /* replace the port dots with dashes for the in4_pton() delimiter*/ 314 /* replace the port dots with dashes for the in4_pton() delimiter*/
226 for (i = 0; i < 2; i++) { 315 for (i = 0; i < 2; i++) {
@@ -256,118 +345,191 @@ out_err:
256static struct nfs4_file_layout_dsaddr* 345static struct nfs4_file_layout_dsaddr*
257decode_device(struct inode *ino, struct pnfs_device *pdev) 346decode_device(struct inode *ino, struct pnfs_device *pdev)
258{ 347{
259 int i, dummy; 348 int i;
260 u32 cnt, num; 349 u32 cnt, num;
261 u8 *indexp; 350 u8 *indexp;
262 __be32 *p = (__be32 *)pdev->area, *indicesp; 351 __be32 *p;
263 struct nfs4_file_layout_dsaddr *dsaddr; 352 u8 *stripe_indices;
353 u8 max_stripe_index;
354 struct nfs4_file_layout_dsaddr *dsaddr = NULL;
355 struct xdr_stream stream;
356 struct xdr_buf buf = {
357 .pages = pdev->pages,
358 .page_len = pdev->pglen,
359 .buflen = pdev->pglen,
360 .len = pdev->pglen,
361 };
362 struct page *scratch;
363
364 /* set up xdr stream */
365 scratch = alloc_page(GFP_KERNEL);
366 if (!scratch)
367 goto out_err;
368
369 xdr_init_decode(&stream, &buf, NULL);
370 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
264 371
265 /* Get the stripe count (number of stripe index) */ 372 /* Get the stripe count (number of stripe index) */
266 cnt = be32_to_cpup(p++); 373 p = xdr_inline_decode(&stream, 4);
374 if (unlikely(!p))
375 goto out_err_free_scratch;
376
377 cnt = be32_to_cpup(p);
267 dprintk("%s stripe count %d\n", __func__, cnt); 378 dprintk("%s stripe count %d\n", __func__, cnt);
268 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) { 379 if (cnt > NFS4_PNFS_MAX_STRIPE_CNT) {
269 printk(KERN_WARNING "%s: stripe count %d greater than " 380 printk(KERN_WARNING "%s: stripe count %d greater than "
270 "supported maximum %d\n", __func__, 381 "supported maximum %d\n", __func__,
271 cnt, NFS4_PNFS_MAX_STRIPE_CNT); 382 cnt, NFS4_PNFS_MAX_STRIPE_CNT);
272 goto out_err; 383 goto out_err_free_scratch;
384 }
385
386 /* read stripe indices */
387 stripe_indices = kcalloc(cnt, sizeof(u8), GFP_KERNEL);
388 if (!stripe_indices)
389 goto out_err_free_scratch;
390
391 p = xdr_inline_decode(&stream, cnt << 2);
392 if (unlikely(!p))
393 goto out_err_free_stripe_indices;
394
395 indexp = &stripe_indices[0];
396 max_stripe_index = 0;
397 for (i = 0; i < cnt; i++) {
398 *indexp = be32_to_cpup(p++);
399 max_stripe_index = max(max_stripe_index, *indexp);
400 indexp++;
273 } 401 }
274 402
275 /* Check the multipath list count */ 403 /* Check the multipath list count */
276 indicesp = p; 404 p = xdr_inline_decode(&stream, 4);
277 p += XDR_QUADLEN(cnt << 2); 405 if (unlikely(!p))
278 num = be32_to_cpup(p++); 406 goto out_err_free_stripe_indices;
407
408 num = be32_to_cpup(p);
279 dprintk("%s ds_num %u\n", __func__, num); 409 dprintk("%s ds_num %u\n", __func__, num);
280 if (num > NFS4_PNFS_MAX_MULTI_CNT) { 410 if (num > NFS4_PNFS_MAX_MULTI_CNT) {
281 printk(KERN_WARNING "%s: multipath count %d greater than " 411 printk(KERN_WARNING "%s: multipath count %d greater than "
282 "supported maximum %d\n", __func__, 412 "supported maximum %d\n", __func__,
283 num, NFS4_PNFS_MAX_MULTI_CNT); 413 num, NFS4_PNFS_MAX_MULTI_CNT);
284 goto out_err; 414 goto out_err_free_stripe_indices;
415 }
416
417 /* validate stripe indices are all < num */
418 if (max_stripe_index >= num) {
419 printk(KERN_WARNING "%s: stripe index %u >= num ds %u\n",
420 __func__, max_stripe_index, num);
421 goto out_err_free_stripe_indices;
285 } 422 }
423
286 dsaddr = kzalloc(sizeof(*dsaddr) + 424 dsaddr = kzalloc(sizeof(*dsaddr) +
287 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)), 425 (sizeof(struct nfs4_pnfs_ds *) * (num - 1)),
288 GFP_KERNEL); 426 GFP_KERNEL);
289 if (!dsaddr) 427 if (!dsaddr)
290 goto out_err; 428 goto out_err_free_stripe_indices;
291
292 dsaddr->stripe_indices = kzalloc(sizeof(u8) * cnt, GFP_KERNEL);
293 if (!dsaddr->stripe_indices)
294 goto out_err_free;
295 429
296 dsaddr->stripe_count = cnt; 430 dsaddr->stripe_count = cnt;
431 dsaddr->stripe_indices = stripe_indices;
432 stripe_indices = NULL;
297 dsaddr->ds_num = num; 433 dsaddr->ds_num = num;
298 434
299 memcpy(&dsaddr->deviceid.de_id, &pdev->dev_id, sizeof(pdev->dev_id)); 435 memcpy(&dsaddr->deviceid, &pdev->dev_id, sizeof(pdev->dev_id));
300
301 /* Go back an read stripe indices */
302 p = indicesp;
303 indexp = &dsaddr->stripe_indices[0];
304 for (i = 0; i < dsaddr->stripe_count; i++) {
305 *indexp = be32_to_cpup(p++);
306 if (*indexp >= num)
307 goto out_err_free;
308 indexp++;
309 }
310 /* Skip already read multipath list count */
311 p++;
312 436
313 for (i = 0; i < dsaddr->ds_num; i++) { 437 for (i = 0; i < dsaddr->ds_num; i++) {
314 int j; 438 int j;
439 u32 mp_count;
315 440
316 dummy = be32_to_cpup(p++); /* multipath count */ 441 p = xdr_inline_decode(&stream, 4);
317 if (dummy > 1) { 442 if (unlikely(!p))
443 goto out_err_free_deviceid;
444
445 mp_count = be32_to_cpup(p); /* multipath count */
446 if (mp_count > 1) {
318 printk(KERN_WARNING 447 printk(KERN_WARNING
319 "%s: Multipath count %d not supported, " 448 "%s: Multipath count %d not supported, "
320 "skipping all greater than 1\n", __func__, 449 "skipping all greater than 1\n", __func__,
321 dummy); 450 mp_count);
322 } 451 }
323 for (j = 0; j < dummy; j++) { 452 for (j = 0; j < mp_count; j++) {
324 if (j == 0) { 453 if (j == 0) {
325 dsaddr->ds_list[i] = decode_and_add_ds(&p, ino); 454 dsaddr->ds_list[i] = decode_and_add_ds(&stream,
455 ino);
326 if (dsaddr->ds_list[i] == NULL) 456 if (dsaddr->ds_list[i] == NULL)
327 goto out_err_free; 457 goto out_err_free_deviceid;
328 } else { 458 } else {
329 u32 len; 459 u32 len;
330 /* skip extra multipath */ 460 /* skip extra multipath */
331 len = be32_to_cpup(p++); 461
332 p += XDR_QUADLEN(len); 462 /* read len, skip */
333 len = be32_to_cpup(p++); 463 p = xdr_inline_decode(&stream, 4);
334 p += XDR_QUADLEN(len); 464 if (unlikely(!p))
335 continue; 465 goto out_err_free_deviceid;
466 len = be32_to_cpup(p);
467
468 p = xdr_inline_decode(&stream, len);
469 if (unlikely(!p))
470 goto out_err_free_deviceid;
471
472 /* read len, skip */
473 p = xdr_inline_decode(&stream, 4);
474 if (unlikely(!p))
475 goto out_err_free_deviceid;
476 len = be32_to_cpup(p);
477
478 p = xdr_inline_decode(&stream, len);
479 if (unlikely(!p))
480 goto out_err_free_deviceid;
336 } 481 }
337 } 482 }
338 } 483 }
484
485 __free_page(scratch);
339 return dsaddr; 486 return dsaddr;
340 487
341out_err_free: 488out_err_free_deviceid:
342 nfs4_fl_free_deviceid(dsaddr); 489 nfs4_fl_free_deviceid(dsaddr);
490 /* stripe_indicies was part of dsaddr */
491 goto out_err_free_scratch;
492out_err_free_stripe_indices:
493 kfree(stripe_indices);
494out_err_free_scratch:
495 __free_page(scratch);
343out_err: 496out_err:
344 dprintk("%s ERROR: returning NULL\n", __func__); 497 dprintk("%s ERROR: returning NULL\n", __func__);
345 return NULL; 498 return NULL;
346} 499}
347 500
348/* 501/*
349 * Decode the opaque device specified in 'dev' 502 * Decode the opaque device specified in 'dev' and add it to the cache of
350 * and add it to the list of available devices. 503 * available devices.
351 * If the deviceid is already cached, nfs4_add_deviceid will return
352 * a pointer to the cached struct and throw away the new.
353 */ 504 */
354static struct nfs4_file_layout_dsaddr* 505static struct nfs4_file_layout_dsaddr *
355decode_and_add_device(struct inode *inode, struct pnfs_device *dev) 506decode_and_add_device(struct inode *inode, struct pnfs_device *dev)
356{ 507{
357 struct nfs4_file_layout_dsaddr *dsaddr; 508 struct nfs4_file_layout_dsaddr *d, *new;
358 struct pnfs_deviceid_node *d; 509 long hash;
359 510
360 dsaddr = decode_device(inode, dev); 511 new = decode_device(inode, dev);
361 if (!dsaddr) { 512 if (!new) {
362 printk(KERN_WARNING "%s: Could not decode or add device\n", 513 printk(KERN_WARNING "%s: Could not decode or add device\n",
363 __func__); 514 __func__);
364 return NULL; 515 return NULL;
365 } 516 }
366 517
367 d = pnfs_add_deviceid(NFS_SERVER(inode)->nfs_client->cl_devid_cache, 518 spin_lock(&filelayout_deviceid_lock);
368 &dsaddr->deviceid); 519 d = nfs4_fl_find_get_deviceid(&new->deviceid);
520 if (d) {
521 spin_unlock(&filelayout_deviceid_lock);
522 nfs4_fl_free_deviceid(new);
523 return d;
524 }
369 525
370 return container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 526 INIT_HLIST_NODE(&new->node);
527 atomic_set(&new->ref, 1);
528 hash = nfs4_fl_deviceid_hash(&new->deviceid);
529 hlist_add_head_rcu(&new->node, &filelayout_deviceid_cache[hash]);
530 spin_unlock(&filelayout_deviceid_lock);
531
532 return new;
371} 533}
372 534
373/* 535/*
@@ -409,11 +571,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
409 goto out_free; 571 goto out_free;
410 } 572 }
411 573
412 /* set pdev->area */
413 pdev->area = vmap(pages, max_pages, VM_MAP, PAGE_KERNEL);
414 if (!pdev->area)
415 goto out_free;
416
417 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); 574 memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id));
418 pdev->layout_type = LAYOUT_NFSV4_1_FILES; 575 pdev->layout_type = LAYOUT_NFSV4_1_FILES;
419 pdev->pages = pages; 576 pdev->pages = pages;
@@ -432,8 +589,6 @@ get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id)
432 */ 589 */
433 dsaddr = decode_and_add_device(inode, pdev); 590 dsaddr = decode_and_add_device(inode, pdev);
434out_free: 591out_free:
435 if (pdev->area != NULL)
436 vunmap(pdev->area);
437 for (i = 0; i < max_pages; i++) 592 for (i = 0; i < max_pages; i++)
438 __free_page(pages[i]); 593 __free_page(pages[i]);
439 kfree(pages); 594 kfree(pages);
@@ -442,12 +597,123 @@ out_free:
442 return dsaddr; 597 return dsaddr;
443} 598}
444 599
600void
601nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
602{
603 if (atomic_dec_and_lock(&dsaddr->ref, &filelayout_deviceid_lock)) {
604 hlist_del_rcu(&dsaddr->node);
605 spin_unlock(&filelayout_deviceid_lock);
606
607 synchronize_rcu();
608 nfs4_fl_free_deviceid(dsaddr);
609 }
610}
611
445struct nfs4_file_layout_dsaddr * 612struct nfs4_file_layout_dsaddr *
446nfs4_fl_find_get_deviceid(struct nfs_client *clp, struct nfs4_deviceid *id) 613nfs4_fl_find_get_deviceid(struct nfs4_deviceid *id)
447{ 614{
448 struct pnfs_deviceid_node *d; 615 struct nfs4_file_layout_dsaddr *d;
616 struct hlist_node *n;
617 long hash = nfs4_fl_deviceid_hash(id);
618
619
620 rcu_read_lock();
621 hlist_for_each_entry_rcu(d, n, &filelayout_deviceid_cache[hash], node) {
622 if (!memcmp(&d->deviceid, id, sizeof(*id))) {
623 if (!atomic_inc_not_zero(&d->ref))
624 goto fail;
625 rcu_read_unlock();
626 return d;
627 }
628 }
629fail:
630 rcu_read_unlock();
631 return NULL;
632}
449 633
450 d = pnfs_find_get_deviceid(clp->cl_devid_cache, id); 634/*
451 return (d == NULL) ? NULL : 635 * Want res = (offset - layout->pattern_offset)/ layout->stripe_unit
452 container_of(d, struct nfs4_file_layout_dsaddr, deviceid); 636 * Then: ((res + fsi) % dsaddr->stripe_count)
637 */
638u32
639nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset)
640{
641 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
642 u64 tmp;
643
644 tmp = offset - flseg->pattern_offset;
645 do_div(tmp, flseg->stripe_unit);
646 tmp += flseg->first_stripe_index;
647 return do_div(tmp, flseg->dsaddr->stripe_count);
648}
649
650u32
651nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j)
652{
653 return FILELAYOUT_LSEG(lseg)->dsaddr->stripe_indices[j];
654}
655
656struct nfs_fh *
657nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
658{
659 struct nfs4_filelayout_segment *flseg = FILELAYOUT_LSEG(lseg);
660 u32 i;
661
662 if (flseg->stripe_type == STRIPE_SPARSE) {
663 if (flseg->num_fh == 1)
664 i = 0;
665 else if (flseg->num_fh == 0)
666 /* Use the MDS OPEN fh set in nfs_read_rpcsetup */
667 return NULL;
668 else
669 i = nfs4_fl_calc_ds_index(lseg, j);
670 } else
671 i = j;
672 return flseg->fh_array[i];
673}
674
675static void
676filelayout_mark_devid_negative(struct nfs4_file_layout_dsaddr *dsaddr,
677 int err, u32 ds_addr)
678{
679 u32 *p = (u32 *)&dsaddr->deviceid;
680
681 printk(KERN_ERR "NFS: data server %x connection error %d."
682 " Deviceid [%x%x%x%x] marked out of use.\n",
683 ds_addr, err, p[0], p[1], p[2], p[3]);
684
685 spin_lock(&filelayout_deviceid_lock);
686 dsaddr->flags |= NFS4_DEVICE_ID_NEG_ENTRY;
687 spin_unlock(&filelayout_deviceid_lock);
688}
689
690struct nfs4_pnfs_ds *
691nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
692{
693 struct nfs4_file_layout_dsaddr *dsaddr = FILELAYOUT_LSEG(lseg)->dsaddr;
694 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
695
696 if (ds == NULL) {
697 printk(KERN_ERR "%s: No data server for offset index %d\n",
698 __func__, ds_idx);
699 return NULL;
700 }
701
702 if (!ds->ds_clp) {
703 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
704 int err;
705
706 if (dsaddr->flags & NFS4_DEVICE_ID_NEG_ENTRY) {
707 /* Already tried to connect, don't try again */
708 dprintk("%s Deviceid marked out of use\n", __func__);
709 return NULL;
710 }
711 err = nfs4_ds_connect(s, ds);
712 if (err) {
713 filelayout_mark_devid_negative(dsaddr, err,
714 ntohl(ds->ds_ip_addr));
715 return NULL;
716 }
717 }
718 return ds;
453} 719}
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 3c2a1724fbd2..bb80c49b6533 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -54,33 +54,29 @@ Elong:
54/* 54/*
55 * Determine the mount path as a string 55 * Determine the mount path as a string
56 */ 56 */
57static char *nfs4_path(const struct vfsmount *mnt_parent, 57static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
58 const struct dentry *dentry,
59 char *buffer, ssize_t buflen)
60{ 58{
61 const char *srvpath; 59 char *limit;
62 60 char *path = nfs_path(&limit, dentry, buffer, buflen);
63 srvpath = strchr(mnt_parent->mnt_devname, ':'); 61 if (!IS_ERR(path)) {
64 if (srvpath) 62 char *colon = strchr(path, ':');
65 srvpath++; 63 if (colon && colon < limit)
66 else 64 path = colon + 1;
67 srvpath = mnt_parent->mnt_devname; 65 }
68 66 return path;
69 return nfs_path(srvpath, mnt_parent->mnt_root, dentry, buffer, buflen);
70} 67}
71 68
72/* 69/*
73 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we 70 * Check that fs_locations::fs_root [RFC3530 6.3] is a prefix for what we
74 * believe to be the server path to this dentry 71 * believe to be the server path to this dentry
75 */ 72 */
76static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, 73static int nfs4_validate_fspath(struct dentry *dentry,
77 const struct dentry *dentry,
78 const struct nfs4_fs_locations *locations, 74 const struct nfs4_fs_locations *locations,
79 char *page, char *page2) 75 char *page, char *page2)
80{ 76{
81 const char *path, *fs_path; 77 const char *path, *fs_path;
82 78
83 path = nfs4_path(mnt_parent, dentry, page, PAGE_SIZE); 79 path = nfs4_path(dentry, page, PAGE_SIZE);
84 if (IS_ERR(path)) 80 if (IS_ERR(path))
85 return PTR_ERR(path); 81 return PTR_ERR(path);
86 82
@@ -165,20 +161,18 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata,
165 161
166/** 162/**
167 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error 163 * nfs_follow_referral - set up mountpoint when hitting a referral on moved error
168 * @mnt_parent - mountpoint of parent directory
169 * @dentry - parent directory 164 * @dentry - parent directory
170 * @locations - array of NFSv4 server location information 165 * @locations - array of NFSv4 server location information
171 * 166 *
172 */ 167 */
173static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, 168static struct vfsmount *nfs_follow_referral(struct dentry *dentry,
174 const struct dentry *dentry,
175 const struct nfs4_fs_locations *locations) 169 const struct nfs4_fs_locations *locations)
176{ 170{
177 struct vfsmount *mnt = ERR_PTR(-ENOENT); 171 struct vfsmount *mnt = ERR_PTR(-ENOENT);
178 struct nfs_clone_mount mountdata = { 172 struct nfs_clone_mount mountdata = {
179 .sb = mnt_parent->mnt_sb, 173 .sb = dentry->d_sb,
180 .dentry = dentry, 174 .dentry = dentry,
181 .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, 175 .authflavor = NFS_SB(dentry->d_sb)->client->cl_auth->au_flavor,
182 }; 176 };
183 char *page = NULL, *page2 = NULL; 177 char *page = NULL, *page2 = NULL;
184 int loc, error; 178 int loc, error;
@@ -198,7 +192,7 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent,
198 goto out; 192 goto out;
199 193
200 /* Ensure fs path is a prefix of current dentry path */ 194 /* Ensure fs path is a prefix of current dentry path */
201 error = nfs4_validate_fspath(mnt_parent, dentry, locations, page, page2); 195 error = nfs4_validate_fspath(dentry, locations, page, page2);
202 if (error < 0) { 196 if (error < 0) {
203 mnt = ERR_PTR(error); 197 mnt = ERR_PTR(error);
204 goto out; 198 goto out;
@@ -225,11 +219,10 @@ out:
225 219
226/* 220/*
227 * nfs_do_refmount - handle crossing a referral on server 221 * nfs_do_refmount - handle crossing a referral on server
228 * @mnt_parent - mountpoint of referral
229 * @dentry - dentry of referral 222 * @dentry - dentry of referral
230 * 223 *
231 */ 224 */
232struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentry *dentry) 225struct vfsmount *nfs_do_refmount(struct dentry *dentry)
233{ 226{
234 struct vfsmount *mnt = ERR_PTR(-ENOMEM); 227 struct vfsmount *mnt = ERR_PTR(-ENOMEM);
235 struct dentry *parent; 228 struct dentry *parent;
@@ -262,7 +255,7 @@ struct vfsmount *nfs_do_refmount(const struct vfsmount *mnt_parent, struct dentr
262 fs_locations->fs_path.ncomponents <= 0) 255 fs_locations->fs_path.ncomponents <= 0)
263 goto out_free; 256 goto out_free;
264 257
265 mnt = nfs_follow_referral(mnt_parent, dentry, fs_locations); 258 mnt = nfs_follow_referral(dentry, fs_locations);
266out_free: 259out_free:
267 __free_page(page); 260 __free_page(page);
268 kfree(fs_locations); 261 kfree(fs_locations);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40ab..9bf41eab3e46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -41,6 +41,7 @@
41#include <linux/string.h> 41#include <linux/string.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/sunrpc/clnt.h> 43#include <linux/sunrpc/clnt.h>
44#include <linux/sunrpc/gss_api.h>
44#include <linux/nfs.h> 45#include <linux/nfs.h>
45#include <linux/nfs4.h> 46#include <linux/nfs4.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
@@ -71,7 +72,9 @@ static int _nfs4_proc_open(struct nfs4_opendata *data);
71static int _nfs4_recover_proc_open(struct nfs4_opendata *data); 72static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
72static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); 73static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
73static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); 74static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
74static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 75static int _nfs4_proc_lookup(struct rpc_clnt *client, struct inode *dir,
76 const struct qstr *name, struct nfs_fh *fhandle,
77 struct nfs_fattr *fattr);
75static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr); 78static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
76static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, 79static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
77 struct nfs_fattr *fattr, struct iattr *sattr, 80 struct nfs_fattr *fattr, struct iattr *sattr,
@@ -85,6 +88,11 @@ static int nfs4_map_errors(int err)
85 switch (err) { 88 switch (err) {
86 case -NFS4ERR_RESOURCE: 89 case -NFS4ERR_RESOURCE:
87 return -EREMOTEIO; 90 return -EREMOTEIO;
91 case -NFS4ERR_WRONGSEC:
92 return -EPERM;
93 case -NFS4ERR_BADOWNER:
94 case -NFS4ERR_BADNAME:
95 return -EINVAL;
88 default: 96 default:
89 dprintk("%s could not handle NFSv4 error %d\n", 97 dprintk("%s could not handle NFSv4 error %d\n",
90 __func__, -err); 98 __func__, -err);
@@ -241,7 +249,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
241/* This is the error handling routine for processes that are allowed 249/* This is the error handling routine for processes that are allowed
242 * to sleep. 250 * to sleep.
243 */ 251 */
244static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception) 252static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
245{ 253{
246 struct nfs_client *clp = server->nfs_client; 254 struct nfs_client *clp = server->nfs_client;
247 struct nfs4_state *state = exception->state; 255 struct nfs4_state *state = exception->state;
@@ -256,12 +264,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
256 case -NFS4ERR_OPENMODE: 264 case -NFS4ERR_OPENMODE:
257 if (state == NULL) 265 if (state == NULL)
258 break; 266 break;
259 nfs4_state_mark_reclaim_nograce(clp, state); 267 nfs4_schedule_stateid_recovery(server, state);
260 goto do_state_recovery; 268 goto wait_on_recovery;
261 case -NFS4ERR_STALE_STATEID: 269 case -NFS4ERR_STALE_STATEID:
262 case -NFS4ERR_STALE_CLIENTID: 270 case -NFS4ERR_STALE_CLIENTID:
263 case -NFS4ERR_EXPIRED: 271 case -NFS4ERR_EXPIRED:
264 goto do_state_recovery; 272 nfs4_schedule_lease_recovery(clp);
273 goto wait_on_recovery;
265#if defined(CONFIG_NFS_V4_1) 274#if defined(CONFIG_NFS_V4_1)
266 case -NFS4ERR_BADSESSION: 275 case -NFS4ERR_BADSESSION:
267 case -NFS4ERR_BADSLOT: 276 case -NFS4ERR_BADSLOT:
@@ -272,7 +281,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
272 case -NFS4ERR_SEQ_MISORDERED: 281 case -NFS4ERR_SEQ_MISORDERED:
273 dprintk("%s ERROR: %d Reset session\n", __func__, 282 dprintk("%s ERROR: %d Reset session\n", __func__,
274 errorcode); 283 errorcode);
275 nfs4_schedule_state_recovery(clp); 284 nfs4_schedule_session_recovery(clp->cl_session);
276 exception->retry = 1; 285 exception->retry = 1;
277 break; 286 break;
278#endif /* defined(CONFIG_NFS_V4_1) */ 287#endif /* defined(CONFIG_NFS_V4_1) */
@@ -292,11 +301,23 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
292 break; 301 break;
293 case -NFS4ERR_OLD_STATEID: 302 case -NFS4ERR_OLD_STATEID:
294 exception->retry = 1; 303 exception->retry = 1;
304 break;
305 case -NFS4ERR_BADOWNER:
306 /* The following works around a Linux server bug! */
307 case -NFS4ERR_BADNAME:
308 if (server->caps & NFS_CAP_UIDGID_NOMAP) {
309 server->caps &= ~NFS_CAP_UIDGID_NOMAP;
310 exception->retry = 1;
311 printk(KERN_WARNING "NFS: v4 server %s "
312 "does not accept raw "
313 "uid/gids. "
314 "Reenabling the idmapper.\n",
315 server->nfs_client->cl_hostname);
316 }
295 } 317 }
296 /* We failed to handle the error */ 318 /* We failed to handle the error */
297 return nfs4_map_errors(ret); 319 return nfs4_map_errors(ret);
298do_state_recovery: 320wait_on_recovery:
299 nfs4_schedule_state_recovery(clp);
300 ret = nfs4_wait_clnt_recover(clp); 321 ret = nfs4_wait_clnt_recover(clp);
301 if (ret == 0) 322 if (ret == 0)
302 exception->retry = 1; 323 exception->retry = 1;
@@ -435,8 +456,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
435 clp = res->sr_session->clp; 456 clp = res->sr_session->clp;
436 do_renew_lease(clp, timestamp); 457 do_renew_lease(clp, timestamp);
437 /* Check sequence flags */ 458 /* Check sequence flags */
438 if (atomic_read(&clp->cl_count) > 1) 459 if (res->sr_status_flags != 0)
439 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); 460 nfs4_schedule_lease_recovery(clp);
440 break; 461 break;
441 case -NFS4ERR_DELAY: 462 case -NFS4ERR_DELAY:
442 /* The server detected a resend of the RPC call and 463 /* The server detected a resend of the RPC call and
@@ -505,7 +526,7 @@ out:
505 return ret_id; 526 return ret_id;
506} 527}
507 528
508static int nfs41_setup_sequence(struct nfs4_session *session, 529int nfs41_setup_sequence(struct nfs4_session *session,
509 struct nfs4_sequence_args *args, 530 struct nfs4_sequence_args *args,
510 struct nfs4_sequence_res *res, 531 struct nfs4_sequence_res *res,
511 int cache_reply, 532 int cache_reply,
@@ -571,6 +592,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session,
571 res->sr_status = 1; 592 res->sr_status = 1;
572 return 0; 593 return 0;
573} 594}
595EXPORT_SYMBOL_GPL(nfs41_setup_sequence);
574 596
575int nfs4_setup_sequence(const struct nfs_server *server, 597int nfs4_setup_sequence(const struct nfs_server *server,
576 struct nfs4_sequence_args *args, 598 struct nfs4_sequence_args *args,
@@ -640,7 +662,8 @@ struct rpc_call_ops nfs41_call_priv_sync_ops = {
640 .rpc_call_done = nfs41_call_sync_done, 662 .rpc_call_done = nfs41_call_sync_done,
641}; 663};
642 664
643static int nfs4_call_sync_sequence(struct nfs_server *server, 665static int nfs4_call_sync_sequence(struct rpc_clnt *clnt,
666 struct nfs_server *server,
644 struct rpc_message *msg, 667 struct rpc_message *msg,
645 struct nfs4_sequence_args *args, 668 struct nfs4_sequence_args *args,
646 struct nfs4_sequence_res *res, 669 struct nfs4_sequence_res *res,
@@ -656,7 +679,7 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
656 .cache_reply = cache_reply, 679 .cache_reply = cache_reply,
657 }; 680 };
658 struct rpc_task_setup task_setup = { 681 struct rpc_task_setup task_setup = {
659 .rpc_client = server->client, 682 .rpc_client = clnt,
660 .rpc_message = msg, 683 .rpc_message = msg,
661 .callback_ops = &nfs41_call_sync_ops, 684 .callback_ops = &nfs41_call_sync_ops,
662 .callback_data = &data 685 .callback_data = &data
@@ -675,13 +698,14 @@ static int nfs4_call_sync_sequence(struct nfs_server *server,
675 return ret; 698 return ret;
676} 699}
677 700
678int _nfs4_call_sync_session(struct nfs_server *server, 701int _nfs4_call_sync_session(struct rpc_clnt *clnt,
702 struct nfs_server *server,
679 struct rpc_message *msg, 703 struct rpc_message *msg,
680 struct nfs4_sequence_args *args, 704 struct nfs4_sequence_args *args,
681 struct nfs4_sequence_res *res, 705 struct nfs4_sequence_res *res,
682 int cache_reply) 706 int cache_reply)
683{ 707{
684 return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); 708 return nfs4_call_sync_sequence(clnt, server, msg, args, res, cache_reply, 0);
685} 709}
686 710
687#else 711#else
@@ -692,19 +716,28 @@ static int nfs4_sequence_done(struct rpc_task *task,
692} 716}
693#endif /* CONFIG_NFS_V4_1 */ 717#endif /* CONFIG_NFS_V4_1 */
694 718
695int _nfs4_call_sync(struct nfs_server *server, 719int _nfs4_call_sync(struct rpc_clnt *clnt,
720 struct nfs_server *server,
696 struct rpc_message *msg, 721 struct rpc_message *msg,
697 struct nfs4_sequence_args *args, 722 struct nfs4_sequence_args *args,
698 struct nfs4_sequence_res *res, 723 struct nfs4_sequence_res *res,
699 int cache_reply) 724 int cache_reply)
700{ 725{
701 args->sa_session = res->sr_session = NULL; 726 args->sa_session = res->sr_session = NULL;
702 return rpc_call_sync(server->client, msg, 0); 727 return rpc_call_sync(clnt, msg, 0);
703} 728}
704 729
705#define nfs4_call_sync(server, msg, args, res, cache_reply) \ 730static inline
706 (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ 731int nfs4_call_sync(struct rpc_clnt *clnt,
707 &(res)->seq_res, (cache_reply)) 732 struct nfs_server *server,
733 struct rpc_message *msg,
734 struct nfs4_sequence_args *args,
735 struct nfs4_sequence_res *res,
736 int cache_reply)
737{
738 return server->nfs_client->cl_mvops->call_sync(clnt, server, msg,
739 args, res, cache_reply);
740}
708 741
709static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) 742static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo)
710{ 743{
@@ -1255,14 +1288,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1255 case -NFS4ERR_BAD_HIGH_SLOT: 1288 case -NFS4ERR_BAD_HIGH_SLOT:
1256 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 1289 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
1257 case -NFS4ERR_DEADSESSION: 1290 case -NFS4ERR_DEADSESSION:
1258 nfs4_schedule_state_recovery( 1291 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
1259 server->nfs_client);
1260 goto out; 1292 goto out;
1261 case -NFS4ERR_STALE_CLIENTID: 1293 case -NFS4ERR_STALE_CLIENTID:
1262 case -NFS4ERR_STALE_STATEID: 1294 case -NFS4ERR_STALE_STATEID:
1263 case -NFS4ERR_EXPIRED: 1295 case -NFS4ERR_EXPIRED:
1264 /* Don't recall a delegation if it was lost */ 1296 /* Don't recall a delegation if it was lost */
1265 nfs4_schedule_state_recovery(server->nfs_client); 1297 nfs4_schedule_lease_recovery(server->nfs_client);
1266 goto out; 1298 goto out;
1267 case -ERESTARTSYS: 1299 case -ERESTARTSYS:
1268 /* 1300 /*
@@ -1271,7 +1303,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
1271 */ 1303 */
1272 case -NFS4ERR_ADMIN_REVOKED: 1304 case -NFS4ERR_ADMIN_REVOKED:
1273 case -NFS4ERR_BAD_STATEID: 1305 case -NFS4ERR_BAD_STATEID:
1274 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 1306 nfs4_schedule_stateid_recovery(server, state);
1275 case -EKEYEXPIRED: 1307 case -EKEYEXPIRED:
1276 /* 1308 /*
1277 * User RPCSEC_GSS context has expired. 1309 * User RPCSEC_GSS context has expired.
@@ -1574,9 +1606,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
1574 return 0; 1606 return 0;
1575} 1607}
1576 1608
1577static int nfs4_recover_expired_lease(struct nfs_server *server) 1609static int nfs4_client_recover_expired_lease(struct nfs_client *clp)
1578{ 1610{
1579 struct nfs_client *clp = server->nfs_client;
1580 unsigned int loop; 1611 unsigned int loop;
1581 int ret; 1612 int ret;
1582 1613
@@ -1587,12 +1618,17 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
1587 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && 1618 if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
1588 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) 1619 !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
1589 break; 1620 break;
1590 nfs4_schedule_state_recovery(clp); 1621 nfs4_schedule_state_manager(clp);
1591 ret = -EIO; 1622 ret = -EIO;
1592 } 1623 }
1593 return ret; 1624 return ret;
1594} 1625}
1595 1626
1627static int nfs4_recover_expired_lease(struct nfs_server *server)
1628{
1629 return nfs4_client_recover_expired_lease(server->nfs_client);
1630}
1631
1596/* 1632/*
1597 * OPEN_EXPIRED: 1633 * OPEN_EXPIRED:
1598 * reclaim state on the server after a network partition. 1634 * reclaim state on the server after a network partition.
@@ -1811,7 +1847,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
1811 } else 1847 } else
1812 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); 1848 memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
1813 1849
1814 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 1850 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
1815 if (status == 0 && state != NULL) 1851 if (status == 0 && state != NULL)
1816 renew_lease(server, timestamp); 1852 renew_lease(server, timestamp);
1817 return status; 1853 return status;
@@ -2070,7 +2106,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
2070 }; 2106 };
2071 int status; 2107 int status;
2072 2108
2073 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2109 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2074 if (status == 0) { 2110 if (status == 0) {
2075 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); 2111 memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask));
2076 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| 2112 server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS|
@@ -2140,7 +2176,7 @@ static int _nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2140 }; 2176 };
2141 2177
2142 nfs_fattr_init(info->fattr); 2178 nfs_fattr_init(info->fattr);
2143 return nfs4_call_sync(server, &msg, &args, &res, 0); 2179 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2144} 2180}
2145 2181
2146static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle, 2182static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -2156,15 +2192,41 @@ static int nfs4_lookup_root(struct nfs_server *server, struct nfs_fh *fhandle,
2156 return err; 2192 return err;
2157} 2193}
2158 2194
2195static int nfs4_lookup_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
2196 struct nfs_fsinfo *info, rpc_authflavor_t flavor)
2197{
2198 struct rpc_auth *auth;
2199 int ret;
2200
2201 auth = rpcauth_create(flavor, server->client);
2202 if (!auth) {
2203 ret = -EIO;
2204 goto out;
2205 }
2206 ret = nfs4_lookup_root(server, fhandle, info);
2207out:
2208 return ret;
2209}
2210
2159/* 2211/*
2160 * get the file handle for the "/" directory on the server 2212 * get the file handle for the "/" directory on the server
2161 */ 2213 */
2162static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, 2214static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle,
2163 struct nfs_fsinfo *info) 2215 struct nfs_fsinfo *info)
2164{ 2216{
2165 int status; 2217 int i, len, status = 0;
2218 rpc_authflavor_t flav_array[NFS_MAX_SECFLAVORS + 2];
2166 2219
2167 status = nfs4_lookup_root(server, fhandle, info); 2220 flav_array[0] = RPC_AUTH_UNIX;
2221 len = gss_mech_list_pseudoflavors(&flav_array[1]);
2222 flav_array[1+len] = RPC_AUTH_NULL;
2223 len += 2;
2224
2225 for (i = 0; i < len; i++) {
2226 status = nfs4_lookup_root_sec(server, fhandle, info, flav_array[i]);
2227 if (status != -EPERM)
2228 break;
2229 }
2168 if (status == 0) 2230 if (status == 0)
2169 status = nfs4_server_capabilities(server, fhandle); 2231 status = nfs4_server_capabilities(server, fhandle);
2170 if (status == 0) 2232 if (status == 0)
@@ -2229,7 +2291,7 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle,
2229 }; 2291 };
2230 2292
2231 nfs_fattr_init(fattr); 2293 nfs_fattr_init(fattr);
2232 return nfs4_call_sync(server, &msg, &args, &res, 0); 2294 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2233} 2295}
2234 2296
2235static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2297static int nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr)
@@ -2289,9 +2351,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
2289 return status; 2351 return status;
2290} 2352}
2291 2353
2292static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *dirfh, 2354static int _nfs4_proc_lookupfh(struct rpc_clnt *clnt, struct nfs_server *server,
2293 const struct qstr *name, struct nfs_fh *fhandle, 2355 const struct nfs_fh *dirfh, const struct qstr *name,
2294 struct nfs_fattr *fattr) 2356 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
2295{ 2357{
2296 int status; 2358 int status;
2297 struct nfs4_lookup_arg args = { 2359 struct nfs4_lookup_arg args = {
@@ -2313,7 +2375,7 @@ static int _nfs4_proc_lookupfh(struct nfs_server *server, const struct nfs_fh *d
2313 nfs_fattr_init(fattr); 2375 nfs_fattr_init(fattr);
2314 2376
2315 dprintk("NFS call lookupfh %s\n", name->name); 2377 dprintk("NFS call lookupfh %s\n", name->name);
2316 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2378 status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, &res.seq_res, 0);
2317 dprintk("NFS reply lookupfh: %d\n", status); 2379 dprintk("NFS reply lookupfh: %d\n", status);
2318 return status; 2380 return status;
2319} 2381}
@@ -2325,7 +2387,7 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2325 struct nfs4_exception exception = { }; 2387 struct nfs4_exception exception = { };
2326 int err; 2388 int err;
2327 do { 2389 do {
2328 err = _nfs4_proc_lookupfh(server, dirfh, name, fhandle, fattr); 2390 err = _nfs4_proc_lookupfh(server->client, server, dirfh, name, fhandle, fattr);
2329 /* FIXME: !!!! */ 2391 /* FIXME: !!!! */
2330 if (err == -NFS4ERR_MOVED) { 2392 if (err == -NFS4ERR_MOVED) {
2331 err = -EREMOTE; 2393 err = -EREMOTE;
@@ -2336,27 +2398,41 @@ static int nfs4_proc_lookupfh(struct nfs_server *server, struct nfs_fh *dirfh,
2336 return err; 2398 return err;
2337} 2399}
2338 2400
2339static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, 2401static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir,
2340 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2402 const struct qstr *name, struct nfs_fh *fhandle,
2403 struct nfs_fattr *fattr)
2341{ 2404{
2342 int status; 2405 int status;
2343 2406
2344 dprintk("NFS call lookup %s\n", name->name); 2407 dprintk("NFS call lookup %s\n", name->name);
2345 status = _nfs4_proc_lookupfh(NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr); 2408 status = _nfs4_proc_lookupfh(clnt, NFS_SERVER(dir), NFS_FH(dir), name, fhandle, fattr);
2346 if (status == -NFS4ERR_MOVED) 2409 if (status == -NFS4ERR_MOVED)
2347 status = nfs4_get_referral(dir, name, fattr, fhandle); 2410 status = nfs4_get_referral(dir, name, fattr, fhandle);
2348 dprintk("NFS reply lookup: %d\n", status); 2411 dprintk("NFS reply lookup: %d\n", status);
2349 return status; 2412 return status;
2350} 2413}
2351 2414
2352static int nfs4_proc_lookup(struct inode *dir, struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr) 2415void nfs_fixup_secinfo_attributes(struct nfs_fattr *fattr, struct nfs_fh *fh)
2416{
2417 memset(fh, 0, sizeof(struct nfs_fh));
2418 fattr->fsid.major = 1;
2419 fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
2420 NFS_ATTR_FATTR_NLINK | NFS_ATTR_FATTR_FSID | NFS_ATTR_FATTR_MOUNTPOINT;
2421 fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
2422 fattr->nlink = 2;
2423}
2424
2425static int nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
2426 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
2353{ 2427{
2354 struct nfs4_exception exception = { }; 2428 struct nfs4_exception exception = { };
2355 int err; 2429 int err;
2356 do { 2430 do {
2357 err = nfs4_handle_exception(NFS_SERVER(dir), 2431 err = nfs4_handle_exception(NFS_SERVER(dir),
2358 _nfs4_proc_lookup(dir, name, fhandle, fattr), 2432 _nfs4_proc_lookup(clnt, dir, name, fhandle, fattr),
2359 &exception); 2433 &exception);
2434 if (err == -EPERM)
2435 nfs_fixup_secinfo_attributes(fattr, fhandle);
2360 } while (exception.retry); 2436 } while (exception.retry);
2361 return err; 2437 return err;
2362} 2438}
@@ -2401,7 +2477,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry
2401 if (res.fattr == NULL) 2477 if (res.fattr == NULL)
2402 return -ENOMEM; 2478 return -ENOMEM;
2403 2479
2404 status = nfs4_call_sync(server, &msg, &args, &res, 0); 2480 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2405 if (!status) { 2481 if (!status) {
2406 entry->mask = 0; 2482 entry->mask = 0;
2407 if (res.access & NFS4_ACCESS_READ) 2483 if (res.access & NFS4_ACCESS_READ)
@@ -2468,7 +2544,7 @@ static int _nfs4_proc_readlink(struct inode *inode, struct page *page,
2468 .rpc_resp = &res, 2544 .rpc_resp = &res,
2469 }; 2545 };
2470 2546
2471 return nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); 2547 return nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
2472} 2548}
2473 2549
2474static int nfs4_proc_readlink(struct inode *inode, struct page *page, 2550static int nfs4_proc_readlink(struct inode *inode, struct page *page,
@@ -2557,7 +2633,7 @@ static int _nfs4_proc_remove(struct inode *dir, struct qstr *name)
2557 if (res.dir_attr == NULL) 2633 if (res.dir_attr == NULL)
2558 goto out; 2634 goto out;
2559 2635
2560 status = nfs4_call_sync(server, &msg, &args, &res, 1); 2636 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1);
2561 if (status == 0) { 2637 if (status == 0) {
2562 update_changeattr(dir, &res.cinfo); 2638 update_changeattr(dir, &res.cinfo);
2563 nfs_post_op_update_inode(dir, res.dir_attr); 2639 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2658,7 +2734,7 @@ static int _nfs4_proc_rename(struct inode *old_dir, struct qstr *old_name,
2658 if (res.old_fattr == NULL || res.new_fattr == NULL) 2734 if (res.old_fattr == NULL || res.new_fattr == NULL)
2659 goto out; 2735 goto out;
2660 2736
2661 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2737 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2662 if (!status) { 2738 if (!status) {
2663 update_changeattr(old_dir, &res.old_cinfo); 2739 update_changeattr(old_dir, &res.old_cinfo);
2664 nfs_post_op_update_inode(old_dir, res.old_fattr); 2740 nfs_post_op_update_inode(old_dir, res.old_fattr);
@@ -2709,7 +2785,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, struct qstr *
2709 if (res.fattr == NULL || res.dir_attr == NULL) 2785 if (res.fattr == NULL || res.dir_attr == NULL)
2710 goto out; 2786 goto out;
2711 2787
2712 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 2788 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
2713 if (!status) { 2789 if (!status) {
2714 update_changeattr(dir, &res.cinfo); 2790 update_changeattr(dir, &res.cinfo);
2715 nfs_post_op_update_inode(dir, res.dir_attr); 2791 nfs_post_op_update_inode(dir, res.dir_attr);
@@ -2772,8 +2848,8 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir,
2772 2848
2773static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data) 2849static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
2774{ 2850{
2775 int status = nfs4_call_sync(NFS_SERVER(dir), &data->msg, 2851 int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
2776 &data->arg, &data->res, 1); 2852 &data->arg.seq_args, &data->res.seq_res, 1);
2777 if (status == 0) { 2853 if (status == 0) {
2778 update_changeattr(dir, &data->res.dir_cinfo); 2854 update_changeattr(dir, &data->res.dir_cinfo);
2779 nfs_post_op_update_inode(dir, data->res.dir_fattr); 2855 nfs_post_op_update_inode(dir, data->res.dir_fattr);
@@ -2885,7 +2961,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
2885 (unsigned long long)cookie); 2961 (unsigned long long)cookie);
2886 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args); 2962 nfs4_setup_readdir(cookie, NFS_COOKIEVERF(dir), dentry, &args);
2887 res.pgbase = args.pgbase; 2963 res.pgbase = args.pgbase;
2888 status = nfs4_call_sync(NFS_SERVER(dir), &msg, &args, &res, 0); 2964 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
2889 if (status >= 0) { 2965 if (status >= 0) {
2890 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE); 2966 memcpy(NFS_COOKIEVERF(dir), res.verifier.data, NFS4_VERIFIER_SIZE);
2891 status += args.pgbase; 2967 status += args.pgbase;
@@ -2977,7 +3053,7 @@ static int _nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle,
2977 }; 3053 };
2978 3054
2979 nfs_fattr_init(fsstat->fattr); 3055 nfs_fattr_init(fsstat->fattr);
2980 return nfs4_call_sync(server, &msg, &args, &res, 0); 3056 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
2981} 3057}
2982 3058
2983static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat) 3059static int nfs4_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsstat *fsstat)
@@ -3008,7 +3084,7 @@ static int _nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle,
3008 .rpc_resp = &res, 3084 .rpc_resp = &res,
3009 }; 3085 };
3010 3086
3011 return nfs4_call_sync(server, &msg, &args, &res, 0); 3087 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3012} 3088}
3013 3089
3014static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo) 3090static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *fsinfo)
@@ -3053,7 +3129,7 @@ static int _nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle
3053 } 3129 }
3054 3130
3055 nfs_fattr_init(pathconf->fattr); 3131 nfs_fattr_init(pathconf->fattr);
3056 return nfs4_call_sync(server, &msg, &args, &res, 0); 3132 return nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
3057} 3133}
3058 3134
3059static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, 3135static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
@@ -3070,15 +3146,10 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle,
3070 return err; 3146 return err;
3071} 3147}
3072 3148
3073static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) 3149static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_read_data *data)
3074{ 3150{
3075 struct nfs_server *server = NFS_SERVER(data->inode); 3151 struct nfs_server *server = NFS_SERVER(data->inode);
3076 3152
3077 dprintk("--> %s\n", __func__);
3078
3079 if (!nfs4_sequence_done(task, &data->res.seq_res))
3080 return -EAGAIN;
3081
3082 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { 3153 if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
3083 nfs_restart_rpc(task, server->nfs_client); 3154 nfs_restart_rpc(task, server->nfs_client);
3084 return -EAGAIN; 3155 return -EAGAIN;
@@ -3090,19 +3161,44 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3090 return 0; 3161 return 0;
3091} 3162}
3092 3163
3164static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
3165{
3166
3167 dprintk("--> %s\n", __func__);
3168
3169 if (!nfs4_sequence_done(task, &data->res.seq_res))
3170 return -EAGAIN;
3171
3172 return data->read_done_cb(task, data);
3173}
3174
3093static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg) 3175static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message *msg)
3094{ 3176{
3095 data->timestamp = jiffies; 3177 data->timestamp = jiffies;
3178 data->read_done_cb = nfs4_read_done_cb;
3096 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; 3179 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
3097} 3180}
3098 3181
3099static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) 3182/* Reset the the nfs_read_data to send the read to the MDS. */
3183void nfs4_reset_read(struct rpc_task *task, struct nfs_read_data *data)
3184{
3185 dprintk("%s Reset task for i/o through\n", __func__);
3186 put_lseg(data->lseg);
3187 data->lseg = NULL;
3188 /* offsets will differ in the dense stripe case */
3189 data->args.offset = data->mds_offset;
3190 data->ds_clp = NULL;
3191 data->args.fh = NFS_FH(data->inode);
3192 data->read_done_cb = nfs4_read_done_cb;
3193 task->tk_ops = data->mds_ops;
3194 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3195}
3196EXPORT_SYMBOL_GPL(nfs4_reset_read);
3197
3198static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3100{ 3199{
3101 struct inode *inode = data->inode; 3200 struct inode *inode = data->inode;
3102 3201
3103 if (!nfs4_sequence_done(task, &data->res.seq_res))
3104 return -EAGAIN;
3105
3106 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { 3202 if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
3107 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3203 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
3108 return -EAGAIN; 3204 return -EAGAIN;
@@ -3114,23 +3210,50 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3114 return 0; 3210 return 0;
3115} 3211}
3116 3212
3213static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
3214{
3215 if (!nfs4_sequence_done(task, &data->res.seq_res))
3216 return -EAGAIN;
3217 return data->write_done_cb(task, data);
3218}
3219
3220/* Reset the the nfs_write_data to send the write to the MDS. */
3221void nfs4_reset_write(struct rpc_task *task, struct nfs_write_data *data)
3222{
3223 dprintk("%s Reset task for i/o through\n", __func__);
3224 put_lseg(data->lseg);
3225 data->lseg = NULL;
3226 data->ds_clp = NULL;
3227 data->write_done_cb = nfs4_write_done_cb;
3228 data->args.fh = NFS_FH(data->inode);
3229 data->args.bitmask = data->res.server->cache_consistency_bitmask;
3230 data->args.offset = data->mds_offset;
3231 data->res.fattr = &data->fattr;
3232 task->tk_ops = data->mds_ops;
3233 rpc_task_reset_client(task, NFS_CLIENT(data->inode));
3234}
3235EXPORT_SYMBOL_GPL(nfs4_reset_write);
3236
3117static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg) 3237static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_message *msg)
3118{ 3238{
3119 struct nfs_server *server = NFS_SERVER(data->inode); 3239 struct nfs_server *server = NFS_SERVER(data->inode);
3120 3240
3121 data->args.bitmask = server->cache_consistency_bitmask; 3241 if (data->lseg) {
3242 data->args.bitmask = NULL;
3243 data->res.fattr = NULL;
3244 } else
3245 data->args.bitmask = server->cache_consistency_bitmask;
3246 if (!data->write_done_cb)
3247 data->write_done_cb = nfs4_write_done_cb;
3122 data->res.server = server; 3248 data->res.server = server;
3123 data->timestamp = jiffies; 3249 data->timestamp = jiffies;
3124 3250
3125 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE]; 3251 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
3126} 3252}
3127 3253
3128static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) 3254static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_write_data *data)
3129{ 3255{
3130 struct inode *inode = data->inode; 3256 struct inode *inode = data->inode;
3131
3132 if (!nfs4_sequence_done(task, &data->res.seq_res))
3133 return -EAGAIN;
3134 3257
3135 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { 3258 if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
3136 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); 3259 nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client);
@@ -3140,11 +3263,24 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3140 return 0; 3263 return 0;
3141} 3264}
3142 3265
3266static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
3267{
3268 if (!nfs4_sequence_done(task, &data->res.seq_res))
3269 return -EAGAIN;
3270 return data->write_done_cb(task, data);
3271}
3272
3143static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg) 3273static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_message *msg)
3144{ 3274{
3145 struct nfs_server *server = NFS_SERVER(data->inode); 3275 struct nfs_server *server = NFS_SERVER(data->inode);
3146 3276
3147 data->args.bitmask = server->cache_consistency_bitmask; 3277 if (data->lseg) {
3278 data->args.bitmask = NULL;
3279 data->res.fattr = NULL;
3280 } else
3281 data->args.bitmask = server->cache_consistency_bitmask;
3282 if (!data->write_done_cb)
3283 data->write_done_cb = nfs4_commit_done_cb;
3148 data->res.server = server; 3284 data->res.server = server;
3149 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT]; 3285 msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
3150} 3286}
@@ -3178,7 +3314,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3178 if (task->tk_status < 0) { 3314 if (task->tk_status < 0) {
3179 /* Unless we're shutting down, schedule state recovery! */ 3315 /* Unless we're shutting down, schedule state recovery! */
3180 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3316 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
3181 nfs4_schedule_state_recovery(clp); 3317 nfs4_schedule_lease_recovery(clp);
3182 return; 3318 return;
3183 } 3319 }
3184 do_renew_lease(clp, timestamp); 3320 do_renew_lease(clp, timestamp);
@@ -3252,6 +3388,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
3252 } 3388 }
3253} 3389}
3254 3390
3391static int buf_to_pages_noslab(const void *buf, size_t buflen,
3392 struct page **pages, unsigned int *pgbase)
3393{
3394 struct page *newpage, **spages;
3395 int rc = 0;
3396 size_t len;
3397 spages = pages;
3398
3399 do {
3400 len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
3401 newpage = alloc_page(GFP_KERNEL);
3402
3403 if (newpage == NULL)
3404 goto unwind;
3405 memcpy(page_address(newpage), buf, len);
3406 buf += len;
3407 buflen -= len;
3408 *pages++ = newpage;
3409 rc++;
3410 } while (buflen != 0);
3411
3412 return rc;
3413
3414unwind:
3415 for(; rc > 0; rc--)
3416 __free_page(spages[rc-1]);
3417 return -ENOMEM;
3418}
3419
3255struct nfs4_cached_acl { 3420struct nfs4_cached_acl {
3256 int cached; 3421 int cached;
3257 size_t len; 3422 size_t len;
@@ -3353,7 +3518,7 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
3353 resp_buf = buf; 3518 resp_buf = buf;
3354 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase); 3519 buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
3355 } 3520 }
3356 ret = nfs4_call_sync(NFS_SERVER(inode), &msg, &args, &res, 0); 3521 ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
3357 if (ret) 3522 if (ret)
3358 goto out_free; 3523 goto out_free;
3359 if (res.acl_len > args.acl_len) 3524 if (res.acl_len > args.acl_len)
@@ -3420,13 +3585,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
3420 .rpc_argp = &arg, 3585 .rpc_argp = &arg,
3421 .rpc_resp = &res, 3586 .rpc_resp = &res,
3422 }; 3587 };
3423 int ret; 3588 int ret, i;
3424 3589
3425 if (!nfs4_server_supports_acls(server)) 3590 if (!nfs4_server_supports_acls(server))
3426 return -EOPNOTSUPP; 3591 return -EOPNOTSUPP;
3592 i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
3593 if (i < 0)
3594 return i;
3427 nfs_inode_return_delegation(inode); 3595 nfs_inode_return_delegation(inode);
3428 buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase); 3596 ret = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3429 ret = nfs4_call_sync(server, &msg, &arg, &res, 1); 3597
3598 /*
3599 * Free each page after tx, so the only ref left is
3600 * held by the network stack
3601 */
3602 for (; i > 0; i--)
3603 put_page(pages[i-1]);
3604
3430 /* 3605 /*
3431 * Acl update can result in inode attribute update. 3606 * Acl update can result in inode attribute update.
3432 * so mark the attribute cache invalid. 3607 * so mark the attribute cache invalid.
@@ -3464,12 +3639,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3464 case -NFS4ERR_OPENMODE: 3639 case -NFS4ERR_OPENMODE:
3465 if (state == NULL) 3640 if (state == NULL)
3466 break; 3641 break;
3467 nfs4_state_mark_reclaim_nograce(clp, state); 3642 nfs4_schedule_stateid_recovery(server, state);
3468 goto do_state_recovery; 3643 goto wait_on_recovery;
3469 case -NFS4ERR_STALE_STATEID: 3644 case -NFS4ERR_STALE_STATEID:
3470 case -NFS4ERR_STALE_CLIENTID: 3645 case -NFS4ERR_STALE_CLIENTID:
3471 case -NFS4ERR_EXPIRED: 3646 case -NFS4ERR_EXPIRED:
3472 goto do_state_recovery; 3647 nfs4_schedule_lease_recovery(clp);
3648 goto wait_on_recovery;
3473#if defined(CONFIG_NFS_V4_1) 3649#if defined(CONFIG_NFS_V4_1)
3474 case -NFS4ERR_BADSESSION: 3650 case -NFS4ERR_BADSESSION:
3475 case -NFS4ERR_BADSLOT: 3651 case -NFS4ERR_BADSLOT:
@@ -3480,7 +3656,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3480 case -NFS4ERR_SEQ_MISORDERED: 3656 case -NFS4ERR_SEQ_MISORDERED:
3481 dprintk("%s ERROR %d, Reset session\n", __func__, 3657 dprintk("%s ERROR %d, Reset session\n", __func__,
3482 task->tk_status); 3658 task->tk_status);
3483 nfs4_schedule_state_recovery(clp); 3659 nfs4_schedule_session_recovery(clp->cl_session);
3484 task->tk_status = 0; 3660 task->tk_status = 0;
3485 return -EAGAIN; 3661 return -EAGAIN;
3486#endif /* CONFIG_NFS_V4_1 */ 3662#endif /* CONFIG_NFS_V4_1 */
@@ -3497,9 +3673,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
3497 } 3673 }
3498 task->tk_status = nfs4_map_errors(task->tk_status); 3674 task->tk_status = nfs4_map_errors(task->tk_status);
3499 return 0; 3675 return 0;
3500do_state_recovery: 3676wait_on_recovery:
3501 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); 3677 rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
3502 nfs4_schedule_state_recovery(clp);
3503 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) 3678 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
3504 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); 3679 rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
3505 task->tk_status = 0; 3680 task->tk_status = 0;
@@ -3781,7 +3956,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
3781 lsp = request->fl_u.nfs4_fl.owner; 3956 lsp = request->fl_u.nfs4_fl.owner;
3782 arg.lock_owner.id = lsp->ls_id.id; 3957 arg.lock_owner.id = lsp->ls_id.id;
3783 arg.lock_owner.s_dev = server->s_dev; 3958 arg.lock_owner.s_dev = server->s_dev;
3784 status = nfs4_call_sync(server, &msg, &arg, &res, 1); 3959 status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
3785 switch (status) { 3960 switch (status) {
3786 case 0: 3961 case 0:
3787 request->fl_type = F_UNLCK; 3962 request->fl_type = F_UNLCK;
@@ -4110,7 +4285,7 @@ static void nfs4_lock_release(void *calldata)
4110 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp, 4285 task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
4111 data->arg.lock_seqid); 4286 data->arg.lock_seqid);
4112 if (!IS_ERR(task)) 4287 if (!IS_ERR(task))
4113 rpc_put_task(task); 4288 rpc_put_task_async(task);
4114 dprintk("%s: cancelling lock!\n", __func__); 4289 dprintk("%s: cancelling lock!\n", __func__);
4115 } else 4290 } else
4116 nfs_free_seqid(data->arg.lock_seqid); 4291 nfs_free_seqid(data->arg.lock_seqid);
@@ -4134,23 +4309,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
4134 4309
4135static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) 4310static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
4136{ 4311{
4137 struct nfs_client *clp = server->nfs_client;
4138 struct nfs4_state *state = lsp->ls_state;
4139
4140 switch (error) { 4312 switch (error) {
4141 case -NFS4ERR_ADMIN_REVOKED: 4313 case -NFS4ERR_ADMIN_REVOKED:
4142 case -NFS4ERR_BAD_STATEID: 4314 case -NFS4ERR_BAD_STATEID:
4143 case -NFS4ERR_EXPIRED: 4315 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4144 if (new_lock_owner != 0 || 4316 if (new_lock_owner != 0 ||
4145 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) 4317 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4146 nfs4_state_mark_reclaim_nograce(clp, state); 4318 nfs4_schedule_stateid_recovery(server, lsp->ls_state);
4147 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4148 break; 4319 break;
4149 case -NFS4ERR_STALE_STATEID: 4320 case -NFS4ERR_STALE_STATEID:
4150 if (new_lock_owner != 0 ||
4151 (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
4152 nfs4_state_mark_reclaim_reboot(clp, state);
4153 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; 4321 lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
4322 case -NFS4ERR_EXPIRED:
4323 nfs4_schedule_lease_recovery(server->nfs_client);
4154 }; 4324 };
4155} 4325}
4156 4326
@@ -4366,12 +4536,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4366 case -NFS4ERR_EXPIRED: 4536 case -NFS4ERR_EXPIRED:
4367 case -NFS4ERR_STALE_CLIENTID: 4537 case -NFS4ERR_STALE_CLIENTID:
4368 case -NFS4ERR_STALE_STATEID: 4538 case -NFS4ERR_STALE_STATEID:
4539 nfs4_schedule_lease_recovery(server->nfs_client);
4540 goto out;
4369 case -NFS4ERR_BADSESSION: 4541 case -NFS4ERR_BADSESSION:
4370 case -NFS4ERR_BADSLOT: 4542 case -NFS4ERR_BADSLOT:
4371 case -NFS4ERR_BAD_HIGH_SLOT: 4543 case -NFS4ERR_BAD_HIGH_SLOT:
4372 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: 4544 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
4373 case -NFS4ERR_DEADSESSION: 4545 case -NFS4ERR_DEADSESSION:
4374 nfs4_schedule_state_recovery(server->nfs_client); 4546 nfs4_schedule_session_recovery(server->nfs_client->cl_session);
4375 goto out; 4547 goto out;
4376 case -ERESTARTSYS: 4548 case -ERESTARTSYS:
4377 /* 4549 /*
@@ -4381,7 +4553,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
4381 case -NFS4ERR_ADMIN_REVOKED: 4553 case -NFS4ERR_ADMIN_REVOKED:
4382 case -NFS4ERR_BAD_STATEID: 4554 case -NFS4ERR_BAD_STATEID:
4383 case -NFS4ERR_OPENMODE: 4555 case -NFS4ERR_OPENMODE:
4384 nfs4_state_mark_reclaim_nograce(server->nfs_client, state); 4556 nfs4_schedule_stateid_recovery(server, state);
4385 err = 0; 4557 err = 0;
4386 goto out; 4558 goto out;
4387 case -EKEYEXPIRED: 4559 case -EKEYEXPIRED:
@@ -4512,12 +4684,46 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
4512 nfs_fattr_init(&fs_locations->fattr); 4684 nfs_fattr_init(&fs_locations->fattr);
4513 fs_locations->server = server; 4685 fs_locations->server = server;
4514 fs_locations->nlocations = 0; 4686 fs_locations->nlocations = 0;
4515 status = nfs4_call_sync(server, &msg, &args, &res, 0); 4687 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
4516 nfs_fixup_referral_attributes(&fs_locations->fattr); 4688 nfs_fixup_referral_attributes(&fs_locations->fattr);
4517 dprintk("%s: returned status = %d\n", __func__, status); 4689 dprintk("%s: returned status = %d\n", __func__, status);
4518 return status; 4690 return status;
4519} 4691}
4520 4692
4693static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
4694{
4695 int status;
4696 struct nfs4_secinfo_arg args = {
4697 .dir_fh = NFS_FH(dir),
4698 .name = name,
4699 };
4700 struct nfs4_secinfo_res res = {
4701 .flavors = flavors,
4702 };
4703 struct rpc_message msg = {
4704 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SECINFO],
4705 .rpc_argp = &args,
4706 .rpc_resp = &res,
4707 };
4708
4709 dprintk("NFS call secinfo %s\n", name->name);
4710 status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &msg, &args.seq_args, &res.seq_res, 0);
4711 dprintk("NFS reply secinfo: %d\n", status);
4712 return status;
4713}
4714
4715int nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors)
4716{
4717 struct nfs4_exception exception = { };
4718 int err;
4719 do {
4720 err = nfs4_handle_exception(NFS_SERVER(dir),
4721 _nfs4_proc_secinfo(dir, name, flavors),
4722 &exception);
4723 } while (exception.retry);
4724 return err;
4725}
4726
4521#ifdef CONFIG_NFS_V4_1 4727#ifdef CONFIG_NFS_V4_1
4522/* 4728/*
4523 * Check the exchange flags returned by the server for invalid flags, having 4729 * Check the exchange flags returned by the server for invalid flags, having
@@ -4988,10 +5194,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
4988 int status; 5194 int status;
4989 unsigned *ptr; 5195 unsigned *ptr;
4990 struct nfs4_session *session = clp->cl_session; 5196 struct nfs4_session *session = clp->cl_session;
5197 long timeout = 0;
5198 int err;
4991 5199
4992 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); 5200 dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
4993 5201
4994 status = _nfs4_proc_create_session(clp); 5202 do {
5203 status = _nfs4_proc_create_session(clp);
5204 if (status == -NFS4ERR_DELAY) {
5205 err = nfs4_delay(clp->cl_rpcclient, &timeout);
5206 if (err)
5207 status = err;
5208 }
5209 } while (status == -NFS4ERR_DELAY);
5210
4995 if (status) 5211 if (status)
4996 goto out; 5212 goto out;
4997 5213
@@ -5073,6 +5289,27 @@ int nfs4_init_session(struct nfs_server *server)
5073 return ret; 5289 return ret;
5074} 5290}
5075 5291
5292int nfs4_init_ds_session(struct nfs_client *clp)
5293{
5294 struct nfs4_session *session = clp->cl_session;
5295 int ret;
5296
5297 if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state))
5298 return 0;
5299
5300 ret = nfs4_client_recover_expired_lease(clp);
5301 if (!ret)
5302 /* Test for the DS role */
5303 if (!is_ds_client(clp))
5304 ret = -ENODEV;
5305 if (!ret)
5306 ret = nfs4_check_client_ready(clp);
5307 return ret;
5308
5309}
5310EXPORT_SYMBOL_GPL(nfs4_init_ds_session);
5311
5312
5076/* 5313/*
5077 * Renew the cl_session lease. 5314 * Renew the cl_session lease.
5078 */ 5315 */
@@ -5100,7 +5337,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
5100 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5337 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5101 return -EAGAIN; 5338 return -EAGAIN;
5102 default: 5339 default:
5103 nfs4_schedule_state_recovery(clp); 5340 nfs4_schedule_lease_recovery(clp);
5104 } 5341 }
5105 return 0; 5342 return 0;
5106} 5343}
@@ -5187,7 +5424,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
5187 if (IS_ERR(task)) 5424 if (IS_ERR(task))
5188 ret = PTR_ERR(task); 5425 ret = PTR_ERR(task);
5189 else 5426 else
5190 rpc_put_task(task); 5427 rpc_put_task_async(task);
5191 dprintk("<-- %s status=%d\n", __func__, ret); 5428 dprintk("<-- %s status=%d\n", __func__, ret);
5192 return ret; 5429 return ret;
5193} 5430}
@@ -5203,8 +5440,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
5203 goto out; 5440 goto out;
5204 } 5441 }
5205 ret = rpc_wait_for_completion_task(task); 5442 ret = rpc_wait_for_completion_task(task);
5206 if (!ret) 5443 if (!ret) {
5444 struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
5445
5446 if (task->tk_status == 0)
5447 nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
5207 ret = task->tk_status; 5448 ret = task->tk_status;
5449 }
5208 rpc_put_task(task); 5450 rpc_put_task(task);
5209out: 5451out:
5210 dprintk("<-- %s status=%d\n", __func__, ret); 5452 dprintk("<-- %s status=%d\n", __func__, ret);
@@ -5241,7 +5483,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
5241 rpc_delay(task, NFS4_POLL_RETRY_MAX); 5483 rpc_delay(task, NFS4_POLL_RETRY_MAX);
5242 return -EAGAIN; 5484 return -EAGAIN;
5243 default: 5485 default:
5244 nfs4_schedule_state_recovery(clp); 5486 nfs4_schedule_lease_recovery(clp);
5245 } 5487 }
5246 return 0; 5488 return 0;
5247} 5489}
@@ -5309,6 +5551,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
5309 status = PTR_ERR(task); 5551 status = PTR_ERR(task);
5310 goto out; 5552 goto out;
5311 } 5553 }
5554 status = nfs4_wait_for_completion_rpc_task(task);
5555 if (status == 0)
5556 status = task->tk_status;
5312 rpc_put_task(task); 5557 rpc_put_task(task);
5313 return 0; 5558 return 0;
5314out: 5559out:
@@ -5371,8 +5616,6 @@ static void nfs4_layoutget_release(void *calldata)
5371 struct nfs4_layoutget *lgp = calldata; 5616 struct nfs4_layoutget *lgp = calldata;
5372 5617
5373 dprintk("--> %s\n", __func__); 5618 dprintk("--> %s\n", __func__);
5374 if (lgp->res.layout.buf != NULL)
5375 free_page((unsigned long) lgp->res.layout.buf);
5376 put_nfs_open_context(lgp->args.ctx); 5619 put_nfs_open_context(lgp->args.ctx);
5377 kfree(calldata); 5620 kfree(calldata);
5378 dprintk("<-- %s\n", __func__); 5621 dprintk("<-- %s\n", __func__);
@@ -5404,12 +5647,7 @@ int nfs4_proc_layoutget(struct nfs4_layoutget *lgp)
5404 5647
5405 dprintk("--> %s\n", __func__); 5648 dprintk("--> %s\n", __func__);
5406 5649
5407 lgp->res.layout.buf = (void *)__get_free_page(GFP_NOFS); 5650 lgp->res.layoutp = &lgp->args.layout;
5408 if (lgp->res.layout.buf == NULL) {
5409 nfs4_layoutget_release(lgp);
5410 return -ENOMEM;
5411 }
5412
5413 lgp->res.seq_res.sr_slot = NULL; 5651 lgp->res.seq_res.sr_slot = NULL;
5414 task = rpc_run_task(&task_setup_data); 5652 task = rpc_run_task(&task_setup_data);
5415 if (IS_ERR(task)) 5653 if (IS_ERR(task))
@@ -5441,7 +5679,7 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5441 int status; 5679 int status;
5442 5680
5443 dprintk("--> %s\n", __func__); 5681 dprintk("--> %s\n", __func__);
5444 status = nfs4_call_sync(server, &msg, &args, &res, 0); 5682 status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0);
5445 dprintk("<-- %s status=%d\n", __func__, status); 5683 dprintk("<-- %s status=%d\n", __func__, status);
5446 5684
5447 return status; 5685 return status;
@@ -5461,6 +5699,100 @@ int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev)
5461} 5699}
5462EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo); 5700EXPORT_SYMBOL_GPL(nfs4_proc_getdeviceinfo);
5463 5701
5702static void nfs4_layoutcommit_prepare(struct rpc_task *task, void *calldata)
5703{
5704 struct nfs4_layoutcommit_data *data = calldata;
5705 struct nfs_server *server = NFS_SERVER(data->args.inode);
5706
5707 if (nfs4_setup_sequence(server, &data->args.seq_args,
5708 &data->res.seq_res, 1, task))
5709 return;
5710 rpc_call_start(task);
5711}
5712
5713static void
5714nfs4_layoutcommit_done(struct rpc_task *task, void *calldata)
5715{
5716 struct nfs4_layoutcommit_data *data = calldata;
5717 struct nfs_server *server = NFS_SERVER(data->args.inode);
5718
5719 if (!nfs4_sequence_done(task, &data->res.seq_res))
5720 return;
5721
5722 switch (task->tk_status) { /* Just ignore these failures */
5723 case NFS4ERR_DELEG_REVOKED: /* layout was recalled */
5724 case NFS4ERR_BADIOMODE: /* no IOMODE_RW layout for range */
5725 case NFS4ERR_BADLAYOUT: /* no layout */
5726 case NFS4ERR_GRACE: /* loca_recalim always false */
5727 task->tk_status = 0;
5728 }
5729
5730 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
5731 nfs_restart_rpc(task, server->nfs_client);
5732 return;
5733 }
5734
5735 if (task->tk_status == 0)
5736 nfs_post_op_update_inode_force_wcc(data->args.inode,
5737 data->res.fattr);
5738}
5739
5740static void nfs4_layoutcommit_release(void *calldata)
5741{
5742 struct nfs4_layoutcommit_data *data = calldata;
5743
5744 /* Matched by references in pnfs_set_layoutcommit */
5745 put_lseg(data->lseg);
5746 put_rpccred(data->cred);
5747 kfree(data);
5748}
5749
5750static const struct rpc_call_ops nfs4_layoutcommit_ops = {
5751 .rpc_call_prepare = nfs4_layoutcommit_prepare,
5752 .rpc_call_done = nfs4_layoutcommit_done,
5753 .rpc_release = nfs4_layoutcommit_release,
5754};
5755
5756int
5757nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
5758{
5759 struct rpc_message msg = {
5760 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTCOMMIT],
5761 .rpc_argp = &data->args,
5762 .rpc_resp = &data->res,
5763 .rpc_cred = data->cred,
5764 };
5765 struct rpc_task_setup task_setup_data = {
5766 .task = &data->task,
5767 .rpc_client = NFS_CLIENT(data->args.inode),
5768 .rpc_message = &msg,
5769 .callback_ops = &nfs4_layoutcommit_ops,
5770 .callback_data = data,
5771 .flags = RPC_TASK_ASYNC,
5772 };
5773 struct rpc_task *task;
5774 int status = 0;
5775
5776 dprintk("NFS: %4d initiating layoutcommit call. sync %d "
5777 "lbw: %llu inode %lu\n",
5778 data->task.tk_pid, sync,
5779 data->args.lastbytewritten,
5780 data->args.inode->i_ino);
5781
5782 task = rpc_run_task(&task_setup_data);
5783 if (IS_ERR(task))
5784 return PTR_ERR(task);
5785 if (sync == false)
5786 goto out;
5787 status = nfs4_wait_for_completion_rpc_task(task);
5788 if (status != 0)
5789 goto out;
5790 status = task->tk_status;
5791out:
5792 dprintk("%s: status %d\n", __func__, status);
5793 rpc_put_task(task);
5794 return status;
5795}
5464#endif /* CONFIG_NFS_V4_1 */ 5796#endif /* CONFIG_NFS_V4_1 */
5465 5797
5466struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = { 5798struct nfs4_state_recovery_ops nfs40_reboot_recovery_ops = {
@@ -5595,6 +5927,8 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
5595 .clear_acl_cache = nfs4_zap_acl_attr, 5927 .clear_acl_cache = nfs4_zap_acl_attr,
5596 .close_context = nfs4_close_context, 5928 .close_context = nfs4_close_context,
5597 .open_context = nfs4_atomic_open, 5929 .open_context = nfs4_atomic_open,
5930 .init_client = nfs4_init_client,
5931 .secinfo = nfs4_proc_secinfo,
5598}; 5932};
5599 5933
5600static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { 5934static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 402143d75fc5..df8e7f3ca56d 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -64,12 +64,8 @@ nfs4_renew_state(struct work_struct *work)
64 ops = clp->cl_mvops->state_renewal_ops; 64 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 65 dprintk("%s: start\n", __func__);
66 66
67 rcu_read_lock(); 67 if (test_bit(NFS_CS_STOP_RENEW, &clp->cl_res_state))
68 if (list_empty(&clp->cl_superblocks)) {
69 rcu_read_unlock();
70 goto out; 68 goto out;
71 }
72 rcu_read_unlock();
73 69
74 spin_lock(&clp->cl_lock); 70 spin_lock(&clp->cl_lock);
75 lease = clp->cl_lease_time; 71 lease = clp->cl_lease_time;
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b57a04c..a6804f704d9d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -153,6 +153,11 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
153 int status; 153 int status;
154 struct nfs_fsinfo fsinfo; 154 struct nfs_fsinfo fsinfo;
155 155
156 if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
157 nfs4_schedule_state_renewal(clp);
158 return 0;
159 }
160
156 status = nfs4_proc_get_lease_time(clp, &fsinfo); 161 status = nfs4_proc_get_lease_time(clp, &fsinfo);
157 if (status == 0) { 162 if (status == 0) {
158 /* Update lease time and schedule renewal */ 163 /* Update lease time and schedule renewal */
@@ -585,7 +590,8 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
585 state->owner = owner; 590 state->owner = owner;
586 atomic_inc(&owner->so_count); 591 atomic_inc(&owner->so_count);
587 list_add(&state->inode_states, &nfsi->open_states); 592 list_add(&state->inode_states, &nfsi->open_states);
588 state->inode = igrab(inode); 593 ihold(inode);
594 state->inode = inode;
589 spin_unlock(&inode->i_lock); 595 spin_unlock(&inode->i_lock);
590 /* Note: The reclaim code dictates that we add stateless 596 /* Note: The reclaim code dictates that we add stateless
591 * and read-only stateids to the end of the list */ 597 * and read-only stateids to the end of the list */
@@ -1007,9 +1013,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
1007} 1013}
1008 1014
1009/* 1015/*
1010 * Schedule a state recovery attempt 1016 * Schedule a lease recovery attempt
1011 */ 1017 */
1012void nfs4_schedule_state_recovery(struct nfs_client *clp) 1018void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1013{ 1019{
1014 if (!clp) 1020 if (!clp)
1015 return; 1021 return;
@@ -1018,7 +1024,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
1018 nfs4_schedule_state_manager(clp); 1024 nfs4_schedule_state_manager(clp);
1019} 1025}
1020 1026
1021int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1027static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1022{ 1028{
1023 1029
1024 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1030 set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1032,7 +1038,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
1032 return 1; 1038 return 1;
1033} 1039}
1034 1040
1035int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state) 1041static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
1036{ 1042{
1037 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags); 1043 set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
1038 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); 1044 clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1041,6 +1047,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
1041 return 1; 1047 return 1;
1042} 1048}
1043 1049
1050void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
1051{
1052 struct nfs_client *clp = server->nfs_client;
1053
1054 nfs4_state_mark_reclaim_nograce(clp, state);
1055 nfs4_schedule_state_manager(clp);
1056}
1057
1044static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops) 1058static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
1045{ 1059{
1046 struct inode *inode = state->inode; 1060 struct inode *inode = state->inode;
@@ -1436,10 +1450,16 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
1436} 1450}
1437 1451
1438#ifdef CONFIG_NFS_V4_1 1452#ifdef CONFIG_NFS_V4_1
1453void nfs4_schedule_session_recovery(struct nfs4_session *session)
1454{
1455 nfs4_schedule_lease_recovery(session->clp);
1456}
1457EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
1458
1439void nfs41_handle_recall_slot(struct nfs_client *clp) 1459void nfs41_handle_recall_slot(struct nfs_client *clp)
1440{ 1460{
1441 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state); 1461 set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
1442 nfs4_schedule_state_recovery(clp); 1462 nfs4_schedule_state_manager(clp);
1443} 1463}
1444 1464
1445static void nfs4_reset_all_state(struct nfs_client *clp) 1465static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1447,7 +1467,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
1447 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1467 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1448 clp->cl_boot_time = CURRENT_TIME; 1468 clp->cl_boot_time = CURRENT_TIME;
1449 nfs4_state_start_reclaim_nograce(clp); 1469 nfs4_state_start_reclaim_nograce(clp);
1450 nfs4_schedule_state_recovery(clp); 1470 nfs4_schedule_state_manager(clp);
1451 } 1471 }
1452} 1472}
1453 1473
@@ -1455,7 +1475,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
1455{ 1475{
1456 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) { 1476 if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
1457 nfs4_state_start_reclaim_reboot(clp); 1477 nfs4_state_start_reclaim_reboot(clp);
1458 nfs4_schedule_state_recovery(clp); 1478 nfs4_schedule_state_manager(clp);
1459 } 1479 }
1460} 1480}
1461 1481
@@ -1475,7 +1495,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
1475{ 1495{
1476 nfs_expire_all_delegations(clp); 1496 nfs_expire_all_delegations(clp);
1477 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) 1497 if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
1478 nfs4_schedule_state_recovery(clp); 1498 nfs4_schedule_state_manager(clp);
1479} 1499}
1480 1500
1481void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) 1501void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168b6ee9..dddfb5795d7b 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -46,6 +46,7 @@
46#include <linux/kdev_t.h> 46#include <linux/kdev_t.h>
47#include <linux/sunrpc/clnt.h> 47#include <linux/sunrpc/clnt.h>
48#include <linux/sunrpc/msg_prot.h> 48#include <linux/sunrpc/msg_prot.h>
49#include <linux/sunrpc/gss_api.h>
49#include <linux/nfs.h> 50#include <linux/nfs.h>
50#include <linux/nfs4.h> 51#include <linux/nfs4.h>
51#include <linux/nfs_fs.h> 52#include <linux/nfs_fs.h>
@@ -112,7 +113,7 @@ static int nfs4_stat_to_errno(int);
112#define encode_restorefh_maxsz (op_encode_hdr_maxsz) 113#define encode_restorefh_maxsz (op_encode_hdr_maxsz)
113#define decode_restorefh_maxsz (op_decode_hdr_maxsz) 114#define decode_restorefh_maxsz (op_decode_hdr_maxsz)
114#define encode_fsinfo_maxsz (encode_getattr_maxsz) 115#define encode_fsinfo_maxsz (encode_getattr_maxsz)
115#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) 116#define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 15)
116#define encode_renew_maxsz (op_encode_hdr_maxsz + 3) 117#define encode_renew_maxsz (op_encode_hdr_maxsz + 3)
117#define decode_renew_maxsz (op_decode_hdr_maxsz) 118#define decode_renew_maxsz (op_decode_hdr_maxsz)
118#define encode_setclientid_maxsz \ 119#define encode_setclientid_maxsz \
@@ -253,6 +254,8 @@ static int nfs4_stat_to_errno(int);
253 (encode_getattr_maxsz) 254 (encode_getattr_maxsz)
254#define decode_fs_locations_maxsz \ 255#define decode_fs_locations_maxsz \
255 (0) 256 (0)
257#define encode_secinfo_maxsz (op_encode_hdr_maxsz + nfs4_name_maxsz)
258#define decode_secinfo_maxsz (op_decode_hdr_maxsz + 4 + (NFS_MAX_SECFLAVORS * (16 + GSS_OID_MAX_LEN)))
256 259
257#if defined(CONFIG_NFS_V4_1) 260#if defined(CONFIG_NFS_V4_1)
258#define NFS4_MAX_MACHINE_NAME_LEN (64) 261#define NFS4_MAX_MACHINE_NAME_LEN (64)
@@ -324,6 +327,18 @@ static int nfs4_stat_to_errno(int);
324#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ 327#define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \
325 decode_stateid_maxsz + \ 328 decode_stateid_maxsz + \
326 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE)) 329 XDR_QUADLEN(PNFS_LAYOUT_MAXSIZE))
330#define encode_layoutcommit_maxsz (op_encode_hdr_maxsz + \
331 2 /* offset */ + \
332 2 /* length */ + \
333 1 /* reclaim */ + \
334 encode_stateid_maxsz + \
335 1 /* new offset (true) */ + \
336 2 /* last byte written */ + \
337 1 /* nt_timechanged (false) */ + \
338 1 /* layoutupdate4 layout type */ + \
339 1 /* NULL filelayout layoutupdate4 payload */)
340#define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3)
341
327#else /* CONFIG_NFS_V4_1 */ 342#else /* CONFIG_NFS_V4_1 */
328#define encode_sequence_maxsz 0 343#define encode_sequence_maxsz 0
329#define decode_sequence_maxsz 0 344#define decode_sequence_maxsz 0
@@ -676,6 +691,14 @@ static int nfs4_stat_to_errno(int);
676 decode_putfh_maxsz + \ 691 decode_putfh_maxsz + \
677 decode_lookup_maxsz + \ 692 decode_lookup_maxsz + \
678 decode_fs_locations_maxsz) 693 decode_fs_locations_maxsz)
694#define NFS4_enc_secinfo_sz (compound_encode_hdr_maxsz + \
695 encode_sequence_maxsz + \
696 encode_putfh_maxsz + \
697 encode_secinfo_maxsz)
698#define NFS4_dec_secinfo_sz (compound_decode_hdr_maxsz + \
699 decode_sequence_maxsz + \
700 decode_putfh_maxsz + \
701 decode_secinfo_maxsz)
679#if defined(CONFIG_NFS_V4_1) 702#if defined(CONFIG_NFS_V4_1)
680#define NFS4_enc_exchange_id_sz \ 703#define NFS4_enc_exchange_id_sz \
681 (compound_encode_hdr_maxsz + \ 704 (compound_encode_hdr_maxsz + \
@@ -727,6 +750,17 @@ static int nfs4_stat_to_errno(int);
727 decode_sequence_maxsz + \ 750 decode_sequence_maxsz + \
728 decode_putfh_maxsz + \ 751 decode_putfh_maxsz + \
729 decode_layoutget_maxsz) 752 decode_layoutget_maxsz)
753#define NFS4_enc_layoutcommit_sz (compound_encode_hdr_maxsz + \
754 encode_sequence_maxsz +\
755 encode_putfh_maxsz + \
756 encode_layoutcommit_maxsz + \
757 encode_getattr_maxsz)
758#define NFS4_dec_layoutcommit_sz (compound_decode_hdr_maxsz + \
759 decode_sequence_maxsz + \
760 decode_putfh_maxsz + \
761 decode_layoutcommit_maxsz + \
762 decode_getattr_maxsz)
763
730 764
731const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH + 765const u32 nfs41_maxwrite_overhead = ((RPC_MAX_HEADER_WITH_AUTH +
732 compound_encode_hdr_maxsz + 766 compound_encode_hdr_maxsz +
@@ -844,7 +878,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
844 if (iap->ia_valid & ATTR_MODE) 878 if (iap->ia_valid & ATTR_MODE)
845 len += 4; 879 len += 4;
846 if (iap->ia_valid & ATTR_UID) { 880 if (iap->ia_valid & ATTR_UID) {
847 owner_namelen = nfs_map_uid_to_name(server->nfs_client, iap->ia_uid, owner_name, IDMAP_NAMESZ); 881 owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ);
848 if (owner_namelen < 0) { 882 if (owner_namelen < 0) {
849 dprintk("nfs: couldn't resolve uid %d to string\n", 883 dprintk("nfs: couldn't resolve uid %d to string\n",
850 iap->ia_uid); 884 iap->ia_uid);
@@ -856,7 +890,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const
856 len += 4 + (XDR_QUADLEN(owner_namelen) << 2); 890 len += 4 + (XDR_QUADLEN(owner_namelen) << 2);
857 } 891 }
858 if (iap->ia_valid & ATTR_GID) { 892 if (iap->ia_valid & ATTR_GID) {
859 owner_grouplen = nfs_map_gid_to_group(server->nfs_client, iap->ia_gid, owner_group, IDMAP_NAMESZ); 893 owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ);
860 if (owner_grouplen < 0) { 894 if (owner_grouplen < 0) {
861 dprintk("nfs: couldn't resolve gid %d to string\n", 895 dprintk("nfs: couldn't resolve gid %d to string\n",
862 iap->ia_gid); 896 iap->ia_gid);
@@ -1384,7 +1418,7 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
1384 hdr->replen += decode_putrootfh_maxsz; 1418 hdr->replen += decode_putrootfh_maxsz;
1385} 1419}
1386 1420
1387static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) 1421static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, int zero_seqid)
1388{ 1422{
1389 nfs4_stateid stateid; 1423 nfs4_stateid stateid;
1390 __be32 *p; 1424 __be32 *p;
@@ -1392,6 +1426,8 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
1392 p = reserve_space(xdr, NFS4_STATEID_SIZE); 1426 p = reserve_space(xdr, NFS4_STATEID_SIZE);
1393 if (ctx->state != NULL) { 1427 if (ctx->state != NULL) {
1394 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); 1428 nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid);
1429 if (zero_seqid)
1430 stateid.stateid.seqid = 0;
1395 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); 1431 xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE);
1396 } else 1432 } else
1397 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); 1433 xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE);
@@ -1404,7 +1440,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args,
1404 p = reserve_space(xdr, 4); 1440 p = reserve_space(xdr, 4);
1405 *p = cpu_to_be32(OP_READ); 1441 *p = cpu_to_be32(OP_READ);
1406 1442
1407 encode_stateid(xdr, args->context, args->lock_context); 1443 encode_stateid(xdr, args->context, args->lock_context,
1444 hdr->minorversion);
1408 1445
1409 p = reserve_space(xdr, 12); 1446 p = reserve_space(xdr, 12);
1410 p = xdr_encode_hyper(p, args->offset); 1447 p = xdr_encode_hyper(p, args->offset);
@@ -1592,7 +1629,8 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg
1592 p = reserve_space(xdr, 4); 1629 p = reserve_space(xdr, 4);
1593 *p = cpu_to_be32(OP_WRITE); 1630 *p = cpu_to_be32(OP_WRITE);
1594 1631
1595 encode_stateid(xdr, args->context, args->lock_context); 1632 encode_stateid(xdr, args->context, args->lock_context,
1633 hdr->minorversion);
1596 1634
1597 p = reserve_space(xdr, 16); 1635 p = reserve_space(xdr, 16);
1598 p = xdr_encode_hyper(p, args->offset); 1636 p = xdr_encode_hyper(p, args->offset);
@@ -1616,6 +1654,18 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state
1616 hdr->replen += decode_delegreturn_maxsz; 1654 hdr->replen += decode_delegreturn_maxsz;
1617} 1655}
1618 1656
1657static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
1658{
1659 int len = name->len;
1660 __be32 *p;
1661
1662 p = reserve_space(xdr, 8 + len);
1663 *p++ = cpu_to_be32(OP_SECINFO);
1664 xdr_encode_opaque(p, name->name, len);
1665 hdr->nops++;
1666 hdr->replen += decode_secinfo_maxsz;
1667}
1668
1619#if defined(CONFIG_NFS_V4_1) 1669#if defined(CONFIG_NFS_V4_1)
1620/* NFSv4.1 operations */ 1670/* NFSv4.1 operations */
1621static void encode_exchange_id(struct xdr_stream *xdr, 1671static void encode_exchange_id(struct xdr_stream *xdr,
@@ -1660,7 +1710,7 @@ static void encode_create_session(struct xdr_stream *xdr,
1660 1710
1661 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); 1711 p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
1662 *p++ = cpu_to_be32(OP_CREATE_SESSION); 1712 *p++ = cpu_to_be32(OP_CREATE_SESSION);
1663 p = xdr_encode_hyper(p, clp->cl_ex_clid); 1713 p = xdr_encode_hyper(p, clp->cl_clientid);
1664 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ 1714 *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
1665 *p++ = cpu_to_be32(args->flags); /*flags */ 1715 *p++ = cpu_to_be32(args->flags); /*flags */
1666 1716
@@ -1812,6 +1862,34 @@ encode_layoutget(struct xdr_stream *xdr,
1812 hdr->nops++; 1862 hdr->nops++;
1813 hdr->replen += decode_layoutget_maxsz; 1863 hdr->replen += decode_layoutget_maxsz;
1814} 1864}
1865
1866static int
1867encode_layoutcommit(struct xdr_stream *xdr,
1868 const struct nfs4_layoutcommit_args *args,
1869 struct compound_hdr *hdr)
1870{
1871 __be32 *p;
1872
1873 dprintk("%s: lbw: %llu type: %d\n", __func__, args->lastbytewritten,
1874 NFS_SERVER(args->inode)->pnfs_curr_ld->id);
1875
1876 p = reserve_space(xdr, 48 + NFS4_STATEID_SIZE);
1877 *p++ = cpu_to_be32(OP_LAYOUTCOMMIT);
1878 /* Only whole file layouts */
1879 p = xdr_encode_hyper(p, 0); /* offset */
1880 p = xdr_encode_hyper(p, NFS4_MAX_UINT64); /* length */
1881 *p++ = cpu_to_be32(0); /* reclaim */
1882 p = xdr_encode_opaque_fixed(p, args->stateid.data, NFS4_STATEID_SIZE);
1883 *p++ = cpu_to_be32(1); /* newoffset = TRUE */
1884 p = xdr_encode_hyper(p, args->lastbytewritten);
1885 *p++ = cpu_to_be32(0); /* Never send time_modify_changed */
1886 *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
1887 *p++ = cpu_to_be32(0); /* no file layout payload */
1888
1889 hdr->nops++;
1890 hdr->replen += decode_layoutcommit_maxsz;
1891 return 0;
1892}
1815#endif /* CONFIG_NFS_V4_1 */ 1893#endif /* CONFIG_NFS_V4_1 */
1816 1894
1817/* 1895/*
@@ -2271,7 +2349,8 @@ static void nfs4_xdr_enc_write(struct rpc_rqst *req, struct xdr_stream *xdr,
2271 encode_putfh(xdr, args->fh, &hdr); 2349 encode_putfh(xdr, args->fh, &hdr);
2272 encode_write(xdr, args, &hdr); 2350 encode_write(xdr, args, &hdr);
2273 req->rq_snd_buf.flags |= XDRBUF_WRITE; 2351 req->rq_snd_buf.flags |= XDRBUF_WRITE;
2274 encode_getfattr(xdr, args->bitmask, &hdr); 2352 if (args->bitmask)
2353 encode_getfattr(xdr, args->bitmask, &hdr);
2275 encode_nops(&hdr); 2354 encode_nops(&hdr);
2276} 2355}
2277 2356
@@ -2289,7 +2368,8 @@ static void nfs4_xdr_enc_commit(struct rpc_rqst *req, struct xdr_stream *xdr,
2289 encode_sequence(xdr, &args->seq_args, &hdr); 2368 encode_sequence(xdr, &args->seq_args, &hdr);
2290 encode_putfh(xdr, args->fh, &hdr); 2369 encode_putfh(xdr, args->fh, &hdr);
2291 encode_commit(xdr, args, &hdr); 2370 encode_commit(xdr, args, &hdr);
2292 encode_getfattr(xdr, args->bitmask, &hdr); 2371 if (args->bitmask)
2372 encode_getfattr(xdr, args->bitmask, &hdr);
2293 encode_nops(&hdr); 2373 encode_nops(&hdr);
2294} 2374}
2295 2375
@@ -2460,6 +2540,24 @@ static void nfs4_xdr_enc_fs_locations(struct rpc_rqst *req,
2460 encode_nops(&hdr); 2540 encode_nops(&hdr);
2461} 2541}
2462 2542
2543/*
2544 * Encode SECINFO request
2545 */
2546static void nfs4_xdr_enc_secinfo(struct rpc_rqst *req,
2547 struct xdr_stream *xdr,
2548 struct nfs4_secinfo_arg *args)
2549{
2550 struct compound_hdr hdr = {
2551 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2552 };
2553
2554 encode_compound_hdr(xdr, req, &hdr);
2555 encode_sequence(xdr, &args->seq_args, &hdr);
2556 encode_putfh(xdr, args->dir_fh, &hdr);
2557 encode_secinfo(xdr, args->name, &hdr);
2558 encode_nops(&hdr);
2559}
2560
2463#if defined(CONFIG_NFS_V4_1) 2561#if defined(CONFIG_NFS_V4_1)
2464/* 2562/*
2465 * EXCHANGE_ID request 2563 * EXCHANGE_ID request
@@ -2599,8 +2697,32 @@ static void nfs4_xdr_enc_layoutget(struct rpc_rqst *req,
2599 encode_sequence(xdr, &args->seq_args, &hdr); 2697 encode_sequence(xdr, &args->seq_args, &hdr);
2600 encode_putfh(xdr, NFS_FH(args->inode), &hdr); 2698 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2601 encode_layoutget(xdr, args, &hdr); 2699 encode_layoutget(xdr, args, &hdr);
2700
2701 xdr_inline_pages(&req->rq_rcv_buf, hdr.replen << 2,
2702 args->layout.pages, 0, args->layout.pglen);
2703
2602 encode_nops(&hdr); 2704 encode_nops(&hdr);
2603} 2705}
2706
2707/*
2708 * Encode LAYOUTCOMMIT request
2709 */
2710static int nfs4_xdr_enc_layoutcommit(struct rpc_rqst *req,
2711 struct xdr_stream *xdr,
2712 struct nfs4_layoutcommit_args *args)
2713{
2714 struct compound_hdr hdr = {
2715 .minorversion = nfs4_xdr_minorversion(&args->seq_args),
2716 };
2717
2718 encode_compound_hdr(xdr, req, &hdr);
2719 encode_sequence(xdr, &args->seq_args, &hdr);
2720 encode_putfh(xdr, NFS_FH(args->inode), &hdr);
2721 encode_layoutcommit(xdr, args, &hdr);
2722 encode_getfattr(xdr, args->bitmask, &hdr);
2723 encode_nops(&hdr);
2724 return 0;
2725}
2604#endif /* CONFIG_NFS_V4_1 */ 2726#endif /* CONFIG_NFS_V4_1 */
2605 2727
2606static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) 2728static void print_overflow_msg(const char *func, const struct xdr_stream *xdr)
@@ -2920,6 +3042,7 @@ static int decode_attr_error(struct xdr_stream *xdr, uint32_t *bitmap)
2920 if (unlikely(!p)) 3042 if (unlikely(!p))
2921 goto out_overflow; 3043 goto out_overflow;
2922 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR; 3044 bitmap[0] &= ~FATTR4_WORD0_RDATTR_ERROR;
3045 return -be32_to_cpup(p);
2923 } 3046 }
2924 return 0; 3047 return 0;
2925out_overflow: 3048out_overflow:
@@ -3382,7 +3505,7 @@ out_overflow:
3382} 3505}
3383 3506
3384static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, 3507static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3385 struct nfs_client *clp, uint32_t *uid, int may_sleep) 3508 const struct nfs_server *server, uint32_t *uid, int may_sleep)
3386{ 3509{
3387 uint32_t len; 3510 uint32_t len;
3388 __be32 *p; 3511 __be32 *p;
@@ -3402,7 +3525,7 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
3402 if (!may_sleep) { 3525 if (!may_sleep) {
3403 /* do nothing */ 3526 /* do nothing */
3404 } else if (len < XDR_MAX_NETOBJ) { 3527 } else if (len < XDR_MAX_NETOBJ) {
3405 if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) 3528 if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
3406 ret = NFS_ATTR_FATTR_OWNER; 3529 ret = NFS_ATTR_FATTR_OWNER;
3407 else 3530 else
3408 dprintk("%s: nfs_map_name_to_uid failed!\n", 3531 dprintk("%s: nfs_map_name_to_uid failed!\n",
@@ -3420,7 +3543,7 @@ out_overflow:
3420} 3543}
3421 3544
3422static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, 3545static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3423 struct nfs_client *clp, uint32_t *gid, int may_sleep) 3546 const struct nfs_server *server, uint32_t *gid, int may_sleep)
3424{ 3547{
3425 uint32_t len; 3548 uint32_t len;
3426 __be32 *p; 3549 __be32 *p;
@@ -3440,7 +3563,7 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
3440 if (!may_sleep) { 3563 if (!may_sleep) {
3441 /* do nothing */ 3564 /* do nothing */
3442 } else if (len < XDR_MAX_NETOBJ) { 3565 } else if (len < XDR_MAX_NETOBJ) {
3443 if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) 3566 if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
3444 ret = NFS_ATTR_FATTR_GROUP; 3567 ret = NFS_ATTR_FATTR_GROUP;
3445 else 3568 else
3446 dprintk("%s: nfs_map_group_to_gid failed!\n", 3569 dprintk("%s: nfs_map_group_to_gid failed!\n",
@@ -3907,6 +4030,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3907 fattr->valid |= status; 4030 fattr->valid |= status;
3908 4031
3909 status = decode_attr_error(xdr, bitmap); 4032 status = decode_attr_error(xdr, bitmap);
4033 if (status == -NFS4ERR_WRONGSEC) {
4034 nfs_fixup_secinfo_attributes(fattr, fh);
4035 status = 0;
4036 }
3910 if (status < 0) 4037 if (status < 0)
3911 goto xdr_error; 4038 goto xdr_error;
3912 4039
@@ -3939,14 +4066,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
3939 goto xdr_error; 4066 goto xdr_error;
3940 fattr->valid |= status; 4067 fattr->valid |= status;
3941 4068
3942 status = decode_attr_owner(xdr, bitmap, server->nfs_client, 4069 status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
3943 &fattr->uid, may_sleep);
3944 if (status < 0) 4070 if (status < 0)
3945 goto xdr_error; 4071 goto xdr_error;
3946 fattr->valid |= status; 4072 fattr->valid |= status;
3947 4073
3948 status = decode_attr_group(xdr, bitmap, server->nfs_client, 4074 status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
3949 &fattr->gid, may_sleep);
3950 if (status < 0) 4075 if (status < 0)
3951 goto xdr_error; 4076 goto xdr_error;
3952 fattr->valid |= status; 4077 fattr->valid |= status;
@@ -4677,6 +4802,73 @@ static int decode_delegreturn(struct xdr_stream *xdr)
4677 return decode_op_hdr(xdr, OP_DELEGRETURN); 4802 return decode_op_hdr(xdr, OP_DELEGRETURN);
4678} 4803}
4679 4804
4805static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor)
4806{
4807 __be32 *p;
4808
4809 p = xdr_inline_decode(xdr, 4);
4810 if (unlikely(!p))
4811 goto out_overflow;
4812 flavor->gss.sec_oid4.len = be32_to_cpup(p);
4813 if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN)
4814 goto out_err;
4815
4816 p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len);
4817 if (unlikely(!p))
4818 goto out_overflow;
4819 memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len);
4820
4821 p = xdr_inline_decode(xdr, 8);
4822 if (unlikely(!p))
4823 goto out_overflow;
4824 flavor->gss.qop4 = be32_to_cpup(p++);
4825 flavor->gss.service = be32_to_cpup(p);
4826
4827 return 0;
4828
4829out_overflow:
4830 print_overflow_msg(__func__, xdr);
4831 return -EIO;
4832out_err:
4833 return -EINVAL;
4834}
4835
4836static int decode_secinfo(struct xdr_stream *xdr, struct nfs4_secinfo_res *res)
4837{
4838 struct nfs4_secinfo_flavor *sec_flavor;
4839 int status;
4840 __be32 *p;
4841 int i;
4842
4843 status = decode_op_hdr(xdr, OP_SECINFO);
4844 p = xdr_inline_decode(xdr, 4);
4845 if (unlikely(!p))
4846 goto out_overflow;
4847 res->flavors->num_flavors = be32_to_cpup(p);
4848
4849 for (i = 0; i < res->flavors->num_flavors; i++) {
4850 sec_flavor = &res->flavors->flavors[i];
4851 if ((char *)&sec_flavor[1] - (char *)res > PAGE_SIZE)
4852 break;
4853
4854 p = xdr_inline_decode(xdr, 4);
4855 if (unlikely(!p))
4856 goto out_overflow;
4857 sec_flavor->flavor = be32_to_cpup(p);
4858
4859 if (sec_flavor->flavor == RPC_AUTH_GSS) {
4860 if (decode_secinfo_gss(xdr, sec_flavor))
4861 break;
4862 }
4863 }
4864
4865 return 0;
4866
4867out_overflow:
4868 print_overflow_msg(__func__, xdr);
4869 return -EIO;
4870}
4871
4680#if defined(CONFIG_NFS_V4_1) 4872#if defined(CONFIG_NFS_V4_1)
4681static int decode_exchange_id(struct xdr_stream *xdr, 4873static int decode_exchange_id(struct xdr_stream *xdr,
4682 struct nfs41_exchange_id_res *res) 4874 struct nfs41_exchange_id_res *res)
@@ -4694,7 +4886,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
4694 p = xdr_inline_decode(xdr, 8); 4886 p = xdr_inline_decode(xdr, 8);
4695 if (unlikely(!p)) 4887 if (unlikely(!p))
4696 goto out_overflow; 4888 goto out_overflow;
4697 xdr_decode_hyper(p, &clp->cl_ex_clid); 4889 xdr_decode_hyper(p, &clp->cl_clientid);
4698 p = xdr_inline_decode(xdr, 12); 4890 p = xdr_inline_decode(xdr, 12);
4699 if (unlikely(!p)) 4891 if (unlikely(!p))
4700 goto out_overflow; 4892 goto out_overflow;
@@ -4947,6 +5139,9 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4947 __be32 *p; 5139 __be32 *p;
4948 int status; 5140 int status;
4949 u32 layout_count; 5141 u32 layout_count;
5142 struct xdr_buf *rcvbuf = &req->rq_rcv_buf;
5143 struct kvec *iov = rcvbuf->head;
5144 u32 hdrlen, recvd;
4950 5145
4951 status = decode_op_hdr(xdr, OP_LAYOUTGET); 5146 status = decode_op_hdr(xdr, OP_LAYOUTGET);
4952 if (status) 5147 if (status)
@@ -4963,17 +5158,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4963 return -EINVAL; 5158 return -EINVAL;
4964 } 5159 }
4965 5160
4966 p = xdr_inline_decode(xdr, 24); 5161 p = xdr_inline_decode(xdr, 28);
4967 if (unlikely(!p)) 5162 if (unlikely(!p))
4968 goto out_overflow; 5163 goto out_overflow;
4969 p = xdr_decode_hyper(p, &res->range.offset); 5164 p = xdr_decode_hyper(p, &res->range.offset);
4970 p = xdr_decode_hyper(p, &res->range.length); 5165 p = xdr_decode_hyper(p, &res->range.length);
4971 res->range.iomode = be32_to_cpup(p++); 5166 res->range.iomode = be32_to_cpup(p++);
4972 res->type = be32_to_cpup(p++); 5167 res->type = be32_to_cpup(p++);
4973 5168 res->layoutp->len = be32_to_cpup(p);
4974 status = decode_opaque_inline(xdr, &res->layout.len, (char **)&p);
4975 if (unlikely(status))
4976 return status;
4977 5169
4978 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n", 5170 dprintk("%s roff:%lu rlen:%lu riomode:%d, lo_type:0x%x, lo.len:%d\n",
4979 __func__, 5171 __func__,
@@ -4981,12 +5173,18 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
4981 (unsigned long)res->range.length, 5173 (unsigned long)res->range.length,
4982 res->range.iomode, 5174 res->range.iomode,
4983 res->type, 5175 res->type,
4984 res->layout.len); 5176 res->layoutp->len);
5177
5178 hdrlen = (u8 *) xdr->p - (u8 *) iov->iov_base;
5179 recvd = req->rq_rcv_buf.len - hdrlen;
5180 if (res->layoutp->len > recvd) {
5181 dprintk("NFS: server cheating in layoutget reply: "
5182 "layout len %u > recvd %u\n",
5183 res->layoutp->len, recvd);
5184 return -EINVAL;
5185 }
4985 5186
4986 /* nfs4_proc_layoutget allocated a single page */ 5187 xdr_read_pages(xdr, res->layoutp->len);
4987 if (res->layout.len > PAGE_SIZE)
4988 return -ENOMEM;
4989 memcpy(res->layout.buf, p, res->layout.len);
4990 5188
4991 if (layout_count > 1) { 5189 if (layout_count > 1) {
4992 /* We only handle a length one array at the moment. Any 5190 /* We only handle a length one array at the moment. Any
@@ -5003,6 +5201,35 @@ out_overflow:
5003 print_overflow_msg(__func__, xdr); 5201 print_overflow_msg(__func__, xdr);
5004 return -EIO; 5202 return -EIO;
5005} 5203}
5204
5205static int decode_layoutcommit(struct xdr_stream *xdr,
5206 struct rpc_rqst *req,
5207 struct nfs4_layoutcommit_res *res)
5208{
5209 __be32 *p;
5210 __u32 sizechanged;
5211 int status;
5212
5213 status = decode_op_hdr(xdr, OP_LAYOUTCOMMIT);
5214 if (status)
5215 return status;
5216
5217 p = xdr_inline_decode(xdr, 4);
5218 if (unlikely(!p))
5219 goto out_overflow;
5220 sizechanged = be32_to_cpup(p);
5221
5222 if (sizechanged) {
5223 /* throw away new size */
5224 p = xdr_inline_decode(xdr, 8);
5225 if (unlikely(!p))
5226 goto out_overflow;
5227 }
5228 return 0;
5229out_overflow:
5230 print_overflow_msg(__func__, xdr);
5231 return -EIO;
5232}
5006#endif /* CONFIG_NFS_V4_1 */ 5233#endif /* CONFIG_NFS_V4_1 */
5007 5234
5008/* 5235/*
@@ -5690,8 +5917,9 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5690 status = decode_write(xdr, res); 5917 status = decode_write(xdr, res);
5691 if (status) 5918 if (status)
5692 goto out; 5919 goto out;
5693 decode_getfattr(xdr, res->fattr, res->server, 5920 if (res->fattr)
5694 !RPC_IS_ASYNC(rqstp->rq_task)); 5921 decode_getfattr(xdr, res->fattr, res->server,
5922 !RPC_IS_ASYNC(rqstp->rq_task));
5695 if (!status) 5923 if (!status)
5696 status = res->count; 5924 status = res->count;
5697out: 5925out:
@@ -5719,8 +5947,9 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
5719 status = decode_commit(xdr, res); 5947 status = decode_commit(xdr, res);
5720 if (status) 5948 if (status)
5721 goto out; 5949 goto out;
5722 decode_getfattr(xdr, res->fattr, res->server, 5950 if (res->fattr)
5723 !RPC_IS_ASYNC(rqstp->rq_task)); 5951 decode_getfattr(xdr, res->fattr, res->server,
5952 !RPC_IS_ASYNC(rqstp->rq_task));
5724out: 5953out:
5725 return status; 5954 return status;
5726} 5955}
@@ -5915,6 +6144,32 @@ out:
5915 return status; 6144 return status;
5916} 6145}
5917 6146
6147/*
6148 * Decode SECINFO response
6149 */
6150static int nfs4_xdr_dec_secinfo(struct rpc_rqst *rqstp,
6151 struct xdr_stream *xdr,
6152 struct nfs4_secinfo_res *res)
6153{
6154 struct compound_hdr hdr;
6155 int status;
6156
6157 status = decode_compound_hdr(xdr, &hdr);
6158 if (status)
6159 goto out;
6160 status = decode_sequence(xdr, &res->seq_res, rqstp);
6161 if (status)
6162 goto out;
6163 status = decode_putfh(xdr);
6164 if (status)
6165 goto out;
6166 status = decode_secinfo(xdr, res);
6167 if (status)
6168 goto out;
6169out:
6170 return status;
6171}
6172
5918#if defined(CONFIG_NFS_V4_1) 6173#if defined(CONFIG_NFS_V4_1)
5919/* 6174/*
5920 * Decode EXCHANGE_ID response 6175 * Decode EXCHANGE_ID response
@@ -6062,6 +6317,34 @@ static int nfs4_xdr_dec_layoutget(struct rpc_rqst *rqstp,
6062out: 6317out:
6063 return status; 6318 return status;
6064} 6319}
6320
6321/*
6322 * Decode LAYOUTCOMMIT response
6323 */
6324static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
6325 struct xdr_stream *xdr,
6326 struct nfs4_layoutcommit_res *res)
6327{
6328 struct compound_hdr hdr;
6329 int status;
6330
6331 status = decode_compound_hdr(xdr, &hdr);
6332 if (status)
6333 goto out;
6334 status = decode_sequence(xdr, &res->seq_res, rqstp);
6335 if (status)
6336 goto out;
6337 status = decode_putfh(xdr);
6338 if (status)
6339 goto out;
6340 status = decode_layoutcommit(xdr, rqstp, res);
6341 if (status)
6342 goto out;
6343 decode_getfattr(xdr, res->fattr, res->server,
6344 !RPC_IS_ASYNC(rqstp->rq_task));
6345out:
6346 return status;
6347}
6065#endif /* CONFIG_NFS_V4_1 */ 6348#endif /* CONFIG_NFS_V4_1 */
6066 6349
6067/** 6350/**
@@ -6167,8 +6450,6 @@ static struct {
6167 { NFS4ERR_DQUOT, -EDQUOT }, 6450 { NFS4ERR_DQUOT, -EDQUOT },
6168 { NFS4ERR_STALE, -ESTALE }, 6451 { NFS4ERR_STALE, -ESTALE },
6169 { NFS4ERR_BADHANDLE, -EBADHANDLE }, 6452 { NFS4ERR_BADHANDLE, -EBADHANDLE },
6170 { NFS4ERR_BADOWNER, -EINVAL },
6171 { NFS4ERR_BADNAME, -EINVAL },
6172 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, 6453 { NFS4ERR_BAD_COOKIE, -EBADCOOKIE },
6173 { NFS4ERR_NOTSUPP, -ENOTSUPP }, 6454 { NFS4ERR_NOTSUPP, -ENOTSUPP },
6174 { NFS4ERR_TOOSMALL, -ETOOSMALL }, 6455 { NFS4ERR_TOOSMALL, -ETOOSMALL },
@@ -6178,10 +6459,6 @@ static struct {
6178 { NFS4ERR_SYMLINK, -ELOOP }, 6459 { NFS4ERR_SYMLINK, -ELOOP },
6179 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, 6460 { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP },
6180 { NFS4ERR_DEADLOCK, -EDEADLK }, 6461 { NFS4ERR_DEADLOCK, -EDEADLK },
6181 { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs
6182 * to be handled by a
6183 * middle-layer.
6184 */
6185 { -1, -EIO } 6462 { -1, -EIO }
6186}; 6463};
6187 6464
@@ -6256,6 +6533,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6256 PROC(SETACL, enc_setacl, dec_setacl), 6533 PROC(SETACL, enc_setacl, dec_setacl),
6257 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), 6534 PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations),
6258 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), 6535 PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner),
6536 PROC(SECINFO, enc_secinfo, dec_secinfo),
6259#if defined(CONFIG_NFS_V4_1) 6537#if defined(CONFIG_NFS_V4_1)
6260 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), 6538 PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id),
6261 PROC(CREATE_SESSION, enc_create_session, dec_create_session), 6539 PROC(CREATE_SESSION, enc_create_session, dec_create_session),
@@ -6265,6 +6543,7 @@ struct rpc_procinfo nfs4_procedures[] = {
6265 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete), 6543 PROC(RECLAIM_COMPLETE, enc_reclaim_complete, dec_reclaim_complete),
6266 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo), 6544 PROC(GETDEVICEINFO, enc_getdeviceinfo, dec_getdeviceinfo),
6267 PROC(LAYOUTGET, enc_layoutget, dec_layoutget), 6545 PROC(LAYOUTGET, enc_layoutget, dec_layoutget),
6546 PROC(LAYOUTCOMMIT, enc_layoutcommit, dec_layoutcommit),
6268#endif /* CONFIG_NFS_V4_1 */ 6547#endif /* CONFIG_NFS_V4_1 */
6269}; 6548};
6270 6549
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
86/* Default path we try to mount. "%s" gets replaced by our IP address */ 86/* Default path we try to mount. "%s" gets replaced by our IP address */
87#define NFS_ROOT "/tftpboot/%s" 87#define NFS_ROOT "/tftpboot/%s"
88 88
89/* Default NFSROOT mount options. */
90#define NFS_DEF_OPTIONS "udp"
91
89/* Parameters passed from the kernel command line */ 92/* Parameters passed from the kernel command line */
90static char nfs_root_parms[256] __initdata = ""; 93static char nfs_root_parms[256] __initdata = "";
91 94
92/* Text-based mount options passed to super.c */ 95/* Text-based mount options passed to super.c */
93static char nfs_root_options[256] __initdata = ""; 96static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
94 97
95/* Address of NFS server */ 98/* Address of NFS server */
96static __be32 servaddr __initdata = htonl(INADDR_NONE); 99static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
160} 163}
161 164
162static int __init root_nfs_cat(char *dest, const char *src, 165static int __init root_nfs_cat(char *dest, const char *src,
163 const size_t destlen) 166 const size_t destlen)
164{ 167{
168 size_t len = strlen(dest);
169
170 if (len && dest[len - 1] != ',')
171 if (strlcat(dest, ",", destlen) > destlen)
172 return -1;
173
165 if (strlcat(dest, src, destlen) > destlen) 174 if (strlcat(dest, src, destlen) > destlen)
166 return -1; 175 return -1;
167 return 0; 176 return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
194 if (root_nfs_cat(nfs_root_options, incoming, 203 if (root_nfs_cat(nfs_root_options, incoming,
195 sizeof(nfs_root_options))) 204 sizeof(nfs_root_options)))
196 return -1; 205 return -1;
197
198 /*
199 * Possibly prepare for more options to be appended
200 */
201 if (nfs_root_options[0] != '\0' &&
202 nfs_root_options[strlen(nfs_root_options)] != ',')
203 if (root_nfs_cat(nfs_root_options, ",",
204 sizeof(nfs_root_options)))
205 return -1;
206
207 return 0; 206 return 0;
208} 207}
209 208
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
217 */ 216 */
218static int __init root_nfs_data(char *cmdline) 217static int __init root_nfs_data(char *cmdline)
219{ 218{
220 char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1]; 219 char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
221 int len, retval = -1; 220 int len, retval = -1;
222 char *tmp = NULL; 221 char *tmp = NULL;
223 const size_t tmplen = sizeof(nfs_export_path); 222 const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
244 * Append mandatory options for nfsroot so they override 243 * Append mandatory options for nfsroot so they override
245 * what has come before 244 * what has come before
246 */ 245 */
247 snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4", 246 snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
248 &servaddr); 247 &servaddr);
249 if (root_nfs_cat(nfs_root_options, addr_option, 248 if (root_nfs_cat(nfs_root_options, mand_options,
250 sizeof(nfs_root_options))) 249 sizeof(nfs_root_options)))
251 goto out_optionstoolong; 250 goto out_optionstoolong;
252 251
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index e1164e3f9e69..c80add6e2213 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -20,6 +20,7 @@
20#include <linux/nfs_mount.h> 20#include <linux/nfs_mount.h>
21 21
22#include "internal.h" 22#include "internal.h"
23#include "pnfs.h"
23 24
24static struct kmem_cache *nfs_page_cachep; 25static struct kmem_cache *nfs_page_cachep;
25 26
@@ -134,14 +135,14 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
134 nfs_unlock_request(req); 135 nfs_unlock_request(req);
135} 136}
136 137
137/** 138/*
138 * nfs_clear_request - Free up all resources allocated to the request 139 * nfs_clear_request - Free up all resources allocated to the request
139 * @req: 140 * @req:
140 * 141 *
141 * Release page and open context resources associated with a read/write 142 * Release page and open context resources associated with a read/write
142 * request after it has completed. 143 * request after it has completed.
143 */ 144 */
144void nfs_clear_request(struct nfs_page *req) 145static void nfs_clear_request(struct nfs_page *req)
145{ 146{
146 struct page *page = req->wb_page; 147 struct page *page = req->wb_page;
147 struct nfs_open_context *ctx = req->wb_context; 148 struct nfs_open_context *ctx = req->wb_context;
@@ -213,7 +214,7 @@ nfs_wait_on_request(struct nfs_page *req)
213 */ 214 */
214void nfs_pageio_init(struct nfs_pageio_descriptor *desc, 215void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
215 struct inode *inode, 216 struct inode *inode,
216 int (*doio)(struct inode *, struct list_head *, unsigned int, size_t, int), 217 int (*doio)(struct nfs_pageio_descriptor *),
217 size_t bsize, 218 size_t bsize,
218 int io_flags) 219 int io_flags)
219{ 220{
@@ -222,10 +223,12 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
222 desc->pg_count = 0; 223 desc->pg_count = 0;
223 desc->pg_bsize = bsize; 224 desc->pg_bsize = bsize;
224 desc->pg_base = 0; 225 desc->pg_base = 0;
226 desc->pg_moreio = 0;
225 desc->pg_inode = inode; 227 desc->pg_inode = inode;
226 desc->pg_doio = doio; 228 desc->pg_doio = doio;
227 desc->pg_ioflags = io_flags; 229 desc->pg_ioflags = io_flags;
228 desc->pg_error = 0; 230 desc->pg_error = 0;
231 desc->pg_lseg = NULL;
229} 232}
230 233
231/** 234/**
@@ -240,7 +243,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
240 * Return 'true' if this is the case, else return 'false'. 243 * Return 'true' if this is the case, else return 'false'.
241 */ 244 */
242static int nfs_can_coalesce_requests(struct nfs_page *prev, 245static int nfs_can_coalesce_requests(struct nfs_page *prev,
243 struct nfs_page *req) 246 struct nfs_page *req,
247 struct nfs_pageio_descriptor *pgio)
244{ 248{
245 if (req->wb_context->cred != prev->wb_context->cred) 249 if (req->wb_context->cred != prev->wb_context->cred)
246 return 0; 250 return 0;
@@ -254,6 +258,12 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev,
254 return 0; 258 return 0;
255 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE) 259 if (prev->wb_pgbase + prev->wb_bytes != PAGE_CACHE_SIZE)
256 return 0; 260 return 0;
261 /*
262 * Non-whole file layouts need to check that req is inside of
263 * pgio->pg_lseg.
264 */
265 if (pgio->pg_test && !pgio->pg_test(pgio, prev, req))
266 return 0;
257 return 1; 267 return 1;
258} 268}
259 269
@@ -286,7 +296,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
286 if (newlen > desc->pg_bsize) 296 if (newlen > desc->pg_bsize)
287 return 0; 297 return 0;
288 prev = nfs_list_entry(desc->pg_list.prev); 298 prev = nfs_list_entry(desc->pg_list.prev);
289 if (!nfs_can_coalesce_requests(prev, req)) 299 if (!nfs_can_coalesce_requests(prev, req, desc))
290 return 0; 300 return 0;
291 } else 301 } else
292 desc->pg_base = req->wb_pgbase; 302 desc->pg_base = req->wb_pgbase;
@@ -302,12 +312,7 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
302static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 312static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
303{ 313{
304 if (!list_empty(&desc->pg_list)) { 314 if (!list_empty(&desc->pg_list)) {
305 int error = desc->pg_doio(desc->pg_inode, 315 int error = desc->pg_doio(desc);
306 &desc->pg_list,
307 nfs_page_array_len(desc->pg_base,
308 desc->pg_count),
309 desc->pg_count,
310 desc->pg_ioflags);
311 if (error < 0) 316 if (error < 0)
312 desc->pg_error = error; 317 desc->pg_error = error;
313 else 318 else
@@ -331,9 +336,11 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
331 struct nfs_page *req) 336 struct nfs_page *req)
332{ 337{
333 while (!nfs_pageio_do_add_request(desc, req)) { 338 while (!nfs_pageio_do_add_request(desc, req)) {
339 desc->pg_moreio = 1;
334 nfs_pageio_doio(desc); 340 nfs_pageio_doio(desc);
335 if (desc->pg_error < 0) 341 if (desc->pg_error < 0)
336 return 0; 342 return 0;
343 desc->pg_moreio = 0;
337 } 344 }
338 return 1; 345 return 1;
339} 346}
@@ -391,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi,
391 pgoff_t idx_end; 398 pgoff_t idx_end;
392 int found, i; 399 int found, i;
393 int res; 400 int res;
401 struct list_head *list;
394 402
395 res = 0; 403 res = 0;
396 if (npages == 0) 404 if (npages == 0)
@@ -411,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi,
411 idx_start = req->wb_index + 1; 419 idx_start = req->wb_index + 1;
412 if (nfs_set_page_tag_locked(req)) { 420 if (nfs_set_page_tag_locked(req)) {
413 kref_get(&req->wb_kref); 421 kref_get(&req->wb_kref);
414 nfs_list_remove_request(req);
415 radix_tree_tag_clear(&nfsi->nfs_page_tree, 422 radix_tree_tag_clear(&nfsi->nfs_page_tree,
416 req->wb_index, tag); 423 req->wb_index, tag);
417 nfs_list_add_request(req, dst); 424 list = pnfs_choose_commit_list(req, dst);
425 nfs_list_add_request(req, list);
418 res++; 426 res++;
419 if (res == INT_MAX) 427 if (res == INT_MAX)
420 goto out; 428 goto out;
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 1b1bc1a0fb0a..d9ab97269ce6 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
30#include <linux/nfs_fs.h> 30#include <linux/nfs_fs.h>
31#include "internal.h" 31#include "internal.h"
32#include "pnfs.h" 32#include "pnfs.h"
33#include "iostat.h"
33 34
34#define NFSDBG_FACILITY NFSDBG_PNFS 35#define NFSDBG_FACILITY NFSDBG_PNFS
35 36
@@ -74,10 +75,8 @@ find_pnfs_driver(u32 id)
74void 75void
75unset_pnfs_layoutdriver(struct nfs_server *nfss) 76unset_pnfs_layoutdriver(struct nfs_server *nfss)
76{ 77{
77 if (nfss->pnfs_curr_ld) { 78 if (nfss->pnfs_curr_ld)
78 nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
79 module_put(nfss->pnfs_curr_ld->owner); 79 module_put(nfss->pnfs_curr_ld->owner);
80 }
81 nfss->pnfs_curr_ld = NULL; 80 nfss->pnfs_curr_ld = NULL;
82} 81}
83 82
@@ -115,13 +114,7 @@ set_pnfs_layoutdriver(struct nfs_server *server, u32 id)
115 goto out_no_driver; 114 goto out_no_driver;
116 } 115 }
117 server->pnfs_curr_ld = ld_type; 116 server->pnfs_curr_ld = ld_type;
118 if (ld_type->set_layoutdriver(server)) { 117
119 printk(KERN_ERR
120 "%s: Error initializing mount point for layout driver %u.\n",
121 __func__, id);
122 module_put(ld_type->owner);
123 goto out_no_driver;
124 }
125 dprintk("%s: pNFS module for %u set\n", __func__, id); 118 dprintk("%s: pNFS module for %u set\n", __func__, id);
126 return; 119 return;
127 120
@@ -230,38 +223,43 @@ static void free_lseg(struct pnfs_layout_segment *lseg)
230 put_layout_hdr(NFS_I(ino)->layout); 223 put_layout_hdr(NFS_I(ino)->layout);
231} 224}
232 225
233/* The use of tmp_list is necessary because pnfs_curr_ld->free_lseg 226static void
234 * could sleep, so must be called outside of the lock. 227put_lseg_common(struct pnfs_layout_segment *lseg)
235 * Returns 1 if object was removed, otherwise return 0.
236 */
237static int
238put_lseg_locked(struct pnfs_layout_segment *lseg,
239 struct list_head *tmp_list)
240{ 228{
229 struct inode *inode = lseg->pls_layout->plh_inode;
230
231 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
232 list_del_init(&lseg->pls_list);
233 if (list_empty(&lseg->pls_layout->plh_segs)) {
234 set_bit(NFS_LAYOUT_DESTROYED, &lseg->pls_layout->plh_flags);
235 /* Matched by initial refcount set in alloc_init_layout_hdr */
236 put_layout_hdr_locked(lseg->pls_layout);
237 }
238 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
239}
240
241void
242put_lseg(struct pnfs_layout_segment *lseg)
243{
244 struct inode *inode;
245
246 if (!lseg)
247 return;
248
241 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 249 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
242 atomic_read(&lseg->pls_refcount), 250 atomic_read(&lseg->pls_refcount),
243 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 251 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
244 if (atomic_dec_and_test(&lseg->pls_refcount)) { 252 inode = lseg->pls_layout->plh_inode;
245 struct inode *ino = lseg->pls_layout->plh_inode; 253 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
254 LIST_HEAD(free_me);
246 255
247 BUG_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 256 put_lseg_common(lseg);
248 list_del(&lseg->pls_list); 257 list_add(&lseg->pls_list, &free_me);
249 if (list_empty(&lseg->pls_layout->plh_segs)) { 258 spin_unlock(&inode->i_lock);
250 struct nfs_client *clp; 259 pnfs_free_lseg_list(&free_me);
251
252 clp = NFS_SERVER(ino)->nfs_client;
253 spin_lock(&clp->cl_lock);
254 /* List does not take a reference, so no need for put here */
255 list_del_init(&lseg->pls_layout->plh_layouts);
256 spin_unlock(&clp->cl_lock);
257 clear_bit(NFS_LAYOUT_BULK_RECALL, &lseg->pls_layout->plh_flags);
258 }
259 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
260 list_add(&lseg->pls_list, tmp_list);
261 return 1;
262 } 260 }
263 return 0;
264} 261}
262EXPORT_SYMBOL_GPL(put_lseg);
265 263
266static bool 264static bool
267should_free_lseg(u32 lseg_iomode, u32 recall_iomode) 265should_free_lseg(u32 lseg_iomode, u32 recall_iomode)
@@ -281,7 +279,13 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
281 * list. It will now be removed when all 279 * list. It will now be removed when all
282 * outstanding io is finished. 280 * outstanding io is finished.
283 */ 281 */
284 rv = put_lseg_locked(lseg, tmp_list); 282 dprintk("%s: lseg %p ref %d\n", __func__, lseg,
283 atomic_read(&lseg->pls_refcount));
284 if (atomic_dec_and_test(&lseg->pls_refcount)) {
285 put_lseg_common(lseg);
286 list_add(&lseg->pls_list, tmp_list);
287 rv = 1;
288 }
285 } 289 }
286 return rv; 290 return rv;
287} 291}
@@ -299,6 +303,11 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
299 303
300 dprintk("%s:Begin lo %p\n", __func__, lo); 304 dprintk("%s:Begin lo %p\n", __func__, lo);
301 305
306 if (list_empty(&lo->plh_segs)) {
307 if (!test_and_set_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags))
308 put_layout_hdr_locked(lo);
309 return 0;
310 }
302 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) 311 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
303 if (should_free_lseg(lseg->pls_range.iomode, iomode)) { 312 if (should_free_lseg(lseg->pls_range.iomode, iomode)) {
304 dprintk("%s: freeing lseg %p iomode %d " 313 dprintk("%s: freeing lseg %p iomode %d "
@@ -312,11 +321,27 @@ mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
312 return invalid - removed; 321 return invalid - removed;
313} 322}
314 323
324/* note free_me must contain lsegs from a single layout_hdr */
315void 325void
316pnfs_free_lseg_list(struct list_head *free_me) 326pnfs_free_lseg_list(struct list_head *free_me)
317{ 327{
318 struct pnfs_layout_segment *lseg, *tmp; 328 struct pnfs_layout_segment *lseg, *tmp;
329 struct pnfs_layout_hdr *lo;
319 330
331 if (list_empty(free_me))
332 return;
333
334 lo = list_first_entry(free_me, struct pnfs_layout_segment,
335 pls_list)->pls_layout;
336
337 if (test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags)) {
338 struct nfs_client *clp;
339
340 clp = NFS_SERVER(lo->plh_inode)->nfs_client;
341 spin_lock(&clp->cl_lock);
342 list_del_init(&lo->plh_layouts);
343 spin_unlock(&clp->cl_lock);
344 }
320 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) { 345 list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
321 list_del(&lseg->pls_list); 346 list_del(&lseg->pls_list);
322 free_lseg(lseg); 347 free_lseg(lseg);
@@ -332,10 +357,8 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
332 spin_lock(&nfsi->vfs_inode.i_lock); 357 spin_lock(&nfsi->vfs_inode.i_lock);
333 lo = nfsi->layout; 358 lo = nfsi->layout;
334 if (lo) { 359 if (lo) {
335 set_bit(NFS_LAYOUT_DESTROYED, &nfsi->layout->plh_flags); 360 lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
336 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY); 361 mark_matching_lsegs_invalid(lo, &tmp_list, IOMODE_ANY);
337 /* Matched by refcount set to 1 in alloc_init_layout_hdr */
338 put_layout_hdr_locked(lo);
339 } 362 }
340 spin_unlock(&nfsi->vfs_inode.i_lock); 363 spin_unlock(&nfsi->vfs_inode.i_lock);
341 pnfs_free_lseg_list(&tmp_list); 364 pnfs_free_lseg_list(&tmp_list);
@@ -403,6 +426,7 @@ pnfs_layoutgets_blocked(struct pnfs_layout_hdr *lo, nfs4_stateid *stateid,
403 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0) 426 (int)(lo->plh_barrier - be32_to_cpu(stateid->stateid.seqid)) >= 0)
404 return true; 427 return true;
405 return lo->plh_block_lgets || 428 return lo->plh_block_lgets ||
429 test_bit(NFS_LAYOUT_DESTROYED, &lo->plh_flags) ||
406 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 430 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
407 (list_empty(&lo->plh_segs) && 431 (list_empty(&lo->plh_segs) &&
408 (atomic_read(&lo->plh_outstanding) > lget)); 432 (atomic_read(&lo->plh_outstanding) > lget));
@@ -448,6 +472,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
448 struct nfs_server *server = NFS_SERVER(ino); 472 struct nfs_server *server = NFS_SERVER(ino);
449 struct nfs4_layoutget *lgp; 473 struct nfs4_layoutget *lgp;
450 struct pnfs_layout_segment *lseg = NULL; 474 struct pnfs_layout_segment *lseg = NULL;
475 struct page **pages = NULL;
476 int i;
477 u32 max_resp_sz, max_pages;
451 478
452 dprintk("--> %s\n", __func__); 479 dprintk("--> %s\n", __func__);
453 480
@@ -455,6 +482,21 @@ send_layoutget(struct pnfs_layout_hdr *lo,
455 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); 482 lgp = kzalloc(sizeof(*lgp), GFP_KERNEL);
456 if (lgp == NULL) 483 if (lgp == NULL)
457 return NULL; 484 return NULL;
485
486 /* allocate pages for xdr post processing */
487 max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
488 max_pages = max_resp_sz >> PAGE_SHIFT;
489
490 pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL);
491 if (!pages)
492 goto out_err_free;
493
494 for (i = 0; i < max_pages; i++) {
495 pages[i] = alloc_page(GFP_KERNEL);
496 if (!pages[i])
497 goto out_err_free;
498 }
499
458 lgp->args.minlength = NFS4_MAX_UINT64; 500 lgp->args.minlength = NFS4_MAX_UINT64;
459 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; 501 lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
460 lgp->args.range.iomode = iomode; 502 lgp->args.range.iomode = iomode;
@@ -463,6 +505,8 @@ send_layoutget(struct pnfs_layout_hdr *lo,
463 lgp->args.type = server->pnfs_curr_ld->id; 505 lgp->args.type = server->pnfs_curr_ld->id;
464 lgp->args.inode = ino; 506 lgp->args.inode = ino;
465 lgp->args.ctx = get_nfs_open_context(ctx); 507 lgp->args.ctx = get_nfs_open_context(ctx);
508 lgp->args.layout.pages = pages;
509 lgp->args.layout.pglen = max_pages * PAGE_SIZE;
466 lgp->lsegpp = &lseg; 510 lgp->lsegpp = &lseg;
467 511
468 /* Synchronously retrieve layout information from server and 512 /* Synchronously retrieve layout information from server and
@@ -473,7 +517,26 @@ send_layoutget(struct pnfs_layout_hdr *lo,
473 /* remember that LAYOUTGET failed and suspend trying */ 517 /* remember that LAYOUTGET failed and suspend trying */
474 set_bit(lo_fail_bit(iomode), &lo->plh_flags); 518 set_bit(lo_fail_bit(iomode), &lo->plh_flags);
475 } 519 }
520
521 /* free xdr pages */
522 for (i = 0; i < max_pages; i++)
523 __free_page(pages[i]);
524 kfree(pages);
525
476 return lseg; 526 return lseg;
527
528out_err_free:
529 /* free any allocated xdr pages, lgp as it's not used */
530 if (pages) {
531 for (i = 0; i < max_pages; i++) {
532 if (!pages[i])
533 break;
534 __free_page(pages[i]);
535 }
536 kfree(pages);
537 }
538 kfree(lgp);
539 return NULL;
477} 540}
478 541
479bool pnfs_roc(struct inode *ino) 542bool pnfs_roc(struct inode *ino)
@@ -674,7 +737,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, u32 iomode)
674 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 737 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
675 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 738 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
676 is_matching_lseg(lseg, iomode)) { 739 is_matching_lseg(lseg, iomode)) {
677 ret = lseg; 740 ret = get_lseg(lseg);
678 break; 741 break;
679 } 742 }
680 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0) 743 if (cmp_layout(iomode, lseg->pls_range.iomode) > 0)
@@ -699,6 +762,7 @@ pnfs_update_layout(struct inode *ino,
699 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; 762 struct nfs_client *clp = NFS_SERVER(ino)->nfs_client;
700 struct pnfs_layout_hdr *lo; 763 struct pnfs_layout_hdr *lo;
701 struct pnfs_layout_segment *lseg = NULL; 764 struct pnfs_layout_segment *lseg = NULL;
765 bool first = false;
702 766
703 if (!pnfs_enabled_sb(NFS_SERVER(ino))) 767 if (!pnfs_enabled_sb(NFS_SERVER(ino)))
704 return NULL; 768 return NULL;
@@ -715,21 +779,25 @@ pnfs_update_layout(struct inode *ino,
715 dprintk("%s matches recall, use MDS\n", __func__); 779 dprintk("%s matches recall, use MDS\n", __func__);
716 goto out_unlock; 780 goto out_unlock;
717 } 781 }
718 /* Check to see if the layout for the given range already exists */
719 lseg = pnfs_find_lseg(lo, iomode);
720 if (lseg)
721 goto out_unlock;
722 782
723 /* if LAYOUTGET already failed once we don't try again */ 783 /* if LAYOUTGET already failed once we don't try again */
724 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags)) 784 if (test_bit(lo_fail_bit(iomode), &nfsi->layout->plh_flags))
725 goto out_unlock; 785 goto out_unlock;
726 786
787 /* Check to see if the layout for the given range already exists */
788 lseg = pnfs_find_lseg(lo, iomode);
789 if (lseg)
790 goto out_unlock;
791
727 if (pnfs_layoutgets_blocked(lo, NULL, 0)) 792 if (pnfs_layoutgets_blocked(lo, NULL, 0))
728 goto out_unlock; 793 goto out_unlock;
729 atomic_inc(&lo->plh_outstanding); 794 atomic_inc(&lo->plh_outstanding);
730 795
731 get_layout_hdr(lo); 796 get_layout_hdr(lo);
732 if (list_empty(&lo->plh_segs)) { 797 if (list_empty(&lo->plh_segs))
798 first = true;
799 spin_unlock(&ino->i_lock);
800 if (first) {
733 /* The lo must be on the clp list if there is any 801 /* The lo must be on the clp list if there is any
734 * chance of a CB_LAYOUTRECALL(FILE) coming in. 802 * chance of a CB_LAYOUTRECALL(FILE) coming in.
735 */ 803 */
@@ -738,24 +806,18 @@ pnfs_update_layout(struct inode *ino,
738 list_add_tail(&lo->plh_layouts, &clp->cl_layouts); 806 list_add_tail(&lo->plh_layouts, &clp->cl_layouts);
739 spin_unlock(&clp->cl_lock); 807 spin_unlock(&clp->cl_lock);
740 } 808 }
741 spin_unlock(&ino->i_lock);
742 809
743 lseg = send_layoutget(lo, ctx, iomode); 810 lseg = send_layoutget(lo, ctx, iomode);
744 if (!lseg) { 811 if (!lseg && first) {
745 spin_lock(&ino->i_lock); 812 spin_lock(&clp->cl_lock);
746 if (list_empty(&lo->plh_segs)) { 813 list_del_init(&lo->plh_layouts);
747 spin_lock(&clp->cl_lock); 814 spin_unlock(&clp->cl_lock);
748 list_del_init(&lo->plh_layouts);
749 spin_unlock(&clp->cl_lock);
750 clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
751 }
752 spin_unlock(&ino->i_lock);
753 } 815 }
754 atomic_dec(&lo->plh_outstanding); 816 atomic_dec(&lo->plh_outstanding);
755 put_layout_hdr(lo); 817 put_layout_hdr(lo);
756out: 818out:
757 dprintk("%s end, state 0x%lx lseg %p\n", __func__, 819 dprintk("%s end, state 0x%lx lseg %p\n", __func__,
758 nfsi->layout->plh_flags, lseg); 820 nfsi->layout ? nfsi->layout->plh_flags : -1, lseg);
759 return lseg; 821 return lseg;
760out_unlock: 822out_unlock:
761 spin_unlock(&ino->i_lock); 823 spin_unlock(&ino->i_lock);
@@ -808,7 +870,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
808 } 870 }
809 init_lseg(lo, lseg); 871 init_lseg(lo, lseg);
810 lseg->pls_range = res->range; 872 lseg->pls_range = res->range;
811 *lgp->lsegpp = lseg; 873 *lgp->lsegpp = get_lseg(lseg);
812 pnfs_insert_layout(lo, lseg); 874 pnfs_insert_layout(lo, lseg);
813 875
814 if (res->return_on_close) { 876 if (res->return_on_close) {
@@ -829,137 +891,199 @@ out_forget_reply:
829 goto out; 891 goto out;
830} 892}
831 893
894static int pnfs_read_pg_test(struct nfs_pageio_descriptor *pgio,
895 struct nfs_page *prev,
896 struct nfs_page *req)
897{
898 if (pgio->pg_count == prev->wb_bytes) {
899 /* This is first coelesce call for a series of nfs_pages */
900 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
901 prev->wb_context,
902 IOMODE_READ);
903 }
904 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
905}
906
907void
908pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode)
909{
910 struct pnfs_layoutdriver_type *ld;
911
912 ld = NFS_SERVER(inode)->pnfs_curr_ld;
913 pgio->pg_test = (ld && ld->pg_test) ? pnfs_read_pg_test : NULL;
914}
915
916static int pnfs_write_pg_test(struct nfs_pageio_descriptor *pgio,
917 struct nfs_page *prev,
918 struct nfs_page *req)
919{
920 if (pgio->pg_count == prev->wb_bytes) {
921 /* This is first coelesce call for a series of nfs_pages */
922 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
923 prev->wb_context,
924 IOMODE_RW);
925 }
926 return NFS_SERVER(pgio->pg_inode)->pnfs_curr_ld->pg_test(pgio, prev, req);
927}
928
929void
930pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode)
931{
932 struct pnfs_layoutdriver_type *ld;
933
934 ld = NFS_SERVER(inode)->pnfs_curr_ld;
935 pgio->pg_test = (ld && ld->pg_test) ? pnfs_write_pg_test : NULL;
936}
937
938enum pnfs_try_status
939pnfs_try_to_write_data(struct nfs_write_data *wdata,
940 const struct rpc_call_ops *call_ops, int how)
941{
942 struct inode *inode = wdata->inode;
943 enum pnfs_try_status trypnfs;
944 struct nfs_server *nfss = NFS_SERVER(inode);
945
946 wdata->mds_ops = call_ops;
947
948 dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
949 inode->i_ino, wdata->args.count, wdata->args.offset, how);
950
951 trypnfs = nfss->pnfs_curr_ld->write_pagelist(wdata, how);
952 if (trypnfs == PNFS_NOT_ATTEMPTED) {
953 put_lseg(wdata->lseg);
954 wdata->lseg = NULL;
955 } else
956 nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
957
958 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
959 return trypnfs;
960}
961
832/* 962/*
833 * Device ID cache. Currently supports one layout type per struct nfs_client. 963 * Call the appropriate parallel I/O subsystem read function.
834 * Add layout type to the lookup key to expand to support multiple types.
835 */ 964 */
836int 965enum pnfs_try_status
837pnfs_alloc_init_deviceid_cache(struct nfs_client *clp, 966pnfs_try_to_read_data(struct nfs_read_data *rdata,
838 void (*free_callback)(struct pnfs_deviceid_node *)) 967 const struct rpc_call_ops *call_ops)
839{ 968{
840 struct pnfs_deviceid_cache *c; 969 struct inode *inode = rdata->inode;
970 struct nfs_server *nfss = NFS_SERVER(inode);
971 enum pnfs_try_status trypnfs;
841 972
842 c = kzalloc(sizeof(struct pnfs_deviceid_cache), GFP_KERNEL); 973 rdata->mds_ops = call_ops;
843 if (!c) 974
844 return -ENOMEM; 975 dprintk("%s: Reading ino:%lu %u@%llu\n",
845 spin_lock(&clp->cl_lock); 976 __func__, inode->i_ino, rdata->args.count, rdata->args.offset);
846 if (clp->cl_devid_cache != NULL) { 977
847 atomic_inc(&clp->cl_devid_cache->dc_ref); 978 trypnfs = nfss->pnfs_curr_ld->read_pagelist(rdata);
848 dprintk("%s [kref [%d]]\n", __func__, 979 if (trypnfs == PNFS_NOT_ATTEMPTED) {
849 atomic_read(&clp->cl_devid_cache->dc_ref)); 980 put_lseg(rdata->lseg);
850 kfree(c); 981 rdata->lseg = NULL;
851 } else { 982 } else {
852 /* kzalloc initializes hlists */ 983 nfs_inc_stats(inode, NFSIOS_PNFS_READ);
853 spin_lock_init(&c->dc_lock);
854 atomic_set(&c->dc_ref, 1);
855 c->dc_free_callback = free_callback;
856 clp->cl_devid_cache = c;
857 dprintk("%s [new]\n", __func__);
858 } 984 }
859 spin_unlock(&clp->cl_lock); 985 dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
860 return 0; 986 return trypnfs;
861} 987}
862EXPORT_SYMBOL_GPL(pnfs_alloc_init_deviceid_cache);
863 988
864/* 989/*
865 * Called from pnfs_layoutdriver_type->free_lseg 990 * Currently there is only one (whole file) write lseg.
866 * last layout segment reference frees deviceid
867 */ 991 */
868void 992static struct pnfs_layout_segment *pnfs_list_write_lseg(struct inode *inode)
869pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
870 struct pnfs_deviceid_node *devid)
871{ 993{
872 struct nfs4_deviceid *id = &devid->de_id; 994 struct pnfs_layout_segment *lseg, *rv = NULL;
873 struct pnfs_deviceid_node *d;
874 struct hlist_node *n;
875 long h = nfs4_deviceid_hash(id);
876 995
877 dprintk("%s [%d]\n", __func__, atomic_read(&devid->de_ref)); 996 list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list)
878 if (!atomic_dec_and_lock(&devid->de_ref, &c->dc_lock)) 997 if (lseg->pls_range.iomode == IOMODE_RW)
879 return; 998 rv = lseg;
999 return rv;
1000}
880 1001
881 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[h], de_node) 1002void
882 if (!memcmp(&d->de_id, id, sizeof(*id))) { 1003pnfs_set_layoutcommit(struct nfs_write_data *wdata)
883 hlist_del_rcu(&d->de_node); 1004{
884 spin_unlock(&c->dc_lock); 1005 struct nfs_inode *nfsi = NFS_I(wdata->inode);
885 synchronize_rcu(); 1006 loff_t end_pos = wdata->args.offset + wdata->res.count;
886 c->dc_free_callback(devid); 1007
887 return; 1008 spin_lock(&nfsi->vfs_inode.i_lock);
888 } 1009 if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
889 spin_unlock(&c->dc_lock); 1010 /* references matched in nfs4_layoutcommit_release */
890 /* Why wasn't it found in the list? */ 1011 get_lseg(wdata->lseg);
891 BUG(); 1012 wdata->lseg->pls_lc_cred =
892} 1013 get_rpccred(wdata->args.context->state->owner->so_cred);
893EXPORT_SYMBOL_GPL(pnfs_put_deviceid); 1014 mark_inode_dirty_sync(wdata->inode);
894 1015 dprintk("%s: Set layoutcommit for inode %lu ",
895/* Find and reference a deviceid */ 1016 __func__, wdata->inode->i_ino);
896struct pnfs_deviceid_node *
897pnfs_find_get_deviceid(struct pnfs_deviceid_cache *c, struct nfs4_deviceid *id)
898{
899 struct pnfs_deviceid_node *d;
900 struct hlist_node *n;
901 long hash = nfs4_deviceid_hash(id);
902
903 dprintk("--> %s hash %ld\n", __func__, hash);
904 rcu_read_lock();
905 hlist_for_each_entry_rcu(d, n, &c->dc_deviceids[hash], de_node) {
906 if (!memcmp(&d->de_id, id, sizeof(*id))) {
907 if (!atomic_inc_not_zero(&d->de_ref)) {
908 goto fail;
909 } else {
910 rcu_read_unlock();
911 return d;
912 }
913 }
914 } 1017 }
915fail: 1018 if (end_pos > wdata->lseg->pls_end_pos)
916 rcu_read_unlock(); 1019 wdata->lseg->pls_end_pos = end_pos;
917 return NULL; 1020 spin_unlock(&nfsi->vfs_inode.i_lock);
918} 1021}
919EXPORT_SYMBOL_GPL(pnfs_find_get_deviceid); 1022EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
920 1023
921/* 1024/*
922 * Add a deviceid to the cache. 1025 * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
923 * GETDEVICEINFOs for same deviceid can race. If deviceid is found, discard new 1026 * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
1027 * data to disk to allow the server to recover the data if it crashes.
1028 * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
1029 * is off, and a COMMIT is sent to a data server, or
1030 * if WRITEs to a data server return NFS_DATA_SYNC.
924 */ 1031 */
925struct pnfs_deviceid_node * 1032int
926pnfs_add_deviceid(struct pnfs_deviceid_cache *c, struct pnfs_deviceid_node *new) 1033pnfs_layoutcommit_inode(struct inode *inode, bool sync)
927{
928 struct pnfs_deviceid_node *d;
929 long hash = nfs4_deviceid_hash(&new->de_id);
930
931 dprintk("--> %s hash %ld\n", __func__, hash);
932 spin_lock(&c->dc_lock);
933 d = pnfs_find_get_deviceid(c, &new->de_id);
934 if (d) {
935 spin_unlock(&c->dc_lock);
936 dprintk("%s [discard]\n", __func__);
937 c->dc_free_callback(new);
938 return d;
939 }
940 INIT_HLIST_NODE(&new->de_node);
941 atomic_set(&new->de_ref, 1);
942 hlist_add_head_rcu(&new->de_node, &c->dc_deviceids[hash]);
943 spin_unlock(&c->dc_lock);
944 dprintk("%s [new]\n", __func__);
945 return new;
946}
947EXPORT_SYMBOL_GPL(pnfs_add_deviceid);
948
949void
950pnfs_put_deviceid_cache(struct nfs_client *clp)
951{ 1034{
952 struct pnfs_deviceid_cache *local = clp->cl_devid_cache; 1035 struct nfs4_layoutcommit_data *data;
1036 struct nfs_inode *nfsi = NFS_I(inode);
1037 struct pnfs_layout_segment *lseg;
1038 struct rpc_cred *cred;
1039 loff_t end_pos;
1040 int status = 0;
953 1041
954 dprintk("--> %s ({%d})\n", __func__, atomic_read(&local->dc_ref)); 1042 dprintk("--> %s inode %lu\n", __func__, inode->i_ino);
955 if (atomic_dec_and_lock(&local->dc_ref, &clp->cl_lock)) { 1043
956 int i; 1044 if (!test_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
957 /* Verify cache is empty */ 1045 return 0;
958 for (i = 0; i < NFS4_DEVICE_ID_HASH_SIZE; i++) 1046
959 BUG_ON(!hlist_empty(&local->dc_deviceids[i])); 1047 /* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
960 clp->cl_devid_cache = NULL; 1048 data = kzalloc(sizeof(*data), GFP_NOFS);
961 spin_unlock(&clp->cl_lock); 1049 if (!data) {
962 kfree(local); 1050 mark_inode_dirty_sync(inode);
1051 status = -ENOMEM;
1052 goto out;
963 } 1053 }
1054
1055 spin_lock(&inode->i_lock);
1056 if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
1057 spin_unlock(&inode->i_lock);
1058 kfree(data);
1059 goto out;
1060 }
1061 /*
1062 * Currently only one (whole file) write lseg which is referenced
1063 * in pnfs_set_layoutcommit and will be found.
1064 */
1065 lseg = pnfs_list_write_lseg(inode);
1066
1067 end_pos = lseg->pls_end_pos;
1068 cred = lseg->pls_lc_cred;
1069 lseg->pls_end_pos = 0;
1070 lseg->pls_lc_cred = NULL;
1071
1072 memcpy(&data->args.stateid.data, nfsi->layout->plh_stateid.data,
1073 sizeof(nfsi->layout->plh_stateid.data));
1074 spin_unlock(&inode->i_lock);
1075
1076 data->args.inode = inode;
1077 data->lseg = lseg;
1078 data->cred = cred;
1079 nfs_fattr_init(&data->fattr);
1080 data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
1081 data->res.fattr = &data->fattr;
1082 data->args.lastbytewritten = end_pos - 1;
1083 data->res.server = NFS_SERVER(inode);
1084
1085 status = nfs4_proc_layoutcommit(data, sync);
1086out:
1087 dprintk("<-- %s status %d\n", __func__, status);
1088 return status;
964} 1089}
965EXPORT_SYMBOL_GPL(pnfs_put_deviceid_cache);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index e2612ea0cbed..bc4827202e7a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -30,6 +30,8 @@
30#ifndef FS_NFS_PNFS_H 30#ifndef FS_NFS_PNFS_H
31#define FS_NFS_PNFS_H 31#define FS_NFS_PNFS_H
32 32
33#include <linux/nfs_page.h>
34
33enum { 35enum {
34 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 36 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
35 NFS_LSEG_ROC, /* roc bit received from server */ 37 NFS_LSEG_ROC, /* roc bit received from server */
@@ -41,6 +43,13 @@ struct pnfs_layout_segment {
41 atomic_t pls_refcount; 43 atomic_t pls_refcount;
42 unsigned long pls_flags; 44 unsigned long pls_flags;
43 struct pnfs_layout_hdr *pls_layout; 45 struct pnfs_layout_hdr *pls_layout;
46 struct rpc_cred *pls_lc_cred; /* LAYOUTCOMMIT credential */
47 loff_t pls_end_pos; /* LAYOUTCOMMIT write end */
48};
49
50enum pnfs_try_status {
51 PNFS_ATTEMPTED = 0,
52 PNFS_NOT_ATTEMPTED = 1,
44}; 53};
45 54
46#ifdef CONFIG_NFS_V4_1 55#ifdef CONFIG_NFS_V4_1
@@ -61,10 +70,25 @@ struct pnfs_layoutdriver_type {
61 const u32 id; 70 const u32 id;
62 const char *name; 71 const char *name;
63 struct module *owner; 72 struct module *owner;
64 int (*set_layoutdriver) (struct nfs_server *);
65 int (*clear_layoutdriver) (struct nfs_server *);
66 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr); 73 struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr);
67 void (*free_lseg) (struct pnfs_layout_segment *lseg); 74 void (*free_lseg) (struct pnfs_layout_segment *lseg);
75
76 /* test for nfs page cache coalescing */
77 int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, struct nfs_page *);
78
79 /* Returns true if layoutdriver wants to divert this request to
80 * driver's commit routine.
81 */
82 bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg);
83 struct list_head * (*choose_commit_list) (struct nfs_page *req);
84 int (*commit_pagelist)(struct inode *inode, struct list_head *mds_pages, int how);
85
86 /*
87 * Return PNFS_ATTEMPTED to indicate the layout code has attempted
88 * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS
89 */
90 enum pnfs_try_status (*read_pagelist) (struct nfs_read_data *nfs_data);
91 enum pnfs_try_status (*write_pagelist) (struct nfs_write_data *nfs_data, int how);
68}; 92};
69 93
70struct pnfs_layout_hdr { 94struct pnfs_layout_hdr {
@@ -85,57 +109,10 @@ struct pnfs_device {
85 unsigned int layout_type; 109 unsigned int layout_type;
86 unsigned int mincount; 110 unsigned int mincount;
87 struct page **pages; 111 struct page **pages;
88 void *area;
89 unsigned int pgbase; 112 unsigned int pgbase;
90 unsigned int pglen; 113 unsigned int pglen;
91}; 114};
92 115
93/*
94 * Device ID RCU cache. A device ID is unique per client ID and layout type.
95 */
96#define NFS4_DEVICE_ID_HASH_BITS 5
97#define NFS4_DEVICE_ID_HASH_SIZE (1 << NFS4_DEVICE_ID_HASH_BITS)
98#define NFS4_DEVICE_ID_HASH_MASK (NFS4_DEVICE_ID_HASH_SIZE - 1)
99
100static inline u32
101nfs4_deviceid_hash(struct nfs4_deviceid *id)
102{
103 unsigned char *cptr = (unsigned char *)id->data;
104 unsigned int nbytes = NFS4_DEVICEID4_SIZE;
105 u32 x = 0;
106
107 while (nbytes--) {
108 x *= 37;
109 x += *cptr++;
110 }
111 return x & NFS4_DEVICE_ID_HASH_MASK;
112}
113
114struct pnfs_deviceid_node {
115 struct hlist_node de_node;
116 struct nfs4_deviceid de_id;
117 atomic_t de_ref;
118};
119
120struct pnfs_deviceid_cache {
121 spinlock_t dc_lock;
122 atomic_t dc_ref;
123 void (*dc_free_callback)(struct pnfs_deviceid_node *);
124 struct hlist_head dc_deviceids[NFS4_DEVICE_ID_HASH_SIZE];
125};
126
127extern int pnfs_alloc_init_deviceid_cache(struct nfs_client *,
128 void (*free_callback)(struct pnfs_deviceid_node *));
129extern void pnfs_put_deviceid_cache(struct nfs_client *);
130extern struct pnfs_deviceid_node *pnfs_find_get_deviceid(
131 struct pnfs_deviceid_cache *,
132 struct nfs4_deviceid *);
133extern struct pnfs_deviceid_node *pnfs_add_deviceid(
134 struct pnfs_deviceid_cache *,
135 struct pnfs_deviceid_node *);
136extern void pnfs_put_deviceid(struct pnfs_deviceid_cache *c,
137 struct pnfs_deviceid_node *devid);
138
139extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); 116extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
140extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); 117extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
141 118
@@ -146,11 +123,18 @@ extern int nfs4_proc_layoutget(struct nfs4_layoutget *lgp);
146 123
147/* pnfs.c */ 124/* pnfs.c */
148void get_layout_hdr(struct pnfs_layout_hdr *lo); 125void get_layout_hdr(struct pnfs_layout_hdr *lo);
126void put_lseg(struct pnfs_layout_segment *lseg);
149struct pnfs_layout_segment * 127struct pnfs_layout_segment *
150pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 128pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
151 enum pnfs_iomode access_type); 129 enum pnfs_iomode access_type);
152void set_pnfs_layoutdriver(struct nfs_server *, u32 id); 130void set_pnfs_layoutdriver(struct nfs_server *, u32 id);
153void unset_pnfs_layoutdriver(struct nfs_server *); 131void unset_pnfs_layoutdriver(struct nfs_server *);
132enum pnfs_try_status pnfs_try_to_write_data(struct nfs_write_data *,
133 const struct rpc_call_ops *, int);
134enum pnfs_try_status pnfs_try_to_read_data(struct nfs_read_data *,
135 const struct rpc_call_ops *);
136void pnfs_pageio_init_read(struct nfs_pageio_descriptor *, struct inode *);
137void pnfs_pageio_init_write(struct nfs_pageio_descriptor *, struct inode *);
154int pnfs_layout_process(struct nfs4_layoutget *lgp); 138int pnfs_layout_process(struct nfs4_layoutget *lgp);
155void pnfs_free_lseg_list(struct list_head *tmp_list); 139void pnfs_free_lseg_list(struct list_head *tmp_list);
156void pnfs_destroy_layout(struct nfs_inode *); 140void pnfs_destroy_layout(struct nfs_inode *);
@@ -169,7 +153,8 @@ bool pnfs_roc(struct inode *ino);
169void pnfs_roc_release(struct inode *ino); 153void pnfs_roc_release(struct inode *ino);
170void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); 154void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
171bool pnfs_roc_drain(struct inode *ino, u32 *barrier); 155bool pnfs_roc_drain(struct inode *ino, u32 *barrier);
172 156void pnfs_set_layoutcommit(struct nfs_write_data *wdata);
157int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
173 158
174static inline int lo_fail_bit(u32 iomode) 159static inline int lo_fail_bit(u32 iomode)
175{ 160{
@@ -177,12 +162,67 @@ static inline int lo_fail_bit(u32 iomode)
177 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED; 162 NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
178} 163}
179 164
165static inline struct pnfs_layout_segment *
166get_lseg(struct pnfs_layout_segment *lseg)
167{
168 if (lseg) {
169 atomic_inc(&lseg->pls_refcount);
170 smp_mb__after_atomic_inc();
171 }
172 return lseg;
173}
174
180/* Return true if a layout driver is being used for this mountpoint */ 175/* Return true if a layout driver is being used for this mountpoint */
181static inline int pnfs_enabled_sb(struct nfs_server *nfss) 176static inline int pnfs_enabled_sb(struct nfs_server *nfss)
182{ 177{
183 return nfss->pnfs_curr_ld != NULL; 178 return nfss->pnfs_curr_ld != NULL;
184} 179}
185 180
181static inline void
182pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
183{
184 if (lseg) {
185 struct pnfs_layoutdriver_type *ld;
186
187 ld = NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld;
188 if (ld->mark_pnfs_commit && ld->mark_pnfs_commit(lseg)) {
189 set_bit(PG_PNFS_COMMIT, &req->wb_flags);
190 req->wb_commit_lseg = get_lseg(lseg);
191 }
192 }
193}
194
195static inline int
196pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
197{
198 if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags))
199 return PNFS_NOT_ATTEMPTED;
200 return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how);
201}
202
203static inline struct list_head *
204pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
205{
206 struct list_head *rv;
207
208 if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags)) {
209 struct inode *inode = req->wb_commit_lseg->pls_layout->plh_inode;
210
211 set_bit(NFS_INO_PNFS_COMMIT, &NFS_I(inode)->flags);
212 rv = NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req);
213 /* matched by ref taken when PG_PNFS_COMMIT is set */
214 put_lseg(req->wb_commit_lseg);
215 } else
216 rv = mds;
217 return rv;
218}
219
220static inline void pnfs_clear_request_commit(struct nfs_page *req)
221{
222 if (test_and_clear_bit(PG_PNFS_COMMIT, &req->wb_flags))
223 put_lseg(req->wb_commit_lseg);
224}
225
186#else /* CONFIG_NFS_V4_1 */ 226#else /* CONFIG_NFS_V4_1 */
187 227
188static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 228static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
@@ -194,12 +234,36 @@ static inline void pnfs_destroy_layout(struct nfs_inode *nfsi)
194} 234}
195 235
196static inline struct pnfs_layout_segment * 236static inline struct pnfs_layout_segment *
237get_lseg(struct pnfs_layout_segment *lseg)
238{
239 return NULL;
240}
241
242static inline void put_lseg(struct pnfs_layout_segment *lseg)
243{
244}
245
246static inline struct pnfs_layout_segment *
197pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx, 247pnfs_update_layout(struct inode *ino, struct nfs_open_context *ctx,
198 enum pnfs_iomode access_type) 248 enum pnfs_iomode access_type)
199{ 249{
200 return NULL; 250 return NULL;
201} 251}
202 252
253static inline enum pnfs_try_status
254pnfs_try_to_read_data(struct nfs_read_data *data,
255 const struct rpc_call_ops *call_ops)
256{
257 return PNFS_NOT_ATTEMPTED;
258}
259
260static inline enum pnfs_try_status
261pnfs_try_to_write_data(struct nfs_write_data *data,
262 const struct rpc_call_ops *call_ops, int how)
263{
264 return PNFS_NOT_ATTEMPTED;
265}
266
203static inline bool 267static inline bool
204pnfs_roc(struct inode *ino) 268pnfs_roc(struct inode *ino)
205{ 269{
@@ -230,6 +294,43 @@ static inline void unset_pnfs_layoutdriver(struct nfs_server *s)
230{ 294{
231} 295}
232 296
297static inline void
298pnfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *ino)
299{
300 pgio->pg_test = NULL;
301}
302
303static inline void
304pnfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *ino)
305{
306 pgio->pg_test = NULL;
307}
308
309static inline void
310pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
311{
312}
313
314static inline int
315pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how)
316{
317 return PNFS_NOT_ATTEMPTED;
318}
319
320static inline struct list_head *
321pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds)
322{
323 return mds;
324}
325
326static inline void pnfs_clear_request_commit(struct nfs_page *req)
327{
328}
329
330static inline int pnfs_layoutcommit_inode(struct inode *inode, bool sync)
331{
332 return 0;
333}
233#endif /* CONFIG_NFS_V4_1 */ 334#endif /* CONFIG_NFS_V4_1 */
234 335
235#endif /* FS_NFS_PNFS_H */ 336#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 77d5e21c4ad6..ac40b8535d7e 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -177,7 +177,7 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
177} 177}
178 178
179static int 179static int
180nfs_proc_lookup(struct inode *dir, struct qstr *name, 180nfs_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, struct qstr *name,
181 struct nfs_fh *fhandle, struct nfs_fattr *fattr) 181 struct nfs_fh *fhandle, struct nfs_fattr *fattr)
182{ 182{
183 struct nfs_diropargs arg = { 183 struct nfs_diropargs arg = {
@@ -741,4 +741,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
741 .lock = nfs_proc_lock, 741 .lock = nfs_proc_lock,
742 .lock_check_bounds = nfs_lock_check_bounds, 742 .lock_check_bounds = nfs_lock_check_bounds,
743 .close_context = nfs_close_context, 743 .close_context = nfs_close_context,
744 .init_client = nfs_init_client,
744}; 745};
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index aedcaa7f291f..7cded2b12a05 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -18,19 +18,20 @@
18#include <linux/sunrpc/clnt.h> 18#include <linux/sunrpc/clnt.h>
19#include <linux/nfs_fs.h> 19#include <linux/nfs_fs.h>
20#include <linux/nfs_page.h> 20#include <linux/nfs_page.h>
21#include <linux/module.h>
21 22
22#include <asm/system.h> 23#include <asm/system.h>
24#include "pnfs.h"
23 25
24#include "nfs4_fs.h" 26#include "nfs4_fs.h"
25#include "internal.h" 27#include "internal.h"
26#include "iostat.h" 28#include "iostat.h"
27#include "fscache.h" 29#include "fscache.h"
28#include "pnfs.h"
29 30
30#define NFSDBG_FACILITY NFSDBG_PAGECACHE 31#define NFSDBG_FACILITY NFSDBG_PAGECACHE
31 32
32static int nfs_pagein_multi(struct inode *, struct list_head *, unsigned int, size_t, int); 33static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc);
33static int nfs_pagein_one(struct inode *, struct list_head *, unsigned int, size_t, int); 34static int nfs_pagein_one(struct nfs_pageio_descriptor *desc);
34static const struct rpc_call_ops nfs_read_partial_ops; 35static const struct rpc_call_ops nfs_read_partial_ops;
35static const struct rpc_call_ops nfs_read_full_ops; 36static const struct rpc_call_ops nfs_read_full_ops;
36 37
@@ -69,6 +70,7 @@ void nfs_readdata_free(struct nfs_read_data *p)
69 70
70static void nfs_readdata_release(struct nfs_read_data *rdata) 71static void nfs_readdata_release(struct nfs_read_data *rdata)
71{ 72{
73 put_lseg(rdata->lseg);
72 put_nfs_open_context(rdata->args.context); 74 put_nfs_open_context(rdata->args.context);
73 nfs_readdata_free(rdata); 75 nfs_readdata_free(rdata);
74} 76}
@@ -114,14 +116,13 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data)
114int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, 116int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
115 struct page *page) 117 struct page *page)
116{ 118{
117 LIST_HEAD(one_request);
118 struct nfs_page *new; 119 struct nfs_page *new;
119 unsigned int len; 120 unsigned int len;
121 struct nfs_pageio_descriptor pgio;
120 122
121 len = nfs_page_length(page); 123 len = nfs_page_length(page);
122 if (len == 0) 124 if (len == 0)
123 return nfs_return_empty_page(page); 125 return nfs_return_empty_page(page);
124 pnfs_update_layout(inode, ctx, IOMODE_READ);
125 new = nfs_create_request(ctx, inode, page, 0, len); 126 new = nfs_create_request(ctx, inode, page, 0, len);
126 if (IS_ERR(new)) { 127 if (IS_ERR(new)) {
127 unlock_page(page); 128 unlock_page(page);
@@ -130,11 +131,14 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
130 if (len < PAGE_CACHE_SIZE) 131 if (len < PAGE_CACHE_SIZE)
131 zero_user_segment(page, len, PAGE_CACHE_SIZE); 132 zero_user_segment(page, len, PAGE_CACHE_SIZE);
132 133
133 nfs_list_add_request(new, &one_request); 134 nfs_pageio_init(&pgio, inode, NULL, 0, 0);
135 nfs_list_add_request(new, &pgio.pg_list);
136 pgio.pg_count = len;
137
134 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) 138 if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE)
135 nfs_pagein_multi(inode, &one_request, 1, len, 0); 139 nfs_pagein_multi(&pgio);
136 else 140 else
137 nfs_pagein_one(inode, &one_request, 1, len, 0); 141 nfs_pagein_one(&pgio);
138 return 0; 142 return 0;
139} 143}
140 144
@@ -155,24 +159,20 @@ static void nfs_readpage_release(struct nfs_page *req)
155 nfs_release_request(req); 159 nfs_release_request(req);
156} 160}
157 161
158/* 162int nfs_initiate_read(struct nfs_read_data *data, struct rpc_clnt *clnt,
159 * Set up the NFS read request struct 163 const struct rpc_call_ops *call_ops)
160 */
161static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
162 const struct rpc_call_ops *call_ops,
163 unsigned int count, unsigned int offset)
164{ 164{
165 struct inode *inode = req->wb_context->path.dentry->d_inode; 165 struct inode *inode = data->inode;
166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 166 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
167 struct rpc_task *task; 167 struct rpc_task *task;
168 struct rpc_message msg = { 168 struct rpc_message msg = {
169 .rpc_argp = &data->args, 169 .rpc_argp = &data->args,
170 .rpc_resp = &data->res, 170 .rpc_resp = &data->res,
171 .rpc_cred = req->wb_context->cred, 171 .rpc_cred = data->cred,
172 }; 172 };
173 struct rpc_task_setup task_setup_data = { 173 struct rpc_task_setup task_setup_data = {
174 .task = &data->task, 174 .task = &data->task,
175 .rpc_client = NFS_CLIENT(inode), 175 .rpc_client = clnt,
176 .rpc_message = &msg, 176 .rpc_message = &msg,
177 .callback_ops = call_ops, 177 .callback_ops = call_ops,
178 .callback_data = data, 178 .callback_data = data,
@@ -180,9 +180,39 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
180 .flags = RPC_TASK_ASYNC | swap_flags, 180 .flags = RPC_TASK_ASYNC | swap_flags,
181 }; 181 };
182 182
183 /* Set up the initial task struct. */
184 NFS_PROTO(inode)->read_setup(data, &msg);
185
186 dprintk("NFS: %5u initiated read call (req %s/%lld, %u bytes @ "
187 "offset %llu)\n",
188 data->task.tk_pid,
189 inode->i_sb->s_id,
190 (long long)NFS_FILEID(inode),
191 data->args.count,
192 (unsigned long long)data->args.offset);
193
194 task = rpc_run_task(&task_setup_data);
195 if (IS_ERR(task))
196 return PTR_ERR(task);
197 rpc_put_task(task);
198 return 0;
199}
200EXPORT_SYMBOL_GPL(nfs_initiate_read);
201
202/*
203 * Set up the NFS read request struct
204 */
205static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
206 const struct rpc_call_ops *call_ops,
207 unsigned int count, unsigned int offset,
208 struct pnfs_layout_segment *lseg)
209{
210 struct inode *inode = req->wb_context->path.dentry->d_inode;
211
183 data->req = req; 212 data->req = req;
184 data->inode = inode; 213 data->inode = inode;
185 data->cred = msg.rpc_cred; 214 data->cred = req->wb_context->cred;
215 data->lseg = get_lseg(lseg);
186 216
187 data->args.fh = NFS_FH(inode); 217 data->args.fh = NFS_FH(inode);
188 data->args.offset = req_offset(req) + offset; 218 data->args.offset = req_offset(req) + offset;
@@ -197,21 +227,11 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data,
197 data->res.eof = 0; 227 data->res.eof = 0;
198 nfs_fattr_init(&data->fattr); 228 nfs_fattr_init(&data->fattr);
199 229
200 /* Set up the initial task struct. */ 230 if (data->lseg &&
201 NFS_PROTO(inode)->read_setup(data, &msg); 231 (pnfs_try_to_read_data(data, call_ops) == PNFS_ATTEMPTED))
202 232 return 0;
203 dprintk("NFS: %5u initiated read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
204 data->task.tk_pid,
205 inode->i_sb->s_id,
206 (long long)NFS_FILEID(inode),
207 count,
208 (unsigned long long)data->args.offset);
209 233
210 task = rpc_run_task(&task_setup_data); 234 return nfs_initiate_read(data, NFS_CLIENT(inode), call_ops);
211 if (IS_ERR(task))
212 return PTR_ERR(task);
213 rpc_put_task(task);
214 return 0;
215} 235}
216 236
217static void 237static void
@@ -240,20 +260,21 @@ nfs_async_read_error(struct list_head *head)
240 * won't see the new data until our attribute cache is updated. This is more 260 * won't see the new data until our attribute cache is updated. This is more
241 * or less conventional NFS client behavior. 261 * or less conventional NFS client behavior.
242 */ 262 */
243static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 263static int nfs_pagein_multi(struct nfs_pageio_descriptor *desc)
244{ 264{
245 struct nfs_page *req = nfs_list_entry(head->next); 265 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
246 struct page *page = req->wb_page; 266 struct page *page = req->wb_page;
247 struct nfs_read_data *data; 267 struct nfs_read_data *data;
248 size_t rsize = NFS_SERVER(inode)->rsize, nbytes; 268 size_t rsize = NFS_SERVER(desc->pg_inode)->rsize, nbytes;
249 unsigned int offset; 269 unsigned int offset;
250 int requests = 0; 270 int requests = 0;
251 int ret = 0; 271 int ret = 0;
272 struct pnfs_layout_segment *lseg;
252 LIST_HEAD(list); 273 LIST_HEAD(list);
253 274
254 nfs_list_remove_request(req); 275 nfs_list_remove_request(req);
255 276
256 nbytes = count; 277 nbytes = desc->pg_count;
257 do { 278 do {
258 size_t len = min(nbytes,rsize); 279 size_t len = min(nbytes,rsize);
259 280
@@ -266,9 +287,11 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
266 } while(nbytes != 0); 287 } while(nbytes != 0);
267 atomic_set(&req->wb_complete, requests); 288 atomic_set(&req->wb_complete, requests);
268 289
290 BUG_ON(desc->pg_lseg != NULL);
291 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
269 ClearPageError(page); 292 ClearPageError(page);
270 offset = 0; 293 offset = 0;
271 nbytes = count; 294 nbytes = desc->pg_count;
272 do { 295 do {
273 int ret2; 296 int ret2;
274 297
@@ -280,12 +303,14 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne
280 if (nbytes < rsize) 303 if (nbytes < rsize)
281 rsize = nbytes; 304 rsize = nbytes;
282 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, 305 ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops,
283 rsize, offset); 306 rsize, offset, lseg);
284 if (ret == 0) 307 if (ret == 0)
285 ret = ret2; 308 ret = ret2;
286 offset += rsize; 309 offset += rsize;
287 nbytes -= rsize; 310 nbytes -= rsize;
288 } while (nbytes != 0); 311 } while (nbytes != 0);
312 put_lseg(lseg);
313 desc->pg_lseg = NULL;
289 314
290 return ret; 315 return ret;
291 316
@@ -300,16 +325,21 @@ out_bad:
300 return -ENOMEM; 325 return -ENOMEM;
301} 326}
302 327
303static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int flags) 328static int nfs_pagein_one(struct nfs_pageio_descriptor *desc)
304{ 329{
305 struct nfs_page *req; 330 struct nfs_page *req;
306 struct page **pages; 331 struct page **pages;
307 struct nfs_read_data *data; 332 struct nfs_read_data *data;
333 struct list_head *head = &desc->pg_list;
334 struct pnfs_layout_segment *lseg = desc->pg_lseg;
308 int ret = -ENOMEM; 335 int ret = -ENOMEM;
309 336
310 data = nfs_readdata_alloc(npages); 337 data = nfs_readdata_alloc(nfs_page_array_len(desc->pg_base,
311 if (!data) 338 desc->pg_count));
312 goto out_bad; 339 if (!data) {
340 nfs_async_read_error(head);
341 goto out;
342 }
313 343
314 pages = data->pagevec; 344 pages = data->pagevec;
315 while (!list_empty(head)) { 345 while (!list_empty(head)) {
@@ -320,10 +350,14 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned
320 *pages++ = req->wb_page; 350 *pages++ = req->wb_page;
321 } 351 }
322 req = nfs_list_entry(data->pages.next); 352 req = nfs_list_entry(data->pages.next);
353 if ((!lseg) && list_is_singular(&data->pages))
354 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_READ);
323 355
324 return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); 356 ret = nfs_read_rpcsetup(req, data, &nfs_read_full_ops, desc->pg_count,
325out_bad: 357 0, lseg);
326 nfs_async_read_error(head); 358out:
359 put_lseg(lseg);
360 desc->pg_lseg = NULL;
327 return ret; 361 return ret;
328} 362}
329 363
@@ -366,6 +400,7 @@ static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data
366 return; 400 return;
367 401
368 /* Yes, so retry the read at the end of the data */ 402 /* Yes, so retry the read at the end of the data */
403 data->mds_offset += resp->count;
369 argp->offset += resp->count; 404 argp->offset += resp->count;
370 argp->pgbase += resp->count; 405 argp->pgbase += resp->count;
371 argp->count -= resp->count; 406 argp->count -= resp->count;
@@ -625,7 +660,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
625 if (ret == 0) 660 if (ret == 0)
626 goto read_complete; /* all pages were read */ 661 goto read_complete; /* all pages were read */
627 662
628 pnfs_update_layout(inode, desc.ctx, IOMODE_READ); 663 pnfs_pageio_init_read(&pgio, inode);
629 if (rsize < PAGE_CACHE_SIZE) 664 if (rsize < PAGE_CACHE_SIZE)
630 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0); 665 nfs_pageio_init(&pgio, inode, nfs_pagein_multi, rsize, 0);
631 else 666 else
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b68c8607770f..2b8e9a5e366a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -263,8 +263,11 @@ static match_table_t nfs_local_lock_tokens = {
263static void nfs_umount_begin(struct super_block *); 263static void nfs_umount_begin(struct super_block *);
264static int nfs_statfs(struct dentry *, struct kstatfs *); 264static int nfs_statfs(struct dentry *, struct kstatfs *);
265static int nfs_show_options(struct seq_file *, struct vfsmount *); 265static int nfs_show_options(struct seq_file *, struct vfsmount *);
266static int nfs_show_devname(struct seq_file *, struct vfsmount *);
267static int nfs_show_path(struct seq_file *, struct vfsmount *);
266static int nfs_show_stats(struct seq_file *, struct vfsmount *); 268static int nfs_show_stats(struct seq_file *, struct vfsmount *);
267static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); 269static struct dentry *nfs_fs_mount(struct file_system_type *,
270 int, const char *, void *);
268static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type, 271static struct dentry *nfs_xdev_mount(struct file_system_type *fs_type,
269 int flags, const char *dev_name, void *raw_data); 272 int flags, const char *dev_name, void *raw_data);
270static void nfs_put_super(struct super_block *); 273static void nfs_put_super(struct super_block *);
@@ -274,7 +277,7 @@ static int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
274static struct file_system_type nfs_fs_type = { 277static struct file_system_type nfs_fs_type = {
275 .owner = THIS_MODULE, 278 .owner = THIS_MODULE,
276 .name = "nfs", 279 .name = "nfs",
277 .get_sb = nfs_get_sb, 280 .mount = nfs_fs_mount,
278 .kill_sb = nfs_kill_super, 281 .kill_sb = nfs_kill_super,
279 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 282 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
280}; 283};
@@ -296,6 +299,8 @@ static const struct super_operations nfs_sops = {
296 .evict_inode = nfs_evict_inode, 299 .evict_inode = nfs_evict_inode,
297 .umount_begin = nfs_umount_begin, 300 .umount_begin = nfs_umount_begin,
298 .show_options = nfs_show_options, 301 .show_options = nfs_show_options,
302 .show_devname = nfs_show_devname,
303 .show_path = nfs_show_path,
299 .show_stats = nfs_show_stats, 304 .show_stats = nfs_show_stats,
300 .remount_fs = nfs_remount, 305 .remount_fs = nfs_remount,
301}; 306};
@@ -303,16 +308,16 @@ static const struct super_operations nfs_sops = {
303#ifdef CONFIG_NFS_V4 308#ifdef CONFIG_NFS_V4
304static int nfs4_validate_text_mount_data(void *options, 309static int nfs4_validate_text_mount_data(void *options,
305 struct nfs_parsed_mount_data *args, const char *dev_name); 310 struct nfs_parsed_mount_data *args, const char *dev_name);
306static int nfs4_try_mount(int flags, const char *dev_name, 311static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
307 struct nfs_parsed_mount_data *data, struct vfsmount *mnt); 312 struct nfs_parsed_mount_data *data);
308static int nfs4_get_sb(struct file_system_type *fs_type, 313static struct dentry *nfs4_mount(struct file_system_type *fs_type,
309 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 314 int flags, const char *dev_name, void *raw_data);
310static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type, 315static struct dentry *nfs4_remote_mount(struct file_system_type *fs_type,
311 int flags, const char *dev_name, void *raw_data); 316 int flags, const char *dev_name, void *raw_data);
312static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type, 317static struct dentry *nfs4_xdev_mount(struct file_system_type *fs_type,
313 int flags, const char *dev_name, void *raw_data); 318 int flags, const char *dev_name, void *raw_data);
314static int nfs4_referral_get_sb(struct file_system_type *fs_type, 319static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
315 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); 320 int flags, const char *dev_name, void *raw_data);
316static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type, 321static struct dentry *nfs4_remote_referral_mount(struct file_system_type *fs_type,
317 int flags, const char *dev_name, void *raw_data); 322 int flags, const char *dev_name, void *raw_data);
318static void nfs4_kill_super(struct super_block *sb); 323static void nfs4_kill_super(struct super_block *sb);
@@ -320,7 +325,7 @@ static void nfs4_kill_super(struct super_block *sb);
320static struct file_system_type nfs4_fs_type = { 325static struct file_system_type nfs4_fs_type = {
321 .owner = THIS_MODULE, 326 .owner = THIS_MODULE,
322 .name = "nfs4", 327 .name = "nfs4",
323 .get_sb = nfs4_get_sb, 328 .mount = nfs4_mount,
324 .kill_sb = nfs4_kill_super, 329 .kill_sb = nfs4_kill_super,
325 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 330 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
326}; 331};
@@ -352,7 +357,7 @@ static struct file_system_type nfs4_remote_referral_fs_type = {
352struct file_system_type nfs4_referral_fs_type = { 357struct file_system_type nfs4_referral_fs_type = {
353 .owner = THIS_MODULE, 358 .owner = THIS_MODULE,
354 .name = "nfs4", 359 .name = "nfs4",
355 .get_sb = nfs4_referral_get_sb, 360 .mount = nfs4_referral_mount,
356 .kill_sb = nfs4_kill_super, 361 .kill_sb = nfs4_kill_super,
357 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA, 362 .fs_flags = FS_RENAME_DOES_D_MOVE|FS_REVAL_DOT|FS_BINARY_MOUNTDATA,
358}; 363};
@@ -366,6 +371,8 @@ static const struct super_operations nfs4_sops = {
366 .evict_inode = nfs4_evict_inode, 371 .evict_inode = nfs4_evict_inode,
367 .umount_begin = nfs_umount_begin, 372 .umount_begin = nfs_umount_begin,
368 .show_options = nfs_show_options, 373 .show_options = nfs_show_options,
374 .show_devname = nfs_show_devname,
375 .show_path = nfs_show_path,
369 .show_stats = nfs_show_stats, 376 .show_stats = nfs_show_stats,
370 .remount_fs = nfs_remount, 377 .remount_fs = nfs_remount,
371}; 378};
@@ -726,6 +733,28 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
726 return 0; 733 return 0;
727} 734}
728 735
736static int nfs_show_devname(struct seq_file *m, struct vfsmount *mnt)
737{
738 char *page = (char *) __get_free_page(GFP_KERNEL);
739 char *devname, *dummy;
740 int err = 0;
741 if (!page)
742 return -ENOMEM;
743 devname = nfs_path(&dummy, mnt->mnt_root, page, PAGE_SIZE);
744 if (IS_ERR(devname))
745 err = PTR_ERR(devname);
746 else
747 seq_escape(m, devname, " \t\n\\");
748 free_page((unsigned long)page);
749 return err;
750}
751
752static int nfs_show_path(struct seq_file *m, struct vfsmount *mnt)
753{
754 seq_puts(m, "/");
755 return 0;
756}
757
729/* 758/*
730 * Present statistical information for this VFS mountpoint 759 * Present statistical information for this VFS mountpoint
731 */ 760 */
@@ -979,6 +1008,27 @@ static int nfs_parse_security_flavors(char *value,
979 return 1; 1008 return 1;
980} 1009}
981 1010
1011static int nfs_get_option_str(substring_t args[], char **option)
1012{
1013 kfree(*option);
1014 *option = match_strdup(args);
1015 return !option;
1016}
1017
1018static int nfs_get_option_ul(substring_t args[], unsigned long *option)
1019{
1020 int rc;
1021 char *string;
1022
1023 string = match_strdup(args);
1024 if (string == NULL)
1025 return -ENOMEM;
1026 rc = strict_strtoul(string, 10, option);
1027 kfree(string);
1028
1029 return rc;
1030}
1031
982/* 1032/*
983 * Error-check and convert a string of mount options from user space into 1033 * Error-check and convert a string of mount options from user space into
984 * a data structure. The whole mount string is processed; bad options are 1034 * a data structure. The whole mount string is processed; bad options are
@@ -1127,155 +1177,82 @@ static int nfs_parse_mount_options(char *raw,
1127 * options that take numeric values 1177 * options that take numeric values
1128 */ 1178 */
1129 case Opt_port: 1179 case Opt_port:
1130 string = match_strdup(args); 1180 if (nfs_get_option_ul(args, &option) ||
1131 if (string == NULL) 1181 option > USHRT_MAX)
1132 goto out_nomem;
1133 rc = strict_strtoul(string, 10, &option);
1134 kfree(string);
1135 if (rc != 0 || option > USHRT_MAX)
1136 goto out_invalid_value; 1182 goto out_invalid_value;
1137 mnt->nfs_server.port = option; 1183 mnt->nfs_server.port = option;
1138 break; 1184 break;
1139 case Opt_rsize: 1185 case Opt_rsize:
1140 string = match_strdup(args); 1186 if (nfs_get_option_ul(args, &option))
1141 if (string == NULL)
1142 goto out_nomem;
1143 rc = strict_strtoul(string, 10, &option);
1144 kfree(string);
1145 if (rc != 0)
1146 goto out_invalid_value; 1187 goto out_invalid_value;
1147 mnt->rsize = option; 1188 mnt->rsize = option;
1148 break; 1189 break;
1149 case Opt_wsize: 1190 case Opt_wsize:
1150 string = match_strdup(args); 1191 if (nfs_get_option_ul(args, &option))
1151 if (string == NULL)
1152 goto out_nomem;
1153 rc = strict_strtoul(string, 10, &option);
1154 kfree(string);
1155 if (rc != 0)
1156 goto out_invalid_value; 1192 goto out_invalid_value;
1157 mnt->wsize = option; 1193 mnt->wsize = option;
1158 break; 1194 break;
1159 case Opt_bsize: 1195 case Opt_bsize:
1160 string = match_strdup(args); 1196 if (nfs_get_option_ul(args, &option))
1161 if (string == NULL)
1162 goto out_nomem;
1163 rc = strict_strtoul(string, 10, &option);
1164 kfree(string);
1165 if (rc != 0)
1166 goto out_invalid_value; 1197 goto out_invalid_value;
1167 mnt->bsize = option; 1198 mnt->bsize = option;
1168 break; 1199 break;
1169 case Opt_timeo: 1200 case Opt_timeo:
1170 string = match_strdup(args); 1201 if (nfs_get_option_ul(args, &option) || option == 0)
1171 if (string == NULL)
1172 goto out_nomem;
1173 rc = strict_strtoul(string, 10, &option);
1174 kfree(string);
1175 if (rc != 0 || option == 0)
1176 goto out_invalid_value; 1202 goto out_invalid_value;
1177 mnt->timeo = option; 1203 mnt->timeo = option;
1178 break; 1204 break;
1179 case Opt_retrans: 1205 case Opt_retrans:
1180 string = match_strdup(args); 1206 if (nfs_get_option_ul(args, &option) || option == 0)
1181 if (string == NULL)
1182 goto out_nomem;
1183 rc = strict_strtoul(string, 10, &option);
1184 kfree(string);
1185 if (rc != 0 || option == 0)
1186 goto out_invalid_value; 1207 goto out_invalid_value;
1187 mnt->retrans = option; 1208 mnt->retrans = option;
1188 break; 1209 break;
1189 case Opt_acregmin: 1210 case Opt_acregmin:
1190 string = match_strdup(args); 1211 if (nfs_get_option_ul(args, &option))
1191 if (string == NULL)
1192 goto out_nomem;
1193 rc = strict_strtoul(string, 10, &option);
1194 kfree(string);
1195 if (rc != 0)
1196 goto out_invalid_value; 1212 goto out_invalid_value;
1197 mnt->acregmin = option; 1213 mnt->acregmin = option;
1198 break; 1214 break;
1199 case Opt_acregmax: 1215 case Opt_acregmax:
1200 string = match_strdup(args); 1216 if (nfs_get_option_ul(args, &option))
1201 if (string == NULL)
1202 goto out_nomem;
1203 rc = strict_strtoul(string, 10, &option);
1204 kfree(string);
1205 if (rc != 0)
1206 goto out_invalid_value; 1217 goto out_invalid_value;
1207 mnt->acregmax = option; 1218 mnt->acregmax = option;
1208 break; 1219 break;
1209 case Opt_acdirmin: 1220 case Opt_acdirmin:
1210 string = match_strdup(args); 1221 if (nfs_get_option_ul(args, &option))
1211 if (string == NULL)
1212 goto out_nomem;
1213 rc = strict_strtoul(string, 10, &option);
1214 kfree(string);
1215 if (rc != 0)
1216 goto out_invalid_value; 1222 goto out_invalid_value;
1217 mnt->acdirmin = option; 1223 mnt->acdirmin = option;
1218 break; 1224 break;
1219 case Opt_acdirmax: 1225 case Opt_acdirmax:
1220 string = match_strdup(args); 1226 if (nfs_get_option_ul(args, &option))
1221 if (string == NULL)
1222 goto out_nomem;
1223 rc = strict_strtoul(string, 10, &option);
1224 kfree(string);
1225 if (rc != 0)
1226 goto out_invalid_value; 1227 goto out_invalid_value;
1227 mnt->acdirmax = option; 1228 mnt->acdirmax = option;
1228 break; 1229 break;
1229 case Opt_actimeo: 1230 case Opt_actimeo:
1230 string = match_strdup(args); 1231 if (nfs_get_option_ul(args, &option))
1231 if (string == NULL)
1232 goto out_nomem;
1233 rc = strict_strtoul(string, 10, &option);
1234 kfree(string);
1235 if (rc != 0)
1236 goto out_invalid_value; 1232 goto out_invalid_value;
1237 mnt->acregmin = mnt->acregmax = 1233 mnt->acregmin = mnt->acregmax =
1238 mnt->acdirmin = mnt->acdirmax = option; 1234 mnt->acdirmin = mnt->acdirmax = option;
1239 break; 1235 break;
1240 case Opt_namelen: 1236 case Opt_namelen:
1241 string = match_strdup(args); 1237 if (nfs_get_option_ul(args, &option))
1242 if (string == NULL)
1243 goto out_nomem;
1244 rc = strict_strtoul(string, 10, &option);
1245 kfree(string);
1246 if (rc != 0)
1247 goto out_invalid_value; 1238 goto out_invalid_value;
1248 mnt->namlen = option; 1239 mnt->namlen = option;
1249 break; 1240 break;
1250 case Opt_mountport: 1241 case Opt_mountport:
1251 string = match_strdup(args); 1242 if (nfs_get_option_ul(args, &option) ||
1252 if (string == NULL) 1243 option > USHRT_MAX)
1253 goto out_nomem;
1254 rc = strict_strtoul(string, 10, &option);
1255 kfree(string);
1256 if (rc != 0 || option > USHRT_MAX)
1257 goto out_invalid_value; 1244 goto out_invalid_value;
1258 mnt->mount_server.port = option; 1245 mnt->mount_server.port = option;
1259 break; 1246 break;
1260 case Opt_mountvers: 1247 case Opt_mountvers:
1261 string = match_strdup(args); 1248 if (nfs_get_option_ul(args, &option) ||
1262 if (string == NULL)
1263 goto out_nomem;
1264 rc = strict_strtoul(string, 10, &option);
1265 kfree(string);
1266 if (rc != 0 ||
1267 option < NFS_MNT_VERSION || 1249 option < NFS_MNT_VERSION ||
1268 option > NFS_MNT3_VERSION) 1250 option > NFS_MNT3_VERSION)
1269 goto out_invalid_value; 1251 goto out_invalid_value;
1270 mnt->mount_server.version = option; 1252 mnt->mount_server.version = option;
1271 break; 1253 break;
1272 case Opt_nfsvers: 1254 case Opt_nfsvers:
1273 string = match_strdup(args); 1255 if (nfs_get_option_ul(args, &option))
1274 if (string == NULL)
1275 goto out_nomem;
1276 rc = strict_strtoul(string, 10, &option);
1277 kfree(string);
1278 if (rc != 0)
1279 goto out_invalid_value; 1256 goto out_invalid_value;
1280 switch (option) { 1257 switch (option) {
1281 case NFS2_VERSION: 1258 case NFS2_VERSION:
@@ -1295,12 +1272,7 @@ static int nfs_parse_mount_options(char *raw,
1295 } 1272 }
1296 break; 1273 break;
1297 case Opt_minorversion: 1274 case Opt_minorversion:
1298 string = match_strdup(args); 1275 if (nfs_get_option_ul(args, &option))
1299 if (string == NULL)
1300 goto out_nomem;
1301 rc = strict_strtoul(string, 10, &option);
1302 kfree(string);
1303 if (rc != 0)
1304 goto out_invalid_value; 1276 goto out_invalid_value;
1305 if (option > NFS4_MAX_MINOR_VERSION) 1277 if (option > NFS4_MAX_MINOR_VERSION)
1306 goto out_invalid_value; 1278 goto out_invalid_value;
@@ -1336,21 +1308,18 @@ static int nfs_parse_mount_options(char *raw,
1336 case Opt_xprt_udp: 1308 case Opt_xprt_udp:
1337 mnt->flags &= ~NFS_MOUNT_TCP; 1309 mnt->flags &= ~NFS_MOUNT_TCP;
1338 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; 1310 mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP;
1339 kfree(string);
1340 break; 1311 break;
1341 case Opt_xprt_tcp6: 1312 case Opt_xprt_tcp6:
1342 protofamily = AF_INET6; 1313 protofamily = AF_INET6;
1343 case Opt_xprt_tcp: 1314 case Opt_xprt_tcp:
1344 mnt->flags |= NFS_MOUNT_TCP; 1315 mnt->flags |= NFS_MOUNT_TCP;
1345 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; 1316 mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP;
1346 kfree(string);
1347 break; 1317 break;
1348 case Opt_xprt_rdma: 1318 case Opt_xprt_rdma:
1349 /* vector side protocols to TCP */ 1319 /* vector side protocols to TCP */
1350 mnt->flags |= NFS_MOUNT_TCP; 1320 mnt->flags |= NFS_MOUNT_TCP;
1351 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA; 1321 mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
1352 xprt_load_transport(string); 1322 xprt_load_transport(string);
1353 kfree(string);
1354 break; 1323 break;
1355 default: 1324 default:
1356 dfprintk(MOUNT, "NFS: unrecognized " 1325 dfprintk(MOUNT, "NFS: unrecognized "
@@ -1358,6 +1327,7 @@ static int nfs_parse_mount_options(char *raw,
1358 kfree(string); 1327 kfree(string);
1359 return 0; 1328 return 0;
1360 } 1329 }
1330 kfree(string);
1361 break; 1331 break;
1362 case Opt_mountproto: 1332 case Opt_mountproto:
1363 string = match_strdup(args); 1333 string = match_strdup(args);
@@ -1400,18 +1370,13 @@ static int nfs_parse_mount_options(char *raw,
1400 goto out_invalid_address; 1370 goto out_invalid_address;
1401 break; 1371 break;
1402 case Opt_clientaddr: 1372 case Opt_clientaddr:
1403 string = match_strdup(args); 1373 if (nfs_get_option_str(args, &mnt->client_address))
1404 if (string == NULL)
1405 goto out_nomem; 1374 goto out_nomem;
1406 kfree(mnt->client_address);
1407 mnt->client_address = string;
1408 break; 1375 break;
1409 case Opt_mounthost: 1376 case Opt_mounthost:
1410 string = match_strdup(args); 1377 if (nfs_get_option_str(args,
1411 if (string == NULL) 1378 &mnt->mount_server.hostname))
1412 goto out_nomem; 1379 goto out_nomem;
1413 kfree(mnt->mount_server.hostname);
1414 mnt->mount_server.hostname = string;
1415 break; 1380 break;
1416 case Opt_mountaddr: 1381 case Opt_mountaddr:
1417 string = match_strdup(args); 1382 string = match_strdup(args);
@@ -1451,11 +1416,8 @@ static int nfs_parse_mount_options(char *raw,
1451 }; 1416 };
1452 break; 1417 break;
1453 case Opt_fscache_uniq: 1418 case Opt_fscache_uniq:
1454 string = match_strdup(args); 1419 if (nfs_get_option_str(args, &mnt->fscache_uniq))
1455 if (string == NULL)
1456 goto out_nomem; 1420 goto out_nomem;
1457 kfree(mnt->fscache_uniq);
1458 mnt->fscache_uniq = string;
1459 mnt->options |= NFS_OPTION_FSCACHE; 1421 mnt->options |= NFS_OPTION_FSCACHE;
1460 break; 1422 break;
1461 case Opt_local_lock: 1423 case Opt_local_lock:
@@ -1665,99 +1627,59 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
1665 return nfs_walk_authlist(args, &request); 1627 return nfs_walk_authlist(args, &request);
1666} 1628}
1667 1629
1668static int nfs_parse_simple_hostname(const char *dev_name, 1630/*
1669 char **hostname, size_t maxnamlen, 1631 * Split "dev_name" into "hostname:export_path".
1670 char **export_path, size_t maxpathlen) 1632 *
1633 * The leftmost colon demarks the split between the server's hostname
1634 * and the export path. If the hostname starts with a left square
1635 * bracket, then it may contain colons.
1636 *
1637 * Note: caller frees hostname and export path, even on error.
1638 */
1639static int nfs_parse_devname(const char *dev_name,
1640 char **hostname, size_t maxnamlen,
1641 char **export_path, size_t maxpathlen)
1671{ 1642{
1672 size_t len; 1643 size_t len;
1673 char *colon, *comma; 1644 char *end;
1674
1675 colon = strchr(dev_name, ':');
1676 if (colon == NULL)
1677 goto out_bad_devname;
1678
1679 len = colon - dev_name;
1680 if (len > maxnamlen)
1681 goto out_hostname;
1682 1645
1683 /* N.B. caller will free nfs_server.hostname in all cases */ 1646 /* Is the host name protected with square brakcets? */
1684 *hostname = kstrndup(dev_name, len, GFP_KERNEL); 1647 if (*dev_name == '[') {
1685 if (!*hostname) 1648 end = strchr(++dev_name, ']');
1686 goto out_nomem; 1649 if (end == NULL || end[1] != ':')
1687
1688 /* kill possible hostname list: not supported */
1689 comma = strchr(*hostname, ',');
1690 if (comma != NULL) {
1691 if (comma == *hostname)
1692 goto out_bad_devname; 1650 goto out_bad_devname;
1693 *comma = '\0';
1694 }
1695 1651
1696 colon++; 1652 len = end - dev_name;
1697 len = strlen(colon); 1653 end++;
1698 if (len > maxpathlen) 1654 } else {
1699 goto out_path; 1655 char *comma;
1700 *export_path = kstrndup(colon, len, GFP_KERNEL);
1701 if (!*export_path)
1702 goto out_nomem;
1703
1704 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1705 return 0;
1706
1707out_bad_devname:
1708 dfprintk(MOUNT, "NFS: device name not in host:path format\n");
1709 return -EINVAL;
1710
1711out_nomem:
1712 dfprintk(MOUNT, "NFS: not enough memory to parse device name\n");
1713 return -ENOMEM;
1714
1715out_hostname:
1716 dfprintk(MOUNT, "NFS: server hostname too long\n");
1717 return -ENAMETOOLONG;
1718
1719out_path:
1720 dfprintk(MOUNT, "NFS: export pathname too long\n");
1721 return -ENAMETOOLONG;
1722}
1723
1724/*
1725 * Hostname has square brackets around it because it contains one or
1726 * more colons. We look for the first closing square bracket, and a
1727 * colon must follow it.
1728 */
1729static int nfs_parse_protected_hostname(const char *dev_name,
1730 char **hostname, size_t maxnamlen,
1731 char **export_path, size_t maxpathlen)
1732{
1733 size_t len;
1734 char *start, *end;
1735 1656
1736 start = (char *)(dev_name + 1); 1657 end = strchr(dev_name, ':');
1658 if (end == NULL)
1659 goto out_bad_devname;
1660 len = end - dev_name;
1737 1661
1738 end = strchr(start, ']'); 1662 /* kill possible hostname list: not supported */
1739 if (end == NULL) 1663 comma = strchr(dev_name, ',');
1740 goto out_bad_devname; 1664 if (comma != NULL && comma < end)
1741 if (*(end + 1) != ':') 1665 *comma = 0;
1742 goto out_bad_devname; 1666 }
1743 1667
1744 len = end - start;
1745 if (len > maxnamlen) 1668 if (len > maxnamlen)
1746 goto out_hostname; 1669 goto out_hostname;
1747 1670
1748 /* N.B. caller will free nfs_server.hostname in all cases */ 1671 /* N.B. caller will free nfs_server.hostname in all cases */
1749 *hostname = kstrndup(start, len, GFP_KERNEL); 1672 *hostname = kstrndup(dev_name, len, GFP_KERNEL);
1750 if (*hostname == NULL) 1673 if (*hostname == NULL)
1751 goto out_nomem; 1674 goto out_nomem;
1752 1675 len = strlen(++end);
1753 end += 2;
1754 len = strlen(end);
1755 if (len > maxpathlen) 1676 if (len > maxpathlen)
1756 goto out_path; 1677 goto out_path;
1757 *export_path = kstrndup(end, len, GFP_KERNEL); 1678 *export_path = kstrndup(end, len, GFP_KERNEL);
1758 if (!*export_path) 1679 if (!*export_path)
1759 goto out_nomem; 1680 goto out_nomem;
1760 1681
1682 dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", *export_path);
1761 return 0; 1683 return 0;
1762 1684
1763out_bad_devname: 1685out_bad_devname:
@@ -1778,29 +1700,6 @@ out_path:
1778} 1700}
1779 1701
1780/* 1702/*
1781 * Split "dev_name" into "hostname:export_path".
1782 *
1783 * The leftmost colon demarks the split between the server's hostname
1784 * and the export path. If the hostname starts with a left square
1785 * bracket, then it may contain colons.
1786 *
1787 * Note: caller frees hostname and export path, even on error.
1788 */
1789static int nfs_parse_devname(const char *dev_name,
1790 char **hostname, size_t maxnamlen,
1791 char **export_path, size_t maxpathlen)
1792{
1793 if (*dev_name == '[')
1794 return nfs_parse_protected_hostname(dev_name,
1795 hostname, maxnamlen,
1796 export_path, maxpathlen);
1797
1798 return nfs_parse_simple_hostname(dev_name,
1799 hostname, maxnamlen,
1800 export_path, maxpathlen);
1801}
1802
1803/*
1804 * Validate the NFS2/NFS3 mount data 1703 * Validate the NFS2/NFS3 mount data
1805 * - fills in the mount root filehandle 1704 * - fills in the mount root filehandle
1806 * 1705 *
@@ -2267,19 +2166,19 @@ static int nfs_bdi_register(struct nfs_server *server)
2267 return bdi_register_dev(&server->backing_dev_info, server->s_dev); 2166 return bdi_register_dev(&server->backing_dev_info, server->s_dev);
2268} 2167}
2269 2168
2270static int nfs_get_sb(struct file_system_type *fs_type, 2169static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2271 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2170 int flags, const char *dev_name, void *raw_data)
2272{ 2171{
2273 struct nfs_server *server = NULL; 2172 struct nfs_server *server = NULL;
2274 struct super_block *s; 2173 struct super_block *s;
2275 struct nfs_parsed_mount_data *data; 2174 struct nfs_parsed_mount_data *data;
2276 struct nfs_fh *mntfh; 2175 struct nfs_fh *mntfh;
2277 struct dentry *mntroot; 2176 struct dentry *mntroot = ERR_PTR(-ENOMEM);
2278 int (*compare_super)(struct super_block *, void *) = nfs_compare_super; 2177 int (*compare_super)(struct super_block *, void *) = nfs_compare_super;
2279 struct nfs_sb_mountdata sb_mntdata = { 2178 struct nfs_sb_mountdata sb_mntdata = {
2280 .mntflags = flags, 2179 .mntflags = flags,
2281 }; 2180 };
2282 int error = -ENOMEM; 2181 int error;
2283 2182
2284 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION); 2183 data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
2285 mntfh = nfs_alloc_fhandle(); 2184 mntfh = nfs_alloc_fhandle();
@@ -2290,12 +2189,14 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2290 2189
2291 /* Validate the mount data */ 2190 /* Validate the mount data */
2292 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name); 2191 error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
2293 if (error < 0) 2192 if (error < 0) {
2193 mntroot = ERR_PTR(error);
2294 goto out; 2194 goto out;
2195 }
2295 2196
2296#ifdef CONFIG_NFS_V4 2197#ifdef CONFIG_NFS_V4
2297 if (data->version == 4) { 2198 if (data->version == 4) {
2298 error = nfs4_try_mount(flags, dev_name, data, mnt); 2199 mntroot = nfs4_try_mount(flags, dev_name, data);
2299 kfree(data->client_address); 2200 kfree(data->client_address);
2300 kfree(data->nfs_server.export_path); 2201 kfree(data->nfs_server.export_path);
2301 goto out; 2202 goto out;
@@ -2305,7 +2206,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2305 /* Get a volume representation */ 2206 /* Get a volume representation */
2306 server = nfs_create_server(data, mntfh); 2207 server = nfs_create_server(data, mntfh);
2307 if (IS_ERR(server)) { 2208 if (IS_ERR(server)) {
2308 error = PTR_ERR(server); 2209 mntroot = ERR_CAST(server);
2309 goto out; 2210 goto out;
2310 } 2211 }
2311 sb_mntdata.server = server; 2212 sb_mntdata.server = server;
@@ -2316,7 +2217,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2316 /* Get a superblock - note that we may end up sharing one that already exists */ 2217 /* Get a superblock - note that we may end up sharing one that already exists */
2317 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2218 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2318 if (IS_ERR(s)) { 2219 if (IS_ERR(s)) {
2319 error = PTR_ERR(s); 2220 mntroot = ERR_CAST(s);
2320 goto out_err_nosb; 2221 goto out_err_nosb;
2321 } 2222 }
2322 2223
@@ -2325,8 +2226,10 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2325 server = NULL; 2226 server = NULL;
2326 } else { 2227 } else {
2327 error = nfs_bdi_register(server); 2228 error = nfs_bdi_register(server);
2328 if (error) 2229 if (error) {
2230 mntroot = ERR_PTR(error);
2329 goto error_splat_bdi; 2231 goto error_splat_bdi;
2232 }
2330 } 2233 }
2331 2234
2332 if (!s->s_root) { 2235 if (!s->s_root) {
@@ -2336,20 +2239,15 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2336 s, data ? data->fscache_uniq : NULL, NULL); 2239 s, data ? data->fscache_uniq : NULL, NULL);
2337 } 2240 }
2338 2241
2339 mntroot = nfs_get_root(s, mntfh); 2242 mntroot = nfs_get_root(s, mntfh, dev_name);
2340 if (IS_ERR(mntroot)) { 2243 if (IS_ERR(mntroot))
2341 error = PTR_ERR(mntroot);
2342 goto error_splat_super; 2244 goto error_splat_super;
2343 }
2344 2245
2345 error = security_sb_set_mnt_opts(s, &data->lsm_opts); 2246 error = security_sb_set_mnt_opts(s, &data->lsm_opts);
2346 if (error) 2247 if (error)
2347 goto error_splat_root; 2248 goto error_splat_root;
2348 2249
2349 s->s_flags |= MS_ACTIVE; 2250 s->s_flags |= MS_ACTIVE;
2350 mnt->mnt_sb = s;
2351 mnt->mnt_root = mntroot;
2352 error = 0;
2353 2251
2354out: 2252out:
2355 kfree(data->nfs_server.hostname); 2253 kfree(data->nfs_server.hostname);
@@ -2359,7 +2257,7 @@ out:
2359out_free_fh: 2257out_free_fh:
2360 nfs_free_fhandle(mntfh); 2258 nfs_free_fhandle(mntfh);
2361 kfree(data); 2259 kfree(data);
2362 return error; 2260 return mntroot;
2363 2261
2364out_err_nosb: 2262out_err_nosb:
2365 nfs_free_server(server); 2263 nfs_free_server(server);
@@ -2367,6 +2265,7 @@ out_err_nosb:
2367 2265
2368error_splat_root: 2266error_splat_root:
2369 dput(mntroot); 2267 dput(mntroot);
2268 mntroot = ERR_PTR(error);
2370error_splat_super: 2269error_splat_super:
2371 if (server && !s->s_root) 2270 if (server && !s->s_root)
2372 bdi_unregister(&server->backing_dev_info); 2271 bdi_unregister(&server->backing_dev_info);
@@ -2450,7 +2349,7 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2450 nfs_fscache_get_super_cookie(s, NULL, data); 2349 nfs_fscache_get_super_cookie(s, NULL, data);
2451 } 2350 }
2452 2351
2453 mntroot = nfs_get_root(s, data->fh); 2352 mntroot = nfs_get_root(s, data->fh, dev_name);
2454 if (IS_ERR(mntroot)) { 2353 if (IS_ERR(mntroot)) {
2455 error = PTR_ERR(mntroot); 2354 error = PTR_ERR(mntroot);
2456 goto error_splat_super; 2355 goto error_splat_super;
@@ -2718,7 +2617,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2718 s, data ? data->fscache_uniq : NULL, NULL); 2617 s, data ? data->fscache_uniq : NULL, NULL);
2719 } 2618 }
2720 2619
2721 mntroot = nfs4_get_root(s, mntfh); 2620 mntroot = nfs4_get_root(s, mntfh, dev_name);
2722 if (IS_ERR(mntroot)) { 2621 if (IS_ERR(mntroot)) {
2723 error = PTR_ERR(mntroot); 2622 error = PTR_ERR(mntroot);
2724 goto error_splat_super; 2623 goto error_splat_super;
@@ -2771,27 +2670,6 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
2771 return root_mnt; 2670 return root_mnt;
2772} 2671}
2773 2672
2774static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt)
2775{
2776 char *page = (char *) __get_free_page(GFP_KERNEL);
2777 char *devname, *tmp;
2778
2779 if (page == NULL)
2780 return;
2781 devname = nfs_path(path->mnt->mnt_devname,
2782 path->mnt->mnt_root, path->dentry,
2783 page, PAGE_SIZE);
2784 if (IS_ERR(devname))
2785 goto out_freepage;
2786 tmp = kstrdup(devname, GFP_KERNEL);
2787 if (tmp == NULL)
2788 goto out_freepage;
2789 kfree(mnt->mnt_devname);
2790 mnt->mnt_devname = tmp;
2791out_freepage:
2792 free_page((unsigned long)page);
2793}
2794
2795struct nfs_referral_count { 2673struct nfs_referral_count {
2796 struct list_head list; 2674 struct list_head list;
2797 const struct task_struct *task; 2675 const struct task_struct *task;
@@ -2858,17 +2736,18 @@ static void nfs_referral_loop_unprotect(void)
2858 kfree(p); 2736 kfree(p);
2859} 2737}
2860 2738
2861static int nfs_follow_remote_path(struct vfsmount *root_mnt, 2739static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2862 const char *export_path, struct vfsmount *mnt_target) 2740 const char *export_path)
2863{ 2741{
2864 struct nameidata *nd = NULL; 2742 struct nameidata *nd = NULL;
2865 struct mnt_namespace *ns_private; 2743 struct mnt_namespace *ns_private;
2866 struct super_block *s; 2744 struct super_block *s;
2745 struct dentry *dentry;
2867 int ret; 2746 int ret;
2868 2747
2869 nd = kmalloc(sizeof(*nd), GFP_KERNEL); 2748 nd = kmalloc(sizeof(*nd), GFP_KERNEL);
2870 if (nd == NULL) 2749 if (nd == NULL)
2871 return -ENOMEM; 2750 return ERR_PTR(-ENOMEM);
2872 2751
2873 ns_private = create_mnt_ns(root_mnt); 2752 ns_private = create_mnt_ns(root_mnt);
2874 ret = PTR_ERR(ns_private); 2753 ret = PTR_ERR(ns_private);
@@ -2890,32 +2769,27 @@ static int nfs_follow_remote_path(struct vfsmount *root_mnt,
2890 2769
2891 s = nd->path.mnt->mnt_sb; 2770 s = nd->path.mnt->mnt_sb;
2892 atomic_inc(&s->s_active); 2771 atomic_inc(&s->s_active);
2893 mnt_target->mnt_sb = s; 2772 dentry = dget(nd->path.dentry);
2894 mnt_target->mnt_root = dget(nd->path.dentry);
2895
2896 /* Correct the device pathname */
2897 nfs_fix_devname(&nd->path, mnt_target);
2898 2773
2899 path_put(&nd->path); 2774 path_put(&nd->path);
2900 kfree(nd); 2775 kfree(nd);
2901 down_write(&s->s_umount); 2776 down_write(&s->s_umount);
2902 return 0; 2777 return dentry;
2903out_put_mnt_ns: 2778out_put_mnt_ns:
2904 put_mnt_ns(ns_private); 2779 put_mnt_ns(ns_private);
2905out_mntput: 2780out_mntput:
2906 mntput(root_mnt); 2781 mntput(root_mnt);
2907out_err: 2782out_err:
2908 kfree(nd); 2783 kfree(nd);
2909 return ret; 2784 return ERR_PTR(ret);
2910} 2785}
2911 2786
2912static int nfs4_try_mount(int flags, const char *dev_name, 2787static struct dentry *nfs4_try_mount(int flags, const char *dev_name,
2913 struct nfs_parsed_mount_data *data, 2788 struct nfs_parsed_mount_data *data)
2914 struct vfsmount *mnt)
2915{ 2789{
2916 char *export_path; 2790 char *export_path;
2917 struct vfsmount *root_mnt; 2791 struct vfsmount *root_mnt;
2918 int error; 2792 struct dentry *res;
2919 2793
2920 dfprintk(MOUNT, "--> nfs4_try_mount()\n"); 2794 dfprintk(MOUNT, "--> nfs4_try_mount()\n");
2921 2795
@@ -2925,26 +2799,25 @@ static int nfs4_try_mount(int flags, const char *dev_name,
2925 data->nfs_server.hostname); 2799 data->nfs_server.hostname);
2926 data->nfs_server.export_path = export_path; 2800 data->nfs_server.export_path = export_path;
2927 2801
2928 error = PTR_ERR(root_mnt); 2802 res = ERR_CAST(root_mnt);
2929 if (IS_ERR(root_mnt)) 2803 if (!IS_ERR(root_mnt))
2930 goto out; 2804 res = nfs_follow_remote_path(root_mnt, export_path);
2931
2932 error = nfs_follow_remote_path(root_mnt, export_path, mnt);
2933 2805
2934out: 2806 dfprintk(MOUNT, "<-- nfs4_try_mount() = %ld%s\n",
2935 dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, 2807 IS_ERR(res) ? PTR_ERR(res) : 0,
2936 error != 0 ? " [error]" : ""); 2808 IS_ERR(res) ? " [error]" : "");
2937 return error; 2809 return res;
2938} 2810}
2939 2811
2940/* 2812/*
2941 * Get the superblock for an NFS4 mountpoint 2813 * Get the superblock for an NFS4 mountpoint
2942 */ 2814 */
2943static int nfs4_get_sb(struct file_system_type *fs_type, 2815static struct dentry *nfs4_mount(struct file_system_type *fs_type,
2944 int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) 2816 int flags, const char *dev_name, void *raw_data)
2945{ 2817{
2946 struct nfs_parsed_mount_data *data; 2818 struct nfs_parsed_mount_data *data;
2947 int error = -ENOMEM; 2819 int error = -ENOMEM;
2820 struct dentry *res = ERR_PTR(-ENOMEM);
2948 2821
2949 data = nfs_alloc_parsed_mount_data(4); 2822 data = nfs_alloc_parsed_mount_data(4);
2950 if (data == NULL) 2823 if (data == NULL)
@@ -2952,10 +2825,14 @@ static int nfs4_get_sb(struct file_system_type *fs_type,
2952 2825
2953 /* Validate the mount data */ 2826 /* Validate the mount data */
2954 error = nfs4_validate_mount_data(raw_data, data, dev_name); 2827 error = nfs4_validate_mount_data(raw_data, data, dev_name);
2955 if (error < 0) 2828 if (error < 0) {
2829 res = ERR_PTR(error);
2956 goto out; 2830 goto out;
2831 }
2957 2832
2958 error = nfs4_try_mount(flags, dev_name, data, mnt); 2833 res = nfs4_try_mount(flags, dev_name, data);
2834 if (IS_ERR(res))
2835 error = PTR_ERR(res);
2959 2836
2960out: 2837out:
2961 kfree(data->client_address); 2838 kfree(data->client_address);
@@ -2964,9 +2841,9 @@ out:
2964 kfree(data->fscache_uniq); 2841 kfree(data->fscache_uniq);
2965out_free_data: 2842out_free_data:
2966 kfree(data); 2843 kfree(data);
2967 dprintk("<-- nfs4_get_sb() = %d%s\n", error, 2844 dprintk("<-- nfs4_mount() = %d%s\n", error,
2968 error != 0 ? " [error]" : ""); 2845 error != 0 ? " [error]" : "");
2969 return error; 2846 return res;
2970} 2847}
2971 2848
2972static void nfs4_kill_super(struct super_block *sb) 2849static void nfs4_kill_super(struct super_block *sb)
@@ -3033,7 +2910,7 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
3033 nfs_fscache_get_super_cookie(s, NULL, data); 2910 nfs_fscache_get_super_cookie(s, NULL, data);
3034 } 2911 }
3035 2912
3036 mntroot = nfs4_get_root(s, data->fh); 2913 mntroot = nfs4_get_root(s, data->fh, dev_name);
3037 if (IS_ERR(mntroot)) { 2914 if (IS_ERR(mntroot)) {
3038 error = PTR_ERR(mntroot); 2915 error = PTR_ERR(mntroot);
3039 goto error_splat_super; 2916 goto error_splat_super;
@@ -3120,7 +2997,7 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3120 nfs_fscache_get_super_cookie(s, NULL, data); 2997 nfs_fscache_get_super_cookie(s, NULL, data);
3121 } 2998 }
3122 2999
3123 mntroot = nfs4_get_root(s, mntfh); 3000 mntroot = nfs4_get_root(s, mntfh, dev_name);
3124 if (IS_ERR(mntroot)) { 3001 if (IS_ERR(mntroot)) {
3125 error = PTR_ERR(mntroot); 3002 error = PTR_ERR(mntroot);
3126 goto error_splat_super; 3003 goto error_splat_super;
@@ -3160,16 +3037,15 @@ error_splat_bdi:
3160/* 3037/*
3161 * Create an NFS4 server record on referral traversal 3038 * Create an NFS4 server record on referral traversal
3162 */ 3039 */
3163static int nfs4_referral_get_sb(struct file_system_type *fs_type, 3040static struct dentry *nfs4_referral_mount(struct file_system_type *fs_type,
3164 int flags, const char *dev_name, void *raw_data, 3041 int flags, const char *dev_name, void *raw_data)
3165 struct vfsmount *mnt)
3166{ 3042{
3167 struct nfs_clone_mount *data = raw_data; 3043 struct nfs_clone_mount *data = raw_data;
3168 char *export_path; 3044 char *export_path;
3169 struct vfsmount *root_mnt; 3045 struct vfsmount *root_mnt;
3170 int error; 3046 struct dentry *res;
3171 3047
3172 dprintk("--> nfs4_referral_get_sb()\n"); 3048 dprintk("--> nfs4_referral_mount()\n");
3173 3049
3174 export_path = data->mnt_path; 3050 export_path = data->mnt_path;
3175 data->mnt_path = "/"; 3051 data->mnt_path = "/";
@@ -3178,15 +3054,13 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type,
3178 flags, data, data->hostname); 3054 flags, data, data->hostname);
3179 data->mnt_path = export_path; 3055 data->mnt_path = export_path;
3180 3056
3181 error = PTR_ERR(root_mnt); 3057 res = ERR_CAST(root_mnt);
3182 if (IS_ERR(root_mnt)) 3058 if (!IS_ERR(root_mnt))
3183 goto out; 3059 res = nfs_follow_remote_path(root_mnt, export_path);
3184 3060 dprintk("<-- nfs4_referral_mount() = %ld%s\n",
3185 error = nfs_follow_remote_path(root_mnt, export_path, mnt); 3061 IS_ERR(res) ? PTR_ERR(res) : 0,
3186out: 3062 IS_ERR(res) ? " [error]" : "");
3187 dprintk("<-- nfs4_referral_get_sb() = %d%s\n", error, 3063 return res;
3188 error != 0 ? " [error]" : "");
3189 return error;
3190} 3064}
3191 3065
3192#endif /* CONFIG_NFS_V4 */ 3066#endif /* CONFIG_NFS_V4 */
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..8d6864c2a5fa 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -148,6 +148,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
148 alias = d_lookup(parent, &data->args.name); 148 alias = d_lookup(parent, &data->args.name);
149 if (alias != NULL) { 149 if (alias != NULL) {
150 int ret = 0; 150 int ret = 0;
151 void *devname_garbage = NULL;
151 152
152 /* 153 /*
153 * Hey, we raced with lookup... See if we need to transfer 154 * Hey, we raced with lookup... See if we need to transfer
@@ -157,6 +158,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
157 spin_lock(&alias->d_lock); 158 spin_lock(&alias->d_lock);
158 if (alias->d_inode != NULL && 159 if (alias->d_inode != NULL &&
159 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { 160 !(alias->d_flags & DCACHE_NFSFS_RENAMED)) {
161 devname_garbage = alias->d_fsdata;
160 alias->d_fsdata = data; 162 alias->d_fsdata = data;
161 alias->d_flags |= DCACHE_NFSFS_RENAMED; 163 alias->d_flags |= DCACHE_NFSFS_RENAMED;
162 ret = 1; 164 ret = 1;
@@ -164,6 +166,13 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
164 spin_unlock(&alias->d_lock); 166 spin_unlock(&alias->d_lock);
165 nfs_dec_sillycount(dir); 167 nfs_dec_sillycount(dir);
166 dput(alias); 168 dput(alias);
169 /*
170 * If we'd displaced old cached devname, free it. At that
171 * point dentry is definitely not a root, so we won't need
172 * that anymore.
173 */
174 if (devname_garbage)
175 kfree(devname_garbage);
167 return ret; 176 return ret;
168 } 177 }
169 data->dir = igrab(dir); 178 data->dir = igrab(dir);
@@ -180,7 +189,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
180 task_setup_data.rpc_client = NFS_CLIENT(dir); 189 task_setup_data.rpc_client = NFS_CLIENT(dir);
181 task = rpc_run_task(&task_setup_data); 190 task = rpc_run_task(&task_setup_data);
182 if (!IS_ERR(task)) 191 if (!IS_ERR(task))
183 rpc_put_task(task); 192 rpc_put_task_async(task);
184 return 1; 193 return 1;
185} 194}
186 195
@@ -252,6 +261,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
252{ 261{
253 struct nfs_unlinkdata *data; 262 struct nfs_unlinkdata *data;
254 int status = -ENOMEM; 263 int status = -ENOMEM;
264 void *devname_garbage = NULL;
255 265
256 data = kzalloc(sizeof(*data), GFP_KERNEL); 266 data = kzalloc(sizeof(*data), GFP_KERNEL);
257 if (data == NULL) 267 if (data == NULL)
@@ -269,8 +279,16 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry)
269 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) 279 if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
270 goto out_unlock; 280 goto out_unlock;
271 dentry->d_flags |= DCACHE_NFSFS_RENAMED; 281 dentry->d_flags |= DCACHE_NFSFS_RENAMED;
282 devname_garbage = dentry->d_fsdata;
272 dentry->d_fsdata = data; 283 dentry->d_fsdata = data;
273 spin_unlock(&dentry->d_lock); 284 spin_unlock(&dentry->d_lock);
285 /*
286 * If we'd displaced old cached devname, free it. At that
287 * point dentry is definitely not a root, so we won't need
288 * that anymore.
289 */
290 if (devname_garbage)
291 kfree(devname_garbage);
274 return 0; 292 return 0;
275out_unlock: 293out_unlock:
276 spin_unlock(&dentry->d_lock); 294 spin_unlock(&dentry->d_lock);
@@ -299,6 +317,7 @@ nfs_complete_unlink(struct dentry *dentry, struct inode *inode)
299 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { 317 if (dentry->d_flags & DCACHE_NFSFS_RENAMED) {
300 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 318 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
301 data = dentry->d_fsdata; 319 data = dentry->d_fsdata;
320 dentry->d_fsdata = NULL;
302 } 321 }
303 spin_unlock(&dentry->d_lock); 322 spin_unlock(&dentry->d_lock);
304 323
@@ -315,6 +334,7 @@ nfs_cancel_async_unlink(struct dentry *dentry)
315 struct nfs_unlinkdata *data = dentry->d_fsdata; 334 struct nfs_unlinkdata *data = dentry->d_fsdata;
316 335
317 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED; 336 dentry->d_flags &= ~DCACHE_NFSFS_RENAMED;
337 dentry->d_fsdata = NULL;
318 spin_unlock(&dentry->d_lock); 338 spin_unlock(&dentry->d_lock);
319 nfs_free_unlinkdata(data); 339 nfs_free_unlinkdata(data);
320 return; 340 return;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4046cb..e4cbc11a74ab 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -28,6 +28,7 @@
28#include "iostat.h" 28#include "iostat.h"
29#include "nfs4_fs.h" 29#include "nfs4_fs.h"
30#include "fscache.h" 30#include "fscache.h"
31#include "pnfs.h"
31 32
32#define NFSDBG_FACILITY NFSDBG_PAGECACHE 33#define NFSDBG_FACILITY NFSDBG_PAGECACHE
33 34
@@ -58,6 +59,7 @@ struct nfs_write_data *nfs_commitdata_alloc(void)
58 } 59 }
59 return p; 60 return p;
60} 61}
62EXPORT_SYMBOL_GPL(nfs_commitdata_alloc);
61 63
62void nfs_commit_free(struct nfs_write_data *p) 64void nfs_commit_free(struct nfs_write_data *p)
63{ 65{
@@ -65,6 +67,7 @@ void nfs_commit_free(struct nfs_write_data *p)
65 kfree(p->pagevec); 67 kfree(p->pagevec);
66 mempool_free(p, nfs_commit_mempool); 68 mempool_free(p, nfs_commit_mempool);
67} 69}
70EXPORT_SYMBOL_GPL(nfs_commit_free);
68 71
69struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) 72struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
70{ 73{
@@ -96,6 +99,7 @@ void nfs_writedata_free(struct nfs_write_data *p)
96 99
97static void nfs_writedata_release(struct nfs_write_data *wdata) 100static void nfs_writedata_release(struct nfs_write_data *wdata)
98{ 101{
102 put_lseg(wdata->lseg);
99 put_nfs_open_context(wdata->args.context); 103 put_nfs_open_context(wdata->args.context);
100 nfs_writedata_free(wdata); 104 nfs_writedata_free(wdata);
101} 105}
@@ -177,8 +181,8 @@ static int wb_priority(struct writeback_control *wbc)
177 if (wbc->for_reclaim) 181 if (wbc->for_reclaim)
178 return FLUSH_HIGHPRI | FLUSH_STABLE; 182 return FLUSH_HIGHPRI | FLUSH_STABLE;
179 if (wbc->for_kupdate || wbc->for_background) 183 if (wbc->for_kupdate || wbc->for_background)
180 return FLUSH_LOWPRI; 184 return FLUSH_LOWPRI | FLUSH_COND_STABLE;
181 return 0; 185 return FLUSH_COND_STABLE;
182} 186}
183 187
184/* 188/*
@@ -385,11 +389,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
385 spin_lock(&inode->i_lock); 389 spin_lock(&inode->i_lock);
386 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); 390 error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req);
387 BUG_ON(error); 391 BUG_ON(error);
388 if (!nfsi->npages) { 392 if (!nfsi->npages && nfs_have_delegation(inode, FMODE_WRITE))
389 igrab(inode); 393 nfsi->change_attr++;
390 if (nfs_have_delegation(inode, FMODE_WRITE))
391 nfsi->change_attr++;
392 }
393 set_bit(PG_MAPPED, &req->wb_flags); 394 set_bit(PG_MAPPED, &req->wb_flags);
394 SetPagePrivate(req->wb_page); 395 SetPagePrivate(req->wb_page);
395 set_page_private(req->wb_page, (unsigned long)req); 396 set_page_private(req->wb_page, (unsigned long)req);
@@ -419,11 +420,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
419 clear_bit(PG_MAPPED, &req->wb_flags); 420 clear_bit(PG_MAPPED, &req->wb_flags);
420 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index); 421 radix_tree_delete(&nfsi->nfs_page_tree, req->wb_index);
421 nfsi->npages--; 422 nfsi->npages--;
422 if (!nfsi->npages) { 423 spin_unlock(&inode->i_lock);
423 spin_unlock(&inode->i_lock);
424 iput(inode);
425 } else
426 spin_unlock(&inode->i_lock);
427 nfs_release_request(req); 424 nfs_release_request(req);
428} 425}
429 426
@@ -439,7 +436,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
439 * Add a request to the inode's commit list. 436 * Add a request to the inode's commit list.
440 */ 437 */
441static void 438static void
442nfs_mark_request_commit(struct nfs_page *req) 439nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
443{ 440{
444 struct inode *inode = req->wb_context->path.dentry->d_inode; 441 struct inode *inode = req->wb_context->path.dentry->d_inode;
445 struct nfs_inode *nfsi = NFS_I(inode); 442 struct nfs_inode *nfsi = NFS_I(inode);
@@ -451,6 +448,7 @@ nfs_mark_request_commit(struct nfs_page *req)
451 NFS_PAGE_TAG_COMMIT); 448 NFS_PAGE_TAG_COMMIT);
452 nfsi->ncommit++; 449 nfsi->ncommit++;
453 spin_unlock(&inode->i_lock); 450 spin_unlock(&inode->i_lock);
451 pnfs_mark_request_commit(req, lseg);
454 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 452 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
455 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE); 453 inc_bdi_stat(req->wb_page->mapping->backing_dev_info, BDI_RECLAIMABLE);
456 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 454 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
@@ -472,14 +470,18 @@ nfs_clear_request_commit(struct nfs_page *req)
472static inline 470static inline
473int nfs_write_need_commit(struct nfs_write_data *data) 471int nfs_write_need_commit(struct nfs_write_data *data)
474{ 472{
475 return data->verf.committed != NFS_FILE_SYNC; 473 if (data->verf.committed == NFS_DATA_SYNC)
474 return data->lseg == NULL;
475 else
476 return data->verf.committed != NFS_FILE_SYNC;
476} 477}
477 478
478static inline 479static inline
479int nfs_reschedule_unstable_write(struct nfs_page *req) 480int nfs_reschedule_unstable_write(struct nfs_page *req,
481 struct nfs_write_data *data)
480{ 482{
481 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) { 483 if (test_and_clear_bit(PG_NEED_COMMIT, &req->wb_flags)) {
482 nfs_mark_request_commit(req); 484 nfs_mark_request_commit(req, data->lseg);
483 return 1; 485 return 1;
484 } 486 }
485 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { 487 if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) {
@@ -490,7 +492,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req)
490} 492}
491#else 493#else
492static inline void 494static inline void
493nfs_mark_request_commit(struct nfs_page *req) 495nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg)
494{ 496{
495} 497}
496 498
@@ -507,7 +509,8 @@ int nfs_write_need_commit(struct nfs_write_data *data)
507} 509}
508 510
509static inline 511static inline
510int nfs_reschedule_unstable_write(struct nfs_page *req) 512int nfs_reschedule_unstable_write(struct nfs_page *req,
513 struct nfs_write_data *data)
511{ 514{
512 return 0; 515 return 0;
513} 516}
@@ -539,11 +542,15 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, u
539 if (!nfs_need_commit(nfsi)) 542 if (!nfs_need_commit(nfsi))
540 return 0; 543 return 0;
541 544
545 spin_lock(&inode->i_lock);
542 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT); 546 ret = nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
543 if (ret > 0) 547 if (ret > 0)
544 nfsi->ncommit -= ret; 548 nfsi->ncommit -= ret;
549 spin_unlock(&inode->i_lock);
550
545 if (nfs_need_commit(NFS_I(inode))) 551 if (nfs_need_commit(NFS_I(inode)))
546 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 552 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
553
547 return ret; 554 return ret;
548} 555}
549#else 556#else
@@ -610,9 +617,11 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
610 } 617 }
611 618
612 if (nfs_clear_request_commit(req) && 619 if (nfs_clear_request_commit(req) &&
613 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, 620 radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree,
614 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) 621 req->wb_index, NFS_PAGE_TAG_COMMIT) != NULL) {
615 NFS_I(inode)->ncommit--; 622 NFS_I(inode)->ncommit--;
623 pnfs_clear_request_commit(req);
624 }
616 625
617 /* Okay, the request matches. Update the region */ 626 /* Okay, the request matches. Update the region */
618 if (offset < req->wb_offset) { 627 if (offset < req->wb_offset) {
@@ -760,11 +769,12 @@ int nfs_updatepage(struct file *file, struct page *page,
760 return status; 769 return status;
761} 770}
762 771
763static void nfs_writepage_release(struct nfs_page *req) 772static void nfs_writepage_release(struct nfs_page *req,
773 struct nfs_write_data *data)
764{ 774{
765 struct page *page = req->wb_page; 775 struct page *page = req->wb_page;
766 776
767 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) 777 if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, data))
768 nfs_inode_remove_request(req); 778 nfs_inode_remove_request(req);
769 nfs_clear_page_tag_locked(req); 779 nfs_clear_page_tag_locked(req);
770 nfs_end_page_writeback(page); 780 nfs_end_page_writeback(page);
@@ -781,25 +791,21 @@ static int flush_task_priority(int how)
781 return RPC_PRIORITY_NORMAL; 791 return RPC_PRIORITY_NORMAL;
782} 792}
783 793
784/* 794int nfs_initiate_write(struct nfs_write_data *data,
785 * Set up the argument/result storage required for the RPC call. 795 struct rpc_clnt *clnt,
786 */ 796 const struct rpc_call_ops *call_ops,
787static int nfs_write_rpcsetup(struct nfs_page *req, 797 int how)
788 struct nfs_write_data *data,
789 const struct rpc_call_ops *call_ops,
790 unsigned int count, unsigned int offset,
791 int how)
792{ 798{
793 struct inode *inode = req->wb_context->path.dentry->d_inode; 799 struct inode *inode = data->inode;
794 int priority = flush_task_priority(how); 800 int priority = flush_task_priority(how);
795 struct rpc_task *task; 801 struct rpc_task *task;
796 struct rpc_message msg = { 802 struct rpc_message msg = {
797 .rpc_argp = &data->args, 803 .rpc_argp = &data->args,
798 .rpc_resp = &data->res, 804 .rpc_resp = &data->res,
799 .rpc_cred = req->wb_context->cred, 805 .rpc_cred = data->cred,
800 }; 806 };
801 struct rpc_task_setup task_setup_data = { 807 struct rpc_task_setup task_setup_data = {
802 .rpc_client = NFS_CLIENT(inode), 808 .rpc_client = clnt,
803 .task = &data->task, 809 .task = &data->task,
804 .rpc_message = &msg, 810 .rpc_message = &msg,
805 .callback_ops = call_ops, 811 .callback_ops = call_ops,
@@ -810,12 +816,52 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
810 }; 816 };
811 int ret = 0; 817 int ret = 0;
812 818
819 /* Set up the initial task struct. */
820 NFS_PROTO(inode)->write_setup(data, &msg);
821
822 dprintk("NFS: %5u initiated write call "
823 "(req %s/%lld, %u bytes @ offset %llu)\n",
824 data->task.tk_pid,
825 inode->i_sb->s_id,
826 (long long)NFS_FILEID(inode),
827 data->args.count,
828 (unsigned long long)data->args.offset);
829
830 task = rpc_run_task(&task_setup_data);
831 if (IS_ERR(task)) {
832 ret = PTR_ERR(task);
833 goto out;
834 }
835 if (how & FLUSH_SYNC) {
836 ret = rpc_wait_for_completion_task(task);
837 if (ret == 0)
838 ret = task->tk_status;
839 }
840 rpc_put_task(task);
841out:
842 return ret;
843}
844EXPORT_SYMBOL_GPL(nfs_initiate_write);
845
846/*
847 * Set up the argument/result storage required for the RPC call.
848 */
849static int nfs_write_rpcsetup(struct nfs_page *req,
850 struct nfs_write_data *data,
851 const struct rpc_call_ops *call_ops,
852 unsigned int count, unsigned int offset,
853 struct pnfs_layout_segment *lseg,
854 int how)
855{
856 struct inode *inode = req->wb_context->path.dentry->d_inode;
857
813 /* Set up the RPC argument and reply structs 858 /* Set up the RPC argument and reply structs
814 * NB: take care not to mess about with data->commit et al. */ 859 * NB: take care not to mess about with data->commit et al. */
815 860
816 data->req = req; 861 data->req = req;
817 data->inode = inode = req->wb_context->path.dentry->d_inode; 862 data->inode = inode = req->wb_context->path.dentry->d_inode;
818 data->cred = msg.rpc_cred; 863 data->cred = req->wb_context->cred;
864 data->lseg = get_lseg(lseg);
819 865
820 data->args.fh = NFS_FH(inode); 866 data->args.fh = NFS_FH(inode);
821 data->args.offset = req_offset(req) + offset; 867 data->args.offset = req_offset(req) + offset;
@@ -825,7 +871,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
825 data->args.context = get_nfs_open_context(req->wb_context); 871 data->args.context = get_nfs_open_context(req->wb_context);
826 data->args.lock_context = req->wb_lock_context; 872 data->args.lock_context = req->wb_lock_context;
827 data->args.stable = NFS_UNSTABLE; 873 data->args.stable = NFS_UNSTABLE;
828 if (how & FLUSH_STABLE) { 874 if (how & (FLUSH_STABLE | FLUSH_COND_STABLE)) {
829 data->args.stable = NFS_DATA_SYNC; 875 data->args.stable = NFS_DATA_SYNC;
830 if (!nfs_need_commit(NFS_I(inode))) 876 if (!nfs_need_commit(NFS_I(inode)))
831 data->args.stable = NFS_FILE_SYNC; 877 data->args.stable = NFS_FILE_SYNC;
@@ -836,30 +882,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
836 data->res.verf = &data->verf; 882 data->res.verf = &data->verf;
837 nfs_fattr_init(&data->fattr); 883 nfs_fattr_init(&data->fattr);
838 884
839 /* Set up the initial task struct. */ 885 if (data->lseg &&
840 NFS_PROTO(inode)->write_setup(data, &msg); 886 (pnfs_try_to_write_data(data, call_ops, how) == PNFS_ATTEMPTED))
841 887 return 0;
842 dprintk("NFS: %5u initiated write call "
843 "(req %s/%lld, %u bytes @ offset %llu)\n",
844 data->task.tk_pid,
845 inode->i_sb->s_id,
846 (long long)NFS_FILEID(inode),
847 count,
848 (unsigned long long)data->args.offset);
849 888
850 task = rpc_run_task(&task_setup_data); 889 return nfs_initiate_write(data, NFS_CLIENT(inode), call_ops, how);
851 if (IS_ERR(task)) {
852 ret = PTR_ERR(task);
853 goto out;
854 }
855 if (how & FLUSH_SYNC) {
856 ret = rpc_wait_for_completion_task(task);
857 if (ret == 0)
858 ret = task->tk_status;
859 }
860 rpc_put_task(task);
861out:
862 return ret;
863} 890}
864 891
865/* If a nfs_flush_* function fails, it should remove reqs from @head and 892/* If a nfs_flush_* function fails, it should remove reqs from @head and
@@ -879,20 +906,27 @@ static void nfs_redirty_request(struct nfs_page *req)
879 * Generate multiple small requests to write out a single 906 * Generate multiple small requests to write out a single
880 * contiguous dirty area on one page. 907 * contiguous dirty area on one page.
881 */ 908 */
882static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 909static int nfs_flush_multi(struct nfs_pageio_descriptor *desc)
883{ 910{
884 struct nfs_page *req = nfs_list_entry(head->next); 911 struct nfs_page *req = nfs_list_entry(desc->pg_list.next);
885 struct page *page = req->wb_page; 912 struct page *page = req->wb_page;
886 struct nfs_write_data *data; 913 struct nfs_write_data *data;
887 size_t wsize = NFS_SERVER(inode)->wsize, nbytes; 914 size_t wsize = NFS_SERVER(desc->pg_inode)->wsize, nbytes;
888 unsigned int offset; 915 unsigned int offset;
889 int requests = 0; 916 int requests = 0;
890 int ret = 0; 917 int ret = 0;
918 struct pnfs_layout_segment *lseg;
891 LIST_HEAD(list); 919 LIST_HEAD(list);
892 920
893 nfs_list_remove_request(req); 921 nfs_list_remove_request(req);
894 922
895 nbytes = count; 923 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
924 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit ||
925 desc->pg_count > wsize))
926 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
927
928
929 nbytes = desc->pg_count;
896 do { 930 do {
897 size_t len = min(nbytes, wsize); 931 size_t len = min(nbytes, wsize);
898 932
@@ -905,9 +939,11 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
905 } while (nbytes != 0); 939 } while (nbytes != 0);
906 atomic_set(&req->wb_complete, requests); 940 atomic_set(&req->wb_complete, requests);
907 941
942 BUG_ON(desc->pg_lseg);
943 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
908 ClearPageError(page); 944 ClearPageError(page);
909 offset = 0; 945 offset = 0;
910 nbytes = count; 946 nbytes = desc->pg_count;
911 do { 947 do {
912 int ret2; 948 int ret2;
913 949
@@ -919,13 +955,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned
919 if (nbytes < wsize) 955 if (nbytes < wsize)
920 wsize = nbytes; 956 wsize = nbytes;
921 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, 957 ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops,
922 wsize, offset, how); 958 wsize, offset, lseg, desc->pg_ioflags);
923 if (ret == 0) 959 if (ret == 0)
924 ret = ret2; 960 ret = ret2;
925 offset += wsize; 961 offset += wsize;
926 nbytes -= wsize; 962 nbytes -= wsize;
927 } while (nbytes != 0); 963 } while (nbytes != 0);
928 964
965 put_lseg(lseg);
966 desc->pg_lseg = NULL;
929 return ret; 967 return ret;
930 968
931out_bad: 969out_bad:
@@ -946,16 +984,26 @@ out_bad:
946 * This is the case if nfs_updatepage detects a conflicting request 984 * This is the case if nfs_updatepage detects a conflicting request
947 * that has been written but not committed. 985 * that has been written but not committed.
948 */ 986 */
949static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned int npages, size_t count, int how) 987static int nfs_flush_one(struct nfs_pageio_descriptor *desc)
950{ 988{
951 struct nfs_page *req; 989 struct nfs_page *req;
952 struct page **pages; 990 struct page **pages;
953 struct nfs_write_data *data; 991 struct nfs_write_data *data;
992 struct list_head *head = &desc->pg_list;
993 struct pnfs_layout_segment *lseg = desc->pg_lseg;
994 int ret;
954 995
955 data = nfs_writedata_alloc(npages); 996 data = nfs_writedata_alloc(nfs_page_array_len(desc->pg_base,
956 if (!data) 997 desc->pg_count));
957 goto out_bad; 998 if (!data) {
958 999 while (!list_empty(head)) {
1000 req = nfs_list_entry(head->next);
1001 nfs_list_remove_request(req);
1002 nfs_redirty_request(req);
1003 }
1004 ret = -ENOMEM;
1005 goto out;
1006 }
959 pages = data->pagevec; 1007 pages = data->pagevec;
960 while (!list_empty(head)) { 1008 while (!list_empty(head)) {
961 req = nfs_list_entry(head->next); 1009 req = nfs_list_entry(head->next);
@@ -965,16 +1013,19 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i
965 *pages++ = req->wb_page; 1013 *pages++ = req->wb_page;
966 } 1014 }
967 req = nfs_list_entry(data->pages.next); 1015 req = nfs_list_entry(data->pages.next);
1016 if ((!lseg) && list_is_singular(&data->pages))
1017 lseg = pnfs_update_layout(desc->pg_inode, req->wb_context, IOMODE_RW);
1018
1019 if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
1020 (desc->pg_moreio || NFS_I(desc->pg_inode)->ncommit))
1021 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
968 1022
969 /* Set up the argument struct */ 1023 /* Set up the argument struct */
970 return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); 1024 ret = nfs_write_rpcsetup(req, data, &nfs_write_full_ops, desc->pg_count, 0, lseg, desc->pg_ioflags);
971 out_bad: 1025out:
972 while (!list_empty(head)) { 1026 put_lseg(lseg); /* Cleans any gotten in ->pg_test */
973 req = nfs_list_entry(head->next); 1027 desc->pg_lseg = NULL;
974 nfs_list_remove_request(req); 1028 return ret;
975 nfs_redirty_request(req);
976 }
977 return -ENOMEM;
978} 1029}
979 1030
980static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, 1031static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -982,6 +1033,8 @@ static void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
982{ 1033{
983 size_t wsize = NFS_SERVER(inode)->wsize; 1034 size_t wsize = NFS_SERVER(inode)->wsize;
984 1035
1036 pnfs_pageio_init_write(pgio, inode);
1037
985 if (wsize < PAGE_CACHE_SIZE) 1038 if (wsize < PAGE_CACHE_SIZE)
986 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags); 1039 nfs_pageio_init(pgio, inode, nfs_flush_multi, wsize, ioflags);
987 else 1040 else
@@ -1039,7 +1092,7 @@ static void nfs_writeback_release_partial(void *calldata)
1039 1092
1040out: 1093out:
1041 if (atomic_dec_and_test(&req->wb_complete)) 1094 if (atomic_dec_and_test(&req->wb_complete))
1042 nfs_writepage_release(req); 1095 nfs_writepage_release(req, data);
1043 nfs_writedata_release(calldata); 1096 nfs_writedata_release(calldata);
1044} 1097}
1045 1098
@@ -1106,7 +1159,7 @@ static void nfs_writeback_release_full(void *calldata)
1106 1159
1107 if (nfs_write_need_commit(data)) { 1160 if (nfs_write_need_commit(data)) {
1108 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); 1161 memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf));
1109 nfs_mark_request_commit(req); 1162 nfs_mark_request_commit(req, data->lseg);
1110 dprintk(" marked for commit\n"); 1163 dprintk(" marked for commit\n");
1111 goto next; 1164 goto next;
1112 } 1165 }
@@ -1132,7 +1185,7 @@ static const struct rpc_call_ops nfs_write_full_ops = {
1132/* 1185/*
1133 * This function is called when the WRITE call is complete. 1186 * This function is called when the WRITE call is complete.
1134 */ 1187 */
1135int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) 1188void nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1136{ 1189{
1137 struct nfs_writeargs *argp = &data->args; 1190 struct nfs_writeargs *argp = &data->args;
1138 struct nfs_writeres *resp = &data->res; 1191 struct nfs_writeres *resp = &data->res;
@@ -1151,7 +1204,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1151 */ 1204 */
1152 status = NFS_PROTO(data->inode)->write_done(task, data); 1205 status = NFS_PROTO(data->inode)->write_done(task, data);
1153 if (status != 0) 1206 if (status != 0)
1154 return status; 1207 return;
1155 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count); 1208 nfs_add_stats(data->inode, NFSIOS_SERVERWRITTENBYTES, resp->count);
1156 1209
1157#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1210#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
@@ -1166,6 +1219,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1166 */ 1219 */
1167 static unsigned long complain; 1220 static unsigned long complain;
1168 1221
1222 /* Note this will print the MDS for a DS write */
1169 if (time_before(complain, jiffies)) { 1223 if (time_before(complain, jiffies)) {
1170 dprintk("NFS: faulty NFS server %s:" 1224 dprintk("NFS: faulty NFS server %s:"
1171 " (committed = %d) != (stable = %d)\n", 1225 " (committed = %d) != (stable = %d)\n",
@@ -1186,6 +1240,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1186 /* Was this an NFSv2 write or an NFSv3 stable write? */ 1240 /* Was this an NFSv2 write or an NFSv3 stable write? */
1187 if (resp->verf->committed != NFS_UNSTABLE) { 1241 if (resp->verf->committed != NFS_UNSTABLE) {
1188 /* Resend from where the server left off */ 1242 /* Resend from where the server left off */
1243 data->mds_offset += resp->count;
1189 argp->offset += resp->count; 1244 argp->offset += resp->count;
1190 argp->pgbase += resp->count; 1245 argp->pgbase += resp->count;
1191 argp->count -= resp->count; 1246 argp->count -= resp->count;
@@ -1196,7 +1251,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1196 argp->stable = NFS_FILE_SYNC; 1251 argp->stable = NFS_FILE_SYNC;
1197 } 1252 }
1198 nfs_restart_rpc(task, server->nfs_client); 1253 nfs_restart_rpc(task, server->nfs_client);
1199 return -EAGAIN; 1254 return;
1200 } 1255 }
1201 if (time_before(complain, jiffies)) { 1256 if (time_before(complain, jiffies)) {
1202 printk(KERN_WARNING 1257 printk(KERN_WARNING
@@ -1207,64 +1262,89 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data)
1207 /* Can't do anything about it except throw an error. */ 1262 /* Can't do anything about it except throw an error. */
1208 task->tk_status = -EIO; 1263 task->tk_status = -EIO;
1209 } 1264 }
1210 return 0; 1265 return;
1211} 1266}
1212 1267
1213 1268
1214#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) 1269#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
1215static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) 1270static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
1216{ 1271{
1272 int ret;
1273
1217 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) 1274 if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
1218 return 1; 1275 return 1;
1219 if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, 1276 if (!may_wait)
1220 NFS_INO_COMMIT, nfs_wait_bit_killable, 1277 return 0;
1221 TASK_KILLABLE)) 1278 ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
1222 return 1; 1279 NFS_INO_COMMIT,
1223 return 0; 1280 nfs_wait_bit_killable,
1281 TASK_KILLABLE);
1282 return (ret < 0) ? ret : 1;
1224} 1283}
1225 1284
1226static void nfs_commit_clear_lock(struct nfs_inode *nfsi) 1285void nfs_commit_clear_lock(struct nfs_inode *nfsi)
1227{ 1286{
1228 clear_bit(NFS_INO_COMMIT, &nfsi->flags); 1287 clear_bit(NFS_INO_COMMIT, &nfsi->flags);
1229 smp_mb__after_clear_bit(); 1288 smp_mb__after_clear_bit();
1230 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); 1289 wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
1231} 1290}
1291EXPORT_SYMBOL_GPL(nfs_commit_clear_lock);
1232 1292
1233 1293void nfs_commitdata_release(void *data)
1234static void nfs_commitdata_release(void *data)
1235{ 1294{
1236 struct nfs_write_data *wdata = data; 1295 struct nfs_write_data *wdata = data;
1237 1296
1297 put_lseg(wdata->lseg);
1238 put_nfs_open_context(wdata->args.context); 1298 put_nfs_open_context(wdata->args.context);
1239 nfs_commit_free(wdata); 1299 nfs_commit_free(wdata);
1240} 1300}
1301EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1241 1302
1242/* 1303int nfs_initiate_commit(struct nfs_write_data *data, struct rpc_clnt *clnt,
1243 * Set up the argument/result storage required for the RPC call. 1304 const struct rpc_call_ops *call_ops,
1244 */ 1305 int how)
1245static int nfs_commit_rpcsetup(struct list_head *head,
1246 struct nfs_write_data *data,
1247 int how)
1248{ 1306{
1249 struct nfs_page *first = nfs_list_entry(head->next);
1250 struct inode *inode = first->wb_context->path.dentry->d_inode;
1251 int priority = flush_task_priority(how);
1252 struct rpc_task *task; 1307 struct rpc_task *task;
1308 int priority = flush_task_priority(how);
1253 struct rpc_message msg = { 1309 struct rpc_message msg = {
1254 .rpc_argp = &data->args, 1310 .rpc_argp = &data->args,
1255 .rpc_resp = &data->res, 1311 .rpc_resp = &data->res,
1256 .rpc_cred = first->wb_context->cred, 1312 .rpc_cred = data->cred,
1257 }; 1313 };
1258 struct rpc_task_setup task_setup_data = { 1314 struct rpc_task_setup task_setup_data = {
1259 .task = &data->task, 1315 .task = &data->task,
1260 .rpc_client = NFS_CLIENT(inode), 1316 .rpc_client = clnt,
1261 .rpc_message = &msg, 1317 .rpc_message = &msg,
1262 .callback_ops = &nfs_commit_ops, 1318 .callback_ops = call_ops,
1263 .callback_data = data, 1319 .callback_data = data,
1264 .workqueue = nfsiod_workqueue, 1320 .workqueue = nfsiod_workqueue,
1265 .flags = RPC_TASK_ASYNC, 1321 .flags = RPC_TASK_ASYNC,
1266 .priority = priority, 1322 .priority = priority,
1267 }; 1323 };
1324 /* Set up the initial task struct. */
1325 NFS_PROTO(data->inode)->commit_setup(data, &msg);
1326
1327 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1328
1329 task = rpc_run_task(&task_setup_data);
1330 if (IS_ERR(task))
1331 return PTR_ERR(task);
1332 if (how & FLUSH_SYNC)
1333 rpc_wait_for_completion_task(task);
1334 rpc_put_task(task);
1335 return 0;
1336}
1337EXPORT_SYMBOL_GPL(nfs_initiate_commit);
1338
1339/*
1340 * Set up the argument/result storage required for the RPC call.
1341 */
1342void nfs_init_commit(struct nfs_write_data *data,
1343 struct list_head *head,
1344 struct pnfs_layout_segment *lseg)
1345{
1346 struct nfs_page *first = nfs_list_entry(head->next);
1347 struct inode *inode = first->wb_context->path.dentry->d_inode;
1268 1348
1269 /* Set up the RPC argument and reply structs 1349 /* Set up the RPC argument and reply structs
1270 * NB: take care not to mess about with data->commit et al. */ 1350 * NB: take care not to mess about with data->commit et al. */
@@ -1272,7 +1352,9 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1272 list_splice_init(head, &data->pages); 1352 list_splice_init(head, &data->pages);
1273 1353
1274 data->inode = inode; 1354 data->inode = inode;
1275 data->cred = msg.rpc_cred; 1355 data->cred = first->wb_context->cred;
1356 data->lseg = lseg; /* reference transferred */
1357 data->mds_ops = &nfs_commit_ops;
1276 1358
1277 data->args.fh = NFS_FH(data->inode); 1359 data->args.fh = NFS_FH(data->inode);
1278 /* Note: we always request a commit of the entire inode */ 1360 /* Note: we always request a commit of the entire inode */
@@ -1283,18 +1365,25 @@ static int nfs_commit_rpcsetup(struct list_head *head,
1283 data->res.fattr = &data->fattr; 1365 data->res.fattr = &data->fattr;
1284 data->res.verf = &data->verf; 1366 data->res.verf = &data->verf;
1285 nfs_fattr_init(&data->fattr); 1367 nfs_fattr_init(&data->fattr);
1368}
1369EXPORT_SYMBOL_GPL(nfs_init_commit);
1286 1370
1287 /* Set up the initial task struct. */ 1371void nfs_retry_commit(struct list_head *page_list,
1288 NFS_PROTO(inode)->commit_setup(data, &msg); 1372 struct pnfs_layout_segment *lseg)
1289 1373{
1290 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1374 struct nfs_page *req;
1291 1375
1292 task = rpc_run_task(&task_setup_data); 1376 while (!list_empty(page_list)) {
1293 if (IS_ERR(task)) 1377 req = nfs_list_entry(page_list->next);
1294 return PTR_ERR(task); 1378 nfs_list_remove_request(req);
1295 rpc_put_task(task); 1379 nfs_mark_request_commit(req, lseg);
1296 return 0; 1380 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1381 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1382 BDI_RECLAIMABLE);
1383 nfs_clear_page_tag_locked(req);
1384 }
1297} 1385}
1386EXPORT_SYMBOL_GPL(nfs_retry_commit);
1298 1387
1299/* 1388/*
1300 * Commit dirty pages 1389 * Commit dirty pages
@@ -1303,7 +1392,6 @@ static int
1303nfs_commit_list(struct inode *inode, struct list_head *head, int how) 1392nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1304{ 1393{
1305 struct nfs_write_data *data; 1394 struct nfs_write_data *data;
1306 struct nfs_page *req;
1307 1395
1308 data = nfs_commitdata_alloc(); 1396 data = nfs_commitdata_alloc();
1309 1397
@@ -1311,17 +1399,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how)
1311 goto out_bad; 1399 goto out_bad;
1312 1400
1313 /* Set up the argument struct */ 1401 /* Set up the argument struct */
1314 return nfs_commit_rpcsetup(head, data, how); 1402 nfs_init_commit(data, head, NULL);
1403 return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, how);
1315 out_bad: 1404 out_bad:
1316 while (!list_empty(head)) { 1405 nfs_retry_commit(head, NULL);
1317 req = nfs_list_entry(head->next);
1318 nfs_list_remove_request(req);
1319 nfs_mark_request_commit(req);
1320 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1321 dec_bdi_stat(req->wb_page->mapping->backing_dev_info,
1322 BDI_RECLAIMABLE);
1323 nfs_clear_page_tag_locked(req);
1324 }
1325 nfs_commit_clear_lock(NFS_I(inode)); 1406 nfs_commit_clear_lock(NFS_I(inode));
1326 return -ENOMEM; 1407 return -ENOMEM;
1327} 1408}
@@ -1341,10 +1422,9 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
1341 return; 1422 return;
1342} 1423}
1343 1424
1344static void nfs_commit_release(void *calldata) 1425void nfs_commit_release_pages(struct nfs_write_data *data)
1345{ 1426{
1346 struct nfs_write_data *data = calldata; 1427 struct nfs_page *req;
1347 struct nfs_page *req;
1348 int status = data->task.tk_status; 1428 int status = data->task.tk_status;
1349 1429
1350 while (!list_empty(&data->pages)) { 1430 while (!list_empty(&data->pages)) {
@@ -1378,6 +1458,14 @@ static void nfs_commit_release(void *calldata)
1378 next: 1458 next:
1379 nfs_clear_page_tag_locked(req); 1459 nfs_clear_page_tag_locked(req);
1380 } 1460 }
1461}
1462EXPORT_SYMBOL_GPL(nfs_commit_release_pages);
1463
1464static void nfs_commit_release(void *calldata)
1465{
1466 struct nfs_write_data *data = calldata;
1467
1468 nfs_commit_release_pages(data);
1381 nfs_commit_clear_lock(NFS_I(data->inode)); 1469 nfs_commit_clear_lock(NFS_I(data->inode));
1382 nfs_commitdata_release(calldata); 1470 nfs_commitdata_release(calldata);
1383} 1471}
@@ -1394,23 +1482,28 @@ int nfs_commit_inode(struct inode *inode, int how)
1394{ 1482{
1395 LIST_HEAD(head); 1483 LIST_HEAD(head);
1396 int may_wait = how & FLUSH_SYNC; 1484 int may_wait = how & FLUSH_SYNC;
1397 int res = 0; 1485 int res;
1398 1486
1399 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1487 res = nfs_commit_set_lock(NFS_I(inode), may_wait);
1488 if (res <= 0)
1400 goto out_mark_dirty; 1489 goto out_mark_dirty;
1401 spin_lock(&inode->i_lock);
1402 res = nfs_scan_commit(inode, &head, 0, 0); 1490 res = nfs_scan_commit(inode, &head, 0, 0);
1403 spin_unlock(&inode->i_lock);
1404 if (res) { 1491 if (res) {
1405 int error = nfs_commit_list(inode, &head, how); 1492 int error;
1493
1494 error = pnfs_commit_list(inode, &head, how);
1495 if (error == PNFS_NOT_ATTEMPTED)
1496 error = nfs_commit_list(inode, &head, how);
1406 if (error < 0) 1497 if (error < 0)
1407 return error; 1498 return error;
1408 if (may_wait) 1499 if (!may_wait)
1409 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1410 nfs_wait_bit_killable,
1411 TASK_KILLABLE);
1412 else
1413 goto out_mark_dirty; 1500 goto out_mark_dirty;
1501 error = wait_on_bit(&NFS_I(inode)->flags,
1502 NFS_INO_COMMIT,
1503 nfs_wait_bit_killable,
1504 TASK_KILLABLE);
1505 if (error < 0)
1506 return error;
1414 } else 1507 } else
1415 nfs_commit_clear_lock(NFS_I(inode)); 1508 nfs_commit_clear_lock(NFS_I(inode));
1416 return res; 1509 return res;
@@ -1464,7 +1557,22 @@ static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_contr
1464 1557
1465int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) 1558int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1466{ 1559{
1467 return nfs_commit_unstable_pages(inode, wbc); 1560 int ret;
1561
1562 ret = nfs_commit_unstable_pages(inode, wbc);
1563 if (ret >= 0 && test_bit(NFS_INO_LAYOUTCOMMIT, &NFS_I(inode)->flags)) {
1564 int status;
1565 bool sync = true;
1566
1567 if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
1568 wbc->for_background)
1569 sync = false;
1570
1571 status = pnfs_layoutcommit_inode(inode, sync);
1572 if (status < 0)
1573 return status;
1574 }
1575 return ret;
1468} 1576}
1469 1577
1470/* 1578/*
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 84c27d69d421..6940439bd609 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -117,7 +117,6 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
117 * invoked in contexts where a memory allocation failure is 117 * invoked in contexts where a memory allocation failure is
118 * fatal. Fortunately this fake ACL is small enough to 118 * fatal. Fortunately this fake ACL is small enough to
119 * construct on the stack. */ 119 * construct on the stack. */
120 memset(acl2, 0, sizeof(acl2));
121 posix_acl_init(acl2, 4); 120 posix_acl_init(acl2, 4);
122 121
123 /* Insert entries in canonical order: other orders seem 122 /* Insert entries in canonical order: other orders seem
@@ -174,7 +173,7 @@ xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
174 return -EINVAL; 173 return -EINVAL;
175 break; 174 break;
176 case ACL_MASK: 175 case ACL_MASK:
177 /* Solaris sometimes sets additonal bits in the mask */ 176 /* Solaris sometimes sets additional bits in the mask */
178 entry->e_perm &= S_IRWXO; 177 entry->e_perm &= S_IRWXO;
179 break; 178 break;
180 default: 179 default:
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242ddd..124e8fcb0dd6 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
22 22
23static struct file *do_open(char *name, int flags) 23static struct file *do_open(char *name, int flags)
24{ 24{
25 struct nameidata nd;
26 struct vfsmount *mnt; 25 struct vfsmount *mnt;
27 int error; 26 struct file *file;
28 27
29 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL); 28 mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
30 if (IS_ERR(mnt)) 29 if (IS_ERR(mnt))
31 return (struct file *)mnt; 30 return (struct file *)mnt;
32 31
33 error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd); 32 file = file_open_root(mnt->mnt_root, mnt, name, flags);
34 mntput(mnt); /* drop do_kern_mount reference */
35 if (error)
36 return ERR_PTR(error);
37
38 if (flags == O_RDWR)
39 error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
40 else
41 error = may_open(&nd.path, MAY_WRITE, flags);
42 33
43 if (!error) 34 mntput(mnt); /* drop do_kern_mount reference */
44 return dentry_open(nd.path.dentry, nd.path.mnt, flags, 35 return file;
45 current_cred());
46
47 path_put(&nd.path);
48 return ERR_PTR(error);
49} 36}
50 37
51static struct { 38static struct {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 8b31e5f8795d..ad000aeb21a2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -299,7 +299,6 @@ svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old)
299 299
300#define EXPORT_HASHBITS 8 300#define EXPORT_HASHBITS 8
301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS) 301#define EXPORT_HASHMAX (1<< EXPORT_HASHBITS)
302#define EXPORT_HASHMASK (EXPORT_HASHMAX -1)
303 302
304static struct cache_head *export_table[EXPORT_HASHMAX]; 303static struct cache_head *export_table[EXPORT_HASHMAX];
305 304
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 0c6d81670137..7c831a2731fa 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -38,7 +38,6 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
38 exp_readlock(); 38 exp_readlock();
39 nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); 39 nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
40 fh_put(&fh); 40 fh_put(&fh);
41 rqstp->rq_client = NULL;
42 exp_readunlock(); 41 exp_readunlock();
43 /* We return nlm error codes as nlm doesn't know 42 /* We return nlm error codes as nlm doesn't know
44 * about nfsd, but nfsd does know about nlm.. 43 * about nfsd, but nfsd does know about nlm..
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 7e84a852cdae..ad48faca20fc 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -702,7 +702,7 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
702 *p++ = htonl(resp->eof); 702 *p++ = htonl(resp->eof);
703 *p++ = htonl(resp->count); /* xdr opaque count */ 703 *p++ = htonl(resp->count); /* xdr opaque count */
704 xdr_ressize_check(rqstp, p); 704 xdr_ressize_check(rqstp, p);
705 /* now update rqstp->rq_res to reflect data aswell */ 705 /* now update rqstp->rq_res to reflect data as well */
706 rqstp->rq_res.page_len = resp->count; 706 rqstp->rq_res.page_len = resp->count;
707 if (resp->count & 3) { 707 if (resp->count & 3) {
708 /* need to pad the tail */ 708 /* need to pad the tail */
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3be975e18919..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
432 * If the server returns different values for sessionID, slotID or 432 * If the server returns different values for sessionID, slotID or
433 * sequence number, the server is looney tunes. 433 * sequence number, the server is looney tunes.
434 */ 434 */
435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4); 435 p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
436 if (unlikely(p == NULL)) 436 if (unlikely(p == NULL))
437 goto out_overflow; 437 goto out_overflow;
438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN); 438 memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
@@ -484,7 +484,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
484out: 484out:
485 return status; 485 return status;
486out_default: 486out_default:
487 return nfs_cb_stat_to_errno(status); 487 return nfs_cb_stat_to_errno(nfserr);
488} 488}
489 489
490/* 490/*
@@ -564,11 +564,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
564 if (unlikely(status)) 564 if (unlikely(status))
565 goto out; 565 goto out;
566 if (unlikely(nfserr != NFS4_OK)) 566 if (unlikely(nfserr != NFS4_OK))
567 goto out_default; 567 status = nfs_cb_stat_to_errno(nfserr);
568out: 568out:
569 return status; 569 return status;
570out_default:
571 return nfs_cb_stat_to_errno(status);
572} 570}
573 571
574/* 572/*
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 6d2c397d458b..55780a22fdbd 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -63,7 +63,6 @@ struct ent {
63 63
64#define ENT_HASHBITS 8 64#define ENT_HASHBITS 8
65#define ENT_HASHMAX (1 << ENT_HASHBITS) 65#define ENT_HASHMAX (1 << ENT_HASHBITS)
66#define ENT_HASHMASK (ENT_HASHMAX - 1)
67 66
68static void 67static void
69ent_init(struct cache_head *cnew, struct cache_head *citm) 68ent_init(struct cache_head *cnew, struct cache_head *citm)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index db52546143d1..5fcb1396a7e3 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -984,8 +984,8 @@ typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
984 void *); 984 void *);
985enum nfsd4_op_flags { 985enum nfsd4_op_flags {
986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ 986 ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */
987 ALLOWED_ON_ABSENT_FS = 2 << 0, /* ops processed on absent fs */ 987 ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */
988 ALLOWED_AS_FIRST_OP = 3 << 0, /* ops reqired first in compound */ 988 ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */
989}; 989};
990 990
991struct nfsd4_operation { 991struct nfsd4_operation {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d98d0213285d..aa309aa93fe8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -148,7 +148,7 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE];
148/* hash table for nfs4_file */ 148/* hash table for nfs4_file */
149#define FILE_HASH_BITS 8 149#define FILE_HASH_BITS 8
150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) 150#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
151#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) 151
152/* hash table for (open)nfs4_stateid */ 152/* hash table for (open)nfs4_stateid */
153#define STATEID_HASH_BITS 10 153#define STATEID_HASH_BITS 10
154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) 154#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS)
@@ -230,9 +230,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
230 dp->dl_client = clp; 230 dp->dl_client = clp;
231 get_nfs4_file(fp); 231 get_nfs4_file(fp);
232 dp->dl_file = fp; 232 dp->dl_file = fp;
233 dp->dl_vfs_file = find_readable_file(fp);
234 get_file(dp->dl_vfs_file);
235 dp->dl_flock = NULL;
236 dp->dl_type = type; 233 dp->dl_type = type;
237 dp->dl_stateid.si_boot = boot_time; 234 dp->dl_stateid.si_boot = boot_time;
238 dp->dl_stateid.si_stateownerid = current_delegid++; 235 dp->dl_stateid.si_stateownerid = current_delegid++;
@@ -241,8 +238,6 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
241 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle); 238 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
242 dp->dl_time = 0; 239 dp->dl_time = 0;
243 atomic_set(&dp->dl_count, 1); 240 atomic_set(&dp->dl_count, 1);
244 list_add(&dp->dl_perfile, &fp->fi_delegations);
245 list_add(&dp->dl_perclnt, &clp->cl_delegations);
246 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc); 241 INIT_WORK(&dp->dl_recall.cb_work, nfsd4_do_callback_rpc);
247 return dp; 242 return dp;
248} 243}
@@ -253,36 +248,30 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
253 if (atomic_dec_and_test(&dp->dl_count)) { 248 if (atomic_dec_and_test(&dp->dl_count)) {
254 dprintk("NFSD: freeing dp %p\n",dp); 249 dprintk("NFSD: freeing dp %p\n",dp);
255 put_nfs4_file(dp->dl_file); 250 put_nfs4_file(dp->dl_file);
256 fput(dp->dl_vfs_file);
257 kmem_cache_free(deleg_slab, dp); 251 kmem_cache_free(deleg_slab, dp);
258 num_delegations--; 252 num_delegations--;
259 } 253 }
260} 254}
261 255
262/* Remove the associated file_lock first, then remove the delegation. 256static void nfs4_put_deleg_lease(struct nfs4_file *fp)
263 * lease_modify() is called to remove the FS_LEASE file_lock from
264 * the i_flock list, eventually calling nfsd's lock_manager
265 * fl_release_callback.
266 */
267static void
268nfs4_close_delegation(struct nfs4_delegation *dp)
269{ 257{
270 dprintk("NFSD: close_delegation dp %p\n",dp); 258 if (atomic_dec_and_test(&fp->fi_delegees)) {
271 /* XXX: do we even need this check?: */ 259 vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease);
272 if (dp->dl_flock) 260 fp->fi_lease = NULL;
273 vfs_setlease(dp->dl_vfs_file, F_UNLCK, &dp->dl_flock); 261 fp->fi_deleg_file = NULL;
262 }
274} 263}
275 264
276/* Called under the state lock. */ 265/* Called under the state lock. */
277static void 266static void
278unhash_delegation(struct nfs4_delegation *dp) 267unhash_delegation(struct nfs4_delegation *dp)
279{ 268{
280 list_del_init(&dp->dl_perfile);
281 list_del_init(&dp->dl_perclnt); 269 list_del_init(&dp->dl_perclnt);
282 spin_lock(&recall_lock); 270 spin_lock(&recall_lock);
271 list_del_init(&dp->dl_perfile);
283 list_del_init(&dp->dl_recall_lru); 272 list_del_init(&dp->dl_recall_lru);
284 spin_unlock(&recall_lock); 273 spin_unlock(&recall_lock);
285 nfs4_close_delegation(dp); 274 nfs4_put_deleg_lease(dp->dl_file);
286 nfs4_put_delegation(dp); 275 nfs4_put_delegation(dp);
287} 276}
288 277
@@ -327,64 +316,6 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
327static struct list_head client_lru; 316static struct list_head client_lru;
328static struct list_head close_lru; 317static struct list_head close_lru;
329 318
330static void unhash_generic_stateid(struct nfs4_stateid *stp)
331{
332 list_del(&stp->st_hash);
333 list_del(&stp->st_perfile);
334 list_del(&stp->st_perstateowner);
335}
336
337static void free_generic_stateid(struct nfs4_stateid *stp)
338{
339 put_nfs4_file(stp->st_file);
340 kmem_cache_free(stateid_slab, stp);
341}
342
343static void release_lock_stateid(struct nfs4_stateid *stp)
344{
345 struct file *file;
346
347 unhash_generic_stateid(stp);
348 file = find_any_file(stp->st_file);
349 if (file)
350 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
351 free_generic_stateid(stp);
352}
353
354static void unhash_lockowner(struct nfs4_stateowner *sop)
355{
356 struct nfs4_stateid *stp;
357
358 list_del(&sop->so_idhash);
359 list_del(&sop->so_strhash);
360 list_del(&sop->so_perstateid);
361 while (!list_empty(&sop->so_stateids)) {
362 stp = list_first_entry(&sop->so_stateids,
363 struct nfs4_stateid, st_perstateowner);
364 release_lock_stateid(stp);
365 }
366}
367
368static void release_lockowner(struct nfs4_stateowner *sop)
369{
370 unhash_lockowner(sop);
371 nfs4_put_stateowner(sop);
372}
373
374static void
375release_stateid_lockowners(struct nfs4_stateid *open_stp)
376{
377 struct nfs4_stateowner *lock_sop;
378
379 while (!list_empty(&open_stp->st_lockowners)) {
380 lock_sop = list_entry(open_stp->st_lockowners.next,
381 struct nfs4_stateowner, so_perstateid);
382 /* list_del(&open_stp->st_lockowners); */
383 BUG_ON(lock_sop->so_is_open_owner);
384 release_lockowner(lock_sop);
385 }
386}
387
388/* 319/*
389 * We store the NONE, READ, WRITE, and BOTH bits separately in the 320 * We store the NONE, READ, WRITE, and BOTH bits separately in the
390 * st_{access,deny}_bmap field of the stateid, in order to track not 321 * st_{access,deny}_bmap field of the stateid, in order to track not
@@ -457,13 +388,74 @@ static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp)
457 return nfs4_access_to_omode(access); 388 return nfs4_access_to_omode(access);
458} 389}
459 390
460static void release_open_stateid(struct nfs4_stateid *stp) 391static void unhash_generic_stateid(struct nfs4_stateid *stp)
392{
393 list_del(&stp->st_hash);
394 list_del(&stp->st_perfile);
395 list_del(&stp->st_perstateowner);
396}
397
398static void free_generic_stateid(struct nfs4_stateid *stp)
399{
400 int oflag;
401
402 if (stp->st_access_bmap) {
403 oflag = nfs4_access_bmap_to_omode(stp);
404 nfs4_file_put_access(stp->st_file, oflag);
405 put_nfs4_file(stp->st_file);
406 }
407 kmem_cache_free(stateid_slab, stp);
408}
409
410static void release_lock_stateid(struct nfs4_stateid *stp)
411{
412 struct file *file;
413
414 unhash_generic_stateid(stp);
415 file = find_any_file(stp->st_file);
416 if (file)
417 locks_remove_posix(file, (fl_owner_t)stp->st_stateowner);
418 free_generic_stateid(stp);
419}
420
421static void unhash_lockowner(struct nfs4_stateowner *sop)
422{
423 struct nfs4_stateid *stp;
424
425 list_del(&sop->so_idhash);
426 list_del(&sop->so_strhash);
427 list_del(&sop->so_perstateid);
428 while (!list_empty(&sop->so_stateids)) {
429 stp = list_first_entry(&sop->so_stateids,
430 struct nfs4_stateid, st_perstateowner);
431 release_lock_stateid(stp);
432 }
433}
434
435static void release_lockowner(struct nfs4_stateowner *sop)
436{
437 unhash_lockowner(sop);
438 nfs4_put_stateowner(sop);
439}
440
441static void
442release_stateid_lockowners(struct nfs4_stateid *open_stp)
461{ 443{
462 int oflag = nfs4_access_bmap_to_omode(stp); 444 struct nfs4_stateowner *lock_sop;
463 445
446 while (!list_empty(&open_stp->st_lockowners)) {
447 lock_sop = list_entry(open_stp->st_lockowners.next,
448 struct nfs4_stateowner, so_perstateid);
449 /* list_del(&open_stp->st_lockowners); */
450 BUG_ON(lock_sop->so_is_open_owner);
451 release_lockowner(lock_sop);
452 }
453}
454
455static void release_open_stateid(struct nfs4_stateid *stp)
456{
464 unhash_generic_stateid(stp); 457 unhash_generic_stateid(stp);
465 release_stateid_lockowners(stp); 458 release_stateid_lockowners(stp);
466 nfs4_file_put_access(stp->st_file, oflag);
467 free_generic_stateid(stp); 459 free_generic_stateid(stp);
468} 460}
469 461
@@ -619,7 +611,8 @@ static void init_forechannel_attrs(struct nfsd4_channel_attrs *new, struct nfsd4
619 u32 maxrpc = nfsd_serv->sv_max_mesg; 611 u32 maxrpc = nfsd_serv->sv_max_mesg;
620 612
621 new->maxreqs = numslots; 613 new->maxreqs = numslots;
622 new->maxresp_cached = slotsize + NFSD_MIN_HDR_SEQ_SZ; 614 new->maxresp_cached = min_t(u32, req->maxresp_cached,
615 slotsize + NFSD_MIN_HDR_SEQ_SZ);
623 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc); 616 new->maxreq_sz = min_t(u32, req->maxreq_sz, maxrpc);
624 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc); 617 new->maxresp_sz = min_t(u32, req->maxresp_sz, maxrpc);
625 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND); 618 new->maxops = min_t(u32, req->maxops, NFSD_MAX_OPS_PER_COMPOUND);
@@ -958,8 +951,6 @@ expire_client(struct nfs4_client *clp)
958 spin_lock(&recall_lock); 951 spin_lock(&recall_lock);
959 while (!list_empty(&clp->cl_delegations)) { 952 while (!list_empty(&clp->cl_delegations)) {
960 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); 953 dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
961 dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
962 dp->dl_flock);
963 list_del_init(&dp->dl_perclnt); 954 list_del_init(&dp->dl_perclnt);
964 list_move(&dp->dl_recall_lru, &reaplist); 955 list_move(&dp->dl_recall_lru, &reaplist);
965 } 956 }
@@ -2078,6 +2069,7 @@ alloc_init_file(struct inode *ino)
2078 fp->fi_inode = igrab(ino); 2069 fp->fi_inode = igrab(ino);
2079 fp->fi_id = current_fileid++; 2070 fp->fi_id = current_fileid++;
2080 fp->fi_had_conflict = false; 2071 fp->fi_had_conflict = false;
2072 fp->fi_lease = NULL;
2081 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 2073 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
2082 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 2074 memset(fp->fi_access, 0, sizeof(fp->fi_access));
2083 spin_lock(&recall_lock); 2075 spin_lock(&recall_lock);
@@ -2329,23 +2321,8 @@ nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access)
2329 nfs4_file_put_access(fp, O_RDONLY); 2321 nfs4_file_put_access(fp, O_RDONLY);
2330} 2322}
2331 2323
2332/* 2324static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
2333 * Spawn a thread to perform a recall on the delegation represented
2334 * by the lease (file_lock)
2335 *
2336 * Called from break_lease() with lock_flocks() held.
2337 * Note: we assume break_lease will only call this *once* for any given
2338 * lease.
2339 */
2340static
2341void nfsd_break_deleg_cb(struct file_lock *fl)
2342{ 2325{
2343 struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
2344
2345 dprintk("NFSD nfsd_break_deleg_cb: dp %p fl %p\n",dp,fl);
2346 if (!dp)
2347 return;
2348
2349 /* We're assuming the state code never drops its reference 2326 /* We're assuming the state code never drops its reference
2350 * without first removing the lease. Since we're in this lease 2327 * without first removing the lease. Since we're in this lease
2351 * callback (and since the lease code is serialized by the kernel 2328 * callback (and since the lease code is serialized by the kernel
@@ -2353,22 +2330,35 @@ void nfsd_break_deleg_cb(struct file_lock *fl)
2353 * it's safe to take a reference: */ 2330 * it's safe to take a reference: */
2354 atomic_inc(&dp->dl_count); 2331 atomic_inc(&dp->dl_count);
2355 2332
2356 spin_lock(&recall_lock);
2357 list_add_tail(&dp->dl_recall_lru, &del_recall_lru); 2333 list_add_tail(&dp->dl_recall_lru, &del_recall_lru);
2358 spin_unlock(&recall_lock);
2359 2334
2360 /* only place dl_time is set. protected by lock_flocks*/ 2335 /* only place dl_time is set. protected by lock_flocks*/
2361 dp->dl_time = get_seconds(); 2336 dp->dl_time = get_seconds();
2362 2337
2338 nfsd4_cb_recall(dp);
2339}
2340
2341/* Called from break_lease() with lock_flocks() held. */
2342static void nfsd_break_deleg_cb(struct file_lock *fl)
2343{
2344 struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner;
2345 struct nfs4_delegation *dp;
2346
2347 BUG_ON(!fp);
2348 /* We assume break_lease is only called once per lease: */
2349 BUG_ON(fp->fi_had_conflict);
2363 /* 2350 /*
2364 * We don't want the locks code to timeout the lease for us; 2351 * We don't want the locks code to timeout the lease for us;
2365 * we'll remove it ourself if the delegation isn't returned 2352 * we'll remove it ourself if a delegation isn't returned
2366 * in time. 2353 * in time:
2367 */ 2354 */
2368 fl->fl_break_time = 0; 2355 fl->fl_break_time = 0;
2369 2356
2370 dp->dl_file->fi_had_conflict = true; 2357 spin_lock(&recall_lock);
2371 nfsd4_cb_recall(dp); 2358 fp->fi_had_conflict = true;
2359 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2360 nfsd_break_one_deleg(dp);
2361 spin_unlock(&recall_lock);
2372} 2362}
2373 2363
2374static 2364static
@@ -2461,10 +2451,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
2461{ 2451{
2462 struct nfs4_delegation *dp; 2452 struct nfs4_delegation *dp;
2463 2453
2464 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) { 2454 spin_lock(&recall_lock);
2465 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) 2455 list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
2456 if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
2457 spin_unlock(&recall_lock);
2466 return dp; 2458 return dp;
2467 } 2459 }
2460 spin_unlock(&recall_lock);
2468 return NULL; 2461 return NULL;
2469} 2462}
2470 2463
@@ -2641,6 +2634,66 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
2641 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN; 2634 return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
2642} 2635}
2643 2636
2637static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, int flag)
2638{
2639 struct file_lock *fl;
2640
2641 fl = locks_alloc_lock();
2642 if (!fl)
2643 return NULL;
2644 locks_init_lock(fl);
2645 fl->fl_lmops = &nfsd_lease_mng_ops;
2646 fl->fl_flags = FL_LEASE;
2647 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2648 fl->fl_end = OFFSET_MAX;
2649 fl->fl_owner = (fl_owner_t)(dp->dl_file);
2650 fl->fl_pid = current->tgid;
2651 return fl;
2652}
2653
2654static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
2655{
2656 struct nfs4_file *fp = dp->dl_file;
2657 struct file_lock *fl;
2658 int status;
2659
2660 fl = nfs4_alloc_init_lease(dp, flag);
2661 if (!fl)
2662 return -ENOMEM;
2663 fl->fl_file = find_readable_file(fp);
2664 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2665 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl);
2666 if (status) {
2667 list_del_init(&dp->dl_perclnt);
2668 locks_free_lock(fl);
2669 return -ENOMEM;
2670 }
2671 fp->fi_lease = fl;
2672 fp->fi_deleg_file = fl->fl_file;
2673 get_file(fp->fi_deleg_file);
2674 atomic_set(&fp->fi_delegees, 1);
2675 list_add(&dp->dl_perfile, &fp->fi_delegations);
2676 return 0;
2677}
2678
2679static int nfs4_set_delegation(struct nfs4_delegation *dp, int flag)
2680{
2681 struct nfs4_file *fp = dp->dl_file;
2682
2683 if (!fp->fi_lease)
2684 return nfs4_setlease(dp, flag);
2685 spin_lock(&recall_lock);
2686 if (fp->fi_had_conflict) {
2687 spin_unlock(&recall_lock);
2688 return -EAGAIN;
2689 }
2690 atomic_inc(&fp->fi_delegees);
2691 list_add(&dp->dl_perfile, &fp->fi_delegations);
2692 spin_unlock(&recall_lock);
2693 list_add(&dp->dl_perclnt, &dp->dl_client->cl_delegations);
2694 return 0;
2695}
2696
2644/* 2697/*
2645 * Attempt to hand out a delegation. 2698 * Attempt to hand out a delegation.
2646 */ 2699 */
@@ -2650,7 +2703,6 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2650 struct nfs4_delegation *dp; 2703 struct nfs4_delegation *dp;
2651 struct nfs4_stateowner *sop = stp->st_stateowner; 2704 struct nfs4_stateowner *sop = stp->st_stateowner;
2652 int cb_up; 2705 int cb_up;
2653 struct file_lock *fl;
2654 int status, flag = 0; 2706 int status, flag = 0;
2655 2707
2656 cb_up = nfsd4_cb_channel_good(sop->so_client); 2708 cb_up = nfsd4_cb_channel_good(sop->so_client);
@@ -2681,36 +2733,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
2681 } 2733 }
2682 2734
2683 dp = alloc_init_deleg(sop->so_client, stp, fh, flag); 2735 dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
2684 if (dp == NULL) { 2736 if (dp == NULL)
2685 flag = NFS4_OPEN_DELEGATE_NONE; 2737 goto out_no_deleg;
2686 goto out; 2738 status = nfs4_set_delegation(dp, flag);
2687 } 2739 if (status)
2688 status = -ENOMEM; 2740 goto out_free;
2689 fl = locks_alloc_lock();
2690 if (!fl)
2691 goto out;
2692 locks_init_lock(fl);
2693 fl->fl_lmops = &nfsd_lease_mng_ops;
2694 fl->fl_flags = FL_LEASE;
2695 fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
2696 fl->fl_end = OFFSET_MAX;
2697 fl->fl_owner = (fl_owner_t)dp;
2698 fl->fl_file = find_readable_file(stp->st_file);
2699 BUG_ON(!fl->fl_file);
2700 fl->fl_pid = current->tgid;
2701 dp->dl_flock = fl;
2702
2703 /* vfs_setlease checks to see if delegation should be handed out.
2704 * the lock_manager callback fl_change is used
2705 */
2706 if ((status = vfs_setlease(fl->fl_file, fl->fl_type, &fl))) {
2707 dprintk("NFSD: setlease failed [%d], no delegation\n", status);
2708 dp->dl_flock = NULL;
2709 locks_free_lock(fl);
2710 unhash_delegation(dp);
2711 flag = NFS4_OPEN_DELEGATE_NONE;
2712 goto out;
2713 }
2714 2741
2715 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid)); 2742 memcpy(&open->op_delegate_stateid, &dp->dl_stateid, sizeof(dp->dl_stateid));
2716 2743
@@ -2722,6 +2749,12 @@ out:
2722 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) 2749 && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
2723 dprintk("NFSD: WARNING: refusing delegation reclaim\n"); 2750 dprintk("NFSD: WARNING: refusing delegation reclaim\n");
2724 open->op_delegate_type = flag; 2751 open->op_delegate_type = flag;
2752 return;
2753out_free:
2754 nfs4_put_delegation(dp);
2755out_no_deleg:
2756 flag = NFS4_OPEN_DELEGATE_NONE;
2757 goto out;
2725} 2758}
2726 2759
2727/* 2760/*
@@ -2916,8 +2949,6 @@ nfs4_laundromat(void)
2916 test_val = u; 2949 test_val = u;
2917 break; 2950 break;
2918 } 2951 }
2919 dprintk("NFSD: purging unused delegation dp %p, fp %p\n",
2920 dp, dp->dl_flock);
2921 list_move(&dp->dl_recall_lru, &reaplist); 2952 list_move(&dp->dl_recall_lru, &reaplist);
2922 } 2953 }
2923 spin_unlock(&recall_lock); 2954 spin_unlock(&recall_lock);
@@ -3027,7 +3058,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
3027 if (ONE_STATEID(stateid) && (flags & RD_STATE)) 3058 if (ONE_STATEID(stateid) && (flags & RD_STATE))
3028 return nfs_ok; 3059 return nfs_ok;
3029 else if (locks_in_grace()) { 3060 else if (locks_in_grace()) {
3030 /* Answer in remaining cases depends on existance of 3061 /* Answer in remaining cases depends on existence of
3031 * conflicting state; so we must wait out the grace period. */ 3062 * conflicting state; so we must wait out the grace period. */
3032 return nfserr_grace; 3063 return nfserr_grace;
3033 } else if (flags & WR_STATE) 3064 } else if (flags & WR_STATE)
@@ -3128,7 +3159,7 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
3128 goto out; 3159 goto out;
3129 renew_client(dp->dl_client); 3160 renew_client(dp->dl_client);
3130 if (filpp) { 3161 if (filpp) {
3131 *filpp = find_readable_file(dp->dl_file); 3162 *filpp = dp->dl_file->fi_deleg_file;
3132 BUG_ON(!*filpp); 3163 BUG_ON(!*filpp);
3133 } 3164 }
3134 } else { /* open or lock stateid */ 3165 } else { /* open or lock stateid */
@@ -3647,7 +3678,7 @@ find_lockstateowner_str(struct inode *inode, clientid_t *clid,
3647/* 3678/*
3648 * Alloc a lock owner structure. 3679 * Alloc a lock owner structure.
3649 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has 3680 * Called in nfsd4_lock - therefore, OPEN and OPEN_CONFIRM (if needed) has
3650 * occured. 3681 * occurred.
3651 * 3682 *
3652 * strhashval = lock_ownerstr_hashval 3683 * strhashval = lock_ownerstr_hashval
3653 */ 3684 */
@@ -3708,6 +3739,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
3708 stp->st_stateid.si_stateownerid = sop->so_id; 3739 stp->st_stateid.si_stateownerid = sop->so_id;
3709 stp->st_stateid.si_fileid = fp->fi_id; 3740 stp->st_stateid.si_fileid = fp->fi_id;
3710 stp->st_stateid.si_generation = 0; 3741 stp->st_stateid.si_generation = 0;
3742 stp->st_access_bmap = 0;
3711 stp->st_deny_bmap = open_stp->st_deny_bmap; 3743 stp->st_deny_bmap = open_stp->st_deny_bmap;
3712 stp->st_openstp = open_stp; 3744 stp->st_openstp = open_stp;
3713 3745
@@ -3722,6 +3754,17 @@ check_lock_length(u64 offset, u64 length)
3722 LOFF_OVERFLOW(offset, length))); 3754 LOFF_OVERFLOW(offset, length)));
3723} 3755}
3724 3756
3757static void get_lock_access(struct nfs4_stateid *lock_stp, u32 access)
3758{
3759 struct nfs4_file *fp = lock_stp->st_file;
3760 int oflag = nfs4_access_to_omode(access);
3761
3762 if (test_bit(access, &lock_stp->st_access_bmap))
3763 return;
3764 nfs4_file_get_access(fp, oflag);
3765 __set_bit(access, &lock_stp->st_access_bmap);
3766}
3767
3725/* 3768/*
3726 * LOCK operation 3769 * LOCK operation
3727 */ 3770 */
@@ -3738,7 +3781,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3738 struct file_lock conflock; 3781 struct file_lock conflock;
3739 __be32 status = 0; 3782 __be32 status = 0;
3740 unsigned int strhashval; 3783 unsigned int strhashval;
3741 unsigned int cmd;
3742 int err; 3784 int err;
3743 3785
3744 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n", 3786 dprintk("NFSD: nfsd4_lock: start=%Ld length=%Ld\n",
@@ -3820,22 +3862,18 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3820 switch (lock->lk_type) { 3862 switch (lock->lk_type) {
3821 case NFS4_READ_LT: 3863 case NFS4_READ_LT:
3822 case NFS4_READW_LT: 3864 case NFS4_READW_LT:
3823 if (find_readable_file(lock_stp->st_file)) { 3865 filp = find_readable_file(lock_stp->st_file);
3824 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ); 3866 if (filp)
3825 filp = find_readable_file(lock_stp->st_file); 3867 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
3826 }
3827 file_lock.fl_type = F_RDLCK; 3868 file_lock.fl_type = F_RDLCK;
3828 cmd = F_SETLK; 3869 break;
3829 break;
3830 case NFS4_WRITE_LT: 3870 case NFS4_WRITE_LT:
3831 case NFS4_WRITEW_LT: 3871 case NFS4_WRITEW_LT:
3832 if (find_writeable_file(lock_stp->st_file)) { 3872 filp = find_writeable_file(lock_stp->st_file);
3833 nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE); 3873 if (filp)
3834 filp = find_writeable_file(lock_stp->st_file); 3874 get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
3835 }
3836 file_lock.fl_type = F_WRLCK; 3875 file_lock.fl_type = F_WRLCK;
3837 cmd = F_SETLK; 3876 break;
3838 break;
3839 default: 3877 default:
3840 status = nfserr_inval; 3878 status = nfserr_inval;
3841 goto out; 3879 goto out;
@@ -3859,7 +3897,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
3859 * Note: locks.c uses the BKL to protect the inode's lock list. 3897 * Note: locks.c uses the BKL to protect the inode's lock list.
3860 */ 3898 */
3861 3899
3862 err = vfs_lock_file(filp, cmd, &file_lock, &conflock); 3900 err = vfs_lock_file(filp, F_SETLK, &file_lock, &conflock);
3863 switch (-err) { 3901 switch (-err) {
3864 case 0: /* success! */ 3902 case 0: /* success! */
3865 update_stateid(&lock_stp->st_stateid); 3903 update_stateid(&lock_stp->st_stateid);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 956629b9cdc9..c6766af00d98 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -317,8 +317,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
317 READ_BUF(dummy32); 317 READ_BUF(dummy32);
318 len += (XDR_QUADLEN(dummy32) << 2); 318 len += (XDR_QUADLEN(dummy32) << 2);
319 READMEM(buf, dummy32); 319 READMEM(buf, dummy32);
320 if ((host_err = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid))) 320 if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
321 goto out_nfserr; 321 return status;
322 iattr->ia_valid |= ATTR_UID; 322 iattr->ia_valid |= ATTR_UID;
323 } 323 }
324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) { 324 if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
@@ -328,8 +328,8 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
328 READ_BUF(dummy32); 328 READ_BUF(dummy32);
329 len += (XDR_QUADLEN(dummy32) << 2); 329 len += (XDR_QUADLEN(dummy32) << 2);
330 READMEM(buf, dummy32); 330 READMEM(buf, dummy32);
331 if ((host_err = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid))) 331 if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
332 goto out_nfserr; 332 return status;
333 iattr->ia_valid |= ATTR_GID; 333 iattr->ia_valid |= ATTR_GID;
334 } 334 }
335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) { 335 if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
@@ -1215,8 +1215,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1215 READ_BUF(4); 1215 READ_BUF(4);
1216 READ32(dummy); 1216 READ32(dummy);
1217 READ_BUF(dummy * 4); 1217 READ_BUF(dummy * 4);
1218 for (i = 0; i < dummy; ++i)
1219 READ32(dummy);
1220 break; 1218 break;
1221 case RPC_AUTH_GSS: 1219 case RPC_AUTH_GSS:
1222 dprintk("RPC_AUTH_GSS callback secflavor " 1220 dprintk("RPC_AUTH_GSS callback secflavor "
@@ -1232,7 +1230,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1232 READ_BUF(4); 1230 READ_BUF(4);
1233 READ32(dummy); 1231 READ32(dummy);
1234 READ_BUF(dummy); 1232 READ_BUF(dummy);
1235 p += XDR_QUADLEN(dummy);
1236 break; 1233 break;
1237 default: 1234 default:
1238 dprintk("Illegal callback secflavor\n"); 1235 dprintk("Illegal callback secflavor\n");
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 33b3e2b06779..1f5eae40f34e 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -12,13 +12,14 @@
12#include <linux/nfsd/syscall.h> 12#include <linux/nfsd/syscall.h>
13#include <linux/lockd/lockd.h> 13#include <linux/lockd/lockd.h>
14#include <linux/sunrpc/clnt.h> 14#include <linux/sunrpc/clnt.h>
15#include <linux/sunrpc/gss_api.h>
15 16
16#include "idmap.h" 17#include "idmap.h"
17#include "nfsd.h" 18#include "nfsd.h"
18#include "cache.h" 19#include "cache.h"
19 20
20/* 21/*
21 * We have a single directory with 9 nodes in it. 22 * We have a single directory with several nodes in it.
22 */ 23 */
23enum { 24enum {
24 NFSD_Root = 1, 25 NFSD_Root = 1,
@@ -42,6 +43,7 @@ enum {
42 NFSD_Versions, 43 NFSD_Versions,
43 NFSD_Ports, 44 NFSD_Ports,
44 NFSD_MaxBlkSize, 45 NFSD_MaxBlkSize,
46 NFSD_SupportedEnctypes,
45 /* 47 /*
46 * The below MUST come last. Otherwise we leave a hole in nfsd_files[] 48 * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
47 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops 49 * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
@@ -187,6 +189,34 @@ static struct file_operations export_features_operations = {
187 .release = single_release, 189 .release = single_release,
188}; 190};
189 191
192#ifdef CONFIG_SUNRPC_GSS
193static int supported_enctypes_show(struct seq_file *m, void *v)
194{
195 struct gss_api_mech *k5mech;
196
197 k5mech = gss_mech_get_by_name("krb5");
198 if (k5mech == NULL)
199 goto out;
200 if (k5mech->gm_upcall_enctypes != NULL)
201 seq_printf(m, k5mech->gm_upcall_enctypes);
202 gss_mech_put(k5mech);
203out:
204 return 0;
205}
206
207static int supported_enctypes_open(struct inode *inode, struct file *file)
208{
209 return single_open(file, supported_enctypes_show, NULL);
210}
211
212static struct file_operations supported_enctypes_ops = {
213 .open = supported_enctypes_open,
214 .read = seq_read,
215 .llseek = seq_lseek,
216 .release = single_release,
217};
218#endif /* CONFIG_SUNRPC_GSS */
219
190extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); 220extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
191extern int nfsd_pool_stats_release(struct inode *inode, struct file *file); 221extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
192 222
@@ -1397,6 +1427,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1397 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1427 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1398 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1428 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1399 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1429 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
1430#ifdef CONFIG_SUNRPC_GSS
1431 [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
1432#endif /* CONFIG_SUNRPC_GSS */
1400#ifdef CONFIG_NFSD_V4 1433#ifdef CONFIG_NFSD_V4
1401 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1434 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
1402 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, 1435 [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR},
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 4ce005dbf3e6..65ec595e2226 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -451,7 +451,7 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p,
451 *p++ = htonl(resp->count); 451 *p++ = htonl(resp->count);
452 xdr_ressize_check(rqstp, p); 452 xdr_ressize_check(rqstp, p);
453 453
454 /* now update rqstp->rq_res to reflect data aswell */ 454 /* now update rqstp->rq_res to reflect data as well */
455 rqstp->rq_res.page_len = resp->count; 455 rqstp->rq_res.page_len = resp->count;
456 if (resp->count & 3) { 456 if (resp->count & 3) {
457 /* need to pad the tail */ 457 /* need to pad the tail */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 3074656ba7bf..6bd2f3c21f2b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -83,8 +83,6 @@ struct nfs4_delegation {
83 atomic_t dl_count; /* ref count */ 83 atomic_t dl_count; /* ref count */
84 struct nfs4_client *dl_client; 84 struct nfs4_client *dl_client;
85 struct nfs4_file *dl_file; 85 struct nfs4_file *dl_file;
86 struct file *dl_vfs_file;
87 struct file_lock *dl_flock;
88 u32 dl_type; 86 u32 dl_type;
89 time_t dl_time; 87 time_t dl_time;
90/* For recall: */ 88/* For recall: */
@@ -369,16 +367,15 @@ struct nfs4_file {
369 struct list_head fi_delegations; 367 struct list_head fi_delegations;
370 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ 368 /* One each for O_RDONLY, O_WRONLY, O_RDWR: */
371 struct file * fi_fds[3]; 369 struct file * fi_fds[3];
372 /* One each for O_RDONLY, O_WRONLY: */
373 atomic_t fi_access[2];
374 /* 370 /*
375 * Each open stateid contributes 1 to either fi_readers or 371 * Each open or lock stateid contributes 1 to either
376 * fi_writers, or both, depending on the open mode. A 372 * fi_access[O_RDONLY], fi_access[O_WRONLY], or both, depending
377 * delegation also takes an fi_readers reference. Lock 373 * on open or lock mode:
378 * stateid's take none.
379 */ 374 */
380 atomic_t fi_readers; 375 atomic_t fi_access[2];
381 atomic_t fi_writers; 376 struct file *fi_deleg_file;
377 struct file_lock *fi_lease;
378 atomic_t fi_delegees;
382 struct inode *fi_inode; 379 struct inode *fi_inode;
383 u32 fi_id; /* used with stateowner->so_id 380 u32 fi_id; /* used with stateowner->so_id
384 * for stateid_hashtbl hash */ 381 * for stateid_hashtbl hash */
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 641117f2188d..2e1cebde90df 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -87,7 +87,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
87 .dentry = dget(dentry)}; 87 .dentry = dget(dentry)};
88 int err = 0; 88 int err = 0;
89 89
90 err = follow_down(&path, false); 90 err = follow_down(&path);
91 if (err < 0) 91 if (err < 0)
92 goto out; 92 goto out;
93 93
@@ -808,7 +808,7 @@ nfsd_get_raparms(dev_t dev, ino_t ino)
808 if (ra->p_count == 0) 808 if (ra->p_count == 0)
809 frap = rap; 809 frap = rap;
810 } 810 }
811 depth = nfsdstats.ra_size*11/10; 811 depth = nfsdstats.ra_size;
812 if (!frap) { 812 if (!frap) {
813 spin_unlock(&rab->pb_lock); 813 spin_unlock(&rab->pb_lock);
814 return NULL; 814 return NULL;
@@ -1744,6 +1744,11 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1744 host_err = nfsd_break_lease(odentry->d_inode); 1744 host_err = nfsd_break_lease(odentry->d_inode);
1745 if (host_err) 1745 if (host_err)
1746 goto out_drop_write; 1746 goto out_drop_write;
1747 if (ndentry->d_inode) {
1748 host_err = nfsd_break_lease(ndentry->d_inode);
1749 if (host_err)
1750 goto out_drop_write;
1751 }
1747 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1752 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1748 if (!host_err) { 1753 if (!host_err) {
1749 host_err = commit_metadata(tfhp); 1754 host_err = commit_metadata(tfhp);
@@ -1812,22 +1817,22 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1812 1817
1813 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); 1818 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1814 if (host_err) 1819 if (host_err)
1815 goto out_nfserr; 1820 goto out_put;
1816 1821
1817 host_err = nfsd_break_lease(rdentry->d_inode); 1822 host_err = nfsd_break_lease(rdentry->d_inode);
1818 if (host_err) 1823 if (host_err)
1819 goto out_put; 1824 goto out_drop_write;
1820 if (type != S_IFDIR) 1825 if (type != S_IFDIR)
1821 host_err = vfs_unlink(dirp, rdentry); 1826 host_err = vfs_unlink(dirp, rdentry);
1822 else 1827 else
1823 host_err = vfs_rmdir(dirp, rdentry); 1828 host_err = vfs_rmdir(dirp, rdentry);
1824out_put:
1825 dput(rdentry);
1826
1827 if (!host_err) 1829 if (!host_err)
1828 host_err = commit_metadata(fhp); 1830 host_err = commit_metadata(fhp);
1829 1831out_drop_write:
1830 mnt_drop_write(fhp->fh_export->ex_path.mnt); 1832 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1833out_put:
1834 dput(rdentry);
1835
1831out_nfserr: 1836out_nfserr:
1832 err = nfserrno(host_err); 1837 err = nfserrno(host_err);
1833out: 1838out:
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index d7fd696e595c..0a0a66d98cce 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -521,8 +521,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
521 group_offset, bitmap)) 521 group_offset, bitmap))
522 printk(KERN_WARNING "%s: entry number %llu already freed\n", 522 printk(KERN_WARNING "%s: entry number %llu already freed\n",
523 __func__, (unsigned long long)req->pr_entry_nr); 523 __func__, (unsigned long long)req->pr_entry_nr);
524 524 else
525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 525 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
526 526
527 kunmap(req->pr_bitmap_bh->b_page); 527 kunmap(req->pr_bitmap_bh->b_page);
528 kunmap(req->pr_desc_bh->b_page); 528 kunmap(req->pr_desc_bh->b_page);
@@ -558,8 +558,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
558 group_offset, bitmap)) 558 group_offset, bitmap))
559 printk(KERN_WARNING "%s: entry number %llu already freed\n", 559 printk(KERN_WARNING "%s: entry number %llu already freed\n",
560 __func__, (unsigned long long)req->pr_entry_nr); 560 __func__, (unsigned long long)req->pr_entry_nr);
561 561 else
562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); 562 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
563 563
564 kunmap(req->pr_bitmap_bh->b_page); 564 kunmap(req->pr_bitmap_bh->b_page);
565 kunmap(req->pr_desc_bh->b_page); 565 kunmap(req->pr_desc_bh->b_page);
@@ -665,7 +665,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
665 for (j = i, n = 0; 665 for (j = i, n = 0;
666 (j < nitems) && nilfs_palloc_group_is_in(inode, group, 666 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
667 entry_nrs[j]); 667 entry_nrs[j]);
668 j++, n++) { 668 j++) {
669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset); 669 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
670 if (!nilfs_clear_bit_atomic( 670 if (!nilfs_clear_bit_atomic(
671 nilfs_mdt_bgl_lock(inode, group), 671 nilfs_mdt_bgl_lock(inode, group),
@@ -674,6 +674,8 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
674 "%s: entry number %llu already freed\n", 674 "%s: entry number %llu already freed\n",
675 __func__, 675 __func__,
676 (unsigned long long)entry_nrs[j]); 676 (unsigned long long)entry_nrs[j]);
677 } else {
678 n++;
677 } 679 }
678 } 680 }
679 nilfs_palloc_group_desc_add_entries(inode, group, desc, n); 681 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 9af34a7e6e13..f5fde36b9e28 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -74,7 +74,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
74 74
75#define nilfs_set_bit_atomic ext2_set_bit_atomic 75#define nilfs_set_bit_atomic ext2_set_bit_atomic
76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic 76#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
77#define nilfs_find_next_zero_bit ext2_find_next_zero_bit 77#define nilfs_find_next_zero_bit find_next_zero_bit_le
78 78
79/* 79/*
80 * persistent object allocator cache 80 * persistent object allocator cache
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 3ee67c67cc52..4723f04e9b12 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -25,7 +25,6 @@
25#include <linux/errno.h> 25#include <linux/errno.h>
26#include "nilfs.h" 26#include "nilfs.h"
27#include "bmap.h" 27#include "bmap.h"
28#include "sb.h"
29#include "btree.h" 28#include "btree.h"
30#include "direct.h" 29#include "direct.h"
31#include "btnode.h" 30#include "btnode.h"
@@ -425,17 +424,6 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
425/* 424/*
426 * Internal use only 425 * Internal use only
427 */ 426 */
428
429void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
430{
431 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
432}
433
434void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
435{
436 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
437}
438
439__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, 427__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
440 const struct buffer_head *bh) 428 const struct buffer_head *bh)
441{ 429{
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index bde1c0aa2e15..40d9f453d31c 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -240,9 +240,6 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); 240__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); 241__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
242 242
243void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
244void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
245
246 243
247/* Assume that bmap semaphore is locked. */ 244/* Assume that bmap semaphore is locked. */
248static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap) 245static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..609cd223eea8 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -34,20 +34,10 @@
34#include "page.h" 34#include "page.h"
35#include "btnode.h" 35#include "btnode.h"
36 36
37
38void nilfs_btnode_cache_init_once(struct address_space *btnc)
39{
40 nilfs_mapping_init_once(btnc);
41}
42
43static const struct address_space_operations def_btnode_aops = {
44 .sync_page = block_sync_page,
45};
46
47void nilfs_btnode_cache_init(struct address_space *btnc, 37void nilfs_btnode_cache_init(struct address_space *btnc,
48 struct backing_dev_info *bdi) 38 struct backing_dev_info *bdi)
49{ 39{
50 nilfs_mapping_init(btnc, bdi, &def_btnode_aops); 40 nilfs_mapping_init(btnc, bdi);
51} 41}
52 42
53void nilfs_btnode_cache_clear(struct address_space *btnc) 43void nilfs_btnode_cache_clear(struct address_space *btnc)
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
37 struct buffer_head *newbh; 37 struct buffer_head *newbh;
38}; 38};
39 39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); 40void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
42void nilfs_btnode_cache_clear(struct address_space *); 41void nilfs_btnode_cache_clear(struct address_space *);
43struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, 42struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 300c2bc00c3f..d451ae0e0bf3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -1174,7 +1174,7 @@ static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr)
1174 if (ret < 0) 1174 if (ret < 0)
1175 goto out; 1175 goto out;
1176 nilfs_btree_commit_insert(btree, path, level, key, ptr); 1176 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1177 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1177 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1178 1178
1179 out: 1179 out:
1180 nilfs_btree_free_path(path); 1180 nilfs_btree_free_path(path);
@@ -1511,7 +1511,7 @@ static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key)
1511 if (ret < 0) 1511 if (ret < 0)
1512 goto out; 1512 goto out;
1513 nilfs_btree_commit_delete(btree, path, level, dat); 1513 nilfs_btree_commit_delete(btree, path, level, dat);
1514 nilfs_bmap_sub_blocks(btree, stats.bs_nblocks); 1514 nilfs_inode_sub_blocks(btree->b_inode, stats.bs_nblocks);
1515 1515
1516out: 1516out:
1517 nilfs_btree_free_path(path); 1517 nilfs_btree_free_path(path);
@@ -1776,7 +1776,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
1776 return ret; 1776 return ret;
1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, 1777 nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n,
1778 di, ni, bh); 1778 di, ni, bh);
1779 nilfs_bmap_add_blocks(btree, stats.bs_nblocks); 1779 nilfs_inode_add_blocks(btree->b_inode, stats.bs_nblocks);
1780 return 0; 1780 return 0;
1781} 1781}
1782 1782
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 9d45773b79e6..3a1923943b14 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -440,7 +440,6 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
440 nilfs_commit_chunk(page, mapping, from, to); 440 nilfs_commit_chunk(page, mapping, from, to);
441 nilfs_put_page(page); 441 nilfs_put_page(page);
442 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 442 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
443/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
444} 443}
445 444
446/* 445/*
@@ -531,7 +530,6 @@ got_it:
531 nilfs_set_de_type(de, inode); 530 nilfs_set_de_type(de, inode);
532 nilfs_commit_chunk(page, page->mapping, from, to); 531 nilfs_commit_chunk(page, page->mapping, from, to);
533 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 532 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
534/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
535 nilfs_mark_inode_dirty(dir); 533 nilfs_mark_inode_dirty(dir);
536 /* OFFSET_CACHE */ 534 /* OFFSET_CACHE */
537out_put: 535out_put:
@@ -579,7 +577,6 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
579 dir->inode = 0; 577 dir->inode = 0;
580 nilfs_commit_chunk(page, mapping, from, to); 578 nilfs_commit_chunk(page, mapping, from, to);
581 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 579 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
582/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
583out: 580out:
584 nilfs_put_page(page); 581 nilfs_put_page(page);
585 return err; 582 return err;
@@ -684,7 +681,7 @@ const struct file_operations nilfs_dir_operations = {
684 .readdir = nilfs_readdir, 681 .readdir = nilfs_readdir,
685 .unlocked_ioctl = nilfs_ioctl, 682 .unlocked_ioctl = nilfs_ioctl,
686#ifdef CONFIG_COMPAT 683#ifdef CONFIG_COMPAT
687 .compat_ioctl = nilfs_ioctl, 684 .compat_ioctl = nilfs_compat_ioctl,
688#endif /* CONFIG_COMPAT */ 685#endif /* CONFIG_COMPAT */
689 .fsync = nilfs_sync_file, 686 .fsync = nilfs_sync_file,
690 687
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 324d80c57518..82f4865e86dd 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -146,7 +146,7 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
146 if (NILFS_BMAP_USE_VBN(bmap)) 146 if (NILFS_BMAP_USE_VBN(bmap))
147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); 147 nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr);
148 148
149 nilfs_bmap_add_blocks(bmap, 1); 149 nilfs_inode_add_blocks(bmap->b_inode, 1);
150 } 150 }
151 return ret; 151 return ret;
152} 152}
@@ -168,7 +168,7 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
168 if (!ret) { 168 if (!ret) {
169 nilfs_bmap_commit_end_ptr(bmap, &req, dat); 169 nilfs_bmap_commit_end_ptr(bmap, &req, dat);
170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); 170 nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR);
171 nilfs_bmap_sub_blocks(bmap, 1); 171 nilfs_inode_sub_blocks(bmap->b_inode, 1);
172 } 172 }
173 return ret; 173 return ret;
174} 174}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 2f560c9fb808..397e73258631 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -59,7 +59,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
59 struct nilfs_transaction_info ti; 59 struct nilfs_transaction_info ti;
60 int ret; 60 int ret;
61 61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs))) 62 if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */ 63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64 64
65 lock_page(page); 65 lock_page(page);
@@ -72,10 +72,9 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
72 /* 72 /*
73 * check to see if the page is mapped already (no holes) 73 * check to see if the page is mapped already (no holes)
74 */ 74 */
75 if (PageMappedToDisk(page)) { 75 if (PageMappedToDisk(page))
76 unlock_page(page);
77 goto mapped; 76 goto mapped;
78 } 77
79 if (page_has_buffers(page)) { 78 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head; 79 struct buffer_head *bh, *head;
81 int fully_mapped = 1; 80 int fully_mapped = 1;
@@ -90,7 +89,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
90 89
91 if (fully_mapped) { 90 if (fully_mapped) {
92 SetPageMappedToDisk(page); 91 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped; 92 goto mapped;
95 } 93 }
96 } 94 }
@@ -105,16 +103,17 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
105 return VM_FAULT_SIGBUS; 103 return VM_FAULT_SIGBUS;
106 104
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block); 105 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) { 106 if (ret != VM_FAULT_LOCKED) {
109 nilfs_transaction_abort(inode->i_sb); 107 nilfs_transaction_abort(inode->i_sb);
110 return ret; 108 return ret;
111 } 109 }
110 nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
112 nilfs_transaction_commit(inode->i_sb); 111 nilfs_transaction_commit(inode->i_sb);
113 112
114 mapped: 113 mapped:
115 SetPageChecked(page); 114 SetPageChecked(page);
116 wait_on_page_writeback(page); 115 wait_on_page_writeback(page);
117 return 0; 116 return VM_FAULT_LOCKED;
118} 117}
119 118
120static const struct vm_operations_struct nilfs_file_vm_ops = { 119static const struct vm_operations_struct nilfs_file_vm_ops = {
@@ -142,7 +141,7 @@ const struct file_operations nilfs_file_operations = {
142 .aio_write = generic_file_aio_write, 141 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl, 142 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT 143#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl, 144 .compat_ioctl = nilfs_compat_ioctl,
146#endif /* CONFIG_COMPAT */ 145#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap, 146 .mmap = nilfs_file_mmap,
148 .open = generic_file_open, 147 .open = generic_file_open,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index caf9a6a3fb54..1c2a3e23f8b2 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -49,7 +49,6 @@
49#include "ifile.h" 49#include "ifile.h"
50 50
51static const struct address_space_operations def_gcinode_aops = { 51static const struct address_space_operations def_gcinode_aops = {
52 .sync_page = block_sync_page,
53}; 52};
54 53
55/* 54/*
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 2fd440d8d6b8..c0aa27490c02 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -41,6 +41,24 @@ struct nilfs_iget_args {
41 int for_gc; 41 int for_gc;
42}; 42};
43 43
44void nilfs_inode_add_blocks(struct inode *inode, int n)
45{
46 struct nilfs_root *root = NILFS_I(inode)->i_root;
47
48 inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
49 if (root)
50 atomic_add(n, &root->blocks_count);
51}
52
53void nilfs_inode_sub_blocks(struct inode *inode, int n)
54{
55 struct nilfs_root *root = NILFS_I(inode)->i_root;
56
57 inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
58 if (root)
59 atomic_sub(n, &root->blocks_count);
60}
61
44/** 62/**
45 * nilfs_get_block() - get a file block on the filesystem (callback function) 63 * nilfs_get_block() - get a file block on the filesystem (callback function)
46 * @inode - inode struct of the target file 64 * @inode - inode struct of the target file
@@ -262,7 +280,6 @@ nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
262const struct address_space_operations nilfs_aops = { 280const struct address_space_operations nilfs_aops = {
263 .writepage = nilfs_writepage, 281 .writepage = nilfs_writepage,
264 .readpage = nilfs_readpage, 282 .readpage = nilfs_readpage,
265 .sync_page = block_sync_page,
266 .writepages = nilfs_writepages, 283 .writepages = nilfs_writepages,
267 .set_page_dirty = nilfs_set_page_dirty, 284 .set_page_dirty = nilfs_set_page_dirty,
268 .readpages = nilfs_readpages, 285 .readpages = nilfs_readpages,
@@ -277,7 +294,7 @@ const struct address_space_operations nilfs_aops = {
277struct inode *nilfs_new_inode(struct inode *dir, int mode) 294struct inode *nilfs_new_inode(struct inode *dir, int mode)
278{ 295{
279 struct super_block *sb = dir->i_sb; 296 struct super_block *sb = dir->i_sb;
280 struct nilfs_sb_info *sbi = NILFS_SB(sb); 297 struct the_nilfs *nilfs = sb->s_fs_info;
281 struct inode *inode; 298 struct inode *inode;
282 struct nilfs_inode_info *ii; 299 struct nilfs_inode_info *ii;
283 struct nilfs_root *root; 300 struct nilfs_root *root;
@@ -315,19 +332,16 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
315 /* No lock is needed; iget() ensures it. */ 332 /* No lock is needed; iget() ensures it. */
316 } 333 }
317 334
318 ii->i_flags = NILFS_I(dir)->i_flags; 335 ii->i_flags = nilfs_mask_flags(
319 if (S_ISLNK(mode)) 336 mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
320 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
321 if (!S_ISDIR(mode))
322 ii->i_flags &= ~NILFS_DIRSYNC_FL;
323 337
324 /* ii->i_file_acl = 0; */ 338 /* ii->i_file_acl = 0; */
325 /* ii->i_dir_acl = 0; */ 339 /* ii->i_dir_acl = 0; */
326 ii->i_dir_start_lookup = 0; 340 ii->i_dir_start_lookup = 0;
327 nilfs_set_inode_flags(inode); 341 nilfs_set_inode_flags(inode);
328 spin_lock(&sbi->s_next_gen_lock); 342 spin_lock(&nilfs->ns_next_gen_lock);
329 inode->i_generation = sbi->s_next_generation++; 343 inode->i_generation = nilfs->ns_next_generation++;
330 spin_unlock(&sbi->s_next_gen_lock); 344 spin_unlock(&nilfs->ns_next_gen_lock);
331 insert_inode_hash(inode); 345 insert_inode_hash(inode);
332 346
333 err = nilfs_init_acl(inode, dir); 347 err = nilfs_init_acl(inode, dir);
@@ -359,17 +373,15 @@ void nilfs_set_inode_flags(struct inode *inode)
359 373
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | 374 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC); 375 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL) 376 if (flags & FS_SYNC_FL)
363 inode->i_flags |= S_SYNC; 377 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL) 378 if (flags & FS_APPEND_FL)
365 inode->i_flags |= S_APPEND; 379 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL) 380 if (flags & FS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE; 381 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE 382 if (flags & FS_NOATIME_FL)
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME; 383 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL) 384 if (flags & FS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC; 385 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping, 386 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); 387 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
@@ -420,7 +432,7 @@ static int __nilfs_read_inode(struct super_block *sb,
420 struct nilfs_root *root, unsigned long ino, 432 struct nilfs_root *root, unsigned long ino,
421 struct inode *inode) 433 struct inode *inode)
422{ 434{
423 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 435 struct the_nilfs *nilfs = sb->s_fs_info;
424 struct buffer_head *bh; 436 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode; 437 struct nilfs_inode *raw_inode;
426 int err; 438 int err;
@@ -707,6 +719,7 @@ void nilfs_evict_inode(struct inode *inode)
707 struct nilfs_transaction_info ti; 719 struct nilfs_transaction_info ti;
708 struct super_block *sb = inode->i_sb; 720 struct super_block *sb = inode->i_sb;
709 struct nilfs_inode_info *ii = NILFS_I(inode); 721 struct nilfs_inode_info *ii = NILFS_I(inode);
722 int ret;
710 723
711 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) { 724 if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
712 if (inode->i_data.nrpages) 725 if (inode->i_data.nrpages)
@@ -725,8 +738,9 @@ void nilfs_evict_inode(struct inode *inode)
725 nilfs_mark_inode_dirty(inode); 738 nilfs_mark_inode_dirty(inode);
726 end_writeback(inode); 739 end_writeback(inode);
727 740
728 nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino); 741 ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
729 atomic_dec(&ii->i_root->inodes_count); 742 if (!ret)
743 atomic_dec(&ii->i_root->inodes_count);
730 744
731 nilfs_clear_inode(inode); 745 nilfs_clear_inode(inode);
732 746
@@ -792,18 +806,18 @@ int nilfs_permission(struct inode *inode, int mask, unsigned int flags)
792 806
793int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) 807int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
794{ 808{
795 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 809 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
796 struct nilfs_inode_info *ii = NILFS_I(inode); 810 struct nilfs_inode_info *ii = NILFS_I(inode);
797 int err; 811 int err;
798 812
799 spin_lock(&sbi->s_inode_lock); 813 spin_lock(&nilfs->ns_inode_lock);
800 if (ii->i_bh == NULL) { 814 if (ii->i_bh == NULL) {
801 spin_unlock(&sbi->s_inode_lock); 815 spin_unlock(&nilfs->ns_inode_lock);
802 err = nilfs_ifile_get_inode_block(ii->i_root->ifile, 816 err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
803 inode->i_ino, pbh); 817 inode->i_ino, pbh);
804 if (unlikely(err)) 818 if (unlikely(err))
805 return err; 819 return err;
806 spin_lock(&sbi->s_inode_lock); 820 spin_lock(&nilfs->ns_inode_lock);
807 if (ii->i_bh == NULL) 821 if (ii->i_bh == NULL)
808 ii->i_bh = *pbh; 822 ii->i_bh = *pbh;
809 else { 823 else {
@@ -814,36 +828,36 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
814 *pbh = ii->i_bh; 828 *pbh = ii->i_bh;
815 829
816 get_bh(*pbh); 830 get_bh(*pbh);
817 spin_unlock(&sbi->s_inode_lock); 831 spin_unlock(&nilfs->ns_inode_lock);
818 return 0; 832 return 0;
819} 833}
820 834
821int nilfs_inode_dirty(struct inode *inode) 835int nilfs_inode_dirty(struct inode *inode)
822{ 836{
823 struct nilfs_inode_info *ii = NILFS_I(inode); 837 struct nilfs_inode_info *ii = NILFS_I(inode);
824 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb); 838 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
825 int ret = 0; 839 int ret = 0;
826 840
827 if (!list_empty(&ii->i_dirty)) { 841 if (!list_empty(&ii->i_dirty)) {
828 spin_lock(&sbi->s_inode_lock); 842 spin_lock(&nilfs->ns_inode_lock);
829 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) || 843 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
830 test_bit(NILFS_I_BUSY, &ii->i_state); 844 test_bit(NILFS_I_BUSY, &ii->i_state);
831 spin_unlock(&sbi->s_inode_lock); 845 spin_unlock(&nilfs->ns_inode_lock);
832 } 846 }
833 return ret; 847 return ret;
834} 848}
835 849
836int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) 850int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
837{ 851{
838 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
839 struct nilfs_inode_info *ii = NILFS_I(inode); 852 struct nilfs_inode_info *ii = NILFS_I(inode);
853 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
840 854
841 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks); 855 atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
842 856
843 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state)) 857 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
844 return 0; 858 return 0;
845 859
846 spin_lock(&sbi->s_inode_lock); 860 spin_lock(&nilfs->ns_inode_lock);
847 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 861 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
848 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 862 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
849 /* Because this routine may race with nilfs_dispose_list(), 863 /* Because this routine may race with nilfs_dispose_list(),
@@ -851,18 +865,18 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
851 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) { 865 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
852 /* This will happen when somebody is freeing 866 /* This will happen when somebody is freeing
853 this inode. */ 867 this inode. */
854 nilfs_warning(sbi->s_super, __func__, 868 nilfs_warning(inode->i_sb, __func__,
855 "cannot get inode (ino=%lu)\n", 869 "cannot get inode (ino=%lu)\n",
856 inode->i_ino); 870 inode->i_ino);
857 spin_unlock(&sbi->s_inode_lock); 871 spin_unlock(&nilfs->ns_inode_lock);
858 return -EINVAL; /* NILFS_I_DIRTY may remain for 872 return -EINVAL; /* NILFS_I_DIRTY may remain for
859 freeing inode */ 873 freeing inode */
860 } 874 }
861 list_del(&ii->i_dirty); 875 list_del(&ii->i_dirty);
862 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files); 876 list_add_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
863 set_bit(NILFS_I_QUEUED, &ii->i_state); 877 set_bit(NILFS_I_QUEUED, &ii->i_state);
864 } 878 }
865 spin_unlock(&sbi->s_inode_lock); 879 spin_unlock(&nilfs->ns_inode_lock);
866 return 0; 880 return 0;
867} 881}
868 882
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 496738963fdb..f2469ba6246b 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -26,7 +26,9 @@
26#include <linux/capability.h> /* capable() */ 26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */ 27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/vmalloc.h> 28#include <linux/vmalloc.h>
29#include <linux/compat.h> /* compat_ptr() */
29#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */ 30#include <linux/mount.h> /* mnt_want_write(), mnt_drop_write() */
31#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h> 32#include <linux/nilfs2_fs.h>
31#include "nilfs.h" 33#include "nilfs.h"
32#include "segment.h" 34#include "segment.h"
@@ -97,11 +99,74 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
97 return ret; 99 return ret;
98} 100}
99 101
102static int nilfs_ioctl_getflags(struct inode *inode, void __user *argp)
103{
104 unsigned int flags = NILFS_I(inode)->i_flags & FS_FL_USER_VISIBLE;
105
106 return put_user(flags, (int __user *)argp);
107}
108
109static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
110 void __user *argp)
111{
112 struct nilfs_transaction_info ti;
113 unsigned int flags, oldflags;
114 int ret;
115
116 if (!inode_owner_or_capable(inode))
117 return -EACCES;
118
119 if (get_user(flags, (int __user *)argp))
120 return -EFAULT;
121
122 ret = mnt_want_write(filp->f_path.mnt);
123 if (ret)
124 return ret;
125
126 flags = nilfs_mask_flags(inode->i_mode, flags);
127
128 mutex_lock(&inode->i_mutex);
129
130 oldflags = NILFS_I(inode)->i_flags;
131
132 /*
133 * The IMMUTABLE and APPEND_ONLY flags can only be changed by the
134 * relevant capability.
135 */
136 ret = -EPERM;
137 if (((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) &&
138 !capable(CAP_LINUX_IMMUTABLE))
139 goto out;
140
141 ret = nilfs_transaction_begin(inode->i_sb, &ti, 0);
142 if (ret)
143 goto out;
144
145 NILFS_I(inode)->i_flags = (oldflags & ~FS_FL_USER_MODIFIABLE) |
146 (flags & FS_FL_USER_MODIFIABLE);
147
148 nilfs_set_inode_flags(inode);
149 inode->i_ctime = CURRENT_TIME;
150 if (IS_SYNC(inode))
151 nilfs_set_transaction_flag(NILFS_TI_SYNC);
152
153 nilfs_mark_inode_dirty(inode);
154 ret = nilfs_transaction_commit(inode->i_sb);
155out:
156 mutex_unlock(&inode->i_mutex);
157 mnt_drop_write(filp->f_path.mnt);
158 return ret;
159}
160
161static int nilfs_ioctl_getversion(struct inode *inode, void __user *argp)
162{
163 return put_user(inode->i_generation, (int __user *)argp);
164}
165
100static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp, 166static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
101 unsigned int cmd, void __user *argp) 167 unsigned int cmd, void __user *argp)
102{ 168{
103 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 169 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
104 struct inode *cpfile = nilfs->ns_cpfile;
105 struct nilfs_transaction_info ti; 170 struct nilfs_transaction_info ti;
106 struct nilfs_cpmode cpmode; 171 struct nilfs_cpmode cpmode;
107 int ret; 172 int ret;
@@ -121,7 +186,7 @@ static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
121 186
122 nilfs_transaction_begin(inode->i_sb, &ti, 0); 187 nilfs_transaction_begin(inode->i_sb, &ti, 0);
123 ret = nilfs_cpfile_change_cpmode( 188 ret = nilfs_cpfile_change_cpmode(
124 cpfile, cpmode.cm_cno, cpmode.cm_mode); 189 nilfs->ns_cpfile, cpmode.cm_cno, cpmode.cm_mode);
125 if (unlikely(ret < 0)) 190 if (unlikely(ret < 0))
126 nilfs_transaction_abort(inode->i_sb); 191 nilfs_transaction_abort(inode->i_sb);
127 else 192 else
@@ -137,7 +202,7 @@ static int
137nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp, 202nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
138 unsigned int cmd, void __user *argp) 203 unsigned int cmd, void __user *argp)
139{ 204{
140 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile; 205 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
141 struct nilfs_transaction_info ti; 206 struct nilfs_transaction_info ti;
142 __u64 cno; 207 __u64 cno;
143 int ret; 208 int ret;
@@ -154,7 +219,7 @@ nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
154 goto out; 219 goto out;
155 220
156 nilfs_transaction_begin(inode->i_sb, &ti, 0); 221 nilfs_transaction_begin(inode->i_sb, &ti, 0);
157 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno); 222 ret = nilfs_cpfile_delete_checkpoint(nilfs->ns_cpfile, cno);
158 if (unlikely(ret < 0)) 223 if (unlikely(ret < 0))
159 nilfs_transaction_abort(inode->i_sb); 224 nilfs_transaction_abort(inode->i_sb);
160 else 225 else
@@ -180,7 +245,7 @@ nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
180static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp, 245static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
181 unsigned int cmd, void __user *argp) 246 unsigned int cmd, void __user *argp)
182{ 247{
183 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 248 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
184 struct nilfs_cpstat cpstat; 249 struct nilfs_cpstat cpstat;
185 int ret; 250 int ret;
186 251
@@ -211,7 +276,7 @@ nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
211static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp, 276static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
212 unsigned int cmd, void __user *argp) 277 unsigned int cmd, void __user *argp)
213{ 278{
214 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 279 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
215 struct nilfs_sustat sustat; 280 struct nilfs_sustat sustat;
216 int ret; 281 int ret;
217 282
@@ -267,7 +332,7 @@ nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
267static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp, 332static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
268 unsigned int cmd, void __user *argp) 333 unsigned int cmd, void __user *argp)
269{ 334{
270 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 335 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
271 struct nilfs_argv argv; 336 struct nilfs_argv argv;
272 int ret; 337 int ret;
273 338
@@ -336,7 +401,7 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb,
336 struct nilfs_argv *argv, void *buf) 401 struct nilfs_argv *argv, void *buf)
337{ 402{
338 size_t nmembs = argv->v_nmembs; 403 size_t nmembs = argv->v_nmembs;
339 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 404 struct the_nilfs *nilfs = sb->s_fs_info;
340 struct inode *inode; 405 struct inode *inode;
341 struct nilfs_vdesc *vdesc; 406 struct nilfs_vdesc *vdesc;
342 struct buffer_head *bh, *n; 407 struct buffer_head *bh, *n;
@@ -550,7 +615,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
550 ret = PTR_ERR(kbufs[4]); 615 ret = PTR_ERR(kbufs[4]);
551 goto out; 616 goto out;
552 } 617 }
553 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 618 nilfs = inode->i_sb->s_fs_info;
554 619
555 for (n = 0; n < 4; n++) { 620 for (n = 0; n < 4; n++) {
556 ret = -EINVAL; 621 ret = -EINVAL;
@@ -623,7 +688,7 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
623 return ret; 688 return ret;
624 689
625 if (argp != NULL) { 690 if (argp != NULL) {
626 nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 691 nilfs = inode->i_sb->s_fs_info;
627 down_read(&nilfs->ns_segctor_sem); 692 down_read(&nilfs->ns_segctor_sem);
628 cno = nilfs->ns_cno - 1; 693 cno = nilfs->ns_cno - 1;
629 up_read(&nilfs->ns_segctor_sem); 694 up_read(&nilfs->ns_segctor_sem);
@@ -641,7 +706,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp,
641 void *, size_t, size_t)) 706 void *, size_t, size_t))
642 707
643{ 708{
644 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs; 709 struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
645 struct nilfs_argv argv; 710 struct nilfs_argv argv;
646 int ret; 711 int ret;
647 712
@@ -666,6 +731,12 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
666 void __user *argp = (void __user *)arg; 731 void __user *argp = (void __user *)arg;
667 732
668 switch (cmd) { 733 switch (cmd) {
734 case FS_IOC_GETFLAGS:
735 return nilfs_ioctl_getflags(inode, argp);
736 case FS_IOC_SETFLAGS:
737 return nilfs_ioctl_setflags(inode, filp, argp);
738 case FS_IOC_GETVERSION:
739 return nilfs_ioctl_getversion(inode, argp);
669 case NILFS_IOCTL_CHANGE_CPMODE: 740 case NILFS_IOCTL_CHANGE_CPMODE:
670 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp); 741 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
671 case NILFS_IOCTL_DELETE_CHECKPOINT: 742 case NILFS_IOCTL_DELETE_CHECKPOINT:
@@ -696,3 +767,23 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
696 return -ENOTTY; 767 return -ENOTTY;
697 } 768 }
698} 769}
770
771#ifdef CONFIG_COMPAT
772long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
773{
774 switch (cmd) {
775 case FS_IOC32_GETFLAGS:
776 cmd = FS_IOC_GETFLAGS;
777 break;
778 case FS_IOC32_SETFLAGS:
779 cmd = FS_IOC_SETFLAGS;
780 break;
781 case FS_IOC32_GETVERSION:
782 cmd = FS_IOC_GETVERSION;
783 break;
784 default:
785 return -ENOIOCTLCMD;
786 }
787 return nilfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
788}
789#endif
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a649b05f7069 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -399,7 +399,6 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
399 399
400static const struct address_space_operations def_mdt_aops = { 400static const struct address_space_operations def_mdt_aops = {
401 .writepage = nilfs_mdt_write_page, 401 .writepage = nilfs_mdt_write_page,
402 .sync_page = block_sync_page,
403}; 402};
404 403
405static const struct inode_operations def_mdt_iops; 404static const struct inode_operations def_mdt_iops;
@@ -438,10 +437,6 @@ void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
438 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size); 437 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
439} 438}
440 439
441static const struct address_space_operations shadow_map_aops = {
442 .sync_page = block_sync_page,
443};
444
445/** 440/**
446 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file 441 * nilfs_mdt_setup_shadow_map - setup shadow map and bind it to metadata file
447 * @inode: inode of the metadata file 442 * @inode: inode of the metadata file
@@ -454,10 +449,10 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
454 struct backing_dev_info *bdi = inode->i_sb->s_bdi; 449 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
455 450
456 INIT_LIST_HEAD(&shadow->frozen_buffers); 451 INIT_LIST_HEAD(&shadow->frozen_buffers);
457 nilfs_mapping_init_once(&shadow->frozen_data); 452 address_space_init_once(&shadow->frozen_data);
458 nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops); 453 nilfs_mapping_init(&shadow->frozen_data, bdi);
459 nilfs_mapping_init_once(&shadow->frozen_btnodes); 454 address_space_init_once(&shadow->frozen_btnodes);
460 nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops); 455 nilfs_mapping_init(&shadow->frozen_btnodes, bdi);
461 mi->mi_shadow = shadow; 456 mi->mi_shadow = shadow;
462 return 0; 457 return 0;
463} 458}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index b13734bf3521..ed68563ec708 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -66,7 +66,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
66 66
67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode) 67static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
68{ 68{
69 return NILFS_SB(inode->i_sb)->s_nilfs; 69 return inode->i_sb->s_fs_info;
70} 70}
71 71
72/* Default GFP flags using highmem */ 72/* Default GFP flags using highmem */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..546849b3e88f 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page); 397 new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
398 if (!new_de) 398 if (!new_de)
399 goto out_dir; 399 goto out_dir;
400 inc_nlink(old_inode);
401 nilfs_set_link(new_dir, new_de, new_page, old_inode); 400 nilfs_set_link(new_dir, new_de, new_page, old_inode);
402 nilfs_mark_inode_dirty(new_dir); 401 nilfs_mark_inode_dirty(new_dir);
403 new_inode->i_ctime = CURRENT_TIME; 402 new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
411 if (new_dir->i_nlink >= NILFS_LINK_MAX) 410 if (new_dir->i_nlink >= NILFS_LINK_MAX)
412 goto out_dir; 411 goto out_dir;
413 } 412 }
414 inc_nlink(old_inode);
415 err = nilfs_add_link(new_dentry, old_inode); 413 err = nilfs_add_link(new_dentry, old_inode);
416 if (err) { 414 if (err)
417 drop_nlink(old_inode);
418 nilfs_mark_inode_dirty(old_inode);
419 goto out_dir; 415 goto out_dir;
420 }
421 if (dir_de) { 416 if (dir_de) {
422 inc_nlink(new_dir); 417 inc_nlink(new_dir);
423 nilfs_mark_inode_dirty(new_dir); 418 nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
431 old_inode->i_ctime = CURRENT_TIME; 426 old_inode->i_ctime = CURRENT_TIME;
432 427
433 nilfs_delete_entry(old_de, old_page); 428 nilfs_delete_entry(old_de, old_page);
434 drop_nlink(old_inode);
435 429
436 if (dir_de) { 430 if (dir_de) {
437 nilfs_set_link(old_inode, dir_de, dir_page, new_dir); 431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
@@ -488,7 +482,7 @@ static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
488 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO) 482 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO)
489 return ERR_PTR(-ESTALE); 483 return ERR_PTR(-ESTALE);
490 484
491 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 485 root = nilfs_lookup_root(sb->s_fs_info, cno);
492 if (!root) 486 if (!root)
493 return ERR_PTR(-ESTALE); 487 return ERR_PTR(-ESTALE);
494 488
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 777e8fd04304..a8dd344303cb 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -30,7 +30,6 @@
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h> 31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h" 32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h" 33#include "bmap.h"
35 34
36/* 35/*
@@ -115,19 +114,19 @@ enum {
115 * Macros to check inode numbers 114 * Macros to check inode numbers
116 */ 115 */
117#define NILFS_MDT_INO_BITS \ 116#define NILFS_MDT_INO_BITS \
118 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \ 117 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
119 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \ 118 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
120 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO)) 119 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
121 120
122#define NILFS_SYS_INO_BITS \ 121#define NILFS_SYS_INO_BITS \
123 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS) 122 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
124 123
125#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino) 124#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
126 125
127#define NILFS_MDT_INODE(sb, ino) \ 126#define NILFS_MDT_INODE(sb, ino) \
128 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino)))) 127 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
129#define NILFS_VALID_INODE(sb, ino) \ 128#define NILFS_VALID_INODE(sb, ino) \
130 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino)))) 129 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
131 130
132/** 131/**
133 * struct nilfs_transaction_info: context information for synchronization 132 * struct nilfs_transaction_info: context information for synchronization
@@ -212,6 +211,23 @@ static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
212 211
213#define NILFS_ATIME_DISABLE 212#define NILFS_ATIME_DISABLE
214 213
214/* Flags that should be inherited by new inodes from their parent. */
215#define NILFS_FL_INHERITED \
216 (FS_SECRM_FL | FS_UNRM_FL | FS_COMPR_FL | FS_SYNC_FL | \
217 FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL |\
218 FS_COMPRBLK_FL | FS_NOCOMP_FL | FS_NOTAIL_FL | FS_DIRSYNC_FL)
219
220/* Mask out flags that are inappropriate for the given type of inode. */
221static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
222{
223 if (S_ISDIR(mode))
224 return flags;
225 else if (S_ISREG(mode))
226 return flags & ~(FS_DIRSYNC_FL | FS_TOPDIR_FL);
227 else
228 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
229}
230
215/* dir.c */ 231/* dir.c */
216extern int nilfs_add_link(struct dentry *, struct inode *); 232extern int nilfs_add_link(struct dentry *, struct inode *);
217extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *); 233extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
@@ -229,10 +245,13 @@ extern int nilfs_sync_file(struct file *, int);
229 245
230/* ioctl.c */ 246/* ioctl.c */
231long nilfs_ioctl(struct file *, unsigned int, unsigned long); 247long nilfs_ioctl(struct file *, unsigned int, unsigned long);
248long nilfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
232int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *, 249int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, struct nilfs_argv *,
233 void **); 250 void **);
234 251
235/* inode.c */ 252/* inode.c */
253void nilfs_inode_add_blocks(struct inode *inode, int n);
254void nilfs_inode_sub_blocks(struct inode *inode, int n);
236extern struct inode *nilfs_new_inode(struct inode *, int); 255extern struct inode *nilfs_new_inode(struct inode *, int);
237extern void nilfs_free_inode(struct inode *); 256extern void nilfs_free_inode(struct inode *);
238extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int); 257extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
@@ -266,7 +285,7 @@ extern void nilfs_destroy_inode(struct inode *);
266extern void nilfs_error(struct super_block *, const char *, const char *, ...) 285extern void nilfs_error(struct super_block *, const char *, const char *, ...)
267 __attribute__ ((format (printf, 3, 4))); 286 __attribute__ ((format (printf, 3, 4)));
268extern void nilfs_warning(struct super_block *, const char *, const char *, ...) 287extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
269 __attribute__ ((format (printf, 3, 4))); 288 __attribute__ ((format (printf, 3, 4)));
270extern struct nilfs_super_block * 289extern struct nilfs_super_block *
271nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); 290nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
272extern int nilfs_store_magic_and_option(struct super_block *, 291extern int nilfs_store_magic_and_option(struct super_block *,
@@ -275,11 +294,11 @@ extern int nilfs_check_feature_compatibility(struct super_block *,
275 struct nilfs_super_block *); 294 struct nilfs_super_block *);
276extern void nilfs_set_log_cursor(struct nilfs_super_block *, 295extern void nilfs_set_log_cursor(struct nilfs_super_block *,
277 struct the_nilfs *); 296 struct the_nilfs *);
278extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *, 297struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
279 int flip); 298 int flip);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int); 299int nilfs_commit_super(struct super_block *sb, int flag);
281extern int nilfs_cleanup_super(struct nilfs_sb_info *); 300int nilfs_cleanup_super(struct super_block *sb);
282int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 301int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
283 struct nilfs_root **root); 302 struct nilfs_root **root);
284int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno); 303int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno);
285 304
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..1168059c7efd 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,29 +492,15 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
492 return nc; 492 return nc;
493} 493}
494 494
495void nilfs_mapping_init_once(struct address_space *mapping)
496{
497 memset(mapping, 0, sizeof(*mapping));
498 INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
499 spin_lock_init(&mapping->tree_lock);
500 INIT_LIST_HEAD(&mapping->private_list);
501 spin_lock_init(&mapping->private_lock);
502
503 spin_lock_init(&mapping->i_mmap_lock);
504 INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
505 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
506}
507
508void nilfs_mapping_init(struct address_space *mapping, 495void nilfs_mapping_init(struct address_space *mapping,
509 struct backing_dev_info *bdi, 496 struct backing_dev_info *bdi)
510 const struct address_space_operations *aops)
511{ 497{
512 mapping->host = NULL; 498 mapping->host = NULL;
513 mapping->flags = 0; 499 mapping->flags = 0;
514 mapping_set_gfp_mask(mapping, GFP_NOFS); 500 mapping_set_gfp_mask(mapping, GFP_NOFS);
515 mapping->assoc_mapping = NULL; 501 mapping->assoc_mapping = NULL;
516 mapping->backing_dev_info = bdi; 502 mapping->backing_dev_info = bdi;
517 mapping->a_ops = aops; 503 mapping->a_ops = &empty_aops;
518} 504}
519 505
520/* 506/*
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..f06b79ad7493 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,10 +61,8 @@ void nilfs_free_private_page(struct page *);
61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *); 61int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
62void nilfs_copy_back_pages(struct address_space *, struct address_space *); 62void nilfs_copy_back_pages(struct address_space *, struct address_space *);
63void nilfs_clear_dirty_pages(struct address_space *); 63void nilfs_clear_dirty_pages(struct address_space *);
64void nilfs_mapping_init_once(struct address_space *mapping);
65void nilfs_mapping_init(struct address_space *mapping, 64void nilfs_mapping_init(struct address_space *mapping,
66 struct backing_dev_info *bdi, 65 struct backing_dev_info *bdi);
67 const struct address_space_operations *aops);
68unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 66unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
69unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 67unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
70 sector_t start_blk, 68 sector_t start_blk,
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 3dfcd3b7d389..ba4a64518f38 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -425,7 +425,7 @@ void nilfs_dispose_segment_list(struct list_head *head)
425} 425}
426 426
427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, 427static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
428 struct nilfs_sb_info *sbi, 428 struct super_block *sb,
429 struct nilfs_recovery_info *ri) 429 struct nilfs_recovery_info *ri)
430{ 430{
431 struct list_head *head = &ri->ri_used_segments; 431 struct list_head *head = &ri->ri_used_segments;
@@ -501,7 +501,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
501} 501}
502 502
503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, 503static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
504 struct nilfs_sb_info *sbi, 504 struct super_block *sb,
505 struct nilfs_root *root, 505 struct nilfs_root *root,
506 struct list_head *head, 506 struct list_head *head,
507 unsigned long *nr_salvaged_blocks) 507 unsigned long *nr_salvaged_blocks)
@@ -514,7 +514,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
514 int err = 0, err2 = 0; 514 int err = 0, err2 = 0;
515 515
516 list_for_each_entry_safe(rb, n, head, list) { 516 list_for_each_entry_safe(rb, n, head, list) {
517 inode = nilfs_iget(sbi->s_super, root, rb->ino); 517 inode = nilfs_iget(sb, root, rb->ino);
518 if (IS_ERR(inode)) { 518 if (IS_ERR(inode)) {
519 err = PTR_ERR(inode); 519 err = PTR_ERR(inode);
520 inode = NULL; 520 inode = NULL;
@@ -572,11 +572,11 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
572 * nilfs_do_roll_forward - salvage logical segments newer than the latest 572 * nilfs_do_roll_forward - salvage logical segments newer than the latest
573 * checkpoint 573 * checkpoint
574 * @nilfs: nilfs object 574 * @nilfs: nilfs object
575 * @sbi: nilfs_sb_info 575 * @sb: super block instance
576 * @ri: pointer to a nilfs_recovery_info 576 * @ri: pointer to a nilfs_recovery_info
577 */ 577 */
578static int nilfs_do_roll_forward(struct the_nilfs *nilfs, 578static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
579 struct nilfs_sb_info *sbi, 579 struct super_block *sb,
580 struct nilfs_root *root, 580 struct nilfs_root *root,
581 struct nilfs_recovery_info *ri) 581 struct nilfs_recovery_info *ri)
582{ 582{
@@ -648,7 +648,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
648 goto failed; 648 goto failed;
649 if (flags & NILFS_SS_LOGEND) { 649 if (flags & NILFS_SS_LOGEND) {
650 err = nilfs_recover_dsync_blocks( 650 err = nilfs_recover_dsync_blocks(
651 nilfs, sbi, root, &dsync_blocks, 651 nilfs, sb, root, &dsync_blocks,
652 &nsalvaged_blocks); 652 &nsalvaged_blocks);
653 if (unlikely(err)) 653 if (unlikely(err))
654 goto failed; 654 goto failed;
@@ -681,7 +681,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
681 681
682 if (nsalvaged_blocks) { 682 if (nsalvaged_blocks) {
683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n", 683 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
684 sbi->s_super->s_id, nsalvaged_blocks); 684 sb->s_id, nsalvaged_blocks);
685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; 685 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
686 } 686 }
687 out: 687 out:
@@ -695,7 +695,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
695 printk(KERN_ERR 695 printk(KERN_ERR
696 "NILFS (device %s): Error roll-forwarding " 696 "NILFS (device %s): Error roll-forwarding "
697 "(err=%d, pseg block=%llu). ", 697 "(err=%d, pseg block=%llu). ",
698 sbi->s_super->s_id, err, (unsigned long long)pseg_start); 698 sb->s_id, err, (unsigned long long)pseg_start);
699 goto out; 699 goto out;
700} 700}
701 701
@@ -724,7 +724,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
724/** 724/**
725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint 725 * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint
726 * @nilfs: nilfs object 726 * @nilfs: nilfs object
727 * @sbi: nilfs_sb_info 727 * @sb: super block instance
728 * @ri: pointer to a nilfs_recovery_info struct to store search results. 728 * @ri: pointer to a nilfs_recovery_info struct to store search results.
729 * 729 *
730 * Return Value: On success, 0 is returned. On error, one of the following 730 * Return Value: On success, 0 is returned. On error, one of the following
@@ -741,7 +741,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
741 * %-ENOMEM - Insufficient memory available. 741 * %-ENOMEM - Insufficient memory available.
742 */ 742 */
743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, 743int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
744 struct nilfs_sb_info *sbi, 744 struct super_block *sb,
745 struct nilfs_recovery_info *ri) 745 struct nilfs_recovery_info *ri)
746{ 746{
747 struct nilfs_root *root; 747 struct nilfs_root *root;
@@ -750,32 +750,32 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0) 750 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
751 return 0; 751 return 0;
752 752
753 err = nilfs_attach_checkpoint(sbi, ri->ri_cno, true, &root); 753 err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
754 if (unlikely(err)) { 754 if (unlikely(err)) {
755 printk(KERN_ERR 755 printk(KERN_ERR
756 "NILFS: error loading the latest checkpoint.\n"); 756 "NILFS: error loading the latest checkpoint.\n");
757 return err; 757 return err;
758 } 758 }
759 759
760 err = nilfs_do_roll_forward(nilfs, sbi, root, ri); 760 err = nilfs_do_roll_forward(nilfs, sb, root, ri);
761 if (unlikely(err)) 761 if (unlikely(err))
762 goto failed; 762 goto failed;
763 763
764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) { 764 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
765 err = nilfs_prepare_segment_for_recovery(nilfs, sbi, ri); 765 err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
766 if (unlikely(err)) { 766 if (unlikely(err)) {
767 printk(KERN_ERR "NILFS: Error preparing segments for " 767 printk(KERN_ERR "NILFS: Error preparing segments for "
768 "recovery.\n"); 768 "recovery.\n");
769 goto failed; 769 goto failed;
770 } 770 }
771 771
772 err = nilfs_attach_segment_constructor(sbi, root); 772 err = nilfs_attach_log_writer(sb, root);
773 if (unlikely(err)) 773 if (unlikely(err))
774 goto failed; 774 goto failed;
775 775
776 set_nilfs_discontinued(nilfs); 776 set_nilfs_discontinued(nilfs);
777 err = nilfs_construct_segment(sbi->s_super); 777 err = nilfs_construct_segment(sb);
778 nilfs_detach_segment_constructor(sbi); 778 nilfs_detach_log_writer(sb);
779 779
780 if (unlikely(err)) { 780 if (unlikely(err)) {
781 printk(KERN_ERR "NILFS: Oops! recovery failed. " 781 printk(KERN_ERR "NILFS: Oops! recovery failed. "
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
deleted file mode 100644
index 7a17715f215f..000000000000
--- a/fs/nilfs2/sb.h
+++ /dev/null
@@ -1,85 +0,0 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30struct the_nilfs;
31struct nilfs_sc_info;
32
33/*
34 * NILFS super-block data in memory
35 */
36struct nilfs_sb_info {
37 /* Mount options */
38 unsigned long s_mount_opt;
39 uid_t s_resuid;
40 gid_t s_resgid;
41
42 unsigned long s_interval; /* construction interval */
43 unsigned long s_watermark; /* threshold of data amount
44 for the segment construction */
45
46 /* Fundamental members */
47 struct super_block *s_super; /* reverse pointer to super_block */
48 struct the_nilfs *s_nilfs;
49
50 /* Segment constructor */
51 struct list_head s_dirty_files; /* dirty files list */
52 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
53 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
54 It covers s_dirty_files list */
55
56 /* Inode allocator */
57 spinlock_t s_next_gen_lock;
58 u32 s_next_generation;
59};
60
61static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
62{
63 return sb->s_fs_info;
64}
65
66static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
67{
68 return sbi->s_sc_info;
69}
70
71/*
72 * Bit operations for the mount option
73 */
74#define nilfs_clear_opt(sbi, opt) \
75 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
76#define nilfs_set_opt(sbi, opt) \
77 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
78#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
79#define nilfs_write_opt(sbi, mask, opt) \
80 do { (sbi)->s_mount_opt = \
81 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
82 NILFS_MOUNT_##opt); \
83 } while (0)
84
85#endif /* _NILFS_SB */
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 0f83e93935b2..2853ff20f85a 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -509,7 +509,7 @@ static int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
509 * Last BIO is always sent through the following 509 * Last BIO is always sent through the following
510 * submission. 510 * submission.
511 */ 511 */
512 rw |= REQ_SYNC | REQ_UNPLUG; 512 rw |= REQ_SYNC;
513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw); 513 res = nilfs_segbuf_submit_bio(segbuf, &wi, rw);
514 } 514 }
515 515
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..afe4f2183454 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -104,8 +104,7 @@ struct nilfs_sc_operations {
104static void nilfs_segctor_start_timer(struct nilfs_sc_info *); 104static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int); 105static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *); 106static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
107static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *, 107static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
108 int);
109 108
110#define nilfs_cnt32_gt(a, b) \ 109#define nilfs_cnt32_gt(a, b) \
111 (typecheck(__u32, a) && typecheck(__u32, b) && \ 110 (typecheck(__u32, a) && typecheck(__u32, b) && \
@@ -182,7 +181,6 @@ int nilfs_transaction_begin(struct super_block *sb,
182 struct nilfs_transaction_info *ti, 181 struct nilfs_transaction_info *ti,
183 int vacancy_check) 182 int vacancy_check)
184{ 183{
185 struct nilfs_sb_info *sbi;
186 struct the_nilfs *nilfs; 184 struct the_nilfs *nilfs;
187 int ret = nilfs_prepare_segment_lock(ti); 185 int ret = nilfs_prepare_segment_lock(ti);
188 186
@@ -193,8 +191,7 @@ int nilfs_transaction_begin(struct super_block *sb,
193 191
194 vfs_check_frozen(sb, SB_FREEZE_WRITE); 192 vfs_check_frozen(sb, SB_FREEZE_WRITE);
195 193
196 sbi = NILFS_SB(sb); 194 nilfs = sb->s_fs_info;
197 nilfs = sbi->s_nilfs;
198 down_read(&nilfs->ns_segctor_sem); 195 down_read(&nilfs->ns_segctor_sem);
199 if (vacancy_check && nilfs_near_disk_full(nilfs)) { 196 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
200 up_read(&nilfs->ns_segctor_sem); 197 up_read(&nilfs->ns_segctor_sem);
@@ -225,8 +222,7 @@ int nilfs_transaction_begin(struct super_block *sb,
225int nilfs_transaction_commit(struct super_block *sb) 222int nilfs_transaction_commit(struct super_block *sb)
226{ 223{
227 struct nilfs_transaction_info *ti = current->journal_info; 224 struct nilfs_transaction_info *ti = current->journal_info;
228 struct nilfs_sb_info *sbi; 225 struct the_nilfs *nilfs = sb->s_fs_info;
229 struct nilfs_sc_info *sci;
230 int err = 0; 226 int err = 0;
231 227
232 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 228 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
@@ -235,16 +231,15 @@ int nilfs_transaction_commit(struct super_block *sb)
235 ti->ti_count--; 231 ti->ti_count--;
236 return 0; 232 return 0;
237 } 233 }
238 sbi = NILFS_SB(sb); 234 if (nilfs->ns_writer) {
239 sci = NILFS_SC(sbi); 235 struct nilfs_sc_info *sci = nilfs->ns_writer;
240 if (sci != NULL) { 236
241 if (ti->ti_flags & NILFS_TI_COMMIT) 237 if (ti->ti_flags & NILFS_TI_COMMIT)
242 nilfs_segctor_start_timer(sci); 238 nilfs_segctor_start_timer(sci);
243 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) > 239 if (atomic_read(&nilfs->ns_ndirtyblks) > sci->sc_watermark)
244 sci->sc_watermark)
245 nilfs_segctor_do_flush(sci, 0); 240 nilfs_segctor_do_flush(sci, 0);
246 } 241 }
247 up_read(&sbi->s_nilfs->ns_segctor_sem); 242 up_read(&nilfs->ns_segctor_sem);
248 current->journal_info = ti->ti_save; 243 current->journal_info = ti->ti_save;
249 244
250 if (ti->ti_flags & NILFS_TI_SYNC) 245 if (ti->ti_flags & NILFS_TI_SYNC)
@@ -257,13 +252,14 @@ int nilfs_transaction_commit(struct super_block *sb)
257void nilfs_transaction_abort(struct super_block *sb) 252void nilfs_transaction_abort(struct super_block *sb)
258{ 253{
259 struct nilfs_transaction_info *ti = current->journal_info; 254 struct nilfs_transaction_info *ti = current->journal_info;
255 struct the_nilfs *nilfs = sb->s_fs_info;
260 256
261 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 257 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
262 if (ti->ti_count > 0) { 258 if (ti->ti_count > 0) {
263 ti->ti_count--; 259 ti->ti_count--;
264 return; 260 return;
265 } 261 }
266 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem); 262 up_read(&nilfs->ns_segctor_sem);
267 263
268 current->journal_info = ti->ti_save; 264 current->journal_info = ti->ti_save;
269 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC) 265 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
@@ -272,9 +268,8 @@ void nilfs_transaction_abort(struct super_block *sb)
272 268
273void nilfs_relax_pressure_in_lock(struct super_block *sb) 269void nilfs_relax_pressure_in_lock(struct super_block *sb)
274{ 270{
275 struct nilfs_sb_info *sbi = NILFS_SB(sb); 271 struct the_nilfs *nilfs = sb->s_fs_info;
276 struct nilfs_sc_info *sci = NILFS_SC(sbi); 272 struct nilfs_sc_info *sci = nilfs->ns_writer;
277 struct the_nilfs *nilfs = sbi->s_nilfs;
278 273
279 if (!sci || !sci->sc_flush_request) 274 if (!sci || !sci->sc_flush_request)
280 return; 275 return;
@@ -294,11 +289,13 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
294 downgrade_write(&nilfs->ns_segctor_sem); 289 downgrade_write(&nilfs->ns_segctor_sem);
295} 290}
296 291
297static void nilfs_transaction_lock(struct nilfs_sb_info *sbi, 292static void nilfs_transaction_lock(struct super_block *sb,
298 struct nilfs_transaction_info *ti, 293 struct nilfs_transaction_info *ti,
299 int gcflag) 294 int gcflag)
300{ 295{
301 struct nilfs_transaction_info *cur_ti = current->journal_info; 296 struct nilfs_transaction_info *cur_ti = current->journal_info;
297 struct the_nilfs *nilfs = sb->s_fs_info;
298 struct nilfs_sc_info *sci = nilfs->ns_writer;
302 299
303 WARN_ON(cur_ti); 300 WARN_ON(cur_ti);
304 ti->ti_flags = NILFS_TI_WRITER; 301 ti->ti_flags = NILFS_TI_WRITER;
@@ -309,30 +306,31 @@ static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
309 current->journal_info = ti; 306 current->journal_info = ti;
310 307
311 for (;;) { 308 for (;;) {
312 down_write(&sbi->s_nilfs->ns_segctor_sem); 309 down_write(&nilfs->ns_segctor_sem);
313 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags)) 310 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
314 break; 311 break;
315 312
316 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi)); 313 nilfs_segctor_do_immediate_flush(sci);
317 314
318 up_write(&sbi->s_nilfs->ns_segctor_sem); 315 up_write(&nilfs->ns_segctor_sem);
319 yield(); 316 yield();
320 } 317 }
321 if (gcflag) 318 if (gcflag)
322 ti->ti_flags |= NILFS_TI_GC; 319 ti->ti_flags |= NILFS_TI_GC;
323} 320}
324 321
325static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi) 322static void nilfs_transaction_unlock(struct super_block *sb)
326{ 323{
327 struct nilfs_transaction_info *ti = current->journal_info; 324 struct nilfs_transaction_info *ti = current->journal_info;
325 struct the_nilfs *nilfs = sb->s_fs_info;
328 326
329 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC); 327 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
330 BUG_ON(ti->ti_count > 0); 328 BUG_ON(ti->ti_count > 0);
331 329
332 up_write(&sbi->s_nilfs->ns_segctor_sem); 330 up_write(&nilfs->ns_segctor_sem);
333 current->journal_info = ti->ti_save; 331 current->journal_info = ti->ti_save;
334 if (!list_empty(&ti->ti_garbage)) 332 if (!list_empty(&ti->ti_garbage))
335 nilfs_dispose_list(sbi, &ti->ti_garbage, 0); 333 nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
336} 334}
337 335
338static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, 336static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -430,7 +428,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
430 nilfs_segctor_map_segsum_entry( 428 nilfs_segctor_map_segsum_entry(
431 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo)); 429 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
432 430
433 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags)) 431 if (NILFS_I(inode)->i_root &&
432 !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
434 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); 433 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
435 /* skip finfo */ 434 /* skip finfo */
436} 435}
@@ -713,7 +712,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
713 } 712 }
714} 713}
715 714
716static void nilfs_dispose_list(struct nilfs_sb_info *sbi, 715static void nilfs_dispose_list(struct the_nilfs *nilfs,
717 struct list_head *head, int force) 716 struct list_head *head, int force)
718{ 717{
719 struct nilfs_inode_info *ii, *n; 718 struct nilfs_inode_info *ii, *n;
@@ -721,7 +720,7 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
721 unsigned nv = 0; 720 unsigned nv = 0;
722 721
723 while (!list_empty(head)) { 722 while (!list_empty(head)) {
724 spin_lock(&sbi->s_inode_lock); 723 spin_lock(&nilfs->ns_inode_lock);
725 list_for_each_entry_safe(ii, n, head, i_dirty) { 724 list_for_each_entry_safe(ii, n, head, i_dirty) {
726 list_del_init(&ii->i_dirty); 725 list_del_init(&ii->i_dirty);
727 if (force) { 726 if (force) {
@@ -732,14 +731,14 @@ static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
732 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) { 731 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
733 set_bit(NILFS_I_QUEUED, &ii->i_state); 732 set_bit(NILFS_I_QUEUED, &ii->i_state);
734 list_add_tail(&ii->i_dirty, 733 list_add_tail(&ii->i_dirty,
735 &sbi->s_dirty_files); 734 &nilfs->ns_dirty_files);
736 continue; 735 continue;
737 } 736 }
738 ivec[nv++] = ii; 737 ivec[nv++] = ii;
739 if (nv == SC_N_INODEVEC) 738 if (nv == SC_N_INODEVEC)
740 break; 739 break;
741 } 740 }
742 spin_unlock(&sbi->s_inode_lock); 741 spin_unlock(&nilfs->ns_inode_lock);
743 742
744 for (pii = ivec; nv > 0; pii++, nv--) 743 for (pii = ivec; nv > 0; pii++, nv--)
745 iput(&(*pii)->vfs_inode); 744 iput(&(*pii)->vfs_inode);
@@ -772,24 +771,23 @@ static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
772 771
773static int nilfs_segctor_confirm(struct nilfs_sc_info *sci) 772static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
774{ 773{
775 struct nilfs_sb_info *sbi = sci->sc_sbi; 774 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
776 int ret = 0; 775 int ret = 0;
777 776
778 if (nilfs_test_metadata_dirty(sbi->s_nilfs, sci->sc_root)) 777 if (nilfs_test_metadata_dirty(nilfs, sci->sc_root))
779 set_bit(NILFS_SC_DIRTY, &sci->sc_flags); 778 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
780 779
781 spin_lock(&sbi->s_inode_lock); 780 spin_lock(&nilfs->ns_inode_lock);
782 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci)) 781 if (list_empty(&nilfs->ns_dirty_files) && nilfs_segctor_clean(sci))
783 ret++; 782 ret++;
784 783
785 spin_unlock(&sbi->s_inode_lock); 784 spin_unlock(&nilfs->ns_inode_lock);
786 return ret; 785 return ret;
787} 786}
788 787
789static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci) 788static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
790{ 789{
791 struct nilfs_sb_info *sbi = sci->sc_sbi; 790 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
792 struct the_nilfs *nilfs = sbi->s_nilfs;
793 791
794 nilfs_mdt_clear_dirty(sci->sc_root->ifile); 792 nilfs_mdt_clear_dirty(sci->sc_root->ifile);
795 nilfs_mdt_clear_dirty(nilfs->ns_cpfile); 793 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
@@ -799,7 +797,7 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
799 797
800static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci) 798static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
801{ 799{
802 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 800 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
803 struct buffer_head *bh_cp; 801 struct buffer_head *bh_cp;
804 struct nilfs_checkpoint *raw_cp; 802 struct nilfs_checkpoint *raw_cp;
805 int err; 803 int err;
@@ -823,8 +821,7 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
823 821
824static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci) 822static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
825{ 823{
826 struct nilfs_sb_info *sbi = sci->sc_sbi; 824 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
827 struct the_nilfs *nilfs = sbi->s_nilfs;
828 struct buffer_head *bh_cp; 825 struct buffer_head *bh_cp;
829 struct nilfs_checkpoint *raw_cp; 826 struct nilfs_checkpoint *raw_cp;
830 int err; 827 int err;
@@ -1048,8 +1045,7 @@ static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1048 1045
1049static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode) 1046static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1050{ 1047{
1051 struct nilfs_sb_info *sbi = sci->sc_sbi; 1048 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1052 struct the_nilfs *nilfs = sbi->s_nilfs;
1053 struct list_head *head; 1049 struct list_head *head;
1054 struct nilfs_inode_info *ii; 1050 struct nilfs_inode_info *ii;
1055 size_t ndone; 1051 size_t ndone;
@@ -1858,7 +1854,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1858{ 1854{
1859 struct nilfs_segment_buffer *segbuf; 1855 struct nilfs_segment_buffer *segbuf;
1860 struct page *bd_page = NULL, *fs_page = NULL; 1856 struct page *bd_page = NULL, *fs_page = NULL;
1861 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 1857 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
1862 int update_sr = false; 1858 int update_sr = false;
1863 1859
1864 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) { 1860 list_for_each_entry(segbuf, &sci->sc_write_logs, sb_list) {
@@ -1962,30 +1958,30 @@ static int nilfs_segctor_wait(struct nilfs_sc_info *sci)
1962 return ret; 1958 return ret;
1963} 1959}
1964 1960
1965static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci, 1961static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1966 struct nilfs_sb_info *sbi) 1962 struct the_nilfs *nilfs)
1967{ 1963{
1968 struct nilfs_inode_info *ii, *n; 1964 struct nilfs_inode_info *ii, *n;
1969 struct inode *ifile = sci->sc_root->ifile; 1965 struct inode *ifile = sci->sc_root->ifile;
1970 1966
1971 spin_lock(&sbi->s_inode_lock); 1967 spin_lock(&nilfs->ns_inode_lock);
1972 retry: 1968 retry:
1973 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) { 1969 list_for_each_entry_safe(ii, n, &nilfs->ns_dirty_files, i_dirty) {
1974 if (!ii->i_bh) { 1970 if (!ii->i_bh) {
1975 struct buffer_head *ibh; 1971 struct buffer_head *ibh;
1976 int err; 1972 int err;
1977 1973
1978 spin_unlock(&sbi->s_inode_lock); 1974 spin_unlock(&nilfs->ns_inode_lock);
1979 err = nilfs_ifile_get_inode_block( 1975 err = nilfs_ifile_get_inode_block(
1980 ifile, ii->vfs_inode.i_ino, &ibh); 1976 ifile, ii->vfs_inode.i_ino, &ibh);
1981 if (unlikely(err)) { 1977 if (unlikely(err)) {
1982 nilfs_warning(sbi->s_super, __func__, 1978 nilfs_warning(sci->sc_super, __func__,
1983 "failed to get inode block.\n"); 1979 "failed to get inode block.\n");
1984 return err; 1980 return err;
1985 } 1981 }
1986 nilfs_mdt_mark_buffer_dirty(ibh); 1982 nilfs_mdt_mark_buffer_dirty(ibh);
1987 nilfs_mdt_mark_dirty(ifile); 1983 nilfs_mdt_mark_dirty(ifile);
1988 spin_lock(&sbi->s_inode_lock); 1984 spin_lock(&nilfs->ns_inode_lock);
1989 if (likely(!ii->i_bh)) 1985 if (likely(!ii->i_bh))
1990 ii->i_bh = ibh; 1986 ii->i_bh = ibh;
1991 else 1987 else
@@ -1998,18 +1994,18 @@ static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
1998 list_del(&ii->i_dirty); 1994 list_del(&ii->i_dirty);
1999 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files); 1995 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2000 } 1996 }
2001 spin_unlock(&sbi->s_inode_lock); 1997 spin_unlock(&nilfs->ns_inode_lock);
2002 1998
2003 return 0; 1999 return 0;
2004} 2000}
2005 2001
2006static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci, 2002static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
2007 struct nilfs_sb_info *sbi) 2003 struct the_nilfs *nilfs)
2008{ 2004{
2009 struct nilfs_transaction_info *ti = current->journal_info; 2005 struct nilfs_transaction_info *ti = current->journal_info;
2010 struct nilfs_inode_info *ii, *n; 2006 struct nilfs_inode_info *ii, *n;
2011 2007
2012 spin_lock(&sbi->s_inode_lock); 2008 spin_lock(&nilfs->ns_inode_lock);
2013 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 2009 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2014 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) || 2010 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2015 test_bit(NILFS_I_DIRTY, &ii->i_state)) 2011 test_bit(NILFS_I_DIRTY, &ii->i_state))
@@ -2021,7 +2017,7 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2021 list_del(&ii->i_dirty); 2017 list_del(&ii->i_dirty);
2022 list_add_tail(&ii->i_dirty, &ti->ti_garbage); 2018 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2023 } 2019 }
2024 spin_unlock(&sbi->s_inode_lock); 2020 spin_unlock(&nilfs->ns_inode_lock);
2025} 2021}
2026 2022
2027/* 2023/*
@@ -2029,15 +2025,14 @@ static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2029 */ 2025 */
2030static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) 2026static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2031{ 2027{
2032 struct nilfs_sb_info *sbi = sci->sc_sbi; 2028 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2033 struct the_nilfs *nilfs = sbi->s_nilfs;
2034 struct page *failed_page; 2029 struct page *failed_page;
2035 int err; 2030 int err;
2036 2031
2037 sci->sc_stage.scnt = NILFS_ST_INIT; 2032 sci->sc_stage.scnt = NILFS_ST_INIT;
2038 sci->sc_cno = nilfs->ns_cno; 2033 sci->sc_cno = nilfs->ns_cno;
2039 2034
2040 err = nilfs_segctor_check_in_files(sci, sbi); 2035 err = nilfs_segctor_collect_dirty_files(sci, nilfs);
2041 if (unlikely(err)) 2036 if (unlikely(err))
2042 goto out; 2037 goto out;
2043 2038
@@ -2115,7 +2110,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2115 } while (sci->sc_stage.scnt != NILFS_ST_DONE); 2110 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2116 2111
2117 out: 2112 out:
2118 nilfs_segctor_check_out_files(sci, sbi); 2113 nilfs_segctor_drop_written_files(sci, nilfs);
2119 return err; 2114 return err;
2120 2115
2121 failed_to_write: 2116 failed_to_write:
@@ -2168,8 +2163,8 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2168 */ 2163 */
2169void nilfs_flush_segment(struct super_block *sb, ino_t ino) 2164void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2170{ 2165{
2171 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2166 struct the_nilfs *nilfs = sb->s_fs_info;
2172 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2167 struct nilfs_sc_info *sci = nilfs->ns_writer;
2173 2168
2174 if (!sci || nilfs_doing_construction()) 2169 if (!sci || nilfs_doing_construction())
2175 return; 2170 return;
@@ -2258,8 +2253,8 @@ static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2258 */ 2253 */
2259int nilfs_construct_segment(struct super_block *sb) 2254int nilfs_construct_segment(struct super_block *sb)
2260{ 2255{
2261 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2256 struct the_nilfs *nilfs = sb->s_fs_info;
2262 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2257 struct nilfs_sc_info *sci = nilfs->ns_writer;
2263 struct nilfs_transaction_info *ti; 2258 struct nilfs_transaction_info *ti;
2264 int err; 2259 int err;
2265 2260
@@ -2296,8 +2291,8 @@ int nilfs_construct_segment(struct super_block *sb)
2296int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, 2291int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2297 loff_t start, loff_t end) 2292 loff_t start, loff_t end)
2298{ 2293{
2299 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2294 struct the_nilfs *nilfs = sb->s_fs_info;
2300 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2295 struct nilfs_sc_info *sci = nilfs->ns_writer;
2301 struct nilfs_inode_info *ii; 2296 struct nilfs_inode_info *ii;
2302 struct nilfs_transaction_info ti; 2297 struct nilfs_transaction_info ti;
2303 int err = 0; 2298 int err = 0;
@@ -2305,33 +2300,33 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2305 if (!sci) 2300 if (!sci)
2306 return -EROFS; 2301 return -EROFS;
2307 2302
2308 nilfs_transaction_lock(sbi, &ti, 0); 2303 nilfs_transaction_lock(sb, &ti, 0);
2309 2304
2310 ii = NILFS_I(inode); 2305 ii = NILFS_I(inode);
2311 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || 2306 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2312 nilfs_test_opt(sbi, STRICT_ORDER) || 2307 nilfs_test_opt(nilfs, STRICT_ORDER) ||
2313 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || 2308 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2314 nilfs_discontinued(sbi->s_nilfs)) { 2309 nilfs_discontinued(nilfs)) {
2315 nilfs_transaction_unlock(sbi); 2310 nilfs_transaction_unlock(sb);
2316 err = nilfs_segctor_sync(sci); 2311 err = nilfs_segctor_sync(sci);
2317 return err; 2312 return err;
2318 } 2313 }
2319 2314
2320 spin_lock(&sbi->s_inode_lock); 2315 spin_lock(&nilfs->ns_inode_lock);
2321 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) && 2316 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2322 !test_bit(NILFS_I_BUSY, &ii->i_state)) { 2317 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2323 spin_unlock(&sbi->s_inode_lock); 2318 spin_unlock(&nilfs->ns_inode_lock);
2324 nilfs_transaction_unlock(sbi); 2319 nilfs_transaction_unlock(sb);
2325 return 0; 2320 return 0;
2326 } 2321 }
2327 spin_unlock(&sbi->s_inode_lock); 2322 spin_unlock(&nilfs->ns_inode_lock);
2328 sci->sc_dsync_inode = ii; 2323 sci->sc_dsync_inode = ii;
2329 sci->sc_dsync_start = start; 2324 sci->sc_dsync_start = start;
2330 sci->sc_dsync_end = end; 2325 sci->sc_dsync_end = end;
2331 2326
2332 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); 2327 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2333 2328
2334 nilfs_transaction_unlock(sbi); 2329 nilfs_transaction_unlock(sb);
2335 return err; 2330 return err;
2336} 2331}
2337 2332
@@ -2387,8 +2382,7 @@ static void nilfs_segctor_notify(struct nilfs_sc_info *sci, int mode, int err)
2387 */ 2382 */
2388static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) 2383static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2389{ 2384{
2390 struct nilfs_sb_info *sbi = sci->sc_sbi; 2385 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2391 struct the_nilfs *nilfs = sbi->s_nilfs;
2392 struct nilfs_super_block **sbp; 2386 struct nilfs_super_block **sbp;
2393 int err = 0; 2387 int err = 0;
2394 2388
@@ -2406,11 +2400,12 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode)
2406 nilfs_discontinued(nilfs)) { 2400 nilfs_discontinued(nilfs)) {
2407 down_write(&nilfs->ns_sem); 2401 down_write(&nilfs->ns_sem);
2408 err = -EIO; 2402 err = -EIO;
2409 sbp = nilfs_prepare_super(sbi, 2403 sbp = nilfs_prepare_super(sci->sc_super,
2410 nilfs_sb_will_flip(nilfs)); 2404 nilfs_sb_will_flip(nilfs));
2411 if (likely(sbp)) { 2405 if (likely(sbp)) {
2412 nilfs_set_log_cursor(sbp[0], nilfs); 2406 nilfs_set_log_cursor(sbp[0], nilfs);
2413 err = nilfs_commit_super(sbi, NILFS_SB_COMMIT); 2407 err = nilfs_commit_super(sci->sc_super,
2408 NILFS_SB_COMMIT);
2414 } 2409 }
2415 up_write(&nilfs->ns_sem); 2410 up_write(&nilfs->ns_sem);
2416 } 2411 }
@@ -2442,16 +2437,15 @@ nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2442int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv, 2437int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2443 void **kbufs) 2438 void **kbufs)
2444{ 2439{
2445 struct nilfs_sb_info *sbi = NILFS_SB(sb); 2440 struct the_nilfs *nilfs = sb->s_fs_info;
2446 struct nilfs_sc_info *sci = NILFS_SC(sbi); 2441 struct nilfs_sc_info *sci = nilfs->ns_writer;
2447 struct the_nilfs *nilfs = sbi->s_nilfs;
2448 struct nilfs_transaction_info ti; 2442 struct nilfs_transaction_info ti;
2449 int err; 2443 int err;
2450 2444
2451 if (unlikely(!sci)) 2445 if (unlikely(!sci))
2452 return -EROFS; 2446 return -EROFS;
2453 2447
2454 nilfs_transaction_lock(sbi, &ti, 1); 2448 nilfs_transaction_lock(sb, &ti, 1);
2455 2449
2456 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat); 2450 err = nilfs_mdt_save_to_shadow_map(nilfs->ns_dat);
2457 if (unlikely(err)) 2451 if (unlikely(err))
@@ -2479,14 +2473,14 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2479 set_current_state(TASK_INTERRUPTIBLE); 2473 set_current_state(TASK_INTERRUPTIBLE);
2480 schedule_timeout(sci->sc_interval); 2474 schedule_timeout(sci->sc_interval);
2481 } 2475 }
2482 if (nilfs_test_opt(sbi, DISCARD)) { 2476 if (nilfs_test_opt(nilfs, DISCARD)) {
2483 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs, 2477 int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
2484 sci->sc_nfreesegs); 2478 sci->sc_nfreesegs);
2485 if (ret) { 2479 if (ret) {
2486 printk(KERN_WARNING 2480 printk(KERN_WARNING
2487 "NILFS warning: error %d on discard request, " 2481 "NILFS warning: error %d on discard request, "
2488 "turning discards off for the device\n", ret); 2482 "turning discards off for the device\n", ret);
2489 nilfs_clear_opt(sbi, DISCARD); 2483 nilfs_clear_opt(nilfs, DISCARD);
2490 } 2484 }
2491 } 2485 }
2492 2486
@@ -2494,16 +2488,15 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
2494 sci->sc_freesegs = NULL; 2488 sci->sc_freesegs = NULL;
2495 sci->sc_nfreesegs = 0; 2489 sci->sc_nfreesegs = 0;
2496 nilfs_mdt_clear_shadow_map(nilfs->ns_dat); 2490 nilfs_mdt_clear_shadow_map(nilfs->ns_dat);
2497 nilfs_transaction_unlock(sbi); 2491 nilfs_transaction_unlock(sb);
2498 return err; 2492 return err;
2499} 2493}
2500 2494
2501static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode) 2495static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2502{ 2496{
2503 struct nilfs_sb_info *sbi = sci->sc_sbi;
2504 struct nilfs_transaction_info ti; 2497 struct nilfs_transaction_info ti;
2505 2498
2506 nilfs_transaction_lock(sbi, &ti, 0); 2499 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2507 nilfs_segctor_construct(sci, mode); 2500 nilfs_segctor_construct(sci, mode);
2508 2501
2509 /* 2502 /*
@@ -2514,7 +2507,7 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2514 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) 2507 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2515 nilfs_segctor_start_timer(sci); 2508 nilfs_segctor_start_timer(sci);
2516 2509
2517 nilfs_transaction_unlock(sbi); 2510 nilfs_transaction_unlock(sci->sc_super);
2518} 2511}
2519 2512
2520static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci) 2513static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
@@ -2560,7 +2553,7 @@ static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2560static int nilfs_segctor_thread(void *arg) 2553static int nilfs_segctor_thread(void *arg)
2561{ 2554{
2562 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg; 2555 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2563 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs; 2556 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2564 int timeout = 0; 2557 int timeout = 0;
2565 2558
2566 sci->sc_timer.data = (unsigned long)current; 2559 sci->sc_timer.data = (unsigned long)current;
@@ -2671,17 +2664,17 @@ static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2671/* 2664/*
2672 * Setup & clean-up functions 2665 * Setup & clean-up functions
2673 */ 2666 */
2674static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi, 2667static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2675 struct nilfs_root *root) 2668 struct nilfs_root *root)
2676{ 2669{
2670 struct the_nilfs *nilfs = sb->s_fs_info;
2677 struct nilfs_sc_info *sci; 2671 struct nilfs_sc_info *sci;
2678 2672
2679 sci = kzalloc(sizeof(*sci), GFP_KERNEL); 2673 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2680 if (!sci) 2674 if (!sci)
2681 return NULL; 2675 return NULL;
2682 2676
2683 sci->sc_sbi = sbi; 2677 sci->sc_super = sb;
2684 sci->sc_super = sbi->s_super;
2685 2678
2686 nilfs_get_root(root); 2679 nilfs_get_root(root);
2687 sci->sc_root = root; 2680 sci->sc_root = root;
@@ -2701,10 +2694,10 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi,
2701 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; 2694 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2702 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK; 2695 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2703 2696
2704 if (sbi->s_interval) 2697 if (nilfs->ns_interval)
2705 sci->sc_interval = sbi->s_interval; 2698 sci->sc_interval = nilfs->ns_interval;
2706 if (sbi->s_watermark) 2699 if (nilfs->ns_watermark)
2707 sci->sc_watermark = sbi->s_watermark; 2700 sci->sc_watermark = nilfs->ns_watermark;
2708 return sci; 2701 return sci;
2709} 2702}
2710 2703
@@ -2715,12 +2708,11 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2715 /* The segctord thread was stopped and its timer was removed. 2708 /* The segctord thread was stopped and its timer was removed.
2716 But some tasks remain. */ 2709 But some tasks remain. */
2717 do { 2710 do {
2718 struct nilfs_sb_info *sbi = sci->sc_sbi;
2719 struct nilfs_transaction_info ti; 2711 struct nilfs_transaction_info ti;
2720 2712
2721 nilfs_transaction_lock(sbi, &ti, 0); 2713 nilfs_transaction_lock(sci->sc_super, &ti, 0);
2722 ret = nilfs_segctor_construct(sci, SC_LSEG_SR); 2714 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2723 nilfs_transaction_unlock(sbi); 2715 nilfs_transaction_unlock(sci->sc_super);
2724 2716
2725 } while (ret && retrycount-- > 0); 2717 } while (ret && retrycount-- > 0);
2726} 2718}
@@ -2735,10 +2727,10 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2735 */ 2727 */
2736static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) 2728static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2737{ 2729{
2738 struct nilfs_sb_info *sbi = sci->sc_sbi; 2730 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
2739 int flag; 2731 int flag;
2740 2732
2741 up_write(&sbi->s_nilfs->ns_segctor_sem); 2733 up_write(&nilfs->ns_segctor_sem);
2742 2734
2743 spin_lock(&sci->sc_state_lock); 2735 spin_lock(&sci->sc_state_lock);
2744 nilfs_segctor_kill_thread(sci); 2736 nilfs_segctor_kill_thread(sci);
@@ -2752,9 +2744,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2752 WARN_ON(!list_empty(&sci->sc_copied_buffers)); 2744 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2753 2745
2754 if (!list_empty(&sci->sc_dirty_files)) { 2746 if (!list_empty(&sci->sc_dirty_files)) {
2755 nilfs_warning(sbi->s_super, __func__, 2747 nilfs_warning(sci->sc_super, __func__,
2756 "dirty file(s) after the final construction\n"); 2748 "dirty file(s) after the final construction\n");
2757 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1); 2749 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
2758 } 2750 }
2759 2751
2760 WARN_ON(!list_empty(&sci->sc_segbufs)); 2752 WARN_ON(!list_empty(&sci->sc_segbufs));
@@ -2762,79 +2754,78 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2762 2754
2763 nilfs_put_root(sci->sc_root); 2755 nilfs_put_root(sci->sc_root);
2764 2756
2765 down_write(&sbi->s_nilfs->ns_segctor_sem); 2757 down_write(&nilfs->ns_segctor_sem);
2766 2758
2767 del_timer_sync(&sci->sc_timer); 2759 del_timer_sync(&sci->sc_timer);
2768 kfree(sci); 2760 kfree(sci);
2769} 2761}
2770 2762
2771/** 2763/**
2772 * nilfs_attach_segment_constructor - attach a segment constructor 2764 * nilfs_attach_log_writer - attach log writer
2773 * @sbi: nilfs_sb_info 2765 * @sb: super block instance
2774 * @root: root object of the current filesystem tree 2766 * @root: root object of the current filesystem tree
2775 * 2767 *
2776 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info, 2768 * This allocates a log writer object, initializes it, and starts the
2777 * initializes it, and starts the segment constructor. 2769 * log writer.
2778 * 2770 *
2779 * Return Value: On success, 0 is returned. On error, one of the following 2771 * Return Value: On success, 0 is returned. On error, one of the following
2780 * negative error code is returned. 2772 * negative error code is returned.
2781 * 2773 *
2782 * %-ENOMEM - Insufficient memory available. 2774 * %-ENOMEM - Insufficient memory available.
2783 */ 2775 */
2784int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 2776int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
2785 struct nilfs_root *root)
2786{ 2777{
2778 struct the_nilfs *nilfs = sb->s_fs_info;
2787 int err; 2779 int err;
2788 2780
2789 if (NILFS_SC(sbi)) { 2781 if (nilfs->ns_writer) {
2790 /* 2782 /*
2791 * This happens if the filesystem was remounted 2783 * This happens if the filesystem was remounted
2792 * read/write after nilfs_error degenerated it into a 2784 * read/write after nilfs_error degenerated it into a
2793 * read-only mount. 2785 * read-only mount.
2794 */ 2786 */
2795 nilfs_detach_segment_constructor(sbi); 2787 nilfs_detach_log_writer(sb);
2796 } 2788 }
2797 2789
2798 sbi->s_sc_info = nilfs_segctor_new(sbi, root); 2790 nilfs->ns_writer = nilfs_segctor_new(sb, root);
2799 if (!sbi->s_sc_info) 2791 if (!nilfs->ns_writer)
2800 return -ENOMEM; 2792 return -ENOMEM;
2801 2793
2802 err = nilfs_segctor_start_thread(NILFS_SC(sbi)); 2794 err = nilfs_segctor_start_thread(nilfs->ns_writer);
2803 if (err) { 2795 if (err) {
2804 kfree(sbi->s_sc_info); 2796 kfree(nilfs->ns_writer);
2805 sbi->s_sc_info = NULL; 2797 nilfs->ns_writer = NULL;
2806 } 2798 }
2807 return err; 2799 return err;
2808} 2800}
2809 2801
2810/** 2802/**
2811 * nilfs_detach_segment_constructor - destroy the segment constructor 2803 * nilfs_detach_log_writer - destroy log writer
2812 * @sbi: nilfs_sb_info 2804 * @sb: super block instance
2813 * 2805 *
2814 * nilfs_detach_segment_constructor() kills the segment constructor daemon, 2806 * This kills log writer daemon, frees the log writer object, and
2815 * frees the struct nilfs_sc_info, and destroy the dirty file list. 2807 * destroys list of dirty files.
2816 */ 2808 */
2817void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi) 2809void nilfs_detach_log_writer(struct super_block *sb)
2818{ 2810{
2819 struct the_nilfs *nilfs = sbi->s_nilfs; 2811 struct the_nilfs *nilfs = sb->s_fs_info;
2820 LIST_HEAD(garbage_list); 2812 LIST_HEAD(garbage_list);
2821 2813
2822 down_write(&nilfs->ns_segctor_sem); 2814 down_write(&nilfs->ns_segctor_sem);
2823 if (NILFS_SC(sbi)) { 2815 if (nilfs->ns_writer) {
2824 nilfs_segctor_destroy(NILFS_SC(sbi)); 2816 nilfs_segctor_destroy(nilfs->ns_writer);
2825 sbi->s_sc_info = NULL; 2817 nilfs->ns_writer = NULL;
2826 } 2818 }
2827 2819
2828 /* Force to free the list of dirty files */ 2820 /* Force to free the list of dirty files */
2829 spin_lock(&sbi->s_inode_lock); 2821 spin_lock(&nilfs->ns_inode_lock);
2830 if (!list_empty(&sbi->s_dirty_files)) { 2822 if (!list_empty(&nilfs->ns_dirty_files)) {
2831 list_splice_init(&sbi->s_dirty_files, &garbage_list); 2823 list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
2832 nilfs_warning(sbi->s_super, __func__, 2824 nilfs_warning(sb, __func__,
2833 "Non empty dirty list after the last " 2825 "Hit dirty file after stopped log writer\n");
2834 "segment construction\n"); 2826 }
2835 } 2827 spin_unlock(&nilfs->ns_inode_lock);
2836 spin_unlock(&sbi->s_inode_lock);
2837 up_write(&nilfs->ns_segctor_sem); 2828 up_write(&nilfs->ns_segctor_sem);
2838 2829
2839 nilfs_dispose_list(sbi, &garbage_list, 1); 2830 nilfs_dispose_list(nilfs, &garbage_list, 1);
2840} 2831}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index cd8056e7cbed..6c02a86745fb 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -27,7 +27,7 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h> 29#include <linux/nilfs2_fs.h>
30#include "sb.h" 30#include "nilfs.h"
31 31
32struct nilfs_root; 32struct nilfs_root;
33 33
@@ -88,7 +88,6 @@ struct nilfs_segsum_pointer {
88/** 88/**
89 * struct nilfs_sc_info - Segment constructor information 89 * struct nilfs_sc_info - Segment constructor information
90 * @sc_super: Back pointer to super_block struct 90 * @sc_super: Back pointer to super_block struct
91 * @sc_sbi: Back pointer to nilfs_sb_info struct
92 * @sc_root: root object of the current filesystem tree 91 * @sc_root: root object of the current filesystem tree
93 * @sc_nblk_inc: Block count of current generation 92 * @sc_nblk_inc: Block count of current generation
94 * @sc_dirty_files: List of files to be written 93 * @sc_dirty_files: List of files to be written
@@ -131,7 +130,6 @@ struct nilfs_segsum_pointer {
131 */ 130 */
132struct nilfs_sc_info { 131struct nilfs_sc_info {
133 struct super_block *sc_super; 132 struct super_block *sc_super;
134 struct nilfs_sb_info *sc_sbi;
135 struct nilfs_root *sc_root; 133 struct nilfs_root *sc_root;
136 134
137 unsigned long sc_nblk_inc; 135 unsigned long sc_nblk_inc;
@@ -235,18 +233,16 @@ extern void nilfs_flush_segment(struct super_block *, ino_t);
235extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, 233extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *,
236 void **); 234 void **);
237 235
238int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi, 236int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root);
239 struct nilfs_root *root); 237void nilfs_detach_log_writer(struct super_block *sb);
240extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
241 238
242/* recovery.c */ 239/* recovery.c */
243extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, 240extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t,
244 struct buffer_head **, int); 241 struct buffer_head **, int);
245extern int nilfs_search_super_root(struct the_nilfs *, 242extern int nilfs_search_super_root(struct the_nilfs *,
246 struct nilfs_recovery_info *); 243 struct nilfs_recovery_info *);
247extern int nilfs_salvage_orphan_logs(struct the_nilfs *, 244int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, struct super_block *sb,
248 struct nilfs_sb_info *, 245 struct nilfs_recovery_info *ri);
249 struct nilfs_recovery_info *);
250extern void nilfs_dispose_segment_list(struct list_head *); 246extern void nilfs_dispose_segment_list(struct list_head *);
251 247
252#endif /* _NILFS_SEGMENT_H */ 248#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 58fd707174e1..062cca065195 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -43,7 +43,6 @@
43#include <linux/init.h> 43#include <linux/init.h>
44#include <linux/blkdev.h> 44#include <linux/blkdev.h>
45#include <linux/parser.h> 45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h> 46#include <linux/crc32.h>
48#include <linux/vfs.h> 47#include <linux/vfs.h>
49#include <linux/writeback.h> 48#include <linux/writeback.h>
@@ -72,23 +71,23 @@ struct kmem_cache *nilfs_transaction_cachep;
72struct kmem_cache *nilfs_segbuf_cachep; 71struct kmem_cache *nilfs_segbuf_cachep;
73struct kmem_cache *nilfs_btree_path_cache; 72struct kmem_cache *nilfs_btree_path_cache;
74 73
75static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount); 74static int nilfs_setup_super(struct super_block *sb, int is_mount);
76static int nilfs_remount(struct super_block *sb, int *flags, char *data); 75static int nilfs_remount(struct super_block *sb, int *flags, char *data);
77 76
78static void nilfs_set_error(struct nilfs_sb_info *sbi) 77static void nilfs_set_error(struct super_block *sb)
79{ 78{
80 struct the_nilfs *nilfs = sbi->s_nilfs; 79 struct the_nilfs *nilfs = sb->s_fs_info;
81 struct nilfs_super_block **sbp; 80 struct nilfs_super_block **sbp;
82 81
83 down_write(&nilfs->ns_sem); 82 down_write(&nilfs->ns_sem);
84 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { 83 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
85 nilfs->ns_mount_state |= NILFS_ERROR_FS; 84 nilfs->ns_mount_state |= NILFS_ERROR_FS;
86 sbp = nilfs_prepare_super(sbi, 0); 85 sbp = nilfs_prepare_super(sb, 0);
87 if (likely(sbp)) { 86 if (likely(sbp)) {
88 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 87 sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
89 if (sbp[1]) 88 if (sbp[1])
90 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); 89 sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS);
91 nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 90 nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
92 } 91 }
93 } 92 }
94 up_write(&nilfs->ns_sem); 93 up_write(&nilfs->ns_sem);
@@ -109,7 +108,7 @@ static void nilfs_set_error(struct nilfs_sb_info *sbi)
109void nilfs_error(struct super_block *sb, const char *function, 108void nilfs_error(struct super_block *sb, const char *function,
110 const char *fmt, ...) 109 const char *fmt, ...)
111{ 110{
112 struct nilfs_sb_info *sbi = NILFS_SB(sb); 111 struct the_nilfs *nilfs = sb->s_fs_info;
113 struct va_format vaf; 112 struct va_format vaf;
114 va_list args; 113 va_list args;
115 114
@@ -124,15 +123,15 @@ void nilfs_error(struct super_block *sb, const char *function,
124 va_end(args); 123 va_end(args);
125 124
126 if (!(sb->s_flags & MS_RDONLY)) { 125 if (!(sb->s_flags & MS_RDONLY)) {
127 nilfs_set_error(sbi); 126 nilfs_set_error(sb);
128 127
129 if (nilfs_test_opt(sbi, ERRORS_RO)) { 128 if (nilfs_test_opt(nilfs, ERRORS_RO)) {
130 printk(KERN_CRIT "Remounting filesystem read-only\n"); 129 printk(KERN_CRIT "Remounting filesystem read-only\n");
131 sb->s_flags |= MS_RDONLY; 130 sb->s_flags |= MS_RDONLY;
132 } 131 }
133 } 132 }
134 133
135 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 134 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
136 panic("NILFS (device %s): panic forced after error\n", 135 panic("NILFS (device %s): panic forced after error\n",
137 sb->s_id); 136 sb->s_id);
138} 137}
@@ -189,14 +188,14 @@ void nilfs_destroy_inode(struct inode *inode)
189 call_rcu(&inode->i_rcu, nilfs_i_callback); 188 call_rcu(&inode->i_rcu, nilfs_i_callback);
190} 189}
191 190
192static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) 191static int nilfs_sync_super(struct super_block *sb, int flag)
193{ 192{
194 struct the_nilfs *nilfs = sbi->s_nilfs; 193 struct the_nilfs *nilfs = sb->s_fs_info;
195 int err; 194 int err;
196 195
197 retry: 196 retry:
198 set_buffer_dirty(nilfs->ns_sbh[0]); 197 set_buffer_dirty(nilfs->ns_sbh[0]);
199 if (nilfs_test_opt(sbi, BARRIER)) { 198 if (nilfs_test_opt(nilfs, BARRIER)) {
200 err = __sync_dirty_buffer(nilfs->ns_sbh[0], 199 err = __sync_dirty_buffer(nilfs->ns_sbh[0],
201 WRITE_SYNC | WRITE_FLUSH_FUA); 200 WRITE_SYNC | WRITE_FLUSH_FUA);
202 } else { 201 } else {
@@ -263,10 +262,10 @@ void nilfs_set_log_cursor(struct nilfs_super_block *sbp,
263 spin_unlock(&nilfs->ns_last_segment_lock); 262 spin_unlock(&nilfs->ns_last_segment_lock);
264} 263}
265 264
266struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, 265struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
267 int flip) 266 int flip)
268{ 267{
269 struct the_nilfs *nilfs = sbi->s_nilfs; 268 struct the_nilfs *nilfs = sb->s_fs_info;
270 struct nilfs_super_block **sbp = nilfs->ns_sbp; 269 struct nilfs_super_block **sbp = nilfs->ns_sbp;
271 270
272 /* nilfs->ns_sem must be locked by the caller. */ 271 /* nilfs->ns_sem must be locked by the caller. */
@@ -276,7 +275,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
276 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); 275 memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
277 } else { 276 } else {
278 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", 277 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
279 sbi->s_super->s_id); 278 sb->s_id);
280 return NULL; 279 return NULL;
281 } 280 }
282 } else if (sbp[1] && 281 } else if (sbp[1] &&
@@ -290,9 +289,9 @@ struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi,
290 return sbp; 289 return sbp;
291} 290}
292 291
293int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) 292int nilfs_commit_super(struct super_block *sb, int flag)
294{ 293{
295 struct the_nilfs *nilfs = sbi->s_nilfs; 294 struct the_nilfs *nilfs = sb->s_fs_info;
296 struct nilfs_super_block **sbp = nilfs->ns_sbp; 295 struct nilfs_super_block **sbp = nilfs->ns_sbp;
297 time_t t; 296 time_t t;
298 297
@@ -312,27 +311,28 @@ int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag)
312 nilfs->ns_sbsize)); 311 nilfs->ns_sbsize));
313 } 312 }
314 clear_nilfs_sb_dirty(nilfs); 313 clear_nilfs_sb_dirty(nilfs);
315 return nilfs_sync_super(sbi, flag); 314 return nilfs_sync_super(sb, flag);
316} 315}
317 316
318/** 317/**
319 * nilfs_cleanup_super() - write filesystem state for cleanup 318 * nilfs_cleanup_super() - write filesystem state for cleanup
320 * @sbi: nilfs_sb_info to be unmounted or degraded to read-only 319 * @sb: super block instance to be unmounted or degraded to read-only
321 * 320 *
322 * This function restores state flags in the on-disk super block. 321 * This function restores state flags in the on-disk super block.
323 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the 322 * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the
324 * filesystem was not clean previously. 323 * filesystem was not clean previously.
325 */ 324 */
326int nilfs_cleanup_super(struct nilfs_sb_info *sbi) 325int nilfs_cleanup_super(struct super_block *sb)
327{ 326{
327 struct the_nilfs *nilfs = sb->s_fs_info;
328 struct nilfs_super_block **sbp; 328 struct nilfs_super_block **sbp;
329 int flag = NILFS_SB_COMMIT; 329 int flag = NILFS_SB_COMMIT;
330 int ret = -EIO; 330 int ret = -EIO;
331 331
332 sbp = nilfs_prepare_super(sbi, 0); 332 sbp = nilfs_prepare_super(sb, 0);
333 if (sbp) { 333 if (sbp) {
334 sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); 334 sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
335 nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); 335 nilfs_set_log_cursor(sbp[0], nilfs);
336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { 336 if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) {
337 /* 337 /*
338 * make the "clean" flag also to the opposite 338 * make the "clean" flag also to the opposite
@@ -342,21 +342,20 @@ int nilfs_cleanup_super(struct nilfs_sb_info *sbi)
342 sbp[1]->s_state = sbp[0]->s_state; 342 sbp[1]->s_state = sbp[0]->s_state;
343 flag = NILFS_SB_COMMIT_ALL; 343 flag = NILFS_SB_COMMIT_ALL;
344 } 344 }
345 ret = nilfs_commit_super(sbi, flag); 345 ret = nilfs_commit_super(sb, flag);
346 } 346 }
347 return ret; 347 return ret;
348} 348}
349 349
350static void nilfs_put_super(struct super_block *sb) 350static void nilfs_put_super(struct super_block *sb)
351{ 351{
352 struct nilfs_sb_info *sbi = NILFS_SB(sb); 352 struct the_nilfs *nilfs = sb->s_fs_info;
353 struct the_nilfs *nilfs = sbi->s_nilfs;
354 353
355 nilfs_detach_segment_constructor(sbi); 354 nilfs_detach_log_writer(sb);
356 355
357 if (!(sb->s_flags & MS_RDONLY)) { 356 if (!(sb->s_flags & MS_RDONLY)) {
358 down_write(&nilfs->ns_sem); 357 down_write(&nilfs->ns_sem);
359 nilfs_cleanup_super(sbi); 358 nilfs_cleanup_super(sb);
360 up_write(&nilfs->ns_sem); 359 up_write(&nilfs->ns_sem);
361 } 360 }
362 361
@@ -365,15 +364,12 @@ static void nilfs_put_super(struct super_block *sb)
365 iput(nilfs->ns_dat); 364 iput(nilfs->ns_dat);
366 365
367 destroy_nilfs(nilfs); 366 destroy_nilfs(nilfs);
368 sbi->s_super = NULL;
369 sb->s_fs_info = NULL; 367 sb->s_fs_info = NULL;
370 kfree(sbi);
371} 368}
372 369
373static int nilfs_sync_fs(struct super_block *sb, int wait) 370static int nilfs_sync_fs(struct super_block *sb, int wait)
374{ 371{
375 struct nilfs_sb_info *sbi = NILFS_SB(sb); 372 struct the_nilfs *nilfs = sb->s_fs_info;
376 struct the_nilfs *nilfs = sbi->s_nilfs;
377 struct nilfs_super_block **sbp; 373 struct nilfs_super_block **sbp;
378 int err = 0; 374 int err = 0;
379 375
@@ -383,10 +379,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
383 379
384 down_write(&nilfs->ns_sem); 380 down_write(&nilfs->ns_sem);
385 if (nilfs_sb_dirty(nilfs)) { 381 if (nilfs_sb_dirty(nilfs)) {
386 sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); 382 sbp = nilfs_prepare_super(sb, nilfs_sb_will_flip(nilfs));
387 if (likely(sbp)) { 383 if (likely(sbp)) {
388 nilfs_set_log_cursor(sbp[0], nilfs); 384 nilfs_set_log_cursor(sbp[0], nilfs);
389 nilfs_commit_super(sbi, NILFS_SB_COMMIT); 385 nilfs_commit_super(sb, NILFS_SB_COMMIT);
390 } 386 }
391 } 387 }
392 up_write(&nilfs->ns_sem); 388 up_write(&nilfs->ns_sem);
@@ -394,10 +390,10 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
394 return err; 390 return err;
395} 391}
396 392
397int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt, 393int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
398 struct nilfs_root **rootp) 394 struct nilfs_root **rootp)
399{ 395{
400 struct the_nilfs *nilfs = sbi->s_nilfs; 396 struct the_nilfs *nilfs = sb->s_fs_info;
401 struct nilfs_root *root; 397 struct nilfs_root *root;
402 struct nilfs_checkpoint *raw_cp; 398 struct nilfs_checkpoint *raw_cp;
403 struct buffer_head *bh_cp; 399 struct buffer_head *bh_cp;
@@ -426,7 +422,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
426 goto failed; 422 goto failed;
427 } 423 }
428 424
429 err = nilfs_ifile_read(sbi->s_super, root, nilfs->ns_inode_size, 425 err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
430 &raw_cp->cp_ifile_inode, &root->ifile); 426 &raw_cp->cp_ifile_inode, &root->ifile);
431 if (err) 427 if (err)
432 goto failed_bh; 428 goto failed_bh;
@@ -450,8 +446,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno, int curr_mnt,
450 446
451static int nilfs_freeze(struct super_block *sb) 447static int nilfs_freeze(struct super_block *sb)
452{ 448{
453 struct nilfs_sb_info *sbi = NILFS_SB(sb); 449 struct the_nilfs *nilfs = sb->s_fs_info;
454 struct the_nilfs *nilfs = sbi->s_nilfs;
455 int err; 450 int err;
456 451
457 if (sb->s_flags & MS_RDONLY) 452 if (sb->s_flags & MS_RDONLY)
@@ -459,21 +454,20 @@ static int nilfs_freeze(struct super_block *sb)
459 454
460 /* Mark super block clean */ 455 /* Mark super block clean */
461 down_write(&nilfs->ns_sem); 456 down_write(&nilfs->ns_sem);
462 err = nilfs_cleanup_super(sbi); 457 err = nilfs_cleanup_super(sb);
463 up_write(&nilfs->ns_sem); 458 up_write(&nilfs->ns_sem);
464 return err; 459 return err;
465} 460}
466 461
467static int nilfs_unfreeze(struct super_block *sb) 462static int nilfs_unfreeze(struct super_block *sb)
468{ 463{
469 struct nilfs_sb_info *sbi = NILFS_SB(sb); 464 struct the_nilfs *nilfs = sb->s_fs_info;
470 struct the_nilfs *nilfs = sbi->s_nilfs;
471 465
472 if (sb->s_flags & MS_RDONLY) 466 if (sb->s_flags & MS_RDONLY)
473 return 0; 467 return 0;
474 468
475 down_write(&nilfs->ns_sem); 469 down_write(&nilfs->ns_sem);
476 nilfs_setup_super(sbi, false); 470 nilfs_setup_super(sb, false);
477 up_write(&nilfs->ns_sem); 471 up_write(&nilfs->ns_sem);
478 return 0; 472 return 0;
479} 473}
@@ -530,22 +524,22 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
530static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) 524static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
531{ 525{
532 struct super_block *sb = vfs->mnt_sb; 526 struct super_block *sb = vfs->mnt_sb;
533 struct nilfs_sb_info *sbi = NILFS_SB(sb); 527 struct the_nilfs *nilfs = sb->s_fs_info;
534 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root; 528 struct nilfs_root *root = NILFS_I(vfs->mnt_root->d_inode)->i_root;
535 529
536 if (!nilfs_test_opt(sbi, BARRIER)) 530 if (!nilfs_test_opt(nilfs, BARRIER))
537 seq_puts(seq, ",nobarrier"); 531 seq_puts(seq, ",nobarrier");
538 if (root->cno != NILFS_CPTREE_CURRENT_CNO) 532 if (root->cno != NILFS_CPTREE_CURRENT_CNO)
539 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno); 533 seq_printf(seq, ",cp=%llu", (unsigned long long)root->cno);
540 if (nilfs_test_opt(sbi, ERRORS_PANIC)) 534 if (nilfs_test_opt(nilfs, ERRORS_PANIC))
541 seq_puts(seq, ",errors=panic"); 535 seq_puts(seq, ",errors=panic");
542 if (nilfs_test_opt(sbi, ERRORS_CONT)) 536 if (nilfs_test_opt(nilfs, ERRORS_CONT))
543 seq_puts(seq, ",errors=continue"); 537 seq_puts(seq, ",errors=continue");
544 if (nilfs_test_opt(sbi, STRICT_ORDER)) 538 if (nilfs_test_opt(nilfs, STRICT_ORDER))
545 seq_puts(seq, ",order=strict"); 539 seq_puts(seq, ",order=strict");
546 if (nilfs_test_opt(sbi, NORECOVERY)) 540 if (nilfs_test_opt(nilfs, NORECOVERY))
547 seq_puts(seq, ",norecovery"); 541 seq_puts(seq, ",norecovery");
548 if (nilfs_test_opt(sbi, DISCARD)) 542 if (nilfs_test_opt(nilfs, DISCARD))
549 seq_puts(seq, ",discard"); 543 seq_puts(seq, ",discard");
550 544
551 return 0; 545 return 0;
@@ -594,7 +588,7 @@ static match_table_t tokens = {
594 588
595static int parse_options(char *options, struct super_block *sb, int is_remount) 589static int parse_options(char *options, struct super_block *sb, int is_remount)
596{ 590{
597 struct nilfs_sb_info *sbi = NILFS_SB(sb); 591 struct the_nilfs *nilfs = sb->s_fs_info;
598 char *p; 592 char *p;
599 substring_t args[MAX_OPT_ARGS]; 593 substring_t args[MAX_OPT_ARGS];
600 594
@@ -609,29 +603,29 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
609 token = match_token(p, tokens, args); 603 token = match_token(p, tokens, args);
610 switch (token) { 604 switch (token) {
611 case Opt_barrier: 605 case Opt_barrier:
612 nilfs_set_opt(sbi, BARRIER); 606 nilfs_set_opt(nilfs, BARRIER);
613 break; 607 break;
614 case Opt_nobarrier: 608 case Opt_nobarrier:
615 nilfs_clear_opt(sbi, BARRIER); 609 nilfs_clear_opt(nilfs, BARRIER);
616 break; 610 break;
617 case Opt_order: 611 case Opt_order:
618 if (strcmp(args[0].from, "relaxed") == 0) 612 if (strcmp(args[0].from, "relaxed") == 0)
619 /* Ordered data semantics */ 613 /* Ordered data semantics */
620 nilfs_clear_opt(sbi, STRICT_ORDER); 614 nilfs_clear_opt(nilfs, STRICT_ORDER);
621 else if (strcmp(args[0].from, "strict") == 0) 615 else if (strcmp(args[0].from, "strict") == 0)
622 /* Strict in-order semantics */ 616 /* Strict in-order semantics */
623 nilfs_set_opt(sbi, STRICT_ORDER); 617 nilfs_set_opt(nilfs, STRICT_ORDER);
624 else 618 else
625 return 0; 619 return 0;
626 break; 620 break;
627 case Opt_err_panic: 621 case Opt_err_panic:
628 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC); 622 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_PANIC);
629 break; 623 break;
630 case Opt_err_ro: 624 case Opt_err_ro:
631 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO); 625 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_RO);
632 break; 626 break;
633 case Opt_err_cont: 627 case Opt_err_cont:
634 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT); 628 nilfs_write_opt(nilfs, ERROR_MODE, ERRORS_CONT);
635 break; 629 break;
636 case Opt_snapshot: 630 case Opt_snapshot:
637 if (is_remount) { 631 if (is_remount) {
@@ -642,13 +636,13 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
642 } 636 }
643 break; 637 break;
644 case Opt_norecovery: 638 case Opt_norecovery:
645 nilfs_set_opt(sbi, NORECOVERY); 639 nilfs_set_opt(nilfs, NORECOVERY);
646 break; 640 break;
647 case Opt_discard: 641 case Opt_discard:
648 nilfs_set_opt(sbi, DISCARD); 642 nilfs_set_opt(nilfs, DISCARD);
649 break; 643 break;
650 case Opt_nodiscard: 644 case Opt_nodiscard:
651 nilfs_clear_opt(sbi, DISCARD); 645 nilfs_clear_opt(nilfs, DISCARD);
652 break; 646 break;
653 default: 647 default:
654 printk(KERN_ERR 648 printk(KERN_ERR
@@ -660,22 +654,24 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
660} 654}
661 655
662static inline void 656static inline void
663nilfs_set_default_options(struct nilfs_sb_info *sbi, 657nilfs_set_default_options(struct super_block *sb,
664 struct nilfs_super_block *sbp) 658 struct nilfs_super_block *sbp)
665{ 659{
666 sbi->s_mount_opt = 660 struct the_nilfs *nilfs = sb->s_fs_info;
661
662 nilfs->ns_mount_opt =
667 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER; 663 NILFS_MOUNT_ERRORS_RO | NILFS_MOUNT_BARRIER;
668} 664}
669 665
670static int nilfs_setup_super(struct nilfs_sb_info *sbi, int is_mount) 666static int nilfs_setup_super(struct super_block *sb, int is_mount)
671{ 667{
672 struct the_nilfs *nilfs = sbi->s_nilfs; 668 struct the_nilfs *nilfs = sb->s_fs_info;
673 struct nilfs_super_block **sbp; 669 struct nilfs_super_block **sbp;
674 int max_mnt_count; 670 int max_mnt_count;
675 int mnt_count; 671 int mnt_count;
676 672
677 /* nilfs->ns_sem must be locked by the caller. */ 673 /* nilfs->ns_sem must be locked by the caller. */
678 sbp = nilfs_prepare_super(sbi, 0); 674 sbp = nilfs_prepare_super(sb, 0);
679 if (!sbp) 675 if (!sbp)
680 return -EIO; 676 return -EIO;
681 677
@@ -706,7 +702,7 @@ skip_mount_setup:
706 /* synchronize sbp[1] with sbp[0] */ 702 /* synchronize sbp[1] with sbp[0] */
707 if (sbp[1]) 703 if (sbp[1])
708 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); 704 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
709 return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); 705 return nilfs_commit_super(sb, NILFS_SB_COMMIT_ALL);
710} 706}
711 707
712struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, 708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
@@ -727,7 +723,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
727 struct nilfs_super_block *sbp, 723 struct nilfs_super_block *sbp,
728 char *data) 724 char *data)
729{ 725{
730 struct nilfs_sb_info *sbi = NILFS_SB(sb); 726 struct the_nilfs *nilfs = sb->s_fs_info;
731 727
732 sb->s_magic = le16_to_cpu(sbp->s_magic); 728 sb->s_magic = le16_to_cpu(sbp->s_magic);
733 729
@@ -736,12 +732,12 @@ int nilfs_store_magic_and_option(struct super_block *sb,
736 sb->s_flags |= MS_NOATIME; 732 sb->s_flags |= MS_NOATIME;
737#endif 733#endif
738 734
739 nilfs_set_default_options(sbi, sbp); 735 nilfs_set_default_options(sb, sbp);
740 736
741 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid); 737 nilfs->ns_resuid = le16_to_cpu(sbp->s_def_resuid);
742 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid); 738 nilfs->ns_resgid = le16_to_cpu(sbp->s_def_resgid);
743 sbi->s_interval = le32_to_cpu(sbp->s_c_interval); 739 nilfs->ns_interval = le32_to_cpu(sbp->s_c_interval);
744 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); 740 nilfs->ns_watermark = le32_to_cpu(sbp->s_c_block_max);
745 741
746 return !parse_options(data, sb, 0) ? -EINVAL : 0 ; 742 return !parse_options(data, sb, 0) ? -EINVAL : 0 ;
747} 743}
@@ -822,7 +818,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
822static int nilfs_attach_snapshot(struct super_block *s, __u64 cno, 818static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
823 struct dentry **root_dentry) 819 struct dentry **root_dentry)
824{ 820{
825 struct the_nilfs *nilfs = NILFS_SB(s)->s_nilfs; 821 struct the_nilfs *nilfs = s->s_fs_info;
826 struct nilfs_root *root; 822 struct nilfs_root *root;
827 int ret; 823 int ret;
828 824
@@ -840,7 +836,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
840 goto out; 836 goto out;
841 } 837 }
842 838
843 ret = nilfs_attach_checkpoint(NILFS_SB(s), cno, false, &root); 839 ret = nilfs_attach_checkpoint(s, cno, false, &root);
844 if (ret) { 840 if (ret) {
845 printk(KERN_ERR "NILFS: error loading snapshot " 841 printk(KERN_ERR "NILFS: error loading snapshot "
846 "(checkpoint number=%llu).\n", 842 "(checkpoint number=%llu).\n",
@@ -874,7 +870,7 @@ static int nilfs_try_to_shrink_tree(struct dentry *root_dentry)
874 870
875int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) 871int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
876{ 872{
877 struct the_nilfs *nilfs = NILFS_SB(sb)->s_nilfs; 873 struct the_nilfs *nilfs = sb->s_fs_info;
878 struct nilfs_root *root; 874 struct nilfs_root *root;
879 struct inode *inode; 875 struct inode *inode;
880 struct dentry *dentry; 876 struct dentry *dentry;
@@ -887,7 +883,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno)
887 return true; /* protect recent checkpoints */ 883 return true; /* protect recent checkpoints */
888 884
889 ret = false; 885 ret = false;
890 root = nilfs_lookup_root(NILFS_SB(sb)->s_nilfs, cno); 886 root = nilfs_lookup_root(nilfs, cno);
891 if (root) { 887 if (root) {
892 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO); 888 inode = nilfs_ilookup(sb, root, NILFS_ROOT_INO);
893 if (inode) { 889 if (inode) {
@@ -917,43 +913,21 @@ static int
917nilfs_fill_super(struct super_block *sb, void *data, int silent) 913nilfs_fill_super(struct super_block *sb, void *data, int silent)
918{ 914{
919 struct the_nilfs *nilfs; 915 struct the_nilfs *nilfs;
920 struct nilfs_sb_info *sbi;
921 struct nilfs_root *fsroot; 916 struct nilfs_root *fsroot;
922 struct backing_dev_info *bdi; 917 struct backing_dev_info *bdi;
923 __u64 cno; 918 __u64 cno;
924 int err; 919 int err;
925 920
926 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 921 nilfs = alloc_nilfs(sb->s_bdev);
927 if (!sbi) 922 if (!nilfs)
928 return -ENOMEM; 923 return -ENOMEM;
929 924
930 sb->s_fs_info = sbi; 925 sb->s_fs_info = nilfs;
931 sbi->s_super = sb;
932
933 nilfs = alloc_nilfs(sb->s_bdev);
934 if (!nilfs) {
935 err = -ENOMEM;
936 goto failed_sbi;
937 }
938 sbi->s_nilfs = nilfs;
939 926
940 err = init_nilfs(nilfs, sbi, (char *)data); 927 err = init_nilfs(nilfs, sb, (char *)data);
941 if (err) 928 if (err)
942 goto failed_nilfs; 929 goto failed_nilfs;
943 930
944 spin_lock_init(&sbi->s_inode_lock);
945 INIT_LIST_HEAD(&sbi->s_dirty_files);
946
947 /*
948 * Following initialization is overlapped because
949 * nilfs_sb_info structure has been cleared at the beginning.
950 * But we reserve them to keep our interest and make ready
951 * for the future change.
952 */
953 get_random_bytes(&sbi->s_next_generation,
954 sizeof(sbi->s_next_generation));
955 spin_lock_init(&sbi->s_next_gen_lock);
956
957 sb->s_op = &nilfs_sops; 931 sb->s_op = &nilfs_sops;
958 sb->s_export_op = &nilfs_export_ops; 932 sb->s_export_op = &nilfs_export_ops;
959 sb->s_root = NULL; 933 sb->s_root = NULL;
@@ -962,12 +936,12 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
962 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 936 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
963 sb->s_bdi = bdi ? : &default_backing_dev_info; 937 sb->s_bdi = bdi ? : &default_backing_dev_info;
964 938
965 err = load_nilfs(nilfs, sbi); 939 err = load_nilfs(nilfs, sb);
966 if (err) 940 if (err)
967 goto failed_nilfs; 941 goto failed_nilfs;
968 942
969 cno = nilfs_last_cno(nilfs); 943 cno = nilfs_last_cno(nilfs);
970 err = nilfs_attach_checkpoint(sbi, cno, true, &fsroot); 944 err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
971 if (err) { 945 if (err) {
972 printk(KERN_ERR "NILFS: error loading last checkpoint " 946 printk(KERN_ERR "NILFS: error loading last checkpoint "
973 "(checkpoint number=%llu).\n", (unsigned long long)cno); 947 "(checkpoint number=%llu).\n", (unsigned long long)cno);
@@ -975,7 +949,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
975 } 949 }
976 950
977 if (!(sb->s_flags & MS_RDONLY)) { 951 if (!(sb->s_flags & MS_RDONLY)) {
978 err = nilfs_attach_segment_constructor(sbi, fsroot); 952 err = nilfs_attach_log_writer(sb, fsroot);
979 if (err) 953 if (err)
980 goto failed_checkpoint; 954 goto failed_checkpoint;
981 } 955 }
@@ -988,14 +962,14 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
988 962
989 if (!(sb->s_flags & MS_RDONLY)) { 963 if (!(sb->s_flags & MS_RDONLY)) {
990 down_write(&nilfs->ns_sem); 964 down_write(&nilfs->ns_sem);
991 nilfs_setup_super(sbi, true); 965 nilfs_setup_super(sb, true);
992 up_write(&nilfs->ns_sem); 966 up_write(&nilfs->ns_sem);
993 } 967 }
994 968
995 return 0; 969 return 0;
996 970
997 failed_segctor: 971 failed_segctor:
998 nilfs_detach_segment_constructor(sbi); 972 nilfs_detach_log_writer(sb);
999 973
1000 failed_checkpoint: 974 failed_checkpoint:
1001 nilfs_put_root(fsroot); 975 nilfs_put_root(fsroot);
@@ -1007,23 +981,18 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1007 981
1008 failed_nilfs: 982 failed_nilfs:
1009 destroy_nilfs(nilfs); 983 destroy_nilfs(nilfs);
1010
1011 failed_sbi:
1012 sb->s_fs_info = NULL;
1013 kfree(sbi);
1014 return err; 984 return err;
1015} 985}
1016 986
1017static int nilfs_remount(struct super_block *sb, int *flags, char *data) 987static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1018{ 988{
1019 struct nilfs_sb_info *sbi = NILFS_SB(sb); 989 struct the_nilfs *nilfs = sb->s_fs_info;
1020 struct the_nilfs *nilfs = sbi->s_nilfs;
1021 unsigned long old_sb_flags; 990 unsigned long old_sb_flags;
1022 unsigned long old_mount_opt; 991 unsigned long old_mount_opt;
1023 int err; 992 int err;
1024 993
1025 old_sb_flags = sb->s_flags; 994 old_sb_flags = sb->s_flags;
1026 old_mount_opt = sbi->s_mount_opt; 995 old_mount_opt = nilfs->ns_mount_opt;
1027 996
1028 if (!parse_options(data, sb, 1)) { 997 if (!parse_options(data, sb, 1)) {
1029 err = -EINVAL; 998 err = -EINVAL;
@@ -1043,8 +1012,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1043 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 1012 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
1044 goto out; 1013 goto out;
1045 if (*flags & MS_RDONLY) { 1014 if (*flags & MS_RDONLY) {
1046 /* Shutting down the segment constructor */ 1015 /* Shutting down log writer */
1047 nilfs_detach_segment_constructor(sbi); 1016 nilfs_detach_log_writer(sb);
1048 sb->s_flags |= MS_RDONLY; 1017 sb->s_flags |= MS_RDONLY;
1049 1018
1050 /* 1019 /*
@@ -1052,7 +1021,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1052 * the RDONLY flag and then mark the partition as valid again. 1021 * the RDONLY flag and then mark the partition as valid again.
1053 */ 1022 */
1054 down_write(&nilfs->ns_sem); 1023 down_write(&nilfs->ns_sem);
1055 nilfs_cleanup_super(sbi); 1024 nilfs_cleanup_super(sb);
1056 up_write(&nilfs->ns_sem); 1025 up_write(&nilfs->ns_sem);
1057 } else { 1026 } else {
1058 __u64 features; 1027 __u64 features;
@@ -1079,12 +1048,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1079 sb->s_flags &= ~MS_RDONLY; 1048 sb->s_flags &= ~MS_RDONLY;
1080 1049
1081 root = NILFS_I(sb->s_root->d_inode)->i_root; 1050 root = NILFS_I(sb->s_root->d_inode)->i_root;
1082 err = nilfs_attach_segment_constructor(sbi, root); 1051 err = nilfs_attach_log_writer(sb, root);
1083 if (err) 1052 if (err)
1084 goto restore_opts; 1053 goto restore_opts;
1085 1054
1086 down_write(&nilfs->ns_sem); 1055 down_write(&nilfs->ns_sem);
1087 nilfs_setup_super(sbi, true); 1056 nilfs_setup_super(sb, true);
1088 up_write(&nilfs->ns_sem); 1057 up_write(&nilfs->ns_sem);
1089 } 1058 }
1090 out: 1059 out:
@@ -1092,13 +1061,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
1092 1061
1093 restore_opts: 1062 restore_opts:
1094 sb->s_flags = old_sb_flags; 1063 sb->s_flags = old_sb_flags;
1095 sbi->s_mount_opt = old_mount_opt; 1064 nilfs->ns_mount_opt = old_mount_opt;
1096 return err; 1065 return err;
1097} 1066}
1098 1067
1099struct nilfs_super_data { 1068struct nilfs_super_data {
1100 struct block_device *bdev; 1069 struct block_device *bdev;
1101 struct nilfs_sb_info *sbi;
1102 __u64 cno; 1070 __u64 cno;
1103 int flags; 1071 int flags;
1104}; 1072};
@@ -1279,7 +1247,7 @@ static void nilfs_inode_init_once(void *obj)
1279#ifdef CONFIG_NILFS_XATTR 1247#ifdef CONFIG_NILFS_XATTR
1280 init_rwsem(&ii->xattr_sem); 1248 init_rwsem(&ii->xattr_sem);
1281#endif 1249#endif
1282 nilfs_btnode_cache_init_once(&ii->i_btnode_cache); 1250 address_space_init_once(&ii->i_btnode_cache);
1283 ii->i_bmap = &ii->i_bmap_data; 1251 ii->i_bmap = &ii->i_bmap_data;
1284 inode_init_once(&ii->vfs_inode); 1252 inode_init_once(&ii->vfs_inode);
1285} 1253}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index ad4ac607cf57..d2acd1a651f3 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -25,6 +25,7 @@
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/random.h>
28#include <linux/crc32.h> 29#include <linux/crc32.h>
29#include "nilfs.h" 30#include "nilfs.h"
30#include "segment.h" 31#include "segment.h"
@@ -75,7 +76,10 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
75 nilfs->ns_bdev = bdev; 76 nilfs->ns_bdev = bdev;
76 atomic_set(&nilfs->ns_ndirtyblks, 0); 77 atomic_set(&nilfs->ns_ndirtyblks, 0);
77 init_rwsem(&nilfs->ns_sem); 78 init_rwsem(&nilfs->ns_sem);
79 INIT_LIST_HEAD(&nilfs->ns_dirty_files);
78 INIT_LIST_HEAD(&nilfs->ns_gc_inodes); 80 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
81 spin_lock_init(&nilfs->ns_inode_lock);
82 spin_lock_init(&nilfs->ns_next_gen_lock);
79 spin_lock_init(&nilfs->ns_last_segment_lock); 83 spin_lock_init(&nilfs->ns_last_segment_lock);
80 nilfs->ns_cptree = RB_ROOT; 84 nilfs->ns_cptree = RB_ROOT;
81 spin_lock_init(&nilfs->ns_cptree_lock); 85 spin_lock_init(&nilfs->ns_cptree_lock);
@@ -197,16 +201,16 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
197/** 201/**
198 * load_nilfs - load and recover the nilfs 202 * load_nilfs - load and recover the nilfs
199 * @nilfs: the_nilfs structure to be released 203 * @nilfs: the_nilfs structure to be released
200 * @sbi: nilfs_sb_info used to recover past segment 204 * @sb: super block isntance used to recover past segment
201 * 205 *
202 * load_nilfs() searches and load the latest super root, 206 * load_nilfs() searches and load the latest super root,
203 * attaches the last segment, and does recovery if needed. 207 * attaches the last segment, and does recovery if needed.
204 * The caller must call this exclusively for simultaneous mounts. 208 * The caller must call this exclusively for simultaneous mounts.
205 */ 209 */
206int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) 210int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
207{ 211{
208 struct nilfs_recovery_info ri; 212 struct nilfs_recovery_info ri;
209 unsigned int s_flags = sbi->s_super->s_flags; 213 unsigned int s_flags = sb->s_flags;
210 int really_read_only = bdev_read_only(nilfs->ns_bdev); 214 int really_read_only = bdev_read_only(nilfs->ns_bdev);
211 int valid_fs = nilfs_valid_fs(nilfs); 215 int valid_fs = nilfs_valid_fs(nilfs);
212 int err; 216 int err;
@@ -271,7 +275,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
271 goto scan_error; 275 goto scan_error;
272 } 276 }
273 277
274 err = nilfs_load_super_root(nilfs, sbi->s_super, ri.ri_super_root); 278 err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
275 if (unlikely(err)) { 279 if (unlikely(err)) {
276 printk(KERN_ERR "NILFS: error loading super root.\n"); 280 printk(KERN_ERR "NILFS: error loading super root.\n");
277 goto failed; 281 goto failed;
@@ -283,7 +287,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
283 if (s_flags & MS_RDONLY) { 287 if (s_flags & MS_RDONLY) {
284 __u64 features; 288 __u64 features;
285 289
286 if (nilfs_test_opt(sbi, NORECOVERY)) { 290 if (nilfs_test_opt(nilfs, NORECOVERY)) {
287 printk(KERN_INFO "NILFS: norecovery option specified. " 291 printk(KERN_INFO "NILFS: norecovery option specified. "
288 "skipping roll-forward recovery\n"); 292 "skipping roll-forward recovery\n");
289 goto skip_recovery; 293 goto skip_recovery;
@@ -304,21 +308,21 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
304 err = -EROFS; 308 err = -EROFS;
305 goto failed_unload; 309 goto failed_unload;
306 } 310 }
307 sbi->s_super->s_flags &= ~MS_RDONLY; 311 sb->s_flags &= ~MS_RDONLY;
308 } else if (nilfs_test_opt(sbi, NORECOVERY)) { 312 } else if (nilfs_test_opt(nilfs, NORECOVERY)) {
309 printk(KERN_ERR "NILFS: recovery cancelled because norecovery " 313 printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
310 "option was specified for a read/write mount\n"); 314 "option was specified for a read/write mount\n");
311 err = -EINVAL; 315 err = -EINVAL;
312 goto failed_unload; 316 goto failed_unload;
313 } 317 }
314 318
315 err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri); 319 err = nilfs_salvage_orphan_logs(nilfs, sb, &ri);
316 if (err) 320 if (err)
317 goto failed_unload; 321 goto failed_unload;
318 322
319 down_write(&nilfs->ns_sem); 323 down_write(&nilfs->ns_sem);
320 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ 324 nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */
321 err = nilfs_cleanup_super(sbi); 325 err = nilfs_cleanup_super(sb);
322 up_write(&nilfs->ns_sem); 326 up_write(&nilfs->ns_sem);
323 327
324 if (err) { 328 if (err) {
@@ -330,7 +334,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
330 334
331 skip_recovery: 335 skip_recovery:
332 nilfs_clear_recovery_info(&ri); 336 nilfs_clear_recovery_info(&ri);
333 sbi->s_super->s_flags = s_flags; 337 sb->s_flags = s_flags;
334 return 0; 338 return 0;
335 339
336 scan_error: 340 scan_error:
@@ -344,7 +348,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
344 348
345 failed: 349 failed:
346 nilfs_clear_recovery_info(&ri); 350 nilfs_clear_recovery_info(&ri);
347 sbi->s_super->s_flags = s_flags; 351 sb->s_flags = s_flags;
348 return err; 352 return err;
349} 353}
350 354
@@ -475,10 +479,13 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
475 return -EIO; 479 return -EIO;
476 } 480 }
477 printk(KERN_WARNING 481 printk(KERN_WARNING
478 "NILFS warning: unable to read primary superblock\n"); 482 "NILFS warning: unable to read primary superblock "
479 } else if (!sbp[1]) 483 "(blocksize = %d)\n", blocksize);
484 } else if (!sbp[1]) {
480 printk(KERN_WARNING 485 printk(KERN_WARNING
481 "NILFS warning: unable to read secondary superblock\n"); 486 "NILFS warning: unable to read secondary superblock "
487 "(blocksize = %d)\n", blocksize);
488 }
482 489
483 /* 490 /*
484 * Compare two super blocks and set 1 in swp if the secondary 491 * Compare two super blocks and set 1 in swp if the secondary
@@ -505,7 +512,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
505 512
506 if (!valid[!swp]) 513 if (!valid[!swp])
507 printk(KERN_WARNING "NILFS warning: broken superblock. " 514 printk(KERN_WARNING "NILFS warning: broken superblock. "
508 "using spare superblock.\n"); 515 "using spare superblock (blocksize = %d).\n", blocksize);
509 if (swp) 516 if (swp)
510 nilfs_swap_super_block(nilfs); 517 nilfs_swap_super_block(nilfs);
511 518
@@ -519,7 +526,6 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
519/** 526/**
520 * init_nilfs - initialize a NILFS instance. 527 * init_nilfs - initialize a NILFS instance.
521 * @nilfs: the_nilfs structure 528 * @nilfs: the_nilfs structure
522 * @sbi: nilfs_sb_info
523 * @sb: super block 529 * @sb: super block
524 * @data: mount options 530 * @data: mount options
525 * 531 *
@@ -530,9 +536,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
530 * Return Value: On success, 0 is returned. On error, a negative error 536 * Return Value: On success, 0 is returned. On error, a negative error
531 * code is returned. 537 * code is returned.
532 */ 538 */
533int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) 539int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
534{ 540{
535 struct super_block *sb = sbi->s_super;
536 struct nilfs_super_block *sbp; 541 struct nilfs_super_block *sbp;
537 int blocksize; 542 int blocksize;
538 int err; 543 int err;
@@ -588,6 +593,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
588 nilfs->ns_blocksize_bits = sb->s_blocksize_bits; 593 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
589 nilfs->ns_blocksize = blocksize; 594 nilfs->ns_blocksize = blocksize;
590 595
596 get_random_bytes(&nilfs->ns_next_generation,
597 sizeof(nilfs->ns_next_generation));
598
591 err = nilfs_store_disk_layout(nilfs, sbp); 599 err = nilfs_store_disk_layout(nilfs, sbp);
592 if (err) 600 if (err)
593 goto failed_sbh; 601 goto failed_sbh;
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index fd85e4c05c6b..f4968145c2a3 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -31,7 +31,8 @@
31#include <linux/blkdev.h> 31#include <linux/blkdev.h>
32#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
33#include <linux/slab.h> 33#include <linux/slab.h>
34#include "sb.h" 34
35struct nilfs_sc_info;
35 36
36/* the_nilfs struct */ 37/* the_nilfs struct */
37enum { 38enum {
@@ -65,13 +66,23 @@ enum {
65 * @ns_last_cno: checkpoint number of the latest segment 66 * @ns_last_cno: checkpoint number of the latest segment
66 * @ns_prot_seq: least sequence number of segments which must not be reclaimed 67 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
67 * @ns_prev_seq: base sequence number used to decide if advance log cursor 68 * @ns_prev_seq: base sequence number used to decide if advance log cursor
68 * @ns_segctor_sem: segment constructor semaphore 69 * @ns_writer: log writer
70 * @ns_segctor_sem: semaphore protecting log write
69 * @ns_dat: DAT file inode 71 * @ns_dat: DAT file inode
70 * @ns_cpfile: checkpoint file inode 72 * @ns_cpfile: checkpoint file inode
71 * @ns_sufile: segusage file inode 73 * @ns_sufile: segusage file inode
72 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root) 74 * @ns_cptree: rb-tree of all mounted checkpoints (nilfs_root)
73 * @ns_cptree_lock: lock protecting @ns_cptree 75 * @ns_cptree_lock: lock protecting @ns_cptree
76 * @ns_dirty_files: list of dirty files
77 * @ns_inode_lock: lock protecting @ns_dirty_files
74 * @ns_gc_inodes: dummy inodes to keep live blocks 78 * @ns_gc_inodes: dummy inodes to keep live blocks
79 * @ns_next_generation: next generation number for inodes
80 * @ns_next_gen_lock: lock protecting @ns_next_generation
81 * @ns_mount_opt: mount options
82 * @ns_resuid: uid for reserved blocks
83 * @ns_resgid: gid for reserved blocks
84 * @ns_interval: checkpoint creation interval
85 * @ns_watermark: watermark for the number of dirty buffers
75 * @ns_blocksize_bits: bit length of block size 86 * @ns_blocksize_bits: bit length of block size
76 * @ns_blocksize: block size 87 * @ns_blocksize: block size
77 * @ns_nsegments: number of segments in filesystem 88 * @ns_nsegments: number of segments in filesystem
@@ -131,6 +142,7 @@ struct the_nilfs {
131 u64 ns_prot_seq; 142 u64 ns_prot_seq;
132 u64 ns_prev_seq; 143 u64 ns_prev_seq;
133 144
145 struct nilfs_sc_info *ns_writer;
134 struct rw_semaphore ns_segctor_sem; 146 struct rw_semaphore ns_segctor_sem;
135 147
136 /* 148 /*
@@ -145,9 +157,25 @@ struct the_nilfs {
145 struct rb_root ns_cptree; 157 struct rb_root ns_cptree;
146 spinlock_t ns_cptree_lock; 158 spinlock_t ns_cptree_lock;
147 159
160 /* Dirty inode list */
161 struct list_head ns_dirty_files;
162 spinlock_t ns_inode_lock;
163
148 /* GC inode list */ 164 /* GC inode list */
149 struct list_head ns_gc_inodes; 165 struct list_head ns_gc_inodes;
150 166
167 /* Inode allocator */
168 u32 ns_next_generation;
169 spinlock_t ns_next_gen_lock;
170
171 /* Mount options */
172 unsigned long ns_mount_opt;
173
174 uid_t ns_resuid;
175 gid_t ns_resgid;
176 unsigned long ns_interval;
177 unsigned long ns_watermark;
178
151 /* Disk layout information (static) */ 179 /* Disk layout information (static) */
152 unsigned int ns_blocksize_bits; 180 unsigned int ns_blocksize_bits;
153 unsigned int ns_blocksize; 181 unsigned int ns_blocksize;
@@ -180,6 +208,20 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
180THE_NILFS_FNS(GC_RUNNING, gc_running) 208THE_NILFS_FNS(GC_RUNNING, gc_running)
181THE_NILFS_FNS(SB_DIRTY, sb_dirty) 209THE_NILFS_FNS(SB_DIRTY, sb_dirty)
182 210
211/*
212 * Mount option operations
213 */
214#define nilfs_clear_opt(nilfs, opt) \
215 do { (nilfs)->ns_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
216#define nilfs_set_opt(nilfs, opt) \
217 do { (nilfs)->ns_mount_opt |= NILFS_MOUNT_##opt; } while (0)
218#define nilfs_test_opt(nilfs, opt) ((nilfs)->ns_mount_opt & NILFS_MOUNT_##opt)
219#define nilfs_write_opt(nilfs, mask, opt) \
220 do { (nilfs)->ns_mount_opt = \
221 (((nilfs)->ns_mount_opt & ~NILFS_MOUNT_##mask) | \
222 NILFS_MOUNT_##opt); \
223 } while (0)
224
183/** 225/**
184 * struct nilfs_root - nilfs root object 226 * struct nilfs_root - nilfs root object
185 * @cno: checkpoint number 227 * @cno: checkpoint number
@@ -224,15 +266,14 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
224void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); 266void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
225struct the_nilfs *alloc_nilfs(struct block_device *bdev); 267struct the_nilfs *alloc_nilfs(struct block_device *bdev);
226void destroy_nilfs(struct the_nilfs *nilfs); 268void destroy_nilfs(struct the_nilfs *nilfs);
227int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *); 269int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
228int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *); 270int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
229int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t); 271int nilfs_discard_segments(struct the_nilfs *, __u64 *, size_t);
230int nilfs_count_free_blocks(struct the_nilfs *, sector_t *); 272int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
231struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno); 273struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno);
232struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs, 274struct nilfs_root *nilfs_find_or_create_root(struct the_nilfs *nilfs,
233 __u64 cno); 275 __u64 cno);
234void nilfs_put_root(struct nilfs_root *root); 276void nilfs_put_root(struct nilfs_root *root);
235struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
236int nilfs_near_disk_full(struct the_nilfs *); 277int nilfs_near_disk_full(struct the_nilfs *);
237void nilfs_fall_back_super_block(struct the_nilfs *); 278void nilfs_fall_back_super_block(struct the_nilfs *);
238void nilfs_swap_super_block(struct the_nilfs *); 279void nilfs_swap_super_block(struct the_nilfs *);
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8b61220cffc5..9fde1c00a296 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -164,7 +164,7 @@ static int process_access_response(struct fsnotify_group *group,
164 fd, response); 164 fd, response);
165 /* 165 /*
166 * make sure the response is valid, if invalid we do nothing and either 166 * make sure the response is valid, if invalid we do nothing and either
167 * userspace can send a valid responce or we will clean it up after the 167 * userspace can send a valid response or we will clean it up after the
168 * timeout 168 * timeout
169 */ 169 */
170 switch (response) { 170 switch (response) {
@@ -876,7 +876,7 @@ SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark);
876#endif 876#endif
877 877
878/* 878/*
879 * fanotify_user_setup - Our initialization function. Note that we cannnot return 879 * fanotify_user_setup - Our initialization function. Note that we cannot return
880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 880 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
881 * must result in panic(). 881 * must result in panic().
882 */ 882 */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 4c29fcf557d1..07ea8d3e6ea2 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -22,13 +22,14 @@
22#include <linux/module.h> 22#include <linux/module.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/writeback.h> /* for inode_lock */
26 25
27#include <asm/atomic.h> 26#include <asm/atomic.h>
28 27
29#include <linux/fsnotify_backend.h> 28#include <linux/fsnotify_backend.h>
30#include "fsnotify.h" 29#include "fsnotify.h"
31 30
31#include "../internal.h"
32
32/* 33/*
33 * Recalculate the mask of events relevant to a given inode locked. 34 * Recalculate the mask of events relevant to a given inode locked.
34 */ 35 */
@@ -237,15 +238,14 @@ out:
237 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. 238 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
238 * @list: list of inodes being unmounted (sb->s_inodes) 239 * @list: list of inodes being unmounted (sb->s_inodes)
239 * 240 *
240 * Called with inode_lock held, protecting the unmounting super block's list 241 * Called during unmount with no locks held, so needs to be safe against
241 * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay. 242 * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block.
242 * We temporarily drop inode_lock, however, and CAN block.
243 */ 243 */
244void fsnotify_unmount_inodes(struct list_head *list) 244void fsnotify_unmount_inodes(struct list_head *list)
245{ 245{
246 struct inode *inode, *next_i, *need_iput = NULL; 246 struct inode *inode, *next_i, *need_iput = NULL;
247 247
248 spin_lock(&inode_lock); 248 spin_lock(&inode_sb_list_lock);
249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) { 249 list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
250 struct inode *need_iput_tmp; 250 struct inode *need_iput_tmp;
251 251
@@ -254,8 +254,11 @@ void fsnotify_unmount_inodes(struct list_head *list)
254 * I_WILL_FREE, or I_NEW which is fine because by that point 254 * I_WILL_FREE, or I_NEW which is fine because by that point
255 * the inode cannot have any associated watches. 255 * the inode cannot have any associated watches.
256 */ 256 */
257 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 257 spin_lock(&inode->i_lock);
258 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
259 spin_unlock(&inode->i_lock);
258 continue; 260 continue;
261 }
259 262
260 /* 263 /*
261 * If i_count is zero, the inode cannot have any watches and 264 * If i_count is zero, the inode cannot have any watches and
@@ -263,8 +266,10 @@ void fsnotify_unmount_inodes(struct list_head *list)
263 * evict all inodes with zero i_count from icache which is 266 * evict all inodes with zero i_count from icache which is
264 * unnecessarily violent and may in fact be illegal to do. 267 * unnecessarily violent and may in fact be illegal to do.
265 */ 268 */
266 if (!atomic_read(&inode->i_count)) 269 if (!atomic_read(&inode->i_count)) {
270 spin_unlock(&inode->i_lock);
267 continue; 271 continue;
272 }
268 273
269 need_iput_tmp = need_iput; 274 need_iput_tmp = need_iput;
270 need_iput = NULL; 275 need_iput = NULL;
@@ -274,22 +279,25 @@ void fsnotify_unmount_inodes(struct list_head *list)
274 __iget(inode); 279 __iget(inode);
275 else 280 else
276 need_iput_tmp = NULL; 281 need_iput_tmp = NULL;
282 spin_unlock(&inode->i_lock);
277 283
278 /* In case the dropping of a reference would nuke next_i. */ 284 /* In case the dropping of a reference would nuke next_i. */
279 if ((&next_i->i_sb_list != list) && 285 if ((&next_i->i_sb_list != list) &&
280 atomic_read(&next_i->i_count) && 286 atomic_read(&next_i->i_count)) {
281 !(next_i->i_state & (I_FREEING | I_WILL_FREE))) { 287 spin_lock(&next_i->i_lock);
282 __iget(next_i); 288 if (!(next_i->i_state & (I_FREEING | I_WILL_FREE))) {
283 need_iput = next_i; 289 __iget(next_i);
290 need_iput = next_i;
291 }
292 spin_unlock(&next_i->i_lock);
284 } 293 }
285 294
286 /* 295 /*
287 * We can safely drop inode_lock here because we hold 296 * We can safely drop inode_sb_list_lock here because we hold
288 * references on both inode and next_i. Also no new inodes 297 * references on both inode and next_i. Also no new inodes
289 * will be added since the umount has begun. Finally, 298 * will be added since the umount has begun.
290 * iprune_mutex keeps shrink_icache_memory() away.
291 */ 299 */
292 spin_unlock(&inode_lock); 300 spin_unlock(&inode_sb_list_lock);
293 301
294 if (need_iput_tmp) 302 if (need_iput_tmp)
295 iput(need_iput_tmp); 303 iput(need_iput_tmp);
@@ -301,7 +309,7 @@ void fsnotify_unmount_inodes(struct list_head *list)
301 309
302 iput(inode); 310 iput(inode);
303 311
304 spin_lock(&inode_lock); 312 spin_lock(&inode_sb_list_lock);
305 } 313 }
306 spin_unlock(&inode_lock); 314 spin_unlock(&inode_sb_list_lock);
307} 315}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index a91b69a6a291..e3cbd746f64a 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -194,10 +194,11 @@ static int idr_callback(int id, void *p, void *data)
194 194
195static void inotify_free_group_priv(struct fsnotify_group *group) 195static void inotify_free_group_priv(struct fsnotify_group *group)
196{ 196{
197 /* ideally the idr is empty and we won't hit the BUG in teh callback */ 197 /* ideally the idr is empty and we won't hit the BUG in the callback */
198 idr_for_each(&group->inotify_data.idr, idr_callback, group); 198 idr_for_each(&group->inotify_data.idr, idr_callback, group);
199 idr_remove_all(&group->inotify_data.idr); 199 idr_remove_all(&group->inotify_data.idr);
200 idr_destroy(&group->inotify_data.idr); 200 idr_destroy(&group->inotify_data.idr);
201 atomic_dec(&group->inotify_data.user->inotify_devs);
201 free_uid(group->inotify_data.user); 202 free_uid(group->inotify_data.user);
202} 203}
203 204
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 4cd5d5d78f9f..8445fbc8985c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -290,7 +290,6 @@ static int inotify_fasync(int fd, struct file *file, int on)
290static int inotify_release(struct inode *ignored, struct file *file) 290static int inotify_release(struct inode *ignored, struct file *file)
291{ 291{
292 struct fsnotify_group *group = file->private_data; 292 struct fsnotify_group *group = file->private_data;
293 struct user_struct *user = group->inotify_data.user;
294 293
295 pr_debug("%s: group=%p\n", __func__, group); 294 pr_debug("%s: group=%p\n", __func__, group);
296 295
@@ -299,8 +298,6 @@ static int inotify_release(struct inode *ignored, struct file *file)
299 /* free this group, matching get was inotify_init->fsnotify_obtain_group */ 298 /* free this group, matching get was inotify_init->fsnotify_obtain_group */
300 fsnotify_put_group(group); 299 fsnotify_put_group(group);
301 300
302 atomic_dec(&user->inotify_devs);
303
304 return 0; 301 return 0;
305} 302}
306 303
@@ -697,7 +694,7 @@ retry:
697 return ret; 694 return ret;
698} 695}
699 696
700static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events) 697static struct fsnotify_group *inotify_new_group(unsigned int max_events)
701{ 698{
702 struct fsnotify_group *group; 699 struct fsnotify_group *group;
703 700
@@ -710,8 +707,14 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
710 spin_lock_init(&group->inotify_data.idr_lock); 707 spin_lock_init(&group->inotify_data.idr_lock);
711 idr_init(&group->inotify_data.idr); 708 idr_init(&group->inotify_data.idr);
712 group->inotify_data.last_wd = 0; 709 group->inotify_data.last_wd = 0;
713 group->inotify_data.user = user;
714 group->inotify_data.fa = NULL; 710 group->inotify_data.fa = NULL;
711 group->inotify_data.user = get_current_user();
712
713 if (atomic_inc_return(&group->inotify_data.user->inotify_devs) >
714 inotify_max_user_instances) {
715 fsnotify_put_group(group);
716 return ERR_PTR(-EMFILE);
717 }
715 718
716 return group; 719 return group;
717} 720}
@@ -721,7 +724,6 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
721SYSCALL_DEFINE1(inotify_init1, int, flags) 724SYSCALL_DEFINE1(inotify_init1, int, flags)
722{ 725{
723 struct fsnotify_group *group; 726 struct fsnotify_group *group;
724 struct user_struct *user;
725 int ret; 727 int ret;
726 728
727 /* Check the IN_* constants for consistency. */ 729 /* Check the IN_* constants for consistency. */
@@ -731,31 +733,16 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
731 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK)) 733 if (flags & ~(IN_CLOEXEC | IN_NONBLOCK))
732 return -EINVAL; 734 return -EINVAL;
733 735
734 user = get_current_user();
735 if (unlikely(atomic_read(&user->inotify_devs) >=
736 inotify_max_user_instances)) {
737 ret = -EMFILE;
738 goto out_free_uid;
739 }
740
741 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */ 736 /* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
742 group = inotify_new_group(user, inotify_max_queued_events); 737 group = inotify_new_group(inotify_max_queued_events);
743 if (IS_ERR(group)) { 738 if (IS_ERR(group))
744 ret = PTR_ERR(group); 739 return PTR_ERR(group);
745 goto out_free_uid;
746 }
747
748 atomic_inc(&user->inotify_devs);
749 740
750 ret = anon_inode_getfd("inotify", &inotify_fops, group, 741 ret = anon_inode_getfd("inotify", &inotify_fops, group,
751 O_RDONLY | flags); 742 O_RDONLY | flags);
752 if (ret >= 0) 743 if (ret < 0)
753 return ret; 744 fsnotify_put_group(group);
754 745
755 fsnotify_put_group(group);
756 atomic_dec(&user->inotify_devs);
757out_free_uid:
758 free_uid(user);
759 return ret; 746 return ret;
760} 747}
761 748
@@ -841,7 +828,7 @@ out:
841} 828}
842 829
843/* 830/*
844 * inotify_user_setup - Our initialization function. Note that we cannnot return 831 * inotify_user_setup - Our initialization function. Note that we cannot return
845 * error because we have compiled-in VFS hooks. So an (unlikely) failure here 832 * error because we have compiled-in VFS hooks. So an (unlikely) failure here
846 * must result in panic(). 833 * must result in panic().
847 */ 834 */
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 325185e514bb..252ab1f6452b 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -24,7 +24,7 @@
24 * referencing this object. The object typically will live inside the kernel 24 * referencing this object. The object typically will live inside the kernel
25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task 25 * with a refcnt of 2, one for each list it is on (i_list, g_list). Any task
26 * which can find this object holding the appropriete locks, can take a reference 26 * which can find this object holding the appropriete locks, can take a reference
27 * and the object itself is guarenteed to survive until the reference is dropped. 27 * and the object itself is guaranteed to survive until the reference is dropped.
28 * 28 *
29 * LOCKING: 29 * LOCKING:
30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST 30 * There are 3 spinlocks involved with fsnotify inode marks and they MUST
@@ -91,7 +91,6 @@
91#include <linux/slab.h> 91#include <linux/slab.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/srcu.h> 93#include <linux/srcu.h>
94#include <linux/writeback.h> /* for inode_lock */
95 94
96#include <asm/atomic.h> 95#include <asm/atomic.h>
97 96
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
index 85eebff6d0d7..e86577d6c5c3 100644
--- a/fs/notify/vfsmount_mark.c
+++ b/fs/notify/vfsmount_mark.c
@@ -23,7 +23,6 @@
23#include <linux/mount.h> 23#include <linux/mount.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/spinlock.h> 25#include <linux/spinlock.h>
26#include <linux/writeback.h> /* for inode_lock */
27 26
28#include <asm/atomic.h> 27#include <asm/atomic.h>
29 28
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
index 4ff028fcfd6e..30206b238433 100644
--- a/fs/ntfs/Makefile
+++ b/fs/ntfs/Makefile
@@ -2,18 +2,13 @@
2 2
3obj-$(CONFIG_NTFS_FS) += ntfs.o 3obj-$(CONFIG_NTFS_FS) += ntfs.o
4 4
5ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ 5ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ 6 index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
7 unistr.o upcase.o 7 unistr.o upcase.o
8 8
9EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.30\" 9ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
10 10
11ifeq ($(CONFIG_NTFS_DEBUG),y) 11ccflags-y := -DNTFS_VERSION=\"2.1.30\"
12EXTRA_CFLAGS += -DDEBUG 12ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
13endif 13ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
14 14
15ifeq ($(CONFIG_NTFS_RW),y)
16EXTRA_CFLAGS += -DNTFS_RW
17
18ntfs-objs += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
19endif
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index c3c2c7ac9020..0b1e885b8cf8 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1543,8 +1543,6 @@ err_out:
1543 */ 1543 */
1544const struct address_space_operations ntfs_aops = { 1544const struct address_space_operations ntfs_aops = {
1545 .readpage = ntfs_readpage, /* Fill page with data. */ 1545 .readpage = ntfs_readpage, /* Fill page with data. */
1546 .sync_page = block_sync_page, /* Currently, just unplugs the
1547 disk request queue. */
1548#ifdef NTFS_RW 1546#ifdef NTFS_RW
1549 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1547 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1550#endif /* NTFS_RW */ 1548#endif /* NTFS_RW */
@@ -1560,8 +1558,6 @@ const struct address_space_operations ntfs_aops = {
1560 */ 1558 */
1561const struct address_space_operations ntfs_mst_aops = { 1559const struct address_space_operations ntfs_mst_aops = {
1562 .readpage = ntfs_readpage, /* Fill page with data. */ 1560 .readpage = ntfs_readpage, /* Fill page with data. */
1563 .sync_page = block_sync_page, /* Currently, just unplugs the
1564 disk request queue. */
1565#ifdef NTFS_RW 1561#ifdef NTFS_RW
1566 .writepage = ntfs_writepage, /* Write dirty page to disk. */ 1562 .writepage = ntfs_writepage, /* Write dirty page to disk. */
1567 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty 1563 .set_page_dirty = __set_page_dirty_nobuffers, /* Set the page dirty
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index f5094ee224c1..f14fde2b03d6 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -197,7 +197,7 @@ err_out:
197 } else if (ctx_needs_reset) { 197 } else if (ctx_needs_reset) {
198 /* 198 /*
199 * If there is no attribute list, restoring the search context 199 * If there is no attribute list, restoring the search context
200 * is acomplished simply by copying the saved context back over 200 * is accomplished simply by copying the saved context back over
201 * the caller supplied context. If there is an attribute list, 201 * the caller supplied context. If there is an attribute list,
202 * things are more complicated as we need to deal with mapping 202 * things are more complicated as we need to deal with mapping
203 * of mft records and resulting potential changes in pointers. 203 * of mft records and resulting potential changes in pointers.
@@ -1181,7 +1181,7 @@ not_found:
1181 * for, i.e. if one wants to add the attribute to the mft record this is the 1181 * for, i.e. if one wants to add the attribute to the mft record this is the
1182 * correct place to insert its attribute list entry into. 1182 * correct place to insert its attribute list entry into.
1183 * 1183 *
1184 * When -errno != -ENOENT, an error occured during the lookup. @ctx->attr is 1184 * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is
1185 * then undefined and in particular you should not rely on it not changing. 1185 * then undefined and in particular you should not rely on it not changing.
1186 */ 1186 */
1187int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, 1187int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 6551c7cbad92..ee4144ce5d7c 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -501,7 +501,7 @@ int ntfs_read_compressed_block(struct page *page)
501 VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >> 501 VCN start_vcn = (((s64)index << PAGE_CACHE_SHIFT) & ~cb_size_mask) >>
502 vol->cluster_size_bits; 502 vol->cluster_size_bits;
503 /* 503 /*
504 * The first vcn after the last wanted vcn (minumum alignment is again 504 * The first vcn after the last wanted vcn (minimum alignment is again
505 * PAGE_CACHE_SIZE. 505 * PAGE_CACHE_SIZE.
506 */ 506 */
507 VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1) 507 VCN end_vcn = ((((s64)(index + 1UL) << PAGE_CACHE_SHIFT) + cb_size - 1)
@@ -698,8 +698,7 @@ lock_retry_remap:
698 "uptodate! Unplugging the disk queue " 698 "uptodate! Unplugging the disk queue "
699 "and rescheduling."); 699 "and rescheduling.");
700 get_bh(tbh); 700 get_bh(tbh);
701 blk_run_address_space(mapping); 701 io_schedule();
702 schedule();
703 put_bh(tbh); 702 put_bh(tbh);
704 if (unlikely(!buffer_uptodate(tbh))) 703 if (unlikely(!buffer_uptodate(tbh)))
705 goto read_err; 704 goto read_err;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index a627ed82c0a3..c05d6dcf77a4 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -54,7 +54,7 @@
54 * 54 *
55 * Return 1 if the attributes match and 0 if not. 55 * Return 1 if the attributes match and 0 if not.
56 * 56 *
57 * NOTE: This function runs with the inode_lock spin lock held so it is not 57 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
58 * allowed to sleep. 58 * allowed to sleep.
59 */ 59 */
60int ntfs_test_inode(struct inode *vi, ntfs_attr *na) 60int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
@@ -98,7 +98,7 @@ int ntfs_test_inode(struct inode *vi, ntfs_attr *na)
98 * 98 *
99 * Return 0 on success and -errno on error. 99 * Return 0 on success and -errno on error.
100 * 100 *
101 * NOTE: This function runs with the inode_lock spin lock held so it is not 101 * NOTE: This function runs with the inode->i_lock spin lock held so it is not
102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.) 102 * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
103 */ 103 */
104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na) 104static int ntfs_init_locked_inode(struct inode *vi, ntfs_attr *na)
@@ -622,7 +622,7 @@ static int ntfs_read_locked_inode(struct inode *vi)
622 */ 622 */
623 /* Everyone gets all permissions. */ 623 /* Everyone gets all permissions. */
624 vi->i_mode |= S_IRWXUGO; 624 vi->i_mode |= S_IRWXUGO;
625 /* If read-only, noone gets write permissions. */ 625 /* If read-only, no one gets write permissions. */
626 if (IS_RDONLY(vi)) 626 if (IS_RDONLY(vi))
627 vi->i_mode &= ~S_IWUGO; 627 vi->i_mode &= ~S_IWUGO;
628 if (m->flags & MFT_RECORD_IS_DIRECTORY) { 628 if (m->flags & MFT_RECORD_IS_DIRECTORY) {
@@ -2529,7 +2529,7 @@ retry_truncate:
2529 * specifies that the behaviour is unspecified thus we do not 2529 * specifies that the behaviour is unspecified thus we do not
2530 * have to do anything. This means that in our implementation 2530 * have to do anything. This means that in our implementation
2531 * in the rare case that the file is mmap()ped and a write 2531 * in the rare case that the file is mmap()ped and a write
2532 * occured into the mmap()ped region just beyond the file size 2532 * occurred into the mmap()ped region just beyond the file size
2533 * and writepage has not yet been called to write out the page 2533 * and writepage has not yet been called to write out the page
2534 * (which would clear the area beyond the file size) and we now 2534 * (which would clear the area beyond the file size) and we now
2535 * extend the file size to incorporate this dirty region 2535 * extend the file size to incorporate this dirty region
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 8b2549f672bf..faece7190866 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -286,7 +286,7 @@ typedef le16 MFT_RECORD_FLAGS;
286 * fragmented. Volume free space includes the empty part of the mft zone and 286 * fragmented. Volume free space includes the empty part of the mft zone and
287 * when the volume's free 88% are used up, the mft zone is shrunk by a factor 287 * when the volume's free 88% are used up, the mft zone is shrunk by a factor
288 * of 2, thus making more space available for more files/data. This process is 288 * of 2, thus making more space available for more files/data. This process is
289 * repeated everytime there is no more free space except for the mft zone until 289 * repeated every time there is no more free space except for the mft zone until
290 * there really is no more free space. 290 * there really is no more free space.
291 */ 291 */
292 292
@@ -1657,13 +1657,13 @@ typedef enum {
1657 * pointed to by the Owner field was provided by a defaulting mechanism 1657 * pointed to by the Owner field was provided by a defaulting mechanism
1658 * rather than explicitly provided by the original provider of the 1658 * rather than explicitly provided by the original provider of the
1659 * security descriptor. This may affect the treatment of the SID with 1659 * security descriptor. This may affect the treatment of the SID with
1660 * respect to inheritence of an owner. 1660 * respect to inheritance of an owner.
1661 * 1661 *
1662 * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in 1662 * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
1663 * the Group field was provided by a defaulting mechanism rather than 1663 * the Group field was provided by a defaulting mechanism rather than
1664 * explicitly provided by the original provider of the security 1664 * explicitly provided by the original provider of the security
1665 * descriptor. This may affect the treatment of the SID with respect to 1665 * descriptor. This may affect the treatment of the SID with respect to
1666 * inheritence of a primary group. 1666 * inheritance of a primary group.
1667 * 1667 *
1668 * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security 1668 * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
1669 * descriptor contains a discretionary ACL. If this flag is set and the 1669 * descriptor contains a discretionary ACL. If this flag is set and the
@@ -1674,7 +1674,7 @@ typedef enum {
1674 * pointed to by the Dacl field was provided by a defaulting mechanism 1674 * pointed to by the Dacl field was provided by a defaulting mechanism
1675 * rather than explicitly provided by the original provider of the 1675 * rather than explicitly provided by the original provider of the
1676 * security descriptor. This may affect the treatment of the ACL with 1676 * security descriptor. This may affect the treatment of the ACL with
1677 * respect to inheritence of an ACL. This flag is ignored if the 1677 * respect to inheritance of an ACL. This flag is ignored if the
1678 * DaclPresent flag is not set. 1678 * DaclPresent flag is not set.
1679 * 1679 *
1680 * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security 1680 * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security
@@ -1686,7 +1686,7 @@ typedef enum {
1686 * pointed to by the Sacl field was provided by a defaulting mechanism 1686 * pointed to by the Sacl field was provided by a defaulting mechanism
1687 * rather than explicitly provided by the original provider of the 1687 * rather than explicitly provided by the original provider of the
1688 * security descriptor. This may affect the treatment of the ACL with 1688 * security descriptor. This may affect the treatment of the ACL with
1689 * respect to inheritence of an ACL. This flag is ignored if the 1689 * respect to inheritance of an ACL. This flag is ignored if the
1690 * SaclPresent flag is not set. 1690 * SaclPresent flag is not set.
1691 * 1691 *
1692 * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security 1692 * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
@@ -2283,7 +2283,7 @@ typedef struct {
2283 // the key_length is zero, then the vcn immediately 2283 // the key_length is zero, then the vcn immediately
2284 // follows the INDEX_ENTRY_HEADER. Regardless of 2284 // follows the INDEX_ENTRY_HEADER. Regardless of
2285 // key_length, the address of the 8-byte boundary 2285 // key_length, the address of the 8-byte boundary
2286 // alligned vcn of INDEX_ENTRY{_HEADER} *ie is given by 2286 // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
2287 // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), 2287 // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
2288 // where sizeof(VCN) can be hardcoded as 8 if wanted. */ 2288 // where sizeof(VCN) can be hardcoded as 8 if wanted. */
2289} __attribute__ ((__packed__)) INDEX_ENTRY; 2289} __attribute__ ((__packed__)) INDEX_ENTRY;
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 4dadcdf3d451..c71de292c5ad 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -669,7 +669,7 @@ err_out:
669 * of cases where we think that a volume is dirty when in fact it is clean. 669 * of cases where we think that a volume is dirty when in fact it is clean.
670 * This should only affect volumes that have not been shutdown cleanly but did 670 * This should only affect volumes that have not been shutdown cleanly but did
671 * not have any pending, non-check-pointed i/o, i.e. they were completely idle 671 * not have any pending, non-check-pointed i/o, i.e. they were completely idle
672 * at least for the five seconds preceeding the unclean shutdown. 672 * at least for the five seconds preceding the unclean shutdown.
673 * 673 *
674 * This function assumes that the $LogFile journal has already been consistency 674 * This function assumes that the $LogFile journal has already been consistency
675 * checked by a call to ntfs_check_logfile() and in particular if the $LogFile 675 * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index b5a6f08bd35c..aa2b6ac3f0a4 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -222,7 +222,7 @@ typedef struct {
222/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the 222/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the
223 restart_area_offset + the offset of the 223 restart_area_offset + the offset of the
224 file_size are > 510 then corruption has 224 file_size are > 510 then corruption has
225 occured. This is the very first check when 225 occurred. This is the very first check when
226 starting with the restart_area as if it 226 starting with the restart_area as if it
227 fails it means that some of the above values 227 fails it means that some of the above values
228 will be corrupted by the multi sector 228 will be corrupted by the multi sector
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 326e7475a22a..382857f9c7db 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -73,7 +73,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
73 if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs + 73 if (index > end_index || (i_size & ~PAGE_CACHE_MASK) < ofs +
74 vol->mft_record_size) { 74 vol->mft_record_size) {
75 page = ERR_PTR(-ENOENT); 75 page = ERR_PTR(-ENOENT);
76 ntfs_error(vol->sb, "Attemt to read mft record 0x%lx, " 76 ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
77 "which is beyond the end of the mft. " 77 "which is beyond the end of the mft. "
78 "This is probably a bug in the ntfs " 78 "This is probably a bug in the ntfs "
79 "driver.", ni->mft_no); 79 "driver.", ni->mft_no);
@@ -1442,7 +1442,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
1442 // Note: It will need to be a special mft record and if none of 1442 // Note: It will need to be a special mft record and if none of
1443 // those are available it gets rather complicated... 1443 // those are available it gets rather complicated...
1444 ntfs_error(vol->sb, "Not enough space in this mft record to " 1444 ntfs_error(vol->sb, "Not enough space in this mft record to "
1445 "accomodate extended mft bitmap attribute " 1445 "accommodate extended mft bitmap attribute "
1446 "extent. Cannot handle this yet."); 1446 "extent. Cannot handle this yet.");
1447 ret = -EOPNOTSUPP; 1447 ret = -EOPNOTSUPP;
1448 goto undo_alloc; 1448 goto undo_alloc;
@@ -1879,7 +1879,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
1879 // and we would then need to update all references to this mft 1879 // and we would then need to update all references to this mft
1880 // record appropriately. This is rather complicated... 1880 // record appropriately. This is rather complicated...
1881 ntfs_error(vol->sb, "Not enough space in this mft record to " 1881 ntfs_error(vol->sb, "Not enough space in this mft record to "
1882 "accomodate extended mft data attribute " 1882 "accommodate extended mft data attribute "
1883 "extent. Cannot handle this yet."); 1883 "extent. Cannot handle this yet.");
1884 ret = -EOPNOTSUPP; 1884 ret = -EOPNOTSUPP;
1885 goto undo_alloc; 1885 goto undo_alloc;
@@ -2357,7 +2357,7 @@ ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
2357 } 2357 }
2358#ifdef DEBUG 2358#ifdef DEBUG
2359 read_lock_irqsave(&mftbmp_ni->size_lock, flags); 2359 read_lock_irqsave(&mftbmp_ni->size_lock, flags);
2360 ntfs_debug("Status of mftbmp after initialized extention: " 2360 ntfs_debug("Status of mftbmp after initialized extension: "
2361 "allocated_size 0x%llx, data_size 0x%llx, " 2361 "allocated_size 0x%llx, data_size 0x%llx, "
2362 "initialized_size 0x%llx.", 2362 "initialized_size 0x%llx.",
2363 (long long)mftbmp_ni->allocated_size, 2363 (long long)mftbmp_ni->allocated_size,
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
index 56a9a6d25a2a..eac7d6788a10 100644
--- a/fs/ntfs/runlist.c
+++ b/fs/ntfs/runlist.c
@@ -1243,7 +1243,7 @@ err_out:
1243 * write. 1243 * write.
1244 * 1244 *
1245 * This is used when building the mapping pairs array of a runlist to compress 1245 * This is used when building the mapping pairs array of a runlist to compress
1246 * a given logical cluster number (lcn) or a specific run length to the minumum 1246 * a given logical cluster number (lcn) or a specific run length to the minimum
1247 * size possible. 1247 * size possible.
1248 * 1248 *
1249 * Return the number of bytes written on success. On error, i.e. the 1249 * Return the number of bytes written on success. On error, i.e. the
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 29099a07b9fe..b52706da4645 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -458,7 +458,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
458 * the volume on boot and updates them. 458 * the volume on boot and updates them.
459 * 459 *
460 * When remounting read-only, mark the volume clean if no volume errors 460 * When remounting read-only, mark the volume clean if no volume errors
461 * have occured. 461 * have occurred.
462 */ 462 */
463 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { 463 if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
464 static const char *es = ". Cannot remount read-write."; 464 static const char *es = ". Cannot remount read-write.";
@@ -1269,7 +1269,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
1269 "hibernated on the volume."); 1269 "hibernated on the volume.");
1270 return 0; 1270 return 0;
1271 } 1271 }
1272 /* A real error occured. */ 1272 /* A real error occurred. */
1273 ntfs_error(vol->sb, "Failed to find inode number for " 1273 ntfs_error(vol->sb, "Failed to find inode number for "
1274 "hiberfil.sys."); 1274 "hiberfil.sys.");
1275 return ret; 1275 return ret;
@@ -1370,7 +1370,7 @@ static bool load_and_init_quota(ntfs_volume *vol)
1370 NVolSetQuotaOutOfDate(vol); 1370 NVolSetQuotaOutOfDate(vol);
1371 return true; 1371 return true;
1372 } 1372 }
1373 /* A real error occured. */ 1373 /* A real error occurred. */
1374 ntfs_error(vol->sb, "Failed to find inode number for $Quota."); 1374 ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
1375 return false; 1375 return false;
1376 } 1376 }
@@ -1454,7 +1454,7 @@ not_enabled:
1454 NVolSetUsnJrnlStamped(vol); 1454 NVolSetUsnJrnlStamped(vol);
1455 return true; 1455 return true;
1456 } 1456 }
1457 /* A real error occured. */ 1457 /* A real error occurred. */
1458 ntfs_error(vol->sb, "Failed to find inode number for " 1458 ntfs_error(vol->sb, "Failed to find inode number for "
1459 "$UsnJrnl."); 1459 "$UsnJrnl.");
1460 return false; 1460 return false;
@@ -2292,7 +2292,7 @@ static void ntfs_put_super(struct super_block *sb)
2292 ntfs_commit_inode(vol->mft_ino); 2292 ntfs_commit_inode(vol->mft_ino);
2293 2293
2294 /* 2294 /*
2295 * If a read-write mount and no volume errors have occured, mark the 2295 * If a read-write mount and no volume errors have occurred, mark the
2296 * volume clean. Also, re-commit all affected inodes. 2296 * volume clean. Also, re-commit all affected inodes.
2297 */ 2297 */
2298 if (!(sb->s_flags & MS_RDONLY)) { 2298 if (!(sb->s_flags & MS_RDONLY)) {
@@ -2496,7 +2496,7 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2496 if (vol->nr_clusters & 63) 2496 if (vol->nr_clusters & 63)
2497 nr_free += 64 - (vol->nr_clusters & 63); 2497 nr_free += 64 - (vol->nr_clusters & 63);
2498 up_read(&vol->lcnbmp_lock); 2498 up_read(&vol->lcnbmp_lock);
2499 /* If errors occured we may well have gone below zero, fix this. */ 2499 /* If errors occurred we may well have gone below zero, fix this. */
2500 if (nr_free < 0) 2500 if (nr_free < 0)
2501 nr_free = 0; 2501 nr_free = 0;
2502 ntfs_debug("Exiting."); 2502 ntfs_debug("Exiting.");
@@ -2561,7 +2561,7 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2561 } 2561 }
2562 ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", 2562 ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
2563 index - 1); 2563 index - 1);
2564 /* If errors occured we may well have gone below zero, fix this. */ 2564 /* If errors occurred we may well have gone below zero, fix this. */
2565 if (nr_free < 0) 2565 if (nr_free < 0)
2566 nr_free = 0; 2566 nr_free = 0;
2567 ntfs_debug("Exiting."); 2567 ntfs_debug("Exiting.");
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 07d9fd854350..d8a0313e99e6 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3ccflags-y += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += \ 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \ 6 ocfs2.o \
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 704f6b1742f3..e913ad130fdd 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -24,7 +24,6 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/string.h> 25#include <linux/string.h>
26 26
27#define MLOG_MASK_PREFIX ML_INODE
28#include <cluster/masklog.h> 27#include <cluster/masklog.h>
29 28
30#include "ocfs2.h" 29#include "ocfs2.h"
@@ -497,7 +496,7 @@ static int ocfs2_xattr_set_acl(struct dentry *dentry, const char *name,
497 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) 496 if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
498 return -EOPNOTSUPP; 497 return -EOPNOTSUPP;
499 498
500 if (!is_owner_or_cap(inode)) 499 if (!inode_owner_or_capable(inode))
501 return -EPERM; 500 return -EPERM;
502 501
503 if (value) { 502 if (value) {
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e4984e259cb6..48aa9c7401c7 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -30,7 +30,6 @@
30#include <linux/swap.h> 30#include <linux/swap.h>
31#include <linux/quotaops.h> 31#include <linux/quotaops.h>
32 32
33#define MLOG_MASK_PREFIX ML_DISK_ALLOC
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
35 34
36#include "ocfs2.h" 35#include "ocfs2.h"
@@ -50,6 +49,7 @@
50#include "uptodate.h" 49#include "uptodate.h"
51#include "xattr.h" 50#include "xattr.h"
52#include "refcounttree.h" 51#include "refcounttree.h"
52#include "ocfs2_trace.h"
53 53
54#include "buffer_head_io.h" 54#include "buffer_head_io.h"
55 55
@@ -886,8 +886,7 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
886 struct ocfs2_extent_block *eb = 886 struct ocfs2_extent_block *eb =
887 (struct ocfs2_extent_block *)bh->b_data; 887 (struct ocfs2_extent_block *)bh->b_data;
888 888
889 mlog(0, "Validating extent block %llu\n", 889 trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
890 (unsigned long long)bh->b_blocknr);
891 890
892 BUG_ON(!buffer_uptodate(bh)); 891 BUG_ON(!buffer_uptodate(bh));
893 892
@@ -965,8 +964,6 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
965 struct buffer_head *eb_bh = NULL; 964 struct buffer_head *eb_bh = NULL;
966 u64 last_eb_blk = 0; 965 u64 last_eb_blk = 0;
967 966
968 mlog_entry_void();
969
970 el = et->et_root_el; 967 el = et->et_root_el;
971 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 968 last_eb_blk = ocfs2_et_get_last_eb_blk(et);
972 969
@@ -987,7 +984,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
987bail: 984bail:
988 brelse(eb_bh); 985 brelse(eb_bh);
989 986
990 mlog_exit(retval); 987 trace_ocfs2_num_free_extents(retval);
991 return retval; 988 return retval;
992} 989}
993 990
@@ -1010,8 +1007,6 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
1010 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); 1007 OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
1011 struct ocfs2_extent_block *eb; 1008 struct ocfs2_extent_block *eb;
1012 1009
1013 mlog_entry_void();
1014
1015 count = 0; 1010 count = 0;
1016 while (count < wanted) { 1011 while (count < wanted) {
1017 status = ocfs2_claim_metadata(handle, 1012 status = ocfs2_claim_metadata(handle,
@@ -1074,8 +1069,8 @@ bail:
1074 brelse(bhs[i]); 1069 brelse(bhs[i]);
1075 bhs[i] = NULL; 1070 bhs[i] = NULL;
1076 } 1071 }
1072 mlog_errno(status);
1077 } 1073 }
1078 mlog_exit(status);
1079 return status; 1074 return status;
1080} 1075}
1081 1076
@@ -1173,8 +1168,6 @@ static int ocfs2_add_branch(handle_t *handle,
1173 struct ocfs2_extent_list *el; 1168 struct ocfs2_extent_list *el;
1174 u32 new_cpos, root_end; 1169 u32 new_cpos, root_end;
1175 1170
1176 mlog_entry_void();
1177
1178 BUG_ON(!last_eb_bh || !*last_eb_bh); 1171 BUG_ON(!last_eb_bh || !*last_eb_bh);
1179 1172
1180 if (eb_bh) { 1173 if (eb_bh) {
@@ -1200,8 +1193,11 @@ static int ocfs2_add_branch(handle_t *handle,
1200 * from new_cpos). 1193 * from new_cpos).
1201 */ 1194 */
1202 if (root_end > new_cpos) { 1195 if (root_end > new_cpos) {
1203 mlog(0, "adjust the cluster end from %u to %u\n", 1196 trace_ocfs2_adjust_rightmost_branch(
1204 root_end, new_cpos); 1197 (unsigned long long)
1198 ocfs2_metadata_cache_owner(et->et_ci),
1199 root_end, new_cpos);
1200
1205 status = ocfs2_adjust_rightmost_branch(handle, et); 1201 status = ocfs2_adjust_rightmost_branch(handle, et);
1206 if (status) { 1202 if (status) {
1207 mlog_errno(status); 1203 mlog_errno(status);
@@ -1332,7 +1328,6 @@ bail:
1332 kfree(new_eb_bhs); 1328 kfree(new_eb_bhs);
1333 } 1329 }
1334 1330
1335 mlog_exit(status);
1336 return status; 1331 return status;
1337} 1332}
1338 1333
@@ -1353,8 +1348,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1353 struct ocfs2_extent_list *root_el; 1348 struct ocfs2_extent_list *root_el;
1354 struct ocfs2_extent_list *eb_el; 1349 struct ocfs2_extent_list *eb_el;
1355 1350
1356 mlog_entry_void();
1357
1358 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, 1351 status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
1359 &new_eb_bh); 1352 &new_eb_bh);
1360 if (status < 0) { 1353 if (status < 0) {
@@ -1415,7 +1408,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
1415bail: 1408bail:
1416 brelse(new_eb_bh); 1409 brelse(new_eb_bh);
1417 1410
1418 mlog_exit(status);
1419 return status; 1411 return status;
1420} 1412}
1421 1413
@@ -1446,8 +1438,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1446 struct buffer_head *bh = NULL; 1438 struct buffer_head *bh = NULL;
1447 struct buffer_head *lowest_bh = NULL; 1439 struct buffer_head *lowest_bh = NULL;
1448 1440
1449 mlog_entry_void();
1450
1451 *target_bh = NULL; 1441 *target_bh = NULL;
1452 1442
1453 el = et->et_root_el; 1443 el = et->et_root_el;
@@ -1503,7 +1493,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
1503bail: 1493bail:
1504 brelse(bh); 1494 brelse(bh);
1505 1495
1506 mlog_exit(status);
1507 return status; 1496 return status;
1508} 1497}
1509 1498
@@ -1540,7 +1529,10 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1540 * another tree level */ 1529 * another tree level */
1541 if (shift) { 1530 if (shift) {
1542 BUG_ON(bh); 1531 BUG_ON(bh);
1543 mlog(0, "need to shift tree depth (current = %d)\n", depth); 1532 trace_ocfs2_grow_tree(
1533 (unsigned long long)
1534 ocfs2_metadata_cache_owner(et->et_ci),
1535 depth);
1544 1536
1545 /* ocfs2_shift_tree_depth will return us a buffer with 1537 /* ocfs2_shift_tree_depth will return us a buffer with
1546 * the new extent block (so we can pass that to 1538 * the new extent block (so we can pass that to
@@ -1570,7 +1562,6 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
1570 1562
1571 /* call ocfs2_add_branch to add the final part of the tree with 1563 /* call ocfs2_add_branch to add the final part of the tree with
1572 * the new data. */ 1564 * the new data. */
1573 mlog(0, "add branch. bh = %p\n", bh);
1574 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh, 1565 ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
1575 meta_ac); 1566 meta_ac);
1576 if (ret < 0) { 1567 if (ret < 0) {
@@ -1645,8 +1636,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1645 } 1636 }
1646 insert_index = i; 1637 insert_index = i;
1647 1638
1648 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 1639 trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
1649 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 1640 has_empty, next_free,
1641 le16_to_cpu(el->l_count));
1650 1642
1651 BUG_ON(insert_index < 0); 1643 BUG_ON(insert_index < 0);
1652 BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 1644 BUG_ON(insert_index >= le16_to_cpu(el->l_count));
@@ -2059,7 +2051,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
2059 left_el = path_leaf_el(left_path); 2051 left_el = path_leaf_el(left_path);
2060 right_el = path_leaf_el(right_path); 2052 right_el = path_leaf_el(right_path);
2061 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 2053 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2062 mlog(0, "Adjust records at index %u\n", i); 2054 trace_ocfs2_complete_edge_insert(i);
2063 2055
2064 /* 2056 /*
2065 * One nice property of knowing that all of these 2057 * One nice property of knowing that all of these
@@ -2389,7 +2381,9 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
2389 goto out; 2381 goto out;
2390 } 2382 }
2391 2383
2392 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 2384 trace_ocfs2_rotate_tree_right(
2385 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2386 insert_cpos, cpos);
2393 2387
2394 /* 2388 /*
2395 * What we want to do here is: 2389 * What we want to do here is:
@@ -2418,8 +2412,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
2418 * rotating subtrees. 2412 * rotating subtrees.
2419 */ 2413 */
2420 while (cpos && insert_cpos <= cpos) { 2414 while (cpos && insert_cpos <= cpos) {
2421 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2415 trace_ocfs2_rotate_tree_right(
2422 insert_cpos, cpos); 2416 (unsigned long long)
2417 ocfs2_metadata_cache_owner(et->et_ci),
2418 insert_cpos, cpos);
2423 2419
2424 ret = ocfs2_find_path(et->et_ci, left_path, cpos); 2420 ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2425 if (ret) { 2421 if (ret) {
@@ -2461,10 +2457,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
2461 2457
2462 start = ocfs2_find_subtree_root(et, left_path, right_path); 2458 start = ocfs2_find_subtree_root(et, left_path, right_path);
2463 2459
2464 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2460 trace_ocfs2_rotate_subtree(start,
2465 start, 2461 (unsigned long long)
2466 (unsigned long long) right_path->p_node[start].bh->b_blocknr, 2462 right_path->p_node[start].bh->b_blocknr,
2467 right_path->p_tree_depth); 2463 right_path->p_tree_depth);
2468 2464
2469 ret = ocfs2_extend_rotate_transaction(handle, start, 2465 ret = ocfs2_extend_rotate_transaction(handle, start,
2470 orig_credits, right_path); 2466 orig_credits, right_path);
@@ -2964,8 +2960,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
2964 subtree_root = ocfs2_find_subtree_root(et, left_path, 2960 subtree_root = ocfs2_find_subtree_root(et, left_path,
2965 right_path); 2961 right_path);
2966 2962
2967 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2963 trace_ocfs2_rotate_subtree(subtree_root,
2968 subtree_root,
2969 (unsigned long long) 2964 (unsigned long long)
2970 right_path->p_node[subtree_root].bh->b_blocknr, 2965 right_path->p_node[subtree_root].bh->b_blocknr,
2971 right_path->p_tree_depth); 2966 right_path->p_tree_depth);
@@ -3989,9 +3984,11 @@ static int ocfs2_append_rec_to_path(handle_t *handle,
3989 goto out; 3984 goto out;
3990 } 3985 }
3991 3986
3992 mlog(0, "Append may need a left path update. cpos: %u, " 3987 trace_ocfs2_append_rec_to_path(
3993 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 3988 (unsigned long long)
3994 left_cpos); 3989 ocfs2_metadata_cache_owner(et->et_ci),
3990 le32_to_cpu(insert_rec->e_cpos),
3991 left_cpos);
3995 3992
3996 /* 3993 /*
3997 * No need to worry if the append is already in the 3994 * No need to worry if the append is already in the
@@ -4522,7 +4519,7 @@ set_tail_append:
4522} 4519}
4523 4520
4524/* 4521/*
4525 * Helper function called at the begining of an insert. 4522 * Helper function called at the beginning of an insert.
4526 * 4523 *
4527 * This computes a few things that are commonly used in the process of 4524 * This computes a few things that are commonly used in the process of
4528 * inserting into the btree: 4525 * inserting into the btree:
@@ -4562,7 +4559,7 @@ static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4562 ocfs2_et_get_last_eb_blk(et), 4559 ocfs2_et_get_last_eb_blk(et),
4563 &bh); 4560 &bh);
4564 if (ret) { 4561 if (ret) {
4565 mlog_exit(ret); 4562 mlog_errno(ret);
4566 goto out; 4563 goto out;
4567 } 4564 }
4568 eb = (struct ocfs2_extent_block *) bh->b_data; 4565 eb = (struct ocfs2_extent_block *) bh->b_data;
@@ -4678,9 +4675,9 @@ int ocfs2_insert_extent(handle_t *handle,
4678 struct ocfs2_insert_type insert = {0, }; 4675 struct ocfs2_insert_type insert = {0, };
4679 struct ocfs2_extent_rec rec; 4676 struct ocfs2_extent_rec rec;
4680 4677
4681 mlog(0, "add %u clusters at position %u to owner %llu\n", 4678 trace_ocfs2_insert_extent_start(
4682 new_clusters, cpos, 4679 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4683 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 4680 cpos, new_clusters);
4684 4681
4685 memset(&rec, 0, sizeof(rec)); 4682 memset(&rec, 0, sizeof(rec));
4686 rec.e_cpos = cpu_to_le32(cpos); 4683 rec.e_cpos = cpu_to_le32(cpos);
@@ -4700,11 +4697,9 @@ int ocfs2_insert_extent(handle_t *handle,
4700 goto bail; 4697 goto bail;
4701 } 4698 }
4702 4699
4703 mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 4700 trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
4704 "Insert.contig_index: %d, Insert.free_records: %d, " 4701 insert.ins_contig_index, free_records,
4705 "Insert.tree_depth: %d\n", 4702 insert.ins_tree_depth);
4706 insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4707 free_records, insert.ins_tree_depth);
4708 4703
4709 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4704 if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4710 status = ocfs2_grow_tree(handle, et, 4705 status = ocfs2_grow_tree(handle, et,
@@ -4726,7 +4721,6 @@ int ocfs2_insert_extent(handle_t *handle,
4726bail: 4721bail:
4727 brelse(last_eb_bh); 4722 brelse(last_eb_bh);
4728 4723
4729 mlog_exit(status);
4730 return status; 4724 return status;
4731} 4725}
4732 4726
@@ -4746,7 +4740,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4746 struct ocfs2_alloc_context *meta_ac, 4740 struct ocfs2_alloc_context *meta_ac,
4747 enum ocfs2_alloc_restarted *reason_ret) 4741 enum ocfs2_alloc_restarted *reason_ret)
4748{ 4742{
4749 int status = 0; 4743 int status = 0, err = 0;
4750 int free_extents; 4744 int free_extents;
4751 enum ocfs2_alloc_restarted reason = RESTART_NONE; 4745 enum ocfs2_alloc_restarted reason = RESTART_NONE;
4752 u32 bit_off, num_bits; 4746 u32 bit_off, num_bits;
@@ -4773,14 +4767,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4773 * 2) we are so fragmented, we've needed to add metadata too 4767 * 2) we are so fragmented, we've needed to add metadata too
4774 * many times. */ 4768 * many times. */
4775 if (!free_extents && !meta_ac) { 4769 if (!free_extents && !meta_ac) {
4776 mlog(0, "we haven't reserved any metadata!\n"); 4770 err = -1;
4777 status = -EAGAIN; 4771 status = -EAGAIN;
4778 reason = RESTART_META; 4772 reason = RESTART_META;
4779 goto leave; 4773 goto leave;
4780 } else if ((!free_extents) 4774 } else if ((!free_extents)
4781 && (ocfs2_alloc_context_bits_left(meta_ac) 4775 && (ocfs2_alloc_context_bits_left(meta_ac)
4782 < ocfs2_extend_meta_needed(et->et_root_el))) { 4776 < ocfs2_extend_meta_needed(et->et_root_el))) {
4783 mlog(0, "filesystem is really fragmented...\n"); 4777 err = -2;
4784 status = -EAGAIN; 4778 status = -EAGAIN;
4785 reason = RESTART_META; 4779 reason = RESTART_META;
4786 goto leave; 4780 goto leave;
@@ -4805,9 +4799,9 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4805 } 4799 }
4806 4800
4807 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4801 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4808 mlog(0, "Allocating %u clusters at block %u for owner %llu\n", 4802 trace_ocfs2_add_clusters_in_btree(
4809 num_bits, bit_off, 4803 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
4810 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); 4804 bit_off, num_bits);
4811 status = ocfs2_insert_extent(handle, et, *logical_offset, block, 4805 status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4812 num_bits, flags, meta_ac); 4806 num_bits, flags, meta_ac);
4813 if (status < 0) { 4807 if (status < 0) {
@@ -4821,16 +4815,15 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
4821 *logical_offset += num_bits; 4815 *logical_offset += num_bits;
4822 4816
4823 if (clusters_to_add) { 4817 if (clusters_to_add) {
4824 mlog(0, "need to alloc once more, wanted = %u\n", 4818 err = clusters_to_add;
4825 clusters_to_add);
4826 status = -EAGAIN; 4819 status = -EAGAIN;
4827 reason = RESTART_TRANS; 4820 reason = RESTART_TRANS;
4828 } 4821 }
4829 4822
4830leave: 4823leave:
4831 mlog_exit(status);
4832 if (reason_ret) 4824 if (reason_ret)
4833 *reason_ret = reason; 4825 *reason_ret = reason;
4826 trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
4834 return status; 4827 return status;
4835} 4828}
4836 4829
@@ -5039,7 +5032,7 @@ int ocfs2_split_extent(handle_t *handle,
5039 ocfs2_et_get_last_eb_blk(et), 5032 ocfs2_et_get_last_eb_blk(et),
5040 &last_eb_bh); 5033 &last_eb_bh);
5041 if (ret) { 5034 if (ret) {
5042 mlog_exit(ret); 5035 mlog_errno(ret);
5043 goto out; 5036 goto out;
5044 } 5037 }
5045 5038
@@ -5056,9 +5049,9 @@ int ocfs2_split_extent(handle_t *handle,
5056 5049
5057 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 5050 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5058 5051
5059 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", 5052 trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
5060 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, 5053 ctxt.c_has_empty_extent,
5061 ctxt.c_split_covers_rec); 5054 ctxt.c_split_covers_rec);
5062 5055
5063 if (ctxt.c_contig_type == CONTIG_NONE) { 5056 if (ctxt.c_contig_type == CONTIG_NONE) {
5064 if (ctxt.c_split_covers_rec) 5057 if (ctxt.c_split_covers_rec)
@@ -5192,8 +5185,9 @@ int ocfs2_mark_extent_written(struct inode *inode,
5192{ 5185{
5193 int ret; 5186 int ret;
5194 5187
5195 mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n", 5188 trace_ocfs2_mark_extent_written(
5196 inode->i_ino, cpos, len, phys); 5189 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5190 cpos, len, phys);
5197 5191
5198 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 5192 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5199 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 5193 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
@@ -5512,11 +5506,10 @@ int ocfs2_remove_extent(handle_t *handle,
5512 5506
5513 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5507 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5514 5508
5515 mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d " 5509 trace_ocfs2_remove_extent(
5516 "(cpos %u, len %u)\n", 5510 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5517 (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), 5511 cpos, len, index, le32_to_cpu(rec->e_cpos),
5518 cpos, len, index, 5512 ocfs2_rec_clusters(el, rec));
5519 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5520 5513
5521 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5514 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5522 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc, 5515 ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
@@ -5795,9 +5788,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5795 struct ocfs2_dinode *di; 5788 struct ocfs2_dinode *di;
5796 struct ocfs2_truncate_log *tl; 5789 struct ocfs2_truncate_log *tl;
5797 5790
5798 mlog_entry("start_blk = %llu, num_clusters = %u\n",
5799 (unsigned long long)start_blk, num_clusters);
5800
5801 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5791 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5802 5792
5803 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5793 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5834,10 +5824,9 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5834 goto bail; 5824 goto bail;
5835 } 5825 }
5836 5826
5837 mlog(0, "Log truncate of %u clusters starting at cluster %u to " 5827 trace_ocfs2_truncate_log_append(
5838 "%llu (index = %d)\n", num_clusters, start_cluster, 5828 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
5839 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 5829 start_cluster, num_clusters);
5840
5841 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 5830 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5842 /* 5831 /*
5843 * Move index back to the record we are coalescing with. 5832 * Move index back to the record we are coalescing with.
@@ -5846,9 +5835,10 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5846 index--; 5835 index--;
5847 5836
5848 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 5837 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5849 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 5838 trace_ocfs2_truncate_log_append(
5850 index, le32_to_cpu(tl->tl_recs[index].t_start), 5839 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5851 num_clusters); 5840 index, le32_to_cpu(tl->tl_recs[index].t_start),
5841 num_clusters);
5852 } else { 5842 } else {
5853 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 5843 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5854 tl->tl_used = cpu_to_le16(index + 1); 5844 tl->tl_used = cpu_to_le16(index + 1);
@@ -5859,7 +5849,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5859 5849
5860 osb->truncated_clusters += num_clusters; 5850 osb->truncated_clusters += num_clusters;
5861bail: 5851bail:
5862 mlog_exit(status);
5863 return status; 5852 return status;
5864} 5853}
5865 5854
@@ -5878,8 +5867,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5878 struct inode *tl_inode = osb->osb_tl_inode; 5867 struct inode *tl_inode = osb->osb_tl_inode;
5879 struct buffer_head *tl_bh = osb->osb_tl_bh; 5868 struct buffer_head *tl_bh = osb->osb_tl_bh;
5880 5869
5881 mlog_entry_void();
5882
5883 di = (struct ocfs2_dinode *) tl_bh->b_data; 5870 di = (struct ocfs2_dinode *) tl_bh->b_data;
5884 tl = &di->id2.i_dealloc; 5871 tl = &di->id2.i_dealloc;
5885 i = le16_to_cpu(tl->tl_used) - 1; 5872 i = le16_to_cpu(tl->tl_used) - 1;
@@ -5915,8 +5902,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5915 /* if start_blk is not set, we ignore the record as 5902 /* if start_blk is not set, we ignore the record as
5916 * invalid. */ 5903 * invalid. */
5917 if (start_blk) { 5904 if (start_blk) {
5918 mlog(0, "free record %d, start = %u, clusters = %u\n", 5905 trace_ocfs2_replay_truncate_records(
5919 i, le32_to_cpu(rec.t_start), num_clusters); 5906 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5907 i, le32_to_cpu(rec.t_start), num_clusters);
5920 5908
5921 status = ocfs2_free_clusters(handle, data_alloc_inode, 5909 status = ocfs2_free_clusters(handle, data_alloc_inode,
5922 data_alloc_bh, start_blk, 5910 data_alloc_bh, start_blk,
@@ -5932,7 +5920,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5932 osb->truncated_clusters = 0; 5920 osb->truncated_clusters = 0;
5933 5921
5934bail: 5922bail:
5935 mlog_exit(status);
5936 return status; 5923 return status;
5937} 5924}
5938 5925
@@ -5949,8 +5936,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5949 struct ocfs2_dinode *di; 5936 struct ocfs2_dinode *di;
5950 struct ocfs2_truncate_log *tl; 5937 struct ocfs2_truncate_log *tl;
5951 5938
5952 mlog_entry_void();
5953
5954 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5939 BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5955 5940
5956 di = (struct ocfs2_dinode *) tl_bh->b_data; 5941 di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -5962,8 +5947,9 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5962 5947
5963 tl = &di->id2.i_dealloc; 5948 tl = &di->id2.i_dealloc;
5964 num_to_flush = le16_to_cpu(tl->tl_used); 5949 num_to_flush = le16_to_cpu(tl->tl_used);
5965 mlog(0, "Flush %u records from truncate log #%llu\n", 5950 trace_ocfs2_flush_truncate_log(
5966 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5951 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5952 num_to_flush);
5967 if (!num_to_flush) { 5953 if (!num_to_flush) {
5968 status = 0; 5954 status = 0;
5969 goto out; 5955 goto out;
@@ -6009,7 +5995,6 @@ out_mutex:
6009 iput(data_alloc_inode); 5995 iput(data_alloc_inode);
6010 5996
6011out: 5997out:
6012 mlog_exit(status);
6013 return status; 5998 return status;
6014} 5999}
6015 6000
@@ -6032,15 +6017,11 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
6032 container_of(work, struct ocfs2_super, 6017 container_of(work, struct ocfs2_super,
6033 osb_truncate_log_wq.work); 6018 osb_truncate_log_wq.work);
6034 6019
6035 mlog_entry_void();
6036
6037 status = ocfs2_flush_truncate_log(osb); 6020 status = ocfs2_flush_truncate_log(osb);
6038 if (status < 0) 6021 if (status < 0)
6039 mlog_errno(status); 6022 mlog_errno(status);
6040 else 6023 else
6041 ocfs2_init_steal_slots(osb); 6024 ocfs2_init_steal_slots(osb);
6042
6043 mlog_exit(status);
6044} 6025}
6045 6026
6046#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 6027#define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
@@ -6086,7 +6067,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
6086 *tl_inode = inode; 6067 *tl_inode = inode;
6087 *tl_bh = bh; 6068 *tl_bh = bh;
6088bail: 6069bail:
6089 mlog_exit(status);
6090 return status; 6070 return status;
6091} 6071}
6092 6072
@@ -6106,7 +6086,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6106 6086
6107 *tl_copy = NULL; 6087 *tl_copy = NULL;
6108 6088
6109 mlog(0, "recover truncate log from slot %d\n", slot_num); 6089 trace_ocfs2_begin_truncate_log_recovery(slot_num);
6110 6090
6111 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 6091 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6112 if (status < 0) { 6092 if (status < 0) {
@@ -6123,8 +6103,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
6123 6103
6124 tl = &di->id2.i_dealloc; 6104 tl = &di->id2.i_dealloc;
6125 if (le16_to_cpu(tl->tl_used)) { 6105 if (le16_to_cpu(tl->tl_used)) {
6126 mlog(0, "We'll have %u logs to recover\n", 6106 trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
6127 le16_to_cpu(tl->tl_used));
6128 6107
6129 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 6108 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6130 if (!(*tl_copy)) { 6109 if (!(*tl_copy)) {
@@ -6157,9 +6136,9 @@ bail:
6157 if (status < 0 && (*tl_copy)) { 6136 if (status < 0 && (*tl_copy)) {
6158 kfree(*tl_copy); 6137 kfree(*tl_copy);
6159 *tl_copy = NULL; 6138 *tl_copy = NULL;
6139 mlog_errno(status);
6160 } 6140 }
6161 6141
6162 mlog_exit(status);
6163 return status; 6142 return status;
6164} 6143}
6165 6144
@@ -6174,8 +6153,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6174 struct inode *tl_inode = osb->osb_tl_inode; 6153 struct inode *tl_inode = osb->osb_tl_inode;
6175 struct ocfs2_truncate_log *tl; 6154 struct ocfs2_truncate_log *tl;
6176 6155
6177 mlog_entry_void();
6178
6179 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 6156 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6180 mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 6157 mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6181 return -EINVAL; 6158 return -EINVAL;
@@ -6183,8 +6160,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6183 6160
6184 tl = &tl_copy->id2.i_dealloc; 6161 tl = &tl_copy->id2.i_dealloc;
6185 num_recs = le16_to_cpu(tl->tl_used); 6162 num_recs = le16_to_cpu(tl->tl_used);
6186 mlog(0, "cleanup %u records from %llu\n", num_recs, 6163 trace_ocfs2_complete_truncate_log_recovery(
6187 (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 6164 (unsigned long long)le64_to_cpu(tl_copy->i_blkno),
6165 num_recs);
6188 6166
6189 mutex_lock(&tl_inode->i_mutex); 6167 mutex_lock(&tl_inode->i_mutex);
6190 for(i = 0; i < num_recs; i++) { 6168 for(i = 0; i < num_recs; i++) {
@@ -6219,7 +6197,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6219bail_up: 6197bail_up:
6220 mutex_unlock(&tl_inode->i_mutex); 6198 mutex_unlock(&tl_inode->i_mutex);
6221 6199
6222 mlog_exit(status);
6223 return status; 6200 return status;
6224} 6201}
6225 6202
@@ -6228,8 +6205,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6228 int status; 6205 int status;
6229 struct inode *tl_inode = osb->osb_tl_inode; 6206 struct inode *tl_inode = osb->osb_tl_inode;
6230 6207
6231 mlog_entry_void();
6232
6233 if (tl_inode) { 6208 if (tl_inode) {
6234 cancel_delayed_work(&osb->osb_truncate_log_wq); 6209 cancel_delayed_work(&osb->osb_truncate_log_wq);
6235 flush_workqueue(ocfs2_wq); 6210 flush_workqueue(ocfs2_wq);
@@ -6241,8 +6216,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6241 brelse(osb->osb_tl_bh); 6216 brelse(osb->osb_tl_bh);
6242 iput(osb->osb_tl_inode); 6217 iput(osb->osb_tl_inode);
6243 } 6218 }
6244
6245 mlog_exit_void();
6246} 6219}
6247 6220
6248int ocfs2_truncate_log_init(struct ocfs2_super *osb) 6221int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -6251,8 +6224,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6251 struct inode *tl_inode = NULL; 6224 struct inode *tl_inode = NULL;
6252 struct buffer_head *tl_bh = NULL; 6225 struct buffer_head *tl_bh = NULL;
6253 6226
6254 mlog_entry_void();
6255
6256 status = ocfs2_get_truncate_log_info(osb, 6227 status = ocfs2_get_truncate_log_info(osb,
6257 osb->slot_num, 6228 osb->slot_num,
6258 &tl_inode, 6229 &tl_inode,
@@ -6268,7 +6239,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6268 osb->osb_tl_bh = tl_bh; 6239 osb->osb_tl_bh = tl_bh;
6269 osb->osb_tl_inode = tl_inode; 6240 osb->osb_tl_inode = tl_inode;
6270 6241
6271 mlog_exit(status);
6272 return status; 6242 return status;
6273} 6243}
6274 6244
@@ -6350,8 +6320,8 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6350 else 6320 else
6351 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 6321 bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6352 head->free_bit); 6322 head->free_bit);
6353 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 6323 trace_ocfs2_free_cached_blocks(
6354 head->free_bit, (unsigned long long)head->free_blk); 6324 (unsigned long long)head->free_blk, head->free_bit);
6355 6325
6356 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 6326 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6357 head->free_bit, bg_blkno, 1); 6327 head->free_bit, bg_blkno, 1);
@@ -6404,8 +6374,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6404 return ret; 6374 return ret;
6405 } 6375 }
6406 6376
6407 mlog(0, "Insert clusters: (bit %u, blk %llu)\n", 6377 trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
6408 bit, (unsigned long long)blkno);
6409 6378
6410 item->free_blk = blkno; 6379 item->free_blk = blkno;
6411 item->free_bit = bit; 6380 item->free_bit = bit;
@@ -6480,8 +6449,8 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
6480 fl = ctxt->c_first_suballocator; 6449 fl = ctxt->c_first_suballocator;
6481 6450
6482 if (fl->f_first) { 6451 if (fl->f_first) {
6483 mlog(0, "Free items: (type %u, slot %d)\n", 6452 trace_ocfs2_run_deallocs(fl->f_inode_type,
6484 fl->f_inode_type, fl->f_slot); 6453 fl->f_slot);
6485 ret2 = ocfs2_free_cached_blocks(osb, 6454 ret2 = ocfs2_free_cached_blocks(osb,
6486 fl->f_inode_type, 6455 fl->f_inode_type,
6487 fl->f_slot, 6456 fl->f_slot,
@@ -6558,8 +6527,9 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6558 goto out; 6527 goto out;
6559 } 6528 }
6560 6529
6561 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6530 trace_ocfs2_cache_block_dealloc(type, slot,
6562 type, slot, bit, (unsigned long long)blkno); 6531 (unsigned long long)suballoc,
6532 (unsigned long long)blkno, bit);
6563 6533
6564 item->free_bg = suballoc; 6534 item->free_bg = suballoc;
6565 item->free_blk = blkno; 6535 item->free_blk = blkno;
@@ -7005,8 +6975,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
7005 struct ocfs2_extent_tree et; 6975 struct ocfs2_extent_tree et;
7006 struct ocfs2_cached_dealloc_ctxt dealloc; 6976 struct ocfs2_cached_dealloc_ctxt dealloc;
7007 6977
7008 mlog_entry_void();
7009
7010 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 6978 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7011 ocfs2_init_dealloc_ctxt(&dealloc); 6979 ocfs2_init_dealloc_ctxt(&dealloc);
7012 6980
@@ -7041,8 +7009,11 @@ start:
7041 goto bail; 7009 goto bail;
7042 } 7010 }
7043 7011
7044 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 7012 trace_ocfs2_commit_truncate(
7045 OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 7013 (unsigned long long)OCFS2_I(inode)->ip_blkno,
7014 new_highest_cpos,
7015 OCFS2_I(inode)->ip_clusters,
7016 path->p_tree_depth);
7046 7017
7047 /* 7018 /*
7048 * By now, el will point to the extent list on the bottom most 7019 * By now, el will point to the extent list on the bottom most
@@ -7136,7 +7107,6 @@ bail:
7136 7107
7137 ocfs2_free_path(path); 7108 ocfs2_free_path(path);
7138 7109
7139 mlog_exit(status);
7140 return status; 7110 return status;
7141} 7111}
7142 7112
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..ac97bca282d2 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
29#include <linux/mpage.h> 29#include <linux/mpage.h>
30#include <linux/quotaops.h> 30#include <linux/quotaops.h>
31 31
32#define MLOG_MASK_PREFIX ML_FILE_IO
33#include <cluster/masklog.h> 32#include <cluster/masklog.h>
34 33
35#include "ocfs2.h" 34#include "ocfs2.h"
@@ -45,6 +44,7 @@
45#include "super.h" 44#include "super.h"
46#include "symlink.h" 45#include "symlink.h"
47#include "refcounttree.h" 46#include "refcounttree.h"
47#include "ocfs2_trace.h"
48 48
49#include "buffer_head_io.h" 49#include "buffer_head_io.h"
50 50
@@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
59 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 59 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
60 void *kaddr; 60 void *kaddr;
61 61
62 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 62 trace_ocfs2_symlink_get_block(
63 (unsigned long long)iblock, bh_result, create); 63 (unsigned long long)OCFS2_I(inode)->ip_blkno,
64 (unsigned long long)iblock, bh_result, create);
64 65
65 BUG_ON(ocfs2_inode_is_fast_symlink(inode)); 66 BUG_ON(ocfs2_inode_is_fast_symlink(inode));
66 67
@@ -123,7 +124,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
123bail: 124bail:
124 brelse(bh); 125 brelse(bh);
125 126
126 mlog_exit(err);
127 return err; 127 return err;
128} 128}
129 129
@@ -136,8 +136,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
136 u64 p_blkno, count, past_eof; 136 u64 p_blkno, count, past_eof;
137 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 137 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
138 138
139 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 139 trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
140 (unsigned long long)iblock, bh_result, create); 140 (unsigned long long)iblock, bh_result, create);
141 141
142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 142 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", 143 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -199,8 +199,9 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
199 } 199 }
200 200
201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 201 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
202 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 202
203 (unsigned long long)past_eof); 203 trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
204 (unsigned long long)past_eof);
204 if (create && (iblock >= past_eof)) 205 if (create && (iblock >= past_eof))
205 set_buffer_new(bh_result); 206 set_buffer_new(bh_result);
206 207
@@ -208,7 +209,6 @@ bail:
208 if (err < 0) 209 if (err < 0)
209 err = -EIO; 210 err = -EIO;
210 211
211 mlog_exit(err);
212 return err; 212 return err;
213} 213}
214 214
@@ -278,7 +278,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
278 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 278 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
279 int ret, unlock = 1; 279 int ret, unlock = 1;
280 280
281 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 281 trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
282 (page ? page->index : 0));
282 283
283 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); 284 ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
284 if (ret != 0) { 285 if (ret != 0) {
@@ -323,7 +324,6 @@ out_inode_unlock:
323out: 324out:
324 if (unlock) 325 if (unlock)
325 unlock_page(page); 326 unlock_page(page);
326 mlog_exit(ret);
327 return ret; 327 return ret;
328} 328}
329 329
@@ -396,15 +396,11 @@ out_unlock:
396 */ 396 */
397static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) 397static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
398{ 398{
399 int ret; 399 trace_ocfs2_writepage(
400 400 (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
401 mlog_entry("(0x%p)\n", page); 401 page->index);
402
403 ret = block_write_full_page(page, ocfs2_get_block, wbc);
404 402
405 mlog_exit(ret); 403 return block_write_full_page(page, ocfs2_get_block, wbc);
406
407 return ret;
408} 404}
409 405
410/* Taken from ext3. We don't necessarily need the full blown 406/* Taken from ext3. We don't necessarily need the full blown
@@ -450,7 +446,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
450 int err = 0; 446 int err = 0;
451 struct inode *inode = mapping->host; 447 struct inode *inode = mapping->host;
452 448
453 mlog_entry("(block = %llu)\n", (unsigned long long)block); 449 trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
450 (unsigned long long)block);
454 451
455 /* We don't need to lock journal system files, since they aren't 452 /* We don't need to lock journal system files, since they aren't
456 * accessed concurrently from multiple nodes. 453 * accessed concurrently from multiple nodes.
@@ -484,8 +481,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
484bail: 481bail:
485 status = err ? 0 : p_blkno; 482 status = err ? 0 : p_blkno;
486 483
487 mlog_exit((int)status);
488
489 return status; 484 return status;
490} 485}
491 486
@@ -616,9 +611,6 @@ static ssize_t ocfs2_direct_IO(int rw,
616{ 611{
617 struct file *file = iocb->ki_filp; 612 struct file *file = iocb->ki_filp;
618 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 613 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
619 int ret;
620
621 mlog_entry_void();
622 614
623 /* 615 /*
624 * Fallback to buffered I/O if we see an inode without 616 * Fallback to buffered I/O if we see an inode without
@@ -631,13 +623,10 @@ static ssize_t ocfs2_direct_IO(int rw,
631 if (i_size_read(inode) <= offset) 623 if (i_size_read(inode) <= offset)
632 return 0; 624 return 0;
633 625
634 ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, 626 return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
635 iov, offset, nr_segs, 627 iov, offset, nr_segs,
636 ocfs2_direct_IO_get_blocks, 628 ocfs2_direct_IO_get_blocks,
637 ocfs2_dio_end_io, NULL, 0); 629 ocfs2_dio_end_io, NULL, 0);
638
639 mlog_exit(ret);
640 return ret;
641} 630}
642 631
643static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 632static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
@@ -1026,6 +1015,12 @@ static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
1026 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, 1015 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
1027 &cluster_start, &cluster_end); 1016 &cluster_start, &cluster_end);
1028 1017
1018 /* treat the write as new if the a hole/lseek spanned across
1019 * the page boundary.
1020 */
1021 new = new | ((i_size_read(inode) <= page_offset(page)) &&
1022 (page_offset(page) <= user_pos));
1023
1029 if (page == wc->w_target_page) { 1024 if (page == wc->w_target_page) {
1030 map_from = user_pos & (PAGE_CACHE_SIZE - 1); 1025 map_from = user_pos & (PAGE_CACHE_SIZE - 1);
1031 map_to = map_from + user_len; 1026 map_to = map_from + user_len;
@@ -1534,9 +1529,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
1534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1529 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1535 struct ocfs2_dinode *di = NULL; 1530 struct ocfs2_dinode *di = NULL;
1536 1531
1537 mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", 1532 trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
1538 (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, 1533 len, (unsigned long long)pos,
1539 oi->ip_dyn_features); 1534 oi->ip_dyn_features);
1540 1535
1541 /* 1536 /*
1542 * Handle inodes which already have inline data 1st. 1537 * Handle inodes which already have inline data 1st.
@@ -1739,6 +1734,13 @@ try_again:
1739 1734
1740 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; 1735 di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
1741 1736
1737 trace_ocfs2_write_begin_nolock(
1738 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1739 (long long)i_size_read(inode),
1740 le32_to_cpu(di->i_clusters),
1741 pos, len, flags, mmap_page,
1742 clusters_to_alloc, extents_to_split);
1743
1742 /* 1744 /*
1743 * We set w_target_from, w_target_to here so that 1745 * We set w_target_from, w_target_to here so that
1744 * ocfs2_write_end() knows which range in the target page to 1746 * ocfs2_write_end() knows which range in the target page to
@@ -1751,12 +1753,6 @@ try_again:
1751 * ocfs2_lock_allocators(). It greatly over-estimates 1753 * ocfs2_lock_allocators(). It greatly over-estimates
1752 * the work to be done. 1754 * the work to be done.
1753 */ 1755 */
1754 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
1755 " clusters_to_add = %u, extents_to_split = %u\n",
1756 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1757 (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
1758 clusters_to_alloc, extents_to_split);
1759
1760 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), 1756 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
1761 wc->w_di_bh); 1757 wc->w_di_bh);
1762 ret = ocfs2_lock_allocators(inode, &et, 1758 ret = ocfs2_lock_allocators(inode, &et,
@@ -1938,8 +1934,8 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
1938 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied); 1934 memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
1939 kunmap_atomic(kaddr, KM_USER0); 1935 kunmap_atomic(kaddr, KM_USER0);
1940 1936
1941 mlog(0, "Data written to inode at offset %llu. " 1937 trace_ocfs2_write_end_inline(
1942 "id_count = %u, copied = %u, i_dyn_features = 0x%x\n", 1938 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1943 (unsigned long long)pos, *copied, 1939 (unsigned long long)pos, *copied,
1944 le16_to_cpu(di->id2.i_data.id_count), 1940 le16_to_cpu(di->id2.i_data.id_count),
1945 le16_to_cpu(di->i_dyn_features)); 1941 le16_to_cpu(di->i_dyn_features));
@@ -2043,7 +2039,6 @@ const struct address_space_operations ocfs2_aops = {
2043 .write_begin = ocfs2_write_begin, 2039 .write_begin = ocfs2_write_begin,
2044 .write_end = ocfs2_write_end, 2040 .write_end = ocfs2_write_end,
2045 .bmap = ocfs2_bmap, 2041 .bmap = ocfs2_bmap,
2046 .sync_page = block_sync_page,
2047 .direct_IO = ocfs2_direct_IO, 2042 .direct_IO = ocfs2_direct_IO,
2048 .invalidatepage = ocfs2_invalidatepage, 2043 .invalidatepage = ocfs2_invalidatepage,
2049 .releasepage = ocfs2_releasepage, 2044 .releasepage = ocfs2_releasepage,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index eceb456037c1..75cf3ad987a6 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -71,7 +71,7 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
71 71
72/* 72/*
73 * Using a named enum representing lock types in terms of #N bit stored in 73 * Using a named enum representing lock types in terms of #N bit stored in
74 * iocb->private, which is going to be used for communication bewteen 74 * iocb->private, which is going to be used for communication between
75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read(). 75 * ocfs2_dio_end_io() and ocfs2_file_aio_write/read().
76 */ 76 */
77enum ocfs2_iocb_lock_bits { 77enum ocfs2_iocb_lock_bits {
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f9d5d3ffc75a..5d18ad10c27f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -35,8 +35,8 @@
35#include "inode.h" 35#include "inode.h"
36#include "journal.h" 36#include "journal.h"
37#include "uptodate.h" 37#include "uptodate.h"
38
39#include "buffer_head_io.h" 38#include "buffer_head_io.h"
39#include "ocfs2_trace.h"
40 40
41/* 41/*
42 * Bits on bh->b_state used by ocfs2. 42 * Bits on bh->b_state used by ocfs2.
@@ -55,8 +55,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
55{ 55{
56 int ret = 0; 56 int ret = 0;
57 57
58 mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n", 58 trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
59 (unsigned long long)bh->b_blocknr, ci);
60 59
61 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO); 60 BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
62 BUG_ON(buffer_jbd(bh)); 61 BUG_ON(buffer_jbd(bh));
@@ -66,6 +65,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
66 * can get modified during recovery even if read-only. */ 65 * can get modified during recovery even if read-only. */
67 if (ocfs2_is_hard_readonly(osb)) { 66 if (ocfs2_is_hard_readonly(osb)) {
68 ret = -EROFS; 67 ret = -EROFS;
68 mlog_errno(ret);
69 goto out; 69 goto out;
70 } 70 }
71 71
@@ -91,11 +91,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
91 * uptodate. */ 91 * uptodate. */
92 ret = -EIO; 92 ret = -EIO;
93 put_bh(bh); 93 put_bh(bh);
94 mlog_errno(ret);
94 } 95 }
95 96
96 ocfs2_metadata_cache_io_unlock(ci); 97 ocfs2_metadata_cache_io_unlock(ci);
97out: 98out:
98 mlog_exit(ret);
99 return ret; 99 return ret;
100} 100}
101 101
@@ -106,10 +106,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
106 unsigned int i; 106 unsigned int i;
107 struct buffer_head *bh; 107 struct buffer_head *bh;
108 108
109 if (!nr) { 109 trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
110 mlog(ML_BH_IO, "No buffers will be read!\n"); 110
111 if (!nr)
111 goto bail; 112 goto bail;
112 }
113 113
114 for (i = 0 ; i < nr ; i++) { 114 for (i = 0 ; i < nr ; i++) {
115 if (bhs[i] == NULL) { 115 if (bhs[i] == NULL) {
@@ -123,10 +123,8 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
123 bh = bhs[i]; 123 bh = bhs[i];
124 124
125 if (buffer_jbd(bh)) { 125 if (buffer_jbd(bh)) {
126 mlog(ML_BH_IO, 126 trace_ocfs2_read_blocks_sync_jbd(
127 "trying to sync read a jbd " 127 (unsigned long long)bh->b_blocknr);
128 "managed bh (blocknr = %llu), skipping\n",
129 (unsigned long long)bh->b_blocknr);
130 continue; 128 continue;
131 } 129 }
132 130
@@ -186,8 +184,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
186 struct buffer_head *bh; 184 struct buffer_head *bh;
187 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 185 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
188 186
189 mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n", 187 trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
190 ci, (unsigned long long)block, nr, flags);
191 188
192 BUG_ON(!ci); 189 BUG_ON(!ci);
193 BUG_ON((flags & OCFS2_BH_READAHEAD) && 190 BUG_ON((flags & OCFS2_BH_READAHEAD) &&
@@ -207,7 +204,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
207 } 204 }
208 205
209 if (nr == 0) { 206 if (nr == 0) {
210 mlog(ML_BH_IO, "No buffers will be read!\n");
211 status = 0; 207 status = 0;
212 goto bail; 208 goto bail;
213 } 209 }
@@ -251,8 +247,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
251 */ 247 */
252 248
253 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) { 249 if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
254 mlog(ML_UPTODATE, 250 trace_ocfs2_read_blocks_from_disk(
255 "bh (%llu), owner %llu not uptodate\n",
256 (unsigned long long)bh->b_blocknr, 251 (unsigned long long)bh->b_blocknr,
257 (unsigned long long)ocfs2_metadata_cache_owner(ci)); 252 (unsigned long long)ocfs2_metadata_cache_owner(ci));
258 /* We're using ignore_cache here to say 253 /* We're using ignore_cache here to say
@@ -260,11 +255,10 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
260 ignore_cache = 1; 255 ignore_cache = 1;
261 } 256 }
262 257
258 trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
259 ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
260
263 if (buffer_jbd(bh)) { 261 if (buffer_jbd(bh)) {
264 if (ignore_cache)
265 mlog(ML_BH_IO, "trying to sync read a jbd "
266 "managed bh (blocknr = %llu)\n",
267 (unsigned long long)bh->b_blocknr);
268 continue; 262 continue;
269 } 263 }
270 264
@@ -272,9 +266,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
272 if (buffer_dirty(bh)) { 266 if (buffer_dirty(bh)) {
273 /* This should probably be a BUG, or 267 /* This should probably be a BUG, or
274 * at least return an error. */ 268 * at least return an error. */
275 mlog(ML_BH_IO, "asking me to sync read a dirty "
276 "buffer! (blocknr = %llu)\n",
277 (unsigned long long)bh->b_blocknr);
278 continue; 269 continue;
279 } 270 }
280 271
@@ -367,14 +358,11 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
367 } 358 }
368 ocfs2_metadata_cache_io_unlock(ci); 359 ocfs2_metadata_cache_io_unlock(ci);
369 360
370 mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", 361 trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
371 (unsigned long long)block, nr, 362 flags, ignore_cache);
372 ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
373 flags);
374 363
375bail: 364bail:
376 365
377 mlog_exit(status);
378 return status; 366 return status;
379} 367}
380 368
@@ -408,13 +396,12 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
408 int ret = 0; 396 int ret = 0;
409 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 397 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
410 398
411 mlog_entry_void();
412
413 BUG_ON(buffer_jbd(bh)); 399 BUG_ON(buffer_jbd(bh));
414 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr); 400 ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
415 401
416 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) { 402 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
417 ret = -EROFS; 403 ret = -EROFS;
404 mlog_errno(ret);
418 goto out; 405 goto out;
419 } 406 }
420 407
@@ -434,9 +421,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
434 if (!buffer_uptodate(bh)) { 421 if (!buffer_uptodate(bh)) {
435 ret = -EIO; 422 ret = -EIO;
436 put_bh(bh); 423 put_bh(bh);
424 mlog_errno(ret);
437 } 425 }
438 426
439out: 427out:
440 mlog_exit(ret);
441 return ret; 428 return ret;
442} 429}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..643720209a98 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -367,11 +367,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
367static void o2hb_wait_on_io(struct o2hb_region *reg, 367static void o2hb_wait_on_io(struct o2hb_region *reg,
368 struct o2hb_bio_wait_ctxt *wc) 368 struct o2hb_bio_wait_ctxt *wc)
369{ 369{
370 struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;
371
372 blk_run_address_space(mapping);
373 o2hb_bio_wait_dec(wc, 1); 370 o2hb_bio_wait_dec(wc, 1);
374
375 wait_for_completion(&wc->wc_io_complete); 371 wait_for_completion(&wc->wc_io_complete);
376} 372}
377 373
@@ -1658,8 +1654,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
1658 struct o2hb_disk_slot *slot; 1654 struct o2hb_disk_slot *slot;
1659 struct o2hb_disk_heartbeat_block *hb_block; 1655 struct o2hb_disk_heartbeat_block *hb_block;
1660 1656
1661 mlog_entry_void();
1662
1663 ret = o2hb_read_slots(reg, reg->hr_blocks); 1657 ret = o2hb_read_slots(reg, reg->hr_blocks);
1664 if (ret) { 1658 if (ret) {
1665 mlog_errno(ret); 1659 mlog_errno(ret);
@@ -1681,7 +1675,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
1681 } 1675 }
1682 1676
1683out: 1677out:
1684 mlog_exit(ret);
1685 return ret; 1678 return ret;
1686} 1679}
1687 1680
@@ -2282,7 +2275,7 @@ void o2hb_free_hb_set(struct config_group *group)
2282 kfree(hs); 2275 kfree(hs);
2283} 2276}
2284 2277
2285/* hb callback registration and issueing */ 2278/* hb callback registration and issuing */
2286 2279
2287static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type) 2280static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type)
2288{ 2281{
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 6c61771469af..07ac24fd9252 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
30 30
31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK); 31struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
32EXPORT_SYMBOL_GPL(mlog_and_bits); 32EXPORT_SYMBOL_GPL(mlog_and_bits);
33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK); 33struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
34EXPORT_SYMBOL_GPL(mlog_not_bits); 34EXPORT_SYMBOL_GPL(mlog_not_bits);
35 35
36static ssize_t mlog_mask_show(u64 mask, char *buf) 36static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -80,8 +80,6 @@ struct mlog_attribute {
80} 80}
81 81
82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { 82static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
83 define_mask(ENTRY),
84 define_mask(EXIT),
85 define_mask(TCP), 83 define_mask(TCP),
86 define_mask(MSG), 84 define_mask(MSG),
87 define_mask(SOCKET), 85 define_mask(SOCKET),
@@ -93,27 +91,11 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
93 define_mask(DLM_THREAD), 91 define_mask(DLM_THREAD),
94 define_mask(DLM_MASTER), 92 define_mask(DLM_MASTER),
95 define_mask(DLM_RECOVERY), 93 define_mask(DLM_RECOVERY),
96 define_mask(AIO),
97 define_mask(JOURNAL),
98 define_mask(DISK_ALLOC),
99 define_mask(SUPER),
100 define_mask(FILE_IO),
101 define_mask(EXTENT_MAP),
102 define_mask(DLM_GLUE), 94 define_mask(DLM_GLUE),
103 define_mask(BH_IO),
104 define_mask(UPTODATE),
105 define_mask(NAMEI),
106 define_mask(INODE),
107 define_mask(VOTE), 95 define_mask(VOTE),
108 define_mask(DCACHE),
109 define_mask(CONN), 96 define_mask(CONN),
110 define_mask(QUORUM), 97 define_mask(QUORUM),
111 define_mask(EXPORT),
112 define_mask(XATTR),
113 define_mask(QUOTA),
114 define_mask(REFCOUNT),
115 define_mask(BASTS), 98 define_mask(BASTS),
116 define_mask(RESERVATIONS),
117 define_mask(CLUSTER), 99 define_mask(CLUSTER),
118 define_mask(ERROR), 100 define_mask(ERROR),
119 define_mask(NOTICE), 101 define_mask(NOTICE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 34d6544357d9..baa2b9ef7eef 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -82,41 +82,23 @@
82 82
83/* bits that are frequently given and infrequently matched in the low word */ 83/* bits that are frequently given and infrequently matched in the low word */
84/* NOTE: If you add a flag, you need to also update masklog.c! */ 84/* NOTE: If you add a flag, you need to also update masklog.c! */
85#define ML_ENTRY 0x0000000000000001ULL /* func call entry */ 85#define ML_TCP 0x0000000000000001ULL /* net cluster/tcp.c */
86#define ML_EXIT 0x0000000000000002ULL /* func call exit */ 86#define ML_MSG 0x0000000000000002ULL /* net network messages */
87#define ML_TCP 0x0000000000000004ULL /* net cluster/tcp.c */ 87#define ML_SOCKET 0x0000000000000004ULL /* net socket lifetime */
88#define ML_MSG 0x0000000000000008ULL /* net network messages */ 88#define ML_HEARTBEAT 0x0000000000000008ULL /* hb all heartbeat tracking */
89#define ML_SOCKET 0x0000000000000010ULL /* net socket lifetime */ 89#define ML_HB_BIO 0x0000000000000010ULL /* hb io tracing */
90#define ML_HEARTBEAT 0x0000000000000020ULL /* hb all heartbeat tracking */ 90#define ML_DLMFS 0x0000000000000020ULL /* dlm user dlmfs */
91#define ML_HB_BIO 0x0000000000000040ULL /* hb io tracing */ 91#define ML_DLM 0x0000000000000040ULL /* dlm general debugging */
92#define ML_DLMFS 0x0000000000000080ULL /* dlm user dlmfs */ 92#define ML_DLM_DOMAIN 0x0000000000000080ULL /* dlm domain debugging */
93#define ML_DLM 0x0000000000000100ULL /* dlm general debugging */ 93#define ML_DLM_THREAD 0x0000000000000100ULL /* dlm domain thread */
94#define ML_DLM_DOMAIN 0x0000000000000200ULL /* dlm domain debugging */ 94#define ML_DLM_MASTER 0x0000000000000200ULL /* dlm master functions */
95#define ML_DLM_THREAD 0x0000000000000400ULL /* dlm domain thread */ 95#define ML_DLM_RECOVERY 0x0000000000000400ULL /* dlm master functions */
96#define ML_DLM_MASTER 0x0000000000000800ULL /* dlm master functions */ 96#define ML_DLM_GLUE 0x0000000000000800ULL /* ocfs2 dlm glue layer */
97#define ML_DLM_RECOVERY 0x0000000000001000ULL /* dlm master functions */ 97#define ML_VOTE 0x0000000000001000ULL /* ocfs2 node messaging */
98#define ML_AIO 0x0000000000002000ULL /* ocfs2 aio read and write */ 98#define ML_CONN 0x0000000000002000ULL /* net connection management */
99#define ML_JOURNAL 0x0000000000004000ULL /* ocfs2 journalling functions */ 99#define ML_QUORUM 0x0000000000004000ULL /* net connection quorum */
100#define ML_DISK_ALLOC 0x0000000000008000ULL /* ocfs2 disk allocation */ 100#define ML_BASTS 0x0000000000008000ULL /* dlmglue asts and basts */
101#define ML_SUPER 0x0000000000010000ULL /* ocfs2 mount / umount */ 101#define ML_CLUSTER 0x0000000000010000ULL /* cluster stack */
102#define ML_FILE_IO 0x0000000000020000ULL /* ocfs2 file I/O */
103#define ML_EXTENT_MAP 0x0000000000040000ULL /* ocfs2 extent map caching */
104#define ML_DLM_GLUE 0x0000000000080000ULL /* ocfs2 dlm glue layer */
105#define ML_BH_IO 0x0000000000100000ULL /* ocfs2 buffer I/O */
106#define ML_UPTODATE 0x0000000000200000ULL /* ocfs2 caching sequence #'s */
107#define ML_NAMEI 0x0000000000400000ULL /* ocfs2 directory / namespace */
108#define ML_INODE 0x0000000000800000ULL /* ocfs2 inode manipulation */
109#define ML_VOTE 0x0000000001000000ULL /* ocfs2 node messaging */
110#define ML_DCACHE 0x0000000002000000ULL /* ocfs2 dcache operations */
111#define ML_CONN 0x0000000004000000ULL /* net connection management */
112#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
113#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
114#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
115#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
116#define ML_REFCOUNT 0x0000000080000000ULL /* refcount tree operations */
117#define ML_BASTS 0x0000000100000000ULL /* dlmglue asts and basts */
118#define ML_RESERVATIONS 0x0000000200000000ULL /* ocfs2 alloc reservations */
119#define ML_CLUSTER 0x0000000400000000ULL /* cluster stack */
120 102
121/* bits that are infrequently given and frequently matched in the high word */ 103/* bits that are infrequently given and frequently matched in the high word */
122#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */ 104#define ML_ERROR 0x1000000000000000ULL /* sent to KERN_ERR */
@@ -124,7 +106,6 @@
124#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */ 106#define ML_KTHREAD 0x4000000000000000ULL /* kernel thread activity */
125 107
126#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE) 108#define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
127#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
128#ifndef MLOG_MASK_PREFIX 109#ifndef MLOG_MASK_PREFIX
129#define MLOG_MASK_PREFIX 0 110#define MLOG_MASK_PREFIX 0
130#endif 111#endif
@@ -222,58 +203,6 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
222 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ 203 mlog(ML_ERROR, "status = %lld\n", (long long)_st); \
223} while (0) 204} while (0)
224 205
225#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
226#define mlog_entry(fmt, args...) do { \
227 mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \
228} while (0)
229
230#define mlog_entry_void() do { \
231 mlog(ML_ENTRY, "ENTRY:\n"); \
232} while (0)
233
234/*
235 * We disable this for sparse.
236 */
237#if !defined(__CHECKER__)
238#define mlog_exit(st) do { \
239 if (__builtin_types_compatible_p(typeof(st), unsigned long)) \
240 mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st)); \
241 else if (__builtin_types_compatible_p(typeof(st), signed long)) \
242 mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st)); \
243 else if (__builtin_types_compatible_p(typeof(st), unsigned int) \
244 || __builtin_types_compatible_p(typeof(st), unsigned short) \
245 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
246 mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st)); \
247 else if (__builtin_types_compatible_p(typeof(st), signed int) \
248 || __builtin_types_compatible_p(typeof(st), signed short) \
249 || __builtin_types_compatible_p(typeof(st), signed char)) \
250 mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st)); \
251 else if (__builtin_types_compatible_p(typeof(st), long long)) \
252 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
253 else \
254 mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st)); \
255} while (0)
256#else
257#define mlog_exit(st) do { \
258 mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st)); \
259} while (0)
260#endif
261
262#define mlog_exit_ptr(ptr) do { \
263 mlog(ML_EXIT, "EXIT: %p\n", ptr); \
264} while (0)
265
266#define mlog_exit_void() do { \
267 mlog(ML_EXIT, "EXIT\n"); \
268} while (0)
269#else
270#define mlog_entry(...) do { } while (0)
271#define mlog_entry_void(...) do { } while (0)
272#define mlog_exit(...) do { } while (0)
273#define mlog_exit_ptr(...) do { } while (0)
274#define mlog_exit_void(...) do { } while (0)
275#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
276
277#define mlog_bug_on_msg(cond, fmt, args...) do { \ 206#define mlog_bug_on_msg(cond, fmt, args...) do { \
278 if (cond) { \ 207 if (cond) { \
279 mlog(ML_ERROR, "bug expression: " #cond "\n"); \ 208 mlog(ML_ERROR, "bug expression: " #cond "\n"); \
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index a87366750f23..8f9cea1597af 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -89,7 +89,7 @@ static void o2quo_fence_self(void)
89 }; 89 };
90} 90}
91 91
92/* Indicate that a timeout occured on a hearbeat region write. The 92/* Indicate that a timeout occurred on a hearbeat region write. The
93 * other nodes in the cluster may consider us dead at that time so we 93 * other nodes in the cluster may consider us dead at that time so we
94 * want to "fence" ourselves so that we don't scribble on the disk 94 * want to "fence" ourselves so that we don't scribble on the disk
95 * after they think they've recovered us. This can't solve all 95 * after they think they've recovered us. This can't solve all
@@ -261,7 +261,7 @@ void o2quo_hb_still_up(u8 node)
261 spin_unlock(&qs->qs_lock); 261 spin_unlock(&qs->qs_lock);
262} 262}
263 263
264/* This is analagous to hb_up. as a node's connection comes up we delay the 264/* This is analogous to hb_up. as a node's connection comes up we delay the
265 * quorum decision until we see it heartbeating. the hold will be droped in 265 * quorum decision until we see it heartbeating. the hold will be droped in
266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if 266 * hb_up or hb_down. it might be perpetuated by con_err until hb_down. if
267 * it's already heartbeating we we might be dropping a hold that conn_up got. 267 * it's already heartbeating we we might be dropping a hold that conn_up got.
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 3b11cb1e38fc..db5ee4b4f47a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -210,10 +210,6 @@ static inline void o2net_set_func_stop_time(struct o2net_sock_container *sc)
210 sc->sc_tv_func_stop = ktime_get(); 210 sc->sc_tv_func_stop = ktime_get();
211} 211}
212 212
213static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
214{
215 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
216}
217#else /* CONFIG_DEBUG_FS */ 213#else /* CONFIG_DEBUG_FS */
218# define o2net_init_nst(a, b, c, d, e) 214# define o2net_init_nst(a, b, c, d, e)
219# define o2net_set_nst_sock_time(a) 215# define o2net_set_nst_sock_time(a)
@@ -227,10 +223,14 @@ static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
227# define o2net_set_advance_stop_time(a) 223# define o2net_set_advance_stop_time(a)
228# define o2net_set_func_start_time(a) 224# define o2net_set_func_start_time(a)
229# define o2net_set_func_stop_time(a) 225# define o2net_set_func_stop_time(a)
230# define o2net_get_func_run_time(a) (ktime_t)0
231#endif /* CONFIG_DEBUG_FS */ 226#endif /* CONFIG_DEBUG_FS */
232 227
233#ifdef CONFIG_OCFS2_FS_STATS 228#ifdef CONFIG_OCFS2_FS_STATS
229static ktime_t o2net_get_func_run_time(struct o2net_sock_container *sc)
230{
231 return ktime_sub(sc->sc_tv_func_stop, sc->sc_tv_func_start);
232}
233
234static void o2net_update_send_stats(struct o2net_send_tracking *nst, 234static void o2net_update_send_stats(struct o2net_send_tracking *nst,
235 struct o2net_sock_container *sc) 235 struct o2net_sock_container *sc)
236{ 236{
@@ -565,7 +565,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
565 * the work queue actually being up. */ 565 * the work queue actually being up. */
566 if (!valid && o2net_wq) { 566 if (!valid && o2net_wq) {
567 unsigned long delay; 567 unsigned long delay;
568 /* delay if we're withing a RECONNECT_DELAY of the 568 /* delay if we're within a RECONNECT_DELAY of the
569 * last attempt */ 569 * last attempt */
570 delay = (nn->nn_last_connect_attempt + 570 delay = (nn->nn_last_connect_attempt +
571 msecs_to_jiffies(o2net_reconnect_delay())) 571 msecs_to_jiffies(o2net_reconnect_delay()))
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..e5ba34818332 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/namei.h> 29#include <linux/namei.h>
30 30
31#define MLOG_MASK_PREFIX ML_DCACHE
32#include <cluster/masklog.h> 31#include <cluster/masklog.h>
33 32
34#include "ocfs2.h" 33#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "file.h" 38#include "file.h"
40#include "inode.h" 39#include "inode.h"
41#include "super.h" 40#include "super.h"
41#include "ocfs2_trace.h"
42 42
43void ocfs2_dentry_attach_gen(struct dentry *dentry) 43void ocfs2_dentry_attach_gen(struct dentry *dentry)
44{ 44{
@@ -56,14 +56,14 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
56 int ret = 0; /* if all else fails, just return false */ 56 int ret = 0; /* if all else fails, just return false */
57 struct ocfs2_super *osb; 57 struct ocfs2_super *osb;
58 58
59 if (nd->flags & LOOKUP_RCU) 59 if (nd && nd->flags & LOOKUP_RCU)
60 return -ECHILD; 60 return -ECHILD;
61 61
62 inode = dentry->d_inode; 62 inode = dentry->d_inode;
63 osb = OCFS2_SB(dentry->d_sb); 63 osb = OCFS2_SB(dentry->d_sb);
64 64
65 mlog_entry("(0x%p, '%.*s')\n", dentry, 65 trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
66 dentry->d_name.len, dentry->d_name.name); 66 dentry->d_name.name);
67 67
68 /* For a negative dentry - 68 /* For a negative dentry -
69 * check the generation number of the parent and compare with the 69 * check the generation number of the parent and compare with the
@@ -73,9 +73,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
73 unsigned long gen = (unsigned long) dentry->d_fsdata; 73 unsigned long gen = (unsigned long) dentry->d_fsdata;
74 unsigned long pgen = 74 unsigned long pgen =
75 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; 75 OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
76 mlog(0, "negative dentry: %.*s parent gen: %lu " 76
77 "dentry gen: %lu\n", 77 trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
78 dentry->d_name.len, dentry->d_name.name, pgen, gen); 78 dentry->d_name.name,
79 pgen, gen);
79 if (gen != pgen) 80 if (gen != pgen)
80 goto bail; 81 goto bail;
81 goto valid; 82 goto valid;
@@ -90,8 +91,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
90 /* did we or someone else delete this inode? */ 91 /* did we or someone else delete this inode? */
91 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 92 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
92 spin_unlock(&OCFS2_I(inode)->ip_lock); 93 spin_unlock(&OCFS2_I(inode)->ip_lock);
93 mlog(0, "inode (%llu) deleted, returning false\n", 94 trace_ocfs2_dentry_revalidate_delete(
94 (unsigned long long)OCFS2_I(inode)->ip_blkno); 95 (unsigned long long)OCFS2_I(inode)->ip_blkno);
95 goto bail; 96 goto bail;
96 } 97 }
97 spin_unlock(&OCFS2_I(inode)->ip_lock); 98 spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -101,10 +102,9 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
101 * inode nlink hits zero, it never goes back. 102 * inode nlink hits zero, it never goes back.
102 */ 103 */
103 if (inode->i_nlink == 0) { 104 if (inode->i_nlink == 0) {
104 mlog(0, "Inode %llu orphaned, returning false " 105 trace_ocfs2_dentry_revalidate_orphaned(
105 "dir = %d\n", 106 (unsigned long long)OCFS2_I(inode)->ip_blkno,
106 (unsigned long long)OCFS2_I(inode)->ip_blkno, 107 S_ISDIR(inode->i_mode));
107 S_ISDIR(inode->i_mode));
108 goto bail; 108 goto bail;
109 } 109 }
110 110
@@ -113,9 +113,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
113 * redo it. 113 * redo it.
114 */ 114 */
115 if (!dentry->d_fsdata) { 115 if (!dentry->d_fsdata) {
116 mlog(0, "Inode %llu doesn't have dentry lock, " 116 trace_ocfs2_dentry_revalidate_nofsdata(
117 "returning false\n", 117 (unsigned long long)OCFS2_I(inode)->ip_blkno);
118 (unsigned long long)OCFS2_I(inode)->ip_blkno);
119 goto bail; 118 goto bail;
120 } 119 }
121 120
@@ -123,8 +122,7 @@ valid:
123 ret = 1; 122 ret = 1;
124 123
125bail: 124bail:
126 mlog_exit(ret); 125 trace_ocfs2_dentry_revalidate_ret(ret);
127
128 return ret; 126 return ret;
129} 127}
130 128
@@ -181,8 +179,8 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
181 179
182 spin_lock(&dentry->d_lock); 180 spin_lock(&dentry->d_lock);
183 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { 181 if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
184 mlog(0, "dentry found: %.*s\n", 182 trace_ocfs2_find_local_alias(dentry->d_name.len,
185 dentry->d_name.len, dentry->d_name.name); 183 dentry->d_name.name);
186 184
187 dget_dlock(dentry); 185 dget_dlock(dentry);
188 spin_unlock(&dentry->d_lock); 186 spin_unlock(&dentry->d_lock);
@@ -240,9 +238,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
240 struct dentry *alias; 238 struct dentry *alias;
241 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 239 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
242 240
243 mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n", 241 trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
244 dentry->d_name.len, dentry->d_name.name, 242 (unsigned long long)parent_blkno, dl);
245 (unsigned long long)parent_blkno, dl);
246 243
247 /* 244 /*
248 * Negative dentry. We ignore these for now. 245 * Negative dentry. We ignore these for now.
@@ -292,7 +289,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
292 (unsigned long long)parent_blkno, 289 (unsigned long long)parent_blkno,
293 (unsigned long long)dl->dl_parent_blkno); 290 (unsigned long long)dl->dl_parent_blkno);
294 291
295 mlog(0, "Found: %s\n", dl->dl_lockres.l_name); 292 trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
293 (unsigned long long)parent_blkno,
294 (unsigned long long)OCFS2_I(inode)->ip_blkno);
296 295
297 goto out_attach; 296 goto out_attach;
298 } 297 }
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c7..9fe5b8fd658f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -43,7 +43,6 @@
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44#include <linux/sort.h> 44#include <linux/sort.h>
45 45
46#define MLOG_MASK_PREFIX ML_NAMEI
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
48 47
49#include "ocfs2.h" 48#include "ocfs2.h"
@@ -61,6 +60,7 @@
61#include "super.h" 60#include "super.h"
62#include "sysfile.h" 61#include "sysfile.h"
63#include "uptodate.h" 62#include "uptodate.h"
63#include "ocfs2_trace.h"
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
@@ -322,21 +322,23 @@ static int ocfs2_check_dir_entry(struct inode * dir,
322 const char *error_msg = NULL; 322 const char *error_msg = NULL;
323 const int rlen = le16_to_cpu(de->rec_len); 323 const int rlen = le16_to_cpu(de->rec_len);
324 324
325 if (rlen < OCFS2_DIR_REC_LEN(1)) 325 if (unlikely(rlen < OCFS2_DIR_REC_LEN(1)))
326 error_msg = "rec_len is smaller than minimal"; 326 error_msg = "rec_len is smaller than minimal";
327 else if (rlen % 4 != 0) 327 else if (unlikely(rlen % 4 != 0))
328 error_msg = "rec_len % 4 != 0"; 328 error_msg = "rec_len % 4 != 0";
329 else if (rlen < OCFS2_DIR_REC_LEN(de->name_len)) 329 else if (unlikely(rlen < OCFS2_DIR_REC_LEN(de->name_len)))
330 error_msg = "rec_len is too small for name_len"; 330 error_msg = "rec_len is too small for name_len";
331 else if (((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize) 331 else if (unlikely(
332 ((char *) de - bh->b_data) + rlen > dir->i_sb->s_blocksize))
332 error_msg = "directory entry across blocks"; 333 error_msg = "directory entry across blocks";
333 334
334 if (error_msg != NULL) 335 if (unlikely(error_msg != NULL))
335 mlog(ML_ERROR, "bad entry in directory #%llu: %s - " 336 mlog(ML_ERROR, "bad entry in directory #%llu: %s - "
336 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n", 337 "offset=%lu, inode=%llu, rec_len=%d, name_len=%d\n",
337 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg, 338 (unsigned long long)OCFS2_I(dir)->ip_blkno, error_msg,
338 offset, (unsigned long long)le64_to_cpu(de->inode), rlen, 339 offset, (unsigned long long)le64_to_cpu(de->inode), rlen,
339 de->name_len); 340 de->name_len);
341
340 return error_msg == NULL ? 1 : 0; 342 return error_msg == NULL ? 1 : 0;
341} 343}
342 344
@@ -354,7 +356,7 @@ static inline int ocfs2_match(int len,
354/* 356/*
355 * Returns 0 if not found, -1 on failure, and 1 on success 357 * Returns 0 if not found, -1 on failure, and 1 on success
356 */ 358 */
357static int inline ocfs2_search_dirblock(struct buffer_head *bh, 359static inline int ocfs2_search_dirblock(struct buffer_head *bh,
358 struct inode *dir, 360 struct inode *dir,
359 const char *name, int namelen, 361 const char *name, int namelen,
360 unsigned long offset, 362 unsigned long offset,
@@ -367,8 +369,6 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
367 int de_len; 369 int de_len;
368 int ret = 0; 370 int ret = 0;
369 371
370 mlog_entry_void();
371
372 de_buf = first_de; 372 de_buf = first_de;
373 dlimit = de_buf + bytes; 373 dlimit = de_buf + bytes;
374 374
@@ -402,7 +402,7 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
402 } 402 }
403 403
404bail: 404bail:
405 mlog_exit(ret); 405 trace_ocfs2_search_dirblock(ret);
406 return ret; 406 return ret;
407} 407}
408 408
@@ -447,8 +447,7 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
447 * We don't validate dirents here, that's handled 447 * We don't validate dirents here, that's handled
448 * in-place when the code walks them. 448 * in-place when the code walks them.
449 */ 449 */
450 mlog(0, "Validating dirblock %llu\n", 450 trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
451 (unsigned long long)bh->b_blocknr);
452 451
453 BUG_ON(!buffer_uptodate(bh)); 452 BUG_ON(!buffer_uptodate(bh));
454 453
@@ -706,8 +705,6 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
706 int num = 0; 705 int num = 0;
707 int nblocks, i, err; 706 int nblocks, i, err;
708 707
709 mlog_entry_void();
710
711 sb = dir->i_sb; 708 sb = dir->i_sb;
712 709
713 nblocks = i_size_read(dir) >> sb->s_blocksize_bits; 710 nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -788,7 +785,7 @@ cleanup_and_exit:
788 for (; ra_ptr < ra_max; ra_ptr++) 785 for (; ra_ptr < ra_max; ra_ptr++)
789 brelse(bh_use[ra_ptr]); 786 brelse(bh_use[ra_ptr]);
790 787
791 mlog_exit_ptr(ret); 788 trace_ocfs2_find_entry_el(ret);
792 return ret; 789 return ret;
793} 790}
794 791
@@ -950,11 +947,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
950 goto out; 947 goto out;
951 } 948 }
952 949
953 mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x " 950 trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
954 "returns: %llu\n", 951 namelen, name, hinfo->major_hash,
955 (unsigned long long)OCFS2_I(dir)->ip_blkno, 952 hinfo->minor_hash, (unsigned long long)phys);
956 namelen, name, hinfo->major_hash, hinfo->minor_hash,
957 (unsigned long long)phys);
958 953
959 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh); 954 ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
960 if (ret) { 955 if (ret) {
@@ -964,9 +959,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
964 959
965 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data; 960 dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
966 961
967 mlog(0, "leaf info: num_used: %d, count: %d\n", 962 trace_ocfs2_dx_dir_search_leaf_info(
968 le16_to_cpu(dx_leaf->dl_list.de_num_used), 963 le16_to_cpu(dx_leaf->dl_list.de_num_used),
969 le16_to_cpu(dx_leaf->dl_list.de_count)); 964 le16_to_cpu(dx_leaf->dl_list.de_count));
970 965
971 entry_list = &dx_leaf->dl_list; 966 entry_list = &dx_leaf->dl_list;
972 967
@@ -1166,8 +1161,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1166 int i, status = -ENOENT; 1161 int i, status = -ENOENT;
1167 ocfs2_journal_access_func access = ocfs2_journal_access_db; 1162 ocfs2_journal_access_func access = ocfs2_journal_access_db;
1168 1163
1169 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
1170
1171 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 1164 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
1172 access = ocfs2_journal_access_di; 1165 access = ocfs2_journal_access_di;
1173 1166
@@ -1202,7 +1195,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
1202 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len)); 1195 de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
1203 } 1196 }
1204bail: 1197bail:
1205 mlog_exit(status);
1206 return status; 1198 return status;
1207} 1199}
1208 1200
@@ -1348,8 +1340,8 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
1348 } 1340 }
1349 } 1341 }
1350 1342
1351 mlog(0, "Dir %llu: delete entry at index: %d\n", 1343 trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
1352 (unsigned long long)OCFS2_I(dir)->ip_blkno, index); 1344 index);
1353 1345
1354 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry, 1346 ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
1355 leaf_bh, leaf_bh->b_data, leaf_bh->b_size); 1347 leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
@@ -1632,8 +1624,6 @@ int __ocfs2_add_entry(handle_t *handle,
1632 struct buffer_head *insert_bh = lookup->dl_leaf_bh; 1624 struct buffer_head *insert_bh = lookup->dl_leaf_bh;
1633 char *data_start = insert_bh->b_data; 1625 char *data_start = insert_bh->b_data;
1634 1626
1635 mlog_entry_void();
1636
1637 if (!namelen) 1627 if (!namelen)
1638 return -EINVAL; 1628 return -EINVAL;
1639 1629
@@ -1765,8 +1755,9 @@ int __ocfs2_add_entry(handle_t *handle,
1765 * from ever getting here. */ 1755 * from ever getting here. */
1766 retval = -ENOSPC; 1756 retval = -ENOSPC;
1767bail: 1757bail:
1758 if (retval)
1759 mlog_errno(retval);
1768 1760
1769 mlog_exit(retval);
1770 return retval; 1761 return retval;
1771} 1762}
1772 1763
@@ -2028,8 +2019,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2028 struct inode *inode = filp->f_path.dentry->d_inode; 2019 struct inode *inode = filp->f_path.dentry->d_inode;
2029 int lock_level = 0; 2020 int lock_level = 0;
2030 2021
2031 mlog_entry("dirino=%llu\n", 2022 trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
2032 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2033 2023
2034 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); 2024 error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
2035 if (lock_level && error >= 0) { 2025 if (lock_level && error >= 0) {
@@ -2051,9 +2041,10 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
2051 dirent, filldir, NULL); 2041 dirent, filldir, NULL);
2052 2042
2053 ocfs2_inode_unlock(inode, lock_level); 2043 ocfs2_inode_unlock(inode, lock_level);
2044 if (error)
2045 mlog_errno(error);
2054 2046
2055bail_nolock: 2047bail_nolock:
2056 mlog_exit(error);
2057 2048
2058 return error; 2049 return error;
2059} 2050}
@@ -2069,8 +2060,8 @@ int ocfs2_find_files_on_disk(const char *name,
2069{ 2060{
2070 int status = -ENOENT; 2061 int status = -ENOENT;
2071 2062
2072 mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno, 2063 trace_ocfs2_find_files_on_disk(namelen, name, blkno,
2073 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2064 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2074 2065
2075 status = ocfs2_find_entry(name, namelen, inode, lookup); 2066 status = ocfs2_find_entry(name, namelen, inode, lookup);
2076 if (status) 2067 if (status)
@@ -2114,8 +2105,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
2114 int ret; 2105 int ret;
2115 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2106 struct ocfs2_dir_lookup_result lookup = { NULL, };
2116 2107
2117 mlog_entry("dir %llu, name '%.*s'\n", 2108 trace_ocfs2_check_dir_for_entry(
2118 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); 2109 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
2119 2110
2120 ret = -EEXIST; 2111 ret = -EEXIST;
2121 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) 2112 if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2125,7 +2116,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
2125bail: 2116bail:
2126 ocfs2_free_dir_lookup_result(&lookup); 2117 ocfs2_free_dir_lookup_result(&lookup);
2127 2118
2128 mlog_exit(ret); 2119 if (ret)
2120 mlog_errno(ret);
2129 return ret; 2121 return ret;
2130} 2122}
2131 2123
@@ -2324,8 +2316,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2324 struct buffer_head *new_bh = NULL; 2316 struct buffer_head *new_bh = NULL;
2325 struct ocfs2_dir_entry *de; 2317 struct ocfs2_dir_entry *de;
2326 2318
2327 mlog_entry_void();
2328
2329 if (ocfs2_new_dir_wants_trailer(inode)) 2319 if (ocfs2_new_dir_wants_trailer(inode))
2330 size = ocfs2_dir_trailer_blk_off(parent->i_sb); 2320 size = ocfs2_dir_trailer_blk_off(parent->i_sb);
2331 2321
@@ -2380,7 +2370,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
2380bail: 2370bail:
2381 brelse(new_bh); 2371 brelse(new_bh);
2382 2372
2383 mlog_exit(status);
2384 return status; 2373 return status;
2385} 2374}
2386 2375
@@ -2409,9 +2398,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
2409 goto out; 2398 goto out;
2410 } 2399 }
2411 2400
2412 mlog(0, "Dir %llu, attach new index block: %llu\n", 2401 trace_ocfs2_dx_dir_attach_index(
2413 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2402 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2414 (unsigned long long)dr_blkno); 2403 (unsigned long long)dr_blkno);
2415 2404
2416 dx_root_bh = sb_getblk(osb->sb, dr_blkno); 2405 dx_root_bh = sb_getblk(osb->sb, dr_blkno);
2417 if (dx_root_bh == NULL) { 2406 if (dx_root_bh == NULL) {
@@ -2511,11 +2500,10 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
2511 dx_leaf->dl_list.de_count = 2500 dx_leaf->dl_list.de_count =
2512 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb)); 2501 cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
2513 2502
2514 mlog(0, 2503 trace_ocfs2_dx_dir_format_cluster(
2515 "Dir %llu, format dx_leaf: %llu, entry count: %u\n", 2504 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2516 (unsigned long long)OCFS2_I(dir)->ip_blkno, 2505 (unsigned long long)bh->b_blocknr,
2517 (unsigned long long)bh->b_blocknr, 2506 le16_to_cpu(dx_leaf->dl_list.de_count));
2518 le16_to_cpu(dx_leaf->dl_list.de_count));
2519 2507
2520 ocfs2_journal_dirty(handle, bh); 2508 ocfs2_journal_dirty(handle, bh);
2521 } 2509 }
@@ -2759,12 +2747,11 @@ static void ocfs2_dx_dir_index_root_block(struct inode *dir,
2759 2747
2760 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo); 2748 ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
2761 2749
2762 mlog(0, 2750 trace_ocfs2_dx_dir_index_root_block(
2763 "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n", 2751 (unsigned long long)dir->i_ino,
2764 (unsigned long long)dir->i_ino, hinfo.major_hash, 2752 hinfo.major_hash, hinfo.minor_hash,
2765 hinfo.minor_hash, 2753 de->name_len, de->name,
2766 le16_to_cpu(dx_root->dr_entries.de_num_used), 2754 le16_to_cpu(dx_root->dr_entries.de_num_used));
2767 de->name_len, de->name);
2768 2755
2769 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo, 2756 ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
2770 dirent_blk); 2757 dirent_blk);
@@ -3235,7 +3222,6 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
3235bail: 3222bail:
3236 if (did_quota && status < 0) 3223 if (did_quota && status < 0)
3237 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); 3224 dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
3238 mlog_exit(status);
3239 return status; 3225 return status;
3240} 3226}
3241 3227
@@ -3270,8 +3256,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3270 struct ocfs2_extent_tree et; 3256 struct ocfs2_extent_tree et;
3271 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh; 3257 struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
3272 3258
3273 mlog_entry_void();
3274
3275 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { 3259 if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
3276 /* 3260 /*
3277 * This would be a code error as an inline directory should 3261 * This would be a code error as an inline directory should
@@ -3320,8 +3304,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
3320 down_write(&OCFS2_I(dir)->ip_alloc_sem); 3304 down_write(&OCFS2_I(dir)->ip_alloc_sem);
3321 drop_alloc_sem = 1; 3305 drop_alloc_sem = 1;
3322 dir_i_size = i_size_read(dir); 3306 dir_i_size = i_size_read(dir);
3323 mlog(0, "extending dir %llu (i_size = %lld)\n", 3307 trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
3324 (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size); 3308 dir_i_size);
3325 3309
3326 /* dir->i_size is always block aligned. */ 3310 /* dir->i_size is always block aligned. */
3327 spin_lock(&OCFS2_I(dir)->ip_lock); 3311 spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -3436,7 +3420,6 @@ bail:
3436 3420
3437 brelse(new_bh); 3421 brelse(new_bh);
3438 3422
3439 mlog_exit(status);
3440 return status; 3423 return status;
3441} 3424}
3442 3425
@@ -3583,8 +3566,9 @@ next:
3583 status = 0; 3566 status = 0;
3584bail: 3567bail:
3585 brelse(bh); 3568 brelse(bh);
3569 if (status)
3570 mlog_errno(status);
3586 3571
3587 mlog_exit(status);
3588 return status; 3572 return status;
3589} 3573}
3590 3574
@@ -3815,9 +3799,9 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3815 struct ocfs2_dx_root_block *dx_root; 3799 struct ocfs2_dx_root_block *dx_root;
3816 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL; 3800 struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
3817 3801
3818 mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n", 3802 trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
3819 (unsigned long long)OCFS2_I(dir)->ip_blkno, 3803 (unsigned long long)leaf_blkno,
3820 (unsigned long long)leaf_blkno, insert_hash); 3804 insert_hash);
3821 3805
3822 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh); 3806 ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
3823 3807
@@ -3897,8 +3881,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
3897 goto out_commit; 3881 goto out_commit;
3898 } 3882 }
3899 3883
3900 mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n", 3884 trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
3901 leaf_cpos, split_hash, insert_hash);
3902 3885
3903 /* 3886 /*
3904 * We have to carefully order operations here. There are items 3887 * We have to carefully order operations here. There are items
@@ -4355,8 +4338,8 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
4355 unsigned int blocks_wanted = 1; 4338 unsigned int blocks_wanted = 1;
4356 struct buffer_head *bh = NULL; 4339 struct buffer_head *bh = NULL;
4357 4340
4358 mlog(0, "getting ready to insert namelen %d into dir %llu\n", 4341 trace_ocfs2_prepare_dir_for_insert(
4359 namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno); 4342 (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
4360 4343
4361 if (!namelen) { 4344 if (!namelen) {
4362 ret = -EINVAL; 4345 ret = -EINVAL;
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index dcebf0d920fa..c8a044efbb15 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
4 4
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 9f30491e5e88..29a886d1e82c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -128,8 +128,8 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
128 128
129 assert_spin_locked(&res->spinlock); 129 assert_spin_locked(&res->spinlock);
130 130
131 mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n", 131 mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
132 lock->ml.type, lock->ml.convert_type, type); 132 lock->ml.type, lock->ml.convert_type, type);
133 133
134 spin_lock(&lock->spinlock); 134 spin_lock(&lock->spinlock);
135 135
@@ -353,7 +353,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
353 struct kvec vec[2]; 353 struct kvec vec[2];
354 size_t veclen = 1; 354 size_t veclen = 1;
355 355
356 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); 356 mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
357 357
358 memset(&convert, 0, sizeof(struct dlm_convert_lock)); 358 memset(&convert, 0, sizeof(struct dlm_convert_lock));
359 convert.node_idx = dlm->node_num; 359 convert.node_idx = dlm->node_num;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7e38a072d720..7540a492eaba 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -188,7 +188,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
188 struct hlist_head *bucket; 188 struct hlist_head *bucket;
189 struct hlist_node *list; 189 struct hlist_node *list;
190 190
191 mlog_entry("%.*s\n", len, name); 191 mlog(0, "%.*s\n", len, name);
192 192
193 assert_spin_locked(&dlm->spinlock); 193 assert_spin_locked(&dlm->spinlock);
194 194
@@ -222,7 +222,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
222{ 222{
223 struct dlm_lock_resource *res = NULL; 223 struct dlm_lock_resource *res = NULL;
224 224
225 mlog_entry("%.*s\n", len, name); 225 mlog(0, "%.*s\n", len, name);
226 226
227 assert_spin_locked(&dlm->spinlock); 227 assert_spin_locked(&dlm->spinlock);
228 228
@@ -531,7 +531,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
531 unsigned int node; 531 unsigned int node;
532 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf; 532 struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
533 533
534 mlog_entry("%p %u %p", msg, len, data); 534 mlog(0, "%p %u %p", msg, len, data);
535 535
536 if (!dlm_grab(dlm)) 536 if (!dlm_grab(dlm))
537 return 0; 537 return 0;
@@ -926,9 +926,10 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
926} 926}
927 927
928static int dlm_match_regions(struct dlm_ctxt *dlm, 928static int dlm_match_regions(struct dlm_ctxt *dlm,
929 struct dlm_query_region *qr) 929 struct dlm_query_region *qr,
930 char *local, int locallen)
930{ 931{
931 char *local = NULL, *remote = qr->qr_regions; 932 char *remote = qr->qr_regions;
932 char *l, *r; 933 char *l, *r;
933 int localnr, i, j, foundit; 934 int localnr, i, j, foundit;
934 int status = 0; 935 int status = 0;
@@ -957,13 +958,8 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
957 r += O2HB_MAX_REGION_NAME_LEN; 958 r += O2HB_MAX_REGION_NAME_LEN;
958 } 959 }
959 960
960 local = kmalloc(sizeof(qr->qr_regions), GFP_ATOMIC); 961 localnr = min(O2NM_MAX_REGIONS, locallen/O2HB_MAX_REGION_NAME_LEN);
961 if (!local) { 962 localnr = o2hb_get_all_regions(local, (u8)localnr);
962 status = -ENOMEM;
963 goto bail;
964 }
965
966 localnr = o2hb_get_all_regions(local, O2NM_MAX_REGIONS);
967 963
968 /* compare local regions with remote */ 964 /* compare local regions with remote */
969 l = local; 965 l = local;
@@ -1012,8 +1008,6 @@ static int dlm_match_regions(struct dlm_ctxt *dlm,
1012 } 1008 }
1013 1009
1014bail: 1010bail:
1015 kfree(local);
1016
1017 return status; 1011 return status;
1018} 1012}
1019 1013
@@ -1075,6 +1069,7 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1075{ 1069{
1076 struct dlm_query_region *qr; 1070 struct dlm_query_region *qr;
1077 struct dlm_ctxt *dlm = NULL; 1071 struct dlm_ctxt *dlm = NULL;
1072 char *local = NULL;
1078 int status = 0; 1073 int status = 0;
1079 int locked = 0; 1074 int locked = 0;
1080 1075
@@ -1083,6 +1078,13 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1083 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node, 1078 mlog(0, "Node %u queries hb regions on domain %s\n", qr->qr_node,
1084 qr->qr_domain); 1079 qr->qr_domain);
1085 1080
1081 /* buffer used in dlm_mast_regions() */
1082 local = kmalloc(sizeof(qr->qr_regions), GFP_KERNEL);
1083 if (!local) {
1084 status = -ENOMEM;
1085 goto bail;
1086 }
1087
1086 status = -EINVAL; 1088 status = -EINVAL;
1087 1089
1088 spin_lock(&dlm_domain_lock); 1090 spin_lock(&dlm_domain_lock);
@@ -1112,13 +1114,15 @@ static int dlm_query_region_handler(struct o2net_msg *msg, u32 len,
1112 goto bail; 1114 goto bail;
1113 } 1115 }
1114 1116
1115 status = dlm_match_regions(dlm, qr); 1117 status = dlm_match_regions(dlm, qr, local, sizeof(qr->qr_regions));
1116 1118
1117bail: 1119bail:
1118 if (locked) 1120 if (locked)
1119 spin_unlock(&dlm->spinlock); 1121 spin_unlock(&dlm->spinlock);
1120 spin_unlock(&dlm_domain_lock); 1122 spin_unlock(&dlm_domain_lock);
1121 1123
1124 kfree(local);
1125
1122 return status; 1126 return status;
1123} 1127}
1124 1128
@@ -1553,7 +1557,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1553 struct domain_join_ctxt *ctxt; 1557 struct domain_join_ctxt *ctxt;
1554 enum dlm_query_join_response_code response = JOIN_DISALLOW; 1558 enum dlm_query_join_response_code response = JOIN_DISALLOW;
1555 1559
1556 mlog_entry("%p", dlm); 1560 mlog(0, "%p", dlm);
1557 1561
1558 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); 1562 ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
1559 if (!ctxt) { 1563 if (!ctxt) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 7009292aac5a..8d39e0fd66f7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -128,7 +128,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
128 int call_ast = 0, kick_thread = 0; 128 int call_ast = 0, kick_thread = 0;
129 enum dlm_status status = DLM_NORMAL; 129 enum dlm_status status = DLM_NORMAL;
130 130
131 mlog_entry("type=%d\n", lock->ml.type); 131 mlog(0, "type=%d\n", lock->ml.type);
132 132
133 spin_lock(&res->spinlock); 133 spin_lock(&res->spinlock);
134 /* if called from dlm_create_lock_handler, need to 134 /* if called from dlm_create_lock_handler, need to
@@ -227,8 +227,8 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
227 enum dlm_status status = DLM_DENIED; 227 enum dlm_status status = DLM_DENIED;
228 int lockres_changed = 1; 228 int lockres_changed = 1;
229 229
230 mlog_entry("type=%d\n", lock->ml.type); 230 mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
231 mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, 231 lock->ml.type, res->lockname.len,
232 res->lockname.name, flags); 232 res->lockname.name, flags);
233 233
234 spin_lock(&res->spinlock); 234 spin_lock(&res->spinlock);
@@ -308,8 +308,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
308 int tmpret, status = 0; 308 int tmpret, status = 0;
309 enum dlm_status ret; 309 enum dlm_status ret;
310 310
311 mlog_entry_void();
312
313 memset(&create, 0, sizeof(create)); 311 memset(&create, 0, sizeof(create));
314 create.node_idx = dlm->node_num; 312 create.node_idx = dlm->node_num;
315 create.requested_type = lock->ml.type; 313 create.requested_type = lock->ml.type;
@@ -477,8 +475,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
477 475
478 BUG_ON(!dlm); 476 BUG_ON(!dlm);
479 477
480 mlog_entry_void();
481
482 if (!dlm_grab(dlm)) 478 if (!dlm_grab(dlm))
483 return DLM_REJECTED; 479 return DLM_REJECTED;
484 480
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 59f0f6bdfc62..fede57ed005f 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -426,8 +426,6 @@ static void dlm_mle_release(struct kref *kref)
426 struct dlm_master_list_entry *mle; 426 struct dlm_master_list_entry *mle;
427 struct dlm_ctxt *dlm; 427 struct dlm_ctxt *dlm;
428 428
429 mlog_entry_void();
430
431 mle = container_of(kref, struct dlm_master_list_entry, mle_refs); 429 mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
432 dlm = mle->dlm; 430 dlm = mle->dlm;
433 431
@@ -810,7 +808,7 @@ lookup:
810 dlm_mle_detach_hb_events(dlm, mle); 808 dlm_mle_detach_hb_events(dlm, mle);
811 dlm_put_mle(mle); 809 dlm_put_mle(mle);
812 mle = NULL; 810 mle = NULL;
813 /* this is lame, but we cant wait on either 811 /* this is lame, but we can't wait on either
814 * the mle or lockres waitqueue here */ 812 * the mle or lockres waitqueue here */
815 if (mig) 813 if (mig)
816 msleep(100); 814 msleep(100);
@@ -845,7 +843,7 @@ lookup:
845 843
846 /* finally add the lockres to its hash bucket */ 844 /* finally add the lockres to its hash bucket */
847 __dlm_insert_lockres(dlm, res); 845 __dlm_insert_lockres(dlm, res);
848 /* since this lockres is new it doesnt not require the spinlock */ 846 /* since this lockres is new it doesn't not require the spinlock */
849 dlm_lockres_grab_inflight_ref_new(dlm, res); 847 dlm_lockres_grab_inflight_ref_new(dlm, res);
850 848
851 /* if this node does not become the master make sure to drop 849 /* if this node does not become the master make sure to drop
@@ -3120,8 +3118,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3120 3118
3121 *oldmle = NULL; 3119 *oldmle = NULL;
3122 3120
3123 mlog_entry_void();
3124
3125 assert_spin_locked(&dlm->spinlock); 3121 assert_spin_locked(&dlm->spinlock);
3126 assert_spin_locked(&dlm->master_lock); 3122 assert_spin_locked(&dlm->master_lock);
3127 3123
@@ -3261,7 +3257,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3261 struct hlist_node *list; 3257 struct hlist_node *list;
3262 unsigned int i; 3258 unsigned int i;
3263 3259
3264 mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); 3260 mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
3265top: 3261top:
3266 assert_spin_locked(&dlm->spinlock); 3262 assert_spin_locked(&dlm->spinlock);
3267 3263
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index aaaffbcbe916..f1beb6fc254d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -727,7 +727,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
727 if (destroy) 727 if (destroy)
728 dlm_destroy_recovery_area(dlm, dead_node); 728 dlm_destroy_recovery_area(dlm, dead_node);
729 729
730 mlog_exit(status);
731 return status; 730 return status;
732} 731}
733 732
@@ -1496,9 +1495,9 @@ leave:
1496 kfree(buf); 1495 kfree(buf);
1497 if (item) 1496 if (item)
1498 kfree(item); 1497 kfree(item);
1498 mlog_errno(ret);
1499 } 1499 }
1500 1500
1501 mlog_exit(ret);
1502 return ret; 1501 return ret;
1503} 1502}
1504 1503
@@ -1567,7 +1566,6 @@ leave:
1567 dlm_lockres_put(res); 1566 dlm_lockres_put(res);
1568 } 1567 }
1569 kfree(data); 1568 kfree(data);
1570 mlog_exit(ret);
1571} 1569}
1572 1570
1573 1571
@@ -1986,7 +1984,6 @@ leave:
1986 dlm_lock_put(newlock); 1984 dlm_lock_put(newlock);
1987 } 1985 }
1988 1986
1989 mlog_exit(ret);
1990 return ret; 1987 return ret;
1991} 1988}
1992 1989
@@ -2083,8 +2080,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2083 struct hlist_head *bucket; 2080 struct hlist_head *bucket;
2084 struct dlm_lock_resource *res, *next; 2081 struct dlm_lock_resource *res, *next;
2085 2082
2086 mlog_entry_void();
2087
2088 assert_spin_locked(&dlm->spinlock); 2083 assert_spin_locked(&dlm->spinlock);
2089 2084
2090 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
@@ -2607,8 +2602,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2607 int nodenum; 2602 int nodenum;
2608 int status; 2603 int status;
2609 2604
2610 mlog_entry("%u\n", dead_node);
2611
2612 mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); 2605 mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
2613 2606
2614 spin_lock(&dlm->spinlock); 2607 spin_lock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 817287c6a6db..850aa7e87537 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -317,7 +317,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
317 struct kvec vec[2]; 317 struct kvec vec[2];
318 size_t veclen = 1; 318 size_t veclen = 1;
319 319
320 mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); 320 mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
321 321
322 if (owner == dlm->node_num) { 322 if (owner == dlm->node_num) {
323 /* ended up trying to contact ourself. this means 323 /* ended up trying to contact ourself. this means
@@ -588,8 +588,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
588 struct dlm_lock *lock = NULL; 588 struct dlm_lock *lock = NULL;
589 int call_ast, is_master; 589 int call_ast, is_master;
590 590
591 mlog_entry_void();
592
593 if (!lksb) { 591 if (!lksb) {
594 dlm_error(DLM_BADARGS); 592 dlm_error(DLM_BADARGS);
595 return DLM_BADARGS; 593 return DLM_BADARGS;
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index df69b4856d0d..f14be89a6701 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,4 +1,4 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1ccflags-y := -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
4 4
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e8d94d722ecb..7642d7ca73e5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -64,7 +64,7 @@ struct ocfs2_mask_waiter {
64 unsigned long mw_mask; 64 unsigned long mw_mask;
65 unsigned long mw_goal; 65 unsigned long mw_goal;
66#ifdef CONFIG_OCFS2_FS_STATS 66#ifdef CONFIG_OCFS2_FS_STATS
67 unsigned long long mw_lock_start; 67 ktime_t mw_lock_start;
68#endif 68#endif
69}; 69};
70 70
@@ -397,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
397{ 397{
398 int len; 398 int len;
399 399
400 mlog_entry_void();
401
402 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); 400 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
403 401
404 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", 402 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -408,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
408 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); 406 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
409 407
410 mlog(0, "built lock resource with name: %s\n", name); 408 mlog(0, "built lock resource with name: %s\n", name);
411
412 mlog_exit_void();
413} 409}
414 410
415static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); 411static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -435,44 +431,41 @@ static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
435#ifdef CONFIG_OCFS2_FS_STATS 431#ifdef CONFIG_OCFS2_FS_STATS
436static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) 432static void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
437{ 433{
438 res->l_lock_num_prmode = 0;
439 res->l_lock_num_prmode_failed = 0;
440 res->l_lock_total_prmode = 0;
441 res->l_lock_max_prmode = 0;
442 res->l_lock_num_exmode = 0;
443 res->l_lock_num_exmode_failed = 0;
444 res->l_lock_total_exmode = 0;
445 res->l_lock_max_exmode = 0;
446 res->l_lock_refresh = 0; 434 res->l_lock_refresh = 0;
435 memset(&res->l_lock_prmode, 0, sizeof(struct ocfs2_lock_stats));
436 memset(&res->l_lock_exmode, 0, sizeof(struct ocfs2_lock_stats));
447} 437}
448 438
449static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level, 439static void ocfs2_update_lock_stats(struct ocfs2_lock_res *res, int level,
450 struct ocfs2_mask_waiter *mw, int ret) 440 struct ocfs2_mask_waiter *mw, int ret)
451{ 441{
452 unsigned long long *num, *sum; 442 u32 usec;
453 unsigned int *max, *failed; 443 ktime_t kt;
454 struct timespec ts = current_kernel_time(); 444 struct ocfs2_lock_stats *stats;
455 unsigned long long time = timespec_to_ns(&ts) - mw->mw_lock_start; 445
456 446 if (level == LKM_PRMODE)
457 if (level == LKM_PRMODE) { 447 stats = &res->l_lock_prmode;
458 num = &res->l_lock_num_prmode; 448 else if (level == LKM_EXMODE)
459 sum = &res->l_lock_total_prmode; 449 stats = &res->l_lock_exmode;
460 max = &res->l_lock_max_prmode; 450 else
461 failed = &res->l_lock_num_prmode_failed;
462 } else if (level == LKM_EXMODE) {
463 num = &res->l_lock_num_exmode;
464 sum = &res->l_lock_total_exmode;
465 max = &res->l_lock_max_exmode;
466 failed = &res->l_lock_num_exmode_failed;
467 } else
468 return; 451 return;
469 452
470 (*num)++; 453 kt = ktime_sub(ktime_get(), mw->mw_lock_start);
471 (*sum) += time; 454 usec = ktime_to_us(kt);
472 if (time > *max) 455
473 *max = time; 456 stats->ls_gets++;
457 stats->ls_total += ktime_to_ns(kt);
458 /* overflow */
459 if (unlikely(stats->ls_gets) == 0) {
460 stats->ls_gets++;
461 stats->ls_total = ktime_to_ns(kt);
462 }
463
464 if (stats->ls_max < usec)
465 stats->ls_max = usec;
466
474 if (ret) 467 if (ret)
475 (*failed)++; 468 stats->ls_fail++;
476} 469}
477 470
478static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres) 471static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
@@ -482,8 +475,7 @@ static inline void ocfs2_track_lock_refresh(struct ocfs2_lock_res *lockres)
482 475
483static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw) 476static inline void ocfs2_init_start_time(struct ocfs2_mask_waiter *mw)
484{ 477{
485 struct timespec ts = current_kernel_time(); 478 mw->mw_lock_start = ktime_get();
486 mw->mw_lock_start = timespec_to_ns(&ts);
487} 479}
488#else 480#else
489static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res) 481static inline void ocfs2_init_lock_stats(struct ocfs2_lock_res *res)
@@ -729,8 +721,6 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
729 721
730void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 722void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
731{ 723{
732 mlog_entry_void();
733
734 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) 724 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
735 return; 725 return;
736 726
@@ -756,14 +746,11 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
756 memset(&res->l_lksb, 0, sizeof(res->l_lksb)); 746 memset(&res->l_lksb, 0, sizeof(res->l_lksb));
757 747
758 res->l_flags = 0UL; 748 res->l_flags = 0UL;
759 mlog_exit_void();
760} 749}
761 750
762static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, 751static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
763 int level) 752 int level)
764{ 753{
765 mlog_entry_void();
766
767 BUG_ON(!lockres); 754 BUG_ON(!lockres);
768 755
769 switch(level) { 756 switch(level) {
@@ -776,15 +763,11 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
776 default: 763 default:
777 BUG(); 764 BUG();
778 } 765 }
779
780 mlog_exit_void();
781} 766}
782 767
783static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, 768static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
784 int level) 769 int level)
785{ 770{
786 mlog_entry_void();
787
788 BUG_ON(!lockres); 771 BUG_ON(!lockres);
789 772
790 switch(level) { 773 switch(level) {
@@ -799,7 +782,6 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
799 default: 782 default:
800 BUG(); 783 BUG();
801 } 784 }
802 mlog_exit_void();
803} 785}
804 786
805/* WARNING: This function lives in a world where the only three lock 787/* WARNING: This function lives in a world where the only three lock
@@ -846,8 +828,6 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
846 828
847static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) 829static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
848{ 830{
849 mlog_entry_void();
850
851 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 831 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
852 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 832 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
853 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 833 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
@@ -860,14 +840,10 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
860 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 840 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
861 } 841 }
862 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 842 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
863
864 mlog_exit_void();
865} 843}
866 844
867static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) 845static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
868{ 846{
869 mlog_entry_void();
870
871 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 847 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
872 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 848 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
873 849
@@ -889,14 +865,10 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
889 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); 865 lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
890 866
891 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 867 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
892
893 mlog_exit_void();
894} 868}
895 869
896static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) 870static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
897{ 871{
898 mlog_entry_void();
899
900 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); 872 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
901 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 873 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
902 874
@@ -908,15 +880,12 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
908 lockres->l_level = lockres->l_requested; 880 lockres->l_level = lockres->l_requested;
909 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); 881 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
910 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 882 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
911
912 mlog_exit_void();
913} 883}
914 884
915static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, 885static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
916 int level) 886 int level)
917{ 887{
918 int needs_downconvert = 0; 888 int needs_downconvert = 0;
919 mlog_entry_void();
920 889
921 assert_spin_locked(&lockres->l_lock); 890 assert_spin_locked(&lockres->l_lock);
922 891
@@ -938,8 +907,7 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
938 907
939 if (needs_downconvert) 908 if (needs_downconvert)
940 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 909 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
941 910 mlog(0, "needs_downconvert = %d\n", needs_downconvert);
942 mlog_exit(needs_downconvert);
943 return needs_downconvert; 911 return needs_downconvert;
944} 912}
945 913
@@ -1151,8 +1119,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1151 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb); 1119 struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
1152 unsigned long flags; 1120 unsigned long flags;
1153 1121
1154 mlog_entry_void();
1155
1156 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n", 1122 mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
1157 lockres->l_name, lockres->l_unlock_action); 1123 lockres->l_name, lockres->l_unlock_action);
1158 1124
@@ -1162,7 +1128,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1162 "unlock_action %d\n", error, lockres->l_name, 1128 "unlock_action %d\n", error, lockres->l_name,
1163 lockres->l_unlock_action); 1129 lockres->l_unlock_action);
1164 spin_unlock_irqrestore(&lockres->l_lock, flags); 1130 spin_unlock_irqrestore(&lockres->l_lock, flags);
1165 mlog_exit_void();
1166 return; 1131 return;
1167 } 1132 }
1168 1133
@@ -1186,8 +1151,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
1186 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 1151 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
1187 wake_up(&lockres->l_event); 1152 wake_up(&lockres->l_event);
1188 spin_unlock_irqrestore(&lockres->l_lock, flags); 1153 spin_unlock_irqrestore(&lockres->l_lock, flags);
1189
1190 mlog_exit_void();
1191} 1154}
1192 1155
1193/* 1156/*
@@ -1233,7 +1196,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1233{ 1196{
1234 unsigned long flags; 1197 unsigned long flags;
1235 1198
1236 mlog_entry_void();
1237 spin_lock_irqsave(&lockres->l_lock, flags); 1199 spin_lock_irqsave(&lockres->l_lock, flags);
1238 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 1200 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
1239 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); 1201 lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
@@ -1244,7 +1206,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
1244 spin_unlock_irqrestore(&lockres->l_lock, flags); 1206 spin_unlock_irqrestore(&lockres->l_lock, flags);
1245 1207
1246 wake_up(&lockres->l_event); 1208 wake_up(&lockres->l_event);
1247 mlog_exit_void();
1248} 1209}
1249 1210
1250/* Note: If we detect another process working on the lock (i.e., 1211/* Note: If we detect another process working on the lock (i.e.,
@@ -1260,8 +1221,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1260 unsigned long flags; 1221 unsigned long flags;
1261 unsigned int gen; 1222 unsigned int gen;
1262 1223
1263 mlog_entry_void();
1264
1265 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, 1224 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
1266 dlm_flags); 1225 dlm_flags);
1267 1226
@@ -1293,7 +1252,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
1293 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); 1252 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
1294 1253
1295bail: 1254bail:
1296 mlog_exit(ret);
1297 return ret; 1255 return ret;
1298} 1256}
1299 1257
@@ -1416,8 +1374,6 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
1416 unsigned int gen; 1374 unsigned int gen;
1417 int noqueue_attempted = 0; 1375 int noqueue_attempted = 0;
1418 1376
1419 mlog_entry_void();
1420
1421 ocfs2_init_mask_waiter(&mw); 1377 ocfs2_init_mask_waiter(&mw);
1422 1378
1423 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1379 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -1583,7 +1539,6 @@ out:
1583 caller_ip); 1539 caller_ip);
1584 } 1540 }
1585#endif 1541#endif
1586 mlog_exit(ret);
1587 return ret; 1542 return ret;
1588} 1543}
1589 1544
@@ -1605,7 +1560,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
1605{ 1560{
1606 unsigned long flags; 1561 unsigned long flags;
1607 1562
1608 mlog_entry_void();
1609 spin_lock_irqsave(&lockres->l_lock, flags); 1563 spin_lock_irqsave(&lockres->l_lock, flags);
1610 ocfs2_dec_holders(lockres, level); 1564 ocfs2_dec_holders(lockres, level);
1611 ocfs2_downconvert_on_unlock(osb, lockres); 1565 ocfs2_downconvert_on_unlock(osb, lockres);
@@ -1614,7 +1568,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
1614 if (lockres->l_lockdep_map.key != NULL) 1568 if (lockres->l_lockdep_map.key != NULL)
1615 rwsem_release(&lockres->l_lockdep_map, 1, caller_ip); 1569 rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
1616#endif 1570#endif
1617 mlog_exit_void();
1618} 1571}
1619 1572
1620static int ocfs2_create_new_lock(struct ocfs2_super *osb, 1573static int ocfs2_create_new_lock(struct ocfs2_super *osb,
@@ -1648,8 +1601,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1648 BUG_ON(!inode); 1601 BUG_ON(!inode);
1649 BUG_ON(!ocfs2_inode_is_new(inode)); 1602 BUG_ON(!ocfs2_inode_is_new(inode));
1650 1603
1651 mlog_entry_void();
1652
1653 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 1604 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
1654 1605
1655 /* NOTE: That we don't increment any of the holder counts, nor 1606 /* NOTE: That we don't increment any of the holder counts, nor
@@ -1683,7 +1634,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1683 } 1634 }
1684 1635
1685bail: 1636bail:
1686 mlog_exit(ret);
1687 return ret; 1637 return ret;
1688} 1638}
1689 1639
@@ -1695,16 +1645,12 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1695 1645
1696 BUG_ON(!inode); 1646 BUG_ON(!inode);
1697 1647
1698 mlog_entry_void();
1699
1700 mlog(0, "inode %llu take %s RW lock\n", 1648 mlog(0, "inode %llu take %s RW lock\n",
1701 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1649 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1702 write ? "EXMODE" : "PRMODE"); 1650 write ? "EXMODE" : "PRMODE");
1703 1651
1704 if (ocfs2_mount_local(osb)) { 1652 if (ocfs2_mount_local(osb))
1705 mlog_exit(0);
1706 return 0; 1653 return 0;
1707 }
1708 1654
1709 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1655 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1710 1656
@@ -1715,7 +1661,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1715 if (status < 0) 1661 if (status < 0)
1716 mlog_errno(status); 1662 mlog_errno(status);
1717 1663
1718 mlog_exit(status);
1719 return status; 1664 return status;
1720} 1665}
1721 1666
@@ -1725,16 +1670,12 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
1725 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; 1670 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1726 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1671 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1727 1672
1728 mlog_entry_void();
1729
1730 mlog(0, "inode %llu drop %s RW lock\n", 1673 mlog(0, "inode %llu drop %s RW lock\n",
1731 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1674 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1732 write ? "EXMODE" : "PRMODE"); 1675 write ? "EXMODE" : "PRMODE");
1733 1676
1734 if (!ocfs2_mount_local(osb)) 1677 if (!ocfs2_mount_local(osb))
1735 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); 1678 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1736
1737 mlog_exit_void();
1738} 1679}
1739 1680
1740/* 1681/*
@@ -1748,8 +1689,6 @@ int ocfs2_open_lock(struct inode *inode)
1748 1689
1749 BUG_ON(!inode); 1690 BUG_ON(!inode);
1750 1691
1751 mlog_entry_void();
1752
1753 mlog(0, "inode %llu take PRMODE open lock\n", 1692 mlog(0, "inode %llu take PRMODE open lock\n",
1754 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1693 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1755 1694
@@ -1764,7 +1703,6 @@ int ocfs2_open_lock(struct inode *inode)
1764 mlog_errno(status); 1703 mlog_errno(status);
1765 1704
1766out: 1705out:
1767 mlog_exit(status);
1768 return status; 1706 return status;
1769} 1707}
1770 1708
@@ -1776,8 +1714,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1776 1714
1777 BUG_ON(!inode); 1715 BUG_ON(!inode);
1778 1716
1779 mlog_entry_void();
1780
1781 mlog(0, "inode %llu try to take %s open lock\n", 1717 mlog(0, "inode %llu try to take %s open lock\n",
1782 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1718 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1783 write ? "EXMODE" : "PRMODE"); 1719 write ? "EXMODE" : "PRMODE");
@@ -1799,7 +1735,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1799 level, DLM_LKF_NOQUEUE, 0); 1735 level, DLM_LKF_NOQUEUE, 0);
1800 1736
1801out: 1737out:
1802 mlog_exit(status);
1803 return status; 1738 return status;
1804} 1739}
1805 1740
@@ -1811,8 +1746,6 @@ void ocfs2_open_unlock(struct inode *inode)
1811 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres; 1746 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
1812 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1747 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1813 1748
1814 mlog_entry_void();
1815
1816 mlog(0, "inode %llu drop open lock\n", 1749 mlog(0, "inode %llu drop open lock\n",
1817 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1750 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1818 1751
@@ -1827,7 +1760,7 @@ void ocfs2_open_unlock(struct inode *inode)
1827 DLM_LOCK_EX); 1760 DLM_LOCK_EX);
1828 1761
1829out: 1762out:
1830 mlog_exit_void(); 1763 return;
1831} 1764}
1832 1765
1833static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres, 1766static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
@@ -2043,8 +1976,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
2043{ 1976{
2044 int kick = 0; 1977 int kick = 0;
2045 1978
2046 mlog_entry_void();
2047
2048 /* If we know that another node is waiting on our lock, kick 1979 /* If we know that another node is waiting on our lock, kick
2049 * the downconvert thread * pre-emptively when we reach a release 1980 * the downconvert thread * pre-emptively when we reach a release
2050 * condition. */ 1981 * condition. */
@@ -2065,8 +1996,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
2065 1996
2066 if (kick) 1997 if (kick)
2067 ocfs2_wake_downconvert_thread(osb); 1998 ocfs2_wake_downconvert_thread(osb);
2068
2069 mlog_exit_void();
2070} 1999}
2071 2000
2072#define OCFS2_SEC_BITS 34 2001#define OCFS2_SEC_BITS 34
@@ -2095,8 +2024,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
2095 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2024 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
2096 struct ocfs2_meta_lvb *lvb; 2025 struct ocfs2_meta_lvb *lvb;
2097 2026
2098 mlog_entry_void();
2099
2100 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2027 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2101 2028
2102 /* 2029 /*
@@ -2128,8 +2055,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
2128 2055
2129out: 2056out:
2130 mlog_meta_lvb(0, lockres); 2057 mlog_meta_lvb(0, lockres);
2131
2132 mlog_exit_void();
2133} 2058}
2134 2059
2135static void ocfs2_unpack_timespec(struct timespec *spec, 2060static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -2145,8 +2070,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
2145 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; 2070 struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
2146 struct ocfs2_meta_lvb *lvb; 2071 struct ocfs2_meta_lvb *lvb;
2147 2072
2148 mlog_entry_void();
2149
2150 mlog_meta_lvb(0, lockres); 2073 mlog_meta_lvb(0, lockres);
2151 2074
2152 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 2075 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2177,8 +2100,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
2177 ocfs2_unpack_timespec(&inode->i_ctime, 2100 ocfs2_unpack_timespec(&inode->i_ctime,
2178 be64_to_cpu(lvb->lvb_ictime_packed)); 2101 be64_to_cpu(lvb->lvb_ictime_packed));
2179 spin_unlock(&oi->ip_lock); 2102 spin_unlock(&oi->ip_lock);
2180
2181 mlog_exit_void();
2182} 2103}
2183 2104
2184static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 2105static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2205,8 +2126,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
2205 unsigned long flags; 2126 unsigned long flags;
2206 int status = 0; 2127 int status = 0;
2207 2128
2208 mlog_entry_void();
2209
2210refresh_check: 2129refresh_check:
2211 spin_lock_irqsave(&lockres->l_lock, flags); 2130 spin_lock_irqsave(&lockres->l_lock, flags);
2212 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) { 2131 if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -2227,7 +2146,7 @@ refresh_check:
2227 2146
2228 status = 1; 2147 status = 1;
2229bail: 2148bail:
2230 mlog_exit(status); 2149 mlog(0, "status %d\n", status);
2231 return status; 2150 return status;
2232} 2151}
2233 2152
@@ -2237,7 +2156,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
2237 int status) 2156 int status)
2238{ 2157{
2239 unsigned long flags; 2158 unsigned long flags;
2240 mlog_entry_void();
2241 2159
2242 spin_lock_irqsave(&lockres->l_lock, flags); 2160 spin_lock_irqsave(&lockres->l_lock, flags);
2243 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING); 2161 lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -2246,8 +2164,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
2246 spin_unlock_irqrestore(&lockres->l_lock, flags); 2164 spin_unlock_irqrestore(&lockres->l_lock, flags);
2247 2165
2248 wake_up(&lockres->l_event); 2166 wake_up(&lockres->l_event);
2249
2250 mlog_exit_void();
2251} 2167}
2252 2168
2253/* may or may not return a bh if it went to disk. */ 2169/* may or may not return a bh if it went to disk. */
@@ -2260,8 +2176,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2260 struct ocfs2_dinode *fe; 2176 struct ocfs2_dinode *fe;
2261 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2262 2178
2263 mlog_entry_void();
2264
2265 if (ocfs2_mount_local(osb)) 2179 if (ocfs2_mount_local(osb))
2266 goto bail; 2180 goto bail;
2267 2181
@@ -2330,7 +2244,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
2330bail_refresh: 2244bail_refresh:
2331 ocfs2_complete_lock_res_refresh(lockres, status); 2245 ocfs2_complete_lock_res_refresh(lockres, status);
2332bail: 2246bail:
2333 mlog_exit(status);
2334 return status; 2247 return status;
2335} 2248}
2336 2249
@@ -2374,8 +2287,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
2374 2287
2375 BUG_ON(!inode); 2288 BUG_ON(!inode);
2376 2289
2377 mlog_entry_void();
2378
2379 mlog(0, "inode %llu, take %s META lock\n", 2290 mlog(0, "inode %llu, take %s META lock\n",
2380 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2291 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2381 ex ? "EXMODE" : "PRMODE"); 2292 ex ? "EXMODE" : "PRMODE");
@@ -2467,7 +2378,6 @@ bail:
2467 if (local_bh) 2378 if (local_bh)
2468 brelse(local_bh); 2379 brelse(local_bh);
2469 2380
2470 mlog_exit(status);
2471 return status; 2381 return status;
2472} 2382}
2473 2383
@@ -2517,7 +2427,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2517{ 2427{
2518 int ret; 2428 int ret;
2519 2429
2520 mlog_entry_void();
2521 ret = ocfs2_inode_lock(inode, NULL, 0); 2430 ret = ocfs2_inode_lock(inode, NULL, 0);
2522 if (ret < 0) { 2431 if (ret < 0) {
2523 mlog_errno(ret); 2432 mlog_errno(ret);
@@ -2545,7 +2454,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2545 } else 2454 } else
2546 *level = 0; 2455 *level = 0;
2547 2456
2548 mlog_exit(ret);
2549 return ret; 2457 return ret;
2550} 2458}
2551 2459
@@ -2556,8 +2464,6 @@ void ocfs2_inode_unlock(struct inode *inode,
2556 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; 2464 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
2557 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2465 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2558 2466
2559 mlog_entry_void();
2560
2561 mlog(0, "inode %llu drop %s META lock\n", 2467 mlog(0, "inode %llu drop %s META lock\n",
2562 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2468 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2563 ex ? "EXMODE" : "PRMODE"); 2469 ex ? "EXMODE" : "PRMODE");
@@ -2565,8 +2471,6 @@ void ocfs2_inode_unlock(struct inode *inode,
2565 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) && 2471 if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
2566 !ocfs2_mount_local(osb)) 2472 !ocfs2_mount_local(osb))
2567 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level); 2473 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
2568
2569 mlog_exit_void();
2570} 2474}
2571 2475
2572int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno) 2476int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2617,8 +2521,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2617 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; 2521 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2618 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2522 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2619 2523
2620 mlog_entry_void();
2621
2622 if (ocfs2_is_hard_readonly(osb)) 2524 if (ocfs2_is_hard_readonly(osb))
2623 return -EROFS; 2525 return -EROFS;
2624 2526
@@ -2650,7 +2552,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2650 ocfs2_track_lock_refresh(lockres); 2552 ocfs2_track_lock_refresh(lockres);
2651 } 2553 }
2652bail: 2554bail:
2653 mlog_exit(status);
2654 return status; 2555 return status;
2655} 2556}
2656 2557
@@ -2869,8 +2770,15 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
2869 return iter; 2770 return iter;
2870} 2771}
2871 2772
2872/* So that debugfs.ocfs2 can determine which format is being used */ 2773/*
2873#define OCFS2_DLM_DEBUG_STR_VERSION 2 2774 * Version is used by debugfs.ocfs2 to determine the format being used
2775 *
2776 * New in version 2
2777 * - Lock stats printed
2778 * New in version 3
2779 * - Max time in lock stats is in usecs (instead of nsecs)
2780 */
2781#define OCFS2_DLM_DEBUG_STR_VERSION 3
2874static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) 2782static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2875{ 2783{
2876 int i; 2784 int i;
@@ -2912,18 +2820,18 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2912 seq_printf(m, "0x%x\t", lvb[i]); 2820 seq_printf(m, "0x%x\t", lvb[i]);
2913 2821
2914#ifdef CONFIG_OCFS2_FS_STATS 2822#ifdef CONFIG_OCFS2_FS_STATS
2915# define lock_num_prmode(_l) (_l)->l_lock_num_prmode 2823# define lock_num_prmode(_l) ((_l)->l_lock_prmode.ls_gets)
2916# define lock_num_exmode(_l) (_l)->l_lock_num_exmode 2824# define lock_num_exmode(_l) ((_l)->l_lock_exmode.ls_gets)
2917# define lock_num_prmode_failed(_l) (_l)->l_lock_num_prmode_failed 2825# define lock_num_prmode_failed(_l) ((_l)->l_lock_prmode.ls_fail)
2918# define lock_num_exmode_failed(_l) (_l)->l_lock_num_exmode_failed 2826# define lock_num_exmode_failed(_l) ((_l)->l_lock_exmode.ls_fail)
2919# define lock_total_prmode(_l) (_l)->l_lock_total_prmode 2827# define lock_total_prmode(_l) ((_l)->l_lock_prmode.ls_total)
2920# define lock_total_exmode(_l) (_l)->l_lock_total_exmode 2828# define lock_total_exmode(_l) ((_l)->l_lock_exmode.ls_total)
2921# define lock_max_prmode(_l) (_l)->l_lock_max_prmode 2829# define lock_max_prmode(_l) ((_l)->l_lock_prmode.ls_max)
2922# define lock_max_exmode(_l) (_l)->l_lock_max_exmode 2830# define lock_max_exmode(_l) ((_l)->l_lock_exmode.ls_max)
2923# define lock_refresh(_l) (_l)->l_lock_refresh 2831# define lock_refresh(_l) ((_l)->l_lock_refresh)
2924#else 2832#else
2925# define lock_num_prmode(_l) (0ULL) 2833# define lock_num_prmode(_l) (0)
2926# define lock_num_exmode(_l) (0ULL) 2834# define lock_num_exmode(_l) (0)
2927# define lock_num_prmode_failed(_l) (0) 2835# define lock_num_prmode_failed(_l) (0)
2928# define lock_num_exmode_failed(_l) (0) 2836# define lock_num_exmode_failed(_l) (0)
2929# define lock_total_prmode(_l) (0ULL) 2837# define lock_total_prmode(_l) (0ULL)
@@ -2933,8 +2841,8 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2933# define lock_refresh(_l) (0) 2841# define lock_refresh(_l) (0)
2934#endif 2842#endif
2935 /* The following seq_print was added in version 2 of this output */ 2843 /* The following seq_print was added in version 2 of this output */
2936 seq_printf(m, "%llu\t" 2844 seq_printf(m, "%u\t"
2937 "%llu\t" 2845 "%u\t"
2938 "%u\t" 2846 "%u\t"
2939 "%u\t" 2847 "%u\t"
2940 "%llu\t" 2848 "%llu\t"
@@ -3054,8 +2962,6 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
3054 int status = 0; 2962 int status = 0;
3055 struct ocfs2_cluster_connection *conn = NULL; 2963 struct ocfs2_cluster_connection *conn = NULL;
3056 2964
3057 mlog_entry_void();
3058
3059 if (ocfs2_mount_local(osb)) { 2965 if (ocfs2_mount_local(osb)) {
3060 osb->node_num = 0; 2966 osb->node_num = 0;
3061 goto local; 2967 goto local;
@@ -3112,15 +3018,12 @@ bail:
3112 kthread_stop(osb->dc_task); 3018 kthread_stop(osb->dc_task);
3113 } 3019 }
3114 3020
3115 mlog_exit(status);
3116 return status; 3021 return status;
3117} 3022}
3118 3023
3119void ocfs2_dlm_shutdown(struct ocfs2_super *osb, 3024void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3120 int hangup_pending) 3025 int hangup_pending)
3121{ 3026{
3122 mlog_entry_void();
3123
3124 ocfs2_drop_osb_locks(osb); 3027 ocfs2_drop_osb_locks(osb);
3125 3028
3126 /* 3029 /*
@@ -3143,8 +3046,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
3143 osb->cconn = NULL; 3046 osb->cconn = NULL;
3144 3047
3145 ocfs2_dlm_shutdown_debug(osb); 3048 ocfs2_dlm_shutdown_debug(osb);
3146
3147 mlog_exit_void();
3148} 3049}
3149 3050
3150static int ocfs2_drop_lock(struct ocfs2_super *osb, 3051static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3226,7 +3127,6 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
3226 3127
3227 ocfs2_wait_on_busy_lock(lockres); 3128 ocfs2_wait_on_busy_lock(lockres);
3228out: 3129out:
3229 mlog_exit(0);
3230 return 0; 3130 return 0;
3231} 3131}
3232 3132
@@ -3284,8 +3184,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
3284{ 3184{
3285 int status, err; 3185 int status, err;
3286 3186
3287 mlog_entry_void();
3288
3289 /* No need to call ocfs2_mark_lockres_freeing here - 3187 /* No need to call ocfs2_mark_lockres_freeing here -
3290 * ocfs2_clear_inode has done it for us. */ 3188 * ocfs2_clear_inode has done it for us. */
3291 3189
@@ -3310,7 +3208,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
3310 if (err < 0 && !status) 3208 if (err < 0 && !status)
3311 status = err; 3209 status = err;
3312 3210
3313 mlog_exit(status);
3314 return status; 3211 return status;
3315} 3212}
3316 3213
@@ -3352,8 +3249,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3352 int ret; 3249 int ret;
3353 u32 dlm_flags = DLM_LKF_CONVERT; 3250 u32 dlm_flags = DLM_LKF_CONVERT;
3354 3251
3355 mlog_entry_void();
3356
3357 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, 3252 mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
3358 lockres->l_level, new_level); 3253 lockres->l_level, new_level);
3359 3254
@@ -3375,7 +3270,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
3375 3270
3376 ret = 0; 3271 ret = 0;
3377bail: 3272bail:
3378 mlog_exit(ret);
3379 return ret; 3273 return ret;
3380} 3274}
3381 3275
@@ -3385,8 +3279,6 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
3385{ 3279{
3386 assert_spin_locked(&lockres->l_lock); 3280 assert_spin_locked(&lockres->l_lock);
3387 3281
3388 mlog_entry_void();
3389
3390 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { 3282 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
3391 /* If we're already trying to cancel a lock conversion 3283 /* If we're already trying to cancel a lock conversion
3392 * then just drop the spinlock and allow the caller to 3284 * then just drop the spinlock and allow the caller to
@@ -3416,8 +3308,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3416{ 3308{
3417 int ret; 3309 int ret;
3418 3310
3419 mlog_entry_void();
3420
3421 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, 3311 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
3422 DLM_LKF_CANCEL); 3312 DLM_LKF_CANCEL);
3423 if (ret) { 3313 if (ret) {
@@ -3427,7 +3317,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
3427 3317
3428 mlog(ML_BASTS, "lockres %s\n", lockres->l_name); 3318 mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
3429 3319
3430 mlog_exit(ret);
3431 return ret; 3320 return ret;
3432} 3321}
3433 3322
@@ -3443,8 +3332,6 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
3443 int set_lvb = 0; 3332 int set_lvb = 0;
3444 unsigned int gen; 3333 unsigned int gen;
3445 3334
3446 mlog_entry_void();
3447
3448 spin_lock_irqsave(&lockres->l_lock, flags); 3335 spin_lock_irqsave(&lockres->l_lock, flags);
3449 3336
3450recheck: 3337recheck:
@@ -3619,14 +3506,14 @@ downconvert:
3619 gen); 3506 gen);
3620 3507
3621leave: 3508leave:
3622 mlog_exit(ret); 3509 if (ret)
3510 mlog_errno(ret);
3623 return ret; 3511 return ret;
3624 3512
3625leave_requeue: 3513leave_requeue:
3626 spin_unlock_irqrestore(&lockres->l_lock, flags); 3514 spin_unlock_irqrestore(&lockres->l_lock, flags);
3627 ctl->requeue = 1; 3515 ctl->requeue = 1;
3628 3516
3629 mlog_exit(0);
3630 return 0; 3517 return 0;
3631} 3518}
3632 3519
@@ -3859,8 +3746,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3859 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, 3746 struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
3860 oinfo->dqi_gi.dqi_type); 3747 oinfo->dqi_gi.dqi_type);
3861 3748
3862 mlog_entry_void();
3863
3864 lvb = ocfs2_dlm_lvb(&lockres->l_lksb); 3749 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
3865 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; 3750 lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
3866 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); 3751 lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
@@ -3869,8 +3754,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
3869 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); 3754 lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
3870 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); 3755 lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
3871 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); 3756 lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
3872
3873 mlog_exit_void();
3874} 3757}
3875 3758
3876void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) 3759void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
@@ -3879,10 +3762,8 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3879 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); 3762 struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
3880 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; 3763 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3881 3764
3882 mlog_entry_void();
3883 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) 3765 if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
3884 ocfs2_cluster_unlock(osb, lockres, level); 3766 ocfs2_cluster_unlock(osb, lockres, level);
3885 mlog_exit_void();
3886} 3767}
3887 3768
3888static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) 3769static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
@@ -3937,8 +3818,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3937 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; 3818 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
3938 int status = 0; 3819 int status = 0;
3939 3820
3940 mlog_entry_void();
3941
3942 /* On RO devices, locking really isn't needed... */ 3821 /* On RO devices, locking really isn't needed... */
3943 if (ocfs2_is_hard_readonly(osb)) { 3822 if (ocfs2_is_hard_readonly(osb)) {
3944 if (ex) 3823 if (ex)
@@ -3961,7 +3840,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
3961 ocfs2_qinfo_unlock(oinfo, ex); 3840 ocfs2_qinfo_unlock(oinfo, ex);
3962 ocfs2_complete_lock_res_refresh(lockres, status); 3841 ocfs2_complete_lock_res_refresh(lockres, status);
3963bail: 3842bail:
3964 mlog_exit(status);
3965 return status; 3843 return status;
3966} 3844}
3967 3845
@@ -4007,8 +3885,6 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
4007 * considered valid until we remove the OCFS2_LOCK_QUEUED 3885 * considered valid until we remove the OCFS2_LOCK_QUEUED
4008 * flag. */ 3886 * flag. */
4009 3887
4010 mlog_entry_void();
4011
4012 BUG_ON(!lockres); 3888 BUG_ON(!lockres);
4013 BUG_ON(!lockres->l_ops); 3889 BUG_ON(!lockres->l_ops);
4014 3890
@@ -4042,15 +3918,11 @@ unqueue:
4042 if (ctl.unblock_action != UNBLOCK_CONTINUE 3918 if (ctl.unblock_action != UNBLOCK_CONTINUE
4043 && lockres->l_ops->post_unlock) 3919 && lockres->l_ops->post_unlock)
4044 lockres->l_ops->post_unlock(osb, lockres); 3920 lockres->l_ops->post_unlock(osb, lockres);
4045
4046 mlog_exit_void();
4047} 3921}
4048 3922
4049static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, 3923static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4050 struct ocfs2_lock_res *lockres) 3924 struct ocfs2_lock_res *lockres)
4051{ 3925{
4052 mlog_entry_void();
4053
4054 assert_spin_locked(&lockres->l_lock); 3926 assert_spin_locked(&lockres->l_lock);
4055 3927
4056 if (lockres->l_flags & OCFS2_LOCK_FREEING) { 3928 if (lockres->l_flags & OCFS2_LOCK_FREEING) {
@@ -4071,8 +3943,6 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
4071 osb->blocked_lock_count++; 3943 osb->blocked_lock_count++;
4072 } 3944 }
4073 spin_unlock(&osb->dc_task_lock); 3945 spin_unlock(&osb->dc_task_lock);
4074
4075 mlog_exit_void();
4076} 3946}
4077 3947
4078static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) 3948static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
@@ -4080,8 +3950,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
4080 unsigned long processed; 3950 unsigned long processed;
4081 struct ocfs2_lock_res *lockres; 3951 struct ocfs2_lock_res *lockres;
4082 3952
4083 mlog_entry_void();
4084
4085 spin_lock(&osb->dc_task_lock); 3953 spin_lock(&osb->dc_task_lock);
4086 /* grab this early so we know to try again if a state change and 3954 /* grab this early so we know to try again if a state change and
4087 * wake happens part-way through our work */ 3955 * wake happens part-way through our work */
@@ -4105,8 +3973,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
4105 spin_lock(&osb->dc_task_lock); 3973 spin_lock(&osb->dc_task_lock);
4106 } 3974 }
4107 spin_unlock(&osb->dc_task_lock); 3975 spin_unlock(&osb->dc_task_lock);
4108
4109 mlog_exit_void();
4110} 3976}
4111 3977
4112static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb) 3978static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4fd..745db42528d5 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/types.h> 27#include <linux/types.h>
28 28
29#define MLOG_MASK_PREFIX ML_EXPORT
30#include <cluster/masklog.h> 29#include <cluster/masklog.h>
31 30
32#include "ocfs2.h" 31#include "ocfs2.h"
@@ -40,6 +39,7 @@
40 39
41#include "buffer_head_io.h" 40#include "buffer_head_io.h"
42#include "suballoc.h" 41#include "suballoc.h"
42#include "ocfs2_trace.h"
43 43
44struct ocfs2_inode_handle 44struct ocfs2_inode_handle
45{ 45{
@@ -56,10 +56,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
56 int status, set; 56 int status, set;
57 struct dentry *result; 57 struct dentry *result;
58 58
59 mlog_entry("(0x%p, 0x%p)\n", sb, handle); 59 trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
60 60
61 if (blkno == 0) { 61 if (blkno == 0) {
62 mlog(0, "nfs wants inode with blkno: 0\n");
63 result = ERR_PTR(-ESTALE); 62 result = ERR_PTR(-ESTALE);
64 goto bail; 63 goto bail;
65 } 64 }
@@ -83,6 +82,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
83 } 82 }
84 83
85 status = ocfs2_test_inode_bit(osb, blkno, &set); 84 status = ocfs2_test_inode_bit(osb, blkno, &set);
85 trace_ocfs2_get_dentry_test_bit(status, set);
86 if (status < 0) { 86 if (status < 0) {
87 if (status == -EINVAL) { 87 if (status == -EINVAL) {
88 /* 88 /*
@@ -90,18 +90,14 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
90 * as an inode, we return -ESTALE to be 90 * as an inode, we return -ESTALE to be
91 * nice 91 * nice
92 */ 92 */
93 mlog(0, "test inode bit failed %d\n", status);
94 status = -ESTALE; 93 status = -ESTALE;
95 } else { 94 } else
96 mlog(ML_ERROR, "test inode bit failed %d\n", status); 95 mlog(ML_ERROR, "test inode bit failed %d\n", status);
97 }
98 goto unlock_nfs_sync; 96 goto unlock_nfs_sync;
99 } 97 }
100 98
101 /* If the inode allocator bit is clear, this inode must be stale */ 99 /* If the inode allocator bit is clear, this inode must be stale */
102 if (!set) { 100 if (!set) {
103 mlog(0, "inode %llu suballoc bit is clear\n",
104 (unsigned long long)blkno);
105 status = -ESTALE; 101 status = -ESTALE;
106 goto unlock_nfs_sync; 102 goto unlock_nfs_sync;
107 } 103 }
@@ -114,8 +110,8 @@ unlock_nfs_sync:
114check_err: 110check_err:
115 if (status < 0) { 111 if (status < 0) {
116 if (status == -ESTALE) { 112 if (status == -ESTALE) {
117 mlog(0, "stale inode ino: %llu generation: %u\n", 113 trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
118 (unsigned long long)blkno, handle->ih_generation); 114 handle->ih_generation);
119 } 115 }
120 result = ERR_PTR(status); 116 result = ERR_PTR(status);
121 goto bail; 117 goto bail;
@@ -130,8 +126,9 @@ check_err:
130check_gen: 126check_gen:
131 if (handle->ih_generation != inode->i_generation) { 127 if (handle->ih_generation != inode->i_generation) {
132 iput(inode); 128 iput(inode);
133 mlog(0, "stale inode ino: %llu generation: %u\n", 129 trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
134 (unsigned long long)blkno, handle->ih_generation); 130 handle->ih_generation,
131 inode->i_generation);
135 result = ERR_PTR(-ESTALE); 132 result = ERR_PTR(-ESTALE);
136 goto bail; 133 goto bail;
137 } 134 }
@@ -141,7 +138,7 @@ check_gen:
141 mlog_errno(PTR_ERR(result)); 138 mlog_errno(PTR_ERR(result));
142 139
143bail: 140bail:
144 mlog_exit_ptr(result); 141 trace_ocfs2_get_dentry_end(result);
145 return result; 142 return result;
146} 143}
147 144
@@ -152,11 +149,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
152 struct dentry *parent; 149 struct dentry *parent;
153 struct inode *dir = child->d_inode; 150 struct inode *dir = child->d_inode;
154 151
155 mlog_entry("(0x%p, '%.*s')\n", child, 152 trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
156 child->d_name.len, child->d_name.name); 153 (unsigned long long)OCFS2_I(dir)->ip_blkno);
157
158 mlog(0, "find parent of directory %llu\n",
159 (unsigned long long)OCFS2_I(dir)->ip_blkno);
160 154
161 status = ocfs2_inode_lock(dir, NULL, 0); 155 status = ocfs2_inode_lock(dir, NULL, 0);
162 if (status < 0) { 156 if (status < 0) {
@@ -178,7 +172,7 @@ bail_unlock:
178 ocfs2_inode_unlock(dir, 0); 172 ocfs2_inode_unlock(dir, 0);
179 173
180bail: 174bail:
181 mlog_exit_ptr(parent); 175 trace_ocfs2_get_parent_end(parent);
182 176
183 return parent; 177 return parent;
184} 178}
@@ -193,12 +187,16 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
193 u32 generation; 187 u32 generation;
194 __le32 *fh = (__force __le32 *) fh_in; 188 __le32 *fh = (__force __le32 *) fh_in;
195 189
196 mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry, 190 trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
197 dentry->d_name.len, dentry->d_name.name, 191 dentry->d_name.name,
198 fh, len, connectable); 192 fh, len, connectable);
199 193
200 if (len < 3 || (connectable && len < 6)) { 194 if (connectable && (len < 6)) {
201 mlog(ML_ERROR, "fh buffer is too small for encoding\n"); 195 *max_len = 6;
196 type = 255;
197 goto bail;
198 } else if (len < 3) {
199 *max_len = 3;
202 type = 255; 200 type = 255;
203 goto bail; 201 goto bail;
204 } 202 }
@@ -206,8 +204,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
206 blkno = OCFS2_I(inode)->ip_blkno; 204 blkno = OCFS2_I(inode)->ip_blkno;
207 generation = inode->i_generation; 205 generation = inode->i_generation;
208 206
209 mlog(0, "Encoding fh: blkno: %llu, generation: %u\n", 207 trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
210 (unsigned long long)blkno, generation);
211 208
212 len = 3; 209 len = 3;
213 fh[0] = cpu_to_le32((u32)(blkno >> 32)); 210 fh[0] = cpu_to_le32((u32)(blkno >> 32));
@@ -232,14 +229,14 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
232 len = 6; 229 len = 6;
233 type = 2; 230 type = 2;
234 231
235 mlog(0, "Encoding parent: blkno: %llu, generation: %u\n", 232 trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
236 (unsigned long long)blkno, generation); 233 generation);
237 } 234 }
238 235
239 *max_len = len; 236 *max_len = len;
240 237
241bail: 238bail:
242 mlog_exit(type); 239 trace_ocfs2_encode_fh_type(type);
243 return type; 240 return type;
244} 241}
245 242
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 09e3fdfa6d33..23457b491e8c 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/fiemap.h> 29#include <linux/fiemap.h>
30 30
31#define MLOG_MASK_PREFIX ML_EXTENT_MAP
32#include <cluster/masklog.h> 31#include <cluster/masklog.h>
33 32
34#include "ocfs2.h" 33#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "inode.h" 38#include "inode.h"
40#include "super.h" 39#include "super.h"
41#include "symlink.h" 40#include "symlink.h"
41#include "ocfs2_trace.h"
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
@@ -841,10 +841,9 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
841 u64 p_block, p_count; 841 u64 p_block, p_count;
842 int i, count, done = 0; 842 int i, count, done = 0;
843 843
844 mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " 844 trace_ocfs2_read_virt_blocks(
845 "flags = %x, validate = %p)\n", 845 inode, (unsigned long long)v_block, nr, bhs, flags,
846 inode, (unsigned long long)v_block, nr, bhs, flags, 846 validate);
847 validate);
848 847
849 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= 848 if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
850 i_size_read(inode)) { 849 i_size_read(inode)) {
@@ -897,7 +896,6 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
897 } 896 }
898 897
899out: 898out:
900 mlog_exit(rc);
901 return rc; 899 return rc;
902} 900}
903 901
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a6651956482e..41565ae52856 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -38,7 +38,6 @@
38#include <linux/quotaops.h> 38#include <linux/quotaops.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
40 40
41#define MLOG_MASK_PREFIX ML_INODE
42#include <cluster/masklog.h> 41#include <cluster/masklog.h>
43 42
44#include "ocfs2.h" 43#include "ocfs2.h"
@@ -61,6 +60,7 @@
61#include "acl.h" 60#include "acl.h"
62#include "quota.h" 61#include "quota.h"
63#include "refcounttree.h" 62#include "refcounttree.h"
63#include "ocfs2_trace.h"
64 64
65#include "buffer_head_io.h" 65#include "buffer_head_io.h"
66 66
@@ -99,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
99 int mode = file->f_flags; 99 int mode = file->f_flags;
100 struct ocfs2_inode_info *oi = OCFS2_I(inode); 100 struct ocfs2_inode_info *oi = OCFS2_I(inode);
101 101
102 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, 102 trace_ocfs2_file_open(inode, file, file->f_path.dentry,
103 file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); 103 (unsigned long long)OCFS2_I(inode)->ip_blkno,
104 file->f_path.dentry->d_name.len,
105 file->f_path.dentry->d_name.name, mode);
104 106
105 if (file->f_mode & FMODE_WRITE) 107 if (file->f_mode & FMODE_WRITE)
106 dquot_initialize(inode); 108 dquot_initialize(inode);
@@ -135,7 +137,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
135 } 137 }
136 138
137leave: 139leave:
138 mlog_exit(status);
139 return status; 140 return status;
140} 141}
141 142
@@ -143,19 +144,19 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
143{ 144{
144 struct ocfs2_inode_info *oi = OCFS2_I(inode); 145 struct ocfs2_inode_info *oi = OCFS2_I(inode);
145 146
146 mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
147 file->f_path.dentry->d_name.len,
148 file->f_path.dentry->d_name.name);
149
150 spin_lock(&oi->ip_lock); 147 spin_lock(&oi->ip_lock);
151 if (!--oi->ip_open_count) 148 if (!--oi->ip_open_count)
152 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; 149 oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
150
151 trace_ocfs2_file_release(inode, file, file->f_path.dentry,
152 oi->ip_blkno,
153 file->f_path.dentry->d_name.len,
154 file->f_path.dentry->d_name.name,
155 oi->ip_open_count);
153 spin_unlock(&oi->ip_lock); 156 spin_unlock(&oi->ip_lock);
154 157
155 ocfs2_free_file_private(inode, file); 158 ocfs2_free_file_private(inode, file);
156 159
157 mlog_exit(0);
158
159 return 0; 160 return 0;
160} 161}
161 162
@@ -177,9 +178,11 @@ static int ocfs2_sync_file(struct file *file, int datasync)
177 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
179 180
180 mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync, 181 trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
181 file->f_path.dentry, file->f_path.dentry->d_name.len, 182 OCFS2_I(inode)->ip_blkno,
182 file->f_path.dentry->d_name.name); 183 file->f_path.dentry->d_name.len,
184 file->f_path.dentry->d_name.name,
185 (unsigned long long)datasync);
183 186
184 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
185 /* 188 /*
@@ -195,7 +198,8 @@ static int ocfs2_sync_file(struct file *file, int datasync)
195 err = jbd2_journal_force_commit(journal); 198 err = jbd2_journal_force_commit(journal);
196 199
197bail: 200bail:
198 mlog_exit(err); 201 if (err)
202 mlog_errno(err);
199 203
200 return (err < 0) ? -EIO : 0; 204 return (err < 0) ? -EIO : 0;
201} 205}
@@ -251,8 +255,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
251 handle_t *handle; 255 handle_t *handle;
252 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; 256 struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
253 257
254 mlog_entry_void();
255
256 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 258 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
257 if (IS_ERR(handle)) { 259 if (IS_ERR(handle)) {
258 ret = PTR_ERR(handle); 260 ret = PTR_ERR(handle);
@@ -280,7 +282,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
280out_commit: 282out_commit:
281 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 283 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
282out: 284out:
283 mlog_exit(ret);
284 return ret; 285 return ret;
285} 286}
286 287
@@ -291,7 +292,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
291{ 292{
292 int status; 293 int status;
293 294
294 mlog_entry_void();
295 i_size_write(inode, new_i_size); 295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_inode_sector_count(inode); 296 inode->i_blocks = ocfs2_inode_sector_count(inode);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -303,7 +303,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
303 } 303 }
304 304
305bail: 305bail:
306 mlog_exit(status);
307 return status; 306 return status;
308} 307}
309 308
@@ -375,8 +374,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
375 struct ocfs2_dinode *di; 374 struct ocfs2_dinode *di;
376 u64 cluster_bytes; 375 u64 cluster_bytes;
377 376
378 mlog_entry_void();
379
380 /* 377 /*
381 * We need to CoW the cluster contains the offset if it is reflinked 378 * We need to CoW the cluster contains the offset if it is reflinked
382 * since we will call ocfs2_zero_range_for_truncate later which will 379 * since we will call ocfs2_zero_range_for_truncate later which will
@@ -429,8 +426,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
429out_commit: 426out_commit:
430 ocfs2_commit_trans(osb, handle); 427 ocfs2_commit_trans(osb, handle);
431out: 428out:
432
433 mlog_exit(status);
434 return status; 429 return status;
435} 430}
436 431
@@ -442,14 +437,14 @@ static int ocfs2_truncate_file(struct inode *inode,
442 struct ocfs2_dinode *fe = NULL; 437 struct ocfs2_dinode *fe = NULL;
443 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 438 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
444 439
445 mlog_entry("(inode = %llu, new_i_size = %llu\n",
446 (unsigned long long)OCFS2_I(inode)->ip_blkno,
447 (unsigned long long)new_i_size);
448
449 /* We trust di_bh because it comes from ocfs2_inode_lock(), which 440 /* We trust di_bh because it comes from ocfs2_inode_lock(), which
450 * already validated it */ 441 * already validated it */
451 fe = (struct ocfs2_dinode *) di_bh->b_data; 442 fe = (struct ocfs2_dinode *) di_bh->b_data;
452 443
444 trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
445 (unsigned long long)le64_to_cpu(fe->i_size),
446 (unsigned long long)new_i_size);
447
453 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), 448 mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
454 "Inode %llu, inode i_size = %lld != di " 449 "Inode %llu, inode i_size = %lld != di "
455 "i_size = %llu, i_flags = 0x%x\n", 450 "i_size = %llu, i_flags = 0x%x\n",
@@ -459,19 +454,14 @@ static int ocfs2_truncate_file(struct inode *inode,
459 le32_to_cpu(fe->i_flags)); 454 le32_to_cpu(fe->i_flags));
460 455
461 if (new_i_size > le64_to_cpu(fe->i_size)) { 456 if (new_i_size > le64_to_cpu(fe->i_size)) {
462 mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", 457 trace_ocfs2_truncate_file_error(
463 (unsigned long long)le64_to_cpu(fe->i_size), 458 (unsigned long long)le64_to_cpu(fe->i_size),
464 (unsigned long long)new_i_size); 459 (unsigned long long)new_i_size);
465 status = -EINVAL; 460 status = -EINVAL;
466 mlog_errno(status); 461 mlog_errno(status);
467 goto bail; 462 goto bail;
468 } 463 }
469 464
470 mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
471 (unsigned long long)le64_to_cpu(fe->i_blkno),
472 (unsigned long long)le64_to_cpu(fe->i_size),
473 (unsigned long long)new_i_size);
474
475 /* lets handle the simple truncate cases before doing any more 465 /* lets handle the simple truncate cases before doing any more
476 * cluster locking. */ 466 * cluster locking. */
477 if (new_i_size == le64_to_cpu(fe->i_size)) 467 if (new_i_size == le64_to_cpu(fe->i_size))
@@ -525,7 +515,6 @@ bail:
525 if (!status && OCFS2_I(inode)->ip_clusters == 0) 515 if (!status && OCFS2_I(inode)->ip_clusters == 0)
526 status = ocfs2_try_remove_refcount_tree(inode, di_bh); 516 status = ocfs2_try_remove_refcount_tree(inode, di_bh);
527 517
528 mlog_exit(status);
529 return status; 518 return status;
530} 519}
531 520
@@ -578,8 +567,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
578 struct ocfs2_extent_tree et; 567 struct ocfs2_extent_tree et;
579 int did_quota = 0; 568 int did_quota = 0;
580 569
581 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
582
583 /* 570 /*
584 * This function only exists for file systems which don't 571 * This function only exists for file systems which don't
585 * support holes. 572 * support holes.
@@ -596,11 +583,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
596restart_all: 583restart_all:
597 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 584 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
598 585
599 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
600 "clusters_to_add = %u\n",
601 (unsigned long long)OCFS2_I(inode)->ip_blkno,
602 (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
603 clusters_to_add);
604 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh); 586 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
605 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, 587 status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
606 &data_ac, &meta_ac); 588 &data_ac, &meta_ac);
@@ -620,6 +602,12 @@ restart_all:
620 } 602 }
621 603
622restarted_transaction: 604restarted_transaction:
605 trace_ocfs2_extend_allocation(
606 (unsigned long long)OCFS2_I(inode)->ip_blkno,
607 (unsigned long long)i_size_read(inode),
608 le32_to_cpu(fe->i_clusters), clusters_to_add,
609 why, restart_func);
610
623 status = dquot_alloc_space_nodirty(inode, 611 status = dquot_alloc_space_nodirty(inode,
624 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); 612 ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
625 if (status) 613 if (status)
@@ -666,13 +654,11 @@ restarted_transaction:
666 654
667 if (why != RESTART_NONE && clusters_to_add) { 655 if (why != RESTART_NONE && clusters_to_add) {
668 if (why == RESTART_META) { 656 if (why == RESTART_META) {
669 mlog(0, "restarting function.\n");
670 restart_func = 1; 657 restart_func = 1;
671 status = 0; 658 status = 0;
672 } else { 659 } else {
673 BUG_ON(why != RESTART_TRANS); 660 BUG_ON(why != RESTART_TRANS);
674 661
675 mlog(0, "restarting transaction.\n");
676 /* TODO: This can be more intelligent. */ 662 /* TODO: This can be more intelligent. */
677 credits = ocfs2_calc_extend_credits(osb->sb, 663 credits = ocfs2_calc_extend_credits(osb->sb,
678 &fe->id2.i_list, 664 &fe->id2.i_list,
@@ -689,11 +675,11 @@ restarted_transaction:
689 } 675 }
690 } 676 }
691 677
692 mlog(0, "fe: i_clusters = %u, i_size=%llu\n", 678 trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
693 le32_to_cpu(fe->i_clusters), 679 le32_to_cpu(fe->i_clusters),
694 (unsigned long long)le64_to_cpu(fe->i_size)); 680 (unsigned long long)le64_to_cpu(fe->i_size),
695 mlog(0, "inode: ip_clusters=%u, i_size=%lld\n", 681 OCFS2_I(inode)->ip_clusters,
696 OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); 682 (unsigned long long)i_size_read(inode));
697 683
698leave: 684leave:
699 if (status < 0 && did_quota) 685 if (status < 0 && did_quota)
@@ -718,7 +704,6 @@ leave:
718 brelse(bh); 704 brelse(bh);
719 bh = NULL; 705 bh = NULL;
720 706
721 mlog_exit(status);
722 return status; 707 return status;
723} 708}
724 709
@@ -785,10 +770,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
785 if (!zero_to) 770 if (!zero_to)
786 zero_to = PAGE_CACHE_SIZE; 771 zero_to = PAGE_CACHE_SIZE;
787 772
788 mlog(0, 773 trace_ocfs2_write_zero_page(
789 "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", 774 (unsigned long long)OCFS2_I(inode)->ip_blkno,
790 (unsigned long long)abs_from, (unsigned long long)abs_to, 775 (unsigned long long)abs_from,
791 index, zero_from, zero_to); 776 (unsigned long long)abs_to,
777 index, zero_from, zero_to);
792 778
793 /* We know that zero_from is block aligned */ 779 /* We know that zero_from is block aligned */
794 for (block_start = zero_from; block_start < zero_to; 780 for (block_start = zero_from; block_start < zero_to;
@@ -928,9 +914,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
928 u64 next_pos; 914 u64 next_pos;
929 u64 zero_pos = range_start; 915 u64 zero_pos = range_start;
930 916
931 mlog(0, "range_start = %llu, range_end = %llu\n", 917 trace_ocfs2_zero_extend_range(
932 (unsigned long long)range_start, 918 (unsigned long long)OCFS2_I(inode)->ip_blkno,
933 (unsigned long long)range_end); 919 (unsigned long long)range_start,
920 (unsigned long long)range_end);
934 BUG_ON(range_start >= range_end); 921 BUG_ON(range_start >= range_end);
935 922
936 while (zero_pos < range_end) { 923 while (zero_pos < range_end) {
@@ -962,9 +949,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
962 struct super_block *sb = inode->i_sb; 949 struct super_block *sb = inode->i_sb;
963 950
964 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); 951 zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
965 mlog(0, "zero_start %llu for i_size %llu\n", 952 trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
966 (unsigned long long)zero_start, 953 (unsigned long long)zero_start,
967 (unsigned long long)i_size_read(inode)); 954 (unsigned long long)i_size_read(inode));
968 while (zero_start < zero_to_size) { 955 while (zero_start < zero_to_size) {
969 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, 956 ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
970 zero_to_size, 957 zero_to_size,
@@ -1113,30 +1100,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1113 struct dquot *transfer_to[MAXQUOTAS] = { }; 1100 struct dquot *transfer_to[MAXQUOTAS] = { };
1114 int qtype; 1101 int qtype;
1115 1102
1116 mlog_entry("(0x%p, '%.*s')\n", dentry, 1103 trace_ocfs2_setattr(inode, dentry,
1117 dentry->d_name.len, dentry->d_name.name); 1104 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1105 dentry->d_name.len, dentry->d_name.name,
1106 attr->ia_valid, attr->ia_mode,
1107 attr->ia_uid, attr->ia_gid);
1118 1108
1119 /* ensuring we don't even attempt to truncate a symlink */ 1109 /* ensuring we don't even attempt to truncate a symlink */
1120 if (S_ISLNK(inode->i_mode)) 1110 if (S_ISLNK(inode->i_mode))
1121 attr->ia_valid &= ~ATTR_SIZE; 1111 attr->ia_valid &= ~ATTR_SIZE;
1122 1112
1123 if (attr->ia_valid & ATTR_MODE)
1124 mlog(0, "mode change: %d\n", attr->ia_mode);
1125 if (attr->ia_valid & ATTR_UID)
1126 mlog(0, "uid change: %d\n", attr->ia_uid);
1127 if (attr->ia_valid & ATTR_GID)
1128 mlog(0, "gid change: %d\n", attr->ia_gid);
1129 if (attr->ia_valid & ATTR_SIZE)
1130 mlog(0, "size change...\n");
1131 if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
1132 mlog(0, "time change...\n");
1133
1134#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ 1113#define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
1135 | ATTR_GID | ATTR_UID | ATTR_MODE) 1114 | ATTR_GID | ATTR_UID | ATTR_MODE)
1136 if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { 1115 if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
1137 mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
1138 return 0; 1116 return 0;
1139 }
1140 1117
1141 status = inode_change_ok(inode, attr); 1118 status = inode_change_ok(inode, attr);
1142 if (status) 1119 if (status)
@@ -1274,7 +1251,6 @@ bail:
1274 mlog_errno(status); 1251 mlog_errno(status);
1275 } 1252 }
1276 1253
1277 mlog_exit(status);
1278 return status; 1254 return status;
1279} 1255}
1280 1256
@@ -1287,8 +1263,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
1287 struct ocfs2_super *osb = sb->s_fs_info; 1263 struct ocfs2_super *osb = sb->s_fs_info;
1288 int err; 1264 int err;
1289 1265
1290 mlog_entry_void();
1291
1292 err = ocfs2_inode_revalidate(dentry); 1266 err = ocfs2_inode_revalidate(dentry);
1293 if (err) { 1267 if (err) {
1294 if (err != -ENOENT) 1268 if (err != -ENOENT)
@@ -1302,8 +1276,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
1302 stat->blksize = osb->s_clustersize; 1276 stat->blksize = osb->s_clustersize;
1303 1277
1304bail: 1278bail:
1305 mlog_exit(err);
1306
1307 return err; 1279 return err;
1308} 1280}
1309 1281
@@ -1314,8 +1286,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1314 if (flags & IPERM_FLAG_RCU) 1286 if (flags & IPERM_FLAG_RCU)
1315 return -ECHILD; 1287 return -ECHILD;
1316 1288
1317 mlog_entry_void();
1318
1319 ret = ocfs2_inode_lock(inode, NULL, 0); 1289 ret = ocfs2_inode_lock(inode, NULL, 0);
1320 if (ret) { 1290 if (ret) {
1321 if (ret != -ENOENT) 1291 if (ret != -ENOENT)
@@ -1327,7 +1297,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1327 1297
1328 ocfs2_inode_unlock(inode, 0); 1298 ocfs2_inode_unlock(inode, 0);
1329out: 1299out:
1330 mlog_exit(ret);
1331 return ret; 1300 return ret;
1332} 1301}
1333 1302
@@ -1339,8 +1308,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1339 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1308 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1340 struct ocfs2_dinode *di; 1309 struct ocfs2_dinode *di;
1341 1310
1342 mlog_entry("(Inode %llu, mode 0%o)\n", 1311 trace_ocfs2_write_remove_suid(
1343 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); 1312 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1313 inode->i_mode);
1344 1314
1345 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1315 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
1346 if (IS_ERR(handle)) { 1316 if (IS_ERR(handle)) {
@@ -1368,7 +1338,6 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
1368out_trans: 1338out_trans:
1369 ocfs2_commit_trans(osb, handle); 1339 ocfs2_commit_trans(osb, handle);
1370out: 1340out:
1371 mlog_exit(ret);
1372 return ret; 1341 return ret;
1373} 1342}
1374 1343
@@ -1547,8 +1516,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1547 * partial clusters here. There's no need to worry about 1516 * partial clusters here. There's no need to worry about
1548 * physical allocation - the zeroing code knows to skip holes. 1517 * physical allocation - the zeroing code knows to skip holes.
1549 */ 1518 */
1550 mlog(0, "byte start: %llu, end: %llu\n", 1519 trace_ocfs2_zero_partial_clusters(
1551 (unsigned long long)start, (unsigned long long)end); 1520 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1521 (unsigned long long)start, (unsigned long long)end);
1552 1522
1553 /* 1523 /*
1554 * If both edges are on a cluster boundary then there's no 1524 * If both edges are on a cluster boundary then there's no
@@ -1572,8 +1542,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1572 if (tmpend > end) 1542 if (tmpend > end)
1573 tmpend = end; 1543 tmpend = end;
1574 1544
1575 mlog(0, "1st range: start: %llu, tmpend: %llu\n", 1545 trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
1576 (unsigned long long)start, (unsigned long long)tmpend); 1546 (unsigned long long)tmpend);
1577 1547
1578 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); 1548 ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
1579 if (ret) 1549 if (ret)
@@ -1587,8 +1557,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
1587 */ 1557 */
1588 start = end & ~(osb->s_clustersize - 1); 1558 start = end & ~(osb->s_clustersize - 1);
1589 1559
1590 mlog(0, "2nd range: start: %llu, end: %llu\n", 1560 trace_ocfs2_zero_partial_clusters_range2(
1591 (unsigned long long)start, (unsigned long long)end); 1561 (unsigned long long)start, (unsigned long long)end);
1592 1562
1593 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); 1563 ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
1594 if (ret) 1564 if (ret)
@@ -1688,6 +1658,11 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1688 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); 1658 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
1689 ocfs2_init_dealloc_ctxt(&dealloc); 1659 ocfs2_init_dealloc_ctxt(&dealloc);
1690 1660
1661 trace_ocfs2_remove_inode_range(
1662 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1663 (unsigned long long)byte_start,
1664 (unsigned long long)byte_len);
1665
1691 if (byte_len == 0) 1666 if (byte_len == 0)
1692 return 0; 1667 return 0;
1693 1668
@@ -1734,11 +1709,6 @@ static int ocfs2_remove_inode_range(struct inode *inode,
1734 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits; 1709 trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
1735 cluster_in_el = trunc_end; 1710 cluster_in_el = trunc_end;
1736 1711
1737 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
1738 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1739 (unsigned long long)byte_start,
1740 (unsigned long long)byte_len, trunc_start, trunc_end);
1741
1742 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); 1712 ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
1743 if (ret) { 1713 if (ret) {
1744 mlog_errno(ret); 1714 mlog_errno(ret);
@@ -2093,7 +2063,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2093 int ret = 0, meta_level = 0; 2063 int ret = 0, meta_level = 0;
2094 struct dentry *dentry = file->f_path.dentry; 2064 struct dentry *dentry = file->f_path.dentry;
2095 struct inode *inode = dentry->d_inode; 2065 struct inode *inode = dentry->d_inode;
2096 loff_t saved_pos, end; 2066 loff_t saved_pos = 0, end;
2097 2067
2098 /* 2068 /*
2099 * We start with a read level meta lock and only jump to an ex 2069 * We start with a read level meta lock and only jump to an ex
@@ -2132,12 +2102,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2132 2102
2133 /* work on a copy of ppos until we're sure that we won't have 2103 /* work on a copy of ppos until we're sure that we won't have
2134 * to recalculate it due to relocking. */ 2104 * to recalculate it due to relocking. */
2135 if (appending) { 2105 if (appending)
2136 saved_pos = i_size_read(inode); 2106 saved_pos = i_size_read(inode);
2137 mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); 2107 else
2138 } else {
2139 saved_pos = *ppos; 2108 saved_pos = *ppos;
2140 }
2141 2109
2142 end = saved_pos + count; 2110 end = saved_pos + count;
2143 2111
@@ -2208,6 +2176,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2208 *ppos = saved_pos; 2176 *ppos = saved_pos;
2209 2177
2210out_unlock: 2178out_unlock:
2179 trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
2180 saved_pos, appending, count,
2181 direct_io, has_refcount);
2182
2211 if (meta_level >= 0) 2183 if (meta_level >= 0)
2212 ocfs2_inode_unlock(inode, meta_level); 2184 ocfs2_inode_unlock(inode, meta_level);
2213 2185
@@ -2233,10 +2205,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2233 int full_coherency = !(osb->s_mount_opt & 2205 int full_coherency = !(osb->s_mount_opt &
2234 OCFS2_MOUNT_COHERENCY_BUFFERED); 2206 OCFS2_MOUNT_COHERENCY_BUFFERED);
2235 2207
2236 mlog_entry("(0x%p, %u, '%.*s')\n", file, 2208 trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
2237 (unsigned int)nr_segs, 2209 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2238 file->f_path.dentry->d_name.len, 2210 file->f_path.dentry->d_name.len,
2239 file->f_path.dentry->d_name.name); 2211 file->f_path.dentry->d_name.name,
2212 (unsigned int)nr_segs);
2240 2213
2241 if (iocb->ki_left == 0) 2214 if (iocb->ki_left == 0)
2242 return 0; 2215 return 0;
@@ -2402,7 +2375,6 @@ out_sems:
2402 2375
2403 if (written) 2376 if (written)
2404 ret = written; 2377 ret = written;
2405 mlog_exit(ret);
2406 return ret; 2378 return ret;
2407} 2379}
2408 2380
@@ -2438,10 +2410,11 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2438 .u.file = out, 2410 .u.file = out,
2439 }; 2411 };
2440 2412
2441 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, 2413
2442 (unsigned int)len, 2414 trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
2443 out->f_path.dentry->d_name.len, 2415 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2444 out->f_path.dentry->d_name.name); 2416 out->f_path.dentry->d_name.len,
2417 out->f_path.dentry->d_name.name, len);
2445 2418
2446 if (pipe->inode) 2419 if (pipe->inode)
2447 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT); 2420 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2485,7 +2458,6 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
2485 balance_dirty_pages_ratelimited_nr(mapping, nr_pages); 2458 balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
2486 } 2459 }
2487 2460
2488 mlog_exit(ret);
2489 return ret; 2461 return ret;
2490} 2462}
2491 2463
@@ -2498,10 +2470,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2498 int ret = 0, lock_level = 0; 2470 int ret = 0, lock_level = 0;
2499 struct inode *inode = in->f_path.dentry->d_inode; 2471 struct inode *inode = in->f_path.dentry->d_inode;
2500 2472
2501 mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, 2473 trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
2502 (unsigned int)len, 2474 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2503 in->f_path.dentry->d_name.len, 2475 in->f_path.dentry->d_name.len,
2504 in->f_path.dentry->d_name.name); 2476 in->f_path.dentry->d_name.name, len);
2505 2477
2506 /* 2478 /*
2507 * See the comment in ocfs2_file_aio_read() 2479 * See the comment in ocfs2_file_aio_read()
@@ -2516,7 +2488,6 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
2516 ret = generic_file_splice_read(in, ppos, pipe, len, flags); 2488 ret = generic_file_splice_read(in, ppos, pipe, len, flags);
2517 2489
2518bail: 2490bail:
2519 mlog_exit(ret);
2520 return ret; 2491 return ret;
2521} 2492}
2522 2493
@@ -2529,10 +2500,11 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2529 struct file *filp = iocb->ki_filp; 2500 struct file *filp = iocb->ki_filp;
2530 struct inode *inode = filp->f_path.dentry->d_inode; 2501 struct inode *inode = filp->f_path.dentry->d_inode;
2531 2502
2532 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 2503 trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
2533 (unsigned int)nr_segs, 2504 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2534 filp->f_path.dentry->d_name.len, 2505 filp->f_path.dentry->d_name.len,
2535 filp->f_path.dentry->d_name.name); 2506 filp->f_path.dentry->d_name.name, nr_segs);
2507
2536 2508
2537 if (!inode) { 2509 if (!inode) {
2538 ret = -EINVAL; 2510 ret = -EINVAL;
@@ -2578,8 +2550,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2578 ocfs2_inode_unlock(inode, lock_level); 2550 ocfs2_inode_unlock(inode, lock_level);
2579 2551
2580 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); 2552 ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
2581 if (ret == -EINVAL) 2553 trace_generic_file_aio_read_ret(ret);
2582 mlog(0, "generic_file_aio_read returned -EINVAL\n");
2583 2554
2584 /* buffered aio wouldn't have proper lock coverage today */ 2555 /* buffered aio wouldn't have proper lock coverage today */
2585 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 2556 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
@@ -2597,7 +2568,6 @@ bail:
2597 } 2568 }
2598 if (rw_level != -1) 2569 if (rw_level != -1)
2599 ocfs2_rw_unlock(inode, rw_level); 2570 ocfs2_rw_unlock(inode, rw_level);
2600 mlog_exit(ret);
2601 2571
2602 return ret; 2572 return ret;
2603} 2573}
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 1aa863dd901f..d8208b20dc53 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,7 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30 30
31#define MLOG_MASK_PREFIX ML_SUPER
32#include <cluster/masklog.h> 31#include <cluster/masklog.h>
33 32
34#include "ocfs2.h" 33#include "ocfs2.h"
@@ -37,6 +36,7 @@
37#include "heartbeat.h" 36#include "heartbeat.h"
38#include "inode.h" 37#include "inode.h"
39#include "journal.h" 38#include "journal.h"
39#include "ocfs2_trace.h"
40 40
41#include "buffer_head_io.h" 41#include "buffer_head_io.h"
42 42
@@ -66,7 +66,7 @@ void ocfs2_do_node_down(int node_num, void *data)
66 66
67 BUG_ON(osb->node_num == node_num); 67 BUG_ON(osb->node_num == node_num);
68 68
69 mlog(0, "ocfs2: node down event for %d\n", node_num); 69 trace_ocfs2_do_node_down(node_num);
70 70
71 if (!osb->cconn) { 71 if (!osb->cconn) {
72 /* 72 /*
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4068c6c4c6f6..b4c8bb6b8d28 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -31,7 +31,6 @@
31 31
32#include <asm/byteorder.h> 32#include <asm/byteorder.h>
33 33
34#define MLOG_MASK_PREFIX ML_INODE
35#include <cluster/masklog.h> 34#include <cluster/masklog.h>
36 35
37#include "ocfs2.h" 36#include "ocfs2.h"
@@ -53,6 +52,7 @@
53#include "uptodate.h" 52#include "uptodate.h"
54#include "xattr.h" 53#include "xattr.h"
55#include "refcounttree.h" 54#include "refcounttree.h"
55#include "ocfs2_trace.h"
56 56
57#include "buffer_head_io.h" 57#include "buffer_head_io.h"
58 58
@@ -131,7 +131,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
131 struct super_block *sb = osb->sb; 131 struct super_block *sb = osb->sb;
132 struct ocfs2_find_inode_args args; 132 struct ocfs2_find_inode_args args;
133 133
134 mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno); 134 trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
135 sysfile_type);
135 136
136 /* Ok. By now we've either got the offsets passed to us by the 137 /* Ok. By now we've either got the offsets passed to us by the
137 * caller, or we just pulled them off the bh. Lets do some 138 * caller, or we just pulled them off the bh. Lets do some
@@ -152,16 +153,16 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
152 /* inode was *not* in the inode cache. 2.6.x requires 153 /* inode was *not* in the inode cache. 2.6.x requires
153 * us to do our own read_inode call and unlock it 154 * us to do our own read_inode call and unlock it
154 * afterwards. */ 155 * afterwards. */
155 if (inode && inode->i_state & I_NEW) {
156 mlog(0, "Inode was not in inode cache, reading it.\n");
157 ocfs2_read_locked_inode(inode, &args);
158 unlock_new_inode(inode);
159 }
160 if (inode == NULL) { 156 if (inode == NULL) {
161 inode = ERR_PTR(-ENOMEM); 157 inode = ERR_PTR(-ENOMEM);
162 mlog_errno(PTR_ERR(inode)); 158 mlog_errno(PTR_ERR(inode));
163 goto bail; 159 goto bail;
164 } 160 }
161 trace_ocfs2_iget5_locked(inode->i_state);
162 if (inode->i_state & I_NEW) {
163 ocfs2_read_locked_inode(inode, &args);
164 unlock_new_inode(inode);
165 }
165 if (is_bad_inode(inode)) { 166 if (is_bad_inode(inode)) {
166 iput(inode); 167 iput(inode);
167 inode = ERR_PTR(-ESTALE); 168 inode = ERR_PTR(-ESTALE);
@@ -170,9 +171,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
170 171
171bail: 172bail:
172 if (!IS_ERR(inode)) { 173 if (!IS_ERR(inode)) {
173 mlog(0, "returning inode with number %llu\n", 174 trace_ocfs2_iget_end(inode,
174 (unsigned long long)OCFS2_I(inode)->ip_blkno); 175 (unsigned long long)OCFS2_I(inode)->ip_blkno);
175 mlog_exit_ptr(inode);
176 } 176 }
177 177
178 return inode; 178 return inode;
@@ -192,18 +192,17 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
192 struct ocfs2_inode_info *oi = OCFS2_I(inode); 192 struct ocfs2_inode_info *oi = OCFS2_I(inode);
193 int ret = 0; 193 int ret = 0;
194 194
195 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
196
197 args = opaque; 195 args = opaque;
198 196
199 mlog_bug_on_msg(!inode, "No inode in find actor!\n"); 197 mlog_bug_on_msg(!inode, "No inode in find actor!\n");
200 198
199 trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
200
201 if (oi->ip_blkno != args->fi_blkno) 201 if (oi->ip_blkno != args->fi_blkno)
202 goto bail; 202 goto bail;
203 203
204 ret = 1; 204 ret = 1;
205bail: 205bail:
206 mlog_exit(ret);
207 return ret; 206 return ret;
208} 207}
209 208
@@ -218,8 +217,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
218 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, 217 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
219 ocfs2_file_ip_alloc_sem_key; 218 ocfs2_file_ip_alloc_sem_key;
220 219
221 mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
222
223 inode->i_ino = args->fi_ino; 220 inode->i_ino = args->fi_ino;
224 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 221 OCFS2_I(inode)->ip_blkno = args->fi_blkno;
225 if (args->fi_sysfile_type != 0) 222 if (args->fi_sysfile_type != 0)
@@ -235,7 +232,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
235 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, 232 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
236 &ocfs2_file_ip_alloc_sem_key); 233 &ocfs2_file_ip_alloc_sem_key);
237 234
238 mlog_exit(0);
239 return 0; 235 return 0;
240} 236}
241 237
@@ -246,9 +242,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
246 struct ocfs2_super *osb; 242 struct ocfs2_super *osb;
247 int use_plocks = 1; 243 int use_plocks = 1;
248 244
249 mlog_entry("(0x%p, size:%llu)\n", inode,
250 (unsigned long long)le64_to_cpu(fe->i_size));
251
252 sb = inode->i_sb; 245 sb = inode->i_sb;
253 osb = OCFS2_SB(sb); 246 osb = OCFS2_SB(sb);
254 247
@@ -300,20 +293,20 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
300 293
301 inode->i_nlink = ocfs2_read_links_count(fe); 294 inode->i_nlink = ocfs2_read_links_count(fe);
302 295
296 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
297 le32_to_cpu(fe->i_flags));
303 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 298 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
304 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 299 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
305 inode->i_flags |= S_NOQUOTA; 300 inode->i_flags |= S_NOQUOTA;
306 } 301 }
307 302
308 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 303 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
309 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 304 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
310 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
311 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 305 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
312 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 306 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
313 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { 307 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
314 inode->i_flags |= S_NOQUOTA; 308 inode->i_flags |= S_NOQUOTA;
315 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 309 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
316 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
317 /* we can't actually hit this as read_inode can't 310 /* we can't actually hit this as read_inode can't
318 * handle superblocks today ;-) */ 311 * handle superblocks today ;-) */
319 BUG(); 312 BUG();
@@ -381,7 +374,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
381 if (S_ISDIR(inode->i_mode)) 374 if (S_ISDIR(inode->i_mode))
382 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, 375 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
383 OCFS2_RESV_FLAG_DIR); 376 OCFS2_RESV_FLAG_DIR);
384 mlog_exit_void();
385} 377}
386 378
387static int ocfs2_read_locked_inode(struct inode *inode, 379static int ocfs2_read_locked_inode(struct inode *inode,
@@ -394,8 +386,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
394 int status, can_lock; 386 int status, can_lock;
395 u32 generation = 0; 387 u32 generation = 0;
396 388
397 mlog_entry("(0x%p, 0x%p)\n", inode, args);
398
399 status = -EINVAL; 389 status = -EINVAL;
400 if (inode == NULL || inode->i_sb == NULL) { 390 if (inode == NULL || inode->i_sb == NULL) {
401 mlog(ML_ERROR, "bad inode\n"); 391 mlog(ML_ERROR, "bad inode\n");
@@ -443,6 +433,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
443 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) 433 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
444 && !ocfs2_mount_local(osb); 434 && !ocfs2_mount_local(osb);
445 435
436 trace_ocfs2_read_locked_inode(
437 (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
438
446 /* 439 /*
447 * To maintain backwards compatibility with older versions of 440 * To maintain backwards compatibility with older versions of
448 * ocfs2-tools, we still store the generation value for system 441 * ocfs2-tools, we still store the generation value for system
@@ -534,7 +527,6 @@ bail:
534 if (args && bh) 527 if (args && bh)
535 brelse(bh); 528 brelse(bh);
536 529
537 mlog_exit(status);
538 return status; 530 return status;
539} 531}
540 532
@@ -551,8 +543,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
551 struct ocfs2_dinode *fe; 543 struct ocfs2_dinode *fe;
552 handle_t *handle = NULL; 544 handle_t *handle = NULL;
553 545
554 mlog_entry_void();
555
556 fe = (struct ocfs2_dinode *) fe_bh->b_data; 546 fe = (struct ocfs2_dinode *) fe_bh->b_data;
557 547
558 /* 548 /*
@@ -600,7 +590,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
600out: 590out:
601 if (handle) 591 if (handle)
602 ocfs2_commit_trans(osb, handle); 592 ocfs2_commit_trans(osb, handle);
603 mlog_exit(status);
604 return status; 593 return status;
605} 594}
606 595
@@ -696,8 +685,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
696 685
697 spin_lock(&osb->osb_lock); 686 spin_lock(&osb->osb_lock);
698 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { 687 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
699 mlog(0, "Recovery is happening on orphan dir %d, will skip "
700 "this inode\n", slot);
701 ret = -EDEADLK; 688 ret = -EDEADLK;
702 goto out; 689 goto out;
703 } 690 }
@@ -706,6 +693,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
706 osb->osb_orphan_wipes[slot]++; 693 osb->osb_orphan_wipes[slot]++;
707out: 694out:
708 spin_unlock(&osb->osb_lock); 695 spin_unlock(&osb->osb_lock);
696 trace_ocfs2_check_orphan_recovery_state(slot, ret);
709 return ret; 697 return ret;
710} 698}
711 699
@@ -816,6 +804,10 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
816 struct ocfs2_inode_info *oi = OCFS2_I(inode); 804 struct ocfs2_inode_info *oi = OCFS2_I(inode);
817 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 805 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
818 806
807 trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
808 (unsigned long long)oi->ip_blkno,
809 oi->ip_flags);
810
819 /* We shouldn't be getting here for the root directory 811 /* We shouldn't be getting here for the root directory
820 * inode.. */ 812 * inode.. */
821 if (inode == osb->root_inode) { 813 if (inode == osb->root_inode) {
@@ -828,11 +820,8 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
828 * have to skip deleting this guy. That's OK though because 820 * have to skip deleting this guy. That's OK though because
829 * the node who's doing the actual deleting should handle it 821 * the node who's doing the actual deleting should handle it
830 * anyway. */ 822 * anyway. */
831 if (current == osb->dc_task) { 823 if (current == osb->dc_task)
832 mlog(0, "Skipping delete of %lu because we're currently "
833 "in downconvert\n", inode->i_ino);
834 goto bail; 824 goto bail;
835 }
836 825
837 spin_lock(&oi->ip_lock); 826 spin_lock(&oi->ip_lock);
838 /* OCFS2 *never* deletes system files. This should technically 827 /* OCFS2 *never* deletes system files. This should technically
@@ -846,12 +835,9 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
846 835
847 /* If we have allowd wipe of this inode for another node, it 836 /* If we have allowd wipe of this inode for another node, it
848 * will be marked here so we can safely skip it. Recovery will 837 * will be marked here so we can safely skip it. Recovery will
849 * cleanup any inodes we might inadvertantly skip here. */ 838 * cleanup any inodes we might inadvertently skip here. */
850 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 839 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
851 mlog(0, "Skipping delete of %lu because another node "
852 "has done this for us.\n", inode->i_ino);
853 goto bail_unlock; 840 goto bail_unlock;
854 }
855 841
856 ret = 1; 842 ret = 1;
857bail_unlock: 843bail_unlock:
@@ -868,28 +854,27 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
868 struct buffer_head *di_bh, 854 struct buffer_head *di_bh,
869 int *wipe) 855 int *wipe)
870{ 856{
871 int status = 0; 857 int status = 0, reason = 0;
872 struct ocfs2_inode_info *oi = OCFS2_I(inode); 858 struct ocfs2_inode_info *oi = OCFS2_I(inode);
873 struct ocfs2_dinode *di; 859 struct ocfs2_dinode *di;
874 860
875 *wipe = 0; 861 *wipe = 0;
876 862
863 trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
864 inode->i_nlink);
865
877 /* While we were waiting for the cluster lock in 866 /* While we were waiting for the cluster lock in
878 * ocfs2_delete_inode, another node might have asked to delete 867 * ocfs2_delete_inode, another node might have asked to delete
879 * the inode. Recheck our flags to catch this. */ 868 * the inode. Recheck our flags to catch this. */
880 if (!ocfs2_inode_is_valid_to_delete(inode)) { 869 if (!ocfs2_inode_is_valid_to_delete(inode)) {
881 mlog(0, "Skipping delete of %llu because flags changed\n", 870 reason = 1;
882 (unsigned long long)oi->ip_blkno);
883 goto bail; 871 goto bail;
884 } 872 }
885 873
886 /* Now that we have an up to date inode, we can double check 874 /* Now that we have an up to date inode, we can double check
887 * the link count. */ 875 * the link count. */
888 if (inode->i_nlink) { 876 if (inode->i_nlink)
889 mlog(0, "Skipping delete of %llu because nlink = %u\n",
890 (unsigned long long)oi->ip_blkno, inode->i_nlink);
891 goto bail; 877 goto bail;
892 }
893 878
894 /* Do some basic inode verification... */ 879 /* Do some basic inode verification... */
895 di = (struct ocfs2_dinode *) di_bh->b_data; 880 di = (struct ocfs2_dinode *) di_bh->b_data;
@@ -904,9 +889,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
904 * ORPHANED_FL not. 889 * ORPHANED_FL not.
905 */ 890 */
906 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { 891 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
907 mlog(0, "Reflinked inode %llu is no longer orphaned. " 892 reason = 2;
908 "it shouldn't be deleted\n",
909 (unsigned long long)oi->ip_blkno);
910 goto bail; 893 goto bail;
911 } 894 }
912 895
@@ -934,7 +917,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
934 * the inode open lock in ocfs2_read_locked_inode(). When we 917 * the inode open lock in ocfs2_read_locked_inode(). When we
935 * get to ->delete_inode(), each node tries to convert it's 918 * get to ->delete_inode(), each node tries to convert it's
936 * lock to an exclusive. Trylocks are serialized by the inode 919 * lock to an exclusive. Trylocks are serialized by the inode
937 * meta data lock. If the upconvert suceeds, we know the inode 920 * meta data lock. If the upconvert succeeds, we know the inode
938 * is no longer live and can be deleted. 921 * is no longer live and can be deleted.
939 * 922 *
940 * Though we call this with the meta data lock held, the 923 * Though we call this with the meta data lock held, the
@@ -943,8 +926,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
943 status = ocfs2_try_open_lock(inode, 1); 926 status = ocfs2_try_open_lock(inode, 1);
944 if (status == -EAGAIN) { 927 if (status == -EAGAIN) {
945 status = 0; 928 status = 0;
946 mlog(0, "Skipping delete of %llu because it is in use on " 929 reason = 3;
947 "other nodes\n", (unsigned long long)oi->ip_blkno);
948 goto bail; 930 goto bail;
949 } 931 }
950 if (status < 0) { 932 if (status < 0) {
@@ -953,11 +935,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
953 } 935 }
954 936
955 *wipe = 1; 937 *wipe = 1;
956 mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n", 938 trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
957 (unsigned long long)oi->ip_blkno,
958 le16_to_cpu(di->i_orphaned_slot));
959 939
960bail: 940bail:
941 trace_ocfs2_query_inode_wipe_end(status, reason);
961 return status; 942 return status;
962} 943}
963 944
@@ -967,8 +948,8 @@ bail:
967static void ocfs2_cleanup_delete_inode(struct inode *inode, 948static void ocfs2_cleanup_delete_inode(struct inode *inode,
968 int sync_data) 949 int sync_data)
969{ 950{
970 mlog(0, "Cleanup inode %llu, sync = %d\n", 951 trace_ocfs2_cleanup_delete_inode(
971 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 952 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
972 if (sync_data) 953 if (sync_data)
973 write_inode_now(inode, 1); 954 write_inode_now(inode, 1);
974 truncate_inode_pages(&inode->i_data, 0); 955 truncate_inode_pages(&inode->i_data, 0);
@@ -980,15 +961,15 @@ static void ocfs2_delete_inode(struct inode *inode)
980 sigset_t oldset; 961 sigset_t oldset;
981 struct buffer_head *di_bh = NULL; 962 struct buffer_head *di_bh = NULL;
982 963
983 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 964 trace_ocfs2_delete_inode(inode->i_ino,
965 (unsigned long long)OCFS2_I(inode)->ip_blkno,
966 is_bad_inode(inode));
984 967
985 /* When we fail in read_inode() we mark inode as bad. The second test 968 /* When we fail in read_inode() we mark inode as bad. The second test
986 * catches the case when inode allocation fails before allocating 969 * catches the case when inode allocation fails before allocating
987 * a block for inode. */ 970 * a block for inode. */
988 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) { 971 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
989 mlog(0, "Skipping delete of bad inode\n");
990 goto bail; 972 goto bail;
991 }
992 973
993 dquot_initialize(inode); 974 dquot_initialize(inode);
994 975
@@ -1080,7 +1061,7 @@ bail_unlock_nfs_sync:
1080bail_unblock: 1061bail_unblock:
1081 ocfs2_unblock_signals(&oldset); 1062 ocfs2_unblock_signals(&oldset);
1082bail: 1063bail:
1083 mlog_exit_void(); 1064 return;
1084} 1065}
1085 1066
1086static void ocfs2_clear_inode(struct inode *inode) 1067static void ocfs2_clear_inode(struct inode *inode)
@@ -1088,11 +1069,9 @@ static void ocfs2_clear_inode(struct inode *inode)
1088 int status; 1069 int status;
1089 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1070 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1090 1071
1091 mlog_entry_void();
1092
1093 end_writeback(inode); 1072 end_writeback(inode);
1094 mlog(0, "Clearing inode: %llu, nlink = %u\n", 1073 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
1095 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); 1074 inode->i_nlink);
1096 1075
1097 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 1076 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
1098 "Inode=%lu\n", inode->i_ino); 1077 "Inode=%lu\n", inode->i_ino);
@@ -1181,8 +1160,6 @@ static void ocfs2_clear_inode(struct inode *inode)
1181 */ 1160 */
1182 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, 1161 jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
1183 &oi->ip_jinode); 1162 &oi->ip_jinode);
1184
1185 mlog_exit_void();
1186} 1163}
1187 1164
1188void ocfs2_evict_inode(struct inode *inode) 1165void ocfs2_evict_inode(struct inode *inode)
@@ -1204,17 +1181,14 @@ int ocfs2_drop_inode(struct inode *inode)
1204 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1181 struct ocfs2_inode_info *oi = OCFS2_I(inode);
1205 int res; 1182 int res;
1206 1183
1207 mlog_entry_void(); 1184 trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
1208 1185 inode->i_nlink, oi->ip_flags);
1209 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
1210 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
1211 1186
1212 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) 1187 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
1213 res = 1; 1188 res = 1;
1214 else 1189 else
1215 res = generic_drop_inode(inode); 1190 res = generic_drop_inode(inode);
1216 1191
1217 mlog_exit_void();
1218 return res; 1192 return res;
1219} 1193}
1220 1194
@@ -1226,11 +1200,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1226 struct inode *inode = dentry->d_inode; 1200 struct inode *inode = dentry->d_inode;
1227 int status = 0; 1201 int status = 0;
1228 1202
1229 mlog_entry("(inode = 0x%p, ino = %llu)\n", inode, 1203 trace_ocfs2_inode_revalidate(inode,
1230 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL); 1204 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
1205 inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
1231 1206
1232 if (!inode) { 1207 if (!inode) {
1233 mlog(0, "eep, no inode!\n");
1234 status = -ENOENT; 1208 status = -ENOENT;
1235 goto bail; 1209 goto bail;
1236 } 1210 }
@@ -1238,7 +1212,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1238 spin_lock(&OCFS2_I(inode)->ip_lock); 1212 spin_lock(&OCFS2_I(inode)->ip_lock);
1239 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 1213 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
1240 spin_unlock(&OCFS2_I(inode)->ip_lock); 1214 spin_unlock(&OCFS2_I(inode)->ip_lock);
1241 mlog(0, "inode deleted!\n");
1242 status = -ENOENT; 1215 status = -ENOENT;
1243 goto bail; 1216 goto bail;
1244 } 1217 }
@@ -1254,8 +1227,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
1254 } 1227 }
1255 ocfs2_inode_unlock(inode, 0); 1228 ocfs2_inode_unlock(inode, 0);
1256bail: 1229bail:
1257 mlog_exit(status);
1258
1259 return status; 1230 return status;
1260} 1231}
1261 1232
@@ -1271,8 +1242,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1271 int status; 1242 int status;
1272 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 1243 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
1273 1244
1274 mlog_entry("(inode %llu)\n", 1245 trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
1275 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1276 1246
1277 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1247 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
1278 OCFS2_JOURNAL_ACCESS_WRITE); 1248 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1302,7 +1272,6 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
1302 1272
1303 ocfs2_journal_dirty(handle, bh); 1273 ocfs2_journal_dirty(handle, bh);
1304leave: 1274leave:
1305 mlog_exit(status);
1306 return status; 1275 return status;
1307} 1276}
1308 1277
@@ -1345,8 +1314,7 @@ int ocfs2_validate_inode_block(struct super_block *sb,
1345 int rc; 1314 int rc;
1346 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1315 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
1347 1316
1348 mlog(0, "Validating dinode %llu\n", 1317 trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
1349 (unsigned long long)bh->b_blocknr);
1350 1318
1351 BUG_ON(!buffer_uptodate(bh)); 1319 BUG_ON(!buffer_uptodate(bh));
1352 1320
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7a4868196152..8f13c5989eae 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -9,7 +9,6 @@
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/compat.h> 10#include <linux/compat.h>
11 11
12#define MLOG_MASK_PREFIX ML_INODE
13#include <cluster/masklog.h> 12#include <cluster/masklog.h>
14 13
15#include "ocfs2.h" 14#include "ocfs2.h"
@@ -46,6 +45,22 @@ static inline void __o2info_set_request_error(struct ocfs2_info_request *kreq,
46#define o2info_set_request_error(a, b) \ 45#define o2info_set_request_error(a, b) \
47 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b) 46 __o2info_set_request_error((struct ocfs2_info_request *)&(a), b)
48 47
48static inline void __o2info_set_request_filled(struct ocfs2_info_request *req)
49{
50 req->ir_flags |= OCFS2_INFO_FL_FILLED;
51}
52
53#define o2info_set_request_filled(a) \
54 __o2info_set_request_filled((struct ocfs2_info_request *)&(a))
55
56static inline void __o2info_clear_request_filled(struct ocfs2_info_request *req)
57{
58 req->ir_flags &= ~OCFS2_INFO_FL_FILLED;
59}
60
61#define o2info_clear_request_filled(a) \
62 __o2info_clear_request_filled((struct ocfs2_info_request *)&(a))
63
49static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags) 64static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
50{ 65{
51 int status; 66 int status;
@@ -59,7 +74,6 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
59 *flags = OCFS2_I(inode)->ip_attr; 74 *flags = OCFS2_I(inode)->ip_attr;
60 ocfs2_inode_unlock(inode, 0); 75 ocfs2_inode_unlock(inode, 0);
61 76
62 mlog_exit(status);
63 return status; 77 return status;
64} 78}
65 79
@@ -82,7 +96,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
82 } 96 }
83 97
84 status = -EACCES; 98 status = -EACCES;
85 if (!is_owner_or_cap(inode)) 99 if (!inode_owner_or_capable(inode))
86 goto bail_unlock; 100 goto bail_unlock;
87 101
88 if (!S_ISDIR(inode->i_mode)) 102 if (!S_ISDIR(inode->i_mode))
@@ -125,7 +139,6 @@ bail:
125 139
126 brelse(bh); 140 brelse(bh);
127 141
128 mlog_exit(status);
129 return status; 142 return status;
130} 143}
131 144
@@ -139,7 +152,8 @@ int ocfs2_info_handle_blocksize(struct inode *inode,
139 goto bail; 152 goto bail;
140 153
141 oib.ib_blocksize = inode->i_sb->s_blocksize; 154 oib.ib_blocksize = inode->i_sb->s_blocksize;
142 oib.ib_req.ir_flags |= OCFS2_INFO_FL_FILLED; 155
156 o2info_set_request_filled(oib);
143 157
144 if (o2info_to_user(oib, req)) 158 if (o2info_to_user(oib, req))
145 goto bail; 159 goto bail;
@@ -163,7 +177,8 @@ int ocfs2_info_handle_clustersize(struct inode *inode,
163 goto bail; 177 goto bail;
164 178
165 oic.ic_clustersize = osb->s_clustersize; 179 oic.ic_clustersize = osb->s_clustersize;
166 oic.ic_req.ir_flags |= OCFS2_INFO_FL_FILLED; 180
181 o2info_set_request_filled(oic);
167 182
168 if (o2info_to_user(oic, req)) 183 if (o2info_to_user(oic, req))
169 goto bail; 184 goto bail;
@@ -187,7 +202,8 @@ int ocfs2_info_handle_maxslots(struct inode *inode,
187 goto bail; 202 goto bail;
188 203
189 oim.im_max_slots = osb->max_slots; 204 oim.im_max_slots = osb->max_slots;
190 oim.im_req.ir_flags |= OCFS2_INFO_FL_FILLED; 205
206 o2info_set_request_filled(oim);
191 207
192 if (o2info_to_user(oim, req)) 208 if (o2info_to_user(oim, req))
193 goto bail; 209 goto bail;
@@ -211,7 +227,8 @@ int ocfs2_info_handle_label(struct inode *inode,
211 goto bail; 227 goto bail;
212 228
213 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); 229 memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN);
214 oil.il_req.ir_flags |= OCFS2_INFO_FL_FILLED; 230
231 o2info_set_request_filled(oil);
215 232
216 if (o2info_to_user(oil, req)) 233 if (o2info_to_user(oil, req))
217 goto bail; 234 goto bail;
@@ -235,7 +252,8 @@ int ocfs2_info_handle_uuid(struct inode *inode,
235 goto bail; 252 goto bail;
236 253
237 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); 254 memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1);
238 oiu.iu_req.ir_flags |= OCFS2_INFO_FL_FILLED; 255
256 o2info_set_request_filled(oiu);
239 257
240 if (o2info_to_user(oiu, req)) 258 if (o2info_to_user(oiu, req))
241 goto bail; 259 goto bail;
@@ -261,7 +279,8 @@ int ocfs2_info_handle_fs_features(struct inode *inode,
261 oif.if_compat_features = osb->s_feature_compat; 279 oif.if_compat_features = osb->s_feature_compat;
262 oif.if_incompat_features = osb->s_feature_incompat; 280 oif.if_incompat_features = osb->s_feature_incompat;
263 oif.if_ro_compat_features = osb->s_feature_ro_compat; 281 oif.if_ro_compat_features = osb->s_feature_ro_compat;
264 oif.if_req.ir_flags |= OCFS2_INFO_FL_FILLED; 282
283 o2info_set_request_filled(oif);
265 284
266 if (o2info_to_user(oif, req)) 285 if (o2info_to_user(oif, req))
267 goto bail; 286 goto bail;
@@ -286,7 +305,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode,
286 305
287 oij.ij_journal_size = osb->journal->j_inode->i_size; 306 oij.ij_journal_size = osb->journal->j_inode->i_size;
288 307
289 oij.ij_req.ir_flags |= OCFS2_INFO_FL_FILLED; 308 o2info_set_request_filled(oij);
290 309
291 if (o2info_to_user(oij, req)) 310 if (o2info_to_user(oij, req))
292 goto bail; 311 goto bail;
@@ -308,7 +327,7 @@ int ocfs2_info_handle_unknown(struct inode *inode,
308 if (o2info_from_user(oir, req)) 327 if (o2info_from_user(oir, req))
309 goto bail; 328 goto bail;
310 329
311 oir.ir_flags &= ~OCFS2_INFO_FL_FILLED; 330 o2info_clear_request_filled(oir);
312 331
313 if (o2info_to_user(oir, req)) 332 if (o2info_to_user(oir, req))
314 goto bail; 333 goto bail;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index faa2303dbf0a..b141a44605ca 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -31,7 +31,6 @@
31#include <linux/time.h> 31#include <linux/time.h>
32#include <linux/random.h> 32#include <linux/random.h>
33 33
34#define MLOG_MASK_PREFIX ML_JOURNAL
35#include <cluster/masklog.h> 34#include <cluster/masklog.h>
36 35
37#include "ocfs2.h" 36#include "ocfs2.h"
@@ -52,6 +51,7 @@
52#include "quota.h" 51#include "quota.h"
53 52
54#include "buffer_head_io.h" 53#include "buffer_head_io.h"
54#include "ocfs2_trace.h"
55 55
56DEFINE_SPINLOCK(trans_inc_lock); 56DEFINE_SPINLOCK(trans_inc_lock);
57 57
@@ -303,16 +303,15 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
303 unsigned int flushed; 303 unsigned int flushed;
304 struct ocfs2_journal *journal = NULL; 304 struct ocfs2_journal *journal = NULL;
305 305
306 mlog_entry_void();
307
308 journal = osb->journal; 306 journal = osb->journal;
309 307
310 /* Flush all pending commits and checkpoint the journal. */ 308 /* Flush all pending commits and checkpoint the journal. */
311 down_write(&journal->j_trans_barrier); 309 down_write(&journal->j_trans_barrier);
312 310
313 if (atomic_read(&journal->j_num_trans) == 0) { 311 flushed = atomic_read(&journal->j_num_trans);
312 trace_ocfs2_commit_cache_begin(flushed);
313 if (flushed == 0) {
314 up_write(&journal->j_trans_barrier); 314 up_write(&journal->j_trans_barrier);
315 mlog(0, "No transactions for me to flush!\n");
316 goto finally; 315 goto finally;
317 } 316 }
318 317
@@ -331,13 +330,11 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
331 atomic_set(&journal->j_num_trans, 0); 330 atomic_set(&journal->j_num_trans, 0);
332 up_write(&journal->j_trans_barrier); 331 up_write(&journal->j_trans_barrier);
333 332
334 mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n", 333 trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
335 journal->j_trans_id, flushed);
336 334
337 ocfs2_wake_downconvert_thread(osb); 335 ocfs2_wake_downconvert_thread(osb);
338 wake_up(&journal->j_checkpointed); 336 wake_up(&journal->j_checkpointed);
339finally: 337finally:
340 mlog_exit(status);
341 return status; 338 return status;
342} 339}
343 340
@@ -425,9 +422,8 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
425 return 0; 422 return 0;
426 423
427 old_nblocks = handle->h_buffer_credits; 424 old_nblocks = handle->h_buffer_credits;
428 mlog_entry_void();
429 425
430 mlog(0, "Trying to extend transaction by %d blocks\n", nblocks); 426 trace_ocfs2_extend_trans(old_nblocks, nblocks);
431 427
432#ifdef CONFIG_OCFS2_DEBUG_FS 428#ifdef CONFIG_OCFS2_DEBUG_FS
433 status = 1; 429 status = 1;
@@ -440,9 +436,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
440#endif 436#endif
441 437
442 if (status > 0) { 438 if (status > 0) {
443 mlog(0, 439 trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
444 "jbd2_journal_extend failed, trying "
445 "jbd2_journal_restart\n");
446 status = jbd2_journal_restart(handle, 440 status = jbd2_journal_restart(handle,
447 old_nblocks + nblocks); 441 old_nblocks + nblocks);
448 if (status < 0) { 442 if (status < 0) {
@@ -453,8 +447,6 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
453 447
454 status = 0; 448 status = 0;
455bail: 449bail:
456
457 mlog_exit(status);
458 return status; 450 return status;
459} 451}
460 452
@@ -622,12 +614,9 @@ static int __ocfs2_journal_access(handle_t *handle,
622 BUG_ON(!handle); 614 BUG_ON(!handle);
623 BUG_ON(!bh); 615 BUG_ON(!bh);
624 616
625 mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n", 617 trace_ocfs2_journal_access(
626 (unsigned long long)bh->b_blocknr, type, 618 (unsigned long long)ocfs2_metadata_cache_owner(ci),
627 (type == OCFS2_JOURNAL_ACCESS_CREATE) ? 619 (unsigned long long)bh->b_blocknr, type, bh->b_size);
628 "OCFS2_JOURNAL_ACCESS_CREATE" :
629 "OCFS2_JOURNAL_ACCESS_WRITE",
630 bh->b_size);
631 620
632 /* we can safely remove this assertion after testing. */ 621 /* we can safely remove this assertion after testing. */
633 if (!buffer_uptodate(bh)) { 622 if (!buffer_uptodate(bh)) {
@@ -668,7 +657,6 @@ static int __ocfs2_journal_access(handle_t *handle,
668 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", 657 mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
669 status, type); 658 status, type);
670 659
671 mlog_exit(status);
672 return status; 660 return status;
673} 661}
674 662
@@ -737,13 +725,10 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
737{ 725{
738 int status; 726 int status;
739 727
740 mlog_entry("(bh->b_blocknr=%llu)\n", 728 trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
741 (unsigned long long)bh->b_blocknr);
742 729
743 status = jbd2_journal_dirty_metadata(handle, bh); 730 status = jbd2_journal_dirty_metadata(handle, bh);
744 BUG_ON(status); 731 BUG_ON(status);
745
746 mlog_exit_void();
747} 732}
748 733
749#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) 734#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -775,8 +760,6 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
775 struct ocfs2_super *osb; 760 struct ocfs2_super *osb;
776 int inode_lock = 0; 761 int inode_lock = 0;
777 762
778 mlog_entry_void();
779
780 BUG_ON(!journal); 763 BUG_ON(!journal);
781 764
782 osb = journal->j_osb; 765 osb = journal->j_osb;
@@ -820,10 +803,9 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
820 goto done; 803 goto done;
821 } 804 }
822 805
823 mlog(0, "inode->i_size = %lld\n", inode->i_size); 806 trace_ocfs2_journal_init(inode->i_size,
824 mlog(0, "inode->i_blocks = %llu\n", 807 (unsigned long long)inode->i_blocks,
825 (unsigned long long)inode->i_blocks); 808 OCFS2_I(inode)->ip_clusters);
826 mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
827 809
828 /* call the kernels journal init function now */ 810 /* call the kernels journal init function now */
829 j_journal = jbd2_journal_init_inode(inode); 811 j_journal = jbd2_journal_init_inode(inode);
@@ -833,8 +815,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
833 goto done; 815 goto done;
834 } 816 }
835 817
836 mlog(0, "Returned from jbd2_journal_init_inode\n"); 818 trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
837 mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
838 819
839 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & 820 *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
840 OCFS2_JOURNAL_DIRTY_FL); 821 OCFS2_JOURNAL_DIRTY_FL);
@@ -859,7 +840,6 @@ done:
859 } 840 }
860 } 841 }
861 842
862 mlog_exit(status);
863 return status; 843 return status;
864} 844}
865 845
@@ -882,8 +862,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
882 struct buffer_head *bh = journal->j_bh; 862 struct buffer_head *bh = journal->j_bh;
883 struct ocfs2_dinode *fe; 863 struct ocfs2_dinode *fe;
884 864
885 mlog_entry_void();
886
887 fe = (struct ocfs2_dinode *)bh->b_data; 865 fe = (struct ocfs2_dinode *)bh->b_data;
888 866
889 /* The journal bh on the osb always comes from ocfs2_journal_init() 867 /* The journal bh on the osb always comes from ocfs2_journal_init()
@@ -906,7 +884,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
906 if (status < 0) 884 if (status < 0)
907 mlog_errno(status); 885 mlog_errno(status);
908 886
909 mlog_exit(status);
910 return status; 887 return status;
911} 888}
912 889
@@ -921,8 +898,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
921 struct inode *inode = NULL; 898 struct inode *inode = NULL;
922 int num_running_trans = 0; 899 int num_running_trans = 0;
923 900
924 mlog_entry_void();
925
926 BUG_ON(!osb); 901 BUG_ON(!osb);
927 902
928 journal = osb->journal; 903 journal = osb->journal;
@@ -939,10 +914,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
939 BUG(); 914 BUG();
940 915
941 num_running_trans = atomic_read(&(osb->journal->j_num_trans)); 916 num_running_trans = atomic_read(&(osb->journal->j_num_trans));
942 if (num_running_trans > 0) 917 trace_ocfs2_journal_shutdown(num_running_trans);
943 mlog(0, "Shutting down journal: must wait on %d "
944 "running transactions!\n",
945 num_running_trans);
946 918
947 /* Do a commit_cache here. It will flush our journal, *and* 919 /* Do a commit_cache here. It will flush our journal, *and*
948 * release any locks that are still held. 920 * release any locks that are still held.
@@ -955,7 +927,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
955 * completely destroy the journal. */ 927 * completely destroy the journal. */
956 if (osb->commit_task) { 928 if (osb->commit_task) {
957 /* Wait for the commit thread */ 929 /* Wait for the commit thread */
958 mlog(0, "Waiting for ocfs2commit to exit....\n"); 930 trace_ocfs2_journal_shutdown_wait(osb->commit_task);
959 kthread_stop(osb->commit_task); 931 kthread_stop(osb->commit_task);
960 osb->commit_task = NULL; 932 osb->commit_task = NULL;
961 } 933 }
@@ -998,7 +970,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
998done: 970done:
999 if (inode) 971 if (inode)
1000 iput(inode); 972 iput(inode);
1001 mlog_exit_void();
1002} 973}
1003 974
1004static void ocfs2_clear_journal_error(struct super_block *sb, 975static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1024,8 +995,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
1024 int status = 0; 995 int status = 0;
1025 struct ocfs2_super *osb; 996 struct ocfs2_super *osb;
1026 997
1027 mlog_entry_void();
1028
1029 BUG_ON(!journal); 998 BUG_ON(!journal);
1030 999
1031 osb = journal->j_osb; 1000 osb = journal->j_osb;
@@ -1059,7 +1028,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
1059 osb->commit_task = NULL; 1028 osb->commit_task = NULL;
1060 1029
1061done: 1030done:
1062 mlog_exit(status);
1063 return status; 1031 return status;
1064} 1032}
1065 1033
@@ -1070,8 +1038,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
1070{ 1038{
1071 int status; 1039 int status;
1072 1040
1073 mlog_entry_void();
1074
1075 BUG_ON(!journal); 1041 BUG_ON(!journal);
1076 1042
1077 status = jbd2_journal_wipe(journal->j_journal, full); 1043 status = jbd2_journal_wipe(journal->j_journal, full);
@@ -1085,7 +1051,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
1085 mlog_errno(status); 1051 mlog_errno(status);
1086 1052
1087bail: 1053bail:
1088 mlog_exit(status);
1089 return status; 1054 return status;
1090} 1055}
1091 1056
@@ -1124,8 +1089,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
1124#define CONCURRENT_JOURNAL_FILL 32ULL 1089#define CONCURRENT_JOURNAL_FILL 32ULL
1125 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 1090 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
1126 1091
1127 mlog_entry_void();
1128
1129 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); 1092 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
1130 1093
1131 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); 1094 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
@@ -1161,7 +1124,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
1161bail: 1124bail:
1162 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) 1125 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
1163 brelse(bhs[i]); 1126 brelse(bhs[i]);
1164 mlog_exit(status);
1165 return status; 1127 return status;
1166} 1128}
1167 1129
@@ -1185,7 +1147,7 @@ struct ocfs2_la_recovery_item {
1185 */ 1147 */
1186void ocfs2_complete_recovery(struct work_struct *work) 1148void ocfs2_complete_recovery(struct work_struct *work)
1187{ 1149{
1188 int ret; 1150 int ret = 0;
1189 struct ocfs2_journal *journal = 1151 struct ocfs2_journal *journal =
1190 container_of(work, struct ocfs2_journal, j_recovery_work); 1152 container_of(work, struct ocfs2_journal, j_recovery_work);
1191 struct ocfs2_super *osb = journal->j_osb; 1153 struct ocfs2_super *osb = journal->j_osb;
@@ -1194,9 +1156,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
1194 struct ocfs2_quota_recovery *qrec; 1156 struct ocfs2_quota_recovery *qrec;
1195 LIST_HEAD(tmp_la_list); 1157 LIST_HEAD(tmp_la_list);
1196 1158
1197 mlog_entry_void(); 1159 trace_ocfs2_complete_recovery(
1198 1160 (unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
1199 mlog(0, "completing recovery from keventd\n");
1200 1161
1201 spin_lock(&journal->j_lock); 1162 spin_lock(&journal->j_lock);
1202 list_splice_init(&journal->j_la_cleanups, &tmp_la_list); 1163 list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
@@ -1205,15 +1166,18 @@ void ocfs2_complete_recovery(struct work_struct *work)
1205 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { 1166 list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
1206 list_del_init(&item->lri_list); 1167 list_del_init(&item->lri_list);
1207 1168
1208 mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
1209
1210 ocfs2_wait_on_quotas(osb); 1169 ocfs2_wait_on_quotas(osb);
1211 1170
1212 la_dinode = item->lri_la_dinode; 1171 la_dinode = item->lri_la_dinode;
1213 if (la_dinode) { 1172 tl_dinode = item->lri_tl_dinode;
1214 mlog(0, "Clean up local alloc %llu\n", 1173 qrec = item->lri_qrec;
1215 (unsigned long long)le64_to_cpu(la_dinode->i_blkno)); 1174
1175 trace_ocfs2_complete_recovery_slot(item->lri_slot,
1176 la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
1177 tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
1178 qrec);
1216 1179
1180 if (la_dinode) {
1217 ret = ocfs2_complete_local_alloc_recovery(osb, 1181 ret = ocfs2_complete_local_alloc_recovery(osb,
1218 la_dinode); 1182 la_dinode);
1219 if (ret < 0) 1183 if (ret < 0)
@@ -1222,11 +1186,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1222 kfree(la_dinode); 1186 kfree(la_dinode);
1223 } 1187 }
1224 1188
1225 tl_dinode = item->lri_tl_dinode;
1226 if (tl_dinode) { 1189 if (tl_dinode) {
1227 mlog(0, "Clean up truncate log %llu\n",
1228 (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
1229
1230 ret = ocfs2_complete_truncate_log_recovery(osb, 1190 ret = ocfs2_complete_truncate_log_recovery(osb,
1231 tl_dinode); 1191 tl_dinode);
1232 if (ret < 0) 1192 if (ret < 0)
@@ -1239,9 +1199,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1239 if (ret < 0) 1199 if (ret < 0)
1240 mlog_errno(ret); 1200 mlog_errno(ret);
1241 1201
1242 qrec = item->lri_qrec;
1243 if (qrec) { 1202 if (qrec) {
1244 mlog(0, "Recovering quota files");
1245 ret = ocfs2_finish_quota_recovery(osb, qrec, 1203 ret = ocfs2_finish_quota_recovery(osb, qrec,
1246 item->lri_slot); 1204 item->lri_slot);
1247 if (ret < 0) 1205 if (ret < 0)
@@ -1252,8 +1210,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
1252 kfree(item); 1210 kfree(item);
1253 } 1211 }
1254 1212
1255 mlog(0, "Recovery completion\n"); 1213 trace_ocfs2_complete_recovery_end(ret);
1256 mlog_exit_void();
1257} 1214}
1258 1215
1259/* NOTE: This function always eats your references to la_dinode and 1216/* NOTE: This function always eats your references to la_dinode and
@@ -1339,8 +1296,6 @@ static int __ocfs2_recovery_thread(void *arg)
1339 int rm_quota_used = 0, i; 1296 int rm_quota_used = 0, i;
1340 struct ocfs2_quota_recovery *qrec; 1297 struct ocfs2_quota_recovery *qrec;
1341 1298
1342 mlog_entry_void();
1343
1344 status = ocfs2_wait_on_mount(osb); 1299 status = ocfs2_wait_on_mount(osb);
1345 if (status < 0) { 1300 if (status < 0) {
1346 goto bail; 1301 goto bail;
@@ -1372,15 +1327,12 @@ restart:
1372 * clear it until ocfs2_recover_node() has succeeded. */ 1327 * clear it until ocfs2_recover_node() has succeeded. */
1373 node_num = rm->rm_entries[0]; 1328 node_num = rm->rm_entries[0];
1374 spin_unlock(&osb->osb_lock); 1329 spin_unlock(&osb->osb_lock);
1375 mlog(0, "checking node %d\n", node_num);
1376 slot_num = ocfs2_node_num_to_slot(osb, node_num); 1330 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1331 trace_ocfs2_recovery_thread_node(node_num, slot_num);
1377 if (slot_num == -ENOENT) { 1332 if (slot_num == -ENOENT) {
1378 status = 0; 1333 status = 0;
1379 mlog(0, "no slot for this node, so no recovery"
1380 "required.\n");
1381 goto skip_recovery; 1334 goto skip_recovery;
1382 } 1335 }
1383 mlog(0, "node %d was using slot %d\n", node_num, slot_num);
1384 1336
1385 /* It is a bit subtle with quota recovery. We cannot do it 1337 /* It is a bit subtle with quota recovery. We cannot do it
1386 * immediately because we have to obtain cluster locks from 1338 * immediately because we have to obtain cluster locks from
@@ -1407,7 +1359,7 @@ skip_recovery:
1407 spin_lock(&osb->osb_lock); 1359 spin_lock(&osb->osb_lock);
1408 } 1360 }
1409 spin_unlock(&osb->osb_lock); 1361 spin_unlock(&osb->osb_lock);
1410 mlog(0, "All nodes recovered\n"); 1362 trace_ocfs2_recovery_thread_end(status);
1411 1363
1412 /* Refresh all journal recovery generations from disk */ 1364 /* Refresh all journal recovery generations from disk */
1413 status = ocfs2_check_journals_nolocks(osb); 1365 status = ocfs2_check_journals_nolocks(osb);
@@ -1416,7 +1368,7 @@ skip_recovery:
1416 mlog_errno(status); 1368 mlog_errno(status);
1417 1369
1418 /* Now it is right time to recover quotas... We have to do this under 1370 /* Now it is right time to recover quotas... We have to do this under
1419 * superblock lock so that noone can start using the slot (and crash) 1371 * superblock lock so that no one can start using the slot (and crash)
1420 * before we recover it */ 1372 * before we recover it */
1421 for (i = 0; i < rm_quota_used; i++) { 1373 for (i = 0; i < rm_quota_used; i++) {
1422 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); 1374 qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
@@ -1451,7 +1403,6 @@ bail:
1451 if (rm_quota) 1403 if (rm_quota)
1452 kfree(rm_quota); 1404 kfree(rm_quota);
1453 1405
1454 mlog_exit(status);
1455 /* no one is callint kthread_stop() for us so the kthread() api 1406 /* no one is callint kthread_stop() for us so the kthread() api
1456 * requires that we call do_exit(). And it isn't exported, but 1407 * requires that we call do_exit(). And it isn't exported, but
1457 * complete_and_exit() seems to be a minimal wrapper around it. */ 1408 * complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1461,19 +1412,15 @@ bail:
1461 1412
1462void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) 1413void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1463{ 1414{
1464 mlog_entry("(node_num=%d, osb->node_num = %d)\n",
1465 node_num, osb->node_num);
1466
1467 mutex_lock(&osb->recovery_lock); 1415 mutex_lock(&osb->recovery_lock);
1468 if (osb->disable_recovery)
1469 goto out;
1470 1416
1471 /* People waiting on recovery will wait on 1417 trace_ocfs2_recovery_thread(node_num, osb->node_num,
1472 * the recovery map to empty. */ 1418 osb->disable_recovery, osb->recovery_thread_task,
1473 if (ocfs2_recovery_map_set(osb, node_num)) 1419 osb->disable_recovery ?
1474 mlog(0, "node %d already in recovery map.\n", node_num); 1420 -1 : ocfs2_recovery_map_set(osb, node_num));
1475 1421
1476 mlog(0, "starting recovery thread...\n"); 1422 if (osb->disable_recovery)
1423 goto out;
1477 1424
1478 if (osb->recovery_thread_task) 1425 if (osb->recovery_thread_task)
1479 goto out; 1426 goto out;
@@ -1488,8 +1435,6 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
1488out: 1435out:
1489 mutex_unlock(&osb->recovery_lock); 1436 mutex_unlock(&osb->recovery_lock);
1490 wake_up(&osb->recovery_event); 1437 wake_up(&osb->recovery_event);
1491
1492 mlog_exit_void();
1493} 1438}
1494 1439
1495static int ocfs2_read_journal_inode(struct ocfs2_super *osb, 1440static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
@@ -1563,7 +1508,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1563 * If not, it needs recovery. 1508 * If not, it needs recovery.
1564 */ 1509 */
1565 if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) { 1510 if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
1566 mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num, 1511 trace_ocfs2_replay_journal_recovered(slot_num,
1567 osb->slot_recovery_generations[slot_num], slot_reco_gen); 1512 osb->slot_recovery_generations[slot_num], slot_reco_gen);
1568 osb->slot_recovery_generations[slot_num] = slot_reco_gen; 1513 osb->slot_recovery_generations[slot_num] = slot_reco_gen;
1569 status = -EBUSY; 1514 status = -EBUSY;
@@ -1574,7 +1519,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1574 1519
1575 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY); 1520 status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
1576 if (status < 0) { 1521 if (status < 0) {
1577 mlog(0, "status returned from ocfs2_inode_lock=%d\n", status); 1522 trace_ocfs2_replay_journal_lock_err(status);
1578 if (status != -ERESTARTSYS) 1523 if (status != -ERESTARTSYS)
1579 mlog(ML_ERROR, "Could not lock journal!\n"); 1524 mlog(ML_ERROR, "Could not lock journal!\n");
1580 goto done; 1525 goto done;
@@ -1587,7 +1532,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1587 slot_reco_gen = ocfs2_get_recovery_generation(fe); 1532 slot_reco_gen = ocfs2_get_recovery_generation(fe);
1588 1533
1589 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) { 1534 if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
1590 mlog(0, "No recovery required for node %d\n", node_num); 1535 trace_ocfs2_replay_journal_skip(node_num);
1591 /* Refresh recovery generation for the slot */ 1536 /* Refresh recovery generation for the slot */
1592 osb->slot_recovery_generations[slot_num] = slot_reco_gen; 1537 osb->slot_recovery_generations[slot_num] = slot_reco_gen;
1593 goto done; 1538 goto done;
@@ -1608,7 +1553,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1608 goto done; 1553 goto done;
1609 } 1554 }
1610 1555
1611 mlog(0, "calling journal_init_inode\n");
1612 journal = jbd2_journal_init_inode(inode); 1556 journal = jbd2_journal_init_inode(inode);
1613 if (journal == NULL) { 1557 if (journal == NULL) {
1614 mlog(ML_ERROR, "Linux journal layer error\n"); 1558 mlog(ML_ERROR, "Linux journal layer error\n");
@@ -1628,7 +1572,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1628 ocfs2_clear_journal_error(osb->sb, journal, slot_num); 1572 ocfs2_clear_journal_error(osb->sb, journal, slot_num);
1629 1573
1630 /* wipe the journal */ 1574 /* wipe the journal */
1631 mlog(0, "flushing the journal.\n");
1632 jbd2_journal_lock_updates(journal); 1575 jbd2_journal_lock_updates(journal);
1633 status = jbd2_journal_flush(journal); 1576 status = jbd2_journal_flush(journal);
1634 jbd2_journal_unlock_updates(journal); 1577 jbd2_journal_unlock_updates(journal);
@@ -1665,7 +1608,6 @@ done:
1665 1608
1666 brelse(bh); 1609 brelse(bh);
1667 1610
1668 mlog_exit(status);
1669 return status; 1611 return status;
1670} 1612}
1671 1613
@@ -1688,8 +1630,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1688 struct ocfs2_dinode *la_copy = NULL; 1630 struct ocfs2_dinode *la_copy = NULL;
1689 struct ocfs2_dinode *tl_copy = NULL; 1631 struct ocfs2_dinode *tl_copy = NULL;
1690 1632
1691 mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n", 1633 trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
1692 node_num, slot_num, osb->node_num);
1693 1634
1694 /* Should not ever be called to recover ourselves -- in that 1635 /* Should not ever be called to recover ourselves -- in that
1695 * case we should've called ocfs2_journal_load instead. */ 1636 * case we should've called ocfs2_journal_load instead. */
@@ -1698,9 +1639,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1698 status = ocfs2_replay_journal(osb, node_num, slot_num); 1639 status = ocfs2_replay_journal(osb, node_num, slot_num);
1699 if (status < 0) { 1640 if (status < 0) {
1700 if (status == -EBUSY) { 1641 if (status == -EBUSY) {
1701 mlog(0, "Skipping recovery for slot %u (node %u) " 1642 trace_ocfs2_recover_node_skip(slot_num, node_num);
1702 "as another node has recovered it\n", slot_num,
1703 node_num);
1704 status = 0; 1643 status = 0;
1705 goto done; 1644 goto done;
1706 } 1645 }
@@ -1735,7 +1674,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1735 status = 0; 1674 status = 0;
1736done: 1675done:
1737 1676
1738 mlog_exit(status);
1739 return status; 1677 return status;
1740} 1678}
1741 1679
@@ -1808,8 +1746,8 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1808 spin_lock(&osb->osb_lock); 1746 spin_lock(&osb->osb_lock);
1809 osb->slot_recovery_generations[i] = gen; 1747 osb->slot_recovery_generations[i] = gen;
1810 1748
1811 mlog(0, "Slot %u recovery generation is %u\n", i, 1749 trace_ocfs2_mark_dead_nodes(i,
1812 osb->slot_recovery_generations[i]); 1750 osb->slot_recovery_generations[i]);
1813 1751
1814 if (i == osb->slot_num) { 1752 if (i == osb->slot_num) {
1815 spin_unlock(&osb->osb_lock); 1753 spin_unlock(&osb->osb_lock);
@@ -1845,7 +1783,6 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1845 1783
1846 status = 0; 1784 status = 0;
1847bail: 1785bail:
1848 mlog_exit(status);
1849 return status; 1786 return status;
1850} 1787}
1851 1788
@@ -1884,11 +1821,12 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1884 1821
1885 os = &osb->osb_orphan_scan; 1822 os = &osb->osb_orphan_scan;
1886 1823
1887 mlog(0, "Begin orphan scan\n");
1888
1889 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE) 1824 if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
1890 goto out; 1825 goto out;
1891 1826
1827 trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
1828 atomic_read(&os->os_state));
1829
1892 status = ocfs2_orphan_scan_lock(osb, &seqno); 1830 status = ocfs2_orphan_scan_lock(osb, &seqno);
1893 if (status < 0) { 1831 if (status < 0) {
1894 if (status != -EAGAIN) 1832 if (status != -EAGAIN)
@@ -1918,7 +1856,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
1918unlock: 1856unlock:
1919 ocfs2_orphan_scan_unlock(osb, seqno); 1857 ocfs2_orphan_scan_unlock(osb, seqno);
1920out: 1858out:
1921 mlog(0, "Orphan scan completed\n"); 1859 trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
1860 atomic_read(&os->os_state));
1922 return; 1861 return;
1923} 1862}
1924 1863
@@ -2002,8 +1941,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
2002 if (IS_ERR(iter)) 1941 if (IS_ERR(iter))
2003 return 0; 1942 return 0;
2004 1943
2005 mlog(0, "queue orphan %llu\n", 1944 trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
2006 (unsigned long long)OCFS2_I(iter)->ip_blkno);
2007 /* No locking is required for the next_orphan queue as there 1945 /* No locking is required for the next_orphan queue as there
2008 * is only ever a single process doing orphan recovery. */ 1946 * is only ever a single process doing orphan recovery. */
2009 OCFS2_I(iter)->ip_next_orphan = p->head; 1947 OCFS2_I(iter)->ip_next_orphan = p->head;
@@ -2119,7 +2057,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2119 struct inode *iter; 2057 struct inode *iter;
2120 struct ocfs2_inode_info *oi; 2058 struct ocfs2_inode_info *oi;
2121 2059
2122 mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); 2060 trace_ocfs2_recover_orphans(slot);
2123 2061
2124 ocfs2_mark_recovering_orphan_dir(osb, slot); 2062 ocfs2_mark_recovering_orphan_dir(osb, slot);
2125 ret = ocfs2_queue_orphans(osb, slot, &inode); 2063 ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -2132,7 +2070,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
2132 2070
2133 while (inode) { 2071 while (inode) {
2134 oi = OCFS2_I(inode); 2072 oi = OCFS2_I(inode);
2135 mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno); 2073 trace_ocfs2_recover_orphans_iput(
2074 (unsigned long long)oi->ip_blkno);
2136 2075
2137 iter = oi->ip_next_orphan; 2076 iter = oi->ip_next_orphan;
2138 2077
@@ -2170,6 +2109,7 @@ static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
2170 * MOUNTED flag, but this is set right before 2109 * MOUNTED flag, but this is set right before
2171 * dismount_volume() so we can trust it. */ 2110 * dismount_volume() so we can trust it. */
2172 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) { 2111 if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
2112 trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
2173 mlog(0, "mount error, exiting!\n"); 2113 mlog(0, "mount error, exiting!\n");
2174 return -EBUSY; 2114 return -EBUSY;
2175 } 2115 }
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..68cf2f6d3c6a 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -215,7 +215,7 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
215 /* WARNING: This only kicks off a single 215 /* WARNING: This only kicks off a single
216 * checkpoint. If someone races you and adds more 216 * checkpoint. If someone races you and adds more
217 * metadata to the journal, you won't know, and will 217 * metadata to the journal, you won't know, and will
218 * wind up waiting *alot* longer than necessary. Right 218 * wind up waiting *a lot* longer than necessary. Right
219 * now we only use this in clear_inode so that's 219 * now we only use this in clear_inode so that's
220 * OK. */ 220 * OK. */
221 ocfs2_start_checkpoint(osb); 221 ocfs2_start_checkpoint(osb);
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
405 ocfs2_quota_trans_credits(sb); 405 ocfs2_quota_trans_credits(sb);
406} 406}
407 407
408/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + 408/* data block for new dir/symlink, allocation of directory block, dx_root
409 * bitmap block for the new bit) dx_root update for free list */ 409 * update for free list */
410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1) 410#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
411 411
412static inline int ocfs2_add_dir_index_credits(struct super_block *sb) 412static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
413{ 413{
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec6adbf8f551..210c35237548 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,7 +29,6 @@
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31 31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 32#include <cluster/masklog.h>
34 33
35#include "ocfs2.h" 34#include "ocfs2.h"
@@ -43,6 +42,7 @@
43#include "suballoc.h" 42#include "suballoc.h"
44#include "super.h" 43#include "super.h"
45#include "sysfile.h" 44#include "sysfile.h"
45#include "ocfs2_trace.h"
46 46
47#include "buffer_head_io.h" 47#include "buffer_head_io.h"
48 48
@@ -201,8 +201,7 @@ void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
201 la_max_mb = ocfs2_clusters_to_megabytes(sb, 201 la_max_mb = ocfs2_clusters_to_megabytes(sb,
202 ocfs2_local_alloc_size(sb) * 8); 202 ocfs2_local_alloc_size(sb) * 8);
203 203
204 mlog(0, "requested: %dM, max: %uM, default: %uM\n", 204 trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
205 requested_mb, la_max_mb, la_default_mb);
206 205
207 if (requested_mb == -1) { 206 if (requested_mb == -1) {
208 /* No user request - use defaults */ 207 /* No user request - use defaults */
@@ -276,8 +275,8 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
276 275
277 ret = 1; 276 ret = 1;
278bail: 277bail:
279 mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", 278 trace_ocfs2_alloc_should_use_local(
280 osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); 279 (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
281 spin_unlock(&osb->osb_lock); 280 spin_unlock(&osb->osb_lock);
282 return ret; 281 return ret;
283} 282}
@@ -291,8 +290,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
291 struct inode *inode = NULL; 290 struct inode *inode = NULL;
292 struct ocfs2_local_alloc *la; 291 struct ocfs2_local_alloc *la;
293 292
294 mlog_entry_void();
295
296 if (osb->local_alloc_bits == 0) 293 if (osb->local_alloc_bits == 0)
297 goto bail; 294 goto bail;
298 295
@@ -364,9 +361,10 @@ bail:
364 if (inode) 361 if (inode)
365 iput(inode); 362 iput(inode);
366 363
367 mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); 364 trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
368 365
369 mlog_exit(status); 366 if (status)
367 mlog_errno(status);
370 return status; 368 return status;
371} 369}
372 370
@@ -388,8 +386,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
388 struct ocfs2_dinode *alloc_copy = NULL; 386 struct ocfs2_dinode *alloc_copy = NULL;
389 struct ocfs2_dinode *alloc = NULL; 387 struct ocfs2_dinode *alloc = NULL;
390 388
391 mlog_entry_void();
392
393 cancel_delayed_work(&osb->la_enable_wq); 389 cancel_delayed_work(&osb->la_enable_wq);
394 flush_workqueue(ocfs2_wq); 390 flush_workqueue(ocfs2_wq);
395 391
@@ -482,8 +478,6 @@ out:
482 478
483 if (alloc_copy) 479 if (alloc_copy)
484 kfree(alloc_copy); 480 kfree(alloc_copy);
485
486 mlog_exit_void();
487} 481}
488 482
489/* 483/*
@@ -502,7 +496,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
502 struct inode *inode = NULL; 496 struct inode *inode = NULL;
503 struct ocfs2_dinode *alloc; 497 struct ocfs2_dinode *alloc;
504 498
505 mlog_entry("(slot_num = %d)\n", slot_num); 499 trace_ocfs2_begin_local_alloc_recovery(slot_num);
506 500
507 *alloc_copy = NULL; 501 *alloc_copy = NULL;
508 502
@@ -552,7 +546,8 @@ bail:
552 iput(inode); 546 iput(inode);
553 } 547 }
554 548
555 mlog_exit(status); 549 if (status)
550 mlog_errno(status);
556 return status; 551 return status;
557} 552}
558 553
@@ -570,8 +565,6 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
570 struct buffer_head *main_bm_bh = NULL; 565 struct buffer_head *main_bm_bh = NULL;
571 struct inode *main_bm_inode; 566 struct inode *main_bm_inode;
572 567
573 mlog_entry_void();
574
575 main_bm_inode = ocfs2_get_system_file_inode(osb, 568 main_bm_inode = ocfs2_get_system_file_inode(osb,
576 GLOBAL_BITMAP_SYSTEM_INODE, 569 GLOBAL_BITMAP_SYSTEM_INODE,
577 OCFS2_INVALID_SLOT); 570 OCFS2_INVALID_SLOT);
@@ -620,7 +613,8 @@ out_mutex:
620out: 613out:
621 if (!status) 614 if (!status)
622 ocfs2_init_steal_slots(osb); 615 ocfs2_init_steal_slots(osb);
623 mlog_exit(status); 616 if (status)
617 mlog_errno(status);
624 return status; 618 return status;
625} 619}
626 620
@@ -640,8 +634,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
640 struct inode *local_alloc_inode; 634 struct inode *local_alloc_inode;
641 unsigned int free_bits; 635 unsigned int free_bits;
642 636
643 mlog_entry_void();
644
645 BUG_ON(!ac); 637 BUG_ON(!ac);
646 638
647 local_alloc_inode = 639 local_alloc_inode =
@@ -712,10 +704,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
712 goto bail; 704 goto bail;
713 } 705 }
714 706
715 if (ac->ac_max_block)
716 mlog(0, "Calling in_range for max block %llu\n",
717 (unsigned long long)ac->ac_max_block);
718
719 ac->ac_inode = local_alloc_inode; 707 ac->ac_inode = local_alloc_inode;
720 /* We should never use localalloc from another slot */ 708 /* We should never use localalloc from another slot */
721 ac->ac_alloc_slot = osb->slot_num; 709 ac->ac_alloc_slot = osb->slot_num;
@@ -729,10 +717,12 @@ bail:
729 iput(local_alloc_inode); 717 iput(local_alloc_inode);
730 } 718 }
731 719
732 mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num, 720 trace_ocfs2_reserve_local_alloc_bits(
733 status); 721 (unsigned long long)ac->ac_max_block,
722 bits_wanted, osb->slot_num, status);
734 723
735 mlog_exit(status); 724 if (status)
725 mlog_errno(status);
736 return status; 726 return status;
737} 727}
738 728
@@ -749,7 +739,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
749 struct ocfs2_dinode *alloc; 739 struct ocfs2_dinode *alloc;
750 struct ocfs2_local_alloc *la; 740 struct ocfs2_local_alloc *la;
751 741
752 mlog_entry_void();
753 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL); 742 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
754 743
755 local_alloc_inode = ac->ac_inode; 744 local_alloc_inode = ac->ac_inode;
@@ -788,7 +777,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
788 ocfs2_journal_dirty(handle, osb->local_alloc_bh); 777 ocfs2_journal_dirty(handle, osb->local_alloc_bh);
789 778
790bail: 779bail:
791 mlog_exit(status); 780 if (status)
781 mlog_errno(status);
792 return status; 782 return status;
793} 783}
794 784
@@ -799,13 +789,11 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
799 u32 count = 0; 789 u32 count = 0;
800 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 790 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
801 791
802 mlog_entry_void();
803
804 buffer = la->la_bitmap; 792 buffer = la->la_bitmap;
805 for (i = 0; i < le16_to_cpu(la->la_size); i++) 793 for (i = 0; i < le16_to_cpu(la->la_size); i++)
806 count += hweight8(buffer[i]); 794 count += hweight8(buffer[i]);
807 795
808 mlog_exit(count); 796 trace_ocfs2_local_alloc_count_bits(count);
809 return count; 797 return count;
810} 798}
811 799
@@ -820,10 +808,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
820 void *bitmap = NULL; 808 void *bitmap = NULL;
821 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap; 809 struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
822 810
823 mlog_entry("(numbits wanted = %u)\n", *numbits);
824
825 if (!alloc->id1.bitmap1.i_total) { 811 if (!alloc->id1.bitmap1.i_total) {
826 mlog(0, "No bits in my window!\n");
827 bitoff = -1; 812 bitoff = -1;
828 goto bail; 813 goto bail;
829 } 814 }
@@ -883,8 +868,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
883 } 868 }
884 } 869 }
885 870
886 mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff, 871 trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
887 numfound);
888 872
889 if (numfound == *numbits) 873 if (numfound == *numbits)
890 bitoff = startoff - numfound; 874 bitoff = startoff - numfound;
@@ -895,7 +879,10 @@ bail:
895 if (local_resv) 879 if (local_resv)
896 ocfs2_resv_discard(resmap, resv); 880 ocfs2_resv_discard(resmap, resv);
897 881
898 mlog_exit(bitoff); 882 trace_ocfs2_local_alloc_find_clear_bits(*numbits,
883 le32_to_cpu(alloc->id1.bitmap1.i_total),
884 bitoff, numfound);
885
899 return bitoff; 886 return bitoff;
900} 887}
901 888
@@ -903,15 +890,12 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
903{ 890{
904 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 891 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
905 int i; 892 int i;
906 mlog_entry_void();
907 893
908 alloc->id1.bitmap1.i_total = 0; 894 alloc->id1.bitmap1.i_total = 0;
909 alloc->id1.bitmap1.i_used = 0; 895 alloc->id1.bitmap1.i_used = 0;
910 la->la_bm_off = 0; 896 la->la_bm_off = 0;
911 for(i = 0; i < le16_to_cpu(la->la_size); i++) 897 for(i = 0; i < le16_to_cpu(la->la_size); i++)
912 la->la_bitmap[i] = 0; 898 la->la_bitmap[i] = 0;
913
914 mlog_exit_void();
915} 899}
916 900
917#if 0 901#if 0
@@ -952,18 +936,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
952 void *bitmap; 936 void *bitmap;
953 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); 937 struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
954 938
955 mlog_entry("total = %u, used = %u\n", 939 trace_ocfs2_sync_local_to_main(
956 le32_to_cpu(alloc->id1.bitmap1.i_total), 940 le32_to_cpu(alloc->id1.bitmap1.i_total),
957 le32_to_cpu(alloc->id1.bitmap1.i_used)); 941 le32_to_cpu(alloc->id1.bitmap1.i_used));
958 942
959 if (!alloc->id1.bitmap1.i_total) { 943 if (!alloc->id1.bitmap1.i_total) {
960 mlog(0, "nothing to sync!\n");
961 goto bail; 944 goto bail;
962 } 945 }
963 946
964 if (le32_to_cpu(alloc->id1.bitmap1.i_used) == 947 if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
965 le32_to_cpu(alloc->id1.bitmap1.i_total)) { 948 le32_to_cpu(alloc->id1.bitmap1.i_total)) {
966 mlog(0, "all bits were taken!\n");
967 goto bail; 949 goto bail;
968 } 950 }
969 951
@@ -985,8 +967,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
985 ocfs2_clusters_to_blocks(osb->sb, 967 ocfs2_clusters_to_blocks(osb->sb,
986 start - count); 968 start - count);
987 969
988 mlog(0, "freeing %u bits starting at local alloc bit " 970 trace_ocfs2_sync_local_to_main_free(
989 "%u (la_start_blk = %llu, blkno = %llu)\n",
990 count, start - count, 971 count, start - count,
991 (unsigned long long)la_start_blk, 972 (unsigned long long)la_start_blk,
992 (unsigned long long)blkno); 973 (unsigned long long)blkno);
@@ -1007,7 +988,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
1007 } 988 }
1008 989
1009bail: 990bail:
1010 mlog_exit(status); 991 if (status)
992 mlog_errno(status);
1011 return status; 993 return status;
1012} 994}
1013 995
@@ -1132,7 +1114,8 @@ bail:
1132 *ac = NULL; 1114 *ac = NULL;
1133 } 1115 }
1134 1116
1135 mlog_exit(status); 1117 if (status)
1118 mlog_errno(status);
1136 return status; 1119 return status;
1137} 1120}
1138 1121
@@ -1148,17 +1131,12 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1148 struct ocfs2_dinode *alloc = NULL; 1131 struct ocfs2_dinode *alloc = NULL;
1149 struct ocfs2_local_alloc *la; 1132 struct ocfs2_local_alloc *la;
1150 1133
1151 mlog_entry_void();
1152
1153 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; 1134 alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
1154 la = OCFS2_LOCAL_ALLOC(alloc); 1135 la = OCFS2_LOCAL_ALLOC(alloc);
1155 1136
1156 if (alloc->id1.bitmap1.i_total) 1137 trace_ocfs2_local_alloc_new_window(
1157 mlog(0, "asking me to alloc a new window over a non-empty " 1138 le32_to_cpu(alloc->id1.bitmap1.i_total),
1158 "one\n"); 1139 osb->local_alloc_bits);
1159
1160 mlog(0, "Allocating %u clusters for a new window.\n",
1161 osb->local_alloc_bits);
1162 1140
1163 /* Instruct the allocation code to try the most recently used 1141 /* Instruct the allocation code to try the most recently used
1164 * cluster group. We'll re-record the group used this pass 1142 * cluster group. We'll re-record the group used this pass
@@ -1220,13 +1198,13 @@ retry_enospc:
1220 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count, 1198 ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
1221 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap); 1199 OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
1222 1200
1223 mlog(0, "New window allocated:\n"); 1201 trace_ocfs2_local_alloc_new_window_result(
1224 mlog(0, "window la_bm_off = %u\n", 1202 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
1225 OCFS2_LOCAL_ALLOC(alloc)->la_bm_off); 1203 le32_to_cpu(alloc->id1.bitmap1.i_total));
1226 mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
1227 1204
1228bail: 1205bail:
1229 mlog_exit(status); 1206 if (status)
1207 mlog_errno(status);
1230 return status; 1208 return status;
1231} 1209}
1232 1210
@@ -1243,8 +1221,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1243 struct ocfs2_dinode *alloc_copy = NULL; 1221 struct ocfs2_dinode *alloc_copy = NULL;
1244 struct ocfs2_alloc_context *ac = NULL; 1222 struct ocfs2_alloc_context *ac = NULL;
1245 1223
1246 mlog_entry_void();
1247
1248 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); 1224 ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
1249 1225
1250 /* This will lock the main bitmap for us. */ 1226 /* This will lock the main bitmap for us. */
@@ -1324,7 +1300,8 @@ bail:
1324 if (ac) 1300 if (ac)
1325 ocfs2_free_alloc_context(ac); 1301 ocfs2_free_alloc_context(ac);
1326 1302
1327 mlog_exit(status); 1303 if (status)
1304 mlog_errno(status);
1328 return status; 1305 return status;
1329} 1306}
1330 1307
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index b5cb3ede9408..e57c804069ea 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/fcntl.h> 27#include <linux/fcntl.h>
28 28
29#define MLOG_MASK_PREFIX ML_INODE
30#include <cluster/masklog.h> 29#include <cluster/masklog.h>
31 30
32#include "ocfs2.h" 31#include "ocfs2.h"
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7e32db9c2c99..3e9393ca39eb 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -31,7 +31,6 @@
31#include <linux/signal.h> 31#include <linux/signal.h>
32#include <linux/rbtree.h> 32#include <linux/rbtree.h>
33 33
34#define MLOG_MASK_PREFIX ML_FILE_IO
35#include <cluster/masklog.h> 34#include <cluster/masklog.h>
36 35
37#include "ocfs2.h" 36#include "ocfs2.h"
@@ -42,6 +41,7 @@
42#include "inode.h" 41#include "inode.h"
43#include "mmap.h" 42#include "mmap.h"
44#include "super.h" 43#include "super.h"
44#include "ocfs2_trace.h"
45 45
46 46
47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) 47static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
@@ -49,13 +49,12 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
49 sigset_t oldset; 49 sigset_t oldset;
50 int ret; 50 int ret;
51 51
52 mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
53
54 ocfs2_block_signals(&oldset); 52 ocfs2_block_signals(&oldset);
55 ret = filemap_fault(area, vmf); 53 ret = filemap_fault(area, vmf);
56 ocfs2_unblock_signals(&oldset); 54 ocfs2_unblock_signals(&oldset);
57 55
58 mlog_exit_ptr(vmf->page); 56 trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
57 area, vmf->page, vmf->pgoff);
59 return ret; 58 return ret;
60} 59}
61 60
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849fb4a2e814..e5d738cd9cc0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -42,7 +42,6 @@
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44 44
45#define MLOG_MASK_PREFIX ML_NAMEI
46#include <cluster/masklog.h> 45#include <cluster/masklog.h>
47 46
48#include "ocfs2.h" 47#include "ocfs2.h"
@@ -63,6 +62,7 @@
63#include "uptodate.h" 62#include "uptodate.h"
64#include "xattr.h" 63#include "xattr.h"
65#include "acl.h" 64#include "acl.h"
65#include "ocfs2_trace.h"
66 66
67#include "buffer_head_io.h" 67#include "buffer_head_io.h"
68 68
@@ -106,17 +106,15 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
106 struct dentry *ret; 106 struct dentry *ret;
107 struct ocfs2_inode_info *oi; 107 struct ocfs2_inode_info *oi;
108 108
109 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 109 trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
110 dentry->d_name.len, dentry->d_name.name); 110 dentry->d_name.name,
111 (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
111 112
112 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) { 113 if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
113 ret = ERR_PTR(-ENAMETOOLONG); 114 ret = ERR_PTR(-ENAMETOOLONG);
114 goto bail; 115 goto bail;
115 } 116 }
116 117
117 mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
118 dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
119
120 status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT); 118 status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
121 if (status < 0) { 119 if (status < 0) {
122 if (status != -ENOENT) 120 if (status != -ENOENT)
@@ -182,7 +180,7 @@ bail_unlock:
182 180
183bail: 181bail:
184 182
185 mlog_exit_ptr(ret); 183 trace_ocfs2_lookup_ret(ret);
186 184
187 return ret; 185 return ret;
188} 186}
@@ -235,9 +233,9 @@ static int ocfs2_mknod(struct inode *dir,
235 sigset_t oldset; 233 sigset_t oldset;
236 int did_block_signals = 0; 234 int did_block_signals = 0;
237 235
238 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, 236 trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
239 (unsigned long)dev, dentry->d_name.len, 237 (unsigned long long)OCFS2_I(dir)->ip_blkno,
240 dentry->d_name.name); 238 (unsigned long)dev, mode);
241 239
242 dquot_initialize(dir); 240 dquot_initialize(dir);
243 241
@@ -293,7 +291,7 @@ static int ocfs2_mknod(struct inode *dir,
293 } 291 }
294 292
295 /* get security xattr */ 293 /* get security xattr */
296 status = ocfs2_init_security_get(inode, dir, &si); 294 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
297 if (status) { 295 if (status) {
298 if (status == -EOPNOTSUPP) 296 if (status == -EOPNOTSUPP)
299 si.enable = 0; 297 si.enable = 0;
@@ -354,10 +352,6 @@ static int ocfs2_mknod(struct inode *dir,
354 goto leave; 352 goto leave;
355 did_quota_inode = 1; 353 did_quota_inode = 1;
356 354
357 mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
358 inode->i_mode, (unsigned long)dev, dentry->d_name.len,
359 dentry->d_name.name);
360
361 /* do the real work now. */ 355 /* do the real work now. */
362 status = ocfs2_mknod_locked(osb, dir, inode, dev, 356 status = ocfs2_mknod_locked(osb, dir, inode, dev,
363 &new_fe_bh, parent_fe_bh, handle, 357 &new_fe_bh, parent_fe_bh, handle,
@@ -436,9 +430,6 @@ leave:
436 if (did_block_signals) 430 if (did_block_signals)
437 ocfs2_unblock_signals(&oldset); 431 ocfs2_unblock_signals(&oldset);
438 432
439 if (status == -ENOSPC)
440 mlog(0, "Disk is full\n");
441
442 brelse(new_fe_bh); 433 brelse(new_fe_bh);
443 brelse(parent_fe_bh); 434 brelse(parent_fe_bh);
444 kfree(si.name); 435 kfree(si.name);
@@ -466,7 +457,8 @@ leave:
466 iput(inode); 457 iput(inode);
467 } 458 }
468 459
469 mlog_exit(status); 460 if (status)
461 mlog_errno(status);
470 462
471 return status; 463 return status;
472} 464}
@@ -577,7 +569,8 @@ leave:
577 } 569 }
578 } 570 }
579 571
580 mlog_exit(status); 572 if (status)
573 mlog_errno(status);
581 return status; 574 return status;
582} 575}
583 576
@@ -615,10 +608,11 @@ static int ocfs2_mkdir(struct inode *dir,
615{ 608{
616 int ret; 609 int ret;
617 610
618 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, 611 trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
619 dentry->d_name.len, dentry->d_name.name); 612 OCFS2_I(dir)->ip_blkno, mode);
620 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); 613 ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
621 mlog_exit(ret); 614 if (ret)
615 mlog_errno(ret);
622 616
623 return ret; 617 return ret;
624} 618}
@@ -630,10 +624,11 @@ static int ocfs2_create(struct inode *dir,
630{ 624{
631 int ret; 625 int ret;
632 626
633 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode, 627 trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
634 dentry->d_name.len, dentry->d_name.name); 628 (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
635 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); 629 ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
636 mlog_exit(ret); 630 if (ret)
631 mlog_errno(ret);
637 632
638 return ret; 633 return ret;
639} 634}
@@ -652,9 +647,9 @@ static int ocfs2_link(struct dentry *old_dentry,
652 struct ocfs2_dir_lookup_result lookup = { NULL, }; 647 struct ocfs2_dir_lookup_result lookup = { NULL, };
653 sigset_t oldset; 648 sigset_t oldset;
654 649
655 mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino, 650 trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
656 old_dentry->d_name.len, old_dentry->d_name.name, 651 old_dentry->d_name.len, old_dentry->d_name.name,
657 dentry->d_name.len, dentry->d_name.name); 652 dentry->d_name.len, dentry->d_name.name);
658 653
659 if (S_ISDIR(inode->i_mode)) 654 if (S_ISDIR(inode->i_mode))
660 return -EPERM; 655 return -EPERM;
@@ -757,7 +752,8 @@ out:
757 752
758 ocfs2_free_dir_lookup_result(&lookup); 753 ocfs2_free_dir_lookup_result(&lookup);
759 754
760 mlog_exit(err); 755 if (err)
756 mlog_errno(err);
761 757
762 return err; 758 return err;
763} 759}
@@ -809,19 +805,17 @@ static int ocfs2_unlink(struct inode *dir,
809 struct ocfs2_dir_lookup_result lookup = { NULL, }; 805 struct ocfs2_dir_lookup_result lookup = { NULL, };
810 struct ocfs2_dir_lookup_result orphan_insert = { NULL, }; 806 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
811 807
812 mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry, 808 trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
813 dentry->d_name.len, dentry->d_name.name); 809 dentry->d_name.name,
810 (unsigned long long)OCFS2_I(dir)->ip_blkno,
811 (unsigned long long)OCFS2_I(inode)->ip_blkno);
814 812
815 dquot_initialize(dir); 813 dquot_initialize(dir);
816 814
817 BUG_ON(dentry->d_parent->d_inode != dir); 815 BUG_ON(dentry->d_parent->d_inode != dir);
818 816
819 mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 817 if (inode == osb->root_inode)
820
821 if (inode == osb->root_inode) {
822 mlog(0, "Cannot delete the root directory\n");
823 return -EPERM; 818 return -EPERM;
824 }
825 819
826 status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1, 820 status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
827 OI_LS_PARENT); 821 OI_LS_PARENT);
@@ -843,9 +837,10 @@ static int ocfs2_unlink(struct inode *dir,
843 if (OCFS2_I(inode)->ip_blkno != blkno) { 837 if (OCFS2_I(inode)->ip_blkno != blkno) {
844 status = -ENOENT; 838 status = -ENOENT;
845 839
846 mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n", 840 trace_ocfs2_unlink_noent(
847 (unsigned long long)OCFS2_I(inode)->ip_blkno, 841 (unsigned long long)OCFS2_I(inode)->ip_blkno,
848 (unsigned long long)blkno, OCFS2_I(inode)->ip_flags); 842 (unsigned long long)blkno,
843 OCFS2_I(inode)->ip_flags);
849 goto leave; 844 goto leave;
850 } 845 }
851 846
@@ -954,7 +949,8 @@ leave:
954 ocfs2_free_dir_lookup_result(&orphan_insert); 949 ocfs2_free_dir_lookup_result(&orphan_insert);
955 ocfs2_free_dir_lookup_result(&lookup); 950 ocfs2_free_dir_lookup_result(&lookup);
956 951
957 mlog_exit(status); 952 if (status)
953 mlog_errno(status);
958 954
959 return status; 955 return status;
960} 956}
@@ -975,9 +971,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
975 struct buffer_head **tmpbh; 971 struct buffer_head **tmpbh;
976 struct inode *tmpinode; 972 struct inode *tmpinode;
977 973
978 mlog_entry("(inode1 = %llu, inode2 = %llu)\n", 974 trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
979 (unsigned long long)oi1->ip_blkno, 975 (unsigned long long)oi2->ip_blkno);
980 (unsigned long long)oi2->ip_blkno);
981 976
982 if (*bh1) 977 if (*bh1)
983 *bh1 = NULL; 978 *bh1 = NULL;
@@ -988,7 +983,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
988 if (oi1->ip_blkno != oi2->ip_blkno) { 983 if (oi1->ip_blkno != oi2->ip_blkno) {
989 if (oi1->ip_blkno < oi2->ip_blkno) { 984 if (oi1->ip_blkno < oi2->ip_blkno) {
990 /* switch id1 and id2 around */ 985 /* switch id1 and id2 around */
991 mlog(0, "switching them around...\n");
992 tmpbh = bh2; 986 tmpbh = bh2;
993 bh2 = bh1; 987 bh2 = bh1;
994 bh1 = tmpbh; 988 bh1 = tmpbh;
@@ -1024,8 +1018,13 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1024 mlog_errno(status); 1018 mlog_errno(status);
1025 } 1019 }
1026 1020
1021 trace_ocfs2_double_lock_end(
1022 (unsigned long long)OCFS2_I(inode1)->ip_blkno,
1023 (unsigned long long)OCFS2_I(inode2)->ip_blkno);
1024
1027bail: 1025bail:
1028 mlog_exit(status); 1026 if (status)
1027 mlog_errno(status);
1029 return status; 1028 return status;
1030} 1029}
1031 1030
@@ -1067,10 +1066,9 @@ static int ocfs2_rename(struct inode *old_dir,
1067 /* At some point it might be nice to break this function up a 1066 /* At some point it might be nice to break this function up a
1068 * bit. */ 1067 * bit. */
1069 1068
1070 mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n", 1069 trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
1071 old_dir, old_dentry, new_dir, new_dentry, 1070 old_dentry->d_name.len, old_dentry->d_name.name,
1072 old_dentry->d_name.len, old_dentry->d_name.name, 1071 new_dentry->d_name.len, new_dentry->d_name.name);
1073 new_dentry->d_name.len, new_dentry->d_name.name);
1074 1072
1075 dquot_initialize(old_dir); 1073 dquot_initialize(old_dir);
1076 dquot_initialize(new_dir); 1074 dquot_initialize(new_dir);
@@ -1227,16 +1225,15 @@ static int ocfs2_rename(struct inode *old_dir,
1227 if (!new_inode) { 1225 if (!new_inode) {
1228 status = -EACCES; 1226 status = -EACCES;
1229 1227
1230 mlog(0, "We found an inode for name %.*s but VFS " 1228 trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
1231 "didn't give us one.\n", new_dentry->d_name.len, 1229 new_dentry->d_name.name);
1232 new_dentry->d_name.name);
1233 goto bail; 1230 goto bail;
1234 } 1231 }
1235 1232
1236 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) { 1233 if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
1237 status = -EACCES; 1234 status = -EACCES;
1238 1235
1239 mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n", 1236 trace_ocfs2_rename_disagree(
1240 (unsigned long long)OCFS2_I(new_inode)->ip_blkno, 1237 (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
1241 (unsigned long long)newfe_blkno, 1238 (unsigned long long)newfe_blkno,
1242 OCFS2_I(new_inode)->ip_flags); 1239 OCFS2_I(new_inode)->ip_flags);
@@ -1259,8 +1256,7 @@ static int ocfs2_rename(struct inode *old_dir,
1259 1256
1260 newfe = (struct ocfs2_dinode *) newfe_bh->b_data; 1257 newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
1261 1258
1262 mlog(0, "aha rename over existing... new_blkno=%llu " 1259 trace_ocfs2_rename_over_existing(
1263 "newfebh=%p bhblocknr=%llu\n",
1264 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ? 1260 (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
1265 (unsigned long long)newfe_bh->b_blocknr : 0ULL); 1261 (unsigned long long)newfe_bh->b_blocknr : 0ULL);
1266 1262
@@ -1476,7 +1472,8 @@ bail:
1476 brelse(old_dir_bh); 1472 brelse(old_dir_bh);
1477 brelse(new_dir_bh); 1473 brelse(new_dir_bh);
1478 1474
1479 mlog_exit(status); 1475 if (status)
1476 mlog_errno(status);
1480 1477
1481 return status; 1478 return status;
1482} 1479}
@@ -1501,9 +1498,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
1501 * write i_size + 1 bytes. */ 1498 * write i_size + 1 bytes. */
1502 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 1499 blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
1503 1500
1504 mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n", 1501 trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
1505 (unsigned long long)inode->i_blocks, 1502 i_size_read(inode), blocks);
1506 i_size_read(inode), blocks);
1507 1503
1508 /* Sanity check -- make sure we're going to fit. */ 1504 /* Sanity check -- make sure we're going to fit. */
1509 if (bytes_left > 1505 if (bytes_left >
@@ -1579,7 +1575,8 @@ bail:
1579 kfree(bhs); 1575 kfree(bhs);
1580 } 1576 }
1581 1577
1582 mlog_exit(status); 1578 if (status)
1579 mlog_errno(status);
1583 return status; 1580 return status;
1584} 1581}
1585 1582
@@ -1610,8 +1607,8 @@ static int ocfs2_symlink(struct inode *dir,
1610 sigset_t oldset; 1607 sigset_t oldset;
1611 int did_block_signals = 0; 1608 int did_block_signals = 0;
1612 1609
1613 mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, 1610 trace_ocfs2_symlink_begin(dir, dentry, symname,
1614 dentry, symname, dentry->d_name.len, dentry->d_name.name); 1611 dentry->d_name.len, dentry->d_name.name);
1615 1612
1616 dquot_initialize(dir); 1613 dquot_initialize(dir);
1617 1614
@@ -1665,7 +1662,7 @@ static int ocfs2_symlink(struct inode *dir,
1665 } 1662 }
1666 1663
1667 /* get security xattr */ 1664 /* get security xattr */
1668 status = ocfs2_init_security_get(inode, dir, &si); 1665 status = ocfs2_init_security_get(inode, dir, &dentry->d_name, &si);
1669 if (status) { 1666 if (status) {
1670 if (status == -EOPNOTSUPP) 1667 if (status == -EOPNOTSUPP)
1671 si.enable = 0; 1668 si.enable = 0;
@@ -1713,9 +1710,10 @@ static int ocfs2_symlink(struct inode *dir,
1713 goto bail; 1710 goto bail;
1714 did_quota_inode = 1; 1711 did_quota_inode = 1;
1715 1712
1716 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, 1713 trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
1717 inode->i_mode, dentry->d_name.len, 1714 dentry->d_name.name,
1718 dentry->d_name.name); 1715 (unsigned long long)OCFS2_I(dir)->ip_blkno,
1716 inode->i_mode);
1719 1717
1720 status = ocfs2_mknod_locked(osb, dir, inode, 1718 status = ocfs2_mknod_locked(osb, dir, inode,
1721 0, &new_fe_bh, parent_fe_bh, handle, 1719 0, &new_fe_bh, parent_fe_bh, handle,
@@ -1835,7 +1833,8 @@ bail:
1835 iput(inode); 1833 iput(inode);
1836 } 1834 }
1837 1835
1838 mlog_exit(status); 1836 if (status)
1837 mlog_errno(status);
1839 1838
1840 return status; 1839 return status;
1841} 1840}
@@ -1844,8 +1843,6 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
1844{ 1843{
1845 int status, namelen; 1844 int status, namelen;
1846 1845
1847 mlog_entry_void();
1848
1849 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx", 1846 namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
1850 (long long)blkno); 1847 (long long)blkno);
1851 if (namelen <= 0) { 1848 if (namelen <= 0) {
@@ -1862,12 +1859,12 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
1862 goto bail; 1859 goto bail;
1863 } 1860 }
1864 1861
1865 mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name, 1862 trace_ocfs2_blkno_stringify(blkno, name, namelen);
1866 namelen);
1867 1863
1868 status = 0; 1864 status = 0;
1869bail: 1865bail:
1870 mlog_exit(status); 1866 if (status < 0)
1867 mlog_errno(status);
1871 return status; 1868 return status;
1872} 1869}
1873 1870
@@ -1980,7 +1977,8 @@ out:
1980 iput(orphan_dir_inode); 1977 iput(orphan_dir_inode);
1981 } 1978 }
1982 1979
1983 mlog_exit(ret); 1980 if (ret)
1981 mlog_errno(ret);
1984 return ret; 1982 return ret;
1985} 1983}
1986 1984
@@ -1997,7 +1995,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
1997 struct ocfs2_dinode *orphan_fe; 1995 struct ocfs2_dinode *orphan_fe;
1998 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 1996 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1999 1997
2000 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 1998 trace_ocfs2_orphan_add_begin(
1999 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2001 2000
2002 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); 2001 status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
2003 if (status < 0) { 2002 if (status < 0) {
@@ -2056,13 +2055,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2056 2055
2057 ocfs2_journal_dirty(handle, fe_bh); 2056 ocfs2_journal_dirty(handle, fe_bh);
2058 2057
2059 mlog(0, "Inode %llu orphaned in slot %d\n", 2058 trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
2060 (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); 2059 osb->slot_num);
2061 2060
2062leave: 2061leave:
2063 brelse(orphan_dir_bh); 2062 brelse(orphan_dir_bh);
2064 2063
2065 mlog_exit(status); 2064 if (status)
2065 mlog_errno(status);
2066 return status; 2066 return status;
2067} 2067}
2068 2068
@@ -2078,17 +2078,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2078 int status = 0; 2078 int status = 0;
2079 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2079 struct ocfs2_dir_lookup_result lookup = { NULL, };
2080 2080
2081 mlog_entry_void();
2082
2083 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2081 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2084 if (status < 0) { 2082 if (status < 0) {
2085 mlog_errno(status); 2083 mlog_errno(status);
2086 goto leave; 2084 goto leave;
2087 } 2085 }
2088 2086
2089 mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n", 2087 trace_ocfs2_orphan_del(
2090 name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2088 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2091 OCFS2_ORPHAN_NAMELEN); 2089 name, OCFS2_ORPHAN_NAMELEN);
2092 2090
2093 /* find it's spot in the orphan directory */ 2091 /* find it's spot in the orphan directory */
2094 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, 2092 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
@@ -2124,12 +2122,13 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2124leave: 2122leave:
2125 ocfs2_free_dir_lookup_result(&lookup); 2123 ocfs2_free_dir_lookup_result(&lookup);
2126 2124
2127 mlog_exit(status); 2125 if (status)
2126 mlog_errno(status);
2128 return status; 2127 return status;
2129} 2128}
2130 2129
2131/** 2130/**
2132 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to recieve a newly 2131 * ocfs2_prep_new_orphaned_file() - Prepare the orphan dir to receive a newly
2133 * allocated file. This is different from the typical 'add to orphan dir' 2132 * allocated file. This is different from the typical 'add to orphan dir'
2134 * operation in that the inode does not yet exist. This is a problem because 2133 * operation in that the inode does not yet exist. This is a problem because
2135 * the orphan dir stringifies the inode block number to come up with it's 2134 * the orphan dir stringifies the inode block number to come up with it's
@@ -2321,9 +2320,6 @@ leave:
2321 iput(orphan_dir); 2320 iput(orphan_dir);
2322 } 2321 }
2323 2322
2324 if (status == -ENOSPC)
2325 mlog(0, "Disk is full\n");
2326
2327 if ((status < 0) && inode) { 2323 if ((status < 0) && inode) {
2328 clear_nlink(inode); 2324 clear_nlink(inode);
2329 iput(inode); 2325 iput(inode);
@@ -2358,8 +2354,10 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2358 struct buffer_head *di_bh = NULL; 2354 struct buffer_head *di_bh = NULL;
2359 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2355 struct ocfs2_dir_lookup_result lookup = { NULL, };
2360 2356
2361 mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry, 2357 trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
2362 dentry->d_name.len, dentry->d_name.name); 2358 dentry->d_name.len, dentry->d_name.name,
2359 (unsigned long long)OCFS2_I(dir)->ip_blkno,
2360 (unsigned long long)OCFS2_I(inode)->ip_blkno);
2363 2361
2364 status = ocfs2_inode_lock(dir, &parent_di_bh, 1); 2362 status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
2365 if (status < 0) { 2363 if (status < 0) {
@@ -2476,7 +2474,8 @@ leave:
2476 2474
2477 ocfs2_free_dir_lookup_result(&lookup); 2475 ocfs2_free_dir_lookup_result(&lookup);
2478 2476
2479 mlog_exit(status); 2477 if (status)
2478 mlog_errno(status);
2480 2479
2481 return status; 2480 return status;
2482} 2481}
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 51cd6898e7f1..409285854f64 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -147,6 +147,17 @@ struct ocfs2_lock_res_ops;
147 147
148typedef void (*ocfs2_lock_callback)(int status, unsigned long data); 148typedef void (*ocfs2_lock_callback)(int status, unsigned long data);
149 149
150#ifdef CONFIG_OCFS2_FS_STATS
151struct ocfs2_lock_stats {
152 u64 ls_total; /* Total wait in NSEC */
153 u32 ls_gets; /* Num acquires */
154 u32 ls_fail; /* Num failed acquires */
155
156 /* Storing max wait in usecs saves 24 bytes per inode */
157 u32 ls_max; /* Max wait in USEC */
158};
159#endif
160
150struct ocfs2_lock_res { 161struct ocfs2_lock_res {
151 void *l_priv; 162 void *l_priv;
152 struct ocfs2_lock_res_ops *l_ops; 163 struct ocfs2_lock_res_ops *l_ops;
@@ -182,15 +193,9 @@ struct ocfs2_lock_res {
182 struct list_head l_debug_list; 193 struct list_head l_debug_list;
183 194
184#ifdef CONFIG_OCFS2_FS_STATS 195#ifdef CONFIG_OCFS2_FS_STATS
185 unsigned long long l_lock_num_prmode; /* PR acquires */ 196 struct ocfs2_lock_stats l_lock_prmode; /* PR mode stats */
186 unsigned long long l_lock_num_exmode; /* EX acquires */ 197 u32 l_lock_refresh; /* Disk refreshes */
187 unsigned int l_lock_num_prmode_failed; /* Failed PR gets */ 198 struct ocfs2_lock_stats l_lock_exmode; /* EX mode stats */
188 unsigned int l_lock_num_exmode_failed; /* Failed EX gets */
189 unsigned long long l_lock_total_prmode; /* Tot wait for PR */
190 unsigned long long l_lock_total_exmode; /* Tot wait for EX */
191 unsigned int l_lock_max_prmode; /* Max wait for PR */
192 unsigned int l_lock_max_exmode; /* Max wait for EX */
193 unsigned int l_lock_refresh; /* Disk refreshes */
194#endif 199#endif
195#ifdef CONFIG_DEBUG_LOCK_ALLOC 200#ifdef CONFIG_DEBUG_LOCK_ALLOC
196 struct lockdep_map l_lockdep_map; 201 struct lockdep_map l_lockdep_map;
@@ -831,18 +836,18 @@ static inline unsigned int ocfs2_clusters_to_megabytes(struct super_block *sb,
831 836
832static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) 837static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap)
833{ 838{
834 ext2_set_bit(bit, bitmap); 839 __test_and_set_bit_le(bit, bitmap);
835} 840}
836#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) 841#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr))
837 842
838static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) 843static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap)
839{ 844{
840 ext2_clear_bit(bit, bitmap); 845 __test_and_clear_bit_le(bit, bitmap);
841} 846}
842#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) 847#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr))
843 848
844#define ocfs2_test_bit ext2_test_bit 849#define ocfs2_test_bit test_bit_le
845#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit 850#define ocfs2_find_next_zero_bit find_next_zero_bit_le
846#define ocfs2_find_next_bit ext2_find_next_bit 851#define ocfs2_find_next_bit find_next_bit_le
847#endif /* OCFS2_H */ 852#endif /* OCFS2_H */
848 853
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index bf2e7764920e..b68f87a83924 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -441,7 +441,7 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
441struct ocfs2_block_check { 441struct ocfs2_block_check {
442/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ 442/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
443 __le16 bc_ecc; /* Single-error-correction parity vector. 443 __le16 bc_ecc; /* Single-error-correction parity vector.
444 This is a simple Hamming code dependant 444 This is a simple Hamming code dependent
445 on the blocksize. OCFS2's maximum 445 on the blocksize. OCFS2's maximum
446 blocksize, 4K, requires 16 parity bits, 446 blocksize, 4K, requires 16 parity bits,
447 so we fit in __le16. */ 447 so we fit in __le16. */
@@ -750,7 +750,7 @@ struct ocfs2_dinode {
750 after an unclean 750 after an unclean
751 shutdown */ 751 shutdown */
752 } journal1; 752 } journal1;
753 } id1; /* Inode type dependant 1 */ 753 } id1; /* Inode type dependent 1 */
754/*C0*/ union { 754/*C0*/ union {
755 struct ocfs2_super_block i_super; 755 struct ocfs2_super_block i_super;
756 struct ocfs2_local_alloc i_lab; 756 struct ocfs2_local_alloc i_lab;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 000000000000..a1dae5bb54ac
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,2739 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM ocfs2
3
4#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_OCFS2_H
6
7#include <linux/tracepoint.h>
8
9DECLARE_EVENT_CLASS(ocfs2__int,
10 TP_PROTO(int num),
11 TP_ARGS(num),
12 TP_STRUCT__entry(
13 __field(int, num)
14 ),
15 TP_fast_assign(
16 __entry->num = num;
17 ),
18 TP_printk("%d", __entry->num)
19);
20
21#define DEFINE_OCFS2_INT_EVENT(name) \
22DEFINE_EVENT(ocfs2__int, name, \
23 TP_PROTO(int num), \
24 TP_ARGS(num))
25
26DECLARE_EVENT_CLASS(ocfs2__uint,
27 TP_PROTO(unsigned int num),
28 TP_ARGS(num),
29 TP_STRUCT__entry(
30 __field( unsigned int, num )
31 ),
32 TP_fast_assign(
33 __entry->num = num;
34 ),
35 TP_printk("%u", __entry->num)
36);
37
38#define DEFINE_OCFS2_UINT_EVENT(name) \
39DEFINE_EVENT(ocfs2__uint, name, \
40 TP_PROTO(unsigned int num), \
41 TP_ARGS(num))
42
43DECLARE_EVENT_CLASS(ocfs2__ull,
44 TP_PROTO(unsigned long long blkno),
45 TP_ARGS(blkno),
46 TP_STRUCT__entry(
47 __field(unsigned long long, blkno)
48 ),
49 TP_fast_assign(
50 __entry->blkno = blkno;
51 ),
52 TP_printk("%llu", __entry->blkno)
53);
54
55#define DEFINE_OCFS2_ULL_EVENT(name) \
56DEFINE_EVENT(ocfs2__ull, name, \
57 TP_PROTO(unsigned long long num), \
58 TP_ARGS(num))
59
60DECLARE_EVENT_CLASS(ocfs2__pointer,
61 TP_PROTO(void *pointer),
62 TP_ARGS(pointer),
63 TP_STRUCT__entry(
64 __field(void *, pointer)
65 ),
66 TP_fast_assign(
67 __entry->pointer = pointer;
68 ),
69 TP_printk("%p", __entry->pointer)
70);
71
72#define DEFINE_OCFS2_POINTER_EVENT(name) \
73DEFINE_EVENT(ocfs2__pointer, name, \
74 TP_PROTO(void *pointer), \
75 TP_ARGS(pointer))
76
77DECLARE_EVENT_CLASS(ocfs2__string,
78 TP_PROTO(const char *name),
79 TP_ARGS(name),
80 TP_STRUCT__entry(
81 __string(name,name)
82 ),
83 TP_fast_assign(
84 __assign_str(name, name);
85 ),
86 TP_printk("%s", __get_str(name))
87);
88
89#define DEFINE_OCFS2_STRING_EVENT(name) \
90DEFINE_EVENT(ocfs2__string, name, \
91 TP_PROTO(const char *name), \
92 TP_ARGS(name))
93
94DECLARE_EVENT_CLASS(ocfs2__int_int,
95 TP_PROTO(int value1, int value2),
96 TP_ARGS(value1, value2),
97 TP_STRUCT__entry(
98 __field(int, value1)
99 __field(int, value2)
100 ),
101 TP_fast_assign(
102 __entry->value1 = value1;
103 __entry->value2 = value2;
104 ),
105 TP_printk("%d %d", __entry->value1, __entry->value2)
106);
107
108#define DEFINE_OCFS2_INT_INT_EVENT(name) \
109DEFINE_EVENT(ocfs2__int_int, name, \
110 TP_PROTO(int val1, int val2), \
111 TP_ARGS(val1, val2))
112
113DECLARE_EVENT_CLASS(ocfs2__uint_int,
114 TP_PROTO(unsigned int value1, int value2),
115 TP_ARGS(value1, value2),
116 TP_STRUCT__entry(
117 __field(unsigned int, value1)
118 __field(int, value2)
119 ),
120 TP_fast_assign(
121 __entry->value1 = value1;
122 __entry->value2 = value2;
123 ),
124 TP_printk("%u %d", __entry->value1, __entry->value2)
125);
126
127#define DEFINE_OCFS2_UINT_INT_EVENT(name) \
128DEFINE_EVENT(ocfs2__uint_int, name, \
129 TP_PROTO(unsigned int val1, int val2), \
130 TP_ARGS(val1, val2))
131
132DECLARE_EVENT_CLASS(ocfs2__uint_uint,
133 TP_PROTO(unsigned int value1, unsigned int value2),
134 TP_ARGS(value1, value2),
135 TP_STRUCT__entry(
136 __field(unsigned int, value1)
137 __field(unsigned int, value2)
138 ),
139 TP_fast_assign(
140 __entry->value1 = value1;
141 __entry->value2 = value2;
142 ),
143 TP_printk("%u %u", __entry->value1, __entry->value2)
144);
145
146#define DEFINE_OCFS2_UINT_UINT_EVENT(name) \
147DEFINE_EVENT(ocfs2__uint_uint, name, \
148 TP_PROTO(unsigned int val1, unsigned int val2), \
149 TP_ARGS(val1, val2))
150
151DECLARE_EVENT_CLASS(ocfs2__ull_uint,
152 TP_PROTO(unsigned long long value1, unsigned int value2),
153 TP_ARGS(value1, value2),
154 TP_STRUCT__entry(
155 __field(unsigned long long, value1)
156 __field(unsigned int, value2)
157 ),
158 TP_fast_assign(
159 __entry->value1 = value1;
160 __entry->value2 = value2;
161 ),
162 TP_printk("%llu %u", __entry->value1, __entry->value2)
163);
164
165#define DEFINE_OCFS2_ULL_UINT_EVENT(name) \
166DEFINE_EVENT(ocfs2__ull_uint, name, \
167 TP_PROTO(unsigned long long val1, unsigned int val2), \
168 TP_ARGS(val1, val2))
169
170DECLARE_EVENT_CLASS(ocfs2__ull_int,
171 TP_PROTO(unsigned long long value1, int value2),
172 TP_ARGS(value1, value2),
173 TP_STRUCT__entry(
174 __field(unsigned long long, value1)
175 __field(int, value2)
176 ),
177 TP_fast_assign(
178 __entry->value1 = value1;
179 __entry->value2 = value2;
180 ),
181 TP_printk("%llu %d", __entry->value1, __entry->value2)
182);
183
184#define DEFINE_OCFS2_ULL_INT_EVENT(name) \
185DEFINE_EVENT(ocfs2__ull_int, name, \
186 TP_PROTO(unsigned long long val1, int val2), \
187 TP_ARGS(val1, val2))
188
189DECLARE_EVENT_CLASS(ocfs2__ull_ull,
190 TP_PROTO(unsigned long long value1, unsigned long long value2),
191 TP_ARGS(value1, value2),
192 TP_STRUCT__entry(
193 __field(unsigned long long, value1)
194 __field(unsigned long long, value2)
195 ),
196 TP_fast_assign(
197 __entry->value1 = value1;
198 __entry->value2 = value2;
199 ),
200 TP_printk("%llu %llu", __entry->value1, __entry->value2)
201);
202
203#define DEFINE_OCFS2_ULL_ULL_EVENT(name) \
204DEFINE_EVENT(ocfs2__ull_ull, name, \
205 TP_PROTO(unsigned long long val1, unsigned long long val2), \
206 TP_ARGS(val1, val2))
207
208DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
209 TP_PROTO(unsigned long long value1,
210 unsigned long long value2, unsigned int value3),
211 TP_ARGS(value1, value2, value3),
212 TP_STRUCT__entry(
213 __field(unsigned long long, value1)
214 __field(unsigned long long, value2)
215 __field(unsigned int, value3)
216 ),
217 TP_fast_assign(
218 __entry->value1 = value1;
219 __entry->value2 = value2;
220 __entry->value3 = value3;
221 ),
222 TP_printk("%llu %llu %u",
223 __entry->value1, __entry->value2, __entry->value3)
224);
225
226#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name) \
227DEFINE_EVENT(ocfs2__ull_ull_uint, name, \
228 TP_PROTO(unsigned long long val1, \
229 unsigned long long val2, unsigned int val3), \
230 TP_ARGS(val1, val2, val3))
231
232DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
233 TP_PROTO(unsigned long long value1,
234 unsigned int value2, unsigned int value3),
235 TP_ARGS(value1, value2, value3),
236 TP_STRUCT__entry(
237 __field(unsigned long long, value1)
238 __field(unsigned int, value2)
239 __field(unsigned int, value3)
240 ),
241 TP_fast_assign(
242 __entry->value1 = value1;
243 __entry->value2 = value2;
244 __entry->value3 = value3;
245 ),
246 TP_printk("%llu %u %u", __entry->value1,
247 __entry->value2, __entry->value3)
248);
249
250#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name) \
251DEFINE_EVENT(ocfs2__ull_uint_uint, name, \
252 TP_PROTO(unsigned long long val1, \
253 unsigned int val2, unsigned int val3), \
254 TP_ARGS(val1, val2, val3))
255
256DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
257 TP_PROTO(unsigned int value1, unsigned int value2,
258 unsigned int value3),
259 TP_ARGS(value1, value2, value3),
260 TP_STRUCT__entry(
261 __field( unsigned int, value1 )
262 __field( unsigned int, value2 )
263 __field( unsigned int, value3 )
264 ),
265 TP_fast_assign(
266 __entry->value1 = value1;
267 __entry->value2 = value2;
268 __entry->value3 = value3;
269 ),
270 TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
271);
272
273#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name) \
274DEFINE_EVENT(ocfs2__uint_uint_uint, name, \
275 TP_PROTO(unsigned int value1, unsigned int value2, \
276 unsigned int value3), \
277 TP_ARGS(value1, value2, value3))
278
279DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
280 TP_PROTO(unsigned long long value1,
281 unsigned long long value2, unsigned long long value3),
282 TP_ARGS(value1, value2, value3),
283 TP_STRUCT__entry(
284 __field(unsigned long long, value1)
285 __field(unsigned long long, value2)
286 __field(unsigned long long, value3)
287 ),
288 TP_fast_assign(
289 __entry->value1 = value1;
290 __entry->value2 = value2;
291 __entry->value3 = value3;
292 ),
293 TP_printk("%llu %llu %llu",
294 __entry->value1, __entry->value2, __entry->value3)
295);
296
297#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name) \
298DEFINE_EVENT(ocfs2__ull_ull_ull, name, \
299 TP_PROTO(unsigned long long value1, unsigned long long value2, \
300 unsigned long long value3), \
301 TP_ARGS(value1, value2, value3))
302
303DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
304 TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
305 TP_ARGS(ull, value1, value2, value3),
306 TP_STRUCT__entry(
307 __field( unsigned long long, ull )
308 __field( int, value1 )
309 __field( int, value2 )
310 __field( int, value3 )
311 ),
312 TP_fast_assign(
313 __entry->ull = ull;
314 __entry->value1 = value1;
315 __entry->value2 = value2;
316 __entry->value3 = value3;
317 ),
318 TP_printk("%llu %d %d %d",
319 __entry->ull, __entry->value1,
320 __entry->value2, __entry->value3)
321);
322
323#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name) \
324DEFINE_EVENT(ocfs2__ull_int_int_int, name, \
325 TP_PROTO(unsigned long long ull, int value1, \
326 int value2, int value3), \
327 TP_ARGS(ull, value1, value2, value3))
328
329DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
330 TP_PROTO(unsigned long long ull, unsigned int value1,
331 unsigned int value2, unsigned int value3),
332 TP_ARGS(ull, value1, value2, value3),
333 TP_STRUCT__entry(
334 __field(unsigned long long, ull)
335 __field(unsigned int, value1)
336 __field(unsigned int, value2)
337 __field(unsigned int, value3)
338 ),
339 TP_fast_assign(
340 __entry->ull = ull;
341 __entry->value1 = value1;
342 __entry->value2 = value2;
343 __entry->value3 = value3;
344 ),
345 TP_printk("%llu %u %u %u",
346 __entry->ull, __entry->value1,
347 __entry->value2, __entry->value3)
348);
349
350#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name) \
351DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name, \
352 TP_PROTO(unsigned long long ull, unsigned int value1, \
353 unsigned int value2, unsigned int value3), \
354 TP_ARGS(ull, value1, value2, value3))
355
356DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
357 TP_PROTO(unsigned long long value1, unsigned long long value2,
358 unsigned int value3, unsigned int value4),
359 TP_ARGS(value1, value2, value3, value4),
360 TP_STRUCT__entry(
361 __field(unsigned long long, value1)
362 __field(unsigned long long, value2)
363 __field(unsigned int, value3)
364 __field(unsigned int, value4)
365 ),
366 TP_fast_assign(
367 __entry->value1 = value1;
368 __entry->value2 = value2;
369 __entry->value3 = value3;
370 __entry->value4 = value4;
371 ),
372 TP_printk("%llu %llu %u %u",
373 __entry->value1, __entry->value2,
374 __entry->value3, __entry->value4)
375);
376
377#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name) \
378DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name, \
379 TP_PROTO(unsigned long long ull, unsigned long long ull1, \
380 unsigned int value2, unsigned int value3), \
381 TP_ARGS(ull, ull1, value2, value3))
382
383/* Trace events for fs/ocfs2/alloc.c. */
384DECLARE_EVENT_CLASS(ocfs2__btree_ops,
385 TP_PROTO(unsigned long long owner,\
386 unsigned int value1, unsigned int value2),
387 TP_ARGS(owner, value1, value2),
388 TP_STRUCT__entry(
389 __field(unsigned long long, owner)
390 __field(unsigned int, value1)
391 __field(unsigned int, value2)
392 ),
393 TP_fast_assign(
394 __entry->owner = owner;
395 __entry->value1 = value1;
396 __entry->value2 = value2;
397 ),
398 TP_printk("%llu %u %u",
399 __entry->owner, __entry->value1, __entry->value2)
400);
401
402#define DEFINE_OCFS2_BTREE_EVENT(name) \
403DEFINE_EVENT(ocfs2__btree_ops, name, \
404 TP_PROTO(unsigned long long owner, \
405 unsigned int value1, unsigned int value2), \
406 TP_ARGS(owner, value1, value2))
407
408DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
409
410DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
411
412DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
413
414DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
415
416DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
417
418DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
419
420DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
421
422TRACE_EVENT(ocfs2_grow_tree,
423 TP_PROTO(unsigned long long owner, int depth),
424 TP_ARGS(owner, depth),
425 TP_STRUCT__entry(
426 __field(unsigned long long, owner)
427 __field(int, depth)
428 ),
429 TP_fast_assign(
430 __entry->owner = owner;
431 __entry->depth = depth;
432 ),
433 TP_printk("%llu %d", __entry->owner, __entry->depth)
434);
435
436TRACE_EVENT(ocfs2_rotate_subtree,
437 TP_PROTO(int subtree_root, unsigned long long blkno,
438 int depth),
439 TP_ARGS(subtree_root, blkno, depth),
440 TP_STRUCT__entry(
441 __field(int, subtree_root)
442 __field(unsigned long long, blkno)
443 __field(int, depth)
444 ),
445 TP_fast_assign(
446 __entry->subtree_root = subtree_root;
447 __entry->blkno = blkno;
448 __entry->depth = depth;
449 ),
450 TP_printk("%d %llu %d", __entry->subtree_root,
451 __entry->blkno, __entry->depth)
452);
453
454TRACE_EVENT(ocfs2_insert_extent,
455 TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
456 int ins_contig_index, int free_records, int ins_tree_depth),
457 TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
458 ins_tree_depth),
459 TP_STRUCT__entry(
460 __field(unsigned int, ins_appending)
461 __field(unsigned int, ins_contig)
462 __field(int, ins_contig_index)
463 __field(int, free_records)
464 __field(int, ins_tree_depth)
465 ),
466 TP_fast_assign(
467 __entry->ins_appending = ins_appending;
468 __entry->ins_contig = ins_contig;
469 __entry->ins_contig_index = ins_contig_index;
470 __entry->free_records = free_records;
471 __entry->ins_tree_depth = ins_tree_depth;
472 ),
473 TP_printk("%u %u %d %d %d",
474 __entry->ins_appending, __entry->ins_contig,
475 __entry->ins_contig_index, __entry->free_records,
476 __entry->ins_tree_depth)
477);
478
479TRACE_EVENT(ocfs2_split_extent,
480 TP_PROTO(int split_index, unsigned int c_contig_type,
481 unsigned int c_has_empty_extent,
482 unsigned int c_split_covers_rec),
483 TP_ARGS(split_index, c_contig_type,
484 c_has_empty_extent, c_split_covers_rec),
485 TP_STRUCT__entry(
486 __field(int, split_index)
487 __field(unsigned int, c_contig_type)
488 __field(unsigned int, c_has_empty_extent)
489 __field(unsigned int, c_split_covers_rec)
490 ),
491 TP_fast_assign(
492 __entry->split_index = split_index;
493 __entry->c_contig_type = c_contig_type;
494 __entry->c_has_empty_extent = c_has_empty_extent;
495 __entry->c_split_covers_rec = c_split_covers_rec;
496 ),
497 TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
498 __entry->c_has_empty_extent, __entry->c_split_covers_rec)
499);
500
501TRACE_EVENT(ocfs2_remove_extent,
502 TP_PROTO(unsigned long long owner, unsigned int cpos,
503 unsigned int len, int index,
504 unsigned int e_cpos, unsigned int clusters),
505 TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
506 TP_STRUCT__entry(
507 __field(unsigned long long, owner)
508 __field(unsigned int, cpos)
509 __field(unsigned int, len)
510 __field(int, index)
511 __field(unsigned int, e_cpos)
512 __field(unsigned int, clusters)
513 ),
514 TP_fast_assign(
515 __entry->owner = owner;
516 __entry->cpos = cpos;
517 __entry->len = len;
518 __entry->index = index;
519 __entry->e_cpos = e_cpos;
520 __entry->clusters = clusters;
521 ),
522 TP_printk("%llu %u %u %d %u %u",
523 __entry->owner, __entry->cpos, __entry->len, __entry->index,
524 __entry->e_cpos, __entry->clusters)
525);
526
527TRACE_EVENT(ocfs2_commit_truncate,
528 TP_PROTO(unsigned long long ino, unsigned int new_cpos,
529 unsigned int clusters, unsigned int depth),
530 TP_ARGS(ino, new_cpos, clusters, depth),
531 TP_STRUCT__entry(
532 __field(unsigned long long, ino)
533 __field(unsigned int, new_cpos)
534 __field(unsigned int, clusters)
535 __field(unsigned int, depth)
536 ),
537 TP_fast_assign(
538 __entry->ino = ino;
539 __entry->new_cpos = new_cpos;
540 __entry->clusters = clusters;
541 __entry->depth = depth;
542 ),
543 TP_printk("%llu %u %u %u",
544 __entry->ino, __entry->new_cpos,
545 __entry->clusters, __entry->depth)
546);
547
548TRACE_EVENT(ocfs2_validate_extent_block,
549 TP_PROTO(unsigned long long blkno),
550 TP_ARGS(blkno),
551 TP_STRUCT__entry(
552 __field(unsigned long long, blkno)
553 ),
554 TP_fast_assign(
555 __entry->blkno = blkno;
556 ),
557 TP_printk("%llu ", __entry->blkno)
558);
559
560TRACE_EVENT(ocfs2_rotate_leaf,
561 TP_PROTO(unsigned int insert_cpos, int insert_index,
562 int has_empty, int next_free,
563 unsigned int l_count),
564 TP_ARGS(insert_cpos, insert_index, has_empty,
565 next_free, l_count),
566 TP_STRUCT__entry(
567 __field(unsigned int, insert_cpos)
568 __field(int, insert_index)
569 __field(int, has_empty)
570 __field(int, next_free)
571 __field(unsigned int, l_count)
572 ),
573 TP_fast_assign(
574 __entry->insert_cpos = insert_cpos;
575 __entry->insert_index = insert_index;
576 __entry->has_empty = has_empty;
577 __entry->next_free = next_free;
578 __entry->l_count = l_count;
579 ),
580 TP_printk("%u %d %d %d %u", __entry->insert_cpos,
581 __entry->insert_index, __entry->has_empty,
582 __entry->next_free, __entry->l_count)
583);
584
585TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
586 TP_PROTO(int status, int reason, int err),
587 TP_ARGS(status, reason, err),
588 TP_STRUCT__entry(
589 __field(int, status)
590 __field(int, reason)
591 __field(int, err)
592 ),
593 TP_fast_assign(
594 __entry->status = status;
595 __entry->reason = reason;
596 __entry->err = err;
597 ),
598 TP_printk("%d %d %d", __entry->status,
599 __entry->reason, __entry->err)
600);
601
602TRACE_EVENT(ocfs2_mark_extent_written,
603 TP_PROTO(unsigned long long owner, unsigned int cpos,
604 unsigned int len, unsigned int phys),
605 TP_ARGS(owner, cpos, len, phys),
606 TP_STRUCT__entry(
607 __field(unsigned long long, owner)
608 __field(unsigned int, cpos)
609 __field(unsigned int, len)
610 __field(unsigned int, phys)
611 ),
612 TP_fast_assign(
613 __entry->owner = owner;
614 __entry->cpos = cpos;
615 __entry->len = len;
616 __entry->phys = phys;
617 ),
618 TP_printk("%llu %u %u %u",
619 __entry->owner, __entry->cpos,
620 __entry->len, __entry->phys)
621);
622
623DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
624 TP_PROTO(unsigned long long blkno, int index,
625 unsigned int start, unsigned int num),
626 TP_ARGS(blkno, index, start, num),
627 TP_STRUCT__entry(
628 __field(unsigned long long, blkno)
629 __field(int, index)
630 __field(unsigned int, start)
631 __field(unsigned int, num)
632 ),
633 TP_fast_assign(
634 __entry->blkno = blkno;
635 __entry->index = index;
636 __entry->start = start;
637 __entry->num = num;
638 ),
639 TP_printk("%llu %d %u %u",
640 __entry->blkno, __entry->index,
641 __entry->start, __entry->num)
642);
643
644#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name) \
645DEFINE_EVENT(ocfs2__truncate_log_ops, name, \
646 TP_PROTO(unsigned long long blkno, int index, \
647 unsigned int start, unsigned int num), \
648 TP_ARGS(blkno, index, start, num))
649
650DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
651
652DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
653
654DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
655
656DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
657
658DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
659
660DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
661
662DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
663
664DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
665
666DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
667
668TRACE_EVENT(ocfs2_cache_block_dealloc,
669 TP_PROTO(int type, int slot, unsigned long long suballoc,
670 unsigned long long blkno, unsigned int bit),
671 TP_ARGS(type, slot, suballoc, blkno, bit),
672 TP_STRUCT__entry(
673 __field(int, type)
674 __field(int, slot)
675 __field(unsigned long long, suballoc)
676 __field(unsigned long long, blkno)
677 __field(unsigned int, bit)
678 ),
679 TP_fast_assign(
680 __entry->type = type;
681 __entry->slot = slot;
682 __entry->suballoc = suballoc;
683 __entry->blkno = blkno;
684 __entry->bit = bit;
685 ),
686 TP_printk("%d %d %llu %llu %u",
687 __entry->type, __entry->slot, __entry->suballoc,
688 __entry->blkno, __entry->bit)
689);
690
691/* End of trace events for fs/ocfs2/alloc.c. */
692
693/* Trace events for fs/ocfs2/localalloc.c. */
694
695DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
696
697DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
698
699DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
700
701DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
702
703DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
704
705DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
706
707DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
708
709DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
710
711DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
712
713TRACE_EVENT(ocfs2_sync_local_to_main_free,
714 TP_PROTO(int count, int bit, unsigned long long start_blk,
715 unsigned long long blkno),
716 TP_ARGS(count, bit, start_blk, blkno),
717 TP_STRUCT__entry(
718 __field(int, count)
719 __field(int, bit)
720 __field(unsigned long long, start_blk)
721 __field(unsigned long long, blkno)
722 ),
723 TP_fast_assign(
724 __entry->count = count;
725 __entry->bit = bit;
726 __entry->start_blk = start_blk;
727 __entry->blkno = blkno;
728 ),
729 TP_printk("%d %d %llu %llu",
730 __entry->count, __entry->bit, __entry->start_blk,
731 __entry->blkno)
732);
733
734DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
735
736DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
737
738/* End of trace events for fs/ocfs2/localalloc.c. */
739
/* Trace events for fs/ocfs2/resize.c. */

DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);

DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);

DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);

/* End of trace events for fs/ocfs2/resize.c. */
749
750/* Trace events for fs/ocfs2/suballoc.c. */
751
752DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
753
754DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
755
756DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
757
758DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
759
760DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
761
762DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
763
764DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
765
766DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
767
768TRACE_EVENT(ocfs2_relink_block_group,
769 TP_PROTO(unsigned long long i_blkno, unsigned int chain,
770 unsigned long long bg_blkno,
771 unsigned long long prev_blkno),
772 TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
773 TP_STRUCT__entry(
774 __field(unsigned long long, i_blkno)
775 __field(unsigned int, chain)
776 __field(unsigned long long, bg_blkno)
777 __field(unsigned long long, prev_blkno)
778 ),
779 TP_fast_assign(
780 __entry->i_blkno = i_blkno;
781 __entry->chain = chain;
782 __entry->bg_blkno = bg_blkno;
783 __entry->prev_blkno = prev_blkno;
784 ),
785 TP_printk("%llu %u %llu %llu",
786 __entry->i_blkno, __entry->chain, __entry->bg_blkno,
787 __entry->prev_blkno)
788);
789
790DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
791
792DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
793
794DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
795
796DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
797
798DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
799
800DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
801
802DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
803
804DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
805
806DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
807
808TRACE_EVENT(ocfs2_free_suballoc_bits,
809 TP_PROTO(unsigned long long inode, unsigned long long group,
810 unsigned int start_bit, unsigned int count),
811 TP_ARGS(inode, group, start_bit, count),
812 TP_STRUCT__entry(
813 __field(unsigned long long, inode)
814 __field(unsigned long long, group)
815 __field(unsigned int, start_bit)
816 __field(unsigned int, count)
817 ),
818 TP_fast_assign(
819 __entry->inode = inode;
820 __entry->group = group;
821 __entry->start_bit = start_bit;
822 __entry->count = count;
823 ),
824 TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
825 __entry->start_bit, __entry->count)
826);
827
828TRACE_EVENT(ocfs2_free_clusters,
829 TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
830 unsigned int start_bit, unsigned int count),
831 TP_ARGS(bg_blkno, start_blk, start_bit, count),
832 TP_STRUCT__entry(
833 __field(unsigned long long, bg_blkno)
834 __field(unsigned long long, start_blk)
835 __field(unsigned int, start_bit)
836 __field(unsigned int, count)
837 ),
838 TP_fast_assign(
839 __entry->bg_blkno = bg_blkno;
840 __entry->start_blk = start_blk;
841 __entry->start_bit = start_bit;
842 __entry->count = count;
843 ),
844 TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
845 __entry->start_bit, __entry->count)
846);
847
848DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
849
850DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
851
852DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
853
854/* End of trace events for fs/ocfs2/suballoc.c. */
855
856/* Trace events for fs/ocfs2/refcounttree.c. */
857
858DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
859
860DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
861
862DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
863
864DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
865
866DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
867
868DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
869
870DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
871
872DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
873
874DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
875 TP_PROTO(unsigned long long blkno, int index,
876 unsigned long long cpos,
877 unsigned int clusters, unsigned int refcount),
878 TP_ARGS(blkno, index, cpos, clusters, refcount),
879 TP_STRUCT__entry(
880 __field(unsigned long long, blkno)
881 __field(int, index)
882 __field(unsigned long long, cpos)
883 __field(unsigned int, clusters)
884 __field(unsigned int, refcount)
885 ),
886 TP_fast_assign(
887 __entry->blkno = blkno;
888 __entry->index = index;
889 __entry->cpos = cpos;
890 __entry->clusters = clusters;
891 __entry->refcount = refcount;
892 ),
893 TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
894 __entry->cpos, __entry->clusters, __entry->refcount)
895);
896
897#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name) \
898DEFINE_EVENT(ocfs2__refcount_tree_ops, name, \
899 TP_PROTO(unsigned long long blkno, int index, \
900 unsigned long long cpos, \
901 unsigned int count, unsigned int refcount), \
902 TP_ARGS(blkno, index, cpos, count, refcount))
903
904DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
905
906TRACE_EVENT(ocfs2_split_refcount_rec,
907 TP_PROTO(unsigned long long cpos,
908 unsigned int clusters, unsigned int refcount,
909 unsigned long long split_cpos,
910 unsigned int split_clusters, unsigned int split_refcount),
911 TP_ARGS(cpos, clusters, refcount,
912 split_cpos, split_clusters, split_refcount),
913 TP_STRUCT__entry(
914 __field(unsigned long long, cpos)
915 __field(unsigned int, clusters)
916 __field(unsigned int, refcount)
917 __field(unsigned long long, split_cpos)
918 __field(unsigned int, split_clusters)
919 __field(unsigned int, split_refcount)
920 ),
921 TP_fast_assign(
922 __entry->cpos = cpos;
923 __entry->clusters = clusters;
924 __entry->refcount = refcount;
925 __entry->split_cpos = split_cpos;
926 __entry->split_clusters = split_clusters;
927 __entry->split_refcount = split_refcount;
928 ),
929 TP_printk("%llu %u %u %llu %u %u",
930 __entry->cpos, __entry->clusters, __entry->refcount,
931 __entry->split_cpos, __entry->split_clusters,
932 __entry->split_refcount)
933);
934
935DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
936
937DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
938
939DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
940
941DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
942
943DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
944
945DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
946
947DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
948
949DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
950
951TRACE_EVENT(ocfs2_decrease_refcount,
952 TP_PROTO(unsigned long long owner,
953 unsigned long long cpos,
954 unsigned int len, int delete),
955 TP_ARGS(owner, cpos, len, delete),
956 TP_STRUCT__entry(
957 __field(unsigned long long, owner)
958 __field(unsigned long long, cpos)
959 __field(unsigned int, len)
960 __field(int, delete)
961 ),
962 TP_fast_assign(
963 __entry->owner = owner;
964 __entry->cpos = cpos;
965 __entry->len = len;
966 __entry->delete = delete;
967 ),
968 TP_printk("%llu %llu %u %d",
969 __entry->owner, __entry->cpos, __entry->len, __entry->delete)
970);
971
972DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
973
974DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
975
976TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
977 TP_PROTO(int recs_add, unsigned long long cpos,
978 unsigned int clusters, unsigned long long r_cpos,
979 unsigned int r_clusters, unsigned int refcount, int index),
980 TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
981 TP_STRUCT__entry(
982 __field(int, recs_add)
983 __field(unsigned long long, cpos)
984 __field(unsigned int, clusters)
985 __field(unsigned long long, r_cpos)
986 __field(unsigned int, r_clusters)
987 __field(unsigned int, refcount)
988 __field(int, index)
989 ),
990 TP_fast_assign(
991 __entry->recs_add = recs_add;
992 __entry->cpos = cpos;
993 __entry->clusters = clusters;
994 __entry->r_cpos = r_cpos;
995 __entry->r_clusters = r_clusters;
996 __entry->refcount = refcount;
997 __entry->index = index;
998 ),
999 TP_printk("%d %llu %u %llu %u %u %d",
1000 __entry->recs_add, __entry->cpos, __entry->clusters,
1001 __entry->r_cpos, __entry->r_clusters,
1002 __entry->refcount, __entry->index)
1003);
1004
1005DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
1006
1007DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
1008
1009DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
1010
1011DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
1012
1013DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
1014
1015TRACE_EVENT(ocfs2_clear_ext_refcount,
1016 TP_PROTO(unsigned long long ino, unsigned int cpos,
1017 unsigned int len, unsigned int p_cluster,
1018 unsigned int ext_flags),
1019 TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
1020 TP_STRUCT__entry(
1021 __field(unsigned long long, ino)
1022 __field(unsigned int, cpos)
1023 __field(unsigned int, len)
1024 __field(unsigned int, p_cluster)
1025 __field(unsigned int, ext_flags)
1026 ),
1027 TP_fast_assign(
1028 __entry->ino = ino;
1029 __entry->cpos = cpos;
1030 __entry->len = len;
1031 __entry->p_cluster = p_cluster;
1032 __entry->ext_flags = ext_flags;
1033 ),
1034 TP_printk("%llu %u %u %u %u",
1035 __entry->ino, __entry->cpos, __entry->len,
1036 __entry->p_cluster, __entry->ext_flags)
1037);
1038
1039TRACE_EVENT(ocfs2_replace_clusters,
1040 TP_PROTO(unsigned long long ino, unsigned int cpos,
1041 unsigned int old, unsigned int new, unsigned int len,
1042 unsigned int ext_flags),
1043 TP_ARGS(ino, cpos, old, new, len, ext_flags),
1044 TP_STRUCT__entry(
1045 __field(unsigned long long, ino)
1046 __field(unsigned int, cpos)
1047 __field(unsigned int, old)
1048 __field(unsigned int, new)
1049 __field(unsigned int, len)
1050 __field(unsigned int, ext_flags)
1051 ),
1052 TP_fast_assign(
1053 __entry->ino = ino;
1054 __entry->cpos = cpos;
1055 __entry->old = old;
1056 __entry->new = new;
1057 __entry->len = len;
1058 __entry->ext_flags = ext_flags;
1059 ),
1060 TP_printk("%llu %u %u %u %u %u",
1061 __entry->ino, __entry->cpos, __entry->old, __entry->new,
1062 __entry->len, __entry->ext_flags)
1063);
1064
1065DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
1066
1067TRACE_EVENT(ocfs2_refcount_cow_hunk,
1068 TP_PROTO(unsigned long long ino, unsigned int cpos,
1069 unsigned int write_len, unsigned int max_cpos,
1070 unsigned int cow_start, unsigned int cow_len),
1071 TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
1072 TP_STRUCT__entry(
1073 __field(unsigned long long, ino)
1074 __field(unsigned int, cpos)
1075 __field(unsigned int, write_len)
1076 __field(unsigned int, max_cpos)
1077 __field(unsigned int, cow_start)
1078 __field(unsigned int, cow_len)
1079 ),
1080 TP_fast_assign(
1081 __entry->ino = ino;
1082 __entry->cpos = cpos;
1083 __entry->write_len = write_len;
1084 __entry->max_cpos = max_cpos;
1085 __entry->cow_start = cow_start;
1086 __entry->cow_len = cow_len;
1087 ),
1088 TP_printk("%llu %u %u %u %u %u",
1089 __entry->ino, __entry->cpos, __entry->write_len,
1090 __entry->max_cpos, __entry->cow_start, __entry->cow_len)
1091);
1092
1093/* End of trace events for fs/ocfs2/refcounttree.c. */
1094
1095/* Trace events for fs/ocfs2/aops.c. */
1096
1097DECLARE_EVENT_CLASS(ocfs2__get_block,
1098 TP_PROTO(unsigned long long ino, unsigned long long iblock,
1099 void *bh_result, int create),
1100 TP_ARGS(ino, iblock, bh_result, create),
1101 TP_STRUCT__entry(
1102 __field(unsigned long long, ino)
1103 __field(unsigned long long, iblock)
1104 __field(void *, bh_result)
1105 __field(int, create)
1106 ),
1107 TP_fast_assign(
1108 __entry->ino = ino;
1109 __entry->iblock = iblock;
1110 __entry->bh_result = bh_result;
1111 __entry->create = create;
1112 ),
1113 TP_printk("%llu %llu %p %d",
1114 __entry->ino, __entry->iblock,
1115 __entry->bh_result, __entry->create)
1116);
1117
1118#define DEFINE_OCFS2_GET_BLOCK_EVENT(name) \
1119DEFINE_EVENT(ocfs2__get_block, name, \
1120 TP_PROTO(unsigned long long ino, unsigned long long iblock, \
1121 void *bh_result, int create), \
1122 TP_ARGS(ino, iblock, bh_result, create))
1123
1124DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
1125
1126DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
1127
1128DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
1129
1130DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
1131
1132DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
1133
1134DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
1135
1136TRACE_EVENT(ocfs2_try_to_write_inline_data,
1137 TP_PROTO(unsigned long long ino, unsigned int len,
1138 unsigned long long pos, unsigned int flags),
1139 TP_ARGS(ino, len, pos, flags),
1140 TP_STRUCT__entry(
1141 __field(unsigned long long, ino)
1142 __field(unsigned int, len)
1143 __field(unsigned long long, pos)
1144 __field(unsigned int, flags)
1145 ),
1146 TP_fast_assign(
1147 __entry->ino = ino;
1148 __entry->len = len;
1149 __entry->pos = pos;
1150 __entry->flags = flags;
1151 ),
1152 TP_printk("%llu %u %llu 0x%x",
1153 __entry->ino, __entry->len, __entry->pos, __entry->flags)
1154);
1155
1156TRACE_EVENT(ocfs2_write_begin_nolock,
1157 TP_PROTO(unsigned long long ino,
1158 long long i_size, unsigned int i_clusters,
1159 unsigned long long pos, unsigned int len,
1160 unsigned int flags, void *page,
1161 unsigned int clusters, unsigned int extents_to_split),
1162 TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
1163 page, clusters, extents_to_split),
1164 TP_STRUCT__entry(
1165 __field(unsigned long long, ino)
1166 __field(long long, i_size)
1167 __field(unsigned int, i_clusters)
1168 __field(unsigned long long, pos)
1169 __field(unsigned int, len)
1170 __field(unsigned int, flags)
1171 __field(void *, page)
1172 __field(unsigned int, clusters)
1173 __field(unsigned int, extents_to_split)
1174 ),
1175 TP_fast_assign(
1176 __entry->ino = ino;
1177 __entry->i_size = i_size;
1178 __entry->i_clusters = i_clusters;
1179 __entry->pos = pos;
1180 __entry->len = len;
1181 __entry->flags = flags;
1182 __entry->page = page;
1183 __entry->clusters = clusters;
1184 __entry->extents_to_split = extents_to_split;
1185 ),
1186 TP_printk("%llu %lld %u %llu %u %u %p %u %u",
1187 __entry->ino, __entry->i_size, __entry->i_clusters,
1188 __entry->pos, __entry->len,
1189 __entry->flags, __entry->page, __entry->clusters,
1190 __entry->extents_to_split)
1191);
1192
1193TRACE_EVENT(ocfs2_write_end_inline,
1194 TP_PROTO(unsigned long long ino,
1195 unsigned long long pos, unsigned int copied,
1196 unsigned int id_count, unsigned int features),
1197 TP_ARGS(ino, pos, copied, id_count, features),
1198 TP_STRUCT__entry(
1199 __field(unsigned long long, ino)
1200 __field(unsigned long long, pos)
1201 __field(unsigned int, copied)
1202 __field(unsigned int, id_count)
1203 __field(unsigned int, features)
1204 ),
1205 TP_fast_assign(
1206 __entry->ino = ino;
1207 __entry->pos = pos;
1208 __entry->copied = copied;
1209 __entry->id_count = id_count;
1210 __entry->features = features;
1211 ),
1212 TP_printk("%llu %llu %u %u %u",
1213 __entry->ino, __entry->pos, __entry->copied,
1214 __entry->id_count, __entry->features)
1215);
1216
1217/* End of trace events for fs/ocfs2/aops.c. */
1218
1219/* Trace events for fs/ocfs2/mmap.c. */
1220
1221TRACE_EVENT(ocfs2_fault,
1222 TP_PROTO(unsigned long long ino,
1223 void *area, void *page, unsigned long pgoff),
1224 TP_ARGS(ino, area, page, pgoff),
1225 TP_STRUCT__entry(
1226 __field(unsigned long long, ino)
1227 __field(void *, area)
1228 __field(void *, page)
1229 __field(unsigned long, pgoff)
1230 ),
1231 TP_fast_assign(
1232 __entry->ino = ino;
1233 __entry->area = area;
1234 __entry->page = page;
1235 __entry->pgoff = pgoff;
1236 ),
1237 TP_printk("%llu %p %p %lu",
1238 __entry->ino, __entry->area, __entry->page, __entry->pgoff)
1239);
1240
1241/* End of trace events for fs/ocfs2/mmap.c. */
1242
1243/* Trace events for fs/ocfs2/file.c. */
1244
1245DECLARE_EVENT_CLASS(ocfs2__file_ops,
1246 TP_PROTO(void *inode, void *file, void *dentry,
1247 unsigned long long ino,
1248 unsigned int d_len, const unsigned char *d_name,
1249 unsigned long long para),
1250 TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
1251 TP_STRUCT__entry(
1252 __field(void *, inode)
1253 __field(void *, file)
1254 __field(void *, dentry)
1255 __field(unsigned long long, ino)
1256 __field(unsigned int, d_len)
1257 __string(d_name, d_name)
1258 __field(unsigned long long, para)
1259 ),
1260 TP_fast_assign(
1261 __entry->inode = inode;
1262 __entry->file = file;
1263 __entry->dentry = dentry;
1264 __entry->ino = ino;
1265 __entry->d_len = d_len;
1266 __assign_str(d_name, d_name);
1267 __entry->para = para;
1268 ),
1269 TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
1270 __entry->dentry, __entry->ino, __entry->para,
1271 __entry->d_len, __get_str(d_name))
1272);
1273
1274#define DEFINE_OCFS2_FILE_OPS(name) \
1275DEFINE_EVENT(ocfs2__file_ops, name, \
1276TP_PROTO(void *inode, void *file, void *dentry, \
1277 unsigned long long ino, \
1278 unsigned int d_len, const unsigned char *d_name, \
1279 unsigned long long mode), \
1280 TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
1281
1282DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
1283
1284DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
1285
1286DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
1287
1288DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
1289
1290DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
1291
1292DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
1293
1294DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
1295
1296DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
1297
1298DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
1299
1300TRACE_EVENT(ocfs2_extend_allocation,
1301 TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
1302 unsigned int clusters, unsigned int clusters_to_add,
1303 int why, int restart_func),
1304 TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
1305 TP_STRUCT__entry(
1306 __field(unsigned long long, ip_blkno)
1307 __field(unsigned long long, size)
1308 __field(unsigned int, clusters)
1309 __field(unsigned int, clusters_to_add)
1310 __field(int, why)
1311 __field(int, restart_func)
1312 ),
1313 TP_fast_assign(
1314 __entry->ip_blkno = ip_blkno;
1315 __entry->size = size;
1316 __entry->clusters = clusters;
1317 __entry->clusters_to_add = clusters_to_add;
1318 __entry->why = why;
1319 __entry->restart_func = restart_func;
1320 ),
1321 TP_printk("%llu %llu %u %u %d %d",
1322 __entry->ip_blkno, __entry->size, __entry->clusters,
1323 __entry->clusters_to_add, __entry->why, __entry->restart_func)
1324);
1325
1326TRACE_EVENT(ocfs2_extend_allocation_end,
1327 TP_PROTO(unsigned long long ino,
1328 unsigned int di_clusters, unsigned long long di_size,
1329 unsigned int ip_clusters, unsigned long long i_size),
1330 TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
1331 TP_STRUCT__entry(
1332 __field(unsigned long long, ino)
1333 __field(unsigned int, di_clusters)
1334 __field(unsigned long long, di_size)
1335 __field(unsigned int, ip_clusters)
1336 __field(unsigned long long, i_size)
1337 ),
1338 TP_fast_assign(
1339 __entry->ino = ino;
1340 __entry->di_clusters = di_clusters;
1341 __entry->di_size = di_size;
1342 __entry->ip_clusters = ip_clusters;
1343 __entry->i_size = i_size;
1344 ),
1345 TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
1346 __entry->di_size, __entry->ip_clusters, __entry->i_size)
1347);
1348
1349TRACE_EVENT(ocfs2_write_zero_page,
1350 TP_PROTO(unsigned long long ino,
1351 unsigned long long abs_from, unsigned long long abs_to,
1352 unsigned long index, unsigned int zero_from,
1353 unsigned int zero_to),
1354 TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
1355 TP_STRUCT__entry(
1356 __field(unsigned long long, ino)
1357 __field(unsigned long long, abs_from)
1358 __field(unsigned long long, abs_to)
1359 __field(unsigned long, index)
1360 __field(unsigned int, zero_from)
1361 __field(unsigned int, zero_to)
1362 ),
1363 TP_fast_assign(
1364 __entry->ino = ino;
1365 __entry->abs_from = abs_from;
1366 __entry->abs_to = abs_to;
1367 __entry->index = index;
1368 __entry->zero_from = zero_from;
1369 __entry->zero_to = zero_to;
1370 ),
1371 TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
1372 __entry->abs_from, __entry->abs_to,
1373 __entry->index, __entry->zero_from, __entry->zero_to)
1374);
1375
1376DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
1377
1378DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
1379
1380TRACE_EVENT(ocfs2_setattr,
1381 TP_PROTO(void *inode, void *dentry,
1382 unsigned long long ino,
1383 unsigned int d_len, const unsigned char *d_name,
1384 unsigned int ia_valid, unsigned int ia_mode,
1385 unsigned int ia_uid, unsigned int ia_gid),
1386 TP_ARGS(inode, dentry, ino, d_len, d_name,
1387 ia_valid, ia_mode, ia_uid, ia_gid),
1388 TP_STRUCT__entry(
1389 __field(void *, inode)
1390 __field(void *, dentry)
1391 __field(unsigned long long, ino)
1392 __field(unsigned int, d_len)
1393 __string(d_name, d_name)
1394 __field(unsigned int, ia_valid)
1395 __field(unsigned int, ia_mode)
1396 __field(unsigned int, ia_uid)
1397 __field(unsigned int, ia_gid)
1398 ),
1399 TP_fast_assign(
1400 __entry->inode = inode;
1401 __entry->dentry = dentry;
1402 __entry->ino = ino;
1403 __entry->d_len = d_len;
1404 __assign_str(d_name, d_name);
1405 __entry->ia_valid = ia_valid;
1406 __entry->ia_mode = ia_mode;
1407 __entry->ia_uid = ia_uid;
1408 __entry->ia_gid = ia_gid;
1409 ),
1410 TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
1411 __entry->dentry, __entry->ino, __entry->d_len,
1412 __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
1413 __entry->ia_uid, __entry->ia_gid)
1414);
1415
1416DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
1417
1418DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
1419
1420DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
1421
1422DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
1423
1424DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
1425
1426TRACE_EVENT(ocfs2_prepare_inode_for_write,
1427 TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
1428 int appending, unsigned long count,
1429 int *direct_io, int *has_refcount),
1430 TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
1431 TP_STRUCT__entry(
1432 __field(unsigned long long, ino)
1433 __field(unsigned long long, saved_pos)
1434 __field(int, appending)
1435 __field(unsigned long, count)
1436 __field(int, direct_io)
1437 __field(int, has_refcount)
1438 ),
1439 TP_fast_assign(
1440 __entry->ino = ino;
1441 __entry->saved_pos = saved_pos;
1442 __entry->appending = appending;
1443 __entry->count = count;
1444 __entry->direct_io = direct_io ? *direct_io : -1;
1445 __entry->has_refcount = has_refcount ? *has_refcount : -1;
1446 ),
1447 TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
1448 __entry->saved_pos, __entry->appending, __entry->count,
1449 __entry->direct_io, __entry->has_refcount)
1450);
1451
1452DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
1453
1454/* End of trace events for fs/ocfs2/file.c. */
1455
1456/* Trace events for fs/ocfs2/inode.c. */
1457
1458TRACE_EVENT(ocfs2_iget_begin,
1459 TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
1460 TP_ARGS(ino, flags, sysfile_type),
1461 TP_STRUCT__entry(
1462 __field(unsigned long long, ino)
1463 __field(unsigned int, flags)
1464 __field(int, sysfile_type)
1465 ),
1466 TP_fast_assign(
1467 __entry->ino = ino;
1468 __entry->flags = flags;
1469 __entry->sysfile_type = sysfile_type;
1470 ),
1471 TP_printk("%llu %u %d", __entry->ino,
1472 __entry->flags, __entry->sysfile_type)
1473);
1474
1475DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
1476
1477TRACE_EVENT(ocfs2_iget_end,
1478 TP_PROTO(void *inode, unsigned long long ino),
1479 TP_ARGS(inode, ino),
1480 TP_STRUCT__entry(
1481 __field(void *, inode)
1482 __field(unsigned long long, ino)
1483 ),
1484 TP_fast_assign(
1485 __entry->inode = inode;
1486 __entry->ino = ino;
1487 ),
1488 TP_printk("%p %llu", __entry->inode, __entry->ino)
1489);
1490
1491TRACE_EVENT(ocfs2_find_actor,
1492 TP_PROTO(void *inode, unsigned long long ino,
1493 void *args, unsigned long long fi_blkno),
1494 TP_ARGS(inode, ino, args, fi_blkno),
1495 TP_STRUCT__entry(
1496 __field(void *, inode)
1497 __field(unsigned long long, ino)
1498 __field(void *, args)
1499 __field(unsigned long long, fi_blkno)
1500 ),
1501 TP_fast_assign(
1502 __entry->inode = inode;
1503 __entry->ino = ino;
1504 __entry->args = args;
1505 __entry->fi_blkno = fi_blkno;
1506 ),
1507 TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
1508 __entry->args, __entry->fi_blkno)
1509);
1510
1511DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
1512
1513DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
1514
1515DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
1516
1517DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
1518
1519TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
1520 TP_PROTO(void *task, void *dc_task, unsigned long long ino,
1521 unsigned int flags),
1522 TP_ARGS(task, dc_task, ino, flags),
1523 TP_STRUCT__entry(
1524 __field(void *, task)
1525 __field(void *, dc_task)
1526 __field(unsigned long long, ino)
1527 __field(unsigned int, flags)
1528 ),
1529 TP_fast_assign(
1530 __entry->task = task;
1531 __entry->dc_task = dc_task;
1532 __entry->ino = ino;
1533 __entry->flags = flags;
1534 ),
1535 TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
1536 __entry->ino, __entry->flags)
1537);
1538
1539DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
1540
1541DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
1542
1543DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
1544
1545DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
1546
1547DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
1548
1549DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
1550
1551DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
1552
1553TRACE_EVENT(ocfs2_inode_revalidate,
1554 TP_PROTO(void *inode, unsigned long long ino,
1555 unsigned int flags),
1556 TP_ARGS(inode, ino, flags),
1557 TP_STRUCT__entry(
1558 __field(void *, inode)
1559 __field(unsigned long long, ino)
1560 __field(unsigned int, flags)
1561 ),
1562 TP_fast_assign(
1563 __entry->inode = inode;
1564 __entry->ino = ino;
1565 __entry->flags = flags;
1566 ),
1567 TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
1568);
1569
1570DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
1571
1572/* End of trace events for fs/ocfs2/inode.c. */
1573
1574/* Trace events for fs/ocfs2/extent_map.c. */
1575
1576TRACE_EVENT(ocfs2_read_virt_blocks,
1577 TP_PROTO(void *inode, unsigned long long vblock, int nr,
1578 void *bhs, unsigned int flags, void *validate),
1579 TP_ARGS(inode, vblock, nr, bhs, flags, validate),
1580 TP_STRUCT__entry(
1581 __field(void *, inode)
1582 __field(unsigned long long, vblock)
1583 __field(int, nr)
1584 __field(void *, bhs)
1585 __field(unsigned int, flags)
1586 __field(void *, validate)
1587 ),
1588 TP_fast_assign(
1589 __entry->inode = inode;
1590 __entry->vblock = vblock;
1591 __entry->nr = nr;
1592 __entry->bhs = bhs;
1593 __entry->flags = flags;
1594 __entry->validate = validate;
1595 ),
1596 TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
1597 __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
1598);
1599
1600/* End of trace events for fs/ocfs2/extent_map.c. */
1601
/* Trace events for fs/ocfs2/slot_map.c. */

DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);

DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);

DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);

DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);

/* End of trace events for fs/ocfs2/slot_map.c. */
1613
/* Trace events for fs/ocfs2/heartbeat.c. */

DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);

/* End of trace events for fs/ocfs2/heartbeat.c. */
1619
1620/* Trace events for fs/ocfs2/super.c. */
1621
1622TRACE_EVENT(ocfs2_remount,
1623 TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
1624 TP_ARGS(s_flags, osb_flags, flags),
1625 TP_STRUCT__entry(
1626 __field(unsigned long, s_flags)
1627 __field(unsigned long, osb_flags)
1628 __field(int, flags)
1629 ),
1630 TP_fast_assign(
1631 __entry->s_flags = s_flags;
1632 __entry->osb_flags = osb_flags;
1633 __entry->flags = flags;
1634 ),
1635 TP_printk("%lu %lu %d", __entry->s_flags,
1636 __entry->osb_flags, __entry->flags)
1637);
1638
1639TRACE_EVENT(ocfs2_fill_super,
1640 TP_PROTO(void *sb, void *data, int silent),
1641 TP_ARGS(sb, data, silent),
1642 TP_STRUCT__entry(
1643 __field(void *, sb)
1644 __field(void *, data)
1645 __field(int, silent)
1646 ),
1647 TP_fast_assign(
1648 __entry->sb = sb;
1649 __entry->data = data;
1650 __entry->silent = silent;
1651 ),
1652 TP_printk("%p %p %d", __entry->sb,
1653 __entry->data, __entry->silent)
1654);
1655
1656TRACE_EVENT(ocfs2_parse_options,
1657 TP_PROTO(int is_remount, char *options),
1658 TP_ARGS(is_remount, options),
1659 TP_STRUCT__entry(
1660 __field(int, is_remount)
1661 __string(options, options)
1662 ),
1663 TP_fast_assign(
1664 __entry->is_remount = is_remount;
1665 __assign_str(options, options);
1666 ),
1667 TP_printk("%d %s", __entry->is_remount, __get_str(options))
1668);
1669
1670DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
1671
1672TRACE_EVENT(ocfs2_statfs,
1673 TP_PROTO(void *sb, void *buf),
1674 TP_ARGS(sb, buf),
1675 TP_STRUCT__entry(
1676 __field(void *, sb)
1677 __field(void *, buf)
1678 ),
1679 TP_fast_assign(
1680 __entry->sb = sb;
1681 __entry->buf = buf;
1682 ),
1683 TP_printk("%p %p", __entry->sb, __entry->buf)
1684);
1685
1686DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
1687
/* Records key volume identity at superblock initialization: label, uuid
 * string, root/system directory block numbers and the cluster-size bits. */
TRACE_EVENT(ocfs2_initialize_super,
	TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
		 unsigned long long system_dir, int cluster_bits),
	TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
	TP_STRUCT__entry(
		__string(label, label)
		__string(uuid_str, uuid_str)
		__field(unsigned long long, root_dir)
		__field(unsigned long long, system_dir)
		__field(int, cluster_bits)
	),
	TP_fast_assign(
		__assign_str(label, label);
		__assign_str(uuid_str, uuid_str);
		__entry->root_dir = root_dir;
		__entry->system_dir = system_dir;
		__entry->cluster_bits = cluster_bits;
	),
	TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
		  __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
);
1709
1710/* End of trace events for fs/ocfs2/super.c. */
1711
1712/* Trace events for fs/ocfs2/xattr.c. */
1713
1714DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
1715
1716DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
1717
/* Records xattr set-context setup: attribute name plus the metadata,
 * cluster and journal-credit counts reserved for the operation. */
TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
	TP_PROTO(const char *name, int meta, int clusters, int credits),
	TP_ARGS(name, meta, clusters, credits),
	TP_STRUCT__entry(
		__string(name, name)
		__field(int, meta)
		__field(int, clusters)
		__field(int, credits)
	),
	TP_fast_assign(
		__assign_str(name, name);
		__entry->meta = meta;
		__entry->clusters = clusters;
		__entry->credits = credits;
	),
	TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
		  __entry->clusters, __entry->credits)
);
1736
/* Shared event class for the xattr lookup paths: inode, attribute name
 * and index, name hash, the on-disk location searched and the entry
 * index hit. Instantiated via DEFINE_OCFS2_XATTR_FIND_EVENT below. */
DECLARE_EVENT_CLASS(ocfs2__xattr_find,
	TP_PROTO(unsigned long long ino, const char *name, int name_index,
		 unsigned int hash, unsigned long long location,
		 int xe_index),
	TP_ARGS(ino, name, name_index, hash, location, xe_index),
	TP_STRUCT__entry(
		__field(unsigned long long, ino)
		__string(name, name)
		__field(int, name_index)
		__field(unsigned int, hash)
		__field(unsigned long long, location)
		__field(int, xe_index)
	),
	TP_fast_assign(
		__entry->ino = ino;
		__assign_str(name, name);
		__entry->name_index = name_index;
		__entry->hash = hash;
		__entry->location = location;
		__entry->xe_index = xe_index;
	),
	TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
		  __entry->name_index, __entry->hash, __entry->location,
		  __entry->xe_index)
);

/* Stamp out a concrete event from the ocfs2__xattr_find class. */
#define DEFINE_OCFS2_XATTR_FIND_EVENT(name) \
DEFINE_EVENT(ocfs2__xattr_find, name, \
TP_PROTO(unsigned long long ino, const char *name, int name_index, \
	 unsigned int hash, unsigned long long bucket, \
	 int xe_index), \
	TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
1769
1770DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
1771
1772DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
1773
1774DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
1775
1776DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
1777
1778DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
1779
1780DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
1781
1782DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
1783
1784DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
1785
1786DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
1787
1788DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
1789
1790DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
1791
1792DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
1793
1794DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
1795
1796DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
1797
1798DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
1799
1800DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
1801
1802DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
1803
1804DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
1805
1806DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
1807
1808DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
1809
1810DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
1811
1812DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
1813
1814DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
1815
1816DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
1817
1818DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
1819
1820DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
1821
1822DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
1823
1824DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
1825
1826DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
1827
1828DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
1829
1830/* End of trace events for fs/ocfs2/xattr.c. */
1831
1832/* Trace events for fs/ocfs2/reservations.c. */
1833
1834DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
1835
1836DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
1837
1838DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
1839
/* Records the state of a reservation-window search: current reservation
 * range, the goal bit, the wanted length, and whether the tree is empty. */
TRACE_EVENT(ocfs2_resv_find_window_begin,
	TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
		 unsigned int wanted, int empty_root),
	TP_ARGS(r_start, r_end, goal, wanted, empty_root),
	TP_STRUCT__entry(
		__field(unsigned int, r_start)
		__field(unsigned int, r_end)
		__field(unsigned int, goal)
		__field(unsigned int, wanted)
		__field(int, empty_root)
	),
	TP_fast_assign(
		__entry->r_start = r_start;
		__entry->r_end = r_end;
		__entry->goal = goal;
		__entry->wanted = wanted;
		__entry->empty_root = empty_root;
	),
	TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
		  __entry->goal, __entry->wanted, __entry->empty_root)
);
1861
1862DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
1863
1864DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
1865
1866DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
1867
1868TRACE_EVENT(ocfs2_cannibalize_resv_end,
1869 TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
1870 unsigned int last_start, unsigned int last_len),
1871 TP_ARGS(start, end, len, last_start, last_len),
1872 TP_STRUCT__entry(
1873 __field(unsigned int, start)
1874 __field(unsigned int, end)
1875 __field(unsigned int, len)
1876 __field(unsigned int, last_start)
1877 __field(unsigned int, last_len)
1878 ),
1879 TP_fast_assign(
1880 __entry->start = start;
1881 __entry->end = end;
1882 __entry->len = len;
1883 __entry->last_start = last_start;
1884 __entry->last_len = last_len;
1885 ),
1886 TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
1887 __entry->len, __entry->last_start, __entry->last_len)
1888);
1889
1890DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
1891
/* Records a claim against the reservation map: the claimed range
 * (cstart/cend/clen), the owning reservation's range (r_*), and the
 * previously-recorded last allocation (last_*). */
TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
	TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
		 unsigned int r_start, unsigned int r_end, unsigned int r_len,
		 unsigned int last_start, unsigned int last_len),
	TP_ARGS(cstart, cend, clen, r_start, r_end,
		r_len, last_start, last_len),
	TP_STRUCT__entry(
		__field(unsigned int, cstart)
		__field(unsigned int, cend)
		__field(unsigned int, clen)
		__field(unsigned int, r_start)
		__field(unsigned int, r_end)
		__field(unsigned int, r_len)
		__field(unsigned int, last_start)
		__field(unsigned int, last_len)
	),
	TP_fast_assign(
		__entry->cstart = cstart;
		__entry->cend = cend;
		__entry->clen = clen;
		__entry->r_start = r_start;
		__entry->r_end = r_end;
		__entry->r_len = r_len;
		__entry->last_start = last_start;
		__entry->last_len = last_len;
	),
	TP_printk("%u %u %u %u %u %u %u %u",
		  __entry->cstart, __entry->cend, __entry->clen,
		  __entry->r_start, __entry->r_end, __entry->r_len,
		  __entry->last_start, __entry->last_len)
);
1923
1924TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
1925 TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
1926 unsigned int last_start, unsigned int last_len),
1927 TP_ARGS(start, end, len, last_start, last_len),
1928 TP_STRUCT__entry(
1929 __field(unsigned int, start)
1930 __field(unsigned int, end)
1931 __field(unsigned int, len)
1932 __field(unsigned int, last_start)
1933 __field(unsigned int, last_len)
1934 ),
1935 TP_fast_assign(
1936 __entry->start = start;
1937 __entry->end = end;
1938 __entry->len = len;
1939 __entry->last_start = last_start;
1940 __entry->last_len = last_len;
1941 ),
1942 TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
1943 __entry->len, __entry->last_start, __entry->last_len)
1944);
1945
1946/* End of trace events for fs/ocfs2/reservations.c. */
1947
1948/* Trace events for fs/ocfs2/quota_local.c. */
1949
1950DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
1951
1952DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
1953
1954DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
1955
1956/* End of trace events for fs/ocfs2/quota_local.c. */
1957
1958/* Trace events for fs/ocfs2/quota_global.c. */
1959
1960DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
1961
/* Records a dquot sync: quota id, current space usage, and the pending
 * space/inode deltas being applied. */
TRACE_EVENT(ocfs2_sync_dquot,
	TP_PROTO(unsigned int dq_id, long long dqb_curspace,
		 long long spacechange, long long curinodes,
		 long long inodechange),
	TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
	TP_STRUCT__entry(
		__field(unsigned int, dq_id)
		__field(long long, dqb_curspace)
		__field(long long, spacechange)
		__field(long long, curinodes)
		__field(long long, inodechange)
	),
	TP_fast_assign(
		__entry->dq_id = dq_id;
		__entry->dqb_curspace = dqb_curspace;
		__entry->spacechange = spacechange;
		__entry->curinodes = curinodes;
		__entry->inodechange = inodechange;
	),
	TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
		  __entry->dqb_curspace, __entry->spacechange,
		  __entry->curinodes, __entry->inodechange)
);
1985
/* Records the per-dquot sync helper: quota id, the dquot's own type,
 * the type being synced, and the superblock id string. */
TRACE_EVENT(ocfs2_sync_dquot_helper,
	TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
		 const char *s_id),
	TP_ARGS(dq_id, dq_type, type, s_id),

	TP_STRUCT__entry(
		__field(unsigned int, dq_id)
		__field(unsigned int, dq_type)
		__field(unsigned long, type)
		__string(s_id, s_id)
	),
	TP_fast_assign(
		__entry->dq_id = dq_id;
		__entry->dq_type = dq_type;
		__entry->type = type;
		__assign_str(s_id, s_id);
	),
	TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
		  __entry->type, __get_str(s_id))
);
2006
2007DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
2008
2009DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
2010
2011DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
2012
2013DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
2014
2015/* End of trace events for fs/ocfs2/quota_global.c. */
2016
2017/* Trace events for fs/ocfs2/dir.c. */
2018DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
2019
2020DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
2021
2022DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
2023
2024TRACE_EVENT(ocfs2_dx_dir_search,
2025 TP_PROTO(unsigned long long ino, int namelen, const char *name,
2026 unsigned int major_hash, unsigned int minor_hash,
2027 unsigned long long blkno),
2028 TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
2029 TP_STRUCT__entry(
2030 __field(unsigned long long, ino)
2031 __field(int, namelen)
2032 __string(name, name)
2033 __field(unsigned int, major_hash)
2034 __field(unsigned int,minor_hash)
2035 __field(unsigned long long, blkno)
2036 ),
2037 TP_fast_assign(
2038 __entry->ino = ino;
2039 __entry->namelen = namelen;
2040 __assign_str(name, name);
2041 __entry->major_hash = major_hash;
2042 __entry->minor_hash = minor_hash;
2043 __entry->blkno = blkno;
2044 ),
2045 TP_printk("%llu %.*s %u %u %llu", __entry->ino,
2046 __entry->namelen, __get_str(name),
2047 __entry->major_hash, __entry->minor_hash, __entry->blkno)
2048);
2049
2050DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
2051
2052DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
2053
2054DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
2055
/* Records a directory entry lookup on disk: the searched name (length +
 * string), the caller's blkno out-pointer, and the directory inode. */
TRACE_EVENT(ocfs2_find_files_on_disk,
	TP_PROTO(int namelen, const char *name, void *blkno,
		 unsigned long long dir),
	TP_ARGS(namelen, name, blkno, dir),
	TP_STRUCT__entry(
		__field(int, namelen)
		__string(name, name)
		__field(void *, blkno)
		__field(unsigned long long, dir)
	),
	TP_fast_assign(
		__entry->namelen = namelen;
		__assign_str(name, name);
		__entry->blkno = blkno;
		__entry->dir = dir;
	),
	TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
		  __entry->blkno, __entry->dir)
);
2075
2076TRACE_EVENT(ocfs2_check_dir_for_entry,
2077 TP_PROTO(unsigned long long dir, int namelen, const char *name),
2078 TP_ARGS(dir, namelen, name),
2079 TP_STRUCT__entry(
2080 __field(unsigned long long, dir)
2081 __field(int, namelen)
2082 __string(name, name)
2083 ),
2084 TP_fast_assign(
2085 __entry->dir = dir;
2086 __entry->namelen = namelen;
2087 __assign_str(name, name);
2088 ),
2089 TP_printk("%llu %.*s", __entry->dir,
2090 __entry->namelen, __get_str(name))
2091);
2092
2093DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
2094
2095DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
2096
2097TRACE_EVENT(ocfs2_dx_dir_index_root_block,
2098 TP_PROTO(unsigned long long dir,
2099 unsigned int major_hash, unsigned int minor_hash,
2100 int namelen, const char *name, unsigned int num_used),
2101 TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
2102 TP_STRUCT__entry(
2103 __field(unsigned long long, dir)
2104 __field(unsigned int, major_hash)
2105 __field(unsigned int, minor_hash)
2106 __field(int, namelen)
2107 __string(name, name)
2108 __field(unsigned int, num_used)
2109 ),
2110 TP_fast_assign(
2111 __entry->dir = dir;
2112 __entry->major_hash = major_hash;
2113 __entry->minor_hash = minor_hash;
2114 __entry->namelen = namelen;
2115 __assign_str(name, name);
2116 __entry->num_used = num_used;
2117 ),
2118 TP_printk("%llu %x %x %.*s %u", __entry->dir,
2119 __entry->major_hash, __entry->minor_hash,
2120 __entry->namelen, __get_str(name), __entry->num_used)
2121);
2122
2123DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
2124
2125DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
2126
2127DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
2128
2129DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
2130
2131/* End of trace events for fs/ocfs2/dir.c. */
2132
2133/* Trace events for fs/ocfs2/namei.c. */
2134
/* Shared event class for the namei.c dentry operations: parent dir and
 * dentry pointers, the entry name, the directory's block number and one
 * operation-specific extra value. Instantiated via
 * DEFINE_OCFS2_DENTRY_OPS below. */
DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
		 unsigned long long dir_blkno, unsigned long long extra),
	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
	TP_STRUCT__entry(
		__field(void *, dir)
		__field(void *, dentry)
		__field(int, name_len)
		__string(name, name)
		__field(unsigned long long, dir_blkno)
		__field(unsigned long long, extra)
	),
	TP_fast_assign(
		__entry->dir = dir;
		__entry->dentry = dentry;
		__entry->name_len = name_len;
		__assign_str(name, name);
		__entry->dir_blkno = dir_blkno;
		__entry->extra = extra;
	),
	TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
		  __entry->name_len, __get_str(name),
		  __entry->dir_blkno, __entry->extra)
);

/* Stamp out a concrete event from the ocfs2__dentry_ops class. */
#define DEFINE_OCFS2_DENTRY_OPS(name) \
DEFINE_EVENT(ocfs2__dentry_ops, name, \
TP_PROTO(void *dir, void *dentry, int name_len, const char *name, \
	 unsigned long long dir_blkno, unsigned long long extra), \
	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
2165
2166DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
2167
2168DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
2169
2170DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
2171
2172DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
2173
2174DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
2175
2176DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
2177
2178DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
2179
/* Like the ocfs2__dentry_ops class but with device number and mode
 * instead of a single extra value, for node creation. */
TRACE_EVENT(ocfs2_mknod,
	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
		 unsigned long long dir_blkno, unsigned long dev, int mode),
	TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
	TP_STRUCT__entry(
		__field(void *, dir)
		__field(void *, dentry)
		__field(int, name_len)
		__string(name, name)
		__field(unsigned long long, dir_blkno)
		__field(unsigned long, dev)
		__field(int, mode)
	),
	TP_fast_assign(
		__entry->dir = dir;
		__entry->dentry = dentry;
		__entry->name_len = name_len;
		__assign_str(name, name);
		__entry->dir_blkno = dir_blkno;
		__entry->dev = dev;
		__entry->mode = mode;
	),
	TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
		  __entry->name_len, __get_str(name),
		  __entry->dir_blkno, __entry->dev, __entry->mode)
);
2206
2207TRACE_EVENT(ocfs2_link,
2208 TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
2209 int name_len, const char *name),
2210 TP_ARGS(ino, old_len, old_name, name_len, name),
2211 TP_STRUCT__entry(
2212 __field(unsigned long long, ino)
2213 __field(int, old_len)
2214 __string(old_name, old_name)
2215 __field(int, name_len)
2216 __string(name, name)
2217 ),
2218 TP_fast_assign(
2219 __entry->ino = ino;
2220 __entry->old_len = old_len;
2221 __assign_str(old_name, old_name);
2222 __entry->name_len = name_len;
2223 __assign_str(name, name);
2224 ),
2225 TP_printk("%llu %.*s %.*s", __entry->ino,
2226 __entry->old_len, __get_str(old_name),
2227 __entry->name_len, __get_str(name))
2228);
2229
2230DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
2231
2232DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
2233
2234DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
2235
/* Records a rename: old/new parent dir and dentry pointers plus both
 * names (each with an explicit length, printed via %.*s). */
TRACE_EVENT(ocfs2_rename,
	TP_PROTO(void *old_dir, void *old_dentry,
		 void *new_dir, void *new_dentry,
		 int old_len, const char *old_name,
		 int new_len, const char *new_name),
	TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
		old_len, old_name, new_len, new_name),
	TP_STRUCT__entry(
		__field(void *, old_dir)
		__field(void *, old_dentry)
		__field(void *, new_dir)
		__field(void *, new_dentry)
		__field(int, old_len)
		__string(old_name, old_name)
		__field(int, new_len)
		__string(new_name, new_name)
	),
	TP_fast_assign(
		__entry->old_dir = old_dir;
		__entry->old_dentry = old_dentry;
		__entry->new_dir = new_dir;
		__entry->new_dentry = new_dentry;
		__entry->old_len = old_len;
		__assign_str(old_name, old_name);
		__entry->new_len = new_len;
		__assign_str(new_name, new_name);
	),
	TP_printk("%p %p %p %p %.*s %.*s",
		  __entry->old_dir, __entry->old_dentry,
		  __entry->new_dir, __entry->new_dentry,
		  __entry->old_len, __get_str(old_name),
		  __entry->new_len, __get_str(new_name))
);
2269
2270TRACE_EVENT(ocfs2_rename_target_exists,
2271 TP_PROTO(int new_len, const char *new_name),
2272 TP_ARGS(new_len, new_name),
2273 TP_STRUCT__entry(
2274 __field(int, new_len)
2275 __string(new_name, new_name)
2276 ),
2277 TP_fast_assign(
2278 __entry->new_len = new_len;
2279 __assign_str(new_name, new_name);
2280 ),
2281 TP_printk("%.*s", __entry->new_len, __get_str(new_name))
2282);
2283
2284DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
2285
2286TRACE_EVENT(ocfs2_rename_over_existing,
2287 TP_PROTO(unsigned long long new_blkno, void *new_bh,
2288 unsigned long long newdi_blkno),
2289 TP_ARGS(new_blkno, new_bh, newdi_blkno),
2290 TP_STRUCT__entry(
2291 __field(unsigned long long, new_blkno)
2292 __field(void *, new_bh)
2293 __field(unsigned long long, newdi_blkno)
2294 ),
2295 TP_fast_assign(
2296 __entry->new_blkno = new_blkno;
2297 __entry->new_bh = new_bh;
2298 __entry->newdi_blkno = newdi_blkno;
2299 ),
2300 TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
2301 __entry->newdi_blkno)
2302);
2303
2304DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
2305
/* Records symlink creation: parent dir and dentry pointers, the link
 * target string (stored as a raw pointer, not copied — only valid while
 * the caller's buffer lives), and the new entry's name. */
TRACE_EVENT(ocfs2_symlink_begin,
	TP_PROTO(void *dir, void *dentry, const char *symname,
		 int len, const char *name),
	TP_ARGS(dir, dentry, symname, len, name),
	TP_STRUCT__entry(
		__field(void *, dir)
		__field(void *, dentry)
		__field(const char *, symname)
		__field(int, len)
		__string(name, name)
	),
	TP_fast_assign(
		__entry->dir = dir;
		__entry->dentry = dentry;
		__entry->symname = symname;
		__entry->len = len;
		__assign_str(name, name);
	),
	TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
		  __entry->symname, __entry->len, __get_str(name))
);
2327
/* Records a block number together with its generated string form and
 * that string's length. */
TRACE_EVENT(ocfs2_blkno_stringify,
	TP_PROTO(unsigned long long blkno, const char *name, int namelen),
	TP_ARGS(blkno, name, namelen),
	TP_STRUCT__entry(
		__field(unsigned long long, blkno)
		__string(name, name)
		__field(int, namelen)
	),
	TP_fast_assign(
		__entry->blkno = blkno;
		__assign_str(name, name);
		__entry->namelen = namelen;
	),
	TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
		  __entry->namelen)
);
2344
2345DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
2346
2347DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
2348
2349TRACE_EVENT(ocfs2_orphan_del,
2350 TP_PROTO(unsigned long long dir, const char *name, int namelen),
2351 TP_ARGS(dir, name, namelen),
2352 TP_STRUCT__entry(
2353 __field(unsigned long long, dir)
2354 __string(name, name)
2355 __field(int, namelen)
2356 ),
2357 TP_fast_assign(
2358 __entry->dir = dir;
2359 __assign_str(name, name);
2360 __entry->namelen = namelen;
2361 ),
2362 TP_printk("%llu %s %d", __entry->dir, __get_str(name),
2363 __entry->namelen)
2364);
2365
2366/* End of trace events for fs/ocfs2/namei.c. */
2367
2368/* Trace events for fs/ocfs2/dcache.c. */
2369
2370TRACE_EVENT(ocfs2_dentry_revalidate,
2371 TP_PROTO(void *dentry, int len, const char *name),
2372 TP_ARGS(dentry, len, name),
2373 TP_STRUCT__entry(
2374 __field(void *, dentry)
2375 __field(int, len)
2376 __string(name, name)
2377 ),
2378 TP_fast_assign(
2379 __entry->dentry = dentry;
2380 __entry->len = len;
2381 __assign_str(name, name);
2382 ),
2383 TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
2384);
2385
2386TRACE_EVENT(ocfs2_dentry_revalidate_negative,
2387 TP_PROTO(int len, const char *name, unsigned long pgen,
2388 unsigned long gen),
2389 TP_ARGS(len, name, pgen, gen),
2390 TP_STRUCT__entry(
2391 __field(int, len)
2392 __string(name, name)
2393 __field(unsigned long, pgen)
2394 __field(unsigned long, gen)
2395 ),
2396 TP_fast_assign(
2397 __entry->len = len;
2398 __assign_str(name, name);
2399 __entry->pgen = pgen;
2400 __entry->gen = gen;
2401 ),
2402 TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
2403 __entry->pgen, __entry->gen)
2404);
2405
2406DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
2407
2408DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
2409
2410DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
2411
2412DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
2413
2414TRACE_EVENT(ocfs2_find_local_alias,
2415 TP_PROTO(int len, const char *name),
2416 TP_ARGS(len, name),
2417 TP_STRUCT__entry(
2418 __field(int, len)
2419 __string(name, name)
2420 ),
2421 TP_fast_assign(
2422 __entry->len = len;
2423 __assign_str(name, name);
2424 ),
2425 TP_printk("%.*s", __entry->len, __get_str(name))
2426);
2427
/* Records dentry lock attachment: entry name, parent inode number and
 * the dentry's fsdata pointer. */
TRACE_EVENT(ocfs2_dentry_attach_lock,
	TP_PROTO(int len, const char *name,
		 unsigned long long parent, void *fsdata),
	TP_ARGS(len, name, parent, fsdata),
	TP_STRUCT__entry(
		__field(int, len)
		__string(name, name)
		__field(unsigned long long, parent)
		__field(void *, fsdata)
	),
	TP_fast_assign(
		__entry->len = len;
		__assign_str(name, name);
		__entry->parent = parent;
		__entry->fsdata = fsdata;
	),
	TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
		  __entry->parent, __entry->fsdata)
);
2447
2448TRACE_EVENT(ocfs2_dentry_attach_lock_found,
2449 TP_PROTO(const char *name, unsigned long long parent,
2450 unsigned long long ino),
2451 TP_ARGS(name, parent, ino),
2452 TP_STRUCT__entry(
2453 __string(name, name)
2454 __field(unsigned long long, parent)
2455 __field(unsigned long long, ino)
2456 ),
2457 TP_fast_assign(
2458 __assign_str(name, name);
2459 __entry->parent = parent;
2460 __entry->ino = ino;
2461 ),
2462 TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
2463);
2464/* End of trace events for fs/ocfs2/dcache.c. */
2465
2466/* Trace events for fs/ocfs2/export.c. */
2467
2468TRACE_EVENT(ocfs2_get_dentry_begin,
2469 TP_PROTO(void *sb, void *handle, unsigned long long blkno),
2470 TP_ARGS(sb, handle, blkno),
2471 TP_STRUCT__entry(
2472 __field(void *, sb)
2473 __field(void *, handle)
2474 __field(unsigned long long, blkno)
2475 ),
2476 TP_fast_assign(
2477 __entry->sb = sb;
2478 __entry->handle = handle;
2479 __entry->blkno = blkno;
2480 ),
2481 TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
2482);
2483
2484DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
2485
2486DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
2487
2488DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
2489
2490DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
2491
2492TRACE_EVENT(ocfs2_get_parent,
2493 TP_PROTO(void *child, int len, const char *name,
2494 unsigned long long ino),
2495 TP_ARGS(child, len, name, ino),
2496 TP_STRUCT__entry(
2497 __field(void *, child)
2498 __field(int, len)
2499 __string(name, name)
2500 __field(unsigned long long, ino)
2501 ),
2502 TP_fast_assign(
2503 __entry->child = child;
2504 __entry->len = len;
2505 __assign_str(name, name);
2506 __entry->ino = ino;
2507 ),
2508 TP_printk("%p %.*s %llu", __entry->child, __entry->len,
2509 __get_str(name), __entry->ino)
2510);
2511
2512DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
2513
/* Records NFS file-handle encoding: dentry pointer and name, the handle
 * buffer pointer and length, and the 'connectable' flag. */
TRACE_EVENT(ocfs2_encode_fh_begin,
	TP_PROTO(void *dentry, int name_len, const char *name,
		 void *fh, int len, int connectable),
	TP_ARGS(dentry, name_len, name, fh, len, connectable),
	TP_STRUCT__entry(
		__field(void *, dentry)
		__field(int, name_len)
		__string(name, name)
		__field(void *, fh)
		__field(int, len)
		__field(int, connectable)
	),
	TP_fast_assign(
		__entry->dentry = dentry;
		__entry->name_len = name_len;
		__assign_str(name, name);
		__entry->fh = fh;
		__entry->len = len;
		__entry->connectable = connectable;
	),
	TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
		  __get_str(name), __entry->fh, __entry->len,
		  __entry->connectable)
);
2538
2539DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
2540
2541DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
2542
2543DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
2544
2545/* End of trace events for fs/ocfs2/export.c. */
2546
2547/* Trace events for fs/ocfs2/journal.c. */
2548
2549DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
2550
2551DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
2552
2553DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
2554
2555DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
2556
2557DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
2558
2559DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
2560
2561DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
2562
2563DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
2564
2565DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
2566
2567DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
2568
2569DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
2570
2571DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
2572
/* Records per-slot recovery completion: slot number, local-alloc and
 * truncate-log inode numbers, and the quota recovery record pointer. */
TRACE_EVENT(ocfs2_complete_recovery_slot,
	TP_PROTO(int slot, unsigned long long la_ino,
		 unsigned long long tl_ino, void *qrec),
	TP_ARGS(slot, la_ino, tl_ino, qrec),
	TP_STRUCT__entry(
		__field(int, slot)
		__field(unsigned long long, la_ino)
		__field(unsigned long long, tl_ino)
		__field(void *, qrec)
	),
	TP_fast_assign(
		__entry->slot = slot;
		__entry->la_ino = la_ino;
		__entry->tl_ino = tl_ino;
		__entry->qrec = qrec;
	),
	TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
		  __entry->tl_ino, __entry->qrec)
);
2592
2593DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
2594
2595DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
2596
2597TRACE_EVENT(ocfs2_recovery_thread,
2598 TP_PROTO(int node_num, int osb_node_num, int disable,
2599 void *recovery_thread, int map_set),
2600 TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
2601 TP_STRUCT__entry(
2602 __field(int, node_num)
2603 __field(int, osb_node_num)
2604 __field(int,disable)
2605 __field(void *, recovery_thread)
2606 __field(int,map_set)
2607 ),
2608 TP_fast_assign(
2609 __entry->node_num = node_num;
2610 __entry->osb_node_num = osb_node_num;
2611 __entry->disable = disable;
2612 __entry->recovery_thread = recovery_thread;
2613 __entry->map_set = map_set;
2614 ),
2615 TP_printk("%d %d %d %p %d", __entry->node_num,
2616 __entry->osb_node_num, __entry->disable,
2617 __entry->recovery_thread, __entry->map_set)
2618);
2619
2620DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
2621
2622DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
2623
2624DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
2625
2626DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
2627
2628DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
2629
2630DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
2631
2632DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
2633
2634DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
2635
2636DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
2637
2638DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
2639
2640DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
2641
2642DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
2643
2644/* End of trace events for fs/ocfs2/journal.c. */
2645
2646/* Trace events for fs/ocfs2/buffer_head_io.c. */
2647
2648DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
2649
2650DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
2651
2652DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
2653
2654DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
2655
2656DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
2657
2658TRACE_EVENT(ocfs2_write_block,
2659 TP_PROTO(unsigned long long block, void *ci),
2660 TP_ARGS(block, ci),
2661 TP_STRUCT__entry(
2662 __field(unsigned long long, block)
2663 __field(void *, ci)
2664 ),
2665 TP_fast_assign(
2666 __entry->block = block;
2667 __entry->ci = ci;
2668 ),
2669 TP_printk("%llu %p", __entry->block, __entry->ci)
2670);
2671
/* Records the start of a multi-block read: caching info pointer, first
 * block number, number of blocks and the read flags. */
TRACE_EVENT(ocfs2_read_blocks_begin,
	TP_PROTO(void *ci, unsigned long long block,
		 unsigned int nr, int flags),
	TP_ARGS(ci, block, nr, flags),
	TP_STRUCT__entry(
		__field(void *, ci)
		__field(unsigned long long, block)
		__field(unsigned int, nr)
		__field(int, flags)
	),
	TP_fast_assign(
		__entry->ci = ci;
		__entry->block = block;
		__entry->nr = nr;
		__entry->flags = flags;
	),
	TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
		  __entry->nr, __entry->flags)
);
2691
2692/* End of trace events for fs/ocfs2/buffer_head_io.c. */
2693
2694/* Trace events for fs/ocfs2/uptodate.c. */
2695
2696DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
2697
2698DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
2699
2700DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
2701
2702TRACE_EVENT(ocfs2_buffer_cached_end,
2703 TP_PROTO(int index, void *item),
2704 TP_ARGS(index, item),
2705 TP_STRUCT__entry(
2706 __field(int, index)
2707 __field(void *, item)
2708 ),
2709 TP_fast_assign(
2710 __entry->index = index;
2711 __entry->item = item;
2712 ),
2713 TP_printk("%d %p", __entry->index, __entry->item)
2714);
2715
2716DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
2717
2718DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
2719
2720DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
2721
2722DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
2723
2724DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
2725
2726DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
2727
2728DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
2729
2730DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
2731
2732/* End of trace events for fs/ocfs2/uptodate.c. */
2733#endif /* _TRACE_OCFS2_H */
2734
2735/* This part must be outside protection */
2736#undef TRACE_INCLUDE_PATH
2737#define TRACE_INCLUDE_PATH .
2738#define TRACE_INCLUDE_FILE ocfs2_trace
2739#include <trace/define_trace.h>
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 196fcb52d95d..d5ab56cbe5c5 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -114,7 +114,4 @@ int ocfs2_local_write_dquot(struct dquot *dquot);
114extern const struct dquot_operations ocfs2_quota_operations; 114extern const struct dquot_operations ocfs2_quota_operations;
115extern struct quota_format_type ocfs2_quota_format; 115extern struct quota_format_type ocfs2_quota_format;
116 116
117int ocfs2_quota_setup(void);
118void ocfs2_quota_shutdown(void);
119
120#endif /* _OCFS2_QUOTA_H */ 117#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24c..92fcd575775a 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -11,7 +11,6 @@
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/workqueue.h> 12#include <linux/workqueue.h>
13 13
14#define MLOG_MASK_PREFIX ML_QUOTA
15#include <cluster/masklog.h> 14#include <cluster/masklog.h>
16 15
17#include "ocfs2_fs.h" 16#include "ocfs2_fs.h"
@@ -27,6 +26,7 @@
27#include "super.h" 26#include "super.h"
28#include "buffer_head_io.h" 27#include "buffer_head_io.h"
29#include "quota.h" 28#include "quota.h"
29#include "ocfs2_trace.h"
30 30
31/* 31/*
32 * Locking of quotas with OCFS2 is rather complex. Here are rules that 32 * Locking of quotas with OCFS2 is rather complex. Here are rules that
@@ -63,8 +63,6 @@
63 * write to gf 63 * write to gf
64 */ 64 */
65 65
66static struct workqueue_struct *ocfs2_quota_wq = NULL;
67
68static void qsync_work_fn(struct work_struct *work); 66static void qsync_work_fn(struct work_struct *work);
69 67
70static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) 68static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
@@ -132,8 +130,7 @@ int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
132 struct ocfs2_disk_dqtrailer *dqt = 130 struct ocfs2_disk_dqtrailer *dqt =
133 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); 131 ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
134 132
135 mlog(0, "Validating quota block %llu\n", 133 trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
136 (unsigned long long)bh->b_blocknr);
137 134
138 BUG_ON(!buffer_uptodate(bh)); 135 BUG_ON(!buffer_uptodate(bh));
139 136
@@ -343,8 +340,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
343 u64 pcount; 340 u64 pcount;
344 int status; 341 int status;
345 342
346 mlog_entry_void();
347
348 /* Read global header */ 343 /* Read global header */
349 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], 344 gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
350 OCFS2_INVALID_SLOT); 345 OCFS2_INVALID_SLOT);
@@ -400,11 +395,12 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
400 OCFS2_QBLK_RESERVED_SPACE; 395 OCFS2_QBLK_RESERVED_SPACE;
401 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); 396 oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
402 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); 397 INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
403 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 398 schedule_delayed_work(&oinfo->dqi_sync_work,
404 msecs_to_jiffies(oinfo->dqi_syncms)); 399 msecs_to_jiffies(oinfo->dqi_syncms));
405 400
406out_err: 401out_err:
407 mlog_exit(status); 402 if (status)
403 mlog_errno(status);
408 return status; 404 return status;
409out_unlock: 405out_unlock:
410 ocfs2_unlock_global_qf(oinfo, 0); 406 ocfs2_unlock_global_qf(oinfo, 0);
@@ -510,9 +506,10 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
510 olditime = dquot->dq_dqb.dqb_itime; 506 olditime = dquot->dq_dqb.dqb_itime;
511 oldbtime = dquot->dq_dqb.dqb_btime; 507 oldbtime = dquot->dq_dqb.dqb_btime;
512 ocfs2_global_disk2memdqb(dquot, &dqblk); 508 ocfs2_global_disk2memdqb(dquot, &dqblk);
513 mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n", 509 trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace,
514 dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange, 510 (long long)spacechange,
515 dquot->dq_dqb.dqb_curinodes, (long long)inodechange); 511 dquot->dq_dqb.dqb_curinodes,
512 (long long)inodechange);
516 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) 513 if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
517 dquot->dq_dqb.dqb_curspace += spacechange; 514 dquot->dq_dqb.dqb_curspace += spacechange;
518 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) 515 if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
@@ -559,7 +556,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
559 spin_unlock(&dq_data_lock); 556 spin_unlock(&dq_data_lock);
560 err = ocfs2_qinfo_lock(info, freeing); 557 err = ocfs2_qinfo_lock(info, freeing);
561 if (err < 0) { 558 if (err < 0) {
562 mlog(ML_ERROR, "Failed to lock quota info, loosing quota write" 559 mlog(ML_ERROR, "Failed to lock quota info, losing quota write"
563 " (type=%d, id=%u)\n", dquot->dq_type, 560 " (type=%d, id=%u)\n", dquot->dq_type,
564 (unsigned)dquot->dq_id); 561 (unsigned)dquot->dq_id);
565 goto out; 562 goto out;
@@ -596,8 +593,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
596 struct ocfs2_super *osb = OCFS2_SB(sb); 593 struct ocfs2_super *osb = OCFS2_SB(sb);
597 int status = 0; 594 int status = 0;
598 595
599 mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id, 596 trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type,
600 dquot->dq_type, type, sb->s_id); 597 type, sb->s_id);
601 if (type != dquot->dq_type) 598 if (type != dquot->dq_type)
602 goto out; 599 goto out;
603 status = ocfs2_lock_global_qf(oinfo, 1); 600 status = ocfs2_lock_global_qf(oinfo, 1);
@@ -623,7 +620,6 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
623out_ilock: 620out_ilock:
624 ocfs2_unlock_global_qf(oinfo, 1); 621 ocfs2_unlock_global_qf(oinfo, 1);
625out: 622out:
626 mlog_exit(status);
627 return status; 623 return status;
628} 624}
629 625
@@ -635,8 +631,8 @@ static void qsync_work_fn(struct work_struct *work)
635 struct super_block *sb = oinfo->dqi_gqinode->i_sb; 631 struct super_block *sb = oinfo->dqi_gqinode->i_sb;
636 632
637 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); 633 dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type);
638 queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, 634 schedule_delayed_work(&oinfo->dqi_sync_work,
639 msecs_to_jiffies(oinfo->dqi_syncms)); 635 msecs_to_jiffies(oinfo->dqi_syncms));
640} 636}
641 637
642/* 638/*
@@ -649,7 +645,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
649 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); 645 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
650 int status = 0; 646 int status = 0;
651 647
652 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 648 trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type);
653 649
654 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); 650 handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
655 if (IS_ERR(handle)) { 651 if (IS_ERR(handle)) {
@@ -662,7 +658,6 @@ static int ocfs2_write_dquot(struct dquot *dquot)
662 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); 658 mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
663 ocfs2_commit_trans(osb, handle); 659 ocfs2_commit_trans(osb, handle);
664out: 660out:
665 mlog_exit(status);
666 return status; 661 return status;
667} 662}
668 663
@@ -688,7 +683,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
688 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); 683 struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
689 int status = 0; 684 int status = 0;
690 685
691 mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); 686 trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type);
692 687
693 mutex_lock(&dquot->dq_lock); 688 mutex_lock(&dquot->dq_lock);
694 /* Check whether we are not racing with some other dqget() */ 689 /* Check whether we are not racing with some other dqget() */
@@ -724,7 +719,8 @@ out_ilock:
724 ocfs2_unlock_global_qf(oinfo, 1); 719 ocfs2_unlock_global_qf(oinfo, 1);
725out: 720out:
726 mutex_unlock(&dquot->dq_lock); 721 mutex_unlock(&dquot->dq_lock);
727 mlog_exit(status); 722 if (status)
723 mlog_errno(status);
728 return status; 724 return status;
729} 725}
730 726
@@ -745,7 +741,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
745 int need_alloc = ocfs2_global_qinit_alloc(sb, type); 741 int need_alloc = ocfs2_global_qinit_alloc(sb, type);
746 handle_t *handle; 742 handle_t *handle;
747 743
748 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 744 trace_ocfs2_acquire_dquot(dquot->dq_id, type);
749 mutex_lock(&dquot->dq_lock); 745 mutex_lock(&dquot->dq_lock);
750 /* 746 /*
751 * We need an exclusive lock, because we're going to update use count 747 * We need an exclusive lock, because we're going to update use count
@@ -811,7 +807,8 @@ out_dq:
811 set_bit(DQ_ACTIVE_B, &dquot->dq_flags); 807 set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
812out: 808out:
813 mutex_unlock(&dquot->dq_lock); 809 mutex_unlock(&dquot->dq_lock);
814 mlog_exit(status); 810 if (status)
811 mlog_errno(status);
815 return status; 812 return status;
816} 813}
817 814
@@ -831,7 +828,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
831 handle_t *handle; 828 handle_t *handle;
832 struct ocfs2_super *osb = OCFS2_SB(sb); 829 struct ocfs2_super *osb = OCFS2_SB(sb);
833 830
834 mlog_entry("id=%u, type=%d", dquot->dq_id, type); 831 trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type);
835 832
836 /* In case user set some limits, sync dquot immediately to global 833 /* In case user set some limits, sync dquot immediately to global
837 * quota file so that information propagates quicker */ 834 * quota file so that information propagates quicker */
@@ -868,7 +865,8 @@ out_dlock:
868out_ilock: 865out_ilock:
869 ocfs2_unlock_global_qf(oinfo, 1); 866 ocfs2_unlock_global_qf(oinfo, 1);
870out: 867out:
871 mlog_exit(status); 868 if (status)
869 mlog_errno(status);
872 return status; 870 return status;
873} 871}
874 872
@@ -879,8 +877,6 @@ static int ocfs2_write_info(struct super_block *sb, int type)
879 int status = 0; 877 int status = 0;
880 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; 878 struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
881 879
882 mlog_entry_void();
883
884 status = ocfs2_lock_global_qf(oinfo, 1); 880 status = ocfs2_lock_global_qf(oinfo, 1);
885 if (status < 0) 881 if (status < 0)
886 goto out; 882 goto out;
@@ -895,7 +891,8 @@ static int ocfs2_write_info(struct super_block *sb, int type)
895out_ilock: 891out_ilock:
896 ocfs2_unlock_global_qf(oinfo, 1); 892 ocfs2_unlock_global_qf(oinfo, 1);
897out: 893out:
898 mlog_exit(status); 894 if (status)
895 mlog_errno(status);
899 return status; 896 return status;
900} 897}
901 898
@@ -923,20 +920,3 @@ const struct dquot_operations ocfs2_quota_operations = {
923 .alloc_dquot = ocfs2_alloc_dquot, 920 .alloc_dquot = ocfs2_alloc_dquot,
924 .destroy_dquot = ocfs2_destroy_dquot, 921 .destroy_dquot = ocfs2_destroy_dquot,
925}; 922};
926
927int ocfs2_quota_setup(void)
928{
929 ocfs2_quota_wq = create_workqueue("o2quot");
930 if (!ocfs2_quota_wq)
931 return -ENOMEM;
932 return 0;
933}
934
935void ocfs2_quota_shutdown(void)
936{
937 if (ocfs2_quota_wq) {
938 flush_workqueue(ocfs2_quota_wq);
939 destroy_workqueue(ocfs2_quota_wq);
940 ocfs2_quota_wq = NULL;
941 }
942}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc78764ccc4c..dc8007fc9247 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -8,7 +8,6 @@
8#include <linux/quotaops.h> 8#include <linux/quotaops.h>
9#include <linux/module.h> 9#include <linux/module.h>
10 10
11#define MLOG_MASK_PREFIX ML_QUOTA
12#include <cluster/masklog.h> 11#include <cluster/masklog.h>
13 12
14#include "ocfs2_fs.h" 13#include "ocfs2_fs.h"
@@ -23,6 +22,7 @@
23#include "quota.h" 22#include "quota.h"
24#include "uptodate.h" 23#include "uptodate.h"
25#include "super.h" 24#include "super.h"
25#include "ocfs2_trace.h"
26 26
27/* Number of local quota structures per block */ 27/* Number of local quota structures per block */
28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) 28static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -475,7 +475,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
475 struct ocfs2_recovery_chunk *rchunk, *next; 475 struct ocfs2_recovery_chunk *rchunk, *next;
476 qsize_t spacechange, inodechange; 476 qsize_t spacechange, inodechange;
477 477
478 mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); 478 trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
479 479
480 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { 480 list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
481 chunk = rchunk->rc_chunk; 481 chunk = rchunk->rc_chunk;
@@ -575,7 +575,8 @@ out_put_bh:
575 } 575 }
576 if (status < 0) 576 if (status < 0)
577 free_recovery_list(&(rec->r_list[type])); 577 free_recovery_list(&(rec->r_list[type]));
578 mlog_exit(status); 578 if (status)
579 mlog_errno(status);
579 return status; 580 return status;
580} 581}
581 582
@@ -600,7 +601,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
600 for (type = 0; type < MAXQUOTAS; type++) { 601 for (type = 0; type < MAXQUOTAS; type++) {
601 if (list_empty(&(rec->r_list[type]))) 602 if (list_empty(&(rec->r_list[type])))
602 continue; 603 continue;
603 mlog(0, "Recovering quota in slot %d\n", slot_num); 604 trace_ocfs2_finish_quota_recovery(slot_num);
604 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); 605 lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
605 if (!lqinode) { 606 if (!lqinode) {
606 status = -ENOENT; 607 status = -ENOENT;
@@ -882,9 +883,10 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
882 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - 883 dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
883 od->dq_originodes); 884 od->dq_originodes);
884 spin_unlock(&dq_data_lock); 885 spin_unlock(&dq_data_lock);
885 mlog(0, "Writing local dquot %u space %lld inodes %lld\n", 886 trace_olq_set_dquot(
886 od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod), 887 (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
887 (long long)le64_to_cpu(dqblk->dqb_inodemod)); 888 (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
889 od->dq_dquot.dq_id);
888} 890}
889 891
890/* Write dquot to local quota file */ 892/* Write dquot to local quota file */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..5d32749c896d 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -16,7 +16,6 @@
16 */ 16 */
17 17
18#include <linux/sort.h> 18#include <linux/sort.h>
19#define MLOG_MASK_PREFIX ML_REFCOUNT
20#include <cluster/masklog.h> 19#include <cluster/masklog.h>
21#include "ocfs2.h" 20#include "ocfs2.h"
22#include "inode.h" 21#include "inode.h"
@@ -34,6 +33,7 @@
34#include "aops.h" 33#include "aops.h"
35#include "xattr.h" 34#include "xattr.h"
36#include "namei.h" 35#include "namei.h"
36#include "ocfs2_trace.h"
37 37
38#include <linux/bio.h> 38#include <linux/bio.h>
39#include <linux/blkdev.h> 39#include <linux/blkdev.h>
@@ -84,8 +84,7 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
84 struct ocfs2_refcount_block *rb = 84 struct ocfs2_refcount_block *rb =
85 (struct ocfs2_refcount_block *)bh->b_data; 85 (struct ocfs2_refcount_block *)bh->b_data;
86 86
87 mlog(0, "Validating refcount block %llu\n", 87 trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
88 (unsigned long long)bh->b_blocknr);
89 88
90 BUG_ON(!buffer_uptodate(bh)); 89 BUG_ON(!buffer_uptodate(bh));
91 90
@@ -545,8 +544,8 @@ void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
545 while ((node = rb_last(root)) != NULL) { 544 while ((node = rb_last(root)) != NULL) {
546 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); 545 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
547 546
548 mlog(0, "Purge tree %llu\n", 547 trace_ocfs2_purge_refcount_trees(
549 (unsigned long long) tree->rf_blkno); 548 (unsigned long long) tree->rf_blkno);
550 549
551 rb_erase(&tree->rf_node, root); 550 rb_erase(&tree->rf_node, root);
552 ocfs2_free_refcount_tree(tree); 551 ocfs2_free_refcount_tree(tree);
@@ -575,7 +574,8 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
575 574
576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 575 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
577 576
578 mlog(0, "create tree for inode %lu\n", inode->i_ino); 577 trace_ocfs2_create_refcount_tree(
578 (unsigned long long)OCFS2_I(inode)->ip_blkno);
579 579
580 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 580 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
581 if (ret) { 581 if (ret) {
@@ -646,8 +646,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
646 di->i_refcount_loc = cpu_to_le64(first_blkno); 646 di->i_refcount_loc = cpu_to_le64(first_blkno);
647 spin_unlock(&oi->ip_lock); 647 spin_unlock(&oi->ip_lock);
648 648
649 mlog(0, "created tree for inode %lu, refblock %llu\n", 649 trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
650 inode->i_ino, (unsigned long long)first_blkno);
651 650
652 ocfs2_journal_dirty(handle, di_bh); 651 ocfs2_journal_dirty(handle, di_bh);
653 652
@@ -1256,8 +1255,9 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
1256 goto out; 1255 goto out;
1257 } 1256 }
1258 1257
1259 mlog(0, "change index %d, old count %u, change %d\n", index, 1258 trace_ocfs2_change_refcount_rec(
1260 le32_to_cpu(rec->r_refcount), change); 1259 (unsigned long long)ocfs2_metadata_cache_owner(ci),
1260 index, le32_to_cpu(rec->r_refcount), change);
1261 le32_add_cpu(&rec->r_refcount, change); 1261 le32_add_cpu(&rec->r_refcount, change);
1262 1262
1263 if (!rec->r_refcount) { 1263 if (!rec->r_refcount) {
@@ -1353,8 +1353,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
1353 1353
1354 ocfs2_journal_dirty(handle, ref_root_bh); 1354 ocfs2_journal_dirty(handle, ref_root_bh);
1355 1355
1356 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, 1356 trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
1357 le16_to_cpu(new_rb->rf_records.rl_used)); 1357 le16_to_cpu(new_rb->rf_records.rl_used));
1358 1358
1359 *ref_leaf_bh = new_bh; 1359 *ref_leaf_bh = new_bh;
1360 new_bh = NULL; 1360 new_bh = NULL;
@@ -1466,9 +1466,9 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
1466 (struct ocfs2_refcount_block *)new_bh->b_data; 1466 (struct ocfs2_refcount_block *)new_bh->b_data;
1467 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; 1467 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
1468 1468
1469 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", 1469 trace_ocfs2_divide_leaf_refcount_block(
1470 (unsigned long long)ref_leaf_bh->b_blocknr, 1470 (unsigned long long)ref_leaf_bh->b_blocknr,
1471 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); 1471 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
1472 1472
1473 /* 1473 /*
1474 * XXX: Improvement later. 1474 * XXX: Improvement later.
@@ -1601,8 +1601,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
1601 1601
1602 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); 1602 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
1603 1603
1604 mlog(0, "insert new leaf block %llu at %u\n", 1604 trace_ocfs2_new_leaf_refcount_block(
1605 (unsigned long long)new_bh->b_blocknr, new_cpos); 1605 (unsigned long long)new_bh->b_blocknr, new_cpos);
1606 1606
1607 /* Insert the new leaf block with the specific offset cpos. */ 1607 /* Insert the new leaf block with the specific offset cpos. */
1608 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, 1608 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
@@ -1794,11 +1794,10 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
1794 (le16_to_cpu(rf_list->rl_used) - index) * 1794 (le16_to_cpu(rf_list->rl_used) - index) *
1795 sizeof(struct ocfs2_refcount_rec)); 1795 sizeof(struct ocfs2_refcount_rec));
1796 1796
1797 mlog(0, "insert refcount record start %llu, len %u, count %u " 1797 trace_ocfs2_insert_refcount_rec(
1798 "to leaf block %llu at index %d\n", 1798 (unsigned long long)ref_leaf_bh->b_blocknr, index,
1799 (unsigned long long)le64_to_cpu(rec->r_cpos), 1799 (unsigned long long)le64_to_cpu(rec->r_cpos),
1800 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), 1800 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
1801 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1802 1801
1803 rf_list->rl_recs[index] = *rec; 1802 rf_list->rl_recs[index] = *rec;
1804 1803
@@ -1850,10 +1849,12 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1850 1849
1851 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); 1850 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
1852 1851
1853 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", 1852 trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
1854 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), 1853 le32_to_cpu(orig_rec->r_clusters),
1855 le64_to_cpu(split_rec->r_cpos), 1854 le32_to_cpu(orig_rec->r_refcount),
1856 le32_to_cpu(split_rec->r_clusters)); 1855 le64_to_cpu(split_rec->r_cpos),
1856 le32_to_cpu(split_rec->r_clusters),
1857 le32_to_cpu(split_rec->r_refcount));
1857 1858
1858 /* 1859 /*
1859 * If we just need to split the header or tail clusters, 1860 * If we just need to split the header or tail clusters,
@@ -1967,12 +1968,11 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
1967 1968
1968 if (split_rec->r_refcount) { 1969 if (split_rec->r_refcount) {
1969 rf_list->rl_recs[index] = *split_rec; 1970 rf_list->rl_recs[index] = *split_rec;
1970 mlog(0, "insert refcount record start %llu, len %u, count %u " 1971 trace_ocfs2_split_refcount_rec_insert(
1971 "to leaf block %llu at index %d\n", 1972 (unsigned long long)ref_leaf_bh->b_blocknr, index,
1972 (unsigned long long)le64_to_cpu(split_rec->r_cpos), 1973 (unsigned long long)le64_to_cpu(split_rec->r_cpos),
1973 le32_to_cpu(split_rec->r_clusters), 1974 le32_to_cpu(split_rec->r_clusters),
1974 le32_to_cpu(split_rec->r_refcount), 1975 le32_to_cpu(split_rec->r_refcount));
1975 (unsigned long long)ref_leaf_bh->b_blocknr, index);
1976 1976
1977 if (merge) 1977 if (merge)
1978 ocfs2_refcount_rec_merge(rb, index); 1978 ocfs2_refcount_rec_merge(rb, index);
@@ -1997,7 +1997,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
1997 struct ocfs2_refcount_rec rec; 1997 struct ocfs2_refcount_rec rec;
1998 unsigned int set_len = 0; 1998 unsigned int set_len = 0;
1999 1999
2000 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", 2000 trace_ocfs2_increase_refcount_begin(
2001 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2001 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2002 (unsigned long long)cpos, len); 2002 (unsigned long long)cpos, len);
2003 2003
@@ -2024,9 +2024,9 @@ static int __ocfs2_increase_refcount(handle_t *handle,
2024 */ 2024 */
2025 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && 2025 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
2026 set_len <= len) { 2026 set_len <= len) {
2027 mlog(0, "increase refcount rec, start %llu, len %u, " 2027 trace_ocfs2_increase_refcount_change(
2028 "count %u\n", (unsigned long long)cpos, set_len, 2028 (unsigned long long)cpos, set_len,
2029 le32_to_cpu(rec.r_refcount)); 2029 le32_to_cpu(rec.r_refcount));
2030 ret = ocfs2_change_refcount_rec(handle, ci, 2030 ret = ocfs2_change_refcount_rec(handle, ci,
2031 ref_leaf_bh, index, 2031 ref_leaf_bh, index,
2032 merge, 1); 2032 merge, 1);
@@ -2037,7 +2037,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
2037 } else if (!rec.r_refcount) { 2037 } else if (!rec.r_refcount) {
2038 rec.r_refcount = cpu_to_le32(1); 2038 rec.r_refcount = cpu_to_le32(1);
2039 2039
2040 mlog(0, "insert refcount rec, start %llu, len %u\n", 2040 trace_ocfs2_increase_refcount_insert(
2041 (unsigned long long)le64_to_cpu(rec.r_cpos), 2041 (unsigned long long)le64_to_cpu(rec.r_cpos),
2042 set_len); 2042 set_len);
2043 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, 2043 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
@@ -2055,8 +2055,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
2055 rec.r_clusters = cpu_to_le32(set_len); 2055 rec.r_clusters = cpu_to_le32(set_len);
2056 le32_add_cpu(&rec.r_refcount, 1); 2056 le32_add_cpu(&rec.r_refcount, 1);
2057 2057
2058 mlog(0, "split refcount rec, start %llu, " 2058 trace_ocfs2_increase_refcount_split(
2059 "len %u, count %u\n",
2060 (unsigned long long)le64_to_cpu(rec.r_cpos), 2059 (unsigned long long)le64_to_cpu(rec.r_cpos),
2061 set_len, le32_to_cpu(rec.r_refcount)); 2060 set_len, le32_to_cpu(rec.r_refcount));
2062 ret = ocfs2_split_refcount_rec(handle, ci, 2061 ret = ocfs2_split_refcount_rec(handle, ci,
@@ -2095,6 +2094,11 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2095 2094
2096 BUG_ON(rb->rf_records.rl_used); 2095 BUG_ON(rb->rf_records.rl_used);
2097 2096
2097 trace_ocfs2_remove_refcount_extent(
2098 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2099 (unsigned long long)ref_leaf_bh->b_blocknr,
2100 le32_to_cpu(rb->rf_cpos));
2101
2098 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 2102 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
2099 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 2103 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
2100 1, meta_ac, dealloc); 2104 1, meta_ac, dealloc);
@@ -2137,7 +2141,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
2137 if (!rb->rf_list.l_next_free_rec) { 2141 if (!rb->rf_list.l_next_free_rec) {
2138 BUG_ON(rb->rf_clusters); 2142 BUG_ON(rb->rf_clusters);
2139 2143
2140 mlog(0, "reset refcount tree root %llu to be a record block.\n", 2144 trace_ocfs2_restore_refcount_block(
2141 (unsigned long long)ref_root_bh->b_blocknr); 2145 (unsigned long long)ref_root_bh->b_blocknr);
2142 2146
2143 rb->rf_flags = 0; 2147 rb->rf_flags = 0;
@@ -2184,6 +2188,10 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
2184 BUG_ON(cpos + len > 2188 BUG_ON(cpos + len >
2185 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); 2189 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
2186 2190
2191 trace_ocfs2_decrease_refcount_rec(
2192 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2193 (unsigned long long)cpos, len);
2194
2187 if (cpos == le64_to_cpu(rec->r_cpos) && 2195 if (cpos == le64_to_cpu(rec->r_cpos) &&
2188 len == le32_to_cpu(rec->r_clusters)) 2196 len == le32_to_cpu(rec->r_clusters))
2189 ret = ocfs2_change_refcount_rec(handle, ci, 2197 ret = ocfs2_change_refcount_rec(handle, ci,
@@ -2195,12 +2203,6 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
2195 2203
2196 le32_add_cpu(&split.r_refcount, -1); 2204 le32_add_cpu(&split.r_refcount, -1);
2197 2205
2198 mlog(0, "split refcount rec, start %llu, "
2199 "len %u, count %u, original start %llu, len %u\n",
2200 (unsigned long long)le64_to_cpu(split.r_cpos),
2201 len, le32_to_cpu(split.r_refcount),
2202 (unsigned long long)le64_to_cpu(rec->r_cpos),
2203 le32_to_cpu(rec->r_clusters));
2204 ret = ocfs2_split_refcount_rec(handle, ci, 2206 ret = ocfs2_split_refcount_rec(handle, ci,
2205 ref_root_bh, ref_leaf_bh, 2207 ref_root_bh, ref_leaf_bh,
2206 &split, index, 1, 2208 &split, index, 1,
@@ -2239,10 +2241,9 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
2239 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2241 struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
2240 struct buffer_head *ref_leaf_bh = NULL; 2242 struct buffer_head *ref_leaf_bh = NULL;
2241 2243
2242 mlog(0, "Tree owner %llu, decrease refcount start %llu, " 2244 trace_ocfs2_decrease_refcount(
2243 "len %u, delete %u\n", 2245 (unsigned long long)ocfs2_metadata_cache_owner(ci),
2244 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2246 (unsigned long long)cpos, len, delete);
2245 (unsigned long long)cpos, len, delete);
2246 2247
2247 while (len) { 2248 while (len) {
2248 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2249 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -2352,8 +2353,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
2352{ 2353{
2353 int ret; 2354 int ret;
2354 2355
2355 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", 2356 trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
2356 inode->i_ino, cpos, len, phys); 2357 cpos, len, phys);
2357 2358
2358 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2359 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
2359 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2360 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -2392,8 +2393,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2392 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; 2393 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
2393 u32 len; 2394 u32 len;
2394 2395
2395 mlog(0, "start_cpos %llu, clusters %u\n",
2396 (unsigned long long)start_cpos, clusters);
2397 while (clusters) { 2396 while (clusters) {
2398 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2397 ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
2399 cpos, clusters, &rec, 2398 cpos, clusters, &rec,
@@ -2427,12 +2426,11 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2427 2426
2428 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2427 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2429 2428
2430 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," 2429 trace_ocfs2_calc_refcount_meta_credits_iterate(
2431 "rec->r_clusters %u, rec->r_refcount %u, index %d\n", 2430 recs_add, (unsigned long long)cpos, clusters,
2432 recs_add, (unsigned long long)cpos, clusters, 2431 (unsigned long long)le64_to_cpu(rec.r_cpos),
2433 (unsigned long long)le64_to_cpu(rec.r_cpos), 2432 le32_to_cpu(rec.r_clusters),
2434 le32_to_cpu(rec.r_clusters), 2433 le32_to_cpu(rec.r_refcount), index);
2435 le32_to_cpu(rec.r_refcount), index);
2436 2434
2437 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2435 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
2438 le32_to_cpu(rec.r_clusters)) - cpos; 2436 le32_to_cpu(rec.r_clusters)) - cpos;
@@ -2488,7 +2486,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2488 if (!ref_blocks) 2486 if (!ref_blocks)
2489 goto out; 2487 goto out;
2490 2488
2491 mlog(0, "we need ref_blocks %d\n", ref_blocks);
2492 *meta_add += ref_blocks; 2489 *meta_add += ref_blocks;
2493 *credits += ref_blocks; 2490 *credits += ref_blocks;
2494 2491
@@ -2514,6 +2511,10 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2514 } 2511 }
2515 2512
2516out: 2513out:
2514
2515 trace_ocfs2_calc_refcount_meta_credits(
2516 (unsigned long long)start_cpos, clusters,
2517 *meta_add, *credits);
2517 brelse(ref_leaf_bh); 2518 brelse(ref_leaf_bh);
2518 brelse(prev_bh); 2519 brelse(prev_bh);
2519 return ret; 2520 return ret;
@@ -2578,8 +2579,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
2578 goto out; 2579 goto out;
2579 } 2580 }
2580 2581
2581 mlog(0, "reserve new metadata %d blocks, credits = %d\n", 2582 trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
2582 *ref_blocks, *credits);
2583 2583
2584out: 2584out:
2585 brelse(ref_root_bh); 2585 brelse(ref_root_bh);
@@ -2886,8 +2886,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
2886 goto out; 2886 goto out;
2887 } 2887 }
2888 2888
2889 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", 2889 trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
2890 meta_add, num_clusters, *credits);
2891 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, 2890 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
2892 meta_ac); 2891 meta_ac);
2893 if (ret) { 2892 if (ret) {
@@ -2937,8 +2936,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
2937 loff_t offset, end, map_end; 2936 loff_t offset, end, map_end;
2938 struct address_space *mapping = context->inode->i_mapping; 2937 struct address_space *mapping = context->inode->i_mapping;
2939 2938
2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2939 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
2941 new_cluster, new_len, cpos); 2940 new_cluster, new_len);
2942 2941
2943 readahead_pages = 2942 readahead_pages =
2944 (ocfs2_cow_contig_clusters(sb) << 2943 (ocfs2_cow_contig_clusters(sb) <<
@@ -3031,8 +3030,8 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
3031 struct buffer_head *old_bh = NULL; 3030 struct buffer_head *old_bh = NULL;
3032 struct buffer_head *new_bh = NULL; 3031 struct buffer_head *new_bh = NULL;
3033 3032
3034 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, 3033 trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
3035 new_cluster, new_len); 3034 new_cluster, new_len);
3036 3035
3037 for (i = 0; i < blocks; i++, old_block++, new_block++) { 3036 for (i = 0; i < blocks; i++, old_block++, new_block++) {
3038 new_bh = sb_getblk(osb->sb, new_block); 3037 new_bh = sb_getblk(osb->sb, new_block);
@@ -3085,8 +3084,8 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
3085 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 3084 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
3086 u64 ino = ocfs2_metadata_cache_owner(et->et_ci); 3085 u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
3087 3086
3088 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", 3087 trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
3089 (unsigned long long)ino, cpos, len, p_cluster, ext_flags); 3088 cpos, len, p_cluster, ext_flags);
3090 3089
3091 memset(&replace_rec, 0, sizeof(replace_rec)); 3090 memset(&replace_rec, 0, sizeof(replace_rec));
3092 replace_rec.e_cpos = cpu_to_le32(cpos); 3091 replace_rec.e_cpos = cpu_to_le32(cpos);
@@ -3141,8 +3140,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
3141 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3140 struct ocfs2_caching_info *ci = context->data_et.et_ci;
3142 u64 ino = ocfs2_metadata_cache_owner(ci); 3141 u64 ino = ocfs2_metadata_cache_owner(ci);
3143 3142
3144 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", 3143 trace_ocfs2_replace_clusters((unsigned long long)ino,
3145 (unsigned long long)ino, cpos, old, new, len, ext_flags); 3144 cpos, old, new, len, ext_flags);
3146 3145
3147 /*If the old clusters is unwritten, no need to duplicate. */ 3146 /*If the old clusters is unwritten, no need to duplicate. */
3148 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3147 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
@@ -3228,7 +3227,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3228 u32 num_clusters, unsigned int e_flags) 3227 u32 num_clusters, unsigned int e_flags)
3229{ 3228{
3230 int ret, delete, index, credits = 0; 3229 int ret, delete, index, credits = 0;
3231 u32 new_bit, new_len; 3230 u32 new_bit, new_len, orig_num_clusters;
3232 unsigned int set_len; 3231 unsigned int set_len;
3233 struct ocfs2_super *osb = OCFS2_SB(sb); 3232 struct ocfs2_super *osb = OCFS2_SB(sb);
3234 handle_t *handle; 3233 handle_t *handle;
@@ -3236,8 +3235,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3236 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; 3235 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
3237 struct ocfs2_refcount_rec rec; 3236 struct ocfs2_refcount_rec rec;
3238 3237
3239 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", 3238 trace_ocfs2_make_clusters_writable(cpos, p_cluster,
3240 cpos, p_cluster, num_clusters, e_flags); 3239 num_clusters, e_flags);
3241 3240
3242 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, 3241 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
3243 &context->data_et, 3242 &context->data_et,
@@ -3261,6 +3260,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3261 goto out; 3260 goto out;
3262 } 3261 }
3263 3262
3263 orig_num_clusters = num_clusters;
3264
3264 while (num_clusters) { 3265 while (num_clusters) {
3265 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3266 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
3266 p_cluster, num_clusters, 3267 p_cluster, num_clusters,
@@ -3348,7 +3349,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
3348 * in write-back mode. 3349 * in write-back mode.
3349 */ 3350 */
3350 if (context->get_clusters == ocfs2_di_get_clusters) { 3351 if (context->get_clusters == ocfs2_di_get_clusters) {
3351 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3352 ret = ocfs2_cow_sync_writeback(sb, context, cpos,
3353 orig_num_clusters);
3352 if (ret) 3354 if (ret)
3353 mlog_errno(ret); 3355 mlog_errno(ret);
3354 } 3356 }
@@ -3472,9 +3474,9 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
3472 goto out; 3474 goto out;
3473 } 3475 }
3474 3476
3475 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " 3477 trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
3476 "cow_len %u\n", inode->i_ino, 3478 cpos, write_len, max_cpos,
3477 cpos, write_len, cow_start, cow_len); 3479 cow_start, cow_len);
3478 3480
3479 BUG_ON(cow_len == 0); 3481 BUG_ON(cow_len == 0);
3480 3482
@@ -3753,8 +3755,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
3753 goto out; 3755 goto out;
3754 } 3756 }
3755 3757
3756 mlog(0, "reserve new metadata %d, credits = %d\n", 3758 trace_ocfs2_add_refcount_flag(ref_blocks, credits);
3757 ref_blocks, credits);
3758 3759
3759 if (ref_blocks) { 3760 if (ref_blocks) {
3760 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 3761 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
@@ -4325,7 +4326,8 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
4325 4326
4326 /* If the security isn't preserved, we need to re-initialize them. */ 4327 /* If the security isn't preserved, we need to re-initialize them. */
4327 if (!preserve) { 4328 if (!preserve) {
4328 error = ocfs2_init_security_and_acl(dir, new_orphan_inode); 4329 error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
4330 &new_dentry->d_name);
4329 if (error) 4331 if (error)
4330 mlog_errno(error); 4332 mlog_errno(error);
4331 } 4333 }
@@ -4376,7 +4378,7 @@ static int ocfs2_user_path_parent(const char __user *path,
4376 if (IS_ERR(s)) 4378 if (IS_ERR(s))
4377 return PTR_ERR(s); 4379 return PTR_ERR(s);
4378 4380
4379 error = path_lookup(s, LOOKUP_PARENT, nd); 4381 error = kern_path_parent(s, nd);
4380 if (error) 4382 if (error)
4381 putname(s); 4383 putname(s);
4382 else 4384 else
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 3e78db361bc7..41ffd36c689c 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -30,10 +30,10 @@
30#include <linux/bitops.h> 30#include <linux/bitops.h>
31#include <linux/list.h> 31#include <linux/list.h>
32 32
33#define MLOG_MASK_PREFIX ML_RESERVATIONS
34#include <cluster/masklog.h> 33#include <cluster/masklog.h>
35 34
36#include "ocfs2.h" 35#include "ocfs2.h"
36#include "ocfs2_trace.h"
37 37
38#ifdef CONFIG_OCFS2_DEBUG_FS 38#ifdef CONFIG_OCFS2_DEBUG_FS
39#define OCFS2_CHECK_RESERVATIONS 39#define OCFS2_CHECK_RESERVATIONS
@@ -321,8 +321,7 @@ static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
321 321
322 assert_spin_locked(&resv_lock); 322 assert_spin_locked(&resv_lock);
323 323
324 mlog(0, "Insert reservation start: %u len: %u\n", new->r_start, 324 trace_ocfs2_resv_insert(new->r_start, new->r_len);
325 new->r_len);
326 325
327 while (*p) { 326 while (*p) {
328 parent = *p; 327 parent = *p;
@@ -423,8 +422,8 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
423 unsigned int best_start, best_len = 0; 422 unsigned int best_start, best_len = 0;
424 int offset, start, found; 423 int offset, start, found;
425 424
426 mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n", 425 trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
427 wanted, search_start, search_len, resmap->m_bitmap_len); 426 wanted, resmap->m_bitmap_len);
428 427
429 found = best_start = best_len = 0; 428 found = best_start = best_len = 0;
430 429
@@ -463,7 +462,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
463 *rlen = best_len; 462 *rlen = best_len;
464 *rstart = best_start; 463 *rstart = best_start;
465 464
466 mlog(0, "Found start: %u len: %u\n", best_start, best_len); 465 trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
467 466
468 return *rlen; 467 return *rlen;
469} 468}
@@ -487,9 +486,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
487 * - our window should be last in all reservations 486 * - our window should be last in all reservations
488 * - need to make sure we don't go past end of bitmap 487 * - need to make sure we don't go past end of bitmap
489 */ 488 */
490 489 trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
491 mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n", 490 goal, wanted, RB_EMPTY_ROOT(root));
492 resv->r_start, ocfs2_resv_end(resv), goal, wanted);
493 491
494 assert_spin_locked(&resv_lock); 492 assert_spin_locked(&resv_lock);
495 493
@@ -498,9 +496,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
498 * Easiest case - empty tree. We can just take 496 * Easiest case - empty tree. We can just take
499 * whatever window of free bits we want. 497 * whatever window of free bits we want.
500 */ 498 */
501
502 mlog(0, "Empty root\n");
503
504 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal, 499 clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
505 resmap->m_bitmap_len - goal, 500 resmap->m_bitmap_len - goal,
506 &cstart, &clen); 501 &cstart, &clen);
@@ -524,8 +519,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
524 prev_resv = ocfs2_find_resv_lhs(resmap, goal); 519 prev_resv = ocfs2_find_resv_lhs(resmap, goal);
525 520
526 if (prev_resv == NULL) { 521 if (prev_resv == NULL) {
527 mlog(0, "Goal on LHS of leftmost window\n");
528
529 /* 522 /*
530 * A NULL here means that the search code couldn't 523 * A NULL here means that the search code couldn't
531 * find a window that starts before goal. 524 * find a window that starts before goal.
@@ -570,13 +563,15 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
570 next_resv = NULL; 563 next_resv = NULL;
571 } 564 }
572 565
566 trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
567 ocfs2_resv_end(prev_resv));
568
573 prev = &prev_resv->r_node; 569 prev = &prev_resv->r_node;
574 570
575 /* Now we do a linear search for a window, starting at 'prev_rsv' */ 571 /* Now we do a linear search for a window, starting at 'prev_rsv' */
576 while (1) { 572 while (1) {
577 next = rb_next(prev); 573 next = rb_next(prev);
578 if (next) { 574 if (next) {
579 mlog(0, "One more resv found in linear search\n");
580 next_resv = rb_entry(next, 575 next_resv = rb_entry(next,
581 struct ocfs2_alloc_reservation, 576 struct ocfs2_alloc_reservation,
582 r_node); 577 r_node);
@@ -585,7 +580,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
585 gap_end = next_resv->r_start - 1; 580 gap_end = next_resv->r_start - 1;
586 gap_len = gap_end - gap_start + 1; 581 gap_len = gap_end - gap_start + 1;
587 } else { 582 } else {
588 mlog(0, "No next node\n");
589 /* 583 /*
590 * We're at the rightmost edge of the 584 * We're at the rightmost edge of the
591 * tree. See if a reservation between this 585 * tree. See if a reservation between this
@@ -596,6 +590,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
596 gap_end = resmap->m_bitmap_len - 1; 590 gap_end = resmap->m_bitmap_len - 1;
597 } 591 }
598 592
593 trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
594 next ? ocfs2_resv_end(next_resv) : -1);
599 /* 595 /*
600 * No need to check this gap if we have already found 596 * No need to check this gap if we have already found
601 * a larger region of free bits. 597 * a larger region of free bits.
@@ -654,8 +650,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
654 lru_resv = list_first_entry(&resmap->m_lru, 650 lru_resv = list_first_entry(&resmap->m_lru,
655 struct ocfs2_alloc_reservation, r_lru); 651 struct ocfs2_alloc_reservation, r_lru);
656 652
657 mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start, 653 trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
658 lru_resv->r_len, ocfs2_resv_end(lru_resv)); 654 lru_resv->r_len,
655 ocfs2_resv_end(lru_resv));
659 656
660 /* 657 /*
661 * Cannibalize (some or all) of the target reservation and 658 * Cannibalize (some or all) of the target reservation and
@@ -684,10 +681,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
684 resv->r_len = shrink; 681 resv->r_len = shrink;
685 } 682 }
686 683
687 mlog(0, "Reservation now looks like: r_start: %u r_end: %u " 684 trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
688 "r_len: %u r_last_start: %u r_last_len: %u\n", 685 resv->r_len, resv->r_last_start,
689 resv->r_start, ocfs2_resv_end(resv), resv->r_len, 686 resv->r_last_len);
690 resv->r_last_start, resv->r_last_len);
691 687
692 ocfs2_resv_insert(resmap, resv); 688 ocfs2_resv_insert(resmap, resv);
693} 689}
@@ -748,7 +744,6 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
748 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen) 744 if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
749 wanted = *clen; 745 wanted = *clen;
750 746
751 mlog(0, "empty reservation, find new window\n");
752 /* 747 /*
753 * Try to get a window here. If it works, we must fall 748 * Try to get a window here. If it works, we must fall
754 * through and test the bitmap . This avoids some 749 * through and test the bitmap . This avoids some
@@ -757,6 +752,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
757 * that inode. 752 * that inode.
758 */ 753 */
759 ocfs2_resv_find_window(resmap, resv, wanted); 754 ocfs2_resv_find_window(resmap, resv, wanted);
755 trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
760 } 756 }
761 757
762 BUG_ON(ocfs2_resv_empty(resv)); 758 BUG_ON(ocfs2_resv_empty(resv));
@@ -813,10 +809,10 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
813 809
814 spin_lock(&resv_lock); 810 spin_lock(&resv_lock);
815 811
816 mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u " 812 trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
817 "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n", 813 ocfs2_resv_end(resv), resv->r_len,
818 cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv), 814 resv->r_last_start,
819 resv->r_len, resv->r_last_start, resv->r_last_len); 815 resv->r_last_len);
820 816
821 BUG_ON(cstart < resv->r_start); 817 BUG_ON(cstart < resv->r_start);
822 BUG_ON(cstart > ocfs2_resv_end(resv)); 818 BUG_ON(cstart > ocfs2_resv_end(resv));
@@ -833,10 +829,9 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
833 if (!ocfs2_resv_empty(resv)) 829 if (!ocfs2_resv_empty(resv))
834 ocfs2_resv_mark_lru(resmap, resv); 830 ocfs2_resv_mark_lru(resmap, resv);
835 831
836 mlog(0, "Reservation now looks like: r_start: %u r_end: %u " 832 trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
837 "r_len: %u r_last_start: %u r_last_len: %u\n", 833 resv->r_len, resv->r_last_start,
838 resv->r_start, ocfs2_resv_end(resv), resv->r_len, 834 resv->r_last_len);
839 resv->r_last_start, resv->r_last_len);
840 835
841 ocfs2_check_resmap(resmap); 836 ocfs2_check_resmap(resmap);
842 837
diff --git a/fs/ocfs2/reservations.h b/fs/ocfs2/reservations.h
index 1e49cc29d06c..42c2b804f3fd 100644
--- a/fs/ocfs2/reservations.h
+++ b/fs/ocfs2/reservations.h
@@ -29,7 +29,7 @@
29struct ocfs2_alloc_reservation { 29struct ocfs2_alloc_reservation {
30 struct rb_node r_node; 30 struct rb_node r_node;
31 31
32 unsigned int r_start; /* Begining of current window */ 32 unsigned int r_start; /* Beginning of current window */
33 unsigned int r_len; /* Length of the window */ 33 unsigned int r_len; /* Length of the window */
34 34
35 unsigned int r_last_len; /* Length of most recent alloc */ 35 unsigned int r_last_len; /* Length of most recent alloc */
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index dacd553d8617..ec55add7604a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -27,7 +27,6 @@
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29 29
30#define MLOG_MASK_PREFIX ML_DISK_ALLOC
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
32 31
33#include "ocfs2.h" 32#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "super.h" 38#include "super.h"
40#include "sysfile.h" 39#include "sysfile.h"
41#include "uptodate.h" 40#include "uptodate.h"
41#include "ocfs2_trace.h"
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44#include "suballoc.h" 44#include "suballoc.h"
@@ -82,7 +82,6 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
82 backups++; 82 backups++;
83 } 83 }
84 84
85 mlog_exit_void();
86 return backups; 85 return backups;
87} 86}
88 87
@@ -103,8 +102,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
103 u16 cl_bpc = le16_to_cpu(cl->cl_bpc); 102 u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
104 u16 cl_cpg = le16_to_cpu(cl->cl_cpg); 103 u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
105 104
106 mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", 105 trace_ocfs2_update_last_group_and_inode(new_clusters,
107 new_clusters, first_new_cluster); 106 first_new_cluster);
108 107
109 ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode), 108 ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
110 group_bh, OCFS2_JOURNAL_ACCESS_WRITE); 109 group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -176,7 +175,8 @@ out_rollback:
176 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits); 175 le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
177 } 176 }
178out: 177out:
179 mlog_exit(ret); 178 if (ret)
179 mlog_errno(ret);
180 return ret; 180 return ret;
181} 181}
182 182
@@ -281,8 +281,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
281 u32 first_new_cluster; 281 u32 first_new_cluster;
282 u64 lgd_blkno; 282 u64 lgd_blkno;
283 283
284 mlog_entry_void();
285
286 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 284 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
287 return -EROFS; 285 return -EROFS;
288 286
@@ -342,7 +340,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
342 goto out_unlock; 340 goto out_unlock;
343 } 341 }
344 342
345 mlog(0, "extend the last group at %llu, new clusters = %d\n", 343
344 trace_ocfs2_group_extend(
346 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters); 345 (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
347 346
348 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS); 347 handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
@@ -377,7 +376,6 @@ out_mutex:
377 iput(main_bm_inode); 376 iput(main_bm_inode);
378 377
379out: 378out:
380 mlog_exit_void();
381 return ret; 379 return ret;
382} 380}
383 381
@@ -472,8 +470,6 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
472 struct ocfs2_chain_rec *cr; 470 struct ocfs2_chain_rec *cr;
473 u16 cl_bpc; 471 u16 cl_bpc;
474 472
475 mlog_entry_void();
476
477 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 473 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
478 return -EROFS; 474 return -EROFS;
479 475
@@ -520,8 +516,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
520 goto out_unlock; 516 goto out_unlock;
521 } 517 }
522 518
523 mlog(0, "Add a new group %llu in chain = %u, length = %u\n", 519 trace_ocfs2_group_add((unsigned long long)input->group,
524 (unsigned long long)input->group, input->chain, input->clusters); 520 input->chain, input->clusters, input->frees);
525 521
526 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS); 522 handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
527 if (IS_ERR(handle)) { 523 if (IS_ERR(handle)) {
@@ -589,6 +585,5 @@ out_mutex:
589 iput(main_bm_inode); 585 iput(main_bm_inode);
590 586
591out: 587out:
592 mlog_exit_void();
593 return ret; 588 return ret;
594} 589}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index ab4e0172cc1d..26fc0014d509 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -27,7 +27,6 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29 29
30#define MLOG_MASK_PREFIX ML_SUPER
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
32 31
33#include "ocfs2.h" 32#include "ocfs2.h"
@@ -39,6 +38,7 @@
39#include "slot_map.h" 38#include "slot_map.h"
40#include "super.h" 39#include "super.h"
41#include "sysfile.h" 40#include "sysfile.h"
41#include "ocfs2_trace.h"
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
@@ -142,8 +142,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
142 BUG_ON(si->si_blocks == 0); 142 BUG_ON(si->si_blocks == 0);
143 BUG_ON(si->si_bh == NULL); 143 BUG_ON(si->si_bh == NULL);
144 144
145 mlog(0, "Refreshing slot map, reading %u block(s)\n", 145 trace_ocfs2_refresh_slot_info(si->si_blocks);
146 si->si_blocks);
147 146
148 /* 147 /*
149 * We pass -1 as blocknr because we expect all of si->si_bh to 148 * We pass -1 as blocknr because we expect all of si->si_bh to
@@ -381,8 +380,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
381 /* The size checks above should ensure this */ 380 /* The size checks above should ensure this */
382 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); 381 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
383 382
384 mlog(0, "Slot map needs %u buffers for %llu bytes\n", 383 trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
385 si->si_blocks, bytes);
386 384
387 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, 385 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
388 GFP_KERNEL); 386 GFP_KERNEL);
@@ -400,8 +398,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
400 goto bail; 398 goto bail;
401 } 399 }
402 400
403 mlog(0, "Reading slot map block %u at %llu\n", i, 401 trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
404 (unsigned long long)blkno);
405 402
406 bh = NULL; /* Acquire a fresh bh */ 403 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno, 404 status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
@@ -475,8 +472,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
475 int slot; 472 int slot;
476 struct ocfs2_slot_info *si; 473 struct ocfs2_slot_info *si;
477 474
478 mlog_entry_void();
479
480 si = osb->slot_info; 475 si = osb->slot_info;
481 476
482 spin_lock(&osb->osb_lock); 477 spin_lock(&osb->osb_lock);
@@ -505,14 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
505 osb->slot_num = slot; 500 osb->slot_num = slot;
506 spin_unlock(&osb->osb_lock); 501 spin_unlock(&osb->osb_lock);
507 502
508 mlog(0, "taking node slot %d\n", osb->slot_num); 503 trace_ocfs2_find_slot(osb->slot_num);
509 504
510 status = ocfs2_update_disk_slot(osb, si, osb->slot_num); 505 status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
511 if (status < 0) 506 if (status < 0)
512 mlog_errno(status); 507 mlog_errno(status);
513 508
514bail: 509bail:
515 mlog_exit(status);
516 return status; 510 return status;
517} 511}
518 512
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
index 8ce7398ae1d2..1ec56fdb8d0d 100644
--- a/fs/ocfs2/stackglue.h
+++ b/fs/ocfs2/stackglue.h
@@ -126,7 +126,7 @@ struct ocfs2_stack_operations {
126 * 126 *
127 * ->connect() must not return until it is guaranteed that 127 * ->connect() must not return until it is guaranteed that
128 * 128 *
129 * - Node down notifications for the filesystem will be recieved 129 * - Node down notifications for the filesystem will be received
130 * and passed to conn->cc_recovery_handler(). 130 * and passed to conn->cc_recovery_handler().
131 * - Locking requests for the filesystem will be processed. 131 * - Locking requests for the filesystem will be processed.
132 */ 132 */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 71998d4d61d5..ba5d97e4a73e 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,7 +29,6 @@
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31 31
32#define MLOG_MASK_PREFIX ML_DISK_ALLOC
33#include <cluster/masklog.h> 32#include <cluster/masklog.h>
34 33
35#include "ocfs2.h" 34#include "ocfs2.h"
@@ -44,6 +43,7 @@
44#include "super.h" 43#include "super.h"
45#include "sysfile.h" 44#include "sysfile.h"
46#include "uptodate.h" 45#include "uptodate.h"
46#include "ocfs2_trace.h"
47 47
48#include "buffer_head_io.h" 48#include "buffer_head_io.h"
49 49
@@ -308,8 +308,8 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb,
308 int rc; 308 int rc;
309 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 309 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
310 310
311 mlog(0, "Validating group descriptor %llu\n", 311 trace_ocfs2_validate_group_descriptor(
312 (unsigned long long)bh->b_blocknr); 312 (unsigned long long)bh->b_blocknr);
313 313
314 BUG_ON(!buffer_uptodate(bh)); 314 BUG_ON(!buffer_uptodate(bh));
315 315
@@ -389,8 +389,6 @@ static int ocfs2_block_group_fill(handle_t *handle,
389 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; 389 struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
390 struct super_block * sb = alloc_inode->i_sb; 390 struct super_block * sb = alloc_inode->i_sb;
391 391
392 mlog_entry_void();
393
394 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) { 392 if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
395 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != " 393 ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
396 "b_blocknr (%llu)", 394 "b_blocknr (%llu)",
@@ -436,7 +434,8 @@ static int ocfs2_block_group_fill(handle_t *handle,
436 * allocation time. */ 434 * allocation time. */
437 435
438bail: 436bail:
439 mlog_exit(status); 437 if (status)
438 mlog_errno(status);
440 return status; 439 return status;
441} 440}
442 441
@@ -477,8 +476,8 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
477 476
478 /* setup the group */ 477 /* setup the group */
479 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 478 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
480 mlog(0, "new descriptor, record %u, at block %llu\n", 479 trace_ocfs2_block_group_alloc_contig(
481 alloc_rec, (unsigned long long)bg_blkno); 480 (unsigned long long)bg_blkno, alloc_rec);
482 481
483 bg_bh = sb_getblk(osb->sb, bg_blkno); 482 bg_bh = sb_getblk(osb->sb, bg_blkno);
484 if (!bg_bh) { 483 if (!bg_bh) {
@@ -657,8 +656,8 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
657 656
658 /* setup the group */ 657 /* setup the group */
659 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off); 658 bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
660 mlog(0, "new descriptor, record %u, at block %llu\n", 659 trace_ocfs2_block_group_alloc_discontig(
661 alloc_rec, (unsigned long long)bg_blkno); 660 (unsigned long long)bg_blkno, alloc_rec);
662 661
663 bg_bh = sb_getblk(osb->sb, bg_blkno); 662 bg_bh = sb_getblk(osb->sb, bg_blkno);
664 if (!bg_bh) { 663 if (!bg_bh) {
@@ -707,8 +706,6 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
707 706
708 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode)); 707 BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
709 708
710 mlog_entry_void();
711
712 cl = &fe->id2.i_chain; 709 cl = &fe->id2.i_chain;
713 status = ocfs2_reserve_clusters_with_limit(osb, 710 status = ocfs2_reserve_clusters_with_limit(osb,
714 le16_to_cpu(cl->cl_cpg), 711 le16_to_cpu(cl->cl_cpg),
@@ -730,8 +727,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
730 } 727 }
731 728
732 if (last_alloc_group && *last_alloc_group != 0) { 729 if (last_alloc_group && *last_alloc_group != 0) {
733 mlog(0, "use old allocation group %llu for block group alloc\n", 730 trace_ocfs2_block_group_alloc(
734 (unsigned long long)*last_alloc_group); 731 (unsigned long long)*last_alloc_group);
735 ac->ac_last_group = *last_alloc_group; 732 ac->ac_last_group = *last_alloc_group;
736 } 733 }
737 734
@@ -796,7 +793,8 @@ bail:
796 793
797 brelse(bg_bh); 794 brelse(bg_bh);
798 795
799 mlog_exit(status); 796 if (status)
797 mlog_errno(status);
800 return status; 798 return status;
801} 799}
802 800
@@ -814,8 +812,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
814 struct ocfs2_dinode *fe; 812 struct ocfs2_dinode *fe;
815 u32 free_bits; 813 u32 free_bits;
816 814
817 mlog_entry_void();
818
819 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot); 815 alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
820 if (!alloc_inode) { 816 if (!alloc_inode) {
821 mlog_errno(-EINVAL); 817 mlog_errno(-EINVAL);
@@ -855,16 +851,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
855 if (bits_wanted > free_bits) { 851 if (bits_wanted > free_bits) {
856 /* cluster bitmap never grows */ 852 /* cluster bitmap never grows */
857 if (ocfs2_is_cluster_bitmap(alloc_inode)) { 853 if (ocfs2_is_cluster_bitmap(alloc_inode)) {
858 mlog(0, "Disk Full: wanted=%u, free_bits=%u\n", 854 trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
859 bits_wanted, free_bits); 855 free_bits);
860 status = -ENOSPC; 856 status = -ENOSPC;
861 goto bail; 857 goto bail;
862 } 858 }
863 859
864 if (!(flags & ALLOC_NEW_GROUP)) { 860 if (!(flags & ALLOC_NEW_GROUP)) {
865 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " 861 trace_ocfs2_reserve_suballoc_bits_no_new_group(
866 "and we don't alloc a new group for it.\n", 862 slot, bits_wanted, free_bits);
867 slot, bits_wanted, free_bits);
868 status = -ENOSPC; 863 status = -ENOSPC;
869 goto bail; 864 goto bail;
870 } 865 }
@@ -890,7 +885,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
890bail: 885bail:
891 brelse(bh); 886 brelse(bh);
892 887
893 mlog_exit(status); 888 if (status)
889 mlog_errno(status);
894 return status; 890 return status;
895} 891}
896 892
@@ -1052,7 +1048,8 @@ bail:
1052 *ac = NULL; 1048 *ac = NULL;
1053 } 1049 }
1054 1050
1055 mlog_exit(status); 1051 if (status)
1052 mlog_errno(status);
1056 return status; 1053 return status;
1057} 1054}
1058 1055
@@ -1119,8 +1116,8 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
1119 spin_lock(&osb->osb_lock); 1116 spin_lock(&osb->osb_lock);
1120 osb->osb_inode_alloc_group = alloc_group; 1117 osb->osb_inode_alloc_group = alloc_group;
1121 spin_unlock(&osb->osb_lock); 1118 spin_unlock(&osb->osb_lock);
1122 mlog(0, "after reservation, new allocation group is " 1119 trace_ocfs2_reserve_new_inode_new_group(
1123 "%llu\n", (unsigned long long)alloc_group); 1120 (unsigned long long)alloc_group);
1124 1121
1125 /* 1122 /*
1126 * Some inodes must be freed by us, so try to allocate 1123 * Some inodes must be freed by us, so try to allocate
@@ -1152,7 +1149,8 @@ bail:
1152 *ac = NULL; 1149 *ac = NULL;
1153 } 1150 }
1154 1151
1155 mlog_exit(status); 1152 if (status)
1153 mlog_errno(status);
1156 return status; 1154 return status;
1157} 1155}
1158 1156
@@ -1189,8 +1187,6 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
1189{ 1187{
1190 int status; 1188 int status;
1191 1189
1192 mlog_entry_void();
1193
1194 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 1190 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
1195 if (!(*ac)) { 1191 if (!(*ac)) {
1196 status = -ENOMEM; 1192 status = -ENOMEM;
@@ -1229,7 +1225,8 @@ bail:
1229 *ac = NULL; 1225 *ac = NULL;
1230 } 1226 }
1231 1227
1232 mlog_exit(status); 1228 if (status)
1229 mlog_errno(status);
1233 return status; 1230 return status;
1234} 1231}
1235 1232
@@ -1357,15 +1354,12 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1357 void *bitmap = bg->bg_bitmap; 1354 void *bitmap = bg->bg_bitmap;
1358 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 1355 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1359 1356
1360 mlog_entry_void();
1361
1362 /* All callers get the descriptor via 1357 /* All callers get the descriptor via
1363 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 1358 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
1364 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1359 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1365 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 1360 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
1366 1361
1367 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 1362 trace_ocfs2_block_group_set_bits(bit_off, num_bits);
1368 num_bits);
1369 1363
1370 if (ocfs2_is_cluster_bitmap(alloc_inode)) 1364 if (ocfs2_is_cluster_bitmap(alloc_inode))
1371 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 1365 journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
@@ -1394,7 +1388,8 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
1394 ocfs2_journal_dirty(handle, group_bh); 1388 ocfs2_journal_dirty(handle, group_bh);
1395 1389
1396bail: 1390bail:
1397 mlog_exit(status); 1391 if (status)
1392 mlog_errno(status);
1398 return status; 1393 return status;
1399} 1394}
1400 1395
@@ -1437,10 +1432,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
1437 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 1432 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1438 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); 1433 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1439 1434
1440 mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", 1435 trace_ocfs2_relink_block_group(
1441 (unsigned long long)le64_to_cpu(fe->i_blkno), chain, 1436 (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1442 (unsigned long long)le64_to_cpu(bg->bg_blkno), 1437 (unsigned long long)le64_to_cpu(bg->bg_blkno),
1443 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno)); 1438 (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1444 1439
1445 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno); 1440 fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1446 bg_ptr = le64_to_cpu(bg->bg_next_group); 1441 bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1484,7 +1479,8 @@ out_rollback:
1484 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); 1479 prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1485 } 1480 }
1486 1481
1487 mlog_exit(status); 1482 if (status)
1483 mlog_errno(status);
1488 return status; 1484 return status;
1489} 1485}
1490 1486
@@ -1515,7 +1511,7 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1515 max_bits = le16_to_cpu(gd->bg_bits); 1511 max_bits = le16_to_cpu(gd->bg_bits);
1516 1512
1517 /* Tail groups in cluster bitmaps which aren't cpg 1513 /* Tail groups in cluster bitmaps which aren't cpg
1518 * aligned are prone to partial extention by a failed 1514 * aligned are prone to partial extension by a failed
1519 * fs resize. If the file system resize never got to 1515 * fs resize. If the file system resize never got to
1520 * update the dinode cluster count, then we don't want 1516 * update the dinode cluster count, then we don't want
1521 * to trust any clusters past it, regardless of what 1517 * to trust any clusters past it, regardless of what
@@ -1525,10 +1521,10 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1525 if ((gd_cluster_off + max_bits) > 1521 if ((gd_cluster_off + max_bits) >
1526 OCFS2_I(inode)->ip_clusters) { 1522 OCFS2_I(inode)->ip_clusters) {
1527 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off; 1523 max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1528 mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n", 1524 trace_ocfs2_cluster_group_search_wrong_max_bits(
1529 (unsigned long long)le64_to_cpu(gd->bg_blkno), 1525 (unsigned long long)le64_to_cpu(gd->bg_blkno),
1530 le16_to_cpu(gd->bg_bits), 1526 le16_to_cpu(gd->bg_bits),
1531 OCFS2_I(inode)->ip_clusters, max_bits); 1527 OCFS2_I(inode)->ip_clusters, max_bits);
1532 } 1528 }
1533 1529
1534 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), 1530 ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
@@ -1542,9 +1538,9 @@ static int ocfs2_cluster_group_search(struct inode *inode,
1542 gd_cluster_off + 1538 gd_cluster_off +
1543 res->sr_bit_offset + 1539 res->sr_bit_offset +
1544 res->sr_bits); 1540 res->sr_bits);
1545 mlog(0, "Checking %llu against %llu\n", 1541 trace_ocfs2_cluster_group_search_max_block(
1546 (unsigned long long)blkoff, 1542 (unsigned long long)blkoff,
1547 (unsigned long long)max_block); 1543 (unsigned long long)max_block);
1548 if (blkoff > max_block) 1544 if (blkoff > max_block)
1549 return -ENOSPC; 1545 return -ENOSPC;
1550 } 1546 }
@@ -1588,9 +1584,9 @@ static int ocfs2_block_group_search(struct inode *inode,
1588 if (!ret && max_block) { 1584 if (!ret && max_block) {
1589 blkoff = le64_to_cpu(bg->bg_blkno) + 1585 blkoff = le64_to_cpu(bg->bg_blkno) +
1590 res->sr_bit_offset + res->sr_bits; 1586 res->sr_bit_offset + res->sr_bits;
1591 mlog(0, "Checking %llu against %llu\n", 1587 trace_ocfs2_block_group_search_max_block(
1592 (unsigned long long)blkoff, 1588 (unsigned long long)blkoff,
1593 (unsigned long long)max_block); 1589 (unsigned long long)max_block);
1594 if (blkoff > max_block) 1590 if (blkoff > max_block)
1595 ret = -ENOSPC; 1591 ret = -ENOSPC;
1596 } 1592 }
@@ -1756,9 +1752,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1756 struct ocfs2_group_desc *bg; 1752 struct ocfs2_group_desc *bg;
1757 1753
1758 chain = ac->ac_chain; 1754 chain = ac->ac_chain;
1759 mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n", 1755 trace_ocfs2_search_chain_begin(
1760 bits_wanted, chain, 1756 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
1761 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); 1757 bits_wanted, chain);
1762 1758
1763 status = ocfs2_read_group_descriptor(alloc_inode, fe, 1759 status = ocfs2_read_group_descriptor(alloc_inode, fe,
1764 le64_to_cpu(cl->cl_recs[chain].c_blkno), 1760 le64_to_cpu(cl->cl_recs[chain].c_blkno),
@@ -1799,8 +1795,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1799 goto bail; 1795 goto bail;
1800 } 1796 }
1801 1797
1802 mlog(0, "alloc succeeds: we give %u bits from block group %llu\n", 1798 trace_ocfs2_search_chain_succ(
1803 res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno)); 1799 (unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
1804 1800
1805 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno); 1801 res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
1806 1802
@@ -1861,8 +1857,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1861 goto bail; 1857 goto bail;
1862 } 1858 }
1863 1859
1864 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 1860 trace_ocfs2_search_chain_end(
1865 (unsigned long long)le64_to_cpu(fe->i_blkno)); 1861 (unsigned long long)le64_to_cpu(fe->i_blkno),
1862 res->sr_bits);
1866 1863
1867out_loc_only: 1864out_loc_only:
1868 *bits_left = le16_to_cpu(bg->bg_free_bits_count); 1865 *bits_left = le16_to_cpu(bg->bg_free_bits_count);
@@ -1870,7 +1867,8 @@ bail:
1870 brelse(group_bh); 1867 brelse(group_bh);
1871 brelse(prev_group_bh); 1868 brelse(prev_group_bh);
1872 1869
1873 mlog_exit(status); 1870 if (status)
1871 mlog_errno(status);
1874 return status; 1872 return status;
1875} 1873}
1876 1874
@@ -1888,8 +1886,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1888 struct ocfs2_chain_list *cl; 1886 struct ocfs2_chain_list *cl;
1889 struct ocfs2_dinode *fe; 1887 struct ocfs2_dinode *fe;
1890 1888
1891 mlog_entry_void();
1892
1893 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 1889 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1894 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given)); 1890 BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1895 BUG_ON(!ac->ac_bh); 1891 BUG_ON(!ac->ac_bh);
@@ -1945,8 +1941,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
1945 goto bail; 1941 goto bail;
1946 } 1942 }
1947 1943
1948 mlog(0, "Search of victim chain %u came up with nothing, " 1944 trace_ocfs2_claim_suballoc_bits(victim);
1949 "trying all chains now.\n", victim);
1950 1945
1951 /* If we didn't pick a good victim, then just default to 1946 /* If we didn't pick a good victim, then just default to
1952 * searching each chain in order. Don't allow chain relinking 1947 * searching each chain in order. Don't allow chain relinking
@@ -1984,7 +1979,8 @@ set_hint:
1984 } 1979 }
1985 1980
1986bail: 1981bail:
1987 mlog_exit(status); 1982 if (status)
1983 mlog_errno(status);
1988 return status; 1984 return status;
1989} 1985}
1990 1986
@@ -2021,7 +2017,8 @@ int ocfs2_claim_metadata(handle_t *handle,
2021 *num_bits = res.sr_bits; 2017 *num_bits = res.sr_bits;
2022 status = 0; 2018 status = 0;
2023bail: 2019bail:
2024 mlog_exit(status); 2020 if (status)
2021 mlog_errno(status);
2025 return status; 2022 return status;
2026} 2023}
2027 2024
@@ -2172,8 +2169,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
2172 goto out; 2169 goto out;
2173 } 2170 }
2174 2171
2175 mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits, 2172 trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
2176 (unsigned long long)di_blkno); 2173 res->sr_bits);
2177 2174
2178 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs); 2175 atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
2179 2176
@@ -2201,8 +2198,6 @@ int ocfs2_claim_new_inode(handle_t *handle,
2201 int status; 2198 int status;
2202 struct ocfs2_suballoc_result res; 2199 struct ocfs2_suballoc_result res;
2203 2200
2204 mlog_entry_void();
2205
2206 BUG_ON(!ac); 2201 BUG_ON(!ac);
2207 BUG_ON(ac->ac_bits_given != 0); 2202 BUG_ON(ac->ac_bits_given != 0);
2208 BUG_ON(ac->ac_bits_wanted != 1); 2203 BUG_ON(ac->ac_bits_wanted != 1);
@@ -2230,7 +2225,8 @@ int ocfs2_claim_new_inode(handle_t *handle,
2230 ocfs2_save_inode_ac_group(dir, ac); 2225 ocfs2_save_inode_ac_group(dir, ac);
2231 status = 0; 2226 status = 0;
2232bail: 2227bail:
2233 mlog_exit(status); 2228 if (status)
2229 mlog_errno(status);
2234 return status; 2230 return status;
2235} 2231}
2236 2232
@@ -2307,8 +2303,6 @@ int __ocfs2_claim_clusters(handle_t *handle,
2307 struct ocfs2_suballoc_result res = { .sr_blkno = 0, }; 2303 struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
2308 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb); 2304 struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
2309 2305
2310 mlog_entry_void();
2311
2312 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); 2306 BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
2313 2307
2314 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL 2308 BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
@@ -2363,7 +2357,8 @@ int __ocfs2_claim_clusters(handle_t *handle,
2363 ac->ac_bits_given += *num_clusters; 2357 ac->ac_bits_given += *num_clusters;
2364 2358
2365bail: 2359bail:
2366 mlog_exit(status); 2360 if (status)
2361 mlog_errno(status);
2367 return status; 2362 return status;
2368} 2363}
2369 2364
@@ -2392,13 +2387,11 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
2392 unsigned int tmp; 2387 unsigned int tmp;
2393 struct ocfs2_group_desc *undo_bg = NULL; 2388 struct ocfs2_group_desc *undo_bg = NULL;
2394 2389
2395 mlog_entry_void();
2396
2397 /* The caller got this descriptor from 2390 /* The caller got this descriptor from
2398 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 2391 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
2399 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 2392 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
2400 2393
2401 mlog(0, "off = %u, num = %u\n", bit_off, num_bits); 2394 trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
2402 2395
2403 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); 2396 BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
2404 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), 2397 status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
@@ -2463,19 +2456,18 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2463 struct buffer_head *group_bh = NULL; 2456 struct buffer_head *group_bh = NULL;
2464 struct ocfs2_group_desc *group; 2457 struct ocfs2_group_desc *group;
2465 2458
2466 mlog_entry_void();
2467
2468 /* The alloc_bh comes from ocfs2_free_dinode() or 2459 /* The alloc_bh comes from ocfs2_free_dinode() or
2469 * ocfs2_free_clusters(). The callers have all locked the 2460 * ocfs2_free_clusters(). The callers have all locked the
2470 * allocator and gotten alloc_bh from the lock call. This 2461 * allocator and gotten alloc_bh from the lock call. This
2471 * validates the dinode buffer. Any corruption that has happended 2462 * validates the dinode buffer. Any corruption that has happened
2472 * is a code bug. */ 2463 * is a code bug. */
2473 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); 2464 BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
2474 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); 2465 BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
2475 2466
2476 mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", 2467 trace_ocfs2_free_suballoc_bits(
2477 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, 2468 (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
2478 (unsigned long long)bg_blkno, start_bit); 2469 (unsigned long long)bg_blkno,
2470 start_bit, count);
2479 2471
2480 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, 2472 status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
2481 &group_bh); 2473 &group_bh);
@@ -2511,7 +2503,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
2511bail: 2503bail:
2512 brelse(group_bh); 2504 brelse(group_bh);
2513 2505
2514 mlog_exit(status); 2506 if (status)
2507 mlog_errno(status);
2515 return status; 2508 return status;
2516} 2509}
2517 2510
@@ -2556,11 +2549,8 @@ static int _ocfs2_free_clusters(handle_t *handle,
2556 2549
2557 /* You can't ever have a contiguous set of clusters 2550 /* You can't ever have a contiguous set of clusters
2558 * bigger than a block group bitmap so we never have to worry 2551 * bigger than a block group bitmap so we never have to worry
2559 * about looping on them. */ 2552 * about looping on them.
2560 2553 * This is expensive. We can safely remove once this stuff has
2561 mlog_entry_void();
2562
2563 /* This is expensive. We can safely remove once this stuff has
2564 * gotten tested really well. */ 2554 * gotten tested really well. */
2565 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); 2555 BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
2566 2556
@@ -2569,10 +2559,9 @@ static int _ocfs2_free_clusters(handle_t *handle,
2569 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, 2559 ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
2570 &bg_start_bit); 2560 &bg_start_bit);
2571 2561
2572 mlog(0, "want to free %u clusters starting at block %llu\n", 2562 trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
2573 num_clusters, (unsigned long long)start_blk); 2563 (unsigned long long)start_blk,
2574 mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", 2564 bg_start_bit, num_clusters);
2575 (unsigned long long)bg_blkno, bg_start_bit);
2576 2565
2577 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, 2566 status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
2578 bg_start_bit, bg_blkno, 2567 bg_start_bit, bg_blkno,
@@ -2586,7 +2575,8 @@ static int _ocfs2_free_clusters(handle_t *handle,
2586 num_clusters); 2575 num_clusters);
2587 2576
2588out: 2577out:
2589 mlog_exit(status); 2578 if (status)
2579 mlog_errno(status);
2590 return status; 2580 return status;
2591} 2581}
2592 2582
@@ -2756,7 +2746,7 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2756 struct buffer_head *inode_bh = NULL; 2746 struct buffer_head *inode_bh = NULL;
2757 struct ocfs2_dinode *inode_fe; 2747 struct ocfs2_dinode *inode_fe;
2758 2748
2759 mlog_entry("blkno: %llu\n", (unsigned long long)blkno); 2749 trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
2760 2750
2761 /* dirty read disk */ 2751 /* dirty read disk */
2762 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh); 2752 status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2793,7 +2783,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
2793bail: 2783bail:
2794 brelse(inode_bh); 2784 brelse(inode_bh);
2795 2785
2796 mlog_exit(status); 2786 if (status)
2787 mlog_errno(status);
2797 return status; 2788 return status;
2798} 2789}
2799 2790
@@ -2816,8 +2807,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2816 u64 bg_blkno; 2807 u64 bg_blkno;
2817 int status; 2808 int status;
2818 2809
2819 mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno, 2810 trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
2820 (unsigned int)bit); 2811 (unsigned int)bit);
2821 2812
2822 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data; 2813 alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
2823 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) { 2814 if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
@@ -2844,7 +2835,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
2844bail: 2835bail:
2845 brelse(group_bh); 2836 brelse(group_bh);
2846 2837
2847 mlog_exit(status); 2838 if (status)
2839 mlog_errno(status);
2848 return status; 2840 return status;
2849} 2841}
2850 2842
@@ -2869,7 +2861,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2869 struct inode *inode_alloc_inode; 2861 struct inode *inode_alloc_inode;
2870 struct buffer_head *alloc_bh = NULL; 2862 struct buffer_head *alloc_bh = NULL;
2871 2863
2872 mlog_entry("blkno: %llu", (unsigned long long)blkno); 2864 trace_ocfs2_test_inode_bit((unsigned long long)blkno);
2873 2865
2874 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot, 2866 status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
2875 &group_blkno, &suballoc_bit); 2867 &group_blkno, &suballoc_bit);
@@ -2910,6 +2902,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
2910 iput(inode_alloc_inode); 2902 iput(inode_alloc_inode);
2911 brelse(alloc_bh); 2903 brelse(alloc_bh);
2912bail: 2904bail:
2913 mlog_exit(status); 2905 if (status)
2906 mlog_errno(status);
2914 return status; 2907 return status;
2915} 2908}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..5a521c748859 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,7 +42,9 @@
42#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/quotaops.h> 43#include <linux/quotaops.h>
44 44
45#define MLOG_MASK_PREFIX ML_SUPER 45#define CREATE_TRACE_POINTS
46#include "ocfs2_trace.h"
47
46#include <cluster/masklog.h> 48#include <cluster/masklog.h>
47 49
48#include "ocfs2.h" 50#include "ocfs2.h"
@@ -76,7 +78,7 @@ static struct kmem_cache *ocfs2_inode_cachep = NULL;
76struct kmem_cache *ocfs2_dquot_cachep; 78struct kmem_cache *ocfs2_dquot_cachep;
77struct kmem_cache *ocfs2_qf_chunk_cachep; 79struct kmem_cache *ocfs2_qf_chunk_cachep;
78 80
79/* OCFS2 needs to schedule several differnt types of work which 81/* OCFS2 needs to schedule several different types of work which
80 * require cluster locking, disk I/O, recovery waits, etc. Since these 82 * require cluster locking, disk I/O, recovery waits, etc. Since these
81 * types of work tend to be heavy we avoid using the kernel events 83 * types of work tend to be heavy we avoid using the kernel events
82 * workqueue and schedule on our own. */ 84 * workqueue and schedule on our own. */
@@ -441,8 +443,6 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
441 int status = 0; 443 int status = 0;
442 int i; 444 int i;
443 445
444 mlog_entry_void();
445
446 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0); 446 new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
447 if (IS_ERR(new)) { 447 if (IS_ERR(new)) {
448 status = PTR_ERR(new); 448 status = PTR_ERR(new);
@@ -478,7 +478,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
478 } 478 }
479 479
480bail: 480bail:
481 mlog_exit(status); 481 if (status)
482 mlog_errno(status);
482 return status; 483 return status;
483} 484}
484 485
@@ -488,8 +489,6 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
488 int status = 0; 489 int status = 0;
489 int i; 490 int i;
490 491
491 mlog_entry_void();
492
493 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; 492 for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
494 i < NUM_SYSTEM_INODES; 493 i < NUM_SYSTEM_INODES;
495 i++) { 494 i++) {
@@ -508,7 +507,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
508 } 507 }
509 508
510bail: 509bail:
511 mlog_exit(status); 510 if (status)
511 mlog_errno(status);
512 return status; 512 return status;
513} 513}
514 514
@@ -517,8 +517,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
517 int i; 517 int i;
518 struct inode *inode; 518 struct inode *inode;
519 519
520 mlog_entry_void();
521
522 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) { 520 for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
523 inode = osb->global_system_inodes[i]; 521 inode = osb->global_system_inodes[i];
524 if (inode) { 522 if (inode) {
@@ -540,7 +538,7 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
540 } 538 }
541 539
542 if (!osb->local_system_inodes) 540 if (!osb->local_system_inodes)
543 goto out; 541 return;
544 542
545 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) { 543 for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
546 if (osb->local_system_inodes[i]) { 544 if (osb->local_system_inodes[i]) {
@@ -551,9 +549,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
551 549
552 kfree(osb->local_system_inodes); 550 kfree(osb->local_system_inodes);
553 osb->local_system_inodes = NULL; 551 osb->local_system_inodes = NULL;
554
555out:
556 mlog_exit(0);
557} 552}
558 553
559/* We're allocating fs objects, use GFP_NOFS */ 554/* We're allocating fs objects, use GFP_NOFS */
@@ -684,12 +679,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
684 } 679 }
685 680
686 if (*flags & MS_RDONLY) { 681 if (*flags & MS_RDONLY) {
687 mlog(0, "Going to ro mode.\n");
688 sb->s_flags |= MS_RDONLY; 682 sb->s_flags |= MS_RDONLY;
689 osb->osb_flags |= OCFS2_OSB_SOFT_RO; 683 osb->osb_flags |= OCFS2_OSB_SOFT_RO;
690 } else { 684 } else {
691 mlog(0, "Making ro filesystem writeable.\n");
692
693 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { 685 if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
694 mlog(ML_ERROR, "Cannot remount RDWR " 686 mlog(ML_ERROR, "Cannot remount RDWR "
695 "filesystem due to previous errors.\n"); 687 "filesystem due to previous errors.\n");
@@ -707,6 +699,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
707 sb->s_flags &= ~MS_RDONLY; 699 sb->s_flags &= ~MS_RDONLY;
708 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; 700 osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
709 } 701 }
702 trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
710unlock_osb: 703unlock_osb:
711 spin_unlock(&osb->osb_lock); 704 spin_unlock(&osb->osb_lock);
712 /* Enable quota accounting after remounting RW */ 705 /* Enable quota accounting after remounting RW */
@@ -1032,7 +1025,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1032 char nodestr[8]; 1025 char nodestr[8];
1033 struct ocfs2_blockcheck_stats stats; 1026 struct ocfs2_blockcheck_stats stats;
1034 1027
1035 mlog_entry("%p, %p, %i", sb, data, silent); 1028 trace_ocfs2_fill_super(sb, data, silent);
1036 1029
1037 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) { 1030 if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
1038 status = -EINVAL; 1031 status = -EINVAL;
@@ -1208,7 +1201,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1208 mlog_errno(status); 1201 mlog_errno(status);
1209 atomic_set(&osb->vol_state, VOLUME_DISABLED); 1202 atomic_set(&osb->vol_state, VOLUME_DISABLED);
1210 wake_up(&osb->osb_mount_event); 1203 wake_up(&osb->osb_mount_event);
1211 mlog_exit(status);
1212 return status; 1204 return status;
1213 } 1205 }
1214 } 1206 }
@@ -1222,7 +1214,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1222 /* Start this when the mount is almost sure of being successful */ 1214 /* Start this when the mount is almost sure of being successful */
1223 ocfs2_orphan_scan_start(osb); 1215 ocfs2_orphan_scan_start(osb);
1224 1216
1225 mlog_exit(status);
1226 return status; 1217 return status;
1227 1218
1228read_super_error: 1219read_super_error:
@@ -1237,7 +1228,8 @@ read_super_error:
1237 ocfs2_dismount_volume(sb, 1); 1228 ocfs2_dismount_volume(sb, 1);
1238 } 1229 }
1239 1230
1240 mlog_exit(status); 1231 if (status)
1232 mlog_errno(status);
1241 return status; 1233 return status;
1242} 1234}
1243 1235
@@ -1316,12 +1308,11 @@ static int ocfs2_parse_options(struct super_block *sb,
1316 struct mount_options *mopt, 1308 struct mount_options *mopt,
1317 int is_remount) 1309 int is_remount)
1318{ 1310{
1319 int status; 1311 int status, user_stack = 0;
1320 char *p; 1312 char *p;
1321 u32 tmp; 1313 u32 tmp;
1322 1314
1323 mlog_entry("remount: %d, options: \"%s\"\n", is_remount, 1315 trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
1324 options ? options : "(none)");
1325 1316
1326 mopt->commit_interval = 0; 1317 mopt->commit_interval = 0;
1327 mopt->mount_opt = OCFS2_MOUNT_NOINTR; 1318 mopt->mount_opt = OCFS2_MOUNT_NOINTR;
@@ -1459,6 +1450,15 @@ static int ocfs2_parse_options(struct super_block *sb,
1459 memcpy(mopt->cluster_stack, args[0].from, 1450 memcpy(mopt->cluster_stack, args[0].from,
1460 OCFS2_STACK_LABEL_LEN); 1451 OCFS2_STACK_LABEL_LEN);
1461 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; 1452 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1453 /*
1454 * Open code the memcmp here as we don't have
1455 * an osb to pass to
1456 * ocfs2_userspace_stack().
1457 */
1458 if (memcmp(mopt->cluster_stack,
1459 OCFS2_CLASSIC_CLUSTER_STACK,
1460 OCFS2_STACK_LABEL_LEN))
1461 user_stack = 1;
1462 break; 1462 break;
1463 case Opt_inode64: 1463 case Opt_inode64:
1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64; 1464 mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,19 +1514,21 @@ static int ocfs2_parse_options(struct super_block *sb,
1514 } 1514 }
1515 } 1515 }
1516 1516
1517 /* Ensure only one heartbeat mode */ 1517 if (user_stack == 0) {
1518 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL | 1518 /* Ensure only one heartbeat mode */
1519 OCFS2_MOUNT_HB_NONE); 1519 tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
1520 if (hweight32(tmp) != 1) { 1520 OCFS2_MOUNT_HB_GLOBAL |
1521 mlog(ML_ERROR, "Invalid heartbeat mount options\n"); 1521 OCFS2_MOUNT_HB_NONE);
1522 status = 0; 1522 if (hweight32(tmp) != 1) {
1523 goto bail; 1523 mlog(ML_ERROR, "Invalid heartbeat mount options\n");
1524 status = 0;
1525 goto bail;
1526 }
1524 } 1527 }
1525 1528
1526 status = 1; 1529 status = 1;
1527 1530
1528bail: 1531bail:
1529 mlog_exit(status);
1530 return status; 1532 return status;
1531} 1533}
1532 1534
@@ -1617,8 +1619,6 @@ static int __init ocfs2_init(void)
1617{ 1619{
1618 int status; 1620 int status;
1619 1621
1620 mlog_entry_void();
1621
1622 ocfs2_print_version(); 1622 ocfs2_print_version();
1623 1623
1624 status = init_ocfs2_uptodate_cache(); 1624 status = init_ocfs2_uptodate_cache();
@@ -1645,22 +1645,16 @@ static int __init ocfs2_init(void)
1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1645 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
1646 } 1646 }
1647 1647
1648 status = ocfs2_quota_setup();
1649 if (status)
1650 goto leave;
1651
1652 ocfs2_set_locking_protocol(); 1648 ocfs2_set_locking_protocol();
1653 1649
1654 status = register_quota_format(&ocfs2_quota_format); 1650 status = register_quota_format(&ocfs2_quota_format);
1655leave: 1651leave:
1656 if (status < 0) { 1652 if (status < 0) {
1657 ocfs2_quota_shutdown();
1658 ocfs2_free_mem_caches(); 1653 ocfs2_free_mem_caches();
1659 exit_ocfs2_uptodate_cache(); 1654 exit_ocfs2_uptodate_cache();
1655 mlog_errno(status);
1660 } 1656 }
1661 1657
1662 mlog_exit(status);
1663
1664 if (status >= 0) { 1658 if (status >= 0) {
1665 return register_filesystem(&ocfs2_fs_type); 1659 return register_filesystem(&ocfs2_fs_type);
1666 } else 1660 } else
@@ -1669,10 +1663,6 @@ leave:
1669 1663
1670static void __exit ocfs2_exit(void) 1664static void __exit ocfs2_exit(void)
1671{ 1665{
1672 mlog_entry_void();
1673
1674 ocfs2_quota_shutdown();
1675
1676 if (ocfs2_wq) { 1666 if (ocfs2_wq) {
1677 flush_workqueue(ocfs2_wq); 1667 flush_workqueue(ocfs2_wq);
1678 destroy_workqueue(ocfs2_wq); 1668 destroy_workqueue(ocfs2_wq);
@@ -1687,18 +1677,14 @@ static void __exit ocfs2_exit(void)
1687 unregister_filesystem(&ocfs2_fs_type); 1677 unregister_filesystem(&ocfs2_fs_type);
1688 1678
1689 exit_ocfs2_uptodate_cache(); 1679 exit_ocfs2_uptodate_cache();
1690
1691 mlog_exit_void();
1692} 1680}
1693 1681
1694static void ocfs2_put_super(struct super_block *sb) 1682static void ocfs2_put_super(struct super_block *sb)
1695{ 1683{
1696 mlog_entry("(0x%p)\n", sb); 1684 trace_ocfs2_put_super(sb);
1697 1685
1698 ocfs2_sync_blockdev(sb); 1686 ocfs2_sync_blockdev(sb);
1699 ocfs2_dismount_volume(sb, 0); 1687 ocfs2_dismount_volume(sb, 0);
1700
1701 mlog_exit_void();
1702} 1688}
1703 1689
1704static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) 1690static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1710,7 +1696,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
1710 struct buffer_head *bh = NULL; 1696 struct buffer_head *bh = NULL;
1711 struct inode *inode = NULL; 1697 struct inode *inode = NULL;
1712 1698
1713 mlog_entry("(%p, %p)\n", dentry->d_sb, buf); 1699 trace_ocfs2_statfs(dentry->d_sb, buf);
1714 1700
1715 osb = OCFS2_SB(dentry->d_sb); 1701 osb = OCFS2_SB(dentry->d_sb);
1716 1702
@@ -1757,7 +1743,8 @@ bail:
1757 if (inode) 1743 if (inode)
1758 iput(inode); 1744 iput(inode);
1759 1745
1760 mlog_exit(status); 1746 if (status)
1747 mlog_errno(status);
1761 1748
1762 return status; 1749 return status;
1763} 1750}
@@ -1877,8 +1864,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1877 int unlock_super = 0; 1864 int unlock_super = 0;
1878 struct ocfs2_super *osb = OCFS2_SB(sb); 1865 struct ocfs2_super *osb = OCFS2_SB(sb);
1879 1866
1880 mlog_entry_void();
1881
1882 if (ocfs2_is_hard_readonly(osb)) 1867 if (ocfs2_is_hard_readonly(osb))
1883 goto leave; 1868 goto leave;
1884 1869
@@ -1923,7 +1908,6 @@ leave:
1923 if (unlock_super) 1908 if (unlock_super)
1924 ocfs2_super_unlock(osb, 1); 1909 ocfs2_super_unlock(osb, 1);
1925 1910
1926 mlog_exit(status);
1927 return status; 1911 return status;
1928} 1912}
1929 1913
@@ -1933,7 +1917,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1933 struct ocfs2_super *osb = NULL; 1917 struct ocfs2_super *osb = NULL;
1934 char nodestr[8]; 1918 char nodestr[8];
1935 1919
1936 mlog_entry("(0x%p)\n", sb); 1920 trace_ocfs2_dismount_volume(sb);
1937 1921
1938 BUG_ON(!sb); 1922 BUG_ON(!sb);
1939 osb = OCFS2_SB(sb); 1923 osb = OCFS2_SB(sb);
@@ -2085,8 +2069,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2085 struct ocfs2_super *osb; 2069 struct ocfs2_super *osb;
2086 u64 total_blocks; 2070 u64 total_blocks;
2087 2071
2088 mlog_entry_void();
2089
2090 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL); 2072 osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
2091 if (!osb) { 2073 if (!osb) {
2092 status = -ENOMEM; 2074 status = -ENOMEM;
@@ -2150,7 +2132,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2150 status = -EINVAL; 2132 status = -EINVAL;
2151 goto bail; 2133 goto bail;
2152 } 2134 }
2153 mlog(0, "max_slots for this device: %u\n", osb->max_slots);
2154 2135
2155 ocfs2_orphan_scan_init(osb); 2136 ocfs2_orphan_scan_init(osb);
2156 2137
@@ -2289,7 +2270,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2289 osb->s_clustersize_bits = 2270 osb->s_clustersize_bits =
2290 le32_to_cpu(di->id2.i_super.s_clustersize_bits); 2271 le32_to_cpu(di->id2.i_super.s_clustersize_bits);
2291 osb->s_clustersize = 1 << osb->s_clustersize_bits; 2272 osb->s_clustersize = 1 << osb->s_clustersize_bits;
2292 mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
2293 2273
2294 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE || 2274 if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
2295 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) { 2275 osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -2328,11 +2308,10 @@ static int ocfs2_initialize_super(struct super_block *sb,
2328 le64_to_cpu(di->id2.i_super.s_first_cluster_group); 2308 le64_to_cpu(di->id2.i_super.s_first_cluster_group);
2329 osb->fs_generation = le32_to_cpu(di->i_fs_generation); 2309 osb->fs_generation = le32_to_cpu(di->i_fs_generation);
2330 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2310 osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2331 mlog(0, "vol_label: %s\n", osb->vol_label); 2311 trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
2332 mlog(0, "uuid: %s\n", osb->uuid_str); 2312 (unsigned long long)osb->root_blkno,
2333 mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", 2313 (unsigned long long)osb->system_dir_blkno,
2334 (unsigned long long)osb->root_blkno, 2314 osb->s_clustersize_bits);
2335 (unsigned long long)osb->system_dir_blkno);
2336 2315
2337 osb->osb_dlm_debug = ocfs2_new_dlm_debug(); 2316 osb->osb_dlm_debug = ocfs2_new_dlm_debug();
2338 if (!osb->osb_dlm_debug) { 2317 if (!osb->osb_dlm_debug) {
@@ -2375,7 +2354,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2375 } 2354 }
2376 2355
2377bail: 2356bail:
2378 mlog_exit(status);
2379 return status; 2357 return status;
2380} 2358}
2381 2359
@@ -2391,8 +2369,6 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2391{ 2369{
2392 int status = -EAGAIN; 2370 int status = -EAGAIN;
2393 2371
2394 mlog_entry_void();
2395
2396 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, 2372 if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
2397 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { 2373 strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
2398 /* We have to do a raw check of the feature here */ 2374 /* We have to do a raw check of the feature here */
@@ -2447,7 +2423,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
2447 } 2423 }
2448 2424
2449out: 2425out:
2450 mlog_exit(status); 2426 if (status && status != -EAGAIN)
2427 mlog_errno(status);
2451 return status; 2428 return status;
2452} 2429}
2453 2430
@@ -2460,8 +2437,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2460 * recover 2437 * recover
2461 * ourselves. */ 2438 * ourselves. */
2462 2439
2463 mlog_entry_void();
2464
2465 /* Init our journal object. */ 2440 /* Init our journal object. */
2466 status = ocfs2_journal_init(osb->journal, &dirty); 2441 status = ocfs2_journal_init(osb->journal, &dirty);
2467 if (status < 0) { 2442 if (status < 0) {
@@ -2511,8 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2511 * ourselves as mounted. */ 2486 * ourselves as mounted. */
2512 } 2487 }
2513 2488
2514 mlog(0, "Journal loaded.\n");
2515
2516 status = ocfs2_load_local_alloc(osb); 2489 status = ocfs2_load_local_alloc(osb);
2517 if (status < 0) { 2490 if (status < 0) {
2518 mlog_errno(status); 2491 mlog_errno(status);
@@ -2544,7 +2517,8 @@ finally:
2544 if (local_alloc) 2517 if (local_alloc)
2545 kfree(local_alloc); 2518 kfree(local_alloc);
2546 2519
2547 mlog_exit(status); 2520 if (status)
2521 mlog_errno(status);
2548 return status; 2522 return status;
2549} 2523}
2550 2524
@@ -2556,8 +2530,6 @@ finally:
2556 */ 2530 */
2557static void ocfs2_delete_osb(struct ocfs2_super *osb) 2531static void ocfs2_delete_osb(struct ocfs2_super *osb)
2558{ 2532{
2559 mlog_entry_void();
2560
2561 /* This function assumes that the caller has the main osb resource */ 2533 /* This function assumes that the caller has the main osb resource */
2562 2534
2563 ocfs2_free_slot_info(osb); 2535 ocfs2_free_slot_info(osb);
@@ -2575,8 +2547,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
2575 kfree(osb->uuid_str); 2547 kfree(osb->uuid_str);
2576 ocfs2_put_dlm_debug(osb->osb_dlm_debug); 2548 ocfs2_put_dlm_debug(osb->osb_dlm_debug);
2577 memset(osb, 0, sizeof(struct ocfs2_super)); 2549 memset(osb, 0, sizeof(struct ocfs2_super));
2578
2579 mlog_exit_void();
2580} 2550}
2581 2551
2582/* Put OCFS2 into a readonly state, or (if the user specifies it), 2552/* Put OCFS2 into a readonly state, or (if the user specifies it),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 9975457c981f..5d22872e2bb3 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -40,7 +40,6 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/namei.h> 41#include <linux/namei.h>
42 42
43#define MLOG_MASK_PREFIX ML_NAMEI
44#include <cluster/masklog.h> 43#include <cluster/masklog.h>
45 44
46#include "ocfs2.h" 45#include "ocfs2.h"
@@ -62,8 +61,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
62 char *link = NULL; 61 char *link = NULL;
63 struct ocfs2_dinode *fe; 62 struct ocfs2_dinode *fe;
64 63
65 mlog_entry_void();
66
67 status = ocfs2_read_inode_block(inode, bh); 64 status = ocfs2_read_inode_block(inode, bh);
68 if (status < 0) { 65 if (status < 0) {
69 mlog_errno(status); 66 mlog_errno(status);
@@ -74,7 +71,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
74 fe = (struct ocfs2_dinode *) (*bh)->b_data; 71 fe = (struct ocfs2_dinode *) (*bh)->b_data;
75 link = (char *) fe->id2.i_symlink; 72 link = (char *) fe->id2.i_symlink;
76bail: 73bail:
77 mlog_exit(status);
78 74
79 return link; 75 return link;
80} 76}
@@ -88,8 +84,6 @@ static int ocfs2_readlink(struct dentry *dentry,
88 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
89 struct inode *inode = dentry->d_inode; 85 struct inode *inode = dentry->d_inode;
90 86
91 mlog_entry_void();
92
93 link = ocfs2_fast_symlink_getlink(inode, &bh); 87 link = ocfs2_fast_symlink_getlink(inode, &bh);
94 if (IS_ERR(link)) { 88 if (IS_ERR(link)) {
95 ret = PTR_ERR(link); 89 ret = PTR_ERR(link);
@@ -104,7 +98,8 @@ static int ocfs2_readlink(struct dentry *dentry,
104 98
105 brelse(bh); 99 brelse(bh);
106out: 100out:
107 mlog_exit(ret); 101 if (ret < 0)
102 mlog_errno(ret);
108 return ret; 103 return ret;
109} 104}
110 105
@@ -117,8 +112,6 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
117 struct inode *inode = dentry->d_inode; 112 struct inode *inode = dentry->d_inode;
118 struct buffer_head *bh = NULL; 113 struct buffer_head *bh = NULL;
119 114
120 mlog_entry_void();
121
122 BUG_ON(!ocfs2_inode_is_fast_symlink(inode)); 115 BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
123 target = ocfs2_fast_symlink_getlink(inode, &bh); 116 target = ocfs2_fast_symlink_getlink(inode, &bh);
124 if (IS_ERR(target)) { 117 if (IS_ERR(target)) {
@@ -142,7 +135,8 @@ bail:
142 nd_set_link(nd, status ? ERR_PTR(status) : link); 135 nd_set_link(nd, status ? ERR_PTR(status) : link);
143 brelse(bh); 136 brelse(bh);
144 137
145 mlog_exit(status); 138 if (status)
139 mlog_errno(status);
146 return NULL; 140 return NULL;
147} 141}
148 142
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 902efb23b6a6..3d635f4bbb20 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -27,7 +27,6 @@
27#include <linux/types.h> 27#include <linux/types.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29 29
30#define MLOG_MASK_PREFIX ML_INODE
31#include <cluster/masklog.h> 30#include <cluster/masklog.h>
32 31
33#include "ocfs2.h" 32#include "ocfs2.h"
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index a0a120e82b97..52eaf33d346f 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -54,14 +54,13 @@
54#include <linux/buffer_head.h> 54#include <linux/buffer_head.h>
55#include <linux/rbtree.h> 55#include <linux/rbtree.h>
56 56
57#define MLOG_MASK_PREFIX ML_UPTODATE
58
59#include <cluster/masklog.h> 57#include <cluster/masklog.h>
60 58
61#include "ocfs2.h" 59#include "ocfs2.h"
62 60
63#include "inode.h" 61#include "inode.h"
64#include "uptodate.h" 62#include "uptodate.h"
63#include "ocfs2_trace.h"
65 64
66struct ocfs2_meta_cache_item { 65struct ocfs2_meta_cache_item {
67 struct rb_node c_node; 66 struct rb_node c_node;
@@ -152,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
152 while ((node = rb_last(root)) != NULL) { 151 while ((node = rb_last(root)) != NULL) {
153 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node); 152 item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
154 153
155 mlog(0, "Purge item %llu\n", 154 trace_ocfs2_purge_copied_metadata_tree(
156 (unsigned long long) item->c_block); 155 (unsigned long long) item->c_block);
157 156
158 rb_erase(&item->c_node, root); 157 rb_erase(&item->c_node, root);
159 kmem_cache_free(ocfs2_uptodate_cachep, item); 158 kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -180,9 +179,9 @@ void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
180 tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE); 179 tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
181 to_purge = ci->ci_num_cached; 180 to_purge = ci->ci_num_cached;
182 181
183 mlog(0, "Purge %u %s items from Owner %llu\n", to_purge, 182 trace_ocfs2_metadata_cache_purge(
184 tree ? "array" : "tree", 183 (unsigned long long)ocfs2_metadata_cache_owner(ci),
185 (unsigned long long)ocfs2_metadata_cache_owner(ci)); 184 to_purge, tree);
186 185
187 /* If we're a tree, save off the root so that we can safely 186 /* If we're a tree, save off the root so that we can safely
188 * initialize the cache. We do the work to free tree members 187 * initialize the cache. We do the work to free tree members
@@ -249,10 +248,10 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
249 248
250 ocfs2_metadata_cache_lock(ci); 249 ocfs2_metadata_cache_lock(ci);
251 250
252 mlog(0, "Owner %llu, query block %llu (inline = %u)\n", 251 trace_ocfs2_buffer_cached_begin(
253 (unsigned long long)ocfs2_metadata_cache_owner(ci), 252 (unsigned long long)ocfs2_metadata_cache_owner(ci),
254 (unsigned long long) bh->b_blocknr, 253 (unsigned long long) bh->b_blocknr,
255 !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE)); 254 !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
256 255
257 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) 256 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
258 index = ocfs2_search_cache_array(ci, bh->b_blocknr); 257 index = ocfs2_search_cache_array(ci, bh->b_blocknr);
@@ -261,7 +260,7 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
261 260
262 ocfs2_metadata_cache_unlock(ci); 261 ocfs2_metadata_cache_unlock(ci);
263 262
264 mlog(0, "index = %d, item = %p\n", index, item); 263 trace_ocfs2_buffer_cached_end(index, item);
265 264
266 return (index != -1) || (item != NULL); 265 return (index != -1) || (item != NULL);
267} 266}
@@ -306,8 +305,9 @@ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
306{ 305{
307 BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY); 306 BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
308 307
309 mlog(0, "block %llu takes position %u\n", (unsigned long long) block, 308 trace_ocfs2_append_cache_array(
310 ci->ci_num_cached); 309 (unsigned long long)ocfs2_metadata_cache_owner(ci),
310 (unsigned long long)block, ci->ci_num_cached);
311 311
312 ci->ci_cache.ci_array[ci->ci_num_cached] = block; 312 ci->ci_cache.ci_array[ci->ci_num_cached] = block;
313 ci->ci_num_cached++; 313 ci->ci_num_cached++;
@@ -324,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
324 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node; 324 struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
325 struct ocfs2_meta_cache_item *tmp; 325 struct ocfs2_meta_cache_item *tmp;
326 326
327 mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block, 327 trace_ocfs2_insert_cache_tree(
328 ci->ci_num_cached); 328 (unsigned long long)ocfs2_metadata_cache_owner(ci),
329 (unsigned long long)block, ci->ci_num_cached);
329 330
330 while(*p) { 331 while(*p) {
331 parent = *p; 332 parent = *p;
@@ -389,9 +390,9 @@ static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
389 tree[i] = NULL; 390 tree[i] = NULL;
390 } 391 }
391 392
392 mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n", 393 trace_ocfs2_expand_cache(
393 (unsigned long long)ocfs2_metadata_cache_owner(ci), 394 (unsigned long long)ocfs2_metadata_cache_owner(ci),
394 ci->ci_flags, ci->ci_num_cached); 395 ci->ci_flags, ci->ci_num_cached);
395} 396}
396 397
397/* Slow path function - memory allocation is necessary. See the 398/* Slow path function - memory allocation is necessary. See the
@@ -405,9 +406,9 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
405 struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] = 406 struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
406 { NULL, }; 407 { NULL, };
407 408
408 mlog(0, "Owner %llu, block %llu, expand = %d\n", 409 trace_ocfs2_set_buffer_uptodate(
409 (unsigned long long)ocfs2_metadata_cache_owner(ci), 410 (unsigned long long)ocfs2_metadata_cache_owner(ci),
410 (unsigned long long)block, expand_tree); 411 (unsigned long long)block, expand_tree);
411 412
412 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); 413 new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
413 if (!new) { 414 if (!new) {
@@ -433,7 +434,6 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
433 434
434 ocfs2_metadata_cache_lock(ci); 435 ocfs2_metadata_cache_lock(ci);
435 if (ocfs2_insert_can_use_array(ci)) { 436 if (ocfs2_insert_can_use_array(ci)) {
436 mlog(0, "Someone cleared the tree underneath us\n");
437 /* Ok, items were removed from the cache in between 437 /* Ok, items were removed from the cache in between
438 * locks. Detect this and revert back to the fast path */ 438 * locks. Detect this and revert back to the fast path */
439 ocfs2_append_cache_array(ci, block); 439 ocfs2_append_cache_array(ci, block);
@@ -490,9 +490,9 @@ void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
490 if (ocfs2_buffer_cached(ci, bh)) 490 if (ocfs2_buffer_cached(ci, bh))
491 return; 491 return;
492 492
493 mlog(0, "Owner %llu, inserting block %llu\n", 493 trace_ocfs2_set_buffer_uptodate_begin(
494 (unsigned long long)ocfs2_metadata_cache_owner(ci), 494 (unsigned long long)ocfs2_metadata_cache_owner(ci),
495 (unsigned long long)bh->b_blocknr); 495 (unsigned long long)bh->b_blocknr);
496 496
497 /* No need to recheck under spinlock - insertion is guarded by 497 /* No need to recheck under spinlock - insertion is guarded by
498 * co_io_lock() */ 498 * co_io_lock() */
@@ -542,8 +542,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
542 BUG_ON(index >= ci->ci_num_cached); 542 BUG_ON(index >= ci->ci_num_cached);
543 BUG_ON(!ci->ci_num_cached); 543 BUG_ON(!ci->ci_num_cached);
544 544
545 mlog(0, "remove index %d (num_cached = %u\n", index, 545 trace_ocfs2_remove_metadata_array(
546 ci->ci_num_cached); 546 (unsigned long long)ocfs2_metadata_cache_owner(ci),
547 index, ci->ci_num_cached);
547 548
548 ci->ci_num_cached--; 549 ci->ci_num_cached--;
549 550
@@ -559,8 +560,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
559static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, 560static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
560 struct ocfs2_meta_cache_item *item) 561 struct ocfs2_meta_cache_item *item)
561{ 562{
562 mlog(0, "remove block %llu from tree\n", 563 trace_ocfs2_remove_metadata_tree(
563 (unsigned long long) item->c_block); 564 (unsigned long long)ocfs2_metadata_cache_owner(ci),
565 (unsigned long long)item->c_block);
564 566
565 rb_erase(&item->c_node, &ci->ci_cache.ci_tree); 567 rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
566 ci->ci_num_cached--; 568 ci->ci_num_cached--;
@@ -573,10 +575,10 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
573 struct ocfs2_meta_cache_item *item = NULL; 575 struct ocfs2_meta_cache_item *item = NULL;
574 576
575 ocfs2_metadata_cache_lock(ci); 577 ocfs2_metadata_cache_lock(ci);
576 mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n", 578 trace_ocfs2_remove_block_from_cache(
577 (unsigned long long)ocfs2_metadata_cache_owner(ci), 579 (unsigned long long)ocfs2_metadata_cache_owner(ci),
578 (unsigned long long) block, ci->ci_num_cached, 580 (unsigned long long) block, ci->ci_num_cached,
579 ci->ci_flags & OCFS2_CACHE_FL_INLINE); 581 ci->ci_flags);
580 582
581 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) { 583 if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
582 index = ocfs2_search_cache_array(ci, block); 584 index = ocfs2_search_cache_array(ci, block);
@@ -626,9 +628,6 @@ int __init init_ocfs2_uptodate_cache(void)
626 if (!ocfs2_uptodate_cachep) 628 if (!ocfs2_uptodate_cachep)
627 return -ENOMEM; 629 return -ENOMEM;
628 630
629 mlog(0, "%u inlined cache items per inode.\n",
630 OCFS2_CACHE_INFO_MAX_ARRAY);
631
632 return 0; 631 return 0;
633} 632}
634 633
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd43914641..81ecf9c0bf0a 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,7 +37,6 @@
37#include <linux/string.h> 37#include <linux/string.h>
38#include <linux/security.h> 38#include <linux/security.h>
39 39
40#define MLOG_MASK_PREFIX ML_XATTR
41#include <cluster/masklog.h> 40#include <cluster/masklog.h>
42 41
43#include "ocfs2.h" 42#include "ocfs2.h"
@@ -57,6 +56,7 @@
57#include "xattr.h" 56#include "xattr.h"
58#include "refcounttree.h" 57#include "refcounttree.h"
59#include "acl.h" 58#include "acl.h"
59#include "ocfs2_trace.h"
60 60
61struct ocfs2_xattr_def_value_root { 61struct ocfs2_xattr_def_value_root {
62 struct ocfs2_xattr_value_root xv; 62 struct ocfs2_xattr_value_root xv;
@@ -474,8 +474,7 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
474 struct ocfs2_xattr_block *xb = 474 struct ocfs2_xattr_block *xb =
475 (struct ocfs2_xattr_block *)bh->b_data; 475 (struct ocfs2_xattr_block *)bh->b_data;
476 476
477 mlog(0, "Validating xattr block %llu\n", 477 trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
478 (unsigned long long)bh->b_blocknr);
479 478
480 BUG_ON(!buffer_uptodate(bh)); 479 BUG_ON(!buffer_uptodate(bh));
481 480
@@ -715,11 +714,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
715 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); 714 u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
716 struct ocfs2_extent_tree et; 715 struct ocfs2_extent_tree et;
717 716
718 mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
719
720 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); 717 ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
721 718
722 while (clusters_to_add) { 719 while (clusters_to_add) {
720 trace_ocfs2_xattr_extend_allocation(clusters_to_add);
721
723 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, 722 status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
724 OCFS2_JOURNAL_ACCESS_WRITE); 723 OCFS2_JOURNAL_ACCESS_WRITE);
725 if (status < 0) { 724 if (status < 0) {
@@ -754,8 +753,6 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
754 */ 753 */
755 BUG_ON(why == RESTART_META); 754 BUG_ON(why == RESTART_META);
756 755
757 mlog(0, "restarting xattr value extension for %u"
758 " clusters,.\n", clusters_to_add);
759 credits = ocfs2_calc_extend_credits(inode->i_sb, 756 credits = ocfs2_calc_extend_credits(inode->i_sb,
760 &vb->vb_xv->xr_list, 757 &vb->vb_xv->xr_list,
761 clusters_to_add); 758 clusters_to_add);
@@ -3246,8 +3243,8 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
3246 } 3243 }
3247 3244
3248 meta_add += extra_meta; 3245 meta_add += extra_meta;
3249 mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " 3246 trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
3250 "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits); 3247 clusters_add, *credits);
3251 3248
3252 if (meta_add) { 3249 if (meta_add) {
3253 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, 3250 ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -3557,7 +3554,7 @@ int ocfs2_xattr_set(struct inode *inode,
3557 down_write(&OCFS2_I(inode)->ip_xattr_sem); 3554 down_write(&OCFS2_I(inode)->ip_xattr_sem);
3558 /* 3555 /*
3559 * Scan inode and external block to find the same name 3556 * Scan inode and external block to find the same name
3560 * extended attribute and collect search infomation. 3557 * extended attribute and collect search information.
3561 */ 3558 */
3562 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); 3559 ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis);
3563 if (ret) 3560 if (ret)
@@ -3581,7 +3578,7 @@ int ocfs2_xattr_set(struct inode *inode,
3581 goto cleanup; 3578 goto cleanup;
3582 } 3579 }
3583 3580
3584 /* Check whether the value is refcounted and do some prepartion. */ 3581 /* Check whether the value is refcounted and do some preparation. */
3585 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && 3582 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL &&
3586 (!xis.not_found || !xbs.not_found)) { 3583 (!xis.not_found || !xbs.not_found)) {
3587 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, 3584 ret = ocfs2_prepare_refcount_xattr(inode, di, &xi,
@@ -3887,8 +3884,10 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
3887 3884
3888 if (found) { 3885 if (found) {
3889 xs->here = &xs->header->xh_entries[index]; 3886 xs->here = &xs->header->xh_entries[index];
3890 mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, 3887 trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
3891 (unsigned long long)bucket_blkno(xs->bucket), index); 3888 name, name_index, name_hash,
3889 (unsigned long long)bucket_blkno(xs->bucket),
3890 index);
3892 } else 3891 } else
3893 ret = -ENODATA; 3892 ret = -ENODATA;
3894 3893
@@ -3915,8 +3914,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
3915 if (le16_to_cpu(el->l_next_free_rec) == 0) 3914 if (le16_to_cpu(el->l_next_free_rec) == 0)
3916 return -ENODATA; 3915 return -ENODATA;
3917 3916
3918 mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", 3917 trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
3919 name, name_hash, name_index); 3918 name, name_index, name_hash,
3919 (unsigned long long)root_bh->b_blocknr,
3920 -1);
3920 3921
3921 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, 3922 ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
3922 &num_clusters, el); 3923 &num_clusters, el);
@@ -3927,9 +3928,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
3927 3928
3928 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); 3929 BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
3929 3930
3930 mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " 3931 trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
3931 "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno, 3932 name, name_index, first_hash,
3932 first_hash); 3933 (unsigned long long)p_blkno,
3934 num_clusters);
3933 3935
3934 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, 3936 ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
3935 p_blkno, first_hash, num_clusters, xs); 3937 p_blkno, first_hash, num_clusters, xs);
@@ -3955,8 +3957,9 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
3955 return -ENOMEM; 3957 return -ENOMEM;
3956 } 3958 }
3957 3959
3958 mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", 3960 trace_ocfs2_iterate_xattr_buckets(
3959 clusters, (unsigned long long)blkno); 3961 (unsigned long long)OCFS2_I(inode)->ip_blkno,
3962 (unsigned long long)blkno, clusters);
3960 3963
3961 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { 3964 for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
3962 ret = ocfs2_read_xattr_bucket(bucket, blkno); 3965 ret = ocfs2_read_xattr_bucket(bucket, blkno);
@@ -3972,8 +3975,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
3972 if (i == 0) 3975 if (i == 0)
3973 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); 3976 num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
3974 3977
3975 mlog(0, "iterating xattr bucket %llu, first hash %u\n", 3978 trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
3976 (unsigned long long)blkno,
3977 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); 3979 le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
3978 if (func) { 3980 if (func) {
3979 ret = func(inode, bucket, para); 3981 ret = func(inode, bucket, para);
@@ -4173,9 +4175,9 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
4173 char *src = xb_bh->b_data; 4175 char *src = xb_bh->b_data;
4174 char *target = bucket_block(bucket, blks - 1); 4176 char *target = bucket_block(bucket, blks - 1);
4175 4177
4176 mlog(0, "cp xattr from block %llu to bucket %llu\n", 4178 trace_ocfs2_cp_xattr_block_to_bucket_begin(
4177 (unsigned long long)xb_bh->b_blocknr, 4179 (unsigned long long)xb_bh->b_blocknr,
4178 (unsigned long long)bucket_blkno(bucket)); 4180 (unsigned long long)bucket_blkno(bucket));
4179 4181
4180 for (i = 0; i < blks; i++) 4182 for (i = 0; i < blks; i++)
4181 memset(bucket_block(bucket, i), 0, blocksize); 4183 memset(bucket_block(bucket, i), 0, blocksize);
@@ -4211,8 +4213,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
4211 for (i = 0; i < count; i++) 4213 for (i = 0; i < count; i++)
4212 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); 4214 le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
4213 4215
4214 mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", 4216 trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
4215 offset, size, off_change);
4216 4217
4217 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), 4218 sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
4218 cmp_xe, swap_xe); 4219 cmp_xe, swap_xe);
@@ -4261,8 +4262,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4261 struct ocfs2_xattr_tree_root *xr; 4262 struct ocfs2_xattr_tree_root *xr;
4262 u16 xb_flags = le16_to_cpu(xb->xb_flags); 4263 u16 xb_flags = le16_to_cpu(xb->xb_flags);
4263 4264
4264 mlog(0, "create xattr index block for %llu\n", 4265 trace_ocfs2_xattr_create_index_block_begin(
4265 (unsigned long long)xb_bh->b_blocknr); 4266 (unsigned long long)xb_bh->b_blocknr);
4266 4267
4267 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); 4268 BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
4268 BUG_ON(!xs->bucket); 4269 BUG_ON(!xs->bucket);
@@ -4295,8 +4296,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
4295 */ 4296 */
4296 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); 4297 blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
4297 4298
4298 mlog(0, "allocate 1 cluster from %llu to xattr block\n", 4299 trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
4299 (unsigned long long)blkno);
4300 4300
4301 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); 4301 ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
4302 if (ret) { 4302 if (ret) {
@@ -4400,8 +4400,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
4400 entries = (char *)xh->xh_entries; 4400 entries = (char *)xh->xh_entries;
4401 xh_free_start = le16_to_cpu(xh->xh_free_start); 4401 xh_free_start = le16_to_cpu(xh->xh_free_start);
4402 4402
4403 mlog(0, "adjust xattr bucket in %llu, count = %u, " 4403 trace_ocfs2_defrag_xattr_bucket(
4404 "xh_free_start = %u, xh_name_value_len = %u.\n",
4405 (unsigned long long)blkno, le16_to_cpu(xh->xh_count), 4404 (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
4406 xh_free_start, le16_to_cpu(xh->xh_name_value_len)); 4405 xh_free_start, le16_to_cpu(xh->xh_name_value_len));
4407 4406
@@ -4503,8 +4502,9 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
4503 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); 4502 BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
4504 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); 4503 BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
4505 4504
4506 mlog(0, "move half of xattrs in cluster %llu to %llu\n", 4505 trace_ocfs2_mv_xattr_bucket_cross_cluster(
4507 (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno); 4506 (unsigned long long)last_cluster_blkno,
4507 (unsigned long long)new_blkno);
4508 4508
4509 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), 4509 ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
4510 last_cluster_blkno, new_blkno, 4510 last_cluster_blkno, new_blkno,
@@ -4614,8 +4614,8 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4614 struct ocfs2_xattr_entry *xe; 4614 struct ocfs2_xattr_entry *xe;
4615 int blocksize = inode->i_sb->s_blocksize; 4615 int blocksize = inode->i_sb->s_blocksize;
4616 4616
4617 mlog(0, "move some of xattrs from bucket %llu to %llu\n", 4617 trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
4618 (unsigned long long)blk, (unsigned long long)new_blk); 4618 (unsigned long long)new_blk);
4619 4619
4620 s_bucket = ocfs2_xattr_bucket_new(inode); 4620 s_bucket = ocfs2_xattr_bucket_new(inode);
4621 t_bucket = ocfs2_xattr_bucket_new(inode); 4621 t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4714,9 +4714,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
4714 */ 4714 */
4715 xe = &xh->xh_entries[start]; 4715 xe = &xh->xh_entries[start];
4716 len = sizeof(struct ocfs2_xattr_entry) * (count - start); 4716 len = sizeof(struct ocfs2_xattr_entry) * (count - start);
4717 mlog(0, "mv xattr entry len %d from %d to %d\n", len, 4717 trace_ocfs2_divide_xattr_bucket_move(len,
4718 (int)((char *)xe - (char *)xh), 4718 (int)((char *)xe - (char *)xh),
4719 (int)((char *)xh->xh_entries - (char *)xh)); 4719 (int)((char *)xh->xh_entries - (char *)xh));
4720 memmove((char *)xh->xh_entries, (char *)xe, len); 4720 memmove((char *)xh->xh_entries, (char *)xe, len);
4721 xe = &xh->xh_entries[count - start]; 4721 xe = &xh->xh_entries[count - start];
4722 len = sizeof(struct ocfs2_xattr_entry) * start; 4722 len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -4788,9 +4788,9 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
4788 4788
4789 BUG_ON(s_blkno == t_blkno); 4789 BUG_ON(s_blkno == t_blkno);
4790 4790
4791 mlog(0, "cp bucket %llu to %llu, target is %d\n", 4791 trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
4792 (unsigned long long)s_blkno, (unsigned long long)t_blkno, 4792 (unsigned long long)t_blkno,
4793 t_is_new); 4793 t_is_new);
4794 4794
4795 s_bucket = ocfs2_xattr_bucket_new(inode); 4795 s_bucket = ocfs2_xattr_bucket_new(inode);
4796 t_bucket = ocfs2_xattr_bucket_new(inode); 4796 t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4862,8 +4862,8 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
4862 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); 4862 int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
4863 struct ocfs2_xattr_bucket *old_first, *new_first; 4863 struct ocfs2_xattr_bucket *old_first, *new_first;
4864 4864
4865 mlog(0, "mv xattrs from cluster %llu to %llu\n", 4865 trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
4866 (unsigned long long)last_blk, (unsigned long long)to_blk); 4866 (unsigned long long)to_blk);
4867 4867
4868 BUG_ON(start_bucket >= num_buckets); 4868 BUG_ON(start_bucket >= num_buckets);
4869 if (start_bucket) { 4869 if (start_bucket) {
@@ -5013,9 +5013,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
5013{ 5013{
5014 int ret; 5014 int ret;
5015 5015
5016 mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", 5016 trace_ocfs2_adjust_xattr_cross_cluster(
5017 (unsigned long long)bucket_blkno(first), prev_clusters, 5017 (unsigned long long)bucket_blkno(first),
5018 (unsigned long long)new_blk); 5018 (unsigned long long)new_blk, prev_clusters);
5019 5019
5020 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { 5020 if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
5021 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, 5021 ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -5088,10 +5088,10 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5088 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5089 struct ocfs2_extent_tree et; 5089 struct ocfs2_extent_tree et;
5090 5090
5091 mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " 5091 trace_ocfs2_add_new_xattr_cluster_begin(
5092 "previous xattr blkno = %llu\n", 5092 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5093 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5093 (unsigned long long)bucket_blkno(first),
5094 prev_cpos, (unsigned long long)bucket_blkno(first)); 5094 prev_cpos, prev_clusters);
5095 5095
5096 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh); 5096 ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
5097 5097
@@ -5113,8 +5113,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5113 BUG_ON(num_bits > clusters_to_add); 5113 BUG_ON(num_bits > clusters_to_add);
5114 5114
5115 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 5115 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
5116 mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", 5116 trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
5117 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
5118 5117
5119 if (bucket_blkno(first) + (prev_clusters * bpc) == block && 5118 if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
5120 (prev_clusters + num_bits) << osb->s_clustersize_bits <= 5119 (prev_clusters + num_bits) << osb->s_clustersize_bits <=
@@ -5130,8 +5129,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5130 */ 5129 */
5131 v_start = prev_cpos + prev_clusters; 5130 v_start = prev_cpos + prev_clusters;
5132 *num_clusters = prev_clusters + num_bits; 5131 *num_clusters = prev_clusters + num_bits;
5133 mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
5134 num_bits);
5135 } else { 5132 } else {
5136 ret = ocfs2_adjust_xattr_cross_cluster(inode, 5133 ret = ocfs2_adjust_xattr_cross_cluster(inode,
5137 handle, 5134 handle,
@@ -5147,8 +5144,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
5147 } 5144 }
5148 } 5145 }
5149 5146
5150 mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", 5147 trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
5151 num_bits, (unsigned long long)block, v_start); 5148 v_start, num_bits);
5152 ret = ocfs2_insert_extent(handle, &et, v_start, block, 5149 ret = ocfs2_insert_extent(handle, &et, v_start, block,
5153 num_bits, 0, ctxt->meta_ac); 5150 num_bits, 0, ctxt->meta_ac);
5154 if (ret < 0) { 5151 if (ret < 0) {
@@ -5183,9 +5180,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
5183 u64 end_blk; 5180 u64 end_blk;
5184 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); 5181 u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
5185 5182
5186 mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " 5183 trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
5187 "from %llu, len = %u\n", (unsigned long long)target_blk, 5184 (unsigned long long)bucket_blkno(first),
5188 (unsigned long long)bucket_blkno(first), num_clusters); 5185 num_clusters, new_bucket);
5189 5186
5190 /* The extent must have room for an additional bucket */ 5187 /* The extent must have room for an additional bucket */
5191 BUG_ON(new_bucket >= 5188 BUG_ON(new_bucket >=
@@ -5265,8 +5262,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
5265 /* The bucket at the front of the extent */ 5262 /* The bucket at the front of the extent */
5266 struct ocfs2_xattr_bucket *first; 5263 struct ocfs2_xattr_bucket *first;
5267 5264
5268 mlog(0, "Add new xattr bucket starting from %llu\n", 5265 trace_ocfs2_add_new_xattr_bucket(
5269 (unsigned long long)bucket_blkno(target)); 5266 (unsigned long long)bucket_blkno(target));
5270 5267
5271 /* The first bucket of the original extent */ 5268 /* The first bucket of the original extent */
5272 first = ocfs2_xattr_bucket_new(inode); 5269 first = ocfs2_xattr_bucket_new(inode);
@@ -5382,8 +5379,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
5382 * modified something. We have to assume they did, and dirty 5379 * modified something. We have to assume they did, and dirty
5383 * the whole bucket. This leaves us in a consistent state. 5380 * the whole bucket. This leaves us in a consistent state.
5384 */ 5381 */
5385 mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", 5382 trace_ocfs2_xattr_bucket_value_truncate(
5386 xe_off, (unsigned long long)bucket_blkno(bucket), len); 5383 (unsigned long long)bucket_blkno(bucket), xe_off, len);
5387 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); 5384 ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
5388 if (ret) { 5385 if (ret) {
5389 mlog_errno(ret); 5386 mlog_errno(ret);
@@ -5433,8 +5430,9 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
5433 5430
5434 ocfs2_init_dealloc_ctxt(&dealloc); 5431 ocfs2_init_dealloc_ctxt(&dealloc);
5435 5432
5436 mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", 5433 trace_ocfs2_rm_xattr_cluster(
5437 cpos, len, (unsigned long long)blkno); 5434 (unsigned long long)OCFS2_I(inode)->ip_blkno,
5435 (unsigned long long)blkno, cpos, len);
5438 5436
5439 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno, 5437 ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
5440 len); 5438 len);
@@ -5538,7 +5536,7 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5538 int ret; 5536 int ret;
5539 struct ocfs2_xa_loc loc; 5537 struct ocfs2_xa_loc loc;
5540 5538
5541 mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name); 5539 trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
5542 5540
5543 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket, 5541 ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
5544 xs->not_found ? NULL : xs->here); 5542 xs->not_found ? NULL : xs->here);
@@ -5570,7 +5568,6 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
5570 5568
5571 5569
5572out: 5570out:
5573 mlog_exit(ret);
5574 return ret; 5571 return ret;
5575} 5572}
5576 5573
@@ -5581,7 +5578,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5581{ 5578{
5582 int ret; 5579 int ret;
5583 5580
5584 mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name); 5581 trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
5585 5582
5586 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt); 5583 ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
5587 if (!ret) 5584 if (!ret)
@@ -5637,7 +5634,6 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
5637 mlog_errno(ret); 5634 mlog_errno(ret);
5638 5635
5639out: 5636out:
5640 mlog_exit(ret);
5641 return ret; 5637 return ret;
5642} 5638}
5643 5639
@@ -6041,9 +6037,9 @@ static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
6041 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb))) 6037 if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
6042 p = &refcount; 6038 p = &refcount;
6043 6039
6044 mlog(0, "refcount bucket %llu, count = %u\n", 6040 trace_ocfs2_xattr_bucket_value_refcount(
6045 (unsigned long long)bucket_blkno(bucket), 6041 (unsigned long long)bucket_blkno(bucket),
6046 le16_to_cpu(xh->xh_count)); 6042 le16_to_cpu(xh->xh_count));
6047 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { 6043 for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
6048 xe = &xh->xh_entries[i]; 6044 xe = &xh->xh_entries[i];
6049 6045
@@ -6339,8 +6335,8 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
6339 u32 clusters, cpos, p_cluster, num_clusters; 6335 u32 clusters, cpos, p_cluster, num_clusters;
6340 unsigned int ext_flags = 0; 6336 unsigned int ext_flags = 0;
6341 6337
6342 mlog(0, "reflink xattr in container %llu, count = %u\n", 6338 trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
6343 (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count)); 6339 le16_to_cpu(xh->xh_count));
6344 6340
6345 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)]; 6341 last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
6346 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) { 6342 for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
@@ -6540,8 +6536,8 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
6540 goto out; 6536 goto out;
6541 } 6537 }
6542 6538
6543 mlog(0, "create new xattr block for inode %llu, index = %d\n", 6539 trace_ocfs2_create_empty_xattr_block(
6544 (unsigned long long)fe_bh->b_blocknr, indexed); 6540 (unsigned long long)fe_bh->b_blocknr, indexed);
6545 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed, 6541 ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
6546 ret_bh); 6542 ret_bh);
6547 if (ret) 6543 if (ret)
@@ -6952,8 +6948,8 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
6952 if (ret) 6948 if (ret)
6953 mlog_errno(ret); 6949 mlog_errno(ret);
6954 6950
6955 mlog(0, "insert new xattr extent rec start %llu len %u to %u\n", 6951 trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
6956 (unsigned long long)new_blkno, num_clusters, reflink_cpos); 6952 num_clusters, reflink_cpos);
6957 6953
6958 len -= num_clusters; 6954 len -= num_clusters;
6959 blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters); 6955 blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
@@ -6982,8 +6978,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
6982 struct ocfs2_alloc_context *data_ac = NULL; 6978 struct ocfs2_alloc_context *data_ac = NULL;
6983 struct ocfs2_extent_tree et; 6979 struct ocfs2_extent_tree et;
6984 6980
6985 mlog(0, "reflink xattr buckets %llu len %u\n", 6981 trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
6986 (unsigned long long)blkno, len);
6987 6982
6988 ocfs2_init_xattr_tree_extent_tree(&et, 6983 ocfs2_init_xattr_tree_extent_tree(&et,
6989 INODE_CACHE(args->reflink->new_inode), 6984 INODE_CACHE(args->reflink->new_inode),
@@ -7185,7 +7180,8 @@ out:
 7185 * must not hold any lock except i_mutex. 7180 * must not hold any lock except i_mutex.
7186 */ 7181 */
7187int ocfs2_init_security_and_acl(struct inode *dir, 7182int ocfs2_init_security_and_acl(struct inode *dir,
7188 struct inode *inode) 7183 struct inode *inode,
7184 const struct qstr *qstr)
7189{ 7185{
7190 int ret = 0; 7186 int ret = 0;
7191 struct buffer_head *dir_bh = NULL; 7187 struct buffer_head *dir_bh = NULL;
@@ -7193,7 +7189,7 @@ int ocfs2_init_security_and_acl(struct inode *dir,
7193 .enable = 1, 7189 .enable = 1,
7194 }; 7190 };
7195 7191
7196 ret = ocfs2_init_security_get(inode, dir, &si); 7192 ret = ocfs2_init_security_get(inode, dir, qstr, &si);
7197 if (!ret) { 7193 if (!ret) {
7198 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, 7194 ret = ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
7199 si.name, si.value, si.value_len, 7195 si.name, si.value, si.value_len,
@@ -7261,13 +7257,14 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
7261 7257
7262int ocfs2_init_security_get(struct inode *inode, 7258int ocfs2_init_security_get(struct inode *inode,
7263 struct inode *dir, 7259 struct inode *dir,
7260 const struct qstr *qstr,
7264 struct ocfs2_security_xattr_info *si) 7261 struct ocfs2_security_xattr_info *si)
7265{ 7262{
7266 /* check whether ocfs2 support feature xattr */ 7263 /* check whether ocfs2 support feature xattr */
7267 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) 7264 if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb)))
7268 return -EOPNOTSUPP; 7265 return -EOPNOTSUPP;
7269 return security_inode_init_security(inode, dir, &si->name, &si->value, 7266 return security_inode_init_security(inode, dir, qstr, &si->name,
7270 &si->value_len); 7267 &si->value, &si->value_len);
7271} 7268}
7272 7269
7273int ocfs2_init_security_set(handle_t *handle, 7270int ocfs2_init_security_set(handle_t *handle,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index aa64bb37a65b..d63cfb72316b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -57,6 +57,7 @@ int ocfs2_has_inline_xattr_value_outside(struct inode *inode,
57 struct ocfs2_dinode *di); 57 struct ocfs2_dinode *di);
58int ocfs2_xattr_remove(struct inode *, struct buffer_head *); 58int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
59int ocfs2_init_security_get(struct inode *, struct inode *, 59int ocfs2_init_security_get(struct inode *, struct inode *,
60 const struct qstr *,
60 struct ocfs2_security_xattr_info *); 61 struct ocfs2_security_xattr_info *);
61int ocfs2_init_security_set(handle_t *, struct inode *, 62int ocfs2_init_security_set(handle_t *, struct inode *,
62 struct buffer_head *, 63 struct buffer_head *,
@@ -94,5 +95,6 @@ int ocfs2_reflink_xattrs(struct inode *old_inode,
94 struct buffer_head *new_bh, 95 struct buffer_head *new_bh,
95 bool preserve_security); 96 bool preserve_security);
96int ocfs2_init_security_and_acl(struct inode *dir, 97int ocfs2_init_security_and_acl(struct inode *dir,
97 struct inode *inode); 98 struct inode *inode,
99 const struct qstr *qstr);
98#endif /* OCFS2_XATTR_H */ 100#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 393f3f659da7..de4ff29f1e05 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -235,33 +235,22 @@ static int omfs_dir_is_empty(struct inode *inode)
235 return *ptr != ~0; 235 return *ptr != ~0;
236} 236}
237 237
238static int omfs_unlink(struct inode *dir, struct dentry *dentry) 238static int omfs_remove(struct inode *dir, struct dentry *dentry)
239{ 239{
240 int ret;
241 struct inode *inode = dentry->d_inode; 240 struct inode *inode = dentry->d_inode;
241 int ret;
242
243 if (S_ISDIR(inode->i_mode) && !omfs_dir_is_empty(inode))
244 return -ENOTEMPTY;
242 245
243 ret = omfs_delete_entry(dentry); 246 ret = omfs_delete_entry(dentry);
244 if (ret) 247 if (ret)
245 goto end_unlink; 248 return ret;
246 249
247 inode_dec_link_count(inode); 250 clear_nlink(inode);
251 mark_inode_dirty(inode);
248 mark_inode_dirty(dir); 252 mark_inode_dirty(dir);
249 253 return 0;
250end_unlink:
251 return ret;
252}
253
254static int omfs_rmdir(struct inode *dir, struct dentry *dentry)
255{
256 int err = -ENOTEMPTY;
257 struct inode *inode = dentry->d_inode;
258
259 if (omfs_dir_is_empty(inode)) {
260 err = omfs_unlink(dir, dentry);
261 if (!err)
262 inode_dec_link_count(inode);
263 }
264 return err;
265} 254}
266 255
267static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode) 256static int omfs_add_node(struct inode *dir, struct dentry *dentry, int mode)
@@ -372,9 +361,10 @@ static int omfs_fill_chain(struct file *filp, void *dirent, filldir_t filldir,
372 361
373 res = filldir(dirent, oi->i_name, strnlen(oi->i_name, 362 res = filldir(dirent, oi->i_name, strnlen(oi->i_name,
374 OMFS_NAMELEN), filp->f_pos, self, d_type); 363 OMFS_NAMELEN), filp->f_pos, self, d_type);
375 if (res == 0)
376 filp->f_pos++;
377 brelse(bh); 364 brelse(bh);
365 if (res < 0)
366 break;
367 filp->f_pos++;
378 } 368 }
379out: 369out:
380 return res; 370 return res;
@@ -385,44 +375,28 @@ static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry,
385{ 375{
386 struct inode *new_inode = new_dentry->d_inode; 376 struct inode *new_inode = new_dentry->d_inode;
387 struct inode *old_inode = old_dentry->d_inode; 377 struct inode *old_inode = old_dentry->d_inode;
388 struct buffer_head *bh;
389 int is_dir;
390 int err; 378 int err;
391 379
392 is_dir = S_ISDIR(old_inode->i_mode);
393
394 if (new_inode) { 380 if (new_inode) {
395 /* overwriting existing file/dir */ 381 /* overwriting existing file/dir */
396 err = -ENOTEMPTY; 382 err = omfs_remove(new_dir, new_dentry);
397 if (is_dir && !omfs_dir_is_empty(new_inode))
398 goto out;
399
400 err = -ENOENT;
401 bh = omfs_find_entry(new_dir, new_dentry->d_name.name,
402 new_dentry->d_name.len);
403 if (IS_ERR(bh))
404 goto out;
405 brelse(bh);
406
407 err = omfs_unlink(new_dir, new_dentry);
408 if (err) 383 if (err)
409 goto out; 384 goto out;
410 } 385 }
411 386
412 /* since omfs locates files by name, we need to unlink _before_ 387 /* since omfs locates files by name, we need to unlink _before_
413 * adding the new link or we won't find the old one */ 388 * adding the new link or we won't find the old one */
414 inode_inc_link_count(old_inode); 389 err = omfs_delete_entry(old_dentry);
415 err = omfs_unlink(old_dir, old_dentry); 390 if (err)
416 if (err) {
417 inode_dec_link_count(old_inode);
418 goto out; 391 goto out;
419 }
420 392
393 mark_inode_dirty(old_dir);
421 err = omfs_add_link(new_dentry, old_inode); 394 err = omfs_add_link(new_dentry, old_inode);
422 if (err) 395 if (err)
423 goto out; 396 goto out;
424 397
425 old_inode->i_ctime = CURRENT_TIME_SEC; 398 old_inode->i_ctime = CURRENT_TIME_SEC;
399 mark_inode_dirty(old_inode);
426out: 400out:
427 return err; 401 return err;
428} 402}
@@ -488,8 +462,8 @@ const struct inode_operations omfs_dir_inops = {
488 .mkdir = omfs_mkdir, 462 .mkdir = omfs_mkdir,
489 .rename = omfs_rename, 463 .rename = omfs_rename,
490 .create = omfs_create, 464 .create = omfs_create,
491 .unlink = omfs_unlink, 465 .unlink = omfs_remove,
492 .rmdir = omfs_rmdir, 466 .rmdir = omfs_remove,
493}; 467};
494 468
495const struct file_operations omfs_dir_operations = { 469const struct file_operations omfs_dir_operations = {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 8a6d34fa668a..d738a7e493dd 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -372,7 +372,6 @@ const struct address_space_operations omfs_aops = {
372 .readpages = omfs_readpages, 372 .readpages = omfs_readpages,
373 .writepage = omfs_writepage, 373 .writepage = omfs_writepage,
374 .writepages = omfs_writepages, 374 .writepages = omfs_writepages,
375 .sync_page = block_sync_page,
376 .write_begin = omfs_write_begin, 375 .write_begin = omfs_write_begin,
377 .write_end = generic_write_end, 376 .write_end = generic_write_end,
378 .bmap = omfs_bmap, 377 .bmap = omfs_bmap,
diff --git a/fs/open.c b/fs/open.c
index e52389e1f05b..b52cf013ffa1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
233 233
234 if (!(file->f_mode & FMODE_WRITE)) 234 if (!(file->f_mode & FMODE_WRITE))
235 return -EBADF; 235 return -EBADF;
236
237 /* It's not possible punch hole on append only file */
238 if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
239 return -EPERM;
240
241 if (IS_IMMUTABLE(inode))
242 return -EPERM;
243
236 /* 244 /*
237 * Revalidate the write permissions, in case security policy has 245 * Revalidate the write permissions, in case security policy has
238 * changed since the files were opened. 246 * changed since the files were opened.
@@ -565,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
565{ 573{
566 struct path path; 574 struct path path;
567 int error = -EINVAL; 575 int error = -EINVAL;
568 int follow; 576 int lookup_flags;
569 577
570 if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) 578 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
571 goto out; 579 goto out;
572 580
573 follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; 581 lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
574 error = user_path_at(dfd, filename, follow, &path); 582 if (flag & AT_EMPTY_PATH)
583 lookup_flags |= LOOKUP_EMPTY;
584 error = user_path_at(dfd, filename, lookup_flags, &path);
575 if (error) 585 if (error)
576 goto out; 586 goto out;
577 error = mnt_want_write(path.mnt); 587 error = mnt_want_write(path.mnt);
@@ -661,11 +671,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
661 int (*open)(struct inode *, struct file *), 671 int (*open)(struct inode *, struct file *),
662 const struct cred *cred) 672 const struct cred *cred)
663{ 673{
674 static const struct file_operations empty_fops = {};
664 struct inode *inode; 675 struct inode *inode;
665 int error; 676 int error;
666 677
667 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | 678 f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
668 FMODE_PREAD | FMODE_PWRITE; 679 FMODE_PREAD | FMODE_PWRITE;
680
681 if (unlikely(f->f_flags & O_PATH))
682 f->f_mode = FMODE_PATH;
683
669 inode = dentry->d_inode; 684 inode = dentry->d_inode;
670 if (f->f_mode & FMODE_WRITE) { 685 if (f->f_mode & FMODE_WRITE) {
671 error = __get_file_write_access(inode, mnt); 686 error = __get_file_write_access(inode, mnt);
@@ -679,9 +694,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
679 f->f_path.dentry = dentry; 694 f->f_path.dentry = dentry;
680 f->f_path.mnt = mnt; 695 f->f_path.mnt = mnt;
681 f->f_pos = 0; 696 f->f_pos = 0;
682 f->f_op = fops_get(inode->i_fop);
683 file_sb_list_add(f, inode->i_sb); 697 file_sb_list_add(f, inode->i_sb);
684 698
699 if (unlikely(f->f_mode & FMODE_PATH)) {
700 f->f_op = &empty_fops;
701 return f;
702 }
703
704 f->f_op = fops_get(inode->i_fop);
705
685 error = security_dentry_open(f, cred); 706 error = security_dentry_open(f, cred);
686 if (error) 707 if (error)
687 goto cleanup_all; 708 goto cleanup_all;
@@ -693,7 +714,8 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
693 if (error) 714 if (error)
694 goto cleanup_all; 715 goto cleanup_all;
695 } 716 }
696 ima_counts_get(f); 717 if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
718 i_readcount_inc(inode);
697 719
698 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); 720 f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
699 721
@@ -790,6 +812,8 @@ struct file *nameidata_to_filp(struct nameidata *nd)
790 812
791 /* Pick up the filp from the open intent */ 813 /* Pick up the filp from the open intent */
792 filp = nd->intent.open.file; 814 filp = nd->intent.open.file;
815 nd->intent.open.file = NULL;
816
793 /* Has the filesystem initialised the file for us? */ 817 /* Has the filesystem initialised the file for us? */
794 if (filp->f_path.dentry == NULL) { 818 if (filp->f_path.dentry == NULL) {
795 path_get(&nd->path); 819 path_get(&nd->path);
@@ -811,17 +835,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags,
811 835
812 validate_creds(cred); 836 validate_creds(cred);
813 837
814 /* 838 /* We must always pass in a valid mount pointer. */
815 * We must always pass in a valid mount pointer. Historically 839 BUG_ON(!mnt);
816 * callers got away with not passing it, but we must enforce this at
817 * the earliest possible point now to avoid strange problems deep in the
818 * filesystem stack.
819 */
820 if (!mnt) {
821 printk(KERN_WARNING "%s called with NULL vfsmount\n", __func__);
822 dump_stack();
823 return ERR_PTR(-EINVAL);
824 }
825 840
826 error = -ENFILE; 841 error = -ENFILE;
827 f = get_empty_filp(); 842 f = get_empty_filp();
@@ -880,15 +895,110 @@ void fd_install(unsigned int fd, struct file *file)
880 895
881EXPORT_SYMBOL(fd_install); 896EXPORT_SYMBOL(fd_install);
882 897
898static inline int build_open_flags(int flags, int mode, struct open_flags *op)
899{
900 int lookup_flags = 0;
901 int acc_mode;
902
903 if (!(flags & O_CREAT))
904 mode = 0;
905 op->mode = mode;
906
907 /* Must never be set by userspace */
908 flags &= ~FMODE_NONOTIFY;
909
910 /*
911 * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only
 912 * check for O_DSYNC if they need any syncing at all we enforce it's
913 * always set instead of having to deal with possibly weird behaviour
914 * for malicious applications setting only __O_SYNC.
915 */
916 if (flags & __O_SYNC)
917 flags |= O_DSYNC;
918
919 /*
 920 * If we have O_PATH in the open flag, then we
921 * cannot have anything other than the below set of flags
922 */
923 if (flags & O_PATH) {
924 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
925 acc_mode = 0;
926 } else {
927 acc_mode = MAY_OPEN | ACC_MODE(flags);
928 }
929
930 op->open_flag = flags;
931
932 /* O_TRUNC implies we need access checks for write permissions */
933 if (flags & O_TRUNC)
934 acc_mode |= MAY_WRITE;
935
936 /* Allow the LSM permission hook to distinguish append
937 access from general write access. */
938 if (flags & O_APPEND)
939 acc_mode |= MAY_APPEND;
940
941 op->acc_mode = acc_mode;
942
943 op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
944
945 if (flags & O_CREAT) {
946 op->intent |= LOOKUP_CREATE;
947 if (flags & O_EXCL)
948 op->intent |= LOOKUP_EXCL;
949 }
950
951 if (flags & O_DIRECTORY)
952 lookup_flags |= LOOKUP_DIRECTORY;
953 if (!(flags & O_NOFOLLOW))
954 lookup_flags |= LOOKUP_FOLLOW;
955 return lookup_flags;
956}
957
958/**
959 * filp_open - open file and return file pointer
960 *
961 * @filename: path to open
962 * @flags: open flags as per the open(2) second argument
963 * @mode: mode for the new file if O_CREAT is set, else ignored
964 *
965 * This is the helper to open a file from kernelspace if you really
 966 * have to. But in general you should not do this, so please move
967 * along, nothing to see here..
968 */
969struct file *filp_open(const char *filename, int flags, int mode)
970{
971 struct open_flags op;
972 int lookup = build_open_flags(flags, mode, &op);
973 return do_filp_open(AT_FDCWD, filename, &op, lookup);
974}
975EXPORT_SYMBOL(filp_open);
976
977struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
978 const char *filename, int flags)
979{
980 struct open_flags op;
981 int lookup = build_open_flags(flags, 0, &op);
982 if (flags & O_CREAT)
983 return ERR_PTR(-EINVAL);
984 if (!filename && (flags & O_DIRECTORY))
985 if (!dentry->d_inode->i_op->lookup)
986 return ERR_PTR(-ENOTDIR);
987 return do_file_open_root(dentry, mnt, filename, &op, lookup);
988}
989EXPORT_SYMBOL(file_open_root);
990
883long do_sys_open(int dfd, const char __user *filename, int flags, int mode) 991long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
884{ 992{
993 struct open_flags op;
994 int lookup = build_open_flags(flags, mode, &op);
885 char *tmp = getname(filename); 995 char *tmp = getname(filename);
886 int fd = PTR_ERR(tmp); 996 int fd = PTR_ERR(tmp);
887 997
888 if (!IS_ERR(tmp)) { 998 if (!IS_ERR(tmp)) {
889 fd = get_unused_fd_flags(flags); 999 fd = get_unused_fd_flags(flags);
890 if (fd >= 0) { 1000 if (fd >= 0) {
891 struct file *f = do_filp_open(dfd, tmp, flags, mode, 0); 1001 struct file *f = do_filp_open(dfd, tmp, &op, lookup);
892 if (IS_ERR(f)) { 1002 if (IS_ERR(f)) {
893 put_unused_fd(fd); 1003 put_unused_fd(fd);
894 fd = PTR_ERR(f); 1004 fd = PTR_ERR(f);
@@ -958,8 +1068,10 @@ int filp_close(struct file *filp, fl_owner_t id)
958 if (filp->f_op && filp->f_op->flush) 1068 if (filp->f_op && filp->f_op->flush)
959 retval = filp->f_op->flush(filp, id); 1069 retval = filp->f_op->flush(filp, id);
960 1070
961 dnotify_flush(filp, id); 1071 if (likely(!(filp->f_mode & FMODE_PATH))) {
962 locks_remove_posix(filp, id); 1072 dnotify_flush(filp, id);
1073 locks_remove_posix(filp, id);
1074 }
963 fput(filp); 1075 fput(filp);
964 return retval; 1076 return retval;
965} 1077}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 9c21119512b9..d545e97d99c3 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -290,7 +290,8 @@ ssize_t part_inflight_show(struct device *dev,
290{ 290{
291 struct hd_struct *p = dev_to_part(dev); 291 struct hd_struct *p = dev_to_part(dev);
292 292
293 return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); 293 return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
294 atomic_read(&p->in_flight[1]));
294} 295}
295 296
296#ifdef CONFIG_FAIL_MAKE_REQUEST 297#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -499,7 +500,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
499 /* everything is up and running, commence */ 500 /* everything is up and running, commence */
500 rcu_assign_pointer(ptbl->part[partno], p); 501 rcu_assign_pointer(ptbl->part[partno], p);
501 502
502 /* suppress uevent if the disk supresses it */ 503 /* suppress uevent if the disk suppresses it */
503 if (!dev_get_uevent_suppress(ddev)) 504 if (!dev_get_uevent_suppress(ddev))
504 kobject_uevent(&pdev->kobj, KOBJ_ADD); 505 kobject_uevent(&pdev->kobj, KOBJ_ADD);
505 506
@@ -584,7 +585,7 @@ rescan:
584 /* 585 /*
585 * If any partition code tried to read beyond EOD, try 586 * If any partition code tried to read beyond EOD, try
586 * unlocking native capacity even if partition table is 587 * unlocking native capacity even if partition table is
587 * sucessfully read as we could be missing some partitions. 588 * successfully read as we could be missing some partitions.
588 */ 589 */
589 if (state->access_beyond_eod) { 590 if (state->access_beyond_eod) {
590 printk(KERN_WARNING 591 printk(KERN_WARNING
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..ce4f62440425 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
251 } 251 }
252 252
253 vm->vblk_size = get_unaligned_be32(data + 0x08); 253 vm->vblk_size = get_unaligned_be32(data + 0x08);
254 if (vm->vblk_size == 0) {
255 ldm_error ("Illegal VBLK size");
256 return false;
257 }
258
254 vm->vblk_offset = get_unaligned_be32(data + 0x0C); 259 vm->vblk_offset = get_unaligned_be32(data + 0x0C);
255 vm->last_vblk_seq = get_unaligned_be32(data + 0x04); 260 vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
256 261
@@ -1294,6 +1299,11 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1294 1299
1295 BUG_ON (!data || !frags); 1300 BUG_ON (!data || !frags);
1296 1301
1302 if (size < 2 * VBLK_SIZE_HEAD) {
 1303 ldm_error("Value of size is too small.");
1304 return false;
1305 }
1306
1297 group = get_unaligned_be32(data + 0x08); 1307 group = get_unaligned_be32(data + 0x08);
1298 rec = get_unaligned_be16(data + 0x0C); 1308 rec = get_unaligned_be16(data + 0x0C);
1299 num = get_unaligned_be16(data + 0x0E); 1309 num = get_unaligned_be16(data + 0x0E);
@@ -1301,6 +1311,10 @@ static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags)
1301 ldm_error ("A VBLK claims to have %d parts.", num); 1311 ldm_error ("A VBLK claims to have %d parts.", num);
1302 return false; 1312 return false;
1303 } 1313 }
1314 if (rec >= num) {
1315 ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num);
1316 return false;
1317 }
1304 1318
1305 list_for_each (item, frags) { 1319 list_for_each (item, frags) {
1306 f = list_entry (item, struct frag, list); 1320 f = list_entry (item, struct frag, list);
@@ -1329,10 +1343,9 @@ found:
1329 1343
1330 f->map |= (1 << rec); 1344 f->map |= (1 << rec);
1331 1345
1332 if (num > 0) { 1346 data += VBLK_SIZE_HEAD;
1333 data += VBLK_SIZE_HEAD; 1347 size -= VBLK_SIZE_HEAD;
1334 size -= VBLK_SIZE_HEAD; 1348
1335 }
1336 memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size); 1349 memcpy (f->data+rec*(size-VBLK_SIZE_HEAD)+VBLK_SIZE_HEAD, data, size);
1337 1350
1338 return true; 1351 return true;
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index 68d6a216ee79..11f688bd76c5 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -29,10 +29,9 @@ static inline void mac_fix_string(char *stg, int len)
29 29
30int mac_partition(struct parsed_partitions *state) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1;
33 Sector sect; 32 Sector sect;
34 unsigned char *data; 33 unsigned char *data;
35 int blk, blocks_in_map; 34 int slot, blocks_in_map;
36 unsigned secsize; 35 unsigned secsize;
37#ifdef CONFIG_PPC_PMAC 36#ifdef CONFIG_PPC_PMAC
38 int found_root = 0; 37 int found_root = 0;
@@ -59,10 +58,14 @@ int mac_partition(struct parsed_partitions *state)
59 put_dev_sector(sect); 58 put_dev_sector(sect);
60 return 0; /* not a MacOS disk */ 59 return 0; /* not a MacOS disk */
61 } 60 }
62 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
63 blocks_in_map = be32_to_cpu(part->map_count); 61 blocks_in_map = be32_to_cpu(part->map_count);
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 62 if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) {
65 int pos = blk * secsize; 63 put_dev_sector(sect);
64 return 0;
65 }
66 strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
67 for (slot = 1; slot <= blocks_in_map; ++slot) {
68 int pos = slot * secsize;
66 put_dev_sector(sect); 69 put_dev_sector(sect);
67 data = read_part_sector(state, pos/512, &sect); 70 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 71 if (!data)
@@ -113,13 +116,11 @@ int mac_partition(struct parsed_partitions *state)
113 } 116 }
114 117
115 if (goodness > found_root_goodness) { 118 if (goodness > found_root_goodness) {
116 found_root = blk; 119 found_root = slot;
117 found_root_goodness = goodness; 120 found_root_goodness = goodness;
118 } 121 }
119 } 122 }
120#endif /* CONFIG_PPC_PMAC */ 123#endif /* CONFIG_PPC_PMAC */
121
122 ++slot;
123 } 124 }
124#ifdef CONFIG_PPC_PMAC 125#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 126 if (found_root_goodness)
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..764b86a01965 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13#define MAX_OSF_PARTITIONS 18
14
13int osf_partition(struct parsed_partitions *state) 15int osf_partition(struct parsed_partitions *state)
14{ 16{
15 int i; 17 int i;
16 int slot = 1; 18 int slot = 1;
19 unsigned int npartitions;
17 Sector sect; 20 Sector sect;
18 unsigned char *data; 21 unsigned char *data;
19 struct disklabel { 22 struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
45 u8 p_fstype; 48 u8 p_fstype;
46 u8 p_frag; 49 u8 p_frag;
47 __le16 p_cpg; 50 __le16 p_cpg;
48 } d_partitions[8]; 51 } d_partitions[MAX_OSF_PARTITIONS];
49 } * label; 52 } * label;
50 struct d_partition * partition; 53 struct d_partition * partition;
51 54
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
63 put_dev_sector(sect); 66 put_dev_sector(sect);
64 return 0; 67 return 0;
65 } 68 }
66 for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) { 69 npartitions = le16_to_cpu(label->d_npartitions);
70 if (npartitions > MAX_OSF_PARTITIONS) {
71 put_dev_sector(sect);
72 return 0;
73 }
74 for (i = 0 ; i < npartitions; i++, partition++) {
67 if (slot == state->limit) 75 if (slot == state->limit)
68 break; 76 break;
69 if (le32_to_cpu(partition->p_size)) 77 if (le32_to_cpu(partition->p_size))
diff --git a/fs/proc/array.c b/fs/proc/array.c
index df2b703b9d0f..5e4f776b0917 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -353,9 +353,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
353 task_cap(m, task); 353 task_cap(m, task);
354 task_cpus_allowed(m, task); 354 task_cpus_allowed(m, task);
355 cpuset_task_status_allowed(m, task); 355 cpuset_task_status_allowed(m, task);
356#if defined(CONFIG_S390)
357 task_show_regs(m, task);
358#endif
359 task_context_switch_counts(m, task); 356 task_context_switch_counts(m, task);
360 return 0; 357 return 0;
361} 358}
@@ -492,8 +489,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
492 vsize, 489 vsize,
493 mm ? get_mm_rss(mm) : 0, 490 mm ? get_mm_rss(mm) : 0,
494 rsslim, 491 rsslim,
495 mm ? mm->start_code : 0, 492 mm ? (permitted ? mm->start_code : 1) : 0,
496 mm ? mm->end_code : 0, 493 mm ? (permitted ? mm->end_code : 1) : 0,
497 (permitted && mm) ? mm->start_stack : 0, 494 (permitted && mm) ? mm->start_stack : 0,
498 esp, 495 esp,
499 eip, 496 eip,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..dfa532730e55 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -191,17 +191,20 @@ static int proc_root_link(struct inode *inode, struct path *path)
191 return result; 191 return result;
192} 192}
193 193
194/* 194static struct mm_struct *__check_mem_permission(struct task_struct *task)
195 * Return zero if current may access user memory in @task, -error if not.
196 */
197static int check_mem_permission(struct task_struct *task)
198{ 195{
196 struct mm_struct *mm;
197
198 mm = get_task_mm(task);
199 if (!mm)
200 return ERR_PTR(-EINVAL);
201
199 /* 202 /*
200 * A task can always look at itself, in case it chooses 203 * A task can always look at itself, in case it chooses
201 * to use system calls instead of load instructions. 204 * to use system calls instead of load instructions.
202 */ 205 */
203 if (task == current) 206 if (task == current)
204 return 0; 207 return mm;
205 208
206 /* 209 /*
207 * If current is actively ptrace'ing, and would also be 210 * If current is actively ptrace'ing, and would also be
@@ -213,27 +216,53 @@ static int check_mem_permission(struct task_struct *task)
213 match = (tracehook_tracer_task(task) == current); 216 match = (tracehook_tracer_task(task) == current);
214 rcu_read_unlock(); 217 rcu_read_unlock();
215 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH)) 218 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
216 return 0; 219 return mm;
217 } 220 }
218 221
219 /* 222 /*
220 * Noone else is allowed. 223 * No one else is allowed.
224 */
225 mmput(mm);
226 return ERR_PTR(-EPERM);
227}
228
229/*
230 * If current may access user memory in @task return a reference to the
231 * corresponding mm, otherwise ERR_PTR.
232 */
233static struct mm_struct *check_mem_permission(struct task_struct *task)
234{
235 struct mm_struct *mm;
236 int err;
237
238 /*
239 * Avoid racing if task exec's as we might get a new mm but validate
240 * against old credentials.
221 */ 241 */
222 return -EPERM; 242 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
243 if (err)
244 return ERR_PTR(err);
245
246 mm = __check_mem_permission(task);
247 mutex_unlock(&task->signal->cred_guard_mutex);
248
249 return mm;
223} 250}
224 251
225struct mm_struct *mm_for_maps(struct task_struct *task) 252struct mm_struct *mm_for_maps(struct task_struct *task)
226{ 253{
227 struct mm_struct *mm; 254 struct mm_struct *mm;
255 int err;
228 256
229 if (mutex_lock_killable(&task->signal->cred_guard_mutex)) 257 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
230 return NULL; 258 if (err)
259 return ERR_PTR(err);
231 260
232 mm = get_task_mm(task); 261 mm = get_task_mm(task);
233 if (mm && mm != current->mm && 262 if (mm && mm != current->mm &&
234 !ptrace_may_access(task, PTRACE_MODE_READ)) { 263 !ptrace_may_access(task, PTRACE_MODE_READ)) {
235 mmput(mm); 264 mmput(mm);
236 mm = NULL; 265 mm = ERR_PTR(-EACCES);
237 } 266 }
238 mutex_unlock(&task->signal->cred_guard_mutex); 267 mutex_unlock(&task->signal->cred_guard_mutex);
239 268
@@ -279,9 +308,9 @@ out:
279 308
280static int proc_pid_auxv(struct task_struct *task, char *buffer) 309static int proc_pid_auxv(struct task_struct *task, char *buffer)
281{ 310{
282 int res = 0; 311 struct mm_struct *mm = mm_for_maps(task);
283 struct mm_struct *mm = get_task_mm(task); 312 int res = PTR_ERR(mm);
284 if (mm) { 313 if (mm && !IS_ERR(mm)) {
285 unsigned int nwords = 0; 314 unsigned int nwords = 0;
286 do { 315 do {
287 nwords += 2; 316 nwords += 2;
@@ -318,6 +347,23 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
318} 347}
319#endif /* CONFIG_KALLSYMS */ 348#endif /* CONFIG_KALLSYMS */
320 349
350static int lock_trace(struct task_struct *task)
351{
352 int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
353 if (err)
354 return err;
355 if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
356 mutex_unlock(&task->signal->cred_guard_mutex);
357 return -EPERM;
358 }
359 return 0;
360}
361
362static void unlock_trace(struct task_struct *task)
363{
364 mutex_unlock(&task->signal->cred_guard_mutex);
365}
366
321#ifdef CONFIG_STACKTRACE 367#ifdef CONFIG_STACKTRACE
322 368
323#define MAX_STACK_TRACE_DEPTH 64 369#define MAX_STACK_TRACE_DEPTH 64
@@ -327,6 +373,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
327{ 373{
328 struct stack_trace trace; 374 struct stack_trace trace;
329 unsigned long *entries; 375 unsigned long *entries;
376 int err;
330 int i; 377 int i;
331 378
332 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); 379 entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
@@ -337,15 +384,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
337 trace.max_entries = MAX_STACK_TRACE_DEPTH; 384 trace.max_entries = MAX_STACK_TRACE_DEPTH;
338 trace.entries = entries; 385 trace.entries = entries;
339 trace.skip = 0; 386 trace.skip = 0;
340 save_stack_trace_tsk(task, &trace);
341 387
342 for (i = 0; i < trace.nr_entries; i++) { 388 err = lock_trace(task);
343 seq_printf(m, "[<%p>] %pS\n", 389 if (!err) {
344 (void *)entries[i], (void *)entries[i]); 390 save_stack_trace_tsk(task, &trace);
391
392 for (i = 0; i < trace.nr_entries; i++) {
393 seq_printf(m, "[<%pK>] %pS\n",
394 (void *)entries[i], (void *)entries[i]);
395 }
396 unlock_trace(task);
345 } 397 }
346 kfree(entries); 398 kfree(entries);
347 399
348 return 0; 400 return err;
349} 401}
350#endif 402#endif
351 403
@@ -508,18 +560,22 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
508{ 560{
509 long nr; 561 long nr;
510 unsigned long args[6], sp, pc; 562 unsigned long args[6], sp, pc;
563 int res = lock_trace(task);
564 if (res)
565 return res;
511 566
512 if (task_current_syscall(task, &nr, args, 6, &sp, &pc)) 567 if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
513 return sprintf(buffer, "running\n"); 568 res = sprintf(buffer, "running\n");
514 569 else if (nr < 0)
515 if (nr < 0) 570 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
516 return sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc); 571 else
517 572 res = sprintf(buffer,
518 return sprintf(buffer,
519 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n", 573 "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
520 nr, 574 nr,
521 args[0], args[1], args[2], args[3], args[4], args[5], 575 args[0], args[1], args[2], args[3], args[4], args[5],
522 sp, pc); 576 sp, pc);
577 unlock_trace(task);
578 return res;
523} 579}
524#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ 580#endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
525 581
@@ -775,18 +831,14 @@ static ssize_t mem_read(struct file * file, char __user * buf,
775 if (!task) 831 if (!task)
776 goto out_no_task; 832 goto out_no_task;
777 833
778 if (check_mem_permission(task))
779 goto out;
780
781 ret = -ENOMEM; 834 ret = -ENOMEM;
782 page = (char *)__get_free_page(GFP_TEMPORARY); 835 page = (char *)__get_free_page(GFP_TEMPORARY);
783 if (!page) 836 if (!page)
784 goto out; 837 goto out;
785 838
786 ret = 0; 839 mm = check_mem_permission(task);
787 840 ret = PTR_ERR(mm);
788 mm = get_task_mm(task); 841 if (IS_ERR(mm))
789 if (!mm)
790 goto out_free; 842 goto out_free;
791 843
792 ret = -EIO; 844 ret = -EIO;
@@ -800,8 +852,8 @@ static ssize_t mem_read(struct file * file, char __user * buf,
800 int this_len, retval; 852 int this_len, retval;
801 853
802 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; 854 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
803 retval = access_process_vm(task, src, page, this_len, 0); 855 retval = access_remote_vm(mm, src, page, this_len, 0);
804 if (!retval || check_mem_permission(task)) { 856 if (!retval) {
805 if (!ret) 857 if (!ret)
806 ret = -EIO; 858 ret = -EIO;
807 break; 859 break;
@@ -829,10 +881,6 @@ out_no_task:
829 return ret; 881 return ret;
830} 882}
831 883
832#define mem_write NULL
833
834#ifndef mem_write
835/* This is a security hazard */
836static ssize_t mem_write(struct file * file, const char __user *buf, 884static ssize_t mem_write(struct file * file, const char __user *buf,
837 size_t count, loff_t *ppos) 885 size_t count, loff_t *ppos)
838{ 886{
@@ -840,18 +888,25 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
840 char *page; 888 char *page;
841 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 889 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
842 unsigned long dst = *ppos; 890 unsigned long dst = *ppos;
891 struct mm_struct *mm;
843 892
844 copied = -ESRCH; 893 copied = -ESRCH;
845 if (!task) 894 if (!task)
846 goto out_no_task; 895 goto out_no_task;
847 896
848 if (check_mem_permission(task)) 897 mm = check_mem_permission(task);
849 goto out; 898 copied = PTR_ERR(mm);
899 if (IS_ERR(mm))
900 goto out_task;
901
902 copied = -EIO;
903 if (file->private_data != (void *)((long)current->self_exec_id))
904 goto out_mm;
850 905
851 copied = -ENOMEM; 906 copied = -ENOMEM;
852 page = (char *)__get_free_page(GFP_TEMPORARY); 907 page = (char *)__get_free_page(GFP_TEMPORARY);
853 if (!page) 908 if (!page)
854 goto out; 909 goto out_mm;
855 910
856 copied = 0; 911 copied = 0;
857 while (count > 0) { 912 while (count > 0) {
@@ -862,7 +917,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
862 copied = -EFAULT; 917 copied = -EFAULT;
863 break; 918 break;
864 } 919 }
865 retval = access_process_vm(task, dst, page, this_len, 1); 920 retval = access_remote_vm(mm, dst, page, this_len, 1);
866 if (!retval) { 921 if (!retval) {
867 if (!copied) 922 if (!copied)
868 copied = -EIO; 923 copied = -EIO;
@@ -875,12 +930,13 @@ static ssize_t mem_write(struct file * file, const char __user *buf,
875 } 930 }
876 *ppos = dst; 931 *ppos = dst;
877 free_page((unsigned long) page); 932 free_page((unsigned long) page);
878out: 933out_mm:
934 mmput(mm);
935out_task:
879 put_task_struct(task); 936 put_task_struct(task);
880out_no_task: 937out_no_task:
881 return copied; 938 return copied;
882} 939}
883#endif
884 940
885loff_t mem_lseek(struct file *file, loff_t offset, int orig) 941loff_t mem_lseek(struct file *file, loff_t offset, int orig)
886{ 942{
@@ -917,20 +973,18 @@ static ssize_t environ_read(struct file *file, char __user *buf,
917 if (!task) 973 if (!task)
918 goto out_no_task; 974 goto out_no_task;
919 975
920 if (!ptrace_may_access(task, PTRACE_MODE_READ))
921 goto out;
922
923 ret = -ENOMEM; 976 ret = -ENOMEM;
924 page = (char *)__get_free_page(GFP_TEMPORARY); 977 page = (char *)__get_free_page(GFP_TEMPORARY);
925 if (!page) 978 if (!page)
926 goto out; 979 goto out;
927 980
928 ret = 0;
929 981
930 mm = get_task_mm(task); 982 mm = mm_for_maps(task);
931 if (!mm) 983 ret = PTR_ERR(mm);
984 if (!mm || IS_ERR(mm))
932 goto out_free; 985 goto out_free;
933 986
987 ret = 0;
934 while (count > 0) { 988 while (count > 0) {
935 int this_len, retval, max_len; 989 int this_len, retval, max_len;
936 990
@@ -2620,35 +2674,6 @@ static const struct pid_entry proc_base_stuff[] = {
2620 &proc_self_inode_operations, NULL, {}), 2674 &proc_self_inode_operations, NULL, {}),
2621}; 2675};
2622 2676
2623/*
2624 * Exceptional case: normally we are not allowed to unhash a busy
2625 * directory. In this case, however, we can do it - no aliasing problems
2626 * due to the way we treat inodes.
2627 */
2628static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2629{
2630 struct inode *inode;
2631 struct task_struct *task;
2632
2633 if (nd->flags & LOOKUP_RCU)
2634 return -ECHILD;
2635
2636 inode = dentry->d_inode;
2637 task = get_proc_task(inode);
2638 if (task) {
2639 put_task_struct(task);
2640 return 1;
2641 }
2642 d_drop(dentry);
2643 return 0;
2644}
2645
2646static const struct dentry_operations proc_base_dentry_operations =
2647{
2648 .d_revalidate = proc_base_revalidate,
2649 .d_delete = pid_delete_dentry,
2650};
2651
2652static struct dentry *proc_base_instantiate(struct inode *dir, 2677static struct dentry *proc_base_instantiate(struct inode *dir,
2653 struct dentry *dentry, struct task_struct *task, const void *ptr) 2678 struct dentry *dentry, struct task_struct *task, const void *ptr)
2654{ 2679{
@@ -2685,7 +2710,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2685 if (p->fop) 2710 if (p->fop)
2686 inode->i_fop = p->fop; 2711 inode->i_fop = p->fop;
2687 ei->op = p->op; 2712 ei->op = p->op;
2688 d_set_d_op(dentry, &proc_base_dentry_operations);
2689 d_add(dentry, inode); 2713 d_add(dentry, inode);
2690 error = NULL; 2714 error = NULL;
2691out: 2715out:
@@ -2778,8 +2802,12 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2778static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2802static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2779 struct pid *pid, struct task_struct *task) 2803 struct pid *pid, struct task_struct *task)
2780{ 2804{
2781 seq_printf(m, "%08x\n", task->personality); 2805 int err = lock_trace(task);
2782 return 0; 2806 if (!err) {
2807 seq_printf(m, "%08x\n", task->personality);
2808 unlock_trace(task);
2809 }
2810 return err;
2783} 2811}
2784 2812
2785/* 2813/*
@@ -2798,7 +2826,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2798 REG("environ", S_IRUSR, proc_environ_operations), 2826 REG("environ", S_IRUSR, proc_environ_operations),
2799 INF("auxv", S_IRUSR, proc_pid_auxv), 2827 INF("auxv", S_IRUSR, proc_pid_auxv),
2800 ONE("status", S_IRUGO, proc_pid_status), 2828 ONE("status", S_IRUGO, proc_pid_status),
2801 ONE("personality", S_IRUSR, proc_pid_personality), 2829 ONE("personality", S_IRUGO, proc_pid_personality),
2802 INF("limits", S_IRUGO, proc_pid_limits), 2830 INF("limits", S_IRUGO, proc_pid_limits),
2803#ifdef CONFIG_SCHED_DEBUG 2831#ifdef CONFIG_SCHED_DEBUG
2804 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2832 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
@@ -2808,7 +2836,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2808#endif 2836#endif
2809 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2837 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2810#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2838#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2811 INF("syscall", S_IRUSR, proc_pid_syscall), 2839 INF("syscall", S_IRUGO, proc_pid_syscall),
2812#endif 2840#endif
2813 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2841 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2814 ONE("stat", S_IRUGO, proc_tgid_stat), 2842 ONE("stat", S_IRUGO, proc_tgid_stat),
@@ -2827,7 +2855,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2827#ifdef CONFIG_PROC_PAGE_MONITOR 2855#ifdef CONFIG_PROC_PAGE_MONITOR
2828 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2856 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2829 REG("smaps", S_IRUGO, proc_smaps_operations), 2857 REG("smaps", S_IRUGO, proc_smaps_operations),
2830 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2858 REG("pagemap", S_IRUGO, proc_pagemap_operations),
2831#endif 2859#endif
2832#ifdef CONFIG_SECURITY 2860#ifdef CONFIG_SECURITY
2833 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2861 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -2836,7 +2864,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2836 INF("wchan", S_IRUGO, proc_pid_wchan), 2864 INF("wchan", S_IRUGO, proc_pid_wchan),
2837#endif 2865#endif
2838#ifdef CONFIG_STACKTRACE 2866#ifdef CONFIG_STACKTRACE
2839 ONE("stack", S_IRUSR, proc_pid_stack), 2867 ONE("stack", S_IRUGO, proc_pid_stack),
2840#endif 2868#endif
2841#ifdef CONFIG_SCHEDSTATS 2869#ifdef CONFIG_SCHEDSTATS
2842 INF("schedstat", S_IRUGO, proc_pid_schedstat), 2870 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3096,11 +3124,16 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
3096/* for the /proc/ directory itself, after non-process stuff has been done */ 3124/* for the /proc/ directory itself, after non-process stuff has been done */
3097int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 3125int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
3098{ 3126{
3099 unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; 3127 unsigned int nr;
3100 struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode); 3128 struct task_struct *reaper;
3101 struct tgid_iter iter; 3129 struct tgid_iter iter;
3102 struct pid_namespace *ns; 3130 struct pid_namespace *ns;
3103 3131
3132 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
3133 goto out_no_task;
3134 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
3135
3136 reaper = get_proc_task(filp->f_path.dentry->d_inode);
3104 if (!reaper) 3137 if (!reaper)
3105 goto out_no_task; 3138 goto out_no_task;
3106 3139
@@ -3138,14 +3171,14 @@ static const struct pid_entry tid_base_stuff[] = {
3138 REG("environ", S_IRUSR, proc_environ_operations), 3171 REG("environ", S_IRUSR, proc_environ_operations),
3139 INF("auxv", S_IRUSR, proc_pid_auxv), 3172 INF("auxv", S_IRUSR, proc_pid_auxv),
3140 ONE("status", S_IRUGO, proc_pid_status), 3173 ONE("status", S_IRUGO, proc_pid_status),
3141 ONE("personality", S_IRUSR, proc_pid_personality), 3174 ONE("personality", S_IRUGO, proc_pid_personality),
3142 INF("limits", S_IRUGO, proc_pid_limits), 3175 INF("limits", S_IRUGO, proc_pid_limits),
3143#ifdef CONFIG_SCHED_DEBUG 3176#ifdef CONFIG_SCHED_DEBUG
3144 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 3177 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
3145#endif 3178#endif
3146 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 3179 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
3147#ifdef CONFIG_HAVE_ARCH_TRACEHOOK 3180#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
3148 INF("syscall", S_IRUSR, proc_pid_syscall), 3181 INF("syscall", S_IRUGO, proc_pid_syscall),
3149#endif 3182#endif
3150 INF("cmdline", S_IRUGO, proc_pid_cmdline), 3183 INF("cmdline", S_IRUGO, proc_pid_cmdline),
3151 ONE("stat", S_IRUGO, proc_tid_stat), 3184 ONE("stat", S_IRUGO, proc_tid_stat),
@@ -3163,7 +3196,7 @@ static const struct pid_entry tid_base_stuff[] = {
3163#ifdef CONFIG_PROC_PAGE_MONITOR 3196#ifdef CONFIG_PROC_PAGE_MONITOR
3164 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3197 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
3165 REG("smaps", S_IRUGO, proc_smaps_operations), 3198 REG("smaps", S_IRUGO, proc_smaps_operations),
3166 REG("pagemap", S_IRUSR, proc_pagemap_operations), 3199 REG("pagemap", S_IRUGO, proc_pagemap_operations),
3167#endif 3200#endif
3168#ifdef CONFIG_SECURITY 3201#ifdef CONFIG_SECURITY
3169 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 3202 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
@@ -3172,7 +3205,7 @@ static const struct pid_entry tid_base_stuff[] = {
3172 INF("wchan", S_IRUGO, proc_pid_wchan), 3205 INF("wchan", S_IRUGO, proc_pid_wchan),
3173#endif 3206#endif
3174#ifdef CONFIG_STACKTRACE 3207#ifdef CONFIG_STACKTRACE
3175 ONE("stack", S_IRUSR, proc_pid_stack), 3208 ONE("stack", S_IRUGO, proc_pid_stack),
3176#endif 3209#endif
3177#ifdef CONFIG_SCHEDSTATS 3210#ifdef CONFIG_SCHEDSTATS
3178 INF("schedstat", S_IRUGO, proc_pid_schedstat), 3211 INF("schedstat", S_IRUGO, proc_pid_schedstat),
@@ -3191,7 +3224,7 @@ static const struct pid_entry tid_base_stuff[] = {
3191 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3224 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
3192#ifdef CONFIG_AUDITSYSCALL 3225#ifdef CONFIG_AUDITSYSCALL
3193 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3226 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
3194 REG("sessionid", S_IRUSR, proc_sessionid_operations), 3227 REG("sessionid", S_IRUGO, proc_sessionid_operations),
3195#endif 3228#endif
3196#ifdef CONFIG_FAULT_INJECTION 3229#ifdef CONFIG_FAULT_INJECTION
3197 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 3230 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 01e07f2a188f..f1281339b6fa 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -28,7 +28,7 @@
28 28
29DEFINE_SPINLOCK(proc_subdir_lock); 29DEFINE_SPINLOCK(proc_subdir_lock);
30 30
31static int proc_match(int len, const char *name, struct proc_dir_entry *de) 31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
32{ 32{
33 if (de->namelen != len) 33 if (de->namelen != len)
34 return 0; 34 return 0;
@@ -303,7 +303,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret,
303{ 303{
304 const char *cp = name, *next; 304 const char *cp = name, *next;
305 struct proc_dir_entry *de; 305 struct proc_dir_entry *de;
306 int len; 306 unsigned int len;
307 307
308 de = *ret; 308 de = *ret;
309 if (!de) 309 if (!de)
@@ -602,7 +602,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
602{ 602{
603 struct proc_dir_entry *ent = NULL; 603 struct proc_dir_entry *ent = NULL;
604 const char *fn = name; 604 const char *fn = name;
605 int len; 605 unsigned int len;
606 606
607 /* make sure name is valid */ 607 /* make sure name is valid */
608 if (!name || !strlen(name)) goto out; 608 if (!name || !strlen(name)) goto out;
@@ -786,7 +786,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
786 struct proc_dir_entry **p; 786 struct proc_dir_entry **p;
787 struct proc_dir_entry *de = NULL; 787 struct proc_dir_entry *de = NULL;
788 const char *fn = name; 788 const char *fn = name;
789 int len; 789 unsigned int len;
790 790
791 spin_lock(&proc_subdir_lock); 791 spin_lock(&proc_subdir_lock);
792 if (__xlate_proc_name(name, &parent, &fn) != 0) { 792 if (__xlate_proc_name(name, &parent, &fn) != 0) {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d15aa1b1cc8f 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
27static void proc_evict_inode(struct inode *inode) 27static void proc_evict_inode(struct inode *inode)
28{ 28{
29 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
30 struct ctl_table_header *head;
30 31
31 truncate_inode_pages(&inode->i_data, 0); 32 truncate_inode_pages(&inode->i_data, 0);
32 end_writeback(inode); 33 end_writeback(inode);
@@ -38,12 +39,13 @@ static void proc_evict_inode(struct inode *inode)
38 de = PROC_I(inode)->pde; 39 de = PROC_I(inode)->pde;
39 if (de) 40 if (de)
40 pde_put(de); 41 pde_put(de);
41 if (PROC_I(inode)->sysctl) 42 head = PROC_I(inode)->sysctl;
42 sysctl_head_put(PROC_I(inode)->sysctl); 43 if (head) {
44 rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
45 sysctl_head_put(head);
46 }
43} 47}
44 48
45struct vfsmount *proc_mnt;
46
47static struct kmem_cache * proc_inode_cachep; 49static struct kmem_cache * proc_inode_cachep;
48 50
49static struct inode *proc_alloc_inode(struct super_block *sb) 51static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561ded409..c03e8d3a3a5b 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
107} 107}
108void pde_put(struct proc_dir_entry *pde); 108void pde_put(struct proc_dir_entry *pde);
109 109
110extern struct vfsmount *proc_mnt;
111int proc_fill_super(struct super_block *); 110int proc_fill_super(struct super_block *);
112struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 111struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
113 112
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
233 return; 233 return;
234 root = of_find_node_by_path("/"); 234 root = of_find_node_by_path("/");
235 if (root == NULL) { 235 if (root == NULL) {
236 printk(KERN_ERR "/proc/device-tree: can't find root\n"); 236 pr_debug("/proc/device-tree: can't find root\n");
237 return; 237 return;
238 } 238 }
239 proc_device_tree_add_node(root, proc_device_tree); 239 proc_device_tree_add_node(root, proc_device_tree);
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..f50133c11c24 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -32,7 +32,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
32 ei->sysctl_entry = table; 32 ei->sysctl_entry = table;
33 33
34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
35 inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
36 inode->i_mode = table->mode; 35 inode->i_mode = table->mode;
37 if (!table->child) { 36 if (!table->child) {
38 inode->i_mode |= S_IFREG; 37 inode->i_mode |= S_IFREG;
@@ -408,15 +407,18 @@ static int proc_sys_compare(const struct dentry *parent,
408 const struct dentry *dentry, const struct inode *inode, 407 const struct dentry *dentry, const struct inode *inode,
409 unsigned int len, const char *str, const struct qstr *name) 408 unsigned int len, const char *str, const struct qstr *name)
410{ 409{
410 struct ctl_table_header *head;
411 /* Although proc doesn't have negative dentries, rcu-walk means 411 /* Although proc doesn't have negative dentries, rcu-walk means
412 * that inode here can be NULL */ 412 * that inode here can be NULL */
413 /* AV: can it, indeed? */
413 if (!inode) 414 if (!inode)
414 return 0; 415 return 1;
415 if (name->len != len) 416 if (name->len != len)
416 return 1; 417 return 1;
417 if (memcmp(name->name, str, len)) 418 if (memcmp(name->name, str, len))
418 return 1; 419 return 1;
419 return !sysctl_is_seen(PROC_I(inode)->sysctl); 420 head = rcu_dereference(PROC_I(inode)->sysctl);
421 return !head || !sysctl_is_seen(head);
420} 422}
421 423
422static const struct dentry_operations proc_sys_dentry_operations = { 424static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e24ad6..a9000e9cfee5 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
43 struct pid_namespace *ns; 43 struct pid_namespace *ns;
44 struct proc_inode *ei; 44 struct proc_inode *ei;
45 45
46 if (proc_mnt) {
47 /* Seed the root directory with a pid so it doesn't need
48 * to be special in base.c. I would do this earlier but
49 * the only task alive when /proc is mounted the first time
50 * is the init_task and it doesn't have any pids.
51 */
52 ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
53 if (!ei->pid)
54 ei->pid = find_get_pid(1);
55 }
56
57 if (flags & MS_KERNMOUNT) 46 if (flags & MS_KERNMOUNT)
58 ns = (struct pid_namespace *)data; 47 ns = (struct pid_namespace *)data;
59 else 48 else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
71 return ERR_PTR(err); 60 return ERR_PTR(err);
72 } 61 }
73 62
74 ei = PROC_I(sb->s_root->d_inode);
75 if (!ei->pid) {
76 rcu_read_lock();
77 ei->pid = get_pid(find_pid_ns(1, ns));
78 rcu_read_unlock();
79 }
80
81 sb->s_flags |= MS_ACTIVE; 63 sb->s_flags |= MS_ACTIVE;
82 } 64 }
83 65
66 ei = PROC_I(sb->s_root->d_inode);
67 if (!ei->pid) {
68 rcu_read_lock();
69 ei->pid = get_pid(find_pid_ns(1, ns));
70 rcu_read_unlock();
71 }
72
84 return dget(sb->s_root); 73 return dget(sb->s_root);
85} 74}
86 75
@@ -101,19 +90,20 @@ static struct file_system_type proc_fs_type = {
101 90
102void __init proc_root_init(void) 91void __init proc_root_init(void)
103{ 92{
93 struct vfsmount *mnt;
104 int err; 94 int err;
105 95
106 proc_init_inodecache(); 96 proc_init_inodecache();
107 err = register_filesystem(&proc_fs_type); 97 err = register_filesystem(&proc_fs_type);
108 if (err) 98 if (err)
109 return; 99 return;
110 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 100 mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
111 if (IS_ERR(proc_mnt)) { 101 if (IS_ERR(mnt)) {
112 unregister_filesystem(&proc_fs_type); 102 unregister_filesystem(&proc_fs_type);
113 return; 103 return;
114 } 104 }
115 105
116 init_pid_ns.proc_mnt = proc_mnt; 106 init_pid_ns.proc_mnt = mnt;
117 proc_symlink("mounts", NULL, "self/mounts"); 107 proc_symlink("mounts", NULL, "self/mounts");
118 108
119 proc_net_init(); 109 proc_net_init();
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 60b914860f81..2e7addfd9803 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,5 +1,6 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/hugetlb.h> 2#include <linux/hugetlb.h>
3#include <linux/huge_mm.h>
3#include <linux/mount.h> 4#include <linux/mount.h>
4#include <linux/seq_file.h> 5#include <linux/seq_file.h>
5#include <linux/highmem.h> 6#include <linux/highmem.h>
@@ -7,6 +8,7 @@
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/pagemap.h> 9#include <linux/pagemap.h>
9#include <linux/mempolicy.h> 10#include <linux/mempolicy.h>
11#include <linux/rmap.h>
10#include <linux/swap.h> 12#include <linux/swap.h>
11#include <linux/swapops.h> 13#include <linux/swapops.h>
12 14
@@ -119,14 +121,14 @@ static void *m_start(struct seq_file *m, loff_t *pos)
119 121
120 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 122 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
121 if (!priv->task) 123 if (!priv->task)
122 return NULL; 124 return ERR_PTR(-ESRCH);
123 125
124 mm = mm_for_maps(priv->task); 126 mm = mm_for_maps(priv->task);
125 if (!mm) 127 if (!mm || IS_ERR(mm))
126 return NULL; 128 return mm;
127 down_read(&mm->mmap_sem); 129 down_read(&mm->mmap_sem);
128 130
129 tail_vma = get_gate_vma(priv->task); 131 tail_vma = get_gate_vma(priv->task->mm);
130 priv->tail_vma = tail_vma; 132 priv->tail_vma = tail_vma;
131 133
132 /* Start with last addr hint */ 134 /* Start with last addr hint */
@@ -180,7 +182,8 @@ static void m_stop(struct seq_file *m, void *v)
180 struct proc_maps_private *priv = m->private; 182 struct proc_maps_private *priv = m->private;
181 struct vm_area_struct *vma = v; 183 struct vm_area_struct *vma = v;
182 184
183 vma_stop(priv, vma); 185 if (!IS_ERR(vma))
186 vma_stop(priv, vma);
184 if (priv->task) 187 if (priv->task)
185 put_task_struct(priv->task); 188 put_task_struct(priv->task);
186} 189}
@@ -249,8 +252,8 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
249 const char *name = arch_vma_name(vma); 252 const char *name = arch_vma_name(vma);
250 if (!name) { 253 if (!name) {
251 if (mm) { 254 if (mm) {
252 if (vma->vm_start <= mm->start_brk && 255 if (vma->vm_start <= mm->brk &&
253 vma->vm_end >= mm->brk) { 256 vma->vm_end >= mm->start_brk) {
254 name = "[heap]"; 257 name = "[heap]";
255 } else if (vma->vm_start <= mm->start_stack && 258 } else if (vma->vm_start <= mm->start_stack &&
256 vma->vm_end >= mm->start_stack) { 259 vma->vm_end >= mm->start_stack) {
@@ -277,7 +280,8 @@ static int show_map(struct seq_file *m, void *v)
277 show_map_vma(m, vma); 280 show_map_vma(m, vma);
278 281
279 if (m->count < m->size) /* vma is copied successfully */ 282 if (m->count < m->size) /* vma is copied successfully */
280 m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; 283 m->version = (vma != get_gate_vma(task->mm))
284 ? vma->vm_start : 0;
281 return 0; 285 return 0;
282} 286}
283 287
@@ -329,58 +333,86 @@ struct mem_size_stats {
329 unsigned long private_dirty; 333 unsigned long private_dirty;
330 unsigned long referenced; 334 unsigned long referenced;
331 unsigned long anonymous; 335 unsigned long anonymous;
336 unsigned long anonymous_thp;
332 unsigned long swap; 337 unsigned long swap;
333 u64 pss; 338 u64 pss;
334}; 339};
335 340
336static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 341
337 struct mm_walk *walk) 342static void smaps_pte_entry(pte_t ptent, unsigned long addr,
343 unsigned long ptent_size, struct mm_walk *walk)
338{ 344{
339 struct mem_size_stats *mss = walk->private; 345 struct mem_size_stats *mss = walk->private;
340 struct vm_area_struct *vma = mss->vma; 346 struct vm_area_struct *vma = mss->vma;
341 pte_t *pte, ptent;
342 spinlock_t *ptl;
343 struct page *page; 347 struct page *page;
344 int mapcount; 348 int mapcount;
345 349
346 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 350 if (is_swap_pte(ptent)) {
347 for (; addr != end; pte++, addr += PAGE_SIZE) { 351 mss->swap += ptent_size;
348 ptent = *pte; 352 return;
349 353 }
350 if (is_swap_pte(ptent)) {
351 mss->swap += PAGE_SIZE;
352 continue;
353 }
354 354
355 if (!pte_present(ptent)) 355 if (!pte_present(ptent))
356 continue; 356 return;
357
358 page = vm_normal_page(vma, addr, ptent);
359 if (!page)
360 return;
361
362 if (PageAnon(page))
363 mss->anonymous += ptent_size;
364
365 mss->resident += ptent_size;
366 /* Accumulate the size in pages that have been accessed. */
367 if (pte_young(ptent) || PageReferenced(page))
368 mss->referenced += ptent_size;
369 mapcount = page_mapcount(page);
370 if (mapcount >= 2) {
371 if (pte_dirty(ptent) || PageDirty(page))
372 mss->shared_dirty += ptent_size;
373 else
374 mss->shared_clean += ptent_size;
375 mss->pss += (ptent_size << PSS_SHIFT) / mapcount;
376 } else {
377 if (pte_dirty(ptent) || PageDirty(page))
378 mss->private_dirty += ptent_size;
379 else
380 mss->private_clean += ptent_size;
381 mss->pss += (ptent_size << PSS_SHIFT);
382 }
383}
357 384
358 page = vm_normal_page(vma, addr, ptent); 385static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
359 if (!page) 386 struct mm_walk *walk)
360 continue; 387{
388 struct mem_size_stats *mss = walk->private;
389 struct vm_area_struct *vma = mss->vma;
390 pte_t *pte;
391 spinlock_t *ptl;
361 392
362 if (PageAnon(page)) 393 spin_lock(&walk->mm->page_table_lock);
363 mss->anonymous += PAGE_SIZE; 394 if (pmd_trans_huge(*pmd)) {
364 395 if (pmd_trans_splitting(*pmd)) {
365 mss->resident += PAGE_SIZE; 396 spin_unlock(&walk->mm->page_table_lock);
366 /* Accumulate the size in pages that have been accessed. */ 397 wait_split_huge_page(vma->anon_vma, pmd);
367 if (pte_young(ptent) || PageReferenced(page))
368 mss->referenced += PAGE_SIZE;
369 mapcount = page_mapcount(page);
370 if (mapcount >= 2) {
371 if (pte_dirty(ptent) || PageDirty(page))
372 mss->shared_dirty += PAGE_SIZE;
373 else
374 mss->shared_clean += PAGE_SIZE;
375 mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
376 } else { 398 } else {
377 if (pte_dirty(ptent) || PageDirty(page)) 399 smaps_pte_entry(*(pte_t *)pmd, addr,
378 mss->private_dirty += PAGE_SIZE; 400 HPAGE_PMD_SIZE, walk);
379 else 401 spin_unlock(&walk->mm->page_table_lock);
380 mss->private_clean += PAGE_SIZE; 402 mss->anonymous_thp += HPAGE_PMD_SIZE;
381 mss->pss += (PAGE_SIZE << PSS_SHIFT); 403 return 0;
382 } 404 }
405 } else {
406 spin_unlock(&walk->mm->page_table_lock);
383 } 407 }
408 /*
409 * The mmap_sem held all the way back in m_start() is what
410 * keeps khugepaged out of here and from collapsing things
411 * in here.
412 */
413 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
414 for (; addr != end; pte++, addr += PAGE_SIZE)
415 smaps_pte_entry(*pte, addr, PAGE_SIZE, walk);
384 pte_unmap_unlock(pte - 1, ptl); 416 pte_unmap_unlock(pte - 1, ptl);
385 cond_resched(); 417 cond_resched();
386 return 0; 418 return 0;
@@ -416,6 +448,7 @@ static int show_smap(struct seq_file *m, void *v)
416 "Private_Dirty: %8lu kB\n" 448 "Private_Dirty: %8lu kB\n"
417 "Referenced: %8lu kB\n" 449 "Referenced: %8lu kB\n"
418 "Anonymous: %8lu kB\n" 450 "Anonymous: %8lu kB\n"
451 "AnonHugePages: %8lu kB\n"
419 "Swap: %8lu kB\n" 452 "Swap: %8lu kB\n"
420 "KernelPageSize: %8lu kB\n" 453 "KernelPageSize: %8lu kB\n"
421 "MMUPageSize: %8lu kB\n" 454 "MMUPageSize: %8lu kB\n"
@@ -429,6 +462,7 @@ static int show_smap(struct seq_file *m, void *v)
429 mss.private_dirty >> 10, 462 mss.private_dirty >> 10,
430 mss.referenced >> 10, 463 mss.referenced >> 10,
431 mss.anonymous >> 10, 464 mss.anonymous >> 10,
465 mss.anonymous_thp >> 10,
432 mss.swap >> 10, 466 mss.swap >> 10,
433 vma_kernel_pagesize(vma) >> 10, 467 vma_kernel_pagesize(vma) >> 10,
434 vma_mmu_pagesize(vma) >> 10, 468 vma_mmu_pagesize(vma) >> 10,
@@ -436,7 +470,8 @@ static int show_smap(struct seq_file *m, void *v)
436 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 470 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
437 471
438 if (m->count < m->size) /* vma is copied successfully */ 472 if (m->count < m->size) /* vma is copied successfully */
439 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; 473 m->version = (vma != get_gate_vma(task->mm))
474 ? vma->vm_start : 0;
440 return 0; 475 return 0;
441} 476}
442 477
@@ -467,6 +502,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
467 spinlock_t *ptl; 502 spinlock_t *ptl;
468 struct page *page; 503 struct page *page;
469 504
505 split_huge_page_pmd(walk->mm, pmd);
506
470 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 507 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
471 for (; addr != end; pte++, addr += PAGE_SIZE) { 508 for (; addr != end; pte++, addr += PAGE_SIZE) {
472 ptent = *pte; 509 ptent = *pte;
@@ -623,6 +660,8 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
623 pte_t *pte; 660 pte_t *pte;
624 int err = 0; 661 int err = 0;
625 662
663 split_huge_page_pmd(walk->mm, pmd);
664
626 /* find the first VMA at or above 'addr' */ 665 /* find the first VMA at or above 'addr' */
627 vma = find_vma(walk->mm, addr); 666 vma = find_vma(walk->mm, addr);
628 for (; addr != end; addr += PAGE_SIZE) { 667 for (; addr != end; addr += PAGE_SIZE) {
@@ -728,8 +767,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
728 if (!task) 767 if (!task)
729 goto out; 768 goto out;
730 769
731 ret = -EACCES; 770 mm = mm_for_maps(task);
732 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 771 ret = PTR_ERR(mm);
772 if (!mm || IS_ERR(mm))
733 goto out_task; 773 goto out_task;
734 774
735 ret = -EINVAL; 775 ret = -EINVAL;
@@ -742,10 +782,6 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
742 if (!count) 782 if (!count)
743 goto out_task; 783 goto out_task;
744 784
745 mm = get_task_mm(task);
746 if (!mm)
747 goto out_task;
748
749 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 785 pm.len = PM_ENTRY_BYTES * (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
750 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY); 786 pm.buffer = kmalloc(pm.len, GFP_TEMPORARY);
751 ret = -ENOMEM; 787 ret = -ENOMEM;
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index b535d3e5d5f1..980de547c070 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -199,13 +199,13 @@ static void *m_start(struct seq_file *m, loff_t *pos)
199 /* pin the task and mm whilst we play with them */ 199 /* pin the task and mm whilst we play with them */
200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID); 200 priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
201 if (!priv->task) 201 if (!priv->task)
202 return NULL; 202 return ERR_PTR(-ESRCH);
203 203
204 mm = mm_for_maps(priv->task); 204 mm = mm_for_maps(priv->task);
205 if (!mm) { 205 if (!mm || IS_ERR(mm)) {
206 put_task_struct(priv->task); 206 put_task_struct(priv->task);
207 priv->task = NULL; 207 priv->task = NULL;
208 return NULL; 208 return mm;
209 } 209 }
210 down_read(&mm->mmap_sem); 210 down_read(&mm->mmap_sem);
211 211
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
new file mode 100644
index 000000000000..8007ae7c0d8c
--- /dev/null
+++ b/fs/pstore/Kconfig
@@ -0,0 +1,13 @@
1config PSTORE
2 bool "Persistent store support"
3 default n
4 help
5 This option enables generic access to platform level
6 persistent storage via "pstore" filesystem that can
7 be mounted as /dev/pstore. Only useful if you have
8 a platform level driver that registers with pstore to
9 provide the data, so you probably should just go say "Y"
10 (or "M") to a platform specific persistent store driver
11 (e.g. ACPI_APEI on X86) which will select this for you.
12 If you don't have a platform persistent store driver,
13 say N.
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
new file mode 100644
index 000000000000..760f4bce7d1d
--- /dev/null
+++ b/fs/pstore/Makefile
@@ -0,0 +1,7 @@
1#
2# Makefile for the linux pstorefs routines.
3#
4
5obj-y += pstore.o
6
7pstore-objs += inode.o platform.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
new file mode 100644
index 000000000000..977ed2723845
--- /dev/null
+++ b/fs/pstore/inode.c
@@ -0,0 +1,311 @@
1/*
2 * Persistent Storage - ramfs parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/fsnotify.h>
23#include <linux/pagemap.h>
24#include <linux/highmem.h>
25#include <linux/time.h>
26#include <linux/init.h>
27#include <linux/string.h>
28#include <linux/mount.h>
29#include <linux/ramfs.h>
30#include <linux/parser.h>
31#include <linux/sched.h>
32#include <linux/magic.h>
33#include <linux/pstore.h>
34#include <linux/slab.h>
35#include <linux/uaccess.h>
36
37#include "internal.h"
38
39#define PSTORE_NAMELEN 64
40
41struct pstore_private {
42 u64 id;
43 int (*erase)(u64);
44 ssize_t size;
45 char data[];
46};
47
48static int pstore_file_open(struct inode *inode, struct file *file)
49{
50 file->private_data = inode->i_private;
51 return 0;
52}
53
54static ssize_t pstore_file_read(struct file *file, char __user *userbuf,
55 size_t count, loff_t *ppos)
56{
57 struct pstore_private *ps = file->private_data;
58
59 return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size);
60}
61
62static const struct file_operations pstore_file_operations = {
63 .open = pstore_file_open,
64 .read = pstore_file_read,
65 .llseek = default_llseek,
66};
67
68/*
69 * When a file is unlinked from our file system we call the
70 * platform driver to erase the record from persistent store.
71 */
72static int pstore_unlink(struct inode *dir, struct dentry *dentry)
73{
74 struct pstore_private *p = dentry->d_inode->i_private;
75
76 p->erase(p->id);
77
78 return simple_unlink(dir, dentry);
79}
80
81static void pstore_evict_inode(struct inode *inode)
82{
83 end_writeback(inode);
84 kfree(inode->i_private);
85}
86
87static const struct inode_operations pstore_dir_inode_operations = {
88 .lookup = simple_lookup,
89 .unlink = pstore_unlink,
90};
91
92static struct inode *pstore_get_inode(struct super_block *sb,
93 const struct inode *dir, int mode, dev_t dev)
94{
95 struct inode *inode = new_inode(sb);
96
97 if (inode) {
98 inode->i_ino = get_next_ino();
99 inode->i_uid = inode->i_gid = 0;
100 inode->i_mode = mode;
101 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
102 switch (mode & S_IFMT) {
103 case S_IFREG:
104 inode->i_fop = &pstore_file_operations;
105 break;
106 case S_IFDIR:
107 inode->i_op = &pstore_dir_inode_operations;
108 inode->i_fop = &simple_dir_operations;
109 inc_nlink(inode);
110 break;
111 }
112 }
113 return inode;
114}
115
116enum {
117 Opt_kmsg_bytes, Opt_err
118};
119
120static const match_table_t tokens = {
121 {Opt_kmsg_bytes, "kmsg_bytes=%u"},
122 {Opt_err, NULL}
123};
124
125static void parse_options(char *options)
126{
127 char *p;
128 substring_t args[MAX_OPT_ARGS];
129 int option;
130
131 if (!options)
132 return;
133
134 while ((p = strsep(&options, ",")) != NULL) {
135 int token;
136
137 if (!*p)
138 continue;
139
140 token = match_token(p, tokens, args);
141 switch (token) {
142 case Opt_kmsg_bytes:
143 if (!match_int(&args[0], &option))
144 pstore_set_kmsg_bytes(option);
145 break;
146 }
147 }
148}
149
150static int pstore_remount(struct super_block *sb, int *flags, char *data)
151{
152 parse_options(data);
153
154 return 0;
155}
156
157static const struct super_operations pstore_ops = {
158 .statfs = simple_statfs,
159 .drop_inode = generic_delete_inode,
160 .evict_inode = pstore_evict_inode,
161 .remount_fs = pstore_remount,
162 .show_options = generic_show_options,
163};
164
165static struct super_block *pstore_sb;
166
167int pstore_is_mounted(void)
168{
169 return pstore_sb != NULL;
170}
171
172/*
173 * Make a regular file in the root directory of our file system.
174 * Load it up with "size" bytes of data from "buf".
175 * Set the mtime & ctime to the date that this record was originally stored.
176 */
177int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
178 char *data, size_t size,
179 struct timespec time, int (*erase)(u64))
180{
181 struct dentry *root = pstore_sb->s_root;
182 struct dentry *dentry;
183 struct inode *inode;
184 int rc;
185 char name[PSTORE_NAMELEN];
186 struct pstore_private *private;
187
188 rc = -ENOMEM;
189 inode = pstore_get_inode(pstore_sb, root->d_inode, S_IFREG | 0444, 0);
190 if (!inode)
191 goto fail;
192 private = kmalloc(sizeof *private + size, GFP_KERNEL);
193 if (!private)
194 goto fail_alloc;
195 private->id = id;
196 private->erase = erase;
197
198 switch (type) {
199 case PSTORE_TYPE_DMESG:
200 sprintf(name, "dmesg-%s-%lld", psname, id);
201 break;
202 case PSTORE_TYPE_MCE:
203 sprintf(name, "mce-%s-%lld", psname, id);
204 break;
205 case PSTORE_TYPE_UNKNOWN:
206 sprintf(name, "unknown-%s-%lld", psname, id);
207 break;
208 default:
209 sprintf(name, "type%d-%s-%lld", type, psname, id);
210 break;
211 }
212
213 mutex_lock(&root->d_inode->i_mutex);
214
215 rc = -ENOSPC;
216 dentry = d_alloc_name(root, name);
217 if (IS_ERR(dentry))
218 goto fail_lockedalloc;
219
220 memcpy(private->data, data, size);
221 inode->i_size = private->size = size;
222
223 inode->i_private = private;
224
225 if (time.tv_sec)
226 inode->i_mtime = inode->i_ctime = time;
227
228 d_add(dentry, inode);
229
230 mutex_unlock(&root->d_inode->i_mutex);
231
232 return 0;
233
234fail_lockedalloc:
235 mutex_unlock(&root->d_inode->i_mutex);
236 kfree(private);
237fail_alloc:
238 iput(inode);
239
240fail:
241 return rc;
242}
243
244int pstore_fill_super(struct super_block *sb, void *data, int silent)
245{
246 struct inode *inode = NULL;
247 struct dentry *root;
248 int err;
249
250 save_mount_options(sb, data);
251
252 pstore_sb = sb;
253
254 sb->s_maxbytes = MAX_LFS_FILESIZE;
255 sb->s_blocksize = PAGE_CACHE_SIZE;
256 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
257 sb->s_magic = PSTOREFS_MAGIC;
258 sb->s_op = &pstore_ops;
259 sb->s_time_gran = 1;
260
261 parse_options(data);
262
263 inode = pstore_get_inode(sb, NULL, S_IFDIR | 0755, 0);
264 if (!inode) {
265 err = -ENOMEM;
266 goto fail;
267 }
268 /* override ramfs "dir" options so we catch unlink(2) */
269 inode->i_op = &pstore_dir_inode_operations;
270
271 root = d_alloc_root(inode);
272 sb->s_root = root;
273 if (!root) {
274 err = -ENOMEM;
275 goto fail;
276 }
277
278 pstore_get_records();
279
280 return 0;
281fail:
282 iput(inode);
283 return err;
284}
285
286static struct dentry *pstore_mount(struct file_system_type *fs_type,
287 int flags, const char *dev_name, void *data)
288{
289 return mount_single(fs_type, flags, data, pstore_fill_super);
290}
291
292static void pstore_kill_sb(struct super_block *sb)
293{
294 kill_litter_super(sb);
295 pstore_sb = NULL;
296}
297
298static struct file_system_type pstore_fs_type = {
299 .name = "pstore",
300 .mount = pstore_mount,
301 .kill_sb = pstore_kill_sb,
302};
303
304static int __init init_pstore_fs(void)
305{
306 return register_filesystem(&pstore_fs_type);
307}
308module_init(init_pstore_fs)
309
310MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
311MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
new file mode 100644
index 000000000000..8c9f23eb1645
--- /dev/null
+++ b/fs/pstore/internal.h
@@ -0,0 +1,6 @@
1extern void pstore_set_kmsg_bytes(int);
2extern void pstore_get_records(void);
3extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
4 char *data, size_t size,
5 struct timespec time, int (*erase)(u64));
6extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
new file mode 100644
index 000000000000..f835a25625ff
--- /dev/null
+++ b/fs/pstore/platform.c
@@ -0,0 +1,201 @@
1/*
2 * Persistent Storage - platform driver interface parts.
3 *
4 * Copyright (C) 2010 Intel Corporation <tony.luck@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/atomic.h>
21#include <linux/types.h>
22#include <linux/errno.h>
23#include <linux/init.h>
24#include <linux/kmsg_dump.h>
25#include <linux/module.h>
26#include <linux/pstore.h>
27#include <linux/string.h>
28#include <linux/slab.h>
29#include <linux/uaccess.h>
30
31#include "internal.h"
32
33/*
34 * pstore_lock just protects "psinfo" during
35 * calls to pstore_register()
36 */
37static DEFINE_SPINLOCK(pstore_lock);
38static struct pstore_info *psinfo;
39
40/* How much of the console log to snapshot */
41static unsigned long kmsg_bytes = 10240;
42
43void pstore_set_kmsg_bytes(int bytes)
44{
45 kmsg_bytes = bytes;
46}
47
48/* Tag each group of saved records with a sequence number */
49static int oopscount;
50
51static char *reason_str[] = {
52 "Oops", "Panic", "Kexec", "Restart", "Halt", "Poweroff", "Emergency"
53};
54
55/*
56 * callback from kmsg_dump. (s2,l2) has the most recently
57 * written bytes, older bytes are in (s1,l1). Save as much
58 * as we can from the end of the buffer.
59 */
60static void pstore_dump(struct kmsg_dumper *dumper,
61 enum kmsg_dump_reason reason,
62 const char *s1, unsigned long l1,
63 const char *s2, unsigned long l2)
64{
65 unsigned long s1_start, s2_start;
66 unsigned long l1_cpy, l2_cpy;
67 unsigned long size, total = 0;
68 char *dst, *why;
69 u64 id;
70 int hsize, part = 1;
71
72 if (reason < ARRAY_SIZE(reason_str))
73 why = reason_str[reason];
74 else
75 why = "Unknown";
76
77 mutex_lock(&psinfo->buf_mutex);
78 oopscount++;
79 while (total < kmsg_bytes) {
80 dst = psinfo->buf;
81 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part++);
82 size = psinfo->bufsize - hsize;
83 dst += hsize;
84
85 l2_cpy = min(l2, size);
86 l1_cpy = min(l1, size - l2_cpy);
87
88 if (l1_cpy + l2_cpy == 0)
89 break;
90
91 s2_start = l2 - l2_cpy;
92 s1_start = l1 - l1_cpy;
93
94 memcpy(dst, s1 + s1_start, l1_cpy);
95 memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy);
96
97 id = psinfo->write(PSTORE_TYPE_DMESG, hsize + l1_cpy + l2_cpy);
98 if (reason == KMSG_DUMP_OOPS && pstore_is_mounted())
99 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id,
100 psinfo->buf, hsize + l1_cpy + l2_cpy,
101 CURRENT_TIME, psinfo->erase);
102 l1 -= l1_cpy;
103 l2 -= l2_cpy;
104 total += l1_cpy + l2_cpy;
105 }
106 mutex_unlock(&psinfo->buf_mutex);
107}
108
109static struct kmsg_dumper pstore_dumper = {
110 .dump = pstore_dump,
111};
112
113/*
114 * platform specific persistent storage driver registers with
115 * us here. If pstore is already mounted, call the platform
116 * read function right away to populate the file system. If not
117 * then the pstore mount code will call us later to fill out
118 * the file system.
119 *
120 * Register with kmsg_dump to save last part of console log on panic.
121 */
122int pstore_register(struct pstore_info *psi)
123{
124 struct module *owner = psi->owner;
125
126 spin_lock(&pstore_lock);
127 if (psinfo) {
128 spin_unlock(&pstore_lock);
129 return -EBUSY;
130 }
131 psinfo = psi;
132 spin_unlock(&pstore_lock);
133
134 if (owner && !try_module_get(owner)) {
135 psinfo = NULL;
136 return -EINVAL;
137 }
138
139 if (pstore_is_mounted())
140 pstore_get_records();
141
142 kmsg_dump_register(&pstore_dumper);
143
144 return 0;
145}
146EXPORT_SYMBOL_GPL(pstore_register);
147
148/*
149 * Read all the records from the persistent store. Create and
150 * file files in our filesystem.
151 */
152void pstore_get_records(void)
153{
154 struct pstore_info *psi = psinfo;
155 size_t size;
156 u64 id;
157 enum pstore_type_id type;
158 struct timespec time;
159 int failed = 0;
160
161 if (!psi)
162 return;
163
164 mutex_lock(&psinfo->buf_mutex);
165 while ((size = psi->read(&id, &type, &time)) > 0) {
166 if (pstore_mkfile(type, psi->name, id, psi->buf, size,
167 time, psi->erase))
168 failed++;
169 }
170 mutex_unlock(&psinfo->buf_mutex);
171
172 if (failed)
173 printk(KERN_WARNING "pstore: failed to load %d record(s) from '%s'\n",
174 failed, psi->name);
175}
176
177/*
178 * Call platform driver to write a record to the
179 * persistent store.
180 */
181int pstore_write(enum pstore_type_id type, char *buf, size_t size)
182{
183 u64 id;
184
185 if (!psinfo)
186 return -ENODEV;
187
188 if (size > psinfo->bufsize)
189 return -EFBIG;
190
191 mutex_lock(&psinfo->buf_mutex);
192 memcpy(psinfo->buf, buf, size);
193 id = psinfo->write(type, size);
194 if (pstore_is_mounted())
195 pstore_mkfile(PSTORE_TYPE_DMESG, psinfo->name, id, psinfo->buf,
196 size, CURRENT_TIME, psinfo->erase);
197 mutex_unlock(&psinfo->buf_mutex);
198
199 return 0;
200}
201EXPORT_SYMBOL_GPL(pstore_write);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index e63b4171d583..2b0646613f5a 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -335,7 +335,6 @@ static sector_t qnx4_bmap(struct address_space *mapping, sector_t block)
335static const struct address_space_operations qnx4_aops = { 335static const struct address_space_operations qnx4_aops = {
336 .readpage = qnx4_readpage, 336 .readpage = qnx4_readpage,
337 .writepage = qnx4_writepage, 337 .writepage = qnx4_writepage,
338 .sync_page = block_sync_page,
339 .write_begin = qnx4_write_begin, 338 .write_begin = qnx4_write_begin,
340 .write_end = generic_write_end, 339 .write_end = generic_write_end,
341 .bmap = qnx4_bmap 340 .bmap = qnx4_bmap
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index a2a622e079f0..d3c032f5fa0a 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -76,7 +76,7 @@
76#include <linux/buffer_head.h> 76#include <linux/buffer_head.h>
77#include <linux/capability.h> 77#include <linux/capability.h>
78#include <linux/quotaops.h> 78#include <linux/quotaops.h>
79#include <linux/writeback.h> /* for inode_lock, oddly enough.. */ 79#include "../internal.h" /* ugh */
80 80
81#include <asm/uaccess.h> 81#include <asm/uaccess.h>
82 82
@@ -442,7 +442,7 @@ EXPORT_SYMBOL(dquot_acquire);
442 */ 442 */
443int dquot_commit(struct dquot *dquot) 443int dquot_commit(struct dquot *dquot)
444{ 444{
445 int ret = 0, ret2 = 0; 445 int ret = 0;
446 struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); 446 struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
447 447
448 mutex_lock(&dqopt->dqio_mutex); 448 mutex_lock(&dqopt->dqio_mutex);
@@ -454,15 +454,10 @@ int dquot_commit(struct dquot *dquot)
454 spin_unlock(&dq_list_lock); 454 spin_unlock(&dq_list_lock);
455 /* Inactive dquot can be only if there was error during read/init 455 /* Inactive dquot can be only if there was error during read/init
456 * => we have better not writing it */ 456 * => we have better not writing it */
457 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { 457 if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
458 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot); 458 ret = dqopt->ops[dquot->dq_type]->commit_dqblk(dquot);
459 if (info_dirty(&dqopt->info[dquot->dq_type])) { 459 else
460 ret2 = dqopt->ops[dquot->dq_type]->write_file_info( 460 ret = -EIO;
461 dquot->dq_sb, dquot->dq_type);
462 }
463 if (ret >= 0)
464 ret = ret2;
465 }
466out_sem: 461out_sem:
467 mutex_unlock(&dqopt->dqio_mutex); 462 mutex_unlock(&dqopt->dqio_mutex);
468 return ret; 463 return ret;
@@ -900,33 +895,38 @@ static void add_dquot_ref(struct super_block *sb, int type)
900 int reserved = 0; 895 int reserved = 0;
901#endif 896#endif
902 897
903 spin_lock(&inode_lock); 898 spin_lock(&inode_sb_list_lock);
904 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 899 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
905 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) 900 spin_lock(&inode->i_lock);
901 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
902 !atomic_read(&inode->i_writecount) ||
903 !dqinit_needed(inode, type)) {
904 spin_unlock(&inode->i_lock);
906 continue; 905 continue;
906 }
907#ifdef CONFIG_QUOTA_DEBUG 907#ifdef CONFIG_QUOTA_DEBUG
908 if (unlikely(inode_get_rsv_space(inode) > 0)) 908 if (unlikely(inode_get_rsv_space(inode) > 0))
909 reserved = 1; 909 reserved = 1;
910#endif 910#endif
911 if (!atomic_read(&inode->i_writecount))
912 continue;
913 if (!dqinit_needed(inode, type))
914 continue;
915
916 __iget(inode); 911 __iget(inode);
917 spin_unlock(&inode_lock); 912 spin_unlock(&inode->i_lock);
913 spin_unlock(&inode_sb_list_lock);
918 914
919 iput(old_inode); 915 iput(old_inode);
920 __dquot_initialize(inode, type); 916 __dquot_initialize(inode, type);
921 /* We hold a reference to 'inode' so it couldn't have been 917
922 * removed from s_inodes list while we dropped the inode_lock. 918 /*
923 * We cannot iput the inode now as we can be holding the last 919 * We hold a reference to 'inode' so it couldn't have been
924 * reference and we cannot iput it under inode_lock. So we 920 * removed from s_inodes list while we dropped the
925 * keep the reference and iput it later. */ 921 * inode_sb_list_lock We cannot iput the inode now as we can be
922 * holding the last reference and we cannot iput it under
923 * inode_sb_list_lock. So we keep the reference and iput it
924 * later.
925 */
926 old_inode = inode; 926 old_inode = inode;
927 spin_lock(&inode_lock); 927 spin_lock(&inode_sb_list_lock);
928 } 928 }
929 spin_unlock(&inode_lock); 929 spin_unlock(&inode_sb_list_lock);
930 iput(old_inode); 930 iput(old_inode);
931 931
932#ifdef CONFIG_QUOTA_DEBUG 932#ifdef CONFIG_QUOTA_DEBUG
@@ -951,7 +951,7 @@ static inline int dqput_blocks(struct dquot *dquot)
951 951
952/* 952/*
953 * Remove references to dquots from inode and add dquot to list for freeing 953 * Remove references to dquots from inode and add dquot to list for freeing
954 * if we have the last referece to dquot 954 * if we have the last reference to dquot
955 * We can't race with anybody because we hold dqptr_sem for writing... 955 * We can't race with anybody because we hold dqptr_sem for writing...
956 */ 956 */
957static int remove_inode_dquot_ref(struct inode *inode, int type, 957static int remove_inode_dquot_ref(struct inode *inode, int type,
@@ -1007,7 +1007,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1007 struct inode *inode; 1007 struct inode *inode;
1008 int reserved = 0; 1008 int reserved = 0;
1009 1009
1010 spin_lock(&inode_lock); 1010 spin_lock(&inode_sb_list_lock);
1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 1011 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
1012 /* 1012 /*
1013 * We have to scan also I_NEW inodes because they can already 1013 * We have to scan also I_NEW inodes because they can already
@@ -1021,7 +1021,7 @@ static void remove_dquot_ref(struct super_block *sb, int type,
1021 remove_inode_dquot_ref(inode, type, tofree_head); 1021 remove_inode_dquot_ref(inode, type, tofree_head);
1022 } 1022 }
1023 } 1023 }
1024 spin_unlock(&inode_lock); 1024 spin_unlock(&inode_sb_list_lock);
1025#ifdef CONFIG_QUOTA_DEBUG 1025#ifdef CONFIG_QUOTA_DEBUG
1026 if (reserved) { 1026 if (reserved) {
1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota" 1027 printk(KERN_WARNING "VFS (%s): Writes happened after quota"
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 65444d29406b..f1ab3604db5a 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -112,7 +112,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
112 if (!info->dqi_priv) { 112 if (!info->dqi_priv) {
113 printk(KERN_WARNING 113 printk(KERN_WARNING
114 "Not enough memory for quota information structure.\n"); 114 "Not enough memory for quota information structure.\n");
115 return -1; 115 return -ENOMEM;
116 } 116 }
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 9eead2c796b7..fbb0b478a346 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -112,6 +112,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
112 SetPageDirty(page); 112 SetPageDirty(page);
113 113
114 unlock_page(page); 114 unlock_page(page);
115 put_page(page);
115 } 116 }
116 117
117 return 0; 118 return 0;
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 792b3cb2cd18..3c3b00165114 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -31,9 +31,7 @@ endif
31# and causing a panic. Since this behavior only affects ppc32, this ifeq 31# and causing a panic. Since this behavior only affects ppc32, this ifeq
32# will work around it. If any other architecture displays this behavior, 32# will work around it. If any other architecture displays this behavior,
33# add it here. 33# add it here.
34ifeq ($(CONFIG_PPC32),y) 34ccflags-$(CONFIG_PPC32) := $(call cc-ifversion, -lt, 0400, -O1)
35EXTRA_CFLAGS := $(call cc-ifversion, -lt, 0400, -O1)
36endif
37 35
38TAGS: 36TAGS:
39 etags *.c 37 etags *.c
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..4fd5bb33dbb5 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
1593 struct inode *inode = dentry->d_inode; 1593 struct inode *inode = dentry->d_inode;
1594 int maxlen = *lenp; 1594 int maxlen = *lenp;
1595 1595
1596 if (maxlen < 3) 1596 if (need_parent && (maxlen < 5)) {
1597 *lenp = 5;
1597 return 255; 1598 return 255;
1599 } else if (maxlen < 3) {
1600 *lenp = 3;
1601 return 255;
1602 }
1598 1603
1599 data[0] = inode->i_ino; 1604 data[0] = inode->i_ino;
1600 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); 1605 data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
@@ -3212,7 +3217,6 @@ const struct address_space_operations reiserfs_address_space_operations = {
3212 .readpages = reiserfs_readpages, 3217 .readpages = reiserfs_readpages,
3213 .releasepage = reiserfs_releasepage, 3218 .releasepage = reiserfs_releasepage,
3214 .invalidatepage = reiserfs_invalidatepage, 3219 .invalidatepage = reiserfs_invalidatepage,
3215 .sync_page = block_sync_page,
3216 .write_begin = reiserfs_write_begin, 3220 .write_begin = reiserfs_write_begin,
3217 .write_end = reiserfs_write_end, 3221 .write_end = reiserfs_write_end,
3218 .bmap = reiserfs_aop_bmap, 3222 .bmap = reiserfs_aop_bmap,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 79265fdc317a..4e153051bc75 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
59 if (err) 59 if (err)
60 break; 60 break;
61 61
62 if (!is_owner_or_cap(inode)) { 62 if (!inode_owner_or_capable(inode)) {
63 err = -EPERM; 63 err = -EPERM;
64 goto setflags_out; 64 goto setflags_out;
65 } 65 }
@@ -103,7 +103,7 @@ setflags_out:
103 err = put_user(inode->i_generation, (int __user *)arg); 103 err = put_user(inode->i_generation, (int __user *)arg);
104 break; 104 break;
105 case REISERFS_IOC_SETVERSION: 105 case REISERFS_IOC_SETVERSION:
106 if (!is_owner_or_cap(inode)) { 106 if (!inode_owner_or_capable(inode)) {
107 err = -EPERM; 107 err = -EPERM;
108 break; 108 break;
109 } 109 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 3eea859e6990..c5e82ece7c6c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1,7 +1,7 @@
1/* 1/*
2** Write ahead logging implementation copyright Chris Mason 2000 2** Write ahead logging implementation copyright Chris Mason 2000
3** 3**
4** The background commits make this code very interelated, and 4** The background commits make this code very interrelated, and
5** overly complex. I need to rethink things a bit....The major players: 5** overly complex. I need to rethink things a bit....The major players:
6** 6**
7** journal_begin -- call with the number of blocks you expect to log. 7** journal_begin -- call with the number of blocks you expect to log.
@@ -2725,7 +2725,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2725 REISERFS_DISK_OFFSET_IN_BYTES / 2725 REISERFS_DISK_OFFSET_IN_BYTES /
2726 sb->s_blocksize + 2); 2726 sb->s_blocksize + 2);
2727 2727
2728 /* Sanity check to see is the standard journal fitting withing first bitmap 2728 /* Sanity check to see is the standard journal fitting within first bitmap
2729 (actual for small blocksizes) */ 2729 (actual for small blocksizes) */
2730 if (!SB_ONDISK_JOURNAL_DEVICE(sb) && 2730 if (!SB_ONDISK_JOURNAL_DEVICE(sb) &&
2731 (SB_JOURNAL_1st_RESERVED_BLOCK(sb) + 2731 (SB_JOURNAL_1st_RESERVED_BLOCK(sb) +
@@ -2876,7 +2876,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2876 reiserfs_mounted_fs_count++; 2876 reiserfs_mounted_fs_count++;
2877 if (reiserfs_mounted_fs_count <= 1) { 2877 if (reiserfs_mounted_fs_count <= 1) {
2878 reiserfs_write_unlock(sb); 2878 reiserfs_write_unlock(sb);
2879 commit_wq = create_workqueue("reiserfs"); 2879 commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0);
2880 reiserfs_write_lock(sb); 2880 reiserfs_write_lock(sb);
2881 } 2881 }
2882 2882
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
index b87aa2c1afc1..7df1ce48203a 100644
--- a/fs/reiserfs/lock.c
+++ b/fs/reiserfs/lock.c
@@ -15,7 +15,7 @@
15 * for this mutex, no need for a system wide mutex facility. 15 * for this mutex, no need for a system wide mutex facility.
16 * 16 *
17 * Also this lock is often released before a call that could block because 17 * Also this lock is often released before a call that could block because
18 * reiserfs performances were partialy based on the release while schedule() 18 * reiserfs performances were partially based on the release while schedule()
19 * property of the Bkl. 19 * property of the Bkl.
20 */ 20 */
21void reiserfs_write_lock(struct super_block *s) 21void reiserfs_write_lock(struct super_block *s)
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..118662690cdf 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -593,7 +593,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
593 new_inode_init(inode, dir, mode); 593 new_inode_init(inode, dir, mode);
594 594
595 jbegin_count += reiserfs_cache_default_acl(dir); 595 jbegin_count += reiserfs_cache_default_acl(dir);
596 retval = reiserfs_security_init(dir, inode, &security); 596 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
597 if (retval < 0) { 597 if (retval < 0) {
598 drop_new_inode(inode); 598 drop_new_inode(inode);
599 return retval; 599 return retval;
@@ -667,7 +667,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
667 new_inode_init(inode, dir, mode); 667 new_inode_init(inode, dir, mode);
668 668
669 jbegin_count += reiserfs_cache_default_acl(dir); 669 jbegin_count += reiserfs_cache_default_acl(dir);
670 retval = reiserfs_security_init(dir, inode, &security); 670 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
671 if (retval < 0) { 671 if (retval < 0) {
672 drop_new_inode(inode); 672 drop_new_inode(inode);
673 return retval; 673 return retval;
@@ -747,7 +747,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
747 new_inode_init(inode, dir, mode); 747 new_inode_init(inode, dir, mode);
748 748
749 jbegin_count += reiserfs_cache_default_acl(dir); 749 jbegin_count += reiserfs_cache_default_acl(dir);
750 retval = reiserfs_security_init(dir, inode, &security); 750 retval = reiserfs_security_init(dir, inode, &dentry->d_name, &security);
751 if (retval < 0) { 751 if (retval < 0) {
752 drop_new_inode(inode); 752 drop_new_inode(inode);
753 return retval; 753 return retval;
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, 771 EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
772 dentry, inode, &security); 772 dentry, inode, &security);
773 if (retval) { 773 if (retval) {
774 dir->i_nlink--; 774 DEC_DIR_INODE_NLINK(dir)
775 goto out_failed; 775 goto out_failed;
776 } 776 }
777 777
@@ -1032,7 +1032,8 @@ static int reiserfs_symlink(struct inode *parent_dir,
1032 } 1032 }
1033 new_inode_init(inode, parent_dir, mode); 1033 new_inode_init(inode, parent_dir, mode);
1034 1034
1035 retval = reiserfs_security_init(parent_dir, inode, &security); 1035 retval = reiserfs_security_init(parent_dir, inode, &dentry->d_name,
1036 &security);
1036 if (retval < 0) { 1037 if (retval < 0) {
1037 drop_new_inode(inode); 1038 drop_new_inode(inode);
1038 return retval; 1039 return retval;
@@ -1122,10 +1123,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
1122 reiserfs_write_unlock(dir->i_sb); 1123 reiserfs_write_unlock(dir->i_sb);
1123 return -EMLINK; 1124 return -EMLINK;
1124 } 1125 }
1125 if (inode->i_nlink == 0) {
1126 reiserfs_write_unlock(dir->i_sb);
1127 return -ENOENT;
1128 }
1129 1126
1130 /* inc before scheduling so reiserfs_unlink knows we are here */ 1127 /* inc before scheduling so reiserfs_unlink knows we are here */
1131 inc_nlink(inode); 1128 inc_nlink(inode);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 0aab04f46827..b216ff6be1c9 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -393,7 +393,7 @@ void add_save_link(struct reiserfs_transaction_handle *th,
393 /* body of "save" link */ 393 /* body of "save" link */
394 link = INODE_PKEY(inode)->k_dir_id; 394 link = INODE_PKEY(inode)->k_dir_id;
395 395
396 /* put "save" link inot tree, don't charge quota to anyone */ 396 /* put "save" link into tree, don't charge quota to anyone */
397 retval = 397 retval =
398 reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link); 398 reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
399 if (retval) { 399 if (retval) {
@@ -2104,7 +2104,7 @@ out:
2104 2104
2105/* Read data from quotafile - avoid pagecache and such because we cannot afford 2105/* Read data from quotafile - avoid pagecache and such because we cannot afford
2106 * acquiring the locks... As quota files are never truncated and quota code 2106 * acquiring the locks... As quota files are never truncated and quota code
2107 * itself serializes the operations (and noone else should touch the files) 2107 * itself serializes the operations (and no one else should touch the files)
2108 * we don't have to be afraid of races */ 2108 * we don't have to be afraid of races */
2109static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data, 2109static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
2110 size_t len, loff_t off) 2110 size_t len, loff_t off)
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..47d2a4498b03 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -396,7 +396,7 @@ static struct page *reiserfs_get_page(struct inode *dir, size_t n)
396 struct address_space *mapping = dir->i_mapping; 396 struct address_space *mapping = dir->i_mapping;
397 struct page *page; 397 struct page *page;
398 /* We can deadlock if we try to free dentries, 398 /* We can deadlock if we try to free dentries,
399 and an unlink/rmdir has just occured - GFP_NOFS avoids this */ 399 and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
400 mapping_set_gfp_mask(mapping, GFP_NOFS); 400 mapping_set_gfp_mask(mapping, GFP_NOFS);
401 page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL); 401 page = read_mapping_page(mapping, n >> PAGE_CACHE_SHIFT, NULL);
402 if (!IS_ERR(page)) { 402 if (!IS_ERR(page)) {
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
978 978
979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) 979static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
980{ 980{
981 if (nd->flags & LOOKUP_RCU)
982 return -ECHILD;
983 return -EPERM; 981 return -EPERM;
984} 982}
985 983
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 90d2fcb67a31..3dc38f1206fc 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -26,7 +26,7 @@ posix_acl_set(struct dentry *dentry, const char *name, const void *value,
26 size_t jcreate_blocks; 26 size_t jcreate_blocks;
27 if (!reiserfs_posixacl(inode->i_sb)) 27 if (!reiserfs_posixacl(inode->i_sb))
28 return -EOPNOTSUPP; 28 return -EOPNOTSUPP;
29 if (!is_owner_or_cap(inode)) 29 if (!inode_owner_or_capable(inode))
30 return -EPERM; 30 return -EPERM;
31 31
32 if (value) { 32 if (value) {
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 237c6928d3c6..ef66c18a9332 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -54,6 +54,7 @@ static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
54 * of blocks needed for the transaction. If successful, reiserfs_security 54 * of blocks needed for the transaction. If successful, reiserfs_security
55 * must be released using reiserfs_security_free when the caller is done. */ 55 * must be released using reiserfs_security_free when the caller is done. */
56int reiserfs_security_init(struct inode *dir, struct inode *inode, 56int reiserfs_security_init(struct inode *dir, struct inode *inode,
57 const struct qstr *qstr,
57 struct reiserfs_security_handle *sec) 58 struct reiserfs_security_handle *sec)
58{ 59{
59 int blocks = 0; 60 int blocks = 0;
@@ -65,7 +66,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
65 if (IS_PRIVATE(dir)) 66 if (IS_PRIVATE(dir))
66 return 0; 67 return 0;
67 68
68 error = security_inode_init_security(inode, dir, &sec->name, 69 error = security_inode_init_security(inode, dir, qstr, &sec->name,
69 &sec->value, &sec->length); 70 &sec->value, &sec->length);
70 if (error) { 71 if (error) {
71 if (error == -EOPNOTSUPP) 72 if (error == -EOPNOTSUPP)
diff --git a/fs/select.c b/fs/select.c
index e56560d2b08a..d33418fdc858 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -517,9 +517,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
517 * Update: ERESTARTSYS breaks at least the xview clock binary, so 517 * Update: ERESTARTSYS breaks at least the xview clock binary, so
518 * I'm trying ERESTARTNOHAND which restart only when you want to. 518 * I'm trying ERESTARTNOHAND which restart only when you want to.
519 */ 519 */
520#define MAX_SELECT_SECONDS \
521 ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
522
523int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, 520int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
524 fd_set __user *exp, struct timespec *end_time) 521 fd_set __user *exp, struct timespec *end_time)
525{ 522{
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index aa68a8a31518..efc309fa3035 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -5,12 +5,12 @@ config SQUASHFS
5 help 5 help
6 Saying Y here includes support for SquashFS 4.0 (a Compressed 6 Saying Y here includes support for SquashFS 4.0 (a Compressed
7 Read-Only File System). Squashfs is a highly compressed read-only 7 Read-Only File System). Squashfs is a highly compressed read-only
8 filesystem for Linux. It uses zlib/lzo compression to compress both 8 filesystem for Linux. It uses zlib, lzo or xz compression to
9 files, inodes and directories. Inodes in the system are very small 9 compress both files, inodes and directories. Inodes in the system
10 and all blocks are packed to minimise data overhead. Block sizes 10 are very small and all blocks are packed to minimise data overhead.
11 greater than 4K are supported up to a maximum of 1 Mbytes (default 11 Block sizes greater than 4K are supported up to a maximum of 1 Mbytes
12 block size 128K). SquashFS 4.0 supports 64 bit filesystems and files 12 (default block size 128K). SquashFS 4.0 supports 64 bit filesystems
13 (larger than 4GB), full uid/gid information, hard links and 13 and files (larger than 4GB), full uid/gid information, hard links and
14 timestamps. 14 timestamps.
15 15
16 Squashfs is intended for general read-only filesystem use, for 16 Squashfs is intended for general read-only filesystem use, for
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index 26b15ae34d6f..c37b520132ff 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -104,7 +104,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
104 entry = &cache->entry[i]; 104 entry = &cache->entry[i];
105 105
106 /* 106 /*
107 * Initialise choosen cache entry, and fill it in from 107 * Initialise chosen cache entry, and fill it in from
108 * disk. 108 * disk.
109 */ 109 */
110 cache->unused--; 110 cache->unused--;
@@ -286,7 +286,7 @@ cleanup:
286 286
287 287
288/* 288/*
289 * Copy upto length bytes from cache entry to buffer starting at offset bytes 289 * Copy up to length bytes from cache entry to buffer starting at offset bytes
290 * into the cache entry. If there's not length bytes then copy the number of 290 * into the cache entry. If there's not length bytes then copy the number of
291 * bytes available. In all cases return the number of bytes copied. 291 * bytes available. In all cases return the number of bytes copied.
292 */ 292 */
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index a5940e54c4dd..e921bd213738 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -23,6 +23,7 @@
23 23
24#include <linux/types.h> 24#include <linux/types.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/slab.h>
26#include <linux/buffer_head.h> 27#include <linux/buffer_head.h>
27 28
28#include "squashfs_fs.h" 29#include "squashfs_fs.h"
@@ -74,3 +75,36 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
74 75
75 return decompressor[i]; 76 return decompressor[i];
76} 77}
78
79
80void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
81{
82 struct squashfs_sb_info *msblk = sb->s_fs_info;
83 void *strm, *buffer = NULL;
84 int length = 0;
85
86 /*
87 * Read decompressor specific options from file system if present
88 */
89 if (SQUASHFS_COMP_OPTS(flags)) {
90 buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
91 if (buffer == NULL)
92 return ERR_PTR(-ENOMEM);
93
94 length = squashfs_read_data(sb, &buffer,
95 sizeof(struct squashfs_super_block), 0, NULL,
96 PAGE_CACHE_SIZE, 1);
97
98 if (length < 0) {
99 strm = ERR_PTR(length);
100 goto finished;
101 }
102 }
103
104 strm = msblk->decompressor->init(msblk, buffer, length);
105
106finished:
107 kfree(buffer);
108
109 return strm;
110}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 3b305a70f7aa..099745ad5691 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,7 +24,7 @@
24 */ 24 */
25 25
26struct squashfs_decompressor { 26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *); 27 void *(*init)(struct squashfs_sb_info *, void *, int);
28 void (*free)(void *); 28 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **, 29 int (*decompress)(struct squashfs_sb_info *, void **,
30 struct buffer_head **, int, int, int, int, int); 30 struct buffer_head **, int, int, int, int, int);
@@ -33,11 +33,6 @@ struct squashfs_decompressor {
33 int supported; 33 int supported;
34}; 34};
35 35
36static inline void *squashfs_decompressor_init(struct squashfs_sb_info *msblk)
37{
38 return msblk->decompressor->init(msblk);
39}
40
41static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, 36static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk,
42 void *s) 37 void *s)
43{ 38{
diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c
index 0dc340aa2be9..3f79cd1d0c19 100644
--- a/fs/squashfs/dir.c
+++ b/fs/squashfs/dir.c
@@ -172,6 +172,11 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
172 length += sizeof(dirh); 172 length += sizeof(dirh);
173 173
174 dir_count = le32_to_cpu(dirh.count) + 1; 174 dir_count = le32_to_cpu(dirh.count) + 1;
175
176 /* dir_count should never be larger than 256 */
177 if (dir_count > 256)
178 goto failed_read;
179
175 while (dir_count--) { 180 while (dir_count--) {
176 /* 181 /*
177 * Read directory entry. 182 * Read directory entry.
@@ -183,6 +188,10 @@ static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir)
183 188
184 size = le16_to_cpu(dire->size) + 1; 189 size = le16_to_cpu(dire->size) + 1;
185 190
191 /* size should never be larger than SQUASHFS_NAME_LEN */
192 if (size > SQUASHFS_NAME_LEN)
193 goto failed_read;
194
186 err = squashfs_read_metadata(inode->i_sb, dire->name, 195 err = squashfs_read_metadata(inode->i_sb, dire->name,
187 &block, &offset, size); 196 &block, &offset, size);
188 if (err < 0) 197 if (err < 0)
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 7da759e34c52..00f4dfc5f088 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -37,7 +37,7 @@ struct squashfs_lzo {
37 void *output; 37 void *output;
38}; 38};
39 39
40static void *lzo_init(struct squashfs_sb_info *msblk) 40static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len)
41{ 41{
42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
43 43
@@ -58,7 +58,7 @@ failed2:
58failed: 58failed:
59 ERROR("Failed to allocate lzo workspace\n"); 59 ERROR("Failed to allocate lzo workspace\n");
60 kfree(stream); 60 kfree(stream);
61 return NULL; 61 return ERR_PTR(-ENOMEM);
62} 62}
63 63
64 64
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 7a9464d08cf6..5d922a6701ab 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -176,6 +176,11 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
176 length += sizeof(dirh); 176 length += sizeof(dirh);
177 177
178 dir_count = le32_to_cpu(dirh.count) + 1; 178 dir_count = le32_to_cpu(dirh.count) + 1;
179
180 /* dir_count should never be larger than 256 */
181 if (dir_count > 256)
182 goto data_error;
183
179 while (dir_count--) { 184 while (dir_count--) {
180 /* 185 /*
181 * Read directory entry. 186 * Read directory entry.
@@ -187,6 +192,10 @@ static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry,
187 192
188 size = le16_to_cpu(dire->size) + 1; 193 size = le16_to_cpu(dire->size) + 1;
189 194
195 /* size should never be larger than SQUASHFS_NAME_LEN */
196 if (size > SQUASHFS_NAME_LEN)
197 goto data_error;
198
190 err = squashfs_read_metadata(dir->i_sb, dire->name, 199 err = squashfs_read_metadata(dir->i_sb, dire->name,
191 &block, &offset, size); 200 &block, &offset, size);
192 if (err < 0) 201 if (err < 0)
@@ -228,6 +237,9 @@ exit_lookup:
228 d_add(dentry, inode); 237 d_add(dentry, inode);
229 return ERR_PTR(0); 238 return ERR_PTR(0);
230 239
240data_error:
241 err = -EIO;
242
231read_failure: 243read_failure:
232 ERROR("Unable to read directory block [%llx:%x]\n", 244 ERROR("Unable to read directory block [%llx:%x]\n",
233 squashfs_i(dir)->start + msblk->directory_table, 245 squashfs_i(dir)->start + msblk->directory_table,
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index ba729d808876..1f2e608b8785 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -48,6 +48,7 @@ extern int squashfs_read_table(struct super_block *, void *, u64, int);
48 48
49/* decompressor.c */ 49/* decompressor.c */
50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); 50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
51extern void *squashfs_decompressor_init(struct super_block *, unsigned short);
51 52
52/* export.c */ 53/* export.c */
53extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, 54extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64,
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 39533feffd6d..4582c568ef4d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -57,6 +57,7 @@
57#define SQUASHFS_ALWAYS_FRAG 5 57#define SQUASHFS_ALWAYS_FRAG 5
58#define SQUASHFS_DUPLICATE 6 58#define SQUASHFS_DUPLICATE 6
59#define SQUASHFS_EXPORT 7 59#define SQUASHFS_EXPORT 7
60#define SQUASHFS_COMP_OPT 10
60 61
61#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) 62#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1)
62 63
@@ -81,6 +82,9 @@
81#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ 82#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \
82 SQUASHFS_EXPORT) 83 SQUASHFS_EXPORT)
83 84
85#define SQUASHFS_COMP_OPTS(flags) SQUASHFS_BIT(flags, \
86 SQUASHFS_COMP_OPT)
87
84/* Max number of types and file types */ 88/* Max number of types and file types */
85#define SQUASHFS_DIR_TYPE 1 89#define SQUASHFS_DIR_TYPE 1
86#define SQUASHFS_REG_TYPE 2 90#define SQUASHFS_REG_TYPE 2
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 20700b9f2b4c..5c8184c061a4 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -199,10 +199,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
199 199
200 err = -ENOMEM; 200 err = -ENOMEM;
201 201
202 msblk->stream = squashfs_decompressor_init(msblk);
203 if (msblk->stream == NULL)
204 goto failed_mount;
205
206 msblk->block_cache = squashfs_cache_init("metadata", 202 msblk->block_cache = squashfs_cache_init("metadata",
207 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); 203 SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE);
208 if (msblk->block_cache == NULL) 204 if (msblk->block_cache == NULL)
@@ -215,6 +211,13 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
215 goto failed_mount; 211 goto failed_mount;
216 } 212 }
217 213
214 msblk->stream = squashfs_decompressor_init(sb, flags);
215 if (IS_ERR(msblk->stream)) {
216 err = PTR_ERR(msblk->stream);
217 msblk->stream = NULL;
218 goto failed_mount;
219 }
220
218 /* Allocate and read id index table */ 221 /* Allocate and read id index table */
219 msblk->id_table = squashfs_read_id_index_table(sb, 222 msblk->id_table = squashfs_read_id_index_table(sb,
220 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); 223 le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids));
@@ -370,8 +373,8 @@ static void squashfs_put_super(struct super_block *sb)
370} 373}
371 374
372 375
373static struct dentry *squashfs_mount(struct file_system_type *fs_type, int flags, 376static struct dentry *squashfs_mount(struct file_system_type *fs_type,
374 const char *dev_name, void *data) 377 int flags, const char *dev_name, void *data)
375{ 378{
376 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super); 379 return mount_bdev(fs_type, flags, dev_name, data, squashfs_fill_super);
377} 380}
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index c4eb40018256..aa47a286d1f8 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -26,10 +26,10 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/xz.h> 28#include <linux/xz.h>
29#include <linux/bitops.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs_fs_i.h"
33#include "squashfs.h" 33#include "squashfs.h"
34#include "decompressor.h" 34#include "decompressor.h"
35 35
@@ -38,24 +38,57 @@ struct squashfs_xz {
38 struct xz_buf buf; 38 struct xz_buf buf;
39}; 39};
40 40
41static void *squashfs_xz_init(struct squashfs_sb_info *msblk) 41struct comp_opts {
42 __le32 dictionary_size;
43 __le32 flags;
44};
45
46static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
47 int len)
42{ 48{
43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 49 struct comp_opts *comp_opts = buff;
50 struct squashfs_xz *stream;
51 int dict_size = msblk->block_size;
52 int err, n;
53
54 if (comp_opts) {
55 /* check compressor options are the expected length */
56 if (len < sizeof(*comp_opts)) {
57 err = -EIO;
58 goto failed;
59 }
44 60
45 struct squashfs_xz *stream = kmalloc(sizeof(*stream), GFP_KERNEL); 61 dict_size = le32_to_cpu(comp_opts->dictionary_size);
46 if (stream == NULL) 62
63 /* the dictionary size should be 2^n or 2^n+2^(n+1) */
64 n = ffs(dict_size) - 1;
65 if (dict_size != (1 << n) && dict_size != (1 << n) +
66 (1 << (n + 1))) {
67 err = -EIO;
68 goto failed;
69 }
70 }
71
72 dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE);
73
74 stream = kmalloc(sizeof(*stream), GFP_KERNEL);
75 if (stream == NULL) {
76 err = -ENOMEM;
47 goto failed; 77 goto failed;
78 }
48 79
49 stream->state = xz_dec_init(XZ_PREALLOC, block_size); 80 stream->state = xz_dec_init(XZ_PREALLOC, dict_size);
50 if (stream->state == NULL) 81 if (stream->state == NULL) {
82 kfree(stream);
83 err = -ENOMEM;
51 goto failed; 84 goto failed;
85 }
52 86
53 return stream; 87 return stream;
54 88
55failed: 89failed:
56 ERROR("Failed to allocate xz workspace\n"); 90 ERROR("Failed to initialise xz decompressor\n");
57 kfree(stream); 91 return ERR_PTR(err);
58 return NULL;
59} 92}
60 93
61 94
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 4661ae2b1cec..517688b32ffa 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -26,19 +26,19 @@
26#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/zlib.h> 28#include <linux/zlib.h>
29#include <linux/vmalloc.h>
29 30
30#include "squashfs_fs.h" 31#include "squashfs_fs.h"
31#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
32#include "squashfs.h" 33#include "squashfs.h"
33#include "decompressor.h" 34#include "decompressor.h"
34 35
35static void *zlib_init(struct squashfs_sb_info *dummy) 36static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len)
36{ 37{
37 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); 38 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
38 if (stream == NULL) 39 if (stream == NULL)
39 goto failed; 40 goto failed;
40 stream->workspace = kmalloc(zlib_inflate_workspacesize(), 41 stream->workspace = vmalloc(zlib_inflate_workspacesize());
41 GFP_KERNEL);
42 if (stream->workspace == NULL) 42 if (stream->workspace == NULL)
43 goto failed; 43 goto failed;
44 44
@@ -47,7 +47,7 @@ static void *zlib_init(struct squashfs_sb_info *dummy)
47failed: 47failed:
48 ERROR("Failed to allocate zlib workspace\n"); 48 ERROR("Failed to allocate zlib workspace\n");
49 kfree(stream); 49 kfree(stream);
50 return NULL; 50 return ERR_PTR(-ENOMEM);
51} 51}
52 52
53 53
@@ -56,7 +56,7 @@ static void zlib_free(void *strm)
56 z_stream *stream = strm; 56 z_stream *stream = strm;
57 57
58 if (stream) 58 if (stream)
59 kfree(stream->workspace); 59 vfree(stream->workspace);
60 kfree(stream); 60 kfree(stream);
61} 61}
62 62
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b703..961039121cb8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
75 int error = -EINVAL; 75 int error = -EINVAL;
76 int lookup_flags = 0; 76 int lookup_flags = 0;
77 77
78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0) 78 if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
79 AT_EMPTY_PATH)) != 0)
79 goto out; 80 goto out;
80 81
81 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
82 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
83 if (flag & AT_NO_AUTOMOUNT) 84 if (flag & AT_NO_AUTOMOUNT)
84 lookup_flags |= LOOKUP_NO_AUTOMOUNT; 85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY;
85 88
86 error = user_path_at(dfd, filename, lookup_flags, &path); 89 error = user_path_at(dfd, filename, lookup_flags, &path);
87 if (error) 90 if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
297 if (bufsiz <= 0) 300 if (bufsiz <= 0)
298 return -EINVAL; 301 return -EINVAL;
299 302
300 error = user_path_at(dfd, pathname, 0, &path); 303 error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
301 if (!error) { 304 if (!error) {
302 struct inode *inode = path.dentry->d_inode; 305 struct inode *inode = path.dentry->d_inode;
303 306
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996b..8244924dec55 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
73} 73}
74EXPORT_SYMBOL(vfs_statfs); 74EXPORT_SYMBOL(vfs_statfs);
75 75
76static int do_statfs_native(struct path *path, struct statfs *buf) 76int user_statfs(const char __user *pathname, struct kstatfs *st)
77{ 77{
78 struct kstatfs st; 78 struct path path;
79 int retval; 79 int error = user_path(pathname, &path);
80 if (!error) {
81 error = vfs_statfs(&path, st);
82 path_put(&path);
83 }
84 return error;
85}
80 86
81 retval = vfs_statfs(path, &st); 87int fd_statfs(int fd, struct kstatfs *st)
82 if (retval) 88{
83 return retval; 89 struct file *file = fget(fd);
90 int error = -EBADF;
91 if (file) {
92 error = vfs_statfs(&file->f_path, st);
93 fput(file);
94 }
95 return error;
96}
84 97
85 if (sizeof(*buf) == sizeof(st)) 98static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
86 memcpy(buf, &st, sizeof(st)); 99{
100 struct statfs buf;
101
102 if (sizeof(buf) == sizeof(*st))
103 memcpy(&buf, st, sizeof(*st));
87 else { 104 else {
88 if (sizeof buf->f_blocks == 4) { 105 if (sizeof buf.f_blocks == 4) {
89 if ((st.f_blocks | st.f_bfree | st.f_bavail | 106 if ((st->f_blocks | st->f_bfree | st->f_bavail |
90 st.f_bsize | st.f_frsize) & 107 st->f_bsize | st->f_frsize) &
91 0xffffffff00000000ULL) 108 0xffffffff00000000ULL)
92 return -EOVERFLOW; 109 return -EOVERFLOW;
93 /* 110 /*
94 * f_files and f_ffree may be -1; it's okay to stuff 111 * f_files and f_ffree may be -1; it's okay to stuff
95 * that into 32 bits 112 * that into 32 bits
96 */ 113 */
97 if (st.f_files != -1 && 114 if (st->f_files != -1 &&
98 (st.f_files & 0xffffffff00000000ULL)) 115 (st->f_files & 0xffffffff00000000ULL))
99 return -EOVERFLOW; 116 return -EOVERFLOW;
100 if (st.f_ffree != -1 && 117 if (st->f_ffree != -1 &&
101 (st.f_ffree & 0xffffffff00000000ULL)) 118 (st->f_ffree & 0xffffffff00000000ULL))
102 return -EOVERFLOW; 119 return -EOVERFLOW;
103 } 120 }
104 121
105 buf->f_type = st.f_type; 122 buf.f_type = st->f_type;
106 buf->f_bsize = st.f_bsize; 123 buf.f_bsize = st->f_bsize;
107 buf->f_blocks = st.f_blocks; 124 buf.f_blocks = st->f_blocks;
108 buf->f_bfree = st.f_bfree; 125 buf.f_bfree = st->f_bfree;
109 buf->f_bavail = st.f_bavail; 126 buf.f_bavail = st->f_bavail;
110 buf->f_files = st.f_files; 127 buf.f_files = st->f_files;
111 buf->f_ffree = st.f_ffree; 128 buf.f_ffree = st->f_ffree;
112 buf->f_fsid = st.f_fsid; 129 buf.f_fsid = st->f_fsid;
113 buf->f_namelen = st.f_namelen; 130 buf.f_namelen = st->f_namelen;
114 buf->f_frsize = st.f_frsize; 131 buf.f_frsize = st->f_frsize;
115 buf->f_flags = st.f_flags; 132 buf.f_flags = st->f_flags;
116 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 133 memset(buf.f_spare, 0, sizeof(buf.f_spare));
117 } 134 }
135 if (copy_to_user(p, &buf, sizeof(buf)))
136 return -EFAULT;
118 return 0; 137 return 0;
119} 138}
120 139
121static int do_statfs64(struct path *path, struct statfs64 *buf) 140static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
122{ 141{
123 struct kstatfs st; 142 struct statfs64 buf;
124 int retval; 143 if (sizeof(buf) == sizeof(*st))
125 144 memcpy(&buf, st, sizeof(*st));
126 retval = vfs_statfs(path, &st);
127 if (retval)
128 return retval;
129
130 if (sizeof(*buf) == sizeof(st))
131 memcpy(buf, &st, sizeof(st));
132 else { 145 else {
133 buf->f_type = st.f_type; 146 buf.f_type = st->f_type;
134 buf->f_bsize = st.f_bsize; 147 buf.f_bsize = st->f_bsize;
135 buf->f_blocks = st.f_blocks; 148 buf.f_blocks = st->f_blocks;
136 buf->f_bfree = st.f_bfree; 149 buf.f_bfree = st->f_bfree;
137 buf->f_bavail = st.f_bavail; 150 buf.f_bavail = st->f_bavail;
138 buf->f_files = st.f_files; 151 buf.f_files = st->f_files;
139 buf->f_ffree = st.f_ffree; 152 buf.f_ffree = st->f_ffree;
140 buf->f_fsid = st.f_fsid; 153 buf.f_fsid = st->f_fsid;
141 buf->f_namelen = st.f_namelen; 154 buf.f_namelen = st->f_namelen;
142 buf->f_frsize = st.f_frsize; 155 buf.f_frsize = st->f_frsize;
143 buf->f_flags = st.f_flags; 156 buf.f_flags = st->f_flags;
144 memset(buf->f_spare, 0, sizeof(buf->f_spare)); 157 memset(buf.f_spare, 0, sizeof(buf.f_spare));
145 } 158 }
159 if (copy_to_user(p, &buf, sizeof(buf)))
160 return -EFAULT;
146 return 0; 161 return 0;
147} 162}
148 163
149SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf) 164SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
150{ 165{
151 struct path path; 166 struct kstatfs st;
152 int error; 167 int error = user_statfs(pathname, &st);
153 168 if (!error)
154 error = user_path(pathname, &path); 169 error = do_statfs_native(&st, buf);
155 if (!error) {
156 struct statfs tmp;
157 error = do_statfs_native(&path, &tmp);
158 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
159 error = -EFAULT;
160 path_put(&path);
161 }
162 return error; 170 return error;
163} 171}
164 172
165SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf) 173SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
166{ 174{
167 struct path path; 175 struct kstatfs st;
168 long error; 176 int error;
169
170 if (sz != sizeof(*buf)) 177 if (sz != sizeof(*buf))
171 return -EINVAL; 178 return -EINVAL;
172 error = user_path(pathname, &path); 179 error = user_statfs(pathname, &st);
173 if (!error) { 180 if (!error)
174 struct statfs64 tmp; 181 error = do_statfs64(&st, buf);
175 error = do_statfs64(&path, &tmp);
176 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
177 error = -EFAULT;
178 path_put(&path);
179 }
180 return error; 182 return error;
181} 183}
182 184
183SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf) 185SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
184{ 186{
185 struct file *file; 187 struct kstatfs st;
186 struct statfs tmp; 188 int error = fd_statfs(fd, &st);
187 int error; 189 if (!error)
188 190 error = do_statfs_native(&st, buf);
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = do_statfs_native(&file->f_path, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error; 191 return error;
199} 192}
200 193
201SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf) 194SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
202{ 195{
203 struct file *file; 196 struct kstatfs st;
204 struct statfs64 tmp;
205 int error; 197 int error;
206 198
207 if (sz != sizeof(*buf)) 199 if (sz != sizeof(*buf))
208 return -EINVAL; 200 return -EINVAL;
209 201
210 error = -EBADF; 202 error = fd_statfs(fd, &st);
211 file = fget(fd); 203 if (!error)
212 if (!file) 204 error = do_statfs64(&st, buf);
213 goto out;
214 error = do_statfs64(&file->f_path, &tmp);
215 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
216 error = -EFAULT;
217 fput(file);
218out:
219 return error; 205 return error;
220} 206}
221 207
diff --git a/fs/super.c b/fs/super.c
index 74e149efed81..8a06881b1920 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -71,6 +71,7 @@ static struct super_block *alloc_super(struct file_system_type *type)
71#else 71#else
72 INIT_LIST_HEAD(&s->s_files); 72 INIT_LIST_HEAD(&s->s_files);
73#endif 73#endif
74 s->s_bdi = &default_backing_dev_info;
74 INIT_LIST_HEAD(&s->s_instances); 75 INIT_LIST_HEAD(&s->s_instances);
75 INIT_HLIST_BL_HEAD(&s->s_anon); 76 INIT_HLIST_BL_HEAD(&s->s_anon);
76 INIT_LIST_HEAD(&s->s_inodes); 77 INIT_LIST_HEAD(&s->s_inodes);
@@ -177,6 +178,11 @@ void deactivate_locked_super(struct super_block *s)
177 struct file_system_type *fs = s->s_type; 178 struct file_system_type *fs = s->s_type;
178 if (atomic_dec_and_test(&s->s_active)) { 179 if (atomic_dec_and_test(&s->s_active)) {
179 fs->kill_sb(s); 180 fs->kill_sb(s);
181 /*
182 * We need to call rcu_barrier so all the delayed rcu free
183 * inodes are flushed before we release the fs module.
184 */
185 rcu_barrier();
180 put_filesystem(fs); 186 put_filesystem(fs);
181 put_super(s); 187 put_super(s);
182 } else { 188 } else {
@@ -838,23 +844,6 @@ error:
838} 844}
839EXPORT_SYMBOL(mount_bdev); 845EXPORT_SYMBOL(mount_bdev);
840 846
841int get_sb_bdev(struct file_system_type *fs_type,
842 int flags, const char *dev_name, void *data,
843 int (*fill_super)(struct super_block *, void *, int),
844 struct vfsmount *mnt)
845{
846 struct dentry *root;
847
848 root = mount_bdev(fs_type, flags, dev_name, data, fill_super);
849 if (IS_ERR(root))
850 return PTR_ERR(root);
851 mnt->mnt_root = root;
852 mnt->mnt_sb = root->d_sb;
853 return 0;
854}
855
856EXPORT_SYMBOL(get_sb_bdev);
857
858void kill_block_super(struct super_block *sb) 847void kill_block_super(struct super_block *sb)
859{ 848{
860 struct block_device *bdev = sb->s_bdev; 849 struct block_device *bdev = sb->s_bdev;
@@ -892,22 +881,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type,
892} 881}
893EXPORT_SYMBOL(mount_nodev); 882EXPORT_SYMBOL(mount_nodev);
894 883
895int get_sb_nodev(struct file_system_type *fs_type,
896 int flags, void *data,
897 int (*fill_super)(struct super_block *, void *, int),
898 struct vfsmount *mnt)
899{
900 struct dentry *root;
901
902 root = mount_nodev(fs_type, flags, data, fill_super);
903 if (IS_ERR(root))
904 return PTR_ERR(root);
905 mnt->mnt_root = root;
906 mnt->mnt_sb = root->d_sb;
907 return 0;
908}
909EXPORT_SYMBOL(get_sb_nodev);
910
911static int compare_single(struct super_block *s, void *p) 884static int compare_single(struct super_block *s, void *p)
912{ 885{
913 return 1; 886 return 1;
@@ -938,69 +911,36 @@ struct dentry *mount_single(struct file_system_type *fs_type,
938} 911}
939EXPORT_SYMBOL(mount_single); 912EXPORT_SYMBOL(mount_single);
940 913
941int get_sb_single(struct file_system_type *fs_type, 914struct dentry *
942 int flags, void *data, 915mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
943 int (*fill_super)(struct super_block *, void *, int),
944 struct vfsmount *mnt)
945{
946 struct dentry *root;
947 root = mount_single(fs_type, flags, data, fill_super);
948 if (IS_ERR(root))
949 return PTR_ERR(root);
950 mnt->mnt_root = root;
951 mnt->mnt_sb = root->d_sb;
952 return 0;
953}
954
955EXPORT_SYMBOL(get_sb_single);
956
957struct vfsmount *
958vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data)
959{ 916{
960 struct vfsmount *mnt;
961 struct dentry *root; 917 struct dentry *root;
918 struct super_block *sb;
962 char *secdata = NULL; 919 char *secdata = NULL;
963 int error; 920 int error = -ENOMEM;
964
965 if (!type)
966 return ERR_PTR(-ENODEV);
967
968 error = -ENOMEM;
969 mnt = alloc_vfsmnt(name);
970 if (!mnt)
971 goto out;
972
973 if (flags & MS_KERNMOUNT)
974 mnt->mnt_flags = MNT_INTERNAL;
975 921
976 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) { 922 if (data && !(type->fs_flags & FS_BINARY_MOUNTDATA)) {
977 secdata = alloc_secdata(); 923 secdata = alloc_secdata();
978 if (!secdata) 924 if (!secdata)
979 goto out_mnt; 925 goto out;
980 926
981 error = security_sb_copy_data(data, secdata); 927 error = security_sb_copy_data(data, secdata);
982 if (error) 928 if (error)
983 goto out_free_secdata; 929 goto out_free_secdata;
984 } 930 }
985 931
986 if (type->mount) { 932 root = type->mount(type, flags, name, data);
987 root = type->mount(type, flags, name, data); 933 if (IS_ERR(root)) {
988 if (IS_ERR(root)) { 934 error = PTR_ERR(root);
989 error = PTR_ERR(root); 935 goto out_free_secdata;
990 goto out_free_secdata;
991 }
992 mnt->mnt_root = root;
993 mnt->mnt_sb = root->d_sb;
994 } else {
995 error = type->get_sb(type, flags, name, data, mnt);
996 if (error < 0)
997 goto out_free_secdata;
998 } 936 }
999 BUG_ON(!mnt->mnt_sb); 937 sb = root->d_sb;
1000 WARN_ON(!mnt->mnt_sb->s_bdi); 938 BUG_ON(!sb);
1001 mnt->mnt_sb->s_flags |= MS_BORN; 939 WARN_ON(!sb->s_bdi);
940 WARN_ON(sb->s_bdi == &default_backing_dev_info);
941 sb->s_flags |= MS_BORN;
1002 942
1003 error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); 943 error = security_sb_kern_mount(sb, flags, secdata);
1004 if (error) 944 if (error)
1005 goto out_sb; 945 goto out_sb;
1006 946
@@ -1011,27 +951,21 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
1011 * violate this rule. This warning should be either removed or 951 * violate this rule. This warning should be either removed or
1012 * converted to a BUG() in 2.6.34. 952 * converted to a BUG() in 2.6.34.
1013 */ 953 */
1014 WARN((mnt->mnt_sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " 954 WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
1015 "negative value (%lld)\n", type->name, mnt->mnt_sb->s_maxbytes); 955 "negative value (%lld)\n", type->name, sb->s_maxbytes);
1016 956
1017 mnt->mnt_mountpoint = mnt->mnt_root; 957 up_write(&sb->s_umount);
1018 mnt->mnt_parent = mnt;
1019 up_write(&mnt->mnt_sb->s_umount);
1020 free_secdata(secdata); 958 free_secdata(secdata);
1021 return mnt; 959 return root;
1022out_sb: 960out_sb:
1023 dput(mnt->mnt_root); 961 dput(root);
1024 deactivate_locked_super(mnt->mnt_sb); 962 deactivate_locked_super(sb);
1025out_free_secdata: 963out_free_secdata:
1026 free_secdata(secdata); 964 free_secdata(secdata);
1027out_mnt:
1028 free_vfsmnt(mnt);
1029out: 965out:
1030 return ERR_PTR(error); 966 return ERR_PTR(error);
1031} 967}
1032 968
1033EXPORT_SYMBOL_GPL(vfs_kern_mount);
1034
1035/** 969/**
1036 * freeze_super - lock the filesystem and force it into a consistent state 970 * freeze_super - lock the filesystem and force it into a consistent state
1037 * @sb: the super to lock 971 * @sb: the super to lock
@@ -1121,49 +1055,3 @@ out:
1121 return 0; 1055 return 0;
1122} 1056}
1123EXPORT_SYMBOL(thaw_super); 1057EXPORT_SYMBOL(thaw_super);
1124
1125static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
1126{
1127 int err;
1128 const char *subtype = strchr(fstype, '.');
1129 if (subtype) {
1130 subtype++;
1131 err = -EINVAL;
1132 if (!subtype[0])
1133 goto err;
1134 } else
1135 subtype = "";
1136
1137 mnt->mnt_sb->s_subtype = kstrdup(subtype, GFP_KERNEL);
1138 err = -ENOMEM;
1139 if (!mnt->mnt_sb->s_subtype)
1140 goto err;
1141 return mnt;
1142
1143 err:
1144 mntput(mnt);
1145 return ERR_PTR(err);
1146}
1147
1148struct vfsmount *
1149do_kern_mount(const char *fstype, int flags, const char *name, void *data)
1150{
1151 struct file_system_type *type = get_fs_type(fstype);
1152 struct vfsmount *mnt;
1153 if (!type)
1154 return ERR_PTR(-ENODEV);
1155 mnt = vfs_kern_mount(type, flags, name, data);
1156 if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
1157 !mnt->mnt_sb->s_subtype)
1158 mnt = fs_set_subtype(mnt, fstype);
1159 put_filesystem(type);
1160 return mnt;
1161}
1162EXPORT_SYMBOL_GPL(do_kern_mount);
1163
1164struct vfsmount *kern_mount_data(struct file_system_type *type, void *data)
1165{
1166 return vfs_kern_mount(type, MS_KERNMOUNT, type->name, data);
1167}
1168
1169EXPORT_SYMBOL_GPL(kern_mount_data);
diff --git a/fs/sync.c b/fs/sync.c
index ba76b9623e7e..c38ec163da6c 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,6 +7,7 @@
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/namei.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/writeback.h> 12#include <linux/writeback.h>
12#include <linux/syscalls.h> 13#include <linux/syscalls.h>
@@ -33,7 +34,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
33 * This should be safe, as we require bdi backing to actually 34 * This should be safe, as we require bdi backing to actually
34 * write out data in the first place 35 * write out data in the first place
35 */ 36 */
36 if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) 37 if (sb->s_bdi == &noop_backing_dev_info)
37 return 0; 38 return 0;
38 39
39 if (sb->s_qcop && sb->s_qcop->quota_sync) 40 if (sb->s_qcop && sb->s_qcop->quota_sync)
@@ -79,7 +80,7 @@ EXPORT_SYMBOL_GPL(sync_filesystem);
79 80
80static void sync_one_sb(struct super_block *sb, void *arg) 81static void sync_one_sb(struct super_block *sb, void *arg)
81{ 82{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi) 83 if (!(sb->s_flags & MS_RDONLY))
83 __sync_filesystem(sb, *(int *)arg); 84 __sync_filesystem(sb, *(int *)arg);
84} 85}
85/* 86/*
@@ -128,6 +129,29 @@ void emergency_sync(void)
128 } 129 }
129} 130}
130 131
132/*
133 * sync a single super
134 */
135SYSCALL_DEFINE1(syncfs, int, fd)
136{
137 struct file *file;
138 struct super_block *sb;
139 int ret;
140 int fput_needed;
141
142 file = fget_light(fd, &fput_needed);
143 if (!file)
144 return -EBADF;
145 sb = file->f_dentry->d_sb;
146
147 down_read(&sb->s_umount);
148 ret = sync_filesystem(sb);
149 up_read(&sb->s_umount);
150
151 fput_light(file, fput_needed);
152 return ret;
153}
154
131/** 155/**
132 * vfs_fsync_range - helper to sync a range of data & metadata to disk 156 * vfs_fsync_range - helper to sync a range of data & metadata to disk
133 * @file: file to sync 157 * @file: file to sync
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 9ca66276315e..fa8d43c92bb8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -488,7 +488,6 @@ static sector_t sysv_bmap(struct address_space *mapping, sector_t block)
488const struct address_space_operations sysv_aops = { 488const struct address_space_operations sysv_aops = {
489 .readpage = sysv_readpage, 489 .readpage = sysv_readpage,
490 .writepage = sysv_writepage, 490 .writepage = sysv_writepage,
491 .sync_page = block_sync_page,
492 .write_begin = sysv_write_begin, 491 .write_begin = sysv_write_begin,
493 .write_end = generic_write_end, 492 .write_end = generic_write_end,
494 .bmap = sysv_bmap 493 .bmap = sysv_bmap
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
245 new_de = sysv_find_entry(new_dentry, &new_page); 245 new_de = sysv_find_entry(new_dentry, &new_page);
246 if (!new_de) 246 if (!new_de)
247 goto out_dir; 247 goto out_dir;
248 inode_inc_link_count(old_inode);
249 sysv_set_link(new_de, new_page, old_inode); 248 sysv_set_link(new_de, new_page, old_inode);
250 new_inode->i_ctime = CURRENT_TIME_SEC; 249 new_inode->i_ctime = CURRENT_TIME_SEC;
251 if (dir_de) 250 if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
257 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max) 256 if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
258 goto out_dir; 257 goto out_dir;
259 } 258 }
260 inode_inc_link_count(old_inode);
261 err = sysv_add_link(new_dentry, old_inode); 259 err = sysv_add_link(new_dentry, old_inode);
262 if (err) { 260 if (err)
263 inode_dec_link_count(old_inode);
264 goto out_dir; 261 goto out_dir;
265 }
266 if (dir_de) 262 if (dir_de)
267 inode_inc_link_count(new_dir); 263 inode_inc_link_count(new_dir);
268 } 264 }
269 265
270 sysv_delete_entry(old_de, old_page); 266 sysv_delete_entry(old_de, old_page);
271 inode_dec_link_count(old_inode); 267 mark_inode_dirty(old_inode);
272 268
273 if (dir_de) { 269 if (dir_de) {
274 sysv_set_link(dir_de, dir_page, new_dir); 270 sysv_set_link(dir_de, dir_page, new_dir);
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index 830e3f76f442..f8b0160da2da 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -44,29 +44,17 @@ config UBIFS_FS_ZLIB
44 44
45# Debugging-related stuff 45# Debugging-related stuff
46config UBIFS_FS_DEBUG 46config UBIFS_FS_DEBUG
47 bool "Enable debugging" 47 bool "Enable debugging support"
48 depends on UBIFS_FS 48 depends on UBIFS_FS
49 select DEBUG_FS 49 select DEBUG_FS
50 select KALLSYMS_ALL 50 select KALLSYMS
51 help 51 help
52 This option enables UBIFS debugging. 52 This option enables UBIFS debugging support. It makes sure various
53 53 assertions, self-checks, debugging messages and test modes are compiled
54config UBIFS_FS_DEBUG_MSG_LVL 54 in (this all is compiled out otherwise). Assertions are light-weight
55 int "Default message level (0 = no extra messages, 3 = lots)" 55 and this option also enables them. Self-checks, debugging messages and
56 depends on UBIFS_FS_DEBUG 56 test modes are switched off by default. Thus, it is safe and actually
57 default "0" 57 recommended to have debugging support enabled, and it should not slow
58 help 58 down UBIFS. You can then further enable / disable individual debugging
59 This controls the amount of debugging messages produced by UBIFS. 59 features using UBIFS module parameters and the corresponding sysfs
60 If reporting bugs, please try to have available a full dump of the 60 interfaces.
61 messages at level 1 while the misbehaviour was occurring. Level 2
62 may become necessary if level 1 messages were not enough to find the
63 bug. Generally Level 3 should be avoided.
64
65config UBIFS_FS_DEBUG_CHKS
66 bool "Enable extra checks"
67 depends on UBIFS_FS_DEBUG
68 help
69 If extra checks are enabled UBIFS will check the consistency of its
70 internal data structures during operation. However, UBIFS performance
71 is dramatically slower when this option is selected especially if the
72 file system is large.
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index c8ff0d1ae5d3..8b3a7da531eb 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -147,7 +147,7 @@ static int make_free_space(struct ubifs_info *c)
147 if (liab2 < liab1) 147 if (liab2 < liab1)
148 return -EAGAIN; 148 return -EAGAIN;
149 149
150 dbg_budg("new liability %lld (not shrinked)", liab2); 150 dbg_budg("new liability %lld (not shrunk)", liab2);
151 151
152 /* Liability did not shrink again, try GC */ 152 /* Liability did not shrink again, try GC */
153 dbg_budg("Run GC"); 153 dbg_budg("Run GC");
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index 02429d81ca33..1bd01ded7123 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -48,6 +48,56 @@
48#include <linux/slab.h> 48#include <linux/slab.h>
49#include "ubifs.h" 49#include "ubifs.h"
50 50
51/*
52 * nothing_to_commit - check if there is nothing to commit.
53 * @c: UBIFS file-system description object
54 *
55 * This is a helper function which checks if there is anything to commit. It is
56 * used as an optimization to avoid starting the commit if it is not really
57 * necessary. Indeed, the commit operation always assumes flash I/O (e.g.,
58 * writing the commit start node to the log), and it is better to avoid doing
59 * this unnecessarily. E.g., 'ubifs_sync_fs()' runs the commit, but if there is
60 * nothing to commit, it is more optimal to avoid any flash I/O.
61 *
62 * This function has to be called with @c->commit_sem locked for writing -
63 * this function does not take LPT/TNC locks because the @c->commit_sem
64 * guarantees that we have exclusive access to the TNC and LPT data structures.
65 *
66 * This function returns %1 if there is nothing to commit and %0 otherwise.
67 */
68static int nothing_to_commit(struct ubifs_info *c)
69{
70 /*
71 * During mounting or remounting from R/O mode to R/W mode we may
72 * commit for various recovery-related reasons.
73 */
74 if (c->mounting || c->remounting_rw)
75 return 0;
76
77 /*
78 * If the root TNC node is dirty, we definitely have something to
79 * commit.
80 */
81 if (c->zroot.znode && test_bit(DIRTY_ZNODE, &c->zroot.znode->flags))
82 return 0;
83
84 /*
85 * Even though the TNC is clean, the LPT tree may have dirty nodes. For
86 * example, this may happen if the budgeting subsystem invoked GC to
87 * make some free space, and the GC found an LEB with only dirty and
88 * free space. In this case GC would just change the lprops of this
89 * LEB (by turning all space into free space) and unmap it.
90 */
91 if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
92 return 0;
93
94 ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0);
95 ubifs_assert(c->dirty_pn_cnt == 0);
96 ubifs_assert(c->dirty_nn_cnt == 0);
97
98 return 1;
99}
100
51/** 101/**
52 * do_commit - commit the journal. 102 * do_commit - commit the journal.
53 * @c: UBIFS file-system description object 103 * @c: UBIFS file-system description object
@@ -70,6 +120,12 @@ static int do_commit(struct ubifs_info *c)
70 goto out_up; 120 goto out_up;
71 } 121 }
72 122
123 if (nothing_to_commit(c)) {
124 up_write(&c->commit_sem);
125 err = 0;
126 goto out_cancel;
127 }
128
73 /* Sync all write buffers (necessary for recovery) */ 129 /* Sync all write buffers (necessary for recovery) */
74 for (i = 0; i < c->jhead_cnt; i++) { 130 for (i = 0; i < c->jhead_cnt; i++) {
75 err = ubifs_wbuf_sync(&c->jheads[i].wbuf); 131 err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
@@ -162,12 +218,12 @@ static int do_commit(struct ubifs_info *c)
162 if (err) 218 if (err)
163 goto out; 219 goto out;
164 220
221out_cancel:
165 spin_lock(&c->cs_lock); 222 spin_lock(&c->cs_lock);
166 c->cmt_state = COMMIT_RESTING; 223 c->cmt_state = COMMIT_RESTING;
167 wake_up(&c->cmt_wq); 224 wake_up(&c->cmt_wq);
168 dbg_cmt("commit end"); 225 dbg_cmt("commit end");
169 spin_unlock(&c->cs_lock); 226 spin_unlock(&c->cs_lock);
170
171 return 0; 227 return 0;
172 228
173out_up: 229out_up:
@@ -521,7 +577,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
521 size_t sz; 577 size_t sz;
522 578
523 if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) 579 if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX))
524 goto out; 580 return 0;
525 581
526 INIT_LIST_HEAD(&list); 582 INIT_LIST_HEAD(&list);
527 583
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 0bee4dbffc31..004d3745dc45 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -43,8 +43,8 @@ DEFINE_SPINLOCK(dbg_lock);
43static char dbg_key_buf0[128]; 43static char dbg_key_buf0[128];
44static char dbg_key_buf1[128]; 44static char dbg_key_buf1[128];
45 45
46unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; 46unsigned int ubifs_msg_flags;
47unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; 47unsigned int ubifs_chk_flags;
48unsigned int ubifs_tst_flags; 48unsigned int ubifs_tst_flags;
49 49
50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); 50module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR);
@@ -810,16 +810,24 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
810{ 810{
811 struct ubifs_scan_leb *sleb; 811 struct ubifs_scan_leb *sleb;
812 struct ubifs_scan_node *snod; 812 struct ubifs_scan_node *snod;
813 void *buf;
813 814
814 if (dbg_failure_mode) 815 if (dbg_failure_mode)
815 return; 816 return;
816 817
817 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 818 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
818 current->pid, lnum); 819 current->pid, lnum);
819 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 820
821 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
822 if (!buf) {
823 ubifs_err("cannot allocate memory for dumping LEB %d", lnum);
824 return;
825 }
826
827 sleb = ubifs_scan(c, lnum, 0, buf, 0);
820 if (IS_ERR(sleb)) { 828 if (IS_ERR(sleb)) {
821 ubifs_err("scan error %d", (int)PTR_ERR(sleb)); 829 ubifs_err("scan error %d", (int)PTR_ERR(sleb));
822 return; 830 goto out;
823 } 831 }
824 832
825 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, 833 printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
@@ -835,6 +843,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
835 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 843 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
836 current->pid, lnum); 844 current->pid, lnum);
837 ubifs_scan_destroy(sleb); 845 ubifs_scan_destroy(sleb);
846
847out:
848 vfree(buf);
838 return; 849 return;
839} 850}
840 851
@@ -961,11 +972,39 @@ void dbg_dump_index(struct ubifs_info *c)
961void dbg_save_space_info(struct ubifs_info *c) 972void dbg_save_space_info(struct ubifs_info *c)
962{ 973{
963 struct ubifs_debug_info *d = c->dbg; 974 struct ubifs_debug_info *d = c->dbg;
964 975 int freeable_cnt;
965 ubifs_get_lp_stats(c, &d->saved_lst);
966 976
967 spin_lock(&c->space_lock); 977 spin_lock(&c->space_lock);
978 memcpy(&d->saved_lst, &c->lst, sizeof(struct ubifs_lp_stats));
979
980 /*
981 * We use a dirty hack here and zero out @c->freeable_cnt, because it
982 * affects the free space calculations, and UBIFS might not know about
983 * all freeable eraseblocks. Indeed, we know about freeable eraseblocks
984 * only when we read their lprops, and we do this only lazily, upon the
985 * need. So at any given point of time @c->freeable_cnt might be not
986 * exactly accurate.
987 *
988 * Just one example about the issue we hit when we did not zero
989 * @c->freeable_cnt.
990 * 1. The file-system is mounted R/O, c->freeable_cnt is %0. We save the
991 * amount of free space in @d->saved_free
992 * 2. We re-mount R/W, which makes UBIFS to read the "lsave"
993 * information from flash, where we cache LEBs from various
994 * categories ('ubifs_remount_fs()' -> 'ubifs_lpt_init()'
995 * -> 'lpt_init_wr()' -> 'read_lsave()' -> 'ubifs_lpt_lookup()'
996 * -> 'ubifs_get_pnode()' -> 'update_cats()'
997 * -> 'ubifs_add_to_cat()').
998 * 3. Lsave contains a freeable eraseblock, and @c->freeable_cnt
999 * becomes %1.
1000 * 4. We calculate the amount of free space when the re-mount is
1001 * finished in 'dbg_check_space_info()' and it does not match
1002 * @d->saved_free.
1003 */
1004 freeable_cnt = c->freeable_cnt;
1005 c->freeable_cnt = 0;
968 d->saved_free = ubifs_get_free_space_nolock(c); 1006 d->saved_free = ubifs_get_free_space_nolock(c);
1007 c->freeable_cnt = freeable_cnt;
969 spin_unlock(&c->space_lock); 1008 spin_unlock(&c->space_lock);
970} 1009}
971 1010
@@ -982,12 +1021,15 @@ int dbg_check_space_info(struct ubifs_info *c)
982{ 1021{
983 struct ubifs_debug_info *d = c->dbg; 1022 struct ubifs_debug_info *d = c->dbg;
984 struct ubifs_lp_stats lst; 1023 struct ubifs_lp_stats lst;
985 long long avail, free; 1024 long long free;
1025 int freeable_cnt;
986 1026
987 spin_lock(&c->space_lock); 1027 spin_lock(&c->space_lock);
988 avail = ubifs_calc_available(c, c->min_idx_lebs); 1028 freeable_cnt = c->freeable_cnt;
1029 c->freeable_cnt = 0;
1030 free = ubifs_get_free_space_nolock(c);
1031 c->freeable_cnt = freeable_cnt;
989 spin_unlock(&c->space_lock); 1032 spin_unlock(&c->space_lock);
990 free = ubifs_get_free_space(c);
991 1033
992 if (free != d->saved_free) { 1034 if (free != d->saved_free) {
993 ubifs_err("free space changed from %lld to %lld", 1035 ubifs_err("free space changed from %lld to %lld",
@@ -2690,16 +2732,8 @@ int ubifs_debugging_init(struct ubifs_info *c)
2690 if (!c->dbg) 2732 if (!c->dbg)
2691 return -ENOMEM; 2733 return -ENOMEM;
2692 2734
2693 c->dbg->buf = vmalloc(c->leb_size);
2694 if (!c->dbg->buf)
2695 goto out;
2696
2697 failure_mode_init(c); 2735 failure_mode_init(c);
2698 return 0; 2736 return 0;
2699
2700out:
2701 kfree(c->dbg);
2702 return -ENOMEM;
2703} 2737}
2704 2738
2705/** 2739/**
@@ -2709,7 +2743,6 @@ out:
2709void ubifs_debugging_exit(struct ubifs_info *c) 2743void ubifs_debugging_exit(struct ubifs_info *c)
2710{ 2744{
2711 failure_mode_exit(c); 2745 failure_mode_exit(c);
2712 vfree(c->dbg->buf);
2713 kfree(c->dbg); 2746 kfree(c->dbg);
2714} 2747}
2715 2748
@@ -2804,40 +2837,38 @@ int dbg_debugfs_init_fs(struct ubifs_info *c)
2804 struct ubifs_debug_info *d = c->dbg; 2837 struct ubifs_debug_info *d = c->dbg;
2805 2838
2806 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id); 2839 sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
2807 d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir); 2840 fname = d->dfs_dir_name;
2808 if (IS_ERR(d->dfs_dir)) { 2841 dent = debugfs_create_dir(fname, dfs_rootdir);
2809 err = PTR_ERR(d->dfs_dir); 2842 if (IS_ERR_OR_NULL(dent))
2810 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2811 d->dfs_dir_name, err);
2812 goto out; 2843 goto out;
2813 } 2844 d->dfs_dir = dent;
2814 2845
2815 fname = "dump_lprops"; 2846 fname = "dump_lprops";
2816 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2847 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2817 if (IS_ERR(dent)) 2848 if (IS_ERR_OR_NULL(dent))
2818 goto out_remove; 2849 goto out_remove;
2819 d->dfs_dump_lprops = dent; 2850 d->dfs_dump_lprops = dent;
2820 2851
2821 fname = "dump_budg"; 2852 fname = "dump_budg";
2822 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2853 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2823 if (IS_ERR(dent)) 2854 if (IS_ERR_OR_NULL(dent))
2824 goto out_remove; 2855 goto out_remove;
2825 d->dfs_dump_budg = dent; 2856 d->dfs_dump_budg = dent;
2826 2857
2827 fname = "dump_tnc"; 2858 fname = "dump_tnc";
2828 dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops); 2859 dent = debugfs_create_file(fname, S_IWUSR, d->dfs_dir, c, &dfs_fops);
2829 if (IS_ERR(dent)) 2860 if (IS_ERR_OR_NULL(dent))
2830 goto out_remove; 2861 goto out_remove;
2831 d->dfs_dump_tnc = dent; 2862 d->dfs_dump_tnc = dent;
2832 2863
2833 return 0; 2864 return 0;
2834 2865
2835out_remove: 2866out_remove:
2836 err = PTR_ERR(dent);
2837 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2838 fname, err);
2839 debugfs_remove_recursive(d->dfs_dir); 2867 debugfs_remove_recursive(d->dfs_dir);
2840out: 2868out:
2869 err = dent ? PTR_ERR(dent) : -ENODEV;
2870 ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
2871 fname, err);
2841 return err; 2872 return err;
2842} 2873}
2843 2874
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 69ebe4729151..e6493cac193d 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -23,11 +23,16 @@
23#ifndef __UBIFS_DEBUG_H__ 23#ifndef __UBIFS_DEBUG_H__
24#define __UBIFS_DEBUG_H__ 24#define __UBIFS_DEBUG_H__
25 25
26/* Checking helper functions */
27typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
28 struct ubifs_zbranch *zbr, void *priv);
29typedef int (*dbg_znode_callback)(struct ubifs_info *c,
30 struct ubifs_znode *znode, void *priv);
31
26#ifdef CONFIG_UBIFS_FS_DEBUG 32#ifdef CONFIG_UBIFS_FS_DEBUG
27 33
28/** 34/**
29 * ubifs_debug_info - per-FS debugging information. 35 * ubifs_debug_info - per-FS debugging information.
30 * @buf: a buffer of LEB size, used for various purposes
31 * @old_zroot: old index root - used by 'dbg_check_old_index()' 36 * @old_zroot: old index root - used by 'dbg_check_old_index()'
32 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()' 37 * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
33 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()' 38 * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
@@ -54,7 +59,6 @@
54 * dfs_dump_tnc: "dump TNC" debugfs knob 59 * dfs_dump_tnc: "dump TNC" debugfs knob
55 */ 60 */
56struct ubifs_debug_info { 61struct ubifs_debug_info {
57 void *buf;
58 struct ubifs_zbranch old_zroot; 62 struct ubifs_zbranch old_zroot;
59 int old_zroot_level; 63 int old_zroot_level;
60 unsigned long long old_zroot_sqnum; 64 unsigned long long old_zroot_sqnum;
@@ -173,7 +177,7 @@ const char *dbg_key_str1(const struct ubifs_info *c,
173#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) 177#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__)
174 178
175/* 179/*
176 * Debugging message type flags (must match msg_type_names in debug.c). 180 * Debugging message type flags.
177 * 181 *
178 * UBIFS_MSG_GEN: general messages 182 * UBIFS_MSG_GEN: general messages
179 * UBIFS_MSG_JNL: journal messages 183 * UBIFS_MSG_JNL: journal messages
@@ -205,14 +209,8 @@ enum {
205 UBIFS_MSG_RCVRY = 0x1000, 209 UBIFS_MSG_RCVRY = 0x1000,
206}; 210};
207 211
208/* Debugging message type flags for each default debug message level */
209#define UBIFS_MSG_LVL_0 0
210#define UBIFS_MSG_LVL_1 0x1
211#define UBIFS_MSG_LVL_2 0x7f
212#define UBIFS_MSG_LVL_3 0xffff
213
214/* 212/*
215 * Debugging check flags (must match chk_names in debug.c). 213 * Debugging check flags.
216 * 214 *
217 * UBIFS_CHK_GEN: general checks 215 * UBIFS_CHK_GEN: general checks
218 * UBIFS_CHK_TNC: check TNC 216 * UBIFS_CHK_TNC: check TNC
@@ -233,7 +231,7 @@ enum {
233}; 231};
234 232
235/* 233/*
236 * Special testing flags (must match tst_names in debug.c). 234 * Special testing flags.
237 * 235 *
238 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method 236 * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method
239 * UBIFS_TST_RCVRY: failure mode for recovery testing 237 * UBIFS_TST_RCVRY: failure mode for recovery testing
@@ -243,22 +241,6 @@ enum {
243 UBIFS_TST_RCVRY = 0x4, 241 UBIFS_TST_RCVRY = 0x4,
244}; 242};
245 243
246#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1
247#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1
248#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2
249#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2
250#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3
251#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3
252#else
253#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0
254#endif
255
256#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS
257#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff
258#else
259#define UBIFS_CHK_FLAGS_DEFAULT 0
260#endif
261
262extern spinlock_t dbg_lock; 244extern spinlock_t dbg_lock;
263 245
264extern unsigned int ubifs_msg_flags; 246extern unsigned int ubifs_msg_flags;
@@ -294,11 +276,6 @@ void dbg_dump_tnc(struct ubifs_info *c);
294void dbg_dump_index(struct ubifs_info *c); 276void dbg_dump_index(struct ubifs_info *c);
295void dbg_dump_lpt_lebs(const struct ubifs_info *c); 277void dbg_dump_lpt_lebs(const struct ubifs_info *c);
296 278
297/* Checking helper functions */
298typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
299 struct ubifs_zbranch *zbr, void *priv);
300typedef int (*dbg_znode_callback)(struct ubifs_info *c,
301 struct ubifs_znode *znode, void *priv);
302int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, 279int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
303 dbg_znode_callback znode_cb, void *priv); 280 dbg_znode_callback znode_cb, void *priv);
304 281
@@ -319,7 +296,6 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size);
319int dbg_check_filesystem(struct ubifs_info *c); 296int dbg_check_filesystem(struct ubifs_info *c);
320void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, 297void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat,
321 int add_pos); 298 int add_pos);
322int dbg_check_lprops(struct ubifs_info *c);
323int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, 299int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
324 int row, int col); 300 int row, int col);
325int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, 301int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode,
@@ -425,58 +401,94 @@ void dbg_debugfs_exit_fs(struct ubifs_info *c);
425#define DBGKEY(key) ((char *)(key)) 401#define DBGKEY(key) ((char *)(key))
426#define DBGKEY1(key) ((char *)(key)) 402#define DBGKEY1(key) ((char *)(key))
427 403
428#define ubifs_debugging_init(c) 0 404static inline int ubifs_debugging_init(struct ubifs_info *c) { return 0; }
429#define ubifs_debugging_exit(c) ({}) 405static inline void ubifs_debugging_exit(struct ubifs_info *c) { return; }
430 406static inline const char *dbg_ntype(int type) { return ""; }
431#define dbg_ntype(type) "" 407static inline const char *dbg_cstate(int cmt_state) { return ""; }
432#define dbg_cstate(cmt_state) "" 408static inline const char *dbg_jhead(int jhead) { return ""; }
433#define dbg_jhead(jhead) "" 409static inline const char *
434#define dbg_get_key_dump(c, key) ({}) 410dbg_get_key_dump(const struct ubifs_info *c,
435#define dbg_dump_inode(c, inode) ({}) 411 const union ubifs_key *key) { return ""; }
436#define dbg_dump_node(c, node) ({}) 412static inline void dbg_dump_inode(const struct ubifs_info *c,
437#define dbg_dump_lpt_node(c, node, lnum, offs) ({}) 413 const struct inode *inode) { return; }
438#define dbg_dump_budget_req(req) ({}) 414static inline void dbg_dump_node(const struct ubifs_info *c,
439#define dbg_dump_lstats(lst) ({}) 415 const void *node) { return; }
440#define dbg_dump_budg(c) ({}) 416static inline void dbg_dump_lpt_node(const struct ubifs_info *c,
441#define dbg_dump_lprop(c, lp) ({}) 417 void *node, int lnum,
442#define dbg_dump_lprops(c) ({}) 418 int offs) { return; }
443#define dbg_dump_lpt_info(c) ({}) 419static inline void
444#define dbg_dump_leb(c, lnum) ({}) 420dbg_dump_budget_req(const struct ubifs_budget_req *req) { return; }
445#define dbg_dump_znode(c, znode) ({}) 421static inline void
446#define dbg_dump_heap(c, heap, cat) ({}) 422dbg_dump_lstats(const struct ubifs_lp_stats *lst) { return; }
447#define dbg_dump_pnode(c, pnode, parent, iip) ({}) 423static inline void dbg_dump_budg(struct ubifs_info *c) { return; }
448#define dbg_dump_tnc(c) ({}) 424static inline void dbg_dump_lprop(const struct ubifs_info *c,
449#define dbg_dump_index(c) ({}) 425 const struct ubifs_lprops *lp) { return; }
450#define dbg_dump_lpt_lebs(c) ({}) 426static inline void dbg_dump_lprops(struct ubifs_info *c) { return; }
451 427static inline void dbg_dump_lpt_info(struct ubifs_info *c) { return; }
452#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0 428static inline void dbg_dump_leb(const struct ubifs_info *c,
453#define dbg_old_index_check_init(c, zroot) 0 429 int lnum) { return; }
454#define dbg_save_space_info(c) ({}) 430static inline void
455#define dbg_check_space_info(c) 0 431dbg_dump_znode(const struct ubifs_info *c,
456#define dbg_check_old_index(c, zroot) 0 432 const struct ubifs_znode *znode) { return; }
457#define dbg_check_cats(c) 0 433static inline void dbg_dump_heap(struct ubifs_info *c,
458#define dbg_check_ltab(c) 0 434 struct ubifs_lpt_heap *heap,
459#define dbg_chk_lpt_free_spc(c) 0 435 int cat) { return; }
460#define dbg_chk_lpt_sz(c, action, len) 0 436static inline void dbg_dump_pnode(struct ubifs_info *c,
461#define dbg_check_synced_i_size(inode) 0 437 struct ubifs_pnode *pnode,
462#define dbg_check_dir_size(c, dir) 0 438 struct ubifs_nnode *parent,
463#define dbg_check_tnc(c, x) 0 439 int iip) { return; }
464#define dbg_check_idx_size(c, idx_size) 0 440static inline void dbg_dump_tnc(struct ubifs_info *c) { return; }
465#define dbg_check_filesystem(c) 0 441static inline void dbg_dump_index(struct ubifs_info *c) { return; }
466#define dbg_check_heap(c, heap, cat, add_pos) ({}) 442static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c) { return; }
467#define dbg_check_lprops(c) 0 443
468#define dbg_check_lpt_nodes(c, cnode, row, col) 0 444static inline int dbg_walk_index(struct ubifs_info *c,
469#define dbg_check_inode_size(c, inode, size) 0 445 dbg_leaf_callback leaf_cb,
470#define dbg_check_data_nodes_order(c, head) 0 446 dbg_znode_callback znode_cb,
471#define dbg_check_nondata_nodes_order(c, head) 0 447 void *priv) { return 0; }
472#define dbg_force_in_the_gaps_enabled 0 448static inline void dbg_save_space_info(struct ubifs_info *c) { return; }
473#define dbg_force_in_the_gaps() 0 449static inline int dbg_check_space_info(struct ubifs_info *c) { return 0; }
474#define dbg_failure_mode 0 450static inline int dbg_check_lprops(struct ubifs_info *c) { return 0; }
475 451static inline int
476#define dbg_debugfs_init() 0 452dbg_old_index_check_init(struct ubifs_info *c,
477#define dbg_debugfs_exit() 453 struct ubifs_zbranch *zroot) { return 0; }
478#define dbg_debugfs_init_fs(c) 0 454static inline int
479#define dbg_debugfs_exit_fs(c) 0 455dbg_check_old_index(struct ubifs_info *c,
456 struct ubifs_zbranch *zroot) { return 0; }
457static inline int dbg_check_cats(struct ubifs_info *c) { return 0; }
458static inline int dbg_check_ltab(struct ubifs_info *c) { return 0; }
459static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c) { return 0; }
460static inline int dbg_chk_lpt_sz(struct ubifs_info *c,
461 int action, int len) { return 0; }
462static inline int dbg_check_synced_i_size(struct inode *inode) { return 0; }
463static inline int dbg_check_dir_size(struct ubifs_info *c,
464 const struct inode *dir) { return 0; }
465static inline int dbg_check_tnc(struct ubifs_info *c, int extra) { return 0; }
466static inline int dbg_check_idx_size(struct ubifs_info *c,
467 long long idx_size) { return 0; }
468static inline int dbg_check_filesystem(struct ubifs_info *c) { return 0; }
469static inline void dbg_check_heap(struct ubifs_info *c,
470 struct ubifs_lpt_heap *heap,
471 int cat, int add_pos) { return; }
472static inline int dbg_check_lpt_nodes(struct ubifs_info *c,
473 struct ubifs_cnode *cnode, int row, int col) { return 0; }
474static inline int dbg_check_inode_size(struct ubifs_info *c,
475 const struct inode *inode,
476 loff_t size) { return 0; }
477static inline int
478dbg_check_data_nodes_order(struct ubifs_info *c,
479 struct list_head *head) { return 0; }
480static inline int
481dbg_check_nondata_nodes_order(struct ubifs_info *c,
482 struct list_head *head) { return 0; }
483
484static inline int dbg_force_in_the_gaps(void) { return 0; }
485#define dbg_force_in_the_gaps_enabled 0
486#define dbg_failure_mode 0
487
488static inline int dbg_debugfs_init(void) { return 0; }
489static inline void dbg_debugfs_exit(void) { return; }
490static inline int dbg_debugfs_init_fs(struct ubifs_info *c) { return 0; }
491static inline int dbg_debugfs_exit_fs(struct ubifs_info *c) { return 0; }
480 492
481#endif /* !CONFIG_UBIFS_FS_DEBUG */ 493#endif /* !CONFIG_UBIFS_FS_DEBUG */
482#endif /* !__UBIFS_DEBUG_H__ */ 494#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7f..7217d67a80a6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
522 ubifs_assert(mutex_is_locked(&dir->i_mutex)); 522 ubifs_assert(mutex_is_locked(&dir->i_mutex));
523 ubifs_assert(mutex_is_locked(&inode->i_mutex)); 523 ubifs_assert(mutex_is_locked(&inode->i_mutex));
524 524
525 /*
526 * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing
527 * otherwise has the potential to corrupt the orphan inode list.
528 *
529 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
530 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
531 * lock 'dirA->i_mutex', so this is possible. Both of the functions
532 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
533 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
534 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
535 * to the list of orphans. After this, 'vfs_link()' will link
536 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
537 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
538 * to the list of orphans.
539 */
540 if (inode->i_nlink == 0)
541 return -ENOENT;
542
543 err = dbg_check_synced_i_size(inode); 525 err = dbg_check_synced_i_size(inode);
544 if (err) 526 if (err)
545 return err; 527 return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index d77db7e36484..b286db79c686 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -448,10 +448,12 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
449 /* 449 /*
450 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
451 * have to set the @PG_checked flag to make the further 451 * do not know whether this page exists on the media or
452 * code know that the page is new. This might be not 452 * not, so we assume the latter because it requires
453 * true, but it is better to budget more than to read 453 * larger budget. The assumption is that it is better
454 * the page from the media. 454 * to budget a bit more than to read the page from the
455 * media. Thus, we are setting the @PG_checked flag
456 * here.
455 */ 457 */
456 SetPageChecked(page); 458 SetPageChecked(page);
457 skipped_read = 1; 459 skipped_read = 1;
@@ -559,6 +561,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
559 dbg_gen("copied %d instead of %d, read page and repeat", 561 dbg_gen("copied %d instead of %d, read page and repeat",
560 copied, len); 562 copied, len);
561 cancel_budget(c, page, ui, appending); 563 cancel_budget(c, page, ui, appending);
564 ClearPageChecked(page);
562 565
563 /* 566 /*
564 * Return 0 to force VFS to repeat the whole operation, or the 567 * Return 0 to force VFS to repeat the whole operation, or the
@@ -1309,6 +1312,9 @@ int ubifs_fsync(struct file *file, int datasync)
1309 1312
1310 dbg_gen("syncing inode %lu", inode->i_ino); 1313 dbg_gen("syncing inode %lu", inode->i_ino);
1311 1314
1315 if (inode->i_sb->s_flags & MS_RDONLY)
1316 return 0;
1317
1312 /* 1318 /*
1313 * VFS has already synchronized dirty pages for this inode. Synchronize 1319 * VFS has already synchronized dirty pages for this inode. Synchronize
1314 * the inode unless this is a 'datasync()' call. 1320 * the inode unless this is a 'datasync()' call.
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index d82173182eeb..dfd168b7807e 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -31,6 +31,26 @@
31 * buffer is full or when it is not used for some time (by timer). This is 31 * buffer is full or when it is not used for some time (by timer). This is
32 * similar to the mechanism is used by JFFS2. 32 * similar to the mechanism is used by JFFS2.
33 * 33 *
34 * UBIFS distinguishes between minimum write size (@c->min_io_size) and maximum
35 * write size (@c->max_write_size). The latter is the maximum amount of bytes
36 * the underlying flash is able to program at a time, and writing in
37 * @c->max_write_size units should presumably be faster. Obviously,
38 * @c->min_io_size <= @c->max_write_size. Write-buffers are of
39 * @c->max_write_size bytes in size for maximum performance. However, when a
40 * write-buffer is flushed, only the portion of it (aligned to @c->min_io_size
41 * boundary) which contains data is written, not the whole write-buffer,
42 * because this is more space-efficient.
43 *
44 * This optimization adds few complications to the code. Indeed, on the one
45 * hand, we want to write in optimal @c->max_write_size bytes chunks, which
46 * also means aligning writes at the @c->max_write_size bytes offsets. On the
47 * other hand, we do not want to waste space when synchronizing the write
48 * buffer, so during synchronization we writes in smaller chunks. And this makes
49 * the next write offset to be not aligned to @c->max_write_size bytes. So the
50 * have to make sure that the write-buffer offset (@wbuf->offs) becomes aligned
51 * to @c->max_write_size bytes again. We do this by temporarily shrinking
52 * write-buffer size (@wbuf->size).
53 *
34 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by 54 * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by
35 * mutexes defined inside these objects. Since sometimes upper-level code 55 * mutexes defined inside these objects. Since sometimes upper-level code
36 * has to lock the write-buffer (e.g. journal space reservation code), many 56 * has to lock the write-buffer (e.g. journal space reservation code), many
@@ -46,8 +66,8 @@
46 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it 66 * UBIFS uses padding when it pads to the next min. I/O unit. In this case it
47 * uses padding nodes or padding bytes, if the padding node does not fit. 67 * uses padding nodes or padding bytes, if the padding node does not fit.
48 * 68 *
49 * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes 69 * All UBIFS nodes are protected by CRC checksums and UBIFS checks CRC when
50 * every time they are read from the flash media. 70 * they are read from the flash media.
51 */ 71 */
52 72
53#include <linux/crc32.h> 73#include <linux/crc32.h>
@@ -88,8 +108,12 @@ void ubifs_ro_mode(struct ubifs_info *c, int err)
88 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is 108 * This function may skip data nodes CRC checking if @c->no_chk_data_crc is
89 * true, which is controlled by corresponding UBIFS mount option. However, if 109 * true, which is controlled by corresponding UBIFS mount option. However, if
90 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is 110 * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is
91 * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is 111 * checked. Similarly, if @c->mounting or @c->remounting_rw is true (we are
92 * ignored and CRC is checked. 112 * mounting or re-mounting to R/W mode), @c->no_chk_data_crc is ignored and CRC
113 * is checked. This is because during mounting or re-mounting from R/O mode to
114 * R/W mode we may read journal nodes (when replying the journal or doing the
115 * recovery) and the journal nodes may potentially be corrupted, so checking is
116 * required.
93 * 117 *
94 * This function returns zero in case of success and %-EUCLEAN in case of bad 118 * This function returns zero in case of success and %-EUCLEAN in case of bad
95 * CRC or magic. 119 * CRC or magic.
@@ -131,8 +155,8 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
131 node_len > c->ranges[type].max_len) 155 node_len > c->ranges[type].max_len)
132 goto out_len; 156 goto out_len;
133 157
134 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && 158 if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->mounting &&
135 c->no_chk_data_crc) 159 !c->remounting_rw && c->no_chk_data_crc)
136 return 0; 160 return 0;
137 161
138 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 162 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
@@ -343,11 +367,17 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf)
343 * 367 *
344 * This function synchronizes write-buffer @buf and returns zero in case of 368 * This function synchronizes write-buffer @buf and returns zero in case of
345 * success or a negative error code in case of failure. 369 * success or a negative error code in case of failure.
370 *
371 * Note, although write-buffers are of @c->max_write_size, this function does
372 * not necessarily writes all @c->max_write_size bytes to the flash. Instead,
373 * if the write-buffer is only partially filled with data, only the used part
374 * of the write-buffer (aligned on @c->min_io_size boundary) is synchronized.
375 * This way we waste less space.
346 */ 376 */
347int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf) 377int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
348{ 378{
349 struct ubifs_info *c = wbuf->c; 379 struct ubifs_info *c = wbuf->c;
350 int err, dirt; 380 int err, dirt, sync_len;
351 381
352 cancel_wbuf_timer_nolock(wbuf); 382 cancel_wbuf_timer_nolock(wbuf);
353 if (!wbuf->used || wbuf->lnum == -1) 383 if (!wbuf->used || wbuf->lnum == -1)
@@ -357,27 +387,53 @@ int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
357 dbg_io("LEB %d:%d, %d bytes, jhead %s", 387 dbg_io("LEB %d:%d, %d bytes, jhead %s",
358 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead)); 388 wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));
359 ubifs_assert(!(wbuf->avail & 7)); 389 ubifs_assert(!(wbuf->avail & 7));
360 ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size); 390 ubifs_assert(wbuf->offs + wbuf->size <= c->leb_size);
391 ubifs_assert(wbuf->size >= c->min_io_size);
392 ubifs_assert(wbuf->size <= c->max_write_size);
393 ubifs_assert(wbuf->size % c->min_io_size == 0);
361 ubifs_assert(!c->ro_media && !c->ro_mount); 394 ubifs_assert(!c->ro_media && !c->ro_mount);
395 if (c->leb_size - wbuf->offs >= c->max_write_size)
396 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
362 397
363 if (c->ro_error) 398 if (c->ro_error)
364 return -EROFS; 399 return -EROFS;
365 400
366 ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail); 401 /*
402 * Do not write whole write buffer but write only the minimum necessary
403 * amount of min. I/O units.
404 */
405 sync_len = ALIGN(wbuf->used, c->min_io_size);
406 dirt = sync_len - wbuf->used;
407 if (dirt)
408 ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
367 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs, 409 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
368 c->min_io_size, wbuf->dtype); 410 sync_len, wbuf->dtype);
369 if (err) { 411 if (err) {
370 ubifs_err("cannot write %d bytes to LEB %d:%d", 412 ubifs_err("cannot write %d bytes to LEB %d:%d",
371 c->min_io_size, wbuf->lnum, wbuf->offs); 413 sync_len, wbuf->lnum, wbuf->offs);
372 dbg_dump_stack(); 414 dbg_dump_stack();
373 return err; 415 return err;
374 } 416 }
375 417
376 dirt = wbuf->avail;
377
378 spin_lock(&wbuf->lock); 418 spin_lock(&wbuf->lock);
379 wbuf->offs += c->min_io_size; 419 wbuf->offs += sync_len;
380 wbuf->avail = c->min_io_size; 420 /*
421 * Now @wbuf->offs is not necessarily aligned to @c->max_write_size.
422 * But our goal is to optimize writes and make sure we write in
423 * @c->max_write_size chunks and to @c->max_write_size-aligned offset.
424 * Thus, if @wbuf->offs is not aligned to @c->max_write_size now, make
425 * sure that @wbuf->offs + @wbuf->size is aligned to
426 * @c->max_write_size. This way we make sure that after next
427 * write-buffer flush we are again at the optimal offset (aligned to
428 * @c->max_write_size).
429 */
430 if (c->leb_size - wbuf->offs < c->max_write_size)
431 wbuf->size = c->leb_size - wbuf->offs;
432 else if (wbuf->offs & (c->max_write_size - 1))
433 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
434 else
435 wbuf->size = c->max_write_size;
436 wbuf->avail = wbuf->size;
381 wbuf->used = 0; 437 wbuf->used = 0;
382 wbuf->next_ino = 0; 438 wbuf->next_ino = 0;
383 spin_unlock(&wbuf->lock); 439 spin_unlock(&wbuf->lock);
@@ -420,7 +476,13 @@ int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
420 spin_lock(&wbuf->lock); 476 spin_lock(&wbuf->lock);
421 wbuf->lnum = lnum; 477 wbuf->lnum = lnum;
422 wbuf->offs = offs; 478 wbuf->offs = offs;
423 wbuf->avail = c->min_io_size; 479 if (c->leb_size - wbuf->offs < c->max_write_size)
480 wbuf->size = c->leb_size - wbuf->offs;
481 else if (wbuf->offs & (c->max_write_size - 1))
482 wbuf->size = ALIGN(wbuf->offs, c->max_write_size) - wbuf->offs;
483 else
484 wbuf->size = c->max_write_size;
485 wbuf->avail = wbuf->size;
424 wbuf->used = 0; 486 wbuf->used = 0;
425 spin_unlock(&wbuf->lock); 487 spin_unlock(&wbuf->lock);
426 wbuf->dtype = dtype; 488 wbuf->dtype = dtype;
@@ -500,8 +562,9 @@ out_timers:
500 * 562 *
501 * This function writes data to flash via write-buffer @wbuf. This means that 563 * This function writes data to flash via write-buffer @wbuf. This means that
502 * the last piece of the node won't reach the flash media immediately if it 564 * the last piece of the node won't reach the flash media immediately if it
503 * does not take whole minimal I/O unit. Instead, the node will sit in RAM 565 * does not take whole max. write unit (@c->max_write_size). Instead, the node
504 * until the write-buffer is synchronized (e.g., by timer). 566 * will sit in RAM until the write-buffer is synchronized (e.g., by timer, or
567 * because more data are appended to the write-buffer).
505 * 568 *
506 * This function returns zero in case of success and a negative error code in 569 * This function returns zero in case of success and a negative error code in
507 * case of failure. If the node cannot be written because there is no more 570 * case of failure. If the node cannot be written because there is no more
@@ -518,9 +581,14 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
518 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt); 581 ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
519 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0); 582 ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
520 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size); 583 ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
521 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size); 584 ubifs_assert(wbuf->avail > 0 && wbuf->avail <= wbuf->size);
585 ubifs_assert(wbuf->size >= c->min_io_size);
586 ubifs_assert(wbuf->size <= c->max_write_size);
587 ubifs_assert(wbuf->size % c->min_io_size == 0);
522 ubifs_assert(mutex_is_locked(&wbuf->io_mutex)); 588 ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
523 ubifs_assert(!c->ro_media && !c->ro_mount); 589 ubifs_assert(!c->ro_media && !c->ro_mount);
590 if (c->leb_size - wbuf->offs >= c->max_write_size)
591 ubifs_assert(!((wbuf->offs + wbuf->size) % c->max_write_size ));
524 592
525 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) { 593 if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
526 err = -ENOSPC; 594 err = -ENOSPC;
@@ -543,14 +611,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
543 dbg_io("flush jhead %s wbuf to LEB %d:%d", 611 dbg_io("flush jhead %s wbuf to LEB %d:%d",
544 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs); 612 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
545 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, 613 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
546 wbuf->offs, c->min_io_size, 614 wbuf->offs, wbuf->size,
547 wbuf->dtype); 615 wbuf->dtype);
548 if (err) 616 if (err)
549 goto out; 617 goto out;
550 618
551 spin_lock(&wbuf->lock); 619 spin_lock(&wbuf->lock);
552 wbuf->offs += c->min_io_size; 620 wbuf->offs += wbuf->size;
553 wbuf->avail = c->min_io_size; 621 if (c->leb_size - wbuf->offs >= c->max_write_size)
622 wbuf->size = c->max_write_size;
623 else
624 wbuf->size = c->leb_size - wbuf->offs;
625 wbuf->avail = wbuf->size;
554 wbuf->used = 0; 626 wbuf->used = 0;
555 wbuf->next_ino = 0; 627 wbuf->next_ino = 0;
556 spin_unlock(&wbuf->lock); 628 spin_unlock(&wbuf->lock);
@@ -564,33 +636,57 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
564 goto exit; 636 goto exit;
565 } 637 }
566 638
567 /* 639 offs = wbuf->offs;
568 * The node is large enough and does not fit entirely within current 640 written = 0;
569 * minimal I/O unit. We have to fill and flush write-buffer and switch
570 * to the next min. I/O unit.
571 */
572 dbg_io("flush jhead %s wbuf to LEB %d:%d",
573 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
574 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
575 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
576 c->min_io_size, wbuf->dtype);
577 if (err)
578 goto out;
579 641
580 offs = wbuf->offs + c->min_io_size; 642 if (wbuf->used) {
581 len -= wbuf->avail; 643 /*
582 aligned_len -= wbuf->avail; 644 * The node is large enough and does not fit entirely within
583 written = wbuf->avail; 645 * current available space. We have to fill and flush
646 * write-buffer and switch to the next max. write unit.
647 */
648 dbg_io("flush jhead %s wbuf to LEB %d:%d",
649 dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
650 memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
651 err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
652 wbuf->size, wbuf->dtype);
653 if (err)
654 goto out;
655
656 offs += wbuf->size;
657 len -= wbuf->avail;
658 aligned_len -= wbuf->avail;
659 written += wbuf->avail;
660 } else if (wbuf->offs & (c->max_write_size - 1)) {
661 /*
662 * The write-buffer offset is not aligned to
663 * @c->max_write_size and @wbuf->size is less than
664 * @c->max_write_size. Write @wbuf->size bytes to make sure the
665 * following writes are done in optimal @c->max_write_size
666 * chunks.
667 */
668 dbg_io("write %d bytes to LEB %d:%d",
669 wbuf->size, wbuf->lnum, wbuf->offs);
670 err = ubi_leb_write(c->ubi, wbuf->lnum, buf, wbuf->offs,
671 wbuf->size, wbuf->dtype);
672 if (err)
673 goto out;
674
675 offs += wbuf->size;
676 len -= wbuf->size;
677 aligned_len -= wbuf->size;
678 written += wbuf->size;
679 }
584 680
585 /* 681 /*
586 * The remaining data may take more whole min. I/O units, so write the 682 * The remaining data may take more whole max. write units, so write the
587 * remains multiple to min. I/O unit size directly to the flash media. 683 * remains multiple to max. write unit size directly to the flash media.
588 * We align node length to 8-byte boundary because we anyway flash wbuf 684 * We align node length to 8-byte boundary because we anyway flash wbuf
589 * if the remaining space is less than 8 bytes. 685 * if the remaining space is less than 8 bytes.
590 */ 686 */
591 n = aligned_len >> c->min_io_shift; 687 n = aligned_len >> c->max_write_shift;
592 if (n) { 688 if (n) {
593 n <<= c->min_io_shift; 689 n <<= c->max_write_shift;
594 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs); 690 dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
595 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n, 691 err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
596 wbuf->dtype); 692 wbuf->dtype);
@@ -606,14 +702,18 @@ int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
606 if (aligned_len) 702 if (aligned_len)
607 /* 703 /*
608 * And now we have what's left and what does not take whole 704 * And now we have what's left and what does not take whole
609 * min. I/O unit, so write it to the write-buffer and we are 705 * max. write unit, so write it to the write-buffer and we are
610 * done. 706 * done.
611 */ 707 */
612 memcpy(wbuf->buf, buf + written, len); 708 memcpy(wbuf->buf, buf + written, len);
613 709
614 wbuf->offs = offs; 710 wbuf->offs = offs;
711 if (c->leb_size - wbuf->offs >= c->max_write_size)
712 wbuf->size = c->max_write_size;
713 else
714 wbuf->size = c->leb_size - wbuf->offs;
715 wbuf->avail = wbuf->size - aligned_len;
615 wbuf->used = aligned_len; 716 wbuf->used = aligned_len;
616 wbuf->avail = c->min_io_size - aligned_len;
617 wbuf->next_ino = 0; 717 wbuf->next_ino = 0;
618 spin_unlock(&wbuf->lock); 718 spin_unlock(&wbuf->lock);
619 719
@@ -837,11 +937,11 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
837{ 937{
838 size_t size; 938 size_t size;
839 939
840 wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); 940 wbuf->buf = kmalloc(c->max_write_size, GFP_KERNEL);
841 if (!wbuf->buf) 941 if (!wbuf->buf)
842 return -ENOMEM; 942 return -ENOMEM;
843 943
844 size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); 944 size = (c->max_write_size / UBIFS_CH_SZ + 1) * sizeof(ino_t);
845 wbuf->inodes = kmalloc(size, GFP_KERNEL); 945 wbuf->inodes = kmalloc(size, GFP_KERNEL);
846 if (!wbuf->inodes) { 946 if (!wbuf->inodes) {
847 kfree(wbuf->buf); 947 kfree(wbuf->buf);
@@ -851,7 +951,14 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf)
851 951
852 wbuf->used = 0; 952 wbuf->used = 0;
853 wbuf->lnum = wbuf->offs = -1; 953 wbuf->lnum = wbuf->offs = -1;
854 wbuf->avail = c->min_io_size; 954 /*
955 * If the LEB starts at the max. write size aligned address, then
956 * write-buffer size has to be set to @c->max_write_size. Otherwise,
957 * set it to something smaller so that it ends at the closest max.
958 * write size boundary.
959 */
960 size = c->max_write_size - (c->leb_start % c->max_write_size);
961 wbuf->avail = wbuf->size = size;
855 wbuf->dtype = UBI_UNKNOWN; 962 wbuf->dtype = UBI_UNKNOWN;
856 wbuf->sync_callback = NULL; 963 wbuf->sync_callback = NULL;
857 mutex_init(&wbuf->io_mutex); 964 mutex_init(&wbuf->io_mutex);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 8aacd64957a2..548acf494afd 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -160,7 +160,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
160 if (IS_RDONLY(inode)) 160 if (IS_RDONLY(inode))
161 return -EROFS; 161 return -EROFS;
162 162
163 if (!is_owner_or_cap(inode)) 163 if (!inode_owner_or_capable(inode))
164 return -EACCES; 164 return -EACCES;
165 165
166 if (get_user(flags, (int __user *) arg)) 166 if (get_user(flags, (int __user *) arg))
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 914f1bd89e57..aed25e864227 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -690,7 +690,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
690{ 690{
691 struct ubifs_data_node *data; 691 struct ubifs_data_node *data;
692 int err, lnum, offs, compr_type, out_len; 692 int err, lnum, offs, compr_type, out_len;
693 int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; 693 int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
694 struct ubifs_inode *ui = ubifs_inode(inode); 694 struct ubifs_inode *ui = ubifs_inode(inode);
695 695
696 dbg_jnl("ino %lu, blk %u, len %d, key %s", 696 dbg_jnl("ino %lu, blk %u, len %d, key %s",
@@ -698,9 +698,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
698 DBGKEY(key)); 698 DBGKEY(key));
699 ubifs_assert(len <= UBIFS_BLOCK_SIZE); 699 ubifs_assert(len <= UBIFS_BLOCK_SIZE);
700 700
701 data = kmalloc(dlen, GFP_NOFS); 701 data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
702 if (!data) 702 if (!data) {
703 return -ENOMEM; 703 /*
704 * Fall-back to the write reserve buffer. Note, we might be
705 * currently on the memory reclaim path, when the kernel is
706 * trying to free some memory by writing out dirty pages. The
707 * write reserve buffer helps us to guarantee that we are
708 * always able to write the data.
709 */
710 allocated = 0;
711 mutex_lock(&c->write_reserve_mutex);
712 data = c->write_reserve_buf;
713 }
704 714
705 data->ch.node_type = UBIFS_DATA_NODE; 715 data->ch.node_type = UBIFS_DATA_NODE;
706 key_write(c, key, &data->key); 716 key_write(c, key, &data->key);
@@ -736,7 +746,10 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
736 goto out_ro; 746 goto out_ro;
737 747
738 finish_reservation(c); 748 finish_reservation(c);
739 kfree(data); 749 if (!allocated)
750 mutex_unlock(&c->write_reserve_mutex);
751 else
752 kfree(data);
740 return 0; 753 return 0;
741 754
742out_release: 755out_release:
@@ -745,7 +758,10 @@ out_ro:
745 ubifs_ro_mode(c, err); 758 ubifs_ro_mode(c, err);
746 finish_reservation(c); 759 finish_reservation(c);
747out_free: 760out_free:
748 kfree(data); 761 if (!allocated)
762 mutex_unlock(&c->write_reserve_mutex);
763 else
764 kfree(data);
749 return err; 765 return err;
750} 766}
751 767
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 4d4ca388889b..0ee0847f2421 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1035,7 +1035,8 @@ static int scan_check_cb(struct ubifs_info *c,
1035 struct ubifs_scan_leb *sleb; 1035 struct ubifs_scan_leb *sleb;
1036 struct ubifs_scan_node *snod; 1036 struct ubifs_scan_node *snod;
1037 struct ubifs_lp_stats *lst = &data->lst; 1037 struct ubifs_lp_stats *lst = &data->lst;
1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; 1038 int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
1039 void *buf = NULL;
1039 1040
1040 cat = lp->flags & LPROPS_CAT_MASK; 1041 cat = lp->flags & LPROPS_CAT_MASK;
1041 if (cat != LPROPS_UNCAT) { 1042 if (cat != LPROPS_UNCAT) {
@@ -1093,7 +1094,13 @@ static int scan_check_cb(struct ubifs_info *c,
1093 } 1094 }
1094 } 1095 }
1095 1096
1096 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 1097 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1098 if (!buf) {
1099 ubifs_err("cannot allocate memory to scan LEB %d", lnum);
1100 goto out;
1101 }
1102
1103 sleb = ubifs_scan(c, lnum, 0, buf, 0);
1097 if (IS_ERR(sleb)) { 1104 if (IS_ERR(sleb)) {
1098 /* 1105 /*
1099 * After an unclean unmount, empty and freeable LEBs 1106 * After an unclean unmount, empty and freeable LEBs
@@ -1105,7 +1112,8 @@ static int scan_check_cb(struct ubifs_info *c,
1105 lst->empty_lebs += 1; 1112 lst->empty_lebs += 1;
1106 lst->total_free += c->leb_size; 1113 lst->total_free += c->leb_size;
1107 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1114 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1108 return LPT_SCAN_CONTINUE; 1115 ret = LPT_SCAN_CONTINUE;
1116 goto exit;
1109 } 1117 }
1110 1118
1111 if (lp->free + lp->dirty == c->leb_size && 1119 if (lp->free + lp->dirty == c->leb_size &&
@@ -1115,10 +1123,12 @@ static int scan_check_cb(struct ubifs_info *c,
1115 lst->total_free += lp->free; 1123 lst->total_free += lp->free;
1116 lst->total_dirty += lp->dirty; 1124 lst->total_dirty += lp->dirty;
1117 lst->total_dark += ubifs_calc_dark(c, c->leb_size); 1125 lst->total_dark += ubifs_calc_dark(c, c->leb_size);
1118 return LPT_SCAN_CONTINUE; 1126 ret = LPT_SCAN_CONTINUE;
1127 goto exit;
1119 } 1128 }
1120 data->err = PTR_ERR(sleb); 1129 data->err = PTR_ERR(sleb);
1121 return LPT_SCAN_STOP; 1130 ret = LPT_SCAN_STOP;
1131 goto exit;
1122 } 1132 }
1123 1133
1124 is_idx = -1; 1134 is_idx = -1;
@@ -1236,7 +1246,10 @@ static int scan_check_cb(struct ubifs_info *c,
1236 } 1246 }
1237 1247
1238 ubifs_scan_destroy(sleb); 1248 ubifs_scan_destroy(sleb);
1239 return LPT_SCAN_CONTINUE; 1249 ret = LPT_SCAN_CONTINUE;
1250exit:
1251 vfree(buf);
1252 return ret;
1240 1253
1241out_print: 1254out_print:
1242 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " 1255 ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
@@ -1246,6 +1259,7 @@ out_print:
1246out_destroy: 1259out_destroy:
1247 ubifs_scan_destroy(sleb); 1260 ubifs_scan_destroy(sleb);
1248out: 1261out:
1262 vfree(buf);
1249 data->err = -EINVAL; 1263 data->err = -EINVAL;
1250 return LPT_SCAN_STOP; 1264 return LPT_SCAN_STOP;
1251} 1265}
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index 72775d35b99e..ef5155e109a2 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1270,10 +1270,9 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
1270 lnum = branch->lnum; 1270 lnum = branch->lnum;
1271 offs = branch->offs; 1271 offs = branch->offs;
1272 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS); 1272 pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
1273 if (!pnode) { 1273 if (!pnode)
1274 err = -ENOMEM; 1274 return -ENOMEM;
1275 goto out; 1275
1276 }
1277 if (lnum == 0) { 1276 if (lnum == 0) {
1278 /* 1277 /*
1279 * This pnode was not written which just means that the LEB 1278 * This pnode was not written which just means that the LEB
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 5c90dec5db0b..0c9c69bd983a 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1628,29 +1628,35 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1628{ 1628{
1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; 1629 int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
1630 int ret; 1630 int ret;
1631 void *buf = c->dbg->buf; 1631 void *buf, *p;
1632 1632
1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) 1633 if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
1634 return 0; 1634 return 0;
1635 1635
1636 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1637 if (!buf) {
1638 ubifs_err("cannot allocate memory for ltab checking");
1639 return 0;
1640 }
1641
1636 dbg_lp("LEB %d", lnum); 1642 dbg_lp("LEB %d", lnum);
1637 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1643 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1638 if (err) { 1644 if (err) {
1639 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); 1645 dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err);
1640 return err; 1646 goto out;
1641 } 1647 }
1642 while (1) { 1648 while (1) {
1643 if (!is_a_node(c, buf, len)) { 1649 if (!is_a_node(c, p, len)) {
1644 int i, pad_len; 1650 int i, pad_len;
1645 1651
1646 pad_len = get_pad_len(c, buf, len); 1652 pad_len = get_pad_len(c, p, len);
1647 if (pad_len) { 1653 if (pad_len) {
1648 buf += pad_len; 1654 p += pad_len;
1649 len -= pad_len; 1655 len -= pad_len;
1650 dirty += pad_len; 1656 dirty += pad_len;
1651 continue; 1657 continue;
1652 } 1658 }
1653 if (!dbg_is_all_ff(buf, len)) { 1659 if (!dbg_is_all_ff(p, len)) {
1654 dbg_msg("invalid empty space in LEB %d at %d", 1660 dbg_msg("invalid empty space in LEB %d at %d",
1655 lnum, c->leb_size - len); 1661 lnum, c->leb_size - len);
1656 err = -EINVAL; 1662 err = -EINVAL;
@@ -1668,16 +1674,21 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
1668 lnum, dirty, c->ltab[i].dirty); 1674 lnum, dirty, c->ltab[i].dirty);
1669 err = -EINVAL; 1675 err = -EINVAL;
1670 } 1676 }
1671 return err; 1677 goto out;
1672 } 1678 }
1673 node_type = get_lpt_node_type(c, buf, &node_num); 1679 node_type = get_lpt_node_type(c, p, &node_num);
1674 node_len = get_lpt_node_len(c, node_type); 1680 node_len = get_lpt_node_len(c, node_type);
1675 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); 1681 ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len);
1676 if (ret == 1) 1682 if (ret == 1)
1677 dirty += node_len; 1683 dirty += node_len;
1678 buf += node_len; 1684 p += node_len;
1679 len -= node_len; 1685 len -= node_len;
1680 } 1686 }
1687
1688 err = 0;
1689out:
1690 vfree(buf);
1691 return err;
1681} 1692}
1682 1693
1683/** 1694/**
@@ -1870,25 +1881,31 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1870static void dump_lpt_leb(const struct ubifs_info *c, int lnum) 1881static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1871{ 1882{
1872 int err, len = c->leb_size, node_type, node_num, node_len, offs; 1883 int err, len = c->leb_size, node_type, node_num, node_len, offs;
1873 void *buf = c->dbg->buf; 1884 void *buf, *p;
1874 1885
1875 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", 1886 printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
1876 current->pid, lnum); 1887 current->pid, lnum);
1888 buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
1889 if (!buf) {
1890 ubifs_err("cannot allocate memory to dump LPT");
1891 return;
1892 }
1893
1877 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); 1894 err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
1878 if (err) { 1895 if (err) {
1879 ubifs_err("cannot read LEB %d, error %d", lnum, err); 1896 ubifs_err("cannot read LEB %d, error %d", lnum, err);
1880 return; 1897 goto out;
1881 } 1898 }
1882 while (1) { 1899 while (1) {
1883 offs = c->leb_size - len; 1900 offs = c->leb_size - len;
1884 if (!is_a_node(c, buf, len)) { 1901 if (!is_a_node(c, p, len)) {
1885 int pad_len; 1902 int pad_len;
1886 1903
1887 pad_len = get_pad_len(c, buf, len); 1904 pad_len = get_pad_len(c, p, len);
1888 if (pad_len) { 1905 if (pad_len) {
1889 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", 1906 printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
1890 lnum, offs, pad_len); 1907 lnum, offs, pad_len);
1891 buf += pad_len; 1908 p += pad_len;
1892 len -= pad_len; 1909 len -= pad_len;
1893 continue; 1910 continue;
1894 } 1911 }
@@ -1898,7 +1915,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1898 break; 1915 break;
1899 } 1916 }
1900 1917
1901 node_type = get_lpt_node_type(c, buf, &node_num); 1918 node_type = get_lpt_node_type(c, p, &node_num);
1902 switch (node_type) { 1919 switch (node_type) {
1903 case UBIFS_LPT_PNODE: 1920 case UBIFS_LPT_PNODE:
1904 { 1921 {
@@ -1923,7 +1940,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1923 else 1940 else
1924 printk(KERN_DEBUG "LEB %d:%d, nnode, ", 1941 printk(KERN_DEBUG "LEB %d:%d, nnode, ",
1925 lnum, offs); 1942 lnum, offs);
1926 err = ubifs_unpack_nnode(c, buf, &nnode); 1943 err = ubifs_unpack_nnode(c, p, &nnode);
1927 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1944 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1928 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, 1945 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1929 nnode.nbranch[i].offs); 1946 nnode.nbranch[i].offs);
@@ -1944,15 +1961,18 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1944 break; 1961 break;
1945 default: 1962 default:
1946 ubifs_err("LPT node type %d not recognized", node_type); 1963 ubifs_err("LPT node type %d not recognized", node_type);
1947 return; 1964 goto out;
1948 } 1965 }
1949 1966
1950 buf += node_len; 1967 p += node_len;
1951 len -= node_len; 1968 len -= node_len;
1952 } 1969 }
1953 1970
1954 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", 1971 printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
1955 current->pid, lnum); 1972 current->pid, lnum);
1973out:
1974 vfree(buf);
1975 return;
1956} 1976}
1957 1977
1958/** 1978/**
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 82009c74b6a3..09df318e368f 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -892,15 +892,22 @@ static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb)
892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) 892static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
893{ 893{
894 int lnum, err = 0; 894 int lnum, err = 0;
895 void *buf;
895 896
896 /* Check no-orphans flag and skip this if no orphans */ 897 /* Check no-orphans flag and skip this if no orphans */
897 if (c->no_orphs) 898 if (c->no_orphs)
898 return 0; 899 return 0;
899 900
901 buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
902 if (!buf) {
903 ubifs_err("cannot allocate memory to check orphans");
904 return 0;
905 }
906
900 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { 907 for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
901 struct ubifs_scan_leb *sleb; 908 struct ubifs_scan_leb *sleb;
902 909
903 sleb = ubifs_scan(c, lnum, 0, c->dbg->buf, 0); 910 sleb = ubifs_scan(c, lnum, 0, buf, 0);
904 if (IS_ERR(sleb)) { 911 if (IS_ERR(sleb)) {
905 err = PTR_ERR(sleb); 912 err = PTR_ERR(sleb);
906 break; 913 break;
@@ -912,6 +919,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
912 break; 919 break;
913 } 920 }
914 921
922 vfree(buf);
915 return err; 923 return err;
916} 924}
917 925
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 77e9b874b6c2..936f2cbfe6b6 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -28,6 +28,23 @@
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
30 * read-only, and the flash is not modified in that case. 30 * read-only, and the flash is not modified in that case.
31 *
32 * The general UBIFS approach to the recovery is that it recovers from
33 * corruptions which could be caused by power cuts, but it refuses to recover
34 * from corruption caused by other reasons. And UBIFS tries to distinguish
35 * between these 2 reasons of corruptions and silently recover in the former
36 * case and loudly complain in the latter case.
37 *
38 * UBIFS writes only to erased LEBs, so it writes only to the flash space
39 * containing only 0xFFs. UBIFS also always writes strictly from the beginning
40 * of the LEB to the end. And UBIFS assumes that the underlying flash media
41 * writes in @c->max_write_size bytes at a time.
42 *
43 * Hence, if UBIFS finds a corrupted node at offset X, it expects only the min.
44 * I/O unit corresponding to offset X to contain corrupted data, all the
45 * following min. I/O units have to contain empty space (all 0xFFs). If this is
46 * not true, the corruption cannot be the result of a power cut, and UBIFS
47 * refuses to mount.
31 */ 48 */
32 49
33#include <linux/crc32.h> 50#include <linux/crc32.h>
@@ -362,8 +379,9 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c)
362 * @offs: offset to check 379 * @offs: offset to check
363 * 380 *
364 * This function returns %1 if @offs was in the last write to the LEB whose data 381 * This function returns %1 if @offs was in the last write to the LEB whose data
365 * is in @buf, otherwise %0 is returned. The determination is made by checking 382 * is in @buf, otherwise %0 is returned. The determination is made by checking
366 * for subsequent empty space starting from the next @c->min_io_size boundary. 383 * for subsequent empty space starting from the next @c->max_write_size
384 * boundary.
367 */ 385 */
368static int is_last_write(const struct ubifs_info *c, void *buf, int offs) 386static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
369{ 387{
@@ -371,10 +389,10 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs)
371 uint8_t *p; 389 uint8_t *p;
372 390
373 /* 391 /*
374 * Round up to the next @c->min_io_size boundary i.e. @offs is in the 392 * Round up to the next @c->max_write_size boundary i.e. @offs is in
375 * last wbuf written. After that should be empty space. 393 * the last wbuf written. After that should be empty space.
376 */ 394 */
377 empty_offs = ALIGN(offs + 1, c->min_io_size); 395 empty_offs = ALIGN(offs + 1, c->max_write_size);
378 check_len = c->leb_size - empty_offs; 396 check_len = c->leb_size - empty_offs;
379 p = buf + empty_offs - offs; 397 p = buf + empty_offs - offs;
380 return is_empty(p, check_len); 398 return is_empty(p, check_len);
@@ -429,7 +447,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
429 int skip, dlen = le32_to_cpu(ch->len); 447 int skip, dlen = le32_to_cpu(ch->len);
430 448
431 /* Check for empty space after the corrupt node's common header */ 449 /* Check for empty space after the corrupt node's common header */
432 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; 450 skip = ALIGN(offs + UBIFS_CH_SZ, c->max_write_size) - offs;
433 if (is_empty(buf + skip, len - skip)) 451 if (is_empty(buf + skip, len - skip))
434 return 1; 452 return 1;
435 /* 453 /*
@@ -441,7 +459,7 @@ static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
441 return 0; 459 return 0;
442 } 460 }
443 /* Now we know the corrupt node's length we can skip over it */ 461 /* Now we know the corrupt node's length we can skip over it */
444 skip = ALIGN(offs + dlen, c->min_io_size) - offs; 462 skip = ALIGN(offs + dlen, c->max_write_size) - offs;
445 /* After which there should be empty space */ 463 /* After which there should be empty space */
446 if (is_empty(buf + skip, len - skip)) 464 if (is_empty(buf + skip, len - skip))
447 return 1; 465 return 1;
@@ -671,10 +689,14 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
671 } else { 689 } else {
672 int corruption = first_non_ff(buf, len); 690 int corruption = first_non_ff(buf, len);
673 691
692 /*
693 * See header comment for this file for more
694 * explanations about the reasons we have this check.
695 */
674 ubifs_err("corrupt empty space LEB %d:%d, corruption " 696 ubifs_err("corrupt empty space LEB %d:%d, corruption "
675 "starts at %d", lnum, offs, corruption); 697 "starts at %d", lnum, offs, corruption);
676 /* Make sure we dump interesting non-0xFF data */ 698 /* Make sure we dump interesting non-0xFF data */
677 offs = corruption; 699 offs += corruption;
678 buf += corruption; 700 buf += corruption;
679 goto corrupted; 701 goto corrupted;
680 } 702 }
@@ -836,12 +858,8 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum,
836static int recover_head(const struct ubifs_info *c, int lnum, int offs, 858static int recover_head(const struct ubifs_info *c, int lnum, int offs,
837 void *sbuf) 859 void *sbuf)
838{ 860{
839 int len, err; 861 int len = c->max_write_size, err;
840 862
841 if (c->min_io_size > 1)
842 len = c->min_io_size;
843 else
844 len = 512;
845 if (offs + len > c->leb_size) 863 if (offs + len > c->leb_size)
846 len = c->leb_size - offs; 864 len = c->leb_size - offs;
847 865
diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c
index 3e1ee57dbeaa..36216b46f772 100644
--- a/fs/ubifs/scan.c
+++ b/fs/ubifs/scan.c
@@ -328,7 +328,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum,
328 if (!quiet) 328 if (!quiet)
329 ubifs_err("empty space starts at non-aligned offset %d", 329 ubifs_err("empty space starts at non-aligned offset %d",
330 offs); 330 offs);
331 goto corrupted;; 331 goto corrupted;
332 } 332 }
333 333
334 ubifs_end_scan(c, sleb, lnum, offs); 334 ubifs_end_scan(c, sleb, lnum, offs);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6e11c2975dcf..c75f6133206c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -512,9 +512,12 @@ static int init_constants_early(struct ubifs_info *c)
512 512
513 c->leb_cnt = c->vi.size; 513 c->leb_cnt = c->vi.size;
514 c->leb_size = c->vi.usable_leb_size; 514 c->leb_size = c->vi.usable_leb_size;
515 c->leb_start = c->di.leb_start;
515 c->half_leb_size = c->leb_size / 2; 516 c->half_leb_size = c->leb_size / 2;
516 c->min_io_size = c->di.min_io_size; 517 c->min_io_size = c->di.min_io_size;
517 c->min_io_shift = fls(c->min_io_size) - 1; 518 c->min_io_shift = fls(c->min_io_size) - 1;
519 c->max_write_size = c->di.max_write_size;
520 c->max_write_shift = fls(c->max_write_size) - 1;
518 521
519 if (c->leb_size < UBIFS_MIN_LEB_SZ) { 522 if (c->leb_size < UBIFS_MIN_LEB_SZ) {
520 ubifs_err("too small LEBs (%d bytes), min. is %d bytes", 523 ubifs_err("too small LEBs (%d bytes), min. is %d bytes",
@@ -534,6 +537,18 @@ static int init_constants_early(struct ubifs_info *c)
534 } 537 }
535 538
536 /* 539 /*
540 * Maximum write size has to be greater or equivalent to min. I/O
541 * size, and be multiple of min. I/O size.
542 */
543 if (c->max_write_size < c->min_io_size ||
544 c->max_write_size % c->min_io_size ||
545 !is_power_of_2(c->max_write_size)) {
546 ubifs_err("bad write buffer size %d for %d min. I/O unit",
547 c->max_write_size, c->min_io_size);
548 return -EINVAL;
549 }
550
551 /*
537 * UBIFS aligns all node to 8-byte boundary, so to make function in 552 * UBIFS aligns all node to 8-byte boundary, so to make function in
538 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is 553 * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is
539 * less than 8. 554 * less than 8.
@@ -541,6 +556,10 @@ static int init_constants_early(struct ubifs_info *c)
541 if (c->min_io_size < 8) { 556 if (c->min_io_size < 8) {
542 c->min_io_size = 8; 557 c->min_io_size = 8;
543 c->min_io_shift = 3; 558 c->min_io_shift = 3;
559 if (c->max_write_size < c->min_io_size) {
560 c->max_write_size = c->min_io_size;
561 c->max_write_shift = c->min_io_shift;
562 }
544 } 563 }
545 564
546 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); 565 c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size);
@@ -1202,11 +1221,14 @@ static int mount_ubifs(struct ubifs_info *c)
1202 if (c->bulk_read == 1) 1221 if (c->bulk_read == 1)
1203 bu_init(c); 1222 bu_init(c);
1204 1223
1205 /* 1224 if (!c->ro_mount) {
1206 * We have to check all CRCs, even for data nodes, when we mount the FS 1225 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ,
1207 * (specifically, when we are replaying). 1226 GFP_KERNEL);
1208 */ 1227 if (!c->write_reserve_buf)
1209 c->always_chk_crc = 1; 1228 goto out_free;
1229 }
1230
1231 c->mounting = 1;
1210 1232
1211 err = ubifs_read_superblock(c); 1233 err = ubifs_read_superblock(c);
1212 if (err) 1234 if (err)
@@ -1382,7 +1404,7 @@ static int mount_ubifs(struct ubifs_info *c)
1382 if (err) 1404 if (err)
1383 goto out_infos; 1405 goto out_infos;
1384 1406
1385 c->always_chk_crc = 0; 1407 c->mounting = 0;
1386 1408
1387 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", 1409 ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
1388 c->vi.ubi_num, c->vi.vol_id, c->vi.name); 1410 c->vi.ubi_num, c->vi.vol_id, c->vi.name);
@@ -1403,6 +1425,7 @@ static int mount_ubifs(struct ubifs_info *c)
1403 1425
1404 dbg_msg("compiled on: " __DATE__ " at " __TIME__); 1426 dbg_msg("compiled on: " __DATE__ " at " __TIME__);
1405 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size); 1427 dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);
1428 dbg_msg("max. write size: %d bytes", c->max_write_size);
1406 dbg_msg("LEB size: %d bytes (%d KiB)", 1429 dbg_msg("LEB size: %d bytes (%d KiB)",
1407 c->leb_size, c->leb_size >> 10); 1430 c->leb_size, c->leb_size >> 10);
1408 dbg_msg("data journal heads: %d", 1431 dbg_msg("data journal heads: %d",
@@ -1432,9 +1455,9 @@ static int mount_ubifs(struct ubifs_info *c)
1432 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); 1455 UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
1433 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", 1456 dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
1434 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); 1457 UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
1435 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu", 1458 dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu, idx %d",
1436 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, 1459 UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
1437 UBIFS_MAX_DENT_NODE_SZ); 1460 UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
1438 dbg_msg("dead watermark: %d", c->dead_wm); 1461 dbg_msg("dead watermark: %d", c->dead_wm);
1439 dbg_msg("dark watermark: %d", c->dark_wm); 1462 dbg_msg("dark watermark: %d", c->dark_wm);
1440 dbg_msg("LEB overhead: %d", c->leb_overhead); 1463 dbg_msg("LEB overhead: %d", c->leb_overhead);
@@ -1474,6 +1497,7 @@ out_wbufs:
1474out_cbuf: 1497out_cbuf:
1475 kfree(c->cbuf); 1498 kfree(c->cbuf);
1476out_free: 1499out_free:
1500 kfree(c->write_reserve_buf);
1477 kfree(c->bu.buf); 1501 kfree(c->bu.buf);
1478 vfree(c->ileb_buf); 1502 vfree(c->ileb_buf);
1479 vfree(c->sbuf); 1503 vfree(c->sbuf);
@@ -1512,6 +1536,7 @@ static void ubifs_umount(struct ubifs_info *c)
1512 kfree(c->cbuf); 1536 kfree(c->cbuf);
1513 kfree(c->rcvrd_mst_node); 1537 kfree(c->rcvrd_mst_node);
1514 kfree(c->mst_node); 1538 kfree(c->mst_node);
1539 kfree(c->write_reserve_buf);
1515 kfree(c->bu.buf); 1540 kfree(c->bu.buf);
1516 vfree(c->ileb_buf); 1541 vfree(c->ileb_buf);
1517 vfree(c->sbuf); 1542 vfree(c->sbuf);
@@ -1543,7 +1568,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1543 mutex_lock(&c->umount_mutex); 1568 mutex_lock(&c->umount_mutex);
1544 dbg_save_space_info(c); 1569 dbg_save_space_info(c);
1545 c->remounting_rw = 1; 1570 c->remounting_rw = 1;
1546 c->always_chk_crc = 1; 1571 c->ro_mount = 0;
1547 1572
1548 err = check_free_space(c); 1573 err = check_free_space(c);
1549 if (err) 1574 if (err)
@@ -1598,6 +1623,10 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1598 goto out; 1623 goto out;
1599 } 1624 }
1600 1625
1626 c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL);
1627 if (!c->write_reserve_buf)
1628 goto out;
1629
1601 err = ubifs_lpt_init(c, 0, 1); 1630 err = ubifs_lpt_init(c, 0, 1);
1602 if (err) 1631 if (err)
1603 goto out; 1632 goto out;
@@ -1648,14 +1677,13 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1648 } 1677 }
1649 1678
1650 dbg_gen("re-mounted read-write"); 1679 dbg_gen("re-mounted read-write");
1651 c->ro_mount = 0;
1652 c->remounting_rw = 0; 1680 c->remounting_rw = 0;
1653 c->always_chk_crc = 0;
1654 err = dbg_check_space_info(c); 1681 err = dbg_check_space_info(c);
1655 mutex_unlock(&c->umount_mutex); 1682 mutex_unlock(&c->umount_mutex);
1656 return err; 1683 return err;
1657 1684
1658out: 1685out:
1686 c->ro_mount = 1;
1659 vfree(c->orph_buf); 1687 vfree(c->orph_buf);
1660 c->orph_buf = NULL; 1688 c->orph_buf = NULL;
1661 if (c->bgt) { 1689 if (c->bgt) {
@@ -1663,11 +1691,12 @@ out:
1663 c->bgt = NULL; 1691 c->bgt = NULL;
1664 } 1692 }
1665 free_wbufs(c); 1693 free_wbufs(c);
1694 kfree(c->write_reserve_buf);
1695 c->write_reserve_buf = NULL;
1666 vfree(c->ileb_buf); 1696 vfree(c->ileb_buf);
1667 c->ileb_buf = NULL; 1697 c->ileb_buf = NULL;
1668 ubifs_lpt_free(c, 1); 1698 ubifs_lpt_free(c, 1);
1669 c->remounting_rw = 0; 1699 c->remounting_rw = 0;
1670 c->always_chk_crc = 0;
1671 mutex_unlock(&c->umount_mutex); 1700 mutex_unlock(&c->umount_mutex);
1672 return err; 1701 return err;
1673} 1702}
@@ -1707,6 +1736,8 @@ static void ubifs_remount_ro(struct ubifs_info *c)
1707 free_wbufs(c); 1736 free_wbufs(c);
1708 vfree(c->orph_buf); 1737 vfree(c->orph_buf);
1709 c->orph_buf = NULL; 1738 c->orph_buf = NULL;
1739 kfree(c->write_reserve_buf);
1740 c->write_reserve_buf = NULL;
1710 vfree(c->ileb_buf); 1741 vfree(c->ileb_buf);
1711 c->ileb_buf = NULL; 1742 c->ileb_buf = NULL;
1712 ubifs_lpt_free(c, 1); 1743 ubifs_lpt_free(c, 1);
@@ -1937,6 +1968,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1937 mutex_init(&c->mst_mutex); 1968 mutex_init(&c->mst_mutex);
1938 mutex_init(&c->umount_mutex); 1969 mutex_init(&c->umount_mutex);
1939 mutex_init(&c->bu_mutex); 1970 mutex_init(&c->bu_mutex);
1971 mutex_init(&c->write_reserve_mutex);
1940 init_waitqueue_head(&c->cmt_wq); 1972 init_waitqueue_head(&c->cmt_wq);
1941 c->buds = RB_ROOT; 1973 c->buds = RB_ROOT;
1942 c->old_idx = RB_ROOT; 1974 c->old_idx = RB_ROOT;
@@ -1954,6 +1986,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1954 INIT_LIST_HEAD(&c->old_buds); 1986 INIT_LIST_HEAD(&c->old_buds);
1955 INIT_LIST_HEAD(&c->orph_list); 1987 INIT_LIST_HEAD(&c->orph_list);
1956 INIT_LIST_HEAD(&c->orph_new); 1988 INIT_LIST_HEAD(&c->orph_new);
1989 c->no_chk_data_crc = 1;
1957 1990
1958 c->vfs_sb = sb; 1991 c->vfs_sb = sb;
1959 c->highest_inum = UBIFS_FIRST_INO; 1992 c->highest_inum = UBIFS_FIRST_INO;
@@ -1979,7 +2012,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
1979 */ 2012 */
1980 c->bdi.name = "ubifs", 2013 c->bdi.name = "ubifs",
1981 c->bdi.capabilities = BDI_CAP_MAP_COPY; 2014 c->bdi.capabilities = BDI_CAP_MAP_COPY;
1982 c->bdi.unplug_io_fn = default_unplug_io_fn;
1983 err = bdi_init(&c->bdi); 2015 err = bdi_init(&c->bdi);
1984 if (err) 2016 if (err)
1985 goto out_close; 2017 goto out_close;
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index ad9cf0133622..de485979ca39 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -447,8 +447,11 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr,
447 * 447 *
448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc 448 * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc
449 * is true (it is controlled by corresponding mount option). However, if 449 * is true (it is controlled by corresponding mount option). However, if
450 * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always 450 * @c->mounting or @c->remounting_rw is true (we are mounting or re-mounting to
451 * checked. 451 * R/W mode), @c->no_chk_data_crc is ignored and CRC is checked. This is
452 * because during mounting or re-mounting from R/O mode to R/W mode we may read
453 * journal nodes (when replying the journal or doing the recovery) and the
454 * journal nodes may potentially be corrupted, so checking is required.
452 */ 455 */
453static int try_read_node(const struct ubifs_info *c, void *buf, int type, 456static int try_read_node(const struct ubifs_info *c, void *buf, int type,
454 int len, int lnum, int offs) 457 int len, int lnum, int offs)
@@ -476,7 +479,8 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type,
476 if (node_len != len) 479 if (node_len != len)
477 return 0; 480 return 0;
478 481
479 if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) 482 if (type == UBIFS_DATA_NODE && c->no_chk_data_crc && !c->mounting &&
483 !c->remounting_rw)
480 return 1; 484 return 1;
481 485
482 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); 486 crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 381d6b207a52..8c40ad3c6721 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -151,6 +151,12 @@
151 */ 151 */
152#define WORST_COMPR_FACTOR 2 152#define WORST_COMPR_FACTOR 2
153 153
154/*
155 * How much memory is needed for a buffer where we comress a data node.
156 */
157#define COMPRESSED_DATA_NODE_BUF_SZ \
158 (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR)
159
154/* Maximum expected tree height for use by bottom_up_buf */ 160/* Maximum expected tree height for use by bottom_up_buf */
155#define BOTTOM_UP_HEIGHT 64 161#define BOTTOM_UP_HEIGHT 64
156 162
@@ -646,6 +652,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c,
646 * @offs: write-buffer offset in this logical eraseblock 652 * @offs: write-buffer offset in this logical eraseblock
647 * @avail: number of bytes available in the write-buffer 653 * @avail: number of bytes available in the write-buffer
648 * @used: number of used bytes in the write-buffer 654 * @used: number of used bytes in the write-buffer
655 * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
649 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, 656 * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
650 * %UBI_UNKNOWN) 657 * %UBI_UNKNOWN)
651 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep 658 * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
@@ -680,6 +687,7 @@ struct ubifs_wbuf {
680 int offs; 687 int offs;
681 int avail; 688 int avail;
682 int used; 689 int used;
690 int size;
683 int dtype; 691 int dtype;
684 int jhead; 692 int jhead;
685 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); 693 int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
@@ -1003,6 +1011,11 @@ struct ubifs_debug_info;
1003 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu 1011 * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu
1004 * @bu: pre-allocated bulk-read information 1012 * @bu: pre-allocated bulk-read information
1005 * 1013 *
1014 * @write_reserve_mutex: protects @write_reserve_buf
1015 * @write_reserve_buf: on the write path we allocate memory, which might
1016 * sometimes be unavailable, in which case we use this
1017 * write reserve buffer
1018 *
1006 * @log_lebs: number of logical eraseblocks in the log 1019 * @log_lebs: number of logical eraseblocks in the log
1007 * @log_bytes: log size in bytes 1020 * @log_bytes: log size in bytes
1008 * @log_last: last LEB of the log 1021 * @log_last: last LEB of the log
@@ -1024,7 +1037,12 @@ struct ubifs_debug_info;
1024 * 1037 *
1025 * @min_io_size: minimal input/output unit size 1038 * @min_io_size: minimal input/output unit size
1026 * @min_io_shift: number of bits in @min_io_size minus one 1039 * @min_io_shift: number of bits in @min_io_size minus one
1040 * @max_write_size: maximum amount of bytes the underlying flash can write at a
1041 * time (MTD write buffer size)
1042 * @max_write_shift: number of bits in @max_write_size minus one
1027 * @leb_size: logical eraseblock size in bytes 1043 * @leb_size: logical eraseblock size in bytes
1044 * @leb_start: starting offset of logical eraseblocks within physical
1045 * eraseblocks
1028 * @half_leb_size: half LEB size 1046 * @half_leb_size: half LEB size
1029 * @idx_leb_size: how many bytes of an LEB are effectively available when it is 1047 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1030 * used to store indexing nodes (@leb_size - @max_idx_node_sz) 1048 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
@@ -1166,22 +1184,21 @@ struct ubifs_debug_info;
1166 * @rp_uid: reserved pool user ID 1184 * @rp_uid: reserved pool user ID
1167 * @rp_gid: reserved pool group ID 1185 * @rp_gid: reserved pool group ID
1168 * 1186 *
1169 * @empty: if the UBI device is empty 1187 * @empty: %1 if the UBI device is empty
1188 * @need_recovery: %1 if the file-system needs recovery
1189 * @replaying: %1 during journal replay
1190 * @mounting: %1 while mounting
1191 * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode
1170 * @replay_tree: temporary tree used during journal replay 1192 * @replay_tree: temporary tree used during journal replay
1171 * @replay_list: temporary list used during journal replay 1193 * @replay_list: temporary list used during journal replay
1172 * @replay_buds: list of buds to replay 1194 * @replay_buds: list of buds to replay
1173 * @cs_sqnum: sequence number of first node in the log (commit start node) 1195 * @cs_sqnum: sequence number of first node in the log (commit start node)
1174 * @replay_sqnum: sequence number of node currently being replayed 1196 * @replay_sqnum: sequence number of node currently being replayed
1175 * @need_recovery: file-system needs recovery
1176 * @replaying: set to %1 during journal replay
1177 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W 1197 * @unclean_leb_list: LEBs to recover when re-mounting R/O mounted FS to R/W
1178 * mode 1198 * mode
1179 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted 1199 * @rcvrd_mst_node: recovered master node to write when re-mounting R/O mounted
1180 * FS to R/W mode 1200 * FS to R/W mode
1181 * @size_tree: inode size information for recovery 1201 * @size_tree: inode size information for recovery
1182 * @remounting_rw: set while re-mounting from R/O mode to R/W mode
1183 * @always_chk_crc: always check CRCs (while mounting and remounting to R/W
1184 * mode)
1185 * @mount_opts: UBIFS-specific mount options 1202 * @mount_opts: UBIFS-specific mount options
1186 * 1203 *
1187 * @dbg: debugging-related information 1204 * @dbg: debugging-related information
@@ -1250,6 +1267,9 @@ struct ubifs_info {
1250 struct mutex bu_mutex; 1267 struct mutex bu_mutex;
1251 struct bu_info bu; 1268 struct bu_info bu;
1252 1269
1270 struct mutex write_reserve_mutex;
1271 void *write_reserve_buf;
1272
1253 int log_lebs; 1273 int log_lebs;
1254 long long log_bytes; 1274 long long log_bytes;
1255 int log_last; 1275 int log_last;
@@ -1271,7 +1291,10 @@ struct ubifs_info {
1271 1291
1272 int min_io_size; 1292 int min_io_size;
1273 int min_io_shift; 1293 int min_io_shift;
1294 int max_write_size;
1295 int max_write_shift;
1274 int leb_size; 1296 int leb_size;
1297 int leb_start;
1275 int half_leb_size; 1298 int half_leb_size;
1276 int idx_leb_size; 1299 int idx_leb_size;
1277 int leb_cnt; 1300 int leb_cnt;
@@ -1402,19 +1425,19 @@ struct ubifs_info {
1402 gid_t rp_gid; 1425 gid_t rp_gid;
1403 1426
1404 /* The below fields are used only during mounting and re-mounting */ 1427 /* The below fields are used only during mounting and re-mounting */
1405 int empty; 1428 unsigned int empty:1;
1429 unsigned int need_recovery:1;
1430 unsigned int replaying:1;
1431 unsigned int mounting:1;
1432 unsigned int remounting_rw:1;
1406 struct rb_root replay_tree; 1433 struct rb_root replay_tree;
1407 struct list_head replay_list; 1434 struct list_head replay_list;
1408 struct list_head replay_buds; 1435 struct list_head replay_buds;
1409 unsigned long long cs_sqnum; 1436 unsigned long long cs_sqnum;
1410 unsigned long long replay_sqnum; 1437 unsigned long long replay_sqnum;
1411 int need_recovery;
1412 int replaying;
1413 struct list_head unclean_leb_list; 1438 struct list_head unclean_leb_list;
1414 struct ubifs_mst_node *rcvrd_mst_node; 1439 struct ubifs_mst_node *rcvrd_mst_node;
1415 struct rb_root size_tree; 1440 struct rb_root size_tree;
1416 int remounting_rw;
1417 int always_chk_crc;
1418 struct ubifs_mount_opts mount_opts; 1441 struct ubifs_mount_opts mount_opts;
1419 1442
1420#ifdef CONFIG_UBIFS_FS_DEBUG 1443#ifdef CONFIG_UBIFS_FS_DEBUG
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index c74400f88fe0..3299f469e712 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -56,6 +56,7 @@
56 */ 56 */
57 57
58#include "ubifs.h" 58#include "ubifs.h"
59#include <linux/fs.h>
59#include <linux/slab.h> 60#include <linux/slab.h>
60#include <linux/xattr.h> 61#include <linux/xattr.h>
61#include <linux/posix_acl_xattr.h> 62#include <linux/posix_acl_xattr.h>
@@ -80,7 +81,6 @@ enum {
80}; 81};
81 82
82static const struct inode_operations none_inode_operations; 83static const struct inode_operations none_inode_operations;
83static const struct address_space_operations none_address_operations;
84static const struct file_operations none_file_operations; 84static const struct file_operations none_file_operations;
85 85
86/** 86/**
@@ -130,7 +130,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
130 } 130 }
131 131
132 /* Re-define all operations to be "nothing" */ 132 /* Re-define all operations to be "nothing" */
133 inode->i_mapping->a_ops = &none_address_operations; 133 inode->i_mapping->a_ops = &empty_aops;
134 inode->i_op = &none_inode_operations; 134 inode->i_op = &none_inode_operations;
135 inode->i_fop = &none_file_operations; 135 inode->i_fop = &none_file_operations;
136 136
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 306ee39ef2c3..95518a9f589e 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -27,11 +27,10 @@
27#include "udf_i.h" 27#include "udf_i.h"
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
30#define udf_clear_bit(nr, addr) ext2_clear_bit(nr, addr) 30#define udf_clear_bit __test_and_clear_bit_le
31#define udf_set_bit(nr, addr) ext2_set_bit(nr, addr) 31#define udf_set_bit __test_and_set_bit_le
32#define udf_test_bit(nr, addr) ext2_test_bit(nr, addr) 32#define udf_test_bit test_bit_le
33#define udf_find_next_one_bit(addr, size, offset) \ 33#define udf_find_next_one_bit find_next_bit_le
34 ext2_find_next_bit(addr, size, offset)
35 34
36static int read_block_bitmap(struct super_block *sb, 35static int read_block_bitmap(struct super_block *sb,
37 struct udf_bitmap *bitmap, unsigned int block, 36 struct udf_bitmap *bitmap, unsigned int block,
@@ -297,7 +296,7 @@ repeat:
297 break; 296 break;
298 } 297 }
299 } else { 298 } else {
300 bit = udf_find_next_one_bit((char *)bh->b_data, 299 bit = udf_find_next_one_bit(bh->b_data,
301 sb->s_blocksize << 3, 300 sb->s_blocksize << 3,
302 group_start << 3); 301 group_start << 3);
303 if (bit < sb->s_blocksize << 3) 302 if (bit < sb->s_blocksize << 3)
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 89c78486cbbe..2a346bb1d9f5 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -98,7 +98,6 @@ static int udf_adinicb_write_end(struct file *file,
98const struct address_space_operations udf_adinicb_aops = { 98const struct address_space_operations udf_adinicb_aops = {
99 .readpage = udf_adinicb_readpage, 99 .readpage = udf_adinicb_readpage,
100 .writepage = udf_adinicb_writepage, 100 .writepage = udf_adinicb_writepage,
101 .sync_page = block_sync_page,
102 .write_begin = simple_write_begin, 101 .write_begin = simple_write_begin,
103 .write_end = udf_adinicb_write_end, 102 .write_end = udf_adinicb_write_end,
104}; 103};
@@ -123,8 +122,8 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
123 if (inode->i_sb->s_blocksize < 122 if (inode->i_sb->s_blocksize <
124 (udf_file_entry_alloc_offset(inode) + 123 (udf_file_entry_alloc_offset(inode) +
125 pos + count)) { 124 pos + count)) {
126 udf_expand_file_adinicb(inode, pos + count, &err); 125 err = udf_expand_file_adinicb(inode);
127 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 126 if (err) {
128 udf_debug("udf_expand_adinicb: err=%d\n", err); 127 udf_debug("udf_expand_adinicb: err=%d\n", err);
129 up_write(&iinfo->i_data_sem); 128 up_write(&iinfo->i_data_sem);
130 return err; 129 return err;
@@ -237,7 +236,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
237 236
238 if ((attr->ia_valid & ATTR_SIZE) && 237 if ((attr->ia_valid & ATTR_SIZE) &&
239 attr->ia_size != i_size_read(inode)) { 238 attr->ia_size != i_size_read(inode)) {
240 error = vmtruncate(inode, attr->ia_size); 239 error = udf_setsize(inode, attr->ia_size);
241 if (error) 240 if (error)
242 return error; 241 return error;
243 } 242 }
@@ -249,5 +248,4 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr)
249 248
250const struct inode_operations udf_file_inode_operations = { 249const struct inode_operations udf_file_inode_operations = {
251 .setattr = udf_setattr, 250 .setattr = udf_setattr,
252 .truncate = udf_truncate,
253}; 251};
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c6a2e782b97b..1d1358ed80c1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -73,14 +73,12 @@ void udf_evict_inode(struct inode *inode)
73 struct udf_inode_info *iinfo = UDF_I(inode); 73 struct udf_inode_info *iinfo = UDF_I(inode);
74 int want_delete = 0; 74 int want_delete = 0;
75 75
76 truncate_inode_pages(&inode->i_data, 0);
77
78 if (!inode->i_nlink && !is_bad_inode(inode)) { 76 if (!inode->i_nlink && !is_bad_inode(inode)) {
79 want_delete = 1; 77 want_delete = 1;
80 inode->i_size = 0; 78 udf_setsize(inode, 0);
81 udf_truncate(inode);
82 udf_update_inode(inode, IS_SYNC(inode)); 79 udf_update_inode(inode, IS_SYNC(inode));
83 } 80 } else
81 truncate_inode_pages(&inode->i_data, 0);
84 invalidate_inode_buffers(inode); 82 invalidate_inode_buffers(inode);
85 end_writeback(inode); 83 end_writeback(inode);
86 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && 84 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
@@ -117,9 +115,18 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
117 115
118 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); 116 ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block);
119 if (unlikely(ret)) { 117 if (unlikely(ret)) {
120 loff_t isize = mapping->host->i_size; 118 struct inode *inode = mapping->host;
121 if (pos + len > isize) 119 struct udf_inode_info *iinfo = UDF_I(inode);
122 vmtruncate(mapping->host, isize); 120 loff_t isize = inode->i_size;
121
122 if (pos + len > isize) {
123 truncate_pagecache(inode, pos + len, isize);
124 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
125 down_write(&iinfo->i_data_sem);
126 udf_truncate_extents(inode);
127 up_write(&iinfo->i_data_sem);
128 }
129 }
123 } 130 }
124 131
125 return ret; 132 return ret;
@@ -133,36 +140,36 @@ static sector_t udf_bmap(struct address_space *mapping, sector_t block)
133const struct address_space_operations udf_aops = { 140const struct address_space_operations udf_aops = {
134 .readpage = udf_readpage, 141 .readpage = udf_readpage,
135 .writepage = udf_writepage, 142 .writepage = udf_writepage,
136 .sync_page = block_sync_page,
137 .write_begin = udf_write_begin, 143 .write_begin = udf_write_begin,
138 .write_end = generic_write_end, 144 .write_end = generic_write_end,
139 .bmap = udf_bmap, 145 .bmap = udf_bmap,
140}; 146};
141 147
142void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err) 148int udf_expand_file_adinicb(struct inode *inode)
143{ 149{
144 struct page *page; 150 struct page *page;
145 char *kaddr; 151 char *kaddr;
146 struct udf_inode_info *iinfo = UDF_I(inode); 152 struct udf_inode_info *iinfo = UDF_I(inode);
153 int err;
147 struct writeback_control udf_wbc = { 154 struct writeback_control udf_wbc = {
148 .sync_mode = WB_SYNC_NONE, 155 .sync_mode = WB_SYNC_NONE,
149 .nr_to_write = 1, 156 .nr_to_write = 1,
150 }; 157 };
151 158
152 /* from now on we have normal address_space methods */
153 inode->i_data.a_ops = &udf_aops;
154
155 if (!iinfo->i_lenAlloc) { 159 if (!iinfo->i_lenAlloc) {
156 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) 160 if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
157 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 161 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
158 else 162 else
159 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 163 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
164 /* from now on we have normal address_space methods */
165 inode->i_data.a_ops = &udf_aops;
160 mark_inode_dirty(inode); 166 mark_inode_dirty(inode);
161 return; 167 return 0;
162 } 168 }
163 169
164 page = grab_cache_page(inode->i_mapping, 0); 170 page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
165 BUG_ON(!PageLocked(page)); 171 if (!page)
172 return -ENOMEM;
166 173
167 if (!PageUptodate(page)) { 174 if (!PageUptodate(page)) {
168 kaddr = kmap(page); 175 kaddr = kmap(page);
@@ -181,11 +188,24 @@ void udf_expand_file_adinicb(struct inode *inode, int newsize, int *err)
181 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; 188 iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
182 else 189 else
183 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; 190 iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
184 191 /* from now on we have normal address_space methods */
185 inode->i_data.a_ops->writepage(page, &udf_wbc); 192 inode->i_data.a_ops = &udf_aops;
193 err = inode->i_data.a_ops->writepage(page, &udf_wbc);
194 if (err) {
195 /* Restore everything back so that we don't lose data... */
196 lock_page(page);
197 kaddr = kmap(page);
198 memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr,
199 inode->i_size);
200 kunmap(page);
201 unlock_page(page);
202 iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB;
203 inode->i_data.a_ops = &udf_adinicb_aops;
204 }
186 page_cache_release(page); 205 page_cache_release(page);
187
188 mark_inode_dirty(inode); 206 mark_inode_dirty(inode);
207
208 return err;
189} 209}
190 210
191struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, 211struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block,
@@ -348,8 +368,10 @@ static struct buffer_head *udf_getblk(struct inode *inode, long block,
348} 368}
349 369
350/* Extend the file by 'blocks' blocks, return the number of extents added */ 370/* Extend the file by 'blocks' blocks, return the number of extents added */
351int udf_extend_file(struct inode *inode, struct extent_position *last_pos, 371static int udf_do_extend_file(struct inode *inode,
352 struct kernel_long_ad *last_ext, sector_t blocks) 372 struct extent_position *last_pos,
373 struct kernel_long_ad *last_ext,
374 sector_t blocks)
353{ 375{
354 sector_t add; 376 sector_t add;
355 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); 377 int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
@@ -357,6 +379,7 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
357 struct kernel_lb_addr prealloc_loc = {}; 379 struct kernel_lb_addr prealloc_loc = {};
358 int prealloc_len = 0; 380 int prealloc_len = 0;
359 struct udf_inode_info *iinfo; 381 struct udf_inode_info *iinfo;
382 int err;
360 383
361 /* The previous extent is fake and we should not extend by anything 384 /* The previous extent is fake and we should not extend by anything
362 * - there's nothing to do... */ 385 * - there's nothing to do... */
@@ -422,26 +445,29 @@ int udf_extend_file(struct inode *inode, struct extent_position *last_pos,
422 /* Create enough extents to cover the whole hole */ 445 /* Create enough extents to cover the whole hole */
423 while (blocks > add) { 446 while (blocks > add) {
424 blocks -= add; 447 blocks -= add;
425 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 448 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
426 last_ext->extLength, 1) == -1) 449 last_ext->extLength, 1);
427 return -1; 450 if (err)
451 return err;
428 count++; 452 count++;
429 } 453 }
430 if (blocks) { 454 if (blocks) {
431 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | 455 last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
432 (blocks << sb->s_blocksize_bits); 456 (blocks << sb->s_blocksize_bits);
433 if (udf_add_aext(inode, last_pos, &last_ext->extLocation, 457 err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
434 last_ext->extLength, 1) == -1) 458 last_ext->extLength, 1);
435 return -1; 459 if (err)
460 return err;
436 count++; 461 count++;
437 } 462 }
438 463
439out: 464out:
440 /* Do we have some preallocated blocks saved? */ 465 /* Do we have some preallocated blocks saved? */
441 if (prealloc_len) { 466 if (prealloc_len) {
442 if (udf_add_aext(inode, last_pos, &prealloc_loc, 467 err = udf_add_aext(inode, last_pos, &prealloc_loc,
443 prealloc_len, 1) == -1) 468 prealloc_len, 1);
444 return -1; 469 if (err)
470 return err;
445 last_ext->extLocation = prealloc_loc; 471 last_ext->extLocation = prealloc_loc;
446 last_ext->extLength = prealloc_len; 472 last_ext->extLength = prealloc_len;
447 count++; 473 count++;
@@ -453,11 +479,68 @@ out:
453 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 479 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
454 last_pos->offset -= sizeof(struct long_ad); 480 last_pos->offset -= sizeof(struct long_ad);
455 else 481 else
456 return -1; 482 return -EIO;
457 483
458 return count; 484 return count;
459} 485}
460 486
487static int udf_extend_file(struct inode *inode, loff_t newsize)
488{
489
490 struct extent_position epos;
491 struct kernel_lb_addr eloc;
492 uint32_t elen;
493 int8_t etype;
494 struct super_block *sb = inode->i_sb;
495 sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
496 int adsize;
497 struct udf_inode_info *iinfo = UDF_I(inode);
498 struct kernel_long_ad extent;
499 int err;
500
501 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
502 adsize = sizeof(struct short_ad);
503 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
504 adsize = sizeof(struct long_ad);
505 else
506 BUG();
507
508 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
509
510 /* File has extent covering the new size (could happen when extending
511 * inside a block)? */
512 if (etype != -1)
513 return 0;
514 if (newsize & (sb->s_blocksize - 1))
515 offset++;
516 /* Extended file just to the boundary of the last file block? */
517 if (offset == 0)
518 return 0;
519
520 /* Truncate is extending the file by 'offset' blocks */
521 if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
522 (epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
523 /* File has no extents at all or has empty last
524 * indirect extent! Create a fake extent... */
525 extent.extLocation.logicalBlockNum = 0;
526 extent.extLocation.partitionReferenceNum = 0;
527 extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
528 } else {
529 epos.offset -= adsize;
530 etype = udf_next_aext(inode, &epos, &extent.extLocation,
531 &extent.extLength, 0);
532 extent.extLength |= etype << 30;
533 }
534 err = udf_do_extend_file(inode, &epos, &extent, offset);
535 if (err < 0)
536 goto out;
537 err = 0;
538 iinfo->i_lenExtents = newsize;
539out:
540 brelse(epos.bh);
541 return err;
542}
543
461static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, 544static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
462 int *err, sector_t *phys, int *new) 545 int *err, sector_t *phys, int *new)
463{ 546{
@@ -540,7 +623,7 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
540 elen = EXT_RECORDED_ALLOCATED | 623 elen = EXT_RECORDED_ALLOCATED |
541 ((elen + inode->i_sb->s_blocksize - 1) & 624 ((elen + inode->i_sb->s_blocksize - 1) &
542 ~(inode->i_sb->s_blocksize - 1)); 625 ~(inode->i_sb->s_blocksize - 1));
543 etype = udf_write_aext(inode, &cur_epos, &eloc, elen, 1); 626 udf_write_aext(inode, &cur_epos, &eloc, elen, 1);
544 } 627 }
545 brelse(prev_epos.bh); 628 brelse(prev_epos.bh);
546 brelse(cur_epos.bh); 629 brelse(cur_epos.bh);
@@ -564,19 +647,17 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block,
564 memset(&laarr[0].extLocation, 0x00, 647 memset(&laarr[0].extLocation, 0x00,
565 sizeof(struct kernel_lb_addr)); 648 sizeof(struct kernel_lb_addr));
566 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; 649 laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED;
567 /* Will udf_extend_file() create real extent from 650 /* Will udf_do_extend_file() create real extent from
568 a fake one? */ 651 a fake one? */
569 startnum = (offset > 0); 652 startnum = (offset > 0);
570 } 653 }
571 /* Create extents for the hole between EOF and offset */ 654 /* Create extents for the hole between EOF and offset */
572 ret = udf_extend_file(inode, &prev_epos, laarr, offset); 655 ret = udf_do_extend_file(inode, &prev_epos, laarr, offset);
573 if (ret == -1) { 656 if (ret < 0) {
574 brelse(prev_epos.bh); 657 brelse(prev_epos.bh);
575 brelse(cur_epos.bh); 658 brelse(cur_epos.bh);
576 brelse(next_epos.bh); 659 brelse(next_epos.bh);
577 /* We don't really know the error here so we just make 660 *err = ret;
578 * something up */
579 *err = -ENOSPC;
580 return NULL; 661 return NULL;
581 } 662 }
582 c = 0; 663 c = 0;
@@ -1005,52 +1086,66 @@ struct buffer_head *udf_bread(struct inode *inode, int block,
1005 return NULL; 1086 return NULL;
1006} 1087}
1007 1088
1008void udf_truncate(struct inode *inode) 1089int udf_setsize(struct inode *inode, loff_t newsize)
1009{ 1090{
1010 int offset;
1011 int err; 1091 int err;
1012 struct udf_inode_info *iinfo; 1092 struct udf_inode_info *iinfo;
1093 int bsize = 1 << inode->i_blkbits;
1013 1094
1014 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || 1095 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1015 S_ISLNK(inode->i_mode))) 1096 S_ISLNK(inode->i_mode)))
1016 return; 1097 return -EINVAL;
1017 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 1098 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1018 return; 1099 return -EPERM;
1019 1100
1020 iinfo = UDF_I(inode); 1101 iinfo = UDF_I(inode);
1021 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1102 if (newsize > inode->i_size) {
1022 down_write(&iinfo->i_data_sem); 1103 down_write(&iinfo->i_data_sem);
1023 if (inode->i_sb->s_blocksize < 1104 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1024 (udf_file_entry_alloc_offset(inode) + 1105 if (bsize <
1025 inode->i_size)) { 1106 (udf_file_entry_alloc_offset(inode) + newsize)) {
1026 udf_expand_file_adinicb(inode, inode->i_size, &err); 1107 err = udf_expand_file_adinicb(inode);
1027 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 1108 if (err) {
1028 inode->i_size = iinfo->i_lenAlloc; 1109 up_write(&iinfo->i_data_sem);
1029 up_write(&iinfo->i_data_sem); 1110 return err;
1030 return; 1111 }
1031 } else 1112 } else
1032 udf_truncate_extents(inode); 1113 iinfo->i_lenAlloc = newsize;
1033 } else { 1114 }
1034 offset = inode->i_size & (inode->i_sb->s_blocksize - 1); 1115 err = udf_extend_file(inode, newsize);
1035 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + offset, 1116 if (err) {
1036 0x00, inode->i_sb->s_blocksize - 1117 up_write(&iinfo->i_data_sem);
1037 offset - udf_file_entry_alloc_offset(inode)); 1118 return err;
1038 iinfo->i_lenAlloc = inode->i_size;
1039 } 1119 }
1120 truncate_setsize(inode, newsize);
1040 up_write(&iinfo->i_data_sem); 1121 up_write(&iinfo->i_data_sem);
1041 } else { 1122 } else {
1042 block_truncate_page(inode->i_mapping, inode->i_size, 1123 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1043 udf_get_block); 1124 down_write(&iinfo->i_data_sem);
1125 memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize,
1126 0x00, bsize - newsize -
1127 udf_file_entry_alloc_offset(inode));
1128 iinfo->i_lenAlloc = newsize;
1129 truncate_setsize(inode, newsize);
1130 up_write(&iinfo->i_data_sem);
1131 goto update_time;
1132 }
1133 err = block_truncate_page(inode->i_mapping, newsize,
1134 udf_get_block);
1135 if (err)
1136 return err;
1044 down_write(&iinfo->i_data_sem); 1137 down_write(&iinfo->i_data_sem);
1138 truncate_setsize(inode, newsize);
1045 udf_truncate_extents(inode); 1139 udf_truncate_extents(inode);
1046 up_write(&iinfo->i_data_sem); 1140 up_write(&iinfo->i_data_sem);
1047 } 1141 }
1048 1142update_time:
1049 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); 1143 inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb);
1050 if (IS_SYNC(inode)) 1144 if (IS_SYNC(inode))
1051 udf_sync_inode(inode); 1145 udf_sync_inode(inode);
1052 else 1146 else
1053 mark_inode_dirty(inode); 1147 mark_inode_dirty(inode);
1148 return 0;
1054} 1149}
1055 1150
1056static void __udf_read_inode(struct inode *inode) 1151static void __udf_read_inode(struct inode *inode)
@@ -1637,14 +1732,13 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
1637 return NULL; 1732 return NULL;
1638} 1733}
1639 1734
1640int8_t udf_add_aext(struct inode *inode, struct extent_position *epos, 1735int udf_add_aext(struct inode *inode, struct extent_position *epos,
1641 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1736 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1642{ 1737{
1643 int adsize; 1738 int adsize;
1644 struct short_ad *sad = NULL; 1739 struct short_ad *sad = NULL;
1645 struct long_ad *lad = NULL; 1740 struct long_ad *lad = NULL;
1646 struct allocExtDesc *aed; 1741 struct allocExtDesc *aed;
1647 int8_t etype;
1648 uint8_t *ptr; 1742 uint8_t *ptr;
1649 struct udf_inode_info *iinfo = UDF_I(inode); 1743 struct udf_inode_info *iinfo = UDF_I(inode);
1650 1744
@@ -1660,7 +1754,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1660 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 1754 else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
1661 adsize = sizeof(struct long_ad); 1755 adsize = sizeof(struct long_ad);
1662 else 1756 else
1663 return -1; 1757 return -EIO;
1664 1758
1665 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { 1759 if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
1666 unsigned char *sptr, *dptr; 1760 unsigned char *sptr, *dptr;
@@ -1672,12 +1766,12 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1672 obloc.partitionReferenceNum, 1766 obloc.partitionReferenceNum,
1673 obloc.logicalBlockNum, &err); 1767 obloc.logicalBlockNum, &err);
1674 if (!epos->block.logicalBlockNum) 1768 if (!epos->block.logicalBlockNum)
1675 return -1; 1769 return -ENOSPC;
1676 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, 1770 nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
1677 &epos->block, 1771 &epos->block,
1678 0)); 1772 0));
1679 if (!nbh) 1773 if (!nbh)
1680 return -1; 1774 return -EIO;
1681 lock_buffer(nbh); 1775 lock_buffer(nbh);
1682 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); 1776 memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
1683 set_buffer_uptodate(nbh); 1777 set_buffer_uptodate(nbh);
@@ -1746,7 +1840,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1746 epos->bh = nbh; 1840 epos->bh = nbh;
1747 } 1841 }
1748 1842
1749 etype = udf_write_aext(inode, epos, eloc, elen, inc); 1843 udf_write_aext(inode, epos, eloc, elen, inc);
1750 1844
1751 if (!epos->bh) { 1845 if (!epos->bh) {
1752 iinfo->i_lenAlloc += adsize; 1846 iinfo->i_lenAlloc += adsize;
@@ -1764,11 +1858,11 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1764 mark_buffer_dirty_inode(epos->bh, inode); 1858 mark_buffer_dirty_inode(epos->bh, inode);
1765 } 1859 }
1766 1860
1767 return etype; 1861 return 0;
1768} 1862}
1769 1863
1770int8_t udf_write_aext(struct inode *inode, struct extent_position *epos, 1864void udf_write_aext(struct inode *inode, struct extent_position *epos,
1771 struct kernel_lb_addr *eloc, uint32_t elen, int inc) 1865 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
1772{ 1866{
1773 int adsize; 1867 int adsize;
1774 uint8_t *ptr; 1868 uint8_t *ptr;
@@ -1798,7 +1892,7 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1798 adsize = sizeof(struct long_ad); 1892 adsize = sizeof(struct long_ad);
1799 break; 1893 break;
1800 default: 1894 default:
1801 return -1; 1895 return;
1802 } 1896 }
1803 1897
1804 if (epos->bh) { 1898 if (epos->bh) {
@@ -1817,8 +1911,6 @@ int8_t udf_write_aext(struct inode *inode, struct extent_position *epos,
1817 1911
1818 if (inc) 1912 if (inc)
1819 epos->offset += adsize; 1913 epos->offset += adsize;
1820
1821 return (elen >> 30);
1822} 1914}
1823 1915
1824int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, 1916int8_t udf_next_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
32#include <linux/crc-itu-t.h> 32#include <linux/crc-itu-t.h>
33#include <linux/exportfs.h> 33#include <linux/exportfs.h>
34 34
35enum { UDF_MAX_LINKS = 0xffff };
36
35static inline int udf_match(int len1, const unsigned char *name1, int len2, 37static inline int udf_match(int len1, const unsigned char *name1, int len2,
36 const unsigned char *name2) 38 const unsigned char *name2)
37{ 39{
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
650 struct udf_inode_info *iinfo; 652 struct udf_inode_info *iinfo;
651 653
652 err = -EMLINK; 654 err = -EMLINK;
653 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 655 if (dir->i_nlink >= UDF_MAX_LINKS)
654 goto out; 656 goto out;
655 657
656 err = -EIO; 658 err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1034 struct fileIdentDesc cfi, *fi; 1036 struct fileIdentDesc cfi, *fi;
1035 int err; 1037 int err;
1036 1038
1037 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1039 if (inode->i_nlink >= UDF_MAX_LINKS)
1038 return -EMLINK; 1040 return -EMLINK;
1039 }
1040 1041
1041 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 1042 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
1042 if (!fi) { 1043 if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1131 goto end_rename; 1132 goto end_rename;
1132 1133
1133 retval = -EMLINK; 1134 retval = -EMLINK;
1134 if (!new_inode && 1135 if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
1135 new_dir->i_nlink >=
1136 (256 << sizeof(new_dir->i_nlink)) - 1)
1137 goto end_rename; 1136 goto end_rename;
1138 } 1137 }
1139 if (!nfi) { 1138 if (!nfi) {
@@ -1287,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
1287 struct fid *fid = (struct fid *)fh; 1286 struct fid *fid = (struct fid *)fh;
1288 int type = FILEID_UDF_WITHOUT_PARENT; 1287 int type = FILEID_UDF_WITHOUT_PARENT;
1289 1288
1290 if (len < 3 || (connectable && len < 5)) 1289 if (connectable && (len < 5)) {
1290 *lenp = 5;
1291 return 255; 1291 return 255;
1292 } else if (len < 3) {
1293 *lenp = 3;
1294 return 255;
1295 }
1292 1296
1293 *lenp = 3; 1297 *lenp = 3;
1294 fid->udf.block = location.logicalBlockNum; 1298 fid->udf.block = location.logicalBlockNum;
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 225527cdc885..8424308db4b4 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -197,6 +197,11 @@ static void udf_update_alloc_ext_desc(struct inode *inode,
197 mark_buffer_dirty_inode(epos->bh, inode); 197 mark_buffer_dirty_inode(epos->bh, inode);
198} 198}
199 199
200/*
201 * Truncate extents of inode to inode->i_size. This function can be used only
202 * for making file shorter. For making file longer, udf_extend_file() has to
203 * be used.
204 */
200void udf_truncate_extents(struct inode *inode) 205void udf_truncate_extents(struct inode *inode)
201{ 206{
202 struct extent_position epos; 207 struct extent_position epos;
@@ -219,96 +224,65 @@ void udf_truncate_extents(struct inode *inode)
219 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); 224 etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
220 byte_offset = (offset << sb->s_blocksize_bits) + 225 byte_offset = (offset << sb->s_blocksize_bits) +
221 (inode->i_size & (sb->s_blocksize - 1)); 226 (inode->i_size & (sb->s_blocksize - 1));
222 if (etype != -1) { 227 if (etype == -1) {
223 epos.offset -= adsize; 228 /* We should extend the file? */
224 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); 229 WARN_ON(byte_offset);
225 epos.offset += adsize; 230 return;
226 if (byte_offset) 231 }
227 lenalloc = epos.offset; 232 epos.offset -= adsize;
228 else 233 extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset);
229 lenalloc = epos.offset - adsize; 234 epos.offset += adsize;
230 235 if (byte_offset)
231 if (!epos.bh) 236 lenalloc = epos.offset;
232 lenalloc -= udf_file_entry_alloc_offset(inode); 237 else
233 else 238 lenalloc = epos.offset - adsize;
234 lenalloc -= sizeof(struct allocExtDesc);
235
236 while ((etype = udf_current_aext(inode, &epos, &eloc,
237 &elen, 0)) != -1) {
238 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
239 udf_write_aext(inode, &epos, &neloc, nelen, 0);
240 if (indirect_ext_len) {
241 /* We managed to free all extents in the
242 * indirect extent - free it too */
243 BUG_ON(!epos.bh);
244 udf_free_blocks(sb, inode, &epos.block,
245 0, indirect_ext_len);
246 } else if (!epos.bh) {
247 iinfo->i_lenAlloc = lenalloc;
248 mark_inode_dirty(inode);
249 } else
250 udf_update_alloc_ext_desc(inode,
251 &epos, lenalloc);
252 brelse(epos.bh);
253 epos.offset = sizeof(struct allocExtDesc);
254 epos.block = eloc;
255 epos.bh = udf_tread(sb,
256 udf_get_lb_pblock(sb, &eloc, 0));
257 if (elen)
258 indirect_ext_len =
259 (elen + sb->s_blocksize - 1) >>
260 sb->s_blocksize_bits;
261 else
262 indirect_ext_len = 1;
263 } else {
264 extent_trunc(inode, &epos, &eloc, etype,
265 elen, 0);
266 epos.offset += adsize;
267 }
268 }
269 239
270 if (indirect_ext_len) { 240 if (!epos.bh)
271 BUG_ON(!epos.bh); 241 lenalloc -= udf_file_entry_alloc_offset(inode);
272 udf_free_blocks(sb, inode, &epos.block, 0, 242 else
273 indirect_ext_len); 243 lenalloc -= sizeof(struct allocExtDesc);
274 } else if (!epos.bh) {
275 iinfo->i_lenAlloc = lenalloc;
276 mark_inode_dirty(inode);
277 } else
278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
279 } else if (inode->i_size) {
280 if (byte_offset) {
281 struct kernel_long_ad extent;
282 244
283 /* 245 while ((etype = udf_current_aext(inode, &epos, &eloc,
284 * OK, there is not extent covering inode->i_size and 246 &elen, 0)) != -1) {
285 * no extent above inode->i_size => truncate is 247 if (etype == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) {
286 * extending the file by 'offset' blocks. 248 udf_write_aext(inode, &epos, &neloc, nelen, 0);
287 */ 249 if (indirect_ext_len) {
288 if ((!epos.bh && 250 /* We managed to free all extents in the
289 epos.offset == 251 * indirect extent - free it too */
290 udf_file_entry_alloc_offset(inode)) || 252 BUG_ON(!epos.bh);
291 (epos.bh && epos.offset == 253 udf_free_blocks(sb, inode, &epos.block,
292 sizeof(struct allocExtDesc))) { 254 0, indirect_ext_len);
293 /* File has no extents at all or has empty last 255 } else if (!epos.bh) {
294 * indirect extent! Create a fake extent... */ 256 iinfo->i_lenAlloc = lenalloc;
295 extent.extLocation.logicalBlockNum = 0; 257 mark_inode_dirty(inode);
296 extent.extLocation.partitionReferenceNum = 0; 258 } else
297 extent.extLength = 259 udf_update_alloc_ext_desc(inode,
298 EXT_NOT_RECORDED_NOT_ALLOCATED; 260 &epos, lenalloc);
299 } else { 261 brelse(epos.bh);
300 epos.offset -= adsize; 262 epos.offset = sizeof(struct allocExtDesc);
301 etype = udf_next_aext(inode, &epos, 263 epos.block = eloc;
302 &extent.extLocation, 264 epos.bh = udf_tread(sb,
303 &extent.extLength, 0); 265 udf_get_lb_pblock(sb, &eloc, 0));
304 extent.extLength |= etype << 30; 266 if (elen)
305 } 267 indirect_ext_len =
306 udf_extend_file(inode, &epos, &extent, 268 (elen + sb->s_blocksize - 1) >>
307 offset + 269 sb->s_blocksize_bits;
308 ((inode->i_size & 270 else
309 (sb->s_blocksize - 1)) != 0)); 271 indirect_ext_len = 1;
272 } else {
273 extent_trunc(inode, &epos, &eloc, etype, elen, 0);
274 epos.offset += adsize;
310 } 275 }
311 } 276 }
277
278 if (indirect_ext_len) {
279 BUG_ON(!epos.bh);
280 udf_free_blocks(sb, inode, &epos.block, 0, indirect_ext_len);
281 } else if (!epos.bh) {
282 iinfo->i_lenAlloc = lenalloc;
283 mark_inode_dirty(inode);
284 } else
285 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
312 iinfo->i_lenExtents = inode->i_size; 286 iinfo->i_lenExtents = inode->i_size;
313 287
314 brelse(epos.bh); 288 brelse(epos.bh);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index eba48209f9f3..dbd52d4b5eed 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -136,22 +136,20 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
136extern long udf_ioctl(struct file *, unsigned int, unsigned long); 136extern long udf_ioctl(struct file *, unsigned int, unsigned long);
137/* inode.c */ 137/* inode.c */
138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 138extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
139extern void udf_expand_file_adinicb(struct inode *, int, int *); 139extern int udf_expand_file_adinicb(struct inode *);
140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); 140extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
141extern struct buffer_head *udf_bread(struct inode *, int, int, int *); 141extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
142extern void udf_truncate(struct inode *); 142extern int udf_setsize(struct inode *, loff_t);
143extern void udf_read_inode(struct inode *); 143extern void udf_read_inode(struct inode *);
144extern void udf_evict_inode(struct inode *); 144extern void udf_evict_inode(struct inode *);
145extern int udf_write_inode(struct inode *, struct writeback_control *wbc); 145extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
146extern long udf_block_map(struct inode *, sector_t); 146extern long udf_block_map(struct inode *, sector_t);
147extern int udf_extend_file(struct inode *, struct extent_position *,
148 struct kernel_long_ad *, sector_t);
149extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, 147extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t *, sector_t *); 148 struct kernel_lb_addr *, uint32_t *, sector_t *);
151extern int8_t udf_add_aext(struct inode *, struct extent_position *, 149extern int udf_add_aext(struct inode *, struct extent_position *,
150 struct kernel_lb_addr *, uint32_t, int);
151extern void udf_write_aext(struct inode *, struct extent_position *,
152 struct kernel_lb_addr *, uint32_t, int); 152 struct kernel_lb_addr *, uint32_t, int);
153extern int8_t udf_write_aext(struct inode *, struct extent_position *,
154 struct kernel_lb_addr *, uint32_t, int);
155extern int8_t udf_delete_aext(struct inode *, struct extent_position, 153extern int8_t udf_delete_aext(struct inode *, struct extent_position,
156 struct kernel_lb_addr, uint32_t); 154 struct kernel_lb_addr, uint32_t);
157extern int8_t udf_next_aext(struct inode *, struct extent_position *, 155extern int8_t udf_next_aext(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig
index 30c8f223253d..e4f10a40768a 100644
--- a/fs/ufs/Kconfig
+++ b/fs/ufs/Kconfig
@@ -1,7 +1,6 @@
1config UFS_FS 1config UFS_FS
2 tristate "UFS file system support (read only)" 2 tristate "UFS file system support (read only)"
3 depends on BLOCK 3 depends on BLOCK
4 depends on BKL # probably fixable
5 help 4 help
6 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD, 5 BSD and derivate versions of Unix (such as SunOS, FreeBSD, NetBSD,
7 OpenBSD and NeXTstep) use a file system called UFS. Some System V 6 OpenBSD and NeXTstep) use a file system called UFS. Some System V
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 2b251f2093af..e765743cf9f3 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -34,7 +34,6 @@
34#include <linux/stat.h> 34#include <linux/stat.h>
35#include <linux/string.h> 35#include <linux/string.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/writeback.h> 38#include <linux/writeback.h>
40 39
@@ -43,7 +42,7 @@
43#include "swab.h" 42#include "swab.h"
44#include "util.h" 43#include "util.h"
45 44
46static u64 ufs_frag_map(struct inode *inode, sector_t frag); 45static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock);
47 46
48static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) 47static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4])
49{ 48{
@@ -79,10 +78,10 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off
79 78
80/* 79/*
81 * Returns the location of the fragment from 80 * Returns the location of the fragment from
82 * the begining of the filesystem. 81 * the beginning of the filesystem.
83 */ 82 */
84 83
85static u64 ufs_frag_map(struct inode *inode, sector_t frag) 84static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock)
86{ 85{
87 struct ufs_inode_info *ufsi = UFS_I(inode); 86 struct ufs_inode_info *ufsi = UFS_I(inode);
88 struct super_block *sb = inode->i_sb; 87 struct super_block *sb = inode->i_sb;
@@ -107,7 +106,8 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag)
107 106
108 p = offsets; 107 p = offsets;
109 108
110 lock_kernel(); 109 if (needs_lock)
110 lock_ufs(sb);
111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) 111 if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
112 goto ufs2; 112 goto ufs2;
113 113
@@ -152,7 +152,8 @@ ufs2:
152 ret = temp + (u64) (frag & uspi->s_fpbmask); 152 ret = temp + (u64) (frag & uspi->s_fpbmask);
153 153
154out: 154out:
155 unlock_kernel(); 155 if (needs_lock)
156 unlock_ufs(sb);
156 return ret; 157 return ret;
157} 158}
158 159
@@ -415,14 +416,16 @@ out:
415int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) 416int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
416{ 417{
417 struct super_block * sb = inode->i_sb; 418 struct super_block * sb = inode->i_sb;
418 struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; 419 struct ufs_sb_info * sbi = UFS_SB(sb);
420 struct ufs_sb_private_info * uspi = sbi->s_uspi;
419 struct buffer_head * bh; 421 struct buffer_head * bh;
420 int ret, err, new; 422 int ret, err, new;
421 unsigned long ptr,phys; 423 unsigned long ptr,phys;
422 u64 phys64 = 0; 424 u64 phys64 = 0;
425 bool needs_lock = (sbi->mutex_owner != current);
423 426
424 if (!create) { 427 if (!create) {
425 phys64 = ufs_frag_map(inode, fragment); 428 phys64 = ufs_frag_map(inode, fragment, needs_lock);
426 UFSD("phys64 = %llu\n", (unsigned long long)phys64); 429 UFSD("phys64 = %llu\n", (unsigned long long)phys64);
427 if (phys64) 430 if (phys64)
428 map_bh(bh_result, sb, phys64); 431 map_bh(bh_result, sb, phys64);
@@ -436,7 +439,8 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head
436 ret = 0; 439 ret = 0;
437 bh = NULL; 440 bh = NULL;
438 441
439 lock_kernel(); 442 if (needs_lock)
443 lock_ufs(sb);
440 444
441 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); 445 UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment);
442 if (fragment > 446 if (fragment >
@@ -498,7 +502,9 @@ out:
498 set_buffer_new(bh_result); 502 set_buffer_new(bh_result);
499 map_bh(bh_result, sb, phys); 503 map_bh(bh_result, sb, phys);
500abort: 504abort:
501 unlock_kernel(); 505 if (needs_lock)
506 unlock_ufs(sb);
507
502 return err; 508 return err;
503 509
504abort_too_big: 510abort_too_big:
@@ -506,48 +512,6 @@ abort_too_big:
506 goto abort; 512 goto abort;
507} 513}
508 514
509static struct buffer_head *ufs_getfrag(struct inode *inode,
510 unsigned int fragment,
511 int create, int *err)
512{
513 struct buffer_head dummy;
514 int error;
515
516 dummy.b_state = 0;
517 dummy.b_blocknr = -1000;
518 error = ufs_getfrag_block(inode, fragment, &dummy, create);
519 *err = error;
520 if (!error && buffer_mapped(&dummy)) {
521 struct buffer_head *bh;
522 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
523 if (buffer_new(&dummy)) {
524 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
525 set_buffer_uptodate(bh);
526 mark_buffer_dirty(bh);
527 }
528 return bh;
529 }
530 return NULL;
531}
532
533struct buffer_head * ufs_bread (struct inode * inode, unsigned fragment,
534 int create, int * err)
535{
536 struct buffer_head * bh;
537
538 UFSD("ENTER, ino %lu, fragment %u\n", inode->i_ino, fragment);
539 bh = ufs_getfrag (inode, fragment, create, err);
540 if (!bh || buffer_uptodate(bh))
541 return bh;
542 ll_rw_block (READ, 1, &bh);
543 wait_on_buffer (bh);
544 if (buffer_uptodate(bh))
545 return bh;
546 brelse (bh);
547 *err = -EIO;
548 return NULL;
549}
550
551static int ufs_writepage(struct page *page, struct writeback_control *wbc) 515static int ufs_writepage(struct page *page, struct writeback_control *wbc)
552{ 516{
553 return block_write_full_page(page,ufs_getfrag_block,wbc); 517 return block_write_full_page(page,ufs_getfrag_block,wbc);
@@ -588,7 +552,6 @@ static sector_t ufs_bmap(struct address_space *mapping, sector_t block)
588const struct address_space_operations ufs_aops = { 552const struct address_space_operations ufs_aops = {
589 .readpage = ufs_readpage, 553 .readpage = ufs_readpage,
590 .writepage = ufs_writepage, 554 .writepage = ufs_writepage,
591 .sync_page = block_sync_page,
592 .write_begin = ufs_write_begin, 555 .write_begin = ufs_write_begin,
593 .write_end = generic_write_end, 556 .write_end = generic_write_end,
594 .bmap = ufs_bmap 557 .bmap = ufs_bmap
@@ -900,9 +863,9 @@ static int ufs_update_inode(struct inode * inode, int do_sync)
900int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) 863int ufs_write_inode(struct inode *inode, struct writeback_control *wbc)
901{ 864{
902 int ret; 865 int ret;
903 lock_kernel(); 866 lock_ufs(inode->i_sb);
904 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); 867 ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
905 unlock_kernel(); 868 unlock_ufs(inode->i_sb);
906 return ret; 869 return ret;
907} 870}
908 871
@@ -922,22 +885,22 @@ void ufs_evict_inode(struct inode * inode)
922 if (want_delete) { 885 if (want_delete) {
923 loff_t old_i_size; 886 loff_t old_i_size;
924 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ 887 /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
925 lock_kernel(); 888 lock_ufs(inode->i_sb);
926 mark_inode_dirty(inode); 889 mark_inode_dirty(inode);
927 ufs_update_inode(inode, IS_SYNC(inode)); 890 ufs_update_inode(inode, IS_SYNC(inode));
928 old_i_size = inode->i_size; 891 old_i_size = inode->i_size;
929 inode->i_size = 0; 892 inode->i_size = 0;
930 if (inode->i_blocks && ufs_truncate(inode, old_i_size)) 893 if (inode->i_blocks && ufs_truncate(inode, old_i_size))
931 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); 894 ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n");
932 unlock_kernel(); 895 unlock_ufs(inode->i_sb);
933 } 896 }
934 897
935 invalidate_inode_buffers(inode); 898 invalidate_inode_buffers(inode);
936 end_writeback(inode); 899 end_writeback(inode);
937 900
938 if (want_delete) { 901 if (want_delete) {
939 lock_kernel(); 902 lock_ufs(inode->i_sb);
940 ufs_free_inode (inode); 903 ufs_free_inode (inode);
941 unlock_kernel(); 904 unlock_ufs(inode->i_sb);
942 } 905 }
943} 906}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..29309e25417f 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h>
33 32
34#include "ufs_fs.h" 33#include "ufs_fs.h"
35#include "ufs.h" 34#include "ufs.h"
@@ -55,16 +54,16 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, stru
55 if (dentry->d_name.len > UFS_MAXNAMLEN) 54 if (dentry->d_name.len > UFS_MAXNAMLEN)
56 return ERR_PTR(-ENAMETOOLONG); 55 return ERR_PTR(-ENAMETOOLONG);
57 56
58 lock_kernel(); 57 lock_ufs(dir->i_sb);
59 ino = ufs_inode_by_name(dir, &dentry->d_name); 58 ino = ufs_inode_by_name(dir, &dentry->d_name);
60 if (ino) { 59 if (ino) {
61 inode = ufs_iget(dir->i_sb, ino); 60 inode = ufs_iget(dir->i_sb, ino);
62 if (IS_ERR(inode)) { 61 if (IS_ERR(inode)) {
63 unlock_kernel(); 62 unlock_ufs(dir->i_sb);
64 return ERR_CAST(inode); 63 return ERR_CAST(inode);
65 } 64 }
66 } 65 }
67 unlock_kernel(); 66 unlock_ufs(dir->i_sb);
68 d_add(dentry, inode); 67 d_add(dentry, inode);
69 return NULL; 68 return NULL;
70} 69}
@@ -93,9 +92,9 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
93 inode->i_fop = &ufs_file_operations; 92 inode->i_fop = &ufs_file_operations;
94 inode->i_mapping->a_ops = &ufs_aops; 93 inode->i_mapping->a_ops = &ufs_aops;
95 mark_inode_dirty(inode); 94 mark_inode_dirty(inode);
96 lock_kernel(); 95 lock_ufs(dir->i_sb);
97 err = ufs_add_nondir(dentry, inode); 96 err = ufs_add_nondir(dentry, inode);
98 unlock_kernel(); 97 unlock_ufs(dir->i_sb);
99 } 98 }
100 UFSD("END: err=%d\n", err); 99 UFSD("END: err=%d\n", err);
101 return err; 100 return err;
@@ -115,9 +114,9 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
115 init_special_inode(inode, mode, rdev); 114 init_special_inode(inode, mode, rdev);
116 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); 115 ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev);
117 mark_inode_dirty(inode); 116 mark_inode_dirty(inode);
118 lock_kernel(); 117 lock_ufs(dir->i_sb);
119 err = ufs_add_nondir(dentry, inode); 118 err = ufs_add_nondir(dentry, inode);
120 unlock_kernel(); 119 unlock_ufs(dir->i_sb);
121 } 120 }
122 return err; 121 return err;
123} 122}
@@ -133,7 +132,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
133 if (l > sb->s_blocksize) 132 if (l > sb->s_blocksize)
134 goto out_notlocked; 133 goto out_notlocked;
135 134
136 lock_kernel(); 135 lock_ufs(dir->i_sb);
137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 136 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
138 err = PTR_ERR(inode); 137 err = PTR_ERR(inode);
139 if (IS_ERR(inode)) 138 if (IS_ERR(inode))
@@ -156,7 +155,7 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
156 155
157 err = ufs_add_nondir(dentry, inode); 156 err = ufs_add_nondir(dentry, inode);
158out: 157out:
159 unlock_kernel(); 158 unlock_ufs(dir->i_sb);
160out_notlocked: 159out_notlocked:
161 return err; 160 return err;
162 161
@@ -172,9 +171,9 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
172 struct inode *inode = old_dentry->d_inode; 171 struct inode *inode = old_dentry->d_inode;
173 int error; 172 int error;
174 173
175 lock_kernel(); 174 lock_ufs(dir->i_sb);
176 if (inode->i_nlink >= UFS_LINK_MAX) { 175 if (inode->i_nlink >= UFS_LINK_MAX) {
177 unlock_kernel(); 176 unlock_ufs(dir->i_sb);
178 return -EMLINK; 177 return -EMLINK;
179 } 178 }
180 179
@@ -183,7 +182,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
183 ihold(inode); 182 ihold(inode);
184 183
185 error = ufs_add_nondir(dentry, inode); 184 error = ufs_add_nondir(dentry, inode);
186 unlock_kernel(); 185 unlock_ufs(dir->i_sb);
187 return error; 186 return error;
188} 187}
189 188
@@ -195,7 +194,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
195 if (dir->i_nlink >= UFS_LINK_MAX) 194 if (dir->i_nlink >= UFS_LINK_MAX)
196 goto out; 195 goto out;
197 196
198 lock_kernel(); 197 lock_ufs(dir->i_sb);
199 inode_inc_link_count(dir); 198 inode_inc_link_count(dir);
200 199
201 inode = ufs_new_inode(dir, S_IFDIR|mode); 200 inode = ufs_new_inode(dir, S_IFDIR|mode);
@@ -216,7 +215,7 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
216 err = ufs_add_link(dentry, inode); 215 err = ufs_add_link(dentry, inode);
217 if (err) 216 if (err)
218 goto out_fail; 217 goto out_fail;
219 unlock_kernel(); 218 unlock_ufs(dir->i_sb);
220 219
221 d_instantiate(dentry, inode); 220 d_instantiate(dentry, inode);
222out: 221out:
@@ -228,7 +227,7 @@ out_fail:
228 iput (inode); 227 iput (inode);
229out_dir: 228out_dir:
230 inode_dec_link_count(dir); 229 inode_dec_link_count(dir);
231 unlock_kernel(); 230 unlock_ufs(dir->i_sb);
232 goto out; 231 goto out;
233} 232}
234 233
@@ -259,7 +258,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
259 struct inode * inode = dentry->d_inode; 258 struct inode * inode = dentry->d_inode;
260 int err= -ENOTEMPTY; 259 int err= -ENOTEMPTY;
261 260
262 lock_kernel(); 261 lock_ufs(dir->i_sb);
263 if (ufs_empty_dir (inode)) { 262 if (ufs_empty_dir (inode)) {
264 err = ufs_unlink(dir, dentry); 263 err = ufs_unlink(dir, dentry);
265 if (!err) { 264 if (!err) {
@@ -268,7 +267,7 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry)
268 inode_dec_link_count(dir); 267 inode_dec_link_count(dir);
269 } 268 }
270 } 269 }
271 unlock_kernel(); 270 unlock_ufs(dir->i_sb);
272 return err; 271 return err;
273} 272}
274 273
@@ -306,7 +305,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
306 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); 305 new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
307 if (!new_de) 306 if (!new_de)
308 goto out_dir; 307 goto out_dir;
309 inode_inc_link_count(old_inode);
310 ufs_set_link(new_dir, new_de, new_page, old_inode); 308 ufs_set_link(new_dir, new_de, new_page, old_inode);
311 new_inode->i_ctime = CURRENT_TIME_SEC; 309 new_inode->i_ctime = CURRENT_TIME_SEC;
312 if (dir_de) 310 if (dir_de)
@@ -318,12 +316,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
318 if (new_dir->i_nlink >= UFS_LINK_MAX) 316 if (new_dir->i_nlink >= UFS_LINK_MAX)
319 goto out_dir; 317 goto out_dir;
320 } 318 }
321 inode_inc_link_count(old_inode);
322 err = ufs_add_link(new_dentry, old_inode); 319 err = ufs_add_link(new_dentry, old_inode);
323 if (err) { 320 if (err)
324 inode_dec_link_count(old_inode);
325 goto out_dir; 321 goto out_dir;
326 }
327 if (dir_de) 322 if (dir_de)
328 inode_inc_link_count(new_dir); 323 inode_inc_link_count(new_dir);
329 } 324 }
@@ -331,12 +326,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
331 /* 326 /*
332 * Like most other Unix systems, set the ctime for inodes on a 327 * Like most other Unix systems, set the ctime for inodes on a
333 * rename. 328 * rename.
334 * inode_dec_link_count() will mark the inode dirty.
335 */ 329 */
336 old_inode->i_ctime = CURRENT_TIME_SEC; 330 old_inode->i_ctime = CURRENT_TIME_SEC;
337 331
338 ufs_delete_entry(old_dir, old_de, old_page); 332 ufs_delete_entry(old_dir, old_de, old_page);
339 inode_dec_link_count(old_inode); 333 mark_inode_dirty(old_inode);
340 334
341 if (dir_de) { 335 if (dir_de) {
342 ufs_set_link(old_inode, dir_de, dir_page, new_dir); 336 ufs_set_link(old_inode, dir_de, dir_page, new_dir);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2c61ac5d4e48..3915ade6f9a8 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -84,7 +84,6 @@
84#include <linux/blkdev.h> 84#include <linux/blkdev.h>
85#include <linux/init.h> 85#include <linux/init.h>
86#include <linux/parser.h> 86#include <linux/parser.h>
87#include <linux/smp_lock.h>
88#include <linux/buffer_head.h> 87#include <linux/buffer_head.h>
89#include <linux/vfs.h> 88#include <linux/vfs.h>
90#include <linux/log2.h> 89#include <linux/log2.h>
@@ -96,6 +95,26 @@
96#include "swab.h" 95#include "swab.h"
97#include "util.h" 96#include "util.h"
98 97
98void lock_ufs(struct super_block *sb)
99{
100#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
101 struct ufs_sb_info *sbi = UFS_SB(sb);
102
103 mutex_lock(&sbi->mutex);
104 sbi->mutex_owner = current;
105#endif
106}
107
108void unlock_ufs(struct super_block *sb)
109{
110#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
111 struct ufs_sb_info *sbi = UFS_SB(sb);
112
113 sbi->mutex_owner = NULL;
114 mutex_unlock(&sbi->mutex);
115#endif
116}
117
99static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) 118static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
100{ 119{
101 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; 120 struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -313,7 +332,6 @@ void ufs_panic (struct super_block * sb, const char * function,
313 struct ufs_super_block_first * usb1; 332 struct ufs_super_block_first * usb1;
314 va_list args; 333 va_list args;
315 334
316 lock_kernel();
317 uspi = UFS_SB(sb)->s_uspi; 335 uspi = UFS_SB(sb)->s_uspi;
318 usb1 = ubh_get_usb_first(uspi); 336 usb1 = ubh_get_usb_first(uspi);
319 337
@@ -465,9 +483,9 @@ static int ufs_parse_options (char * options, unsigned * mount_options)
465} 483}
466 484
467/* 485/*
468 * Diffrent types of UFS hold fs_cstotal in different 486 * Different types of UFS hold fs_cstotal in different
469 * places, and use diffrent data structure for it. 487 * places, and use different data structure for it.
470 * To make things simplier we just copy fs_cstotal to ufs_sb_private_info 488 * To make things simpler we just copy fs_cstotal to ufs_sb_private_info
471 */ 489 */
472static void ufs_setup_cstotal(struct super_block *sb) 490static void ufs_setup_cstotal(struct super_block *sb)
473{ 491{
@@ -521,7 +539,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
521 */ 539 */
522 size = uspi->s_cssize; 540 size = uspi->s_cssize;
523 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 541 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
524 base = space = kmalloc(size, GFP_KERNEL); 542 base = space = kmalloc(size, GFP_NOFS);
525 if (!base) 543 if (!base)
526 goto failed; 544 goto failed;
527 sbi->s_csp = (struct ufs_csum *)space; 545 sbi->s_csp = (struct ufs_csum *)space;
@@ -546,7 +564,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
546 * Read cylinder group (we read only first fragment from block 564 * Read cylinder group (we read only first fragment from block
547 * at this time) and prepare internal data structures for cg caching. 565 * at this time) and prepare internal data structures for cg caching.
548 */ 566 */
549 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_KERNEL))) 567 if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS)))
550 goto failed; 568 goto failed;
551 for (i = 0; i < uspi->s_ncg; i++) 569 for (i = 0; i < uspi->s_ncg; i++)
552 sbi->s_ucg[i] = NULL; 570 sbi->s_ucg[i] = NULL;
@@ -564,7 +582,7 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
564 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); 582 ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data);
565 } 583 }
566 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { 584 for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) {
567 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_KERNEL))) 585 if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS)))
568 goto failed; 586 goto failed;
569 sbi->s_cgno[i] = UFS_CGNO_EMPTY; 587 sbi->s_cgno[i] = UFS_CGNO_EMPTY;
570 } 588 }
@@ -646,8 +664,6 @@ static void ufs_put_super_internal(struct super_block *sb)
646 664
647 UFSD("ENTER\n"); 665 UFSD("ENTER\n");
648 666
649 lock_kernel();
650
651 ufs_put_cstotal(sb); 667 ufs_put_cstotal(sb);
652 size = uspi->s_cssize; 668 size = uspi->s_cssize;
653 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; 669 blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -676,8 +692,6 @@ static void ufs_put_super_internal(struct super_block *sb)
676 kfree (sbi->s_ucg); 692 kfree (sbi->s_ucg);
677 kfree (base); 693 kfree (base);
678 694
679 unlock_kernel();
680
681 UFSD("EXIT\n"); 695 UFSD("EXIT\n");
682} 696}
683 697
@@ -696,8 +710,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
696 unsigned maxsymlen; 710 unsigned maxsymlen;
697 int ret = -EINVAL; 711 int ret = -EINVAL;
698 712
699 lock_kernel();
700
701 uspi = NULL; 713 uspi = NULL;
702 ubh = NULL; 714 ubh = NULL;
703 flags = 0; 715 flags = 0;
@@ -718,6 +730,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
718 goto failed; 730 goto failed;
719 } 731 }
720#endif 732#endif
733 mutex_init(&sbi->mutex);
721 /* 734 /*
722 * Set default mount options 735 * Set default mount options
723 * Parse mount options 736 * Parse mount options
@@ -1165,7 +1178,6 @@ magic_found:
1165 goto failed; 1178 goto failed;
1166 1179
1167 UFSD("EXIT\n"); 1180 UFSD("EXIT\n");
1168 unlock_kernel();
1169 return 0; 1181 return 0;
1170 1182
1171dalloc_failed: 1183dalloc_failed:
@@ -1177,12 +1189,10 @@ failed:
1177 kfree(sbi); 1189 kfree(sbi);
1178 sb->s_fs_info = NULL; 1190 sb->s_fs_info = NULL;
1179 UFSD("EXIT (FAILED)\n"); 1191 UFSD("EXIT (FAILED)\n");
1180 unlock_kernel();
1181 return ret; 1192 return ret;
1182 1193
1183failed_nomem: 1194failed_nomem:
1184 UFSD("EXIT (NOMEM)\n"); 1195 UFSD("EXIT (NOMEM)\n");
1185 unlock_kernel();
1186 return -ENOMEM; 1196 return -ENOMEM;
1187} 1197}
1188 1198
@@ -1193,8 +1203,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1193 struct ufs_super_block_third * usb3; 1203 struct ufs_super_block_third * usb3;
1194 unsigned flags; 1204 unsigned flags;
1195 1205
1206 lock_ufs(sb);
1196 lock_super(sb); 1207 lock_super(sb);
1197 lock_kernel();
1198 1208
1199 UFSD("ENTER\n"); 1209 UFSD("ENTER\n");
1200 1210
@@ -1213,8 +1223,8 @@ static int ufs_sync_fs(struct super_block *sb, int wait)
1213 sb->s_dirt = 0; 1223 sb->s_dirt = 0;
1214 1224
1215 UFSD("EXIT\n"); 1225 UFSD("EXIT\n");
1216 unlock_kernel();
1217 unlock_super(sb); 1226 unlock_super(sb);
1227 unlock_ufs(sb);
1218 1228
1219 return 0; 1229 return 0;
1220} 1230}
@@ -1256,7 +1266,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1256 unsigned new_mount_opt, ufstype; 1266 unsigned new_mount_opt, ufstype;
1257 unsigned flags; 1267 unsigned flags;
1258 1268
1259 lock_kernel(); 1269 lock_ufs(sb);
1260 lock_super(sb); 1270 lock_super(sb);
1261 uspi = UFS_SB(sb)->s_uspi; 1271 uspi = UFS_SB(sb)->s_uspi;
1262 flags = UFS_SB(sb)->s_flags; 1272 flags = UFS_SB(sb)->s_flags;
@@ -1272,7 +1282,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1272 ufs_set_opt (new_mount_opt, ONERROR_LOCK); 1282 ufs_set_opt (new_mount_opt, ONERROR_LOCK);
1273 if (!ufs_parse_options (data, &new_mount_opt)) { 1283 if (!ufs_parse_options (data, &new_mount_opt)) {
1274 unlock_super(sb); 1284 unlock_super(sb);
1275 unlock_kernel(); 1285 unlock_ufs(sb);
1276 return -EINVAL; 1286 return -EINVAL;
1277 } 1287 }
1278 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { 1288 if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1280,14 +1290,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1280 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { 1290 } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
1281 printk("ufstype can't be changed during remount\n"); 1291 printk("ufstype can't be changed during remount\n");
1282 unlock_super(sb); 1292 unlock_super(sb);
1283 unlock_kernel(); 1293 unlock_ufs(sb);
1284 return -EINVAL; 1294 return -EINVAL;
1285 } 1295 }
1286 1296
1287 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1297 if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1288 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1298 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1289 unlock_super(sb); 1299 unlock_super(sb);
1290 unlock_kernel(); 1300 unlock_ufs(sb);
1291 return 0; 1301 return 0;
1292 } 1302 }
1293 1303
@@ -1313,7 +1323,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1313 printk("ufs was compiled with read-only support, " 1323 printk("ufs was compiled with read-only support, "
1314 "can't be mounted as read-write\n"); 1324 "can't be mounted as read-write\n");
1315 unlock_super(sb); 1325 unlock_super(sb);
1316 unlock_kernel(); 1326 unlock_ufs(sb);
1317 return -EINVAL; 1327 return -EINVAL;
1318#else 1328#else
1319 if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 1329 if (ufstype != UFS_MOUNT_UFSTYPE_SUN &&
@@ -1323,13 +1333,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1323 ufstype != UFS_MOUNT_UFSTYPE_UFS2) { 1333 ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
1324 printk("this ufstype is read-only supported\n"); 1334 printk("this ufstype is read-only supported\n");
1325 unlock_super(sb); 1335 unlock_super(sb);
1326 unlock_kernel(); 1336 unlock_ufs(sb);
1327 return -EINVAL; 1337 return -EINVAL;
1328 } 1338 }
1329 if (!ufs_read_cylinder_structures(sb)) { 1339 if (!ufs_read_cylinder_structures(sb)) {
1330 printk("failed during remounting\n"); 1340 printk("failed during remounting\n");
1331 unlock_super(sb); 1341 unlock_super(sb);
1332 unlock_kernel(); 1342 unlock_ufs(sb);
1333 return -EPERM; 1343 return -EPERM;
1334 } 1344 }
1335 sb->s_flags &= ~MS_RDONLY; 1345 sb->s_flags &= ~MS_RDONLY;
@@ -1337,7 +1347,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
1337 } 1347 }
1338 UFS_SB(sb)->s_mount_opt = new_mount_opt; 1348 UFS_SB(sb)->s_mount_opt = new_mount_opt;
1339 unlock_super(sb); 1349 unlock_super(sb);
1340 unlock_kernel(); 1350 unlock_ufs(sb);
1341 return 0; 1351 return 0;
1342} 1352}
1343 1353
@@ -1371,7 +1381,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1371 struct ufs_super_block_third *usb3; 1381 struct ufs_super_block_third *usb3;
1372 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 1382 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
1373 1383
1374 lock_kernel(); 1384 lock_ufs(sb);
1375 1385
1376 usb1 = ubh_get_usb_first(uspi); 1386 usb1 = ubh_get_usb_first(uspi);
1377 usb2 = ubh_get_usb_second(uspi); 1387 usb2 = ubh_get_usb_second(uspi);
@@ -1395,7 +1405,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf)
1395 buf->f_fsid.val[0] = (u32)id; 1405 buf->f_fsid.val[0] = (u32)id;
1396 buf->f_fsid.val[1] = (u32)(id >> 32); 1406 buf->f_fsid.val[1] = (u32)(id >> 32);
1397 1407
1398 unlock_kernel(); 1408 unlock_ufs(sb);
1399 1409
1400 return 0; 1410 return 0;
1401} 1411}
@@ -1405,7 +1415,7 @@ static struct kmem_cache * ufs_inode_cachep;
1405static struct inode *ufs_alloc_inode(struct super_block *sb) 1415static struct inode *ufs_alloc_inode(struct super_block *sb)
1406{ 1416{
1407 struct ufs_inode_info *ei; 1417 struct ufs_inode_info *ei;
1408 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_KERNEL); 1418 ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
1409 if (!ei) 1419 if (!ei)
1410 return NULL; 1420 return NULL;
1411 ei->vfs_inode.i_version = 1; 1421 ei->vfs_inode.i_version = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index a58f9155fc9a..5f821dbc0579 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -40,7 +40,6 @@
40#include <linux/time.h> 40#include <linux/time.h>
41#include <linux/stat.h> 41#include <linux/stat.h>
42#include <linux/string.h> 42#include <linux/string.h>
43#include <linux/smp_lock.h>
44#include <linux/buffer_head.h> 43#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 44#include <linux/blkdev.h>
46#include <linux/sched.h> 45#include <linux/sched.h>
@@ -467,7 +466,6 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
467 466
468 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); 467 block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block);
469 468
470 lock_kernel();
471 while (1) { 469 while (1) {
472 retry = ufs_trunc_direct(inode); 470 retry = ufs_trunc_direct(inode);
473 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, 471 retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK,
@@ -481,13 +479,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size)
481 break; 479 break;
482 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) 480 if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
483 ufs_sync_inode (inode); 481 ufs_sync_inode (inode);
484 blk_run_address_space(inode->i_mapping);
485 yield(); 482 yield();
486 } 483 }
487 484
488 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 485 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
489 ufsi->i_lastfrag = DIRECT_FRAGMENT; 486 ufsi->i_lastfrag = DIRECT_FRAGMENT;
490 unlock_kernel();
491 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
492out: 488out:
493 UFSD("EXIT: err %d\n", err); 489 UFSD("EXIT: err %d\n", err);
@@ -510,7 +506,9 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
510 /* XXX(truncate): truncate_setsize should be called last */ 506 /* XXX(truncate): truncate_setsize should be called last */
511 truncate_setsize(inode, attr->ia_size); 507 truncate_setsize(inode, attr->ia_size);
512 508
509 lock_ufs(inode->i_sb);
513 error = ufs_truncate(inode, old_i_size); 510 error = ufs_truncate(inode, old_i_size);
511 unlock_ufs(inode->i_sb);
514 if (error) 512 if (error)
515 return error; 513 return error;
516 } 514 }
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index c08782e1b48a..5be2755dd715 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -18,6 +18,8 @@ struct ufs_sb_info {
18 unsigned s_cgno[UFS_MAX_GROUP_LOADED]; 18 unsigned s_cgno[UFS_MAX_GROUP_LOADED];
19 unsigned short s_cg_loaded; 19 unsigned short s_cg_loaded;
20 unsigned s_mount_opt; 20 unsigned s_mount_opt;
21 struct mutex mutex;
22 struct task_struct *mutex_owner;
21}; 23};
22 24
23struct ufs_inode_info { 25struct ufs_inode_info {
@@ -109,7 +111,6 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long);
109extern int ufs_write_inode (struct inode *, struct writeback_control *); 111extern int ufs_write_inode (struct inode *, struct writeback_control *);
110extern int ufs_sync_inode (struct inode *); 112extern int ufs_sync_inode (struct inode *);
111extern void ufs_evict_inode (struct inode *); 113extern void ufs_evict_inode (struct inode *);
112extern struct buffer_head * ufs_bread (struct inode *, unsigned, int, int *);
113extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); 114extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create);
114 115
115/* namei.c */ 116/* namei.c */
@@ -154,4 +155,7 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b)
154 return do_div(b, uspi->s_fpg); 155 return do_div(b, uspi->s_fpg);
155} 156}
156 157
158extern void lock_ufs(struct super_block *sb);
159extern void unlock_ufs(struct super_block *sb);
160
157#endif /* _UFS_UFS_H */ 161#endif /* _UFS_UFS_H */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index d2c36d53fe66..95425b59ce0a 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -27,7 +27,7 @@ struct ufs_buffer_head * _ubh_bread_ (struct ufs_sb_private_info * uspi,
27 if (count > UFS_MAXFRAG) 27 if (count > UFS_MAXFRAG)
28 return NULL; 28 return NULL;
29 ubh = (struct ufs_buffer_head *) 29 ubh = (struct ufs_buffer_head *)
30 kmalloc (sizeof (struct ufs_buffer_head), GFP_KERNEL); 30 kmalloc (sizeof (struct ufs_buffer_head), GFP_NOFS);
31 if (!ubh) 31 if (!ubh)
32 return NULL; 32 return NULL;
33 ubh->fragment = fragment; 33 ubh->fragment = fragment;
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 9f8775ce381c..954175928240 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -408,7 +408,7 @@ static inline unsigned _ubh_find_next_zero_bit_(
408 for (;;) { 408 for (;;) {
409 count = min_t(unsigned int, size + offset, uspi->s_bpf); 409 count = min_t(unsigned int, size + offset, uspi->s_bpf);
410 size -= count - offset; 410 size -= count - offset;
411 pos = ext2_find_next_zero_bit (ubh->bh[base]->b_data, count, offset); 411 pos = find_next_zero_bit_le(ubh->bh[base]->b_data, count, offset);
412 if (pos < count || !size) 412 if (pos < count || !size)
413 break; 413 break;
414 base++; 414 base++;
diff --git a/fs/utimes.c b/fs/utimes.c
index 179b58690657..ba653f3dc1bc 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -95,7 +95,7 @@ static int utimes_common(struct path *path, struct timespec *times)
95 if (IS_IMMUTABLE(inode)) 95 if (IS_IMMUTABLE(inode))
96 goto mnt_drop_write_and_out; 96 goto mnt_drop_write_and_out;
97 97
98 if (!is_owner_or_cap(inode)) { 98 if (!inode_owner_or_capable(inode)) {
99 error = inode_permission(inode, MAY_WRITE); 99 error = inode_permission(inode, MAY_WRITE);
100 if (error) 100 if (error)
101 goto mnt_drop_write_and_out; 101 goto mnt_drop_write_and_out;
diff --git a/fs/xattr.c b/fs/xattr.c
index 01bb8135e14a..a19acdb81cd1 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -59,7 +59,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) 59 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
60 return -EPERM; 60 return -EPERM;
61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && 61 if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
62 (mask & MAY_WRITE) && !is_owner_or_cap(inode)) 62 (mask & MAY_WRITE) && !inode_owner_or_capable(inode))
63 return -EPERM; 63 return -EPERM;
64 } 64 }
65 65
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index faca44997099..284a7c89697e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -16,14 +16,11 @@
16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17# 17#
18 18
19EXTRA_CFLAGS += -I$(src) -I$(src)/linux-2.6 19ccflags-y := -I$(src) -I$(src)/linux-2.6
20ccflags-$(CONFIG_XFS_DEBUG) += -g
20 21
21XFS_LINUX := linux-2.6 22XFS_LINUX := linux-2.6
22 23
23ifeq ($(CONFIG_XFS_DEBUG),y)
24 EXTRA_CFLAGS += -g
25endif
26
27obj-$(CONFIG_XFS_FS) += xfs.o 24obj-$(CONFIG_XFS_FS) += xfs.o
28 25
29xfs-y += linux-2.6/xfs_trace.o 26xfs-y += linux-2.6/xfs_trace.o
@@ -105,11 +102,10 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
105 xfs_globals.o \ 102 xfs_globals.o \
106 xfs_ioctl.o \ 103 xfs_ioctl.o \
107 xfs_iops.o \ 104 xfs_iops.o \
105 xfs_message.o \
108 xfs_super.o \ 106 xfs_super.o \
109 xfs_sync.o \ 107 xfs_sync.o \
110 xfs_xattr.o) 108 xfs_xattr.o)
111 109
112# Objects in support/ 110# Objects in support/
113xfs-y += $(addprefix support/, \ 111xfs-y += support/uuid.o
114 debug.o \
115 uuid.o)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 666c9db48eb6..a907de565db3 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -23,6 +23,7 @@
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include "time.h" 24#include "time.h"
25#include "kmem.h" 25#include "kmem.h"
26#include "xfs_message.h"
26 27
27/* 28/*
28 * Greedy allocation. May fail and may return vmalloced memory. 29 * Greedy allocation. May fail and may return vmalloced memory.
@@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
56 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 57 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
57 return ptr; 58 return ptr;
58 if (!(++retries % 100)) 59 if (!(++retries % 100))
59 printk(KERN_ERR "XFS: possible memory allocation " 60 xfs_err(NULL,
60 "deadlock in %s (mode:0x%x)\n", 61 "possible memory allocation deadlock in %s (mode:0x%x)",
61 __func__, lflags); 62 __func__, lflags);
62 congestion_wait(BLK_RW_ASYNC, HZ/50); 63 congestion_wait(BLK_RW_ASYNC, HZ/50);
63 } while (1); 64 } while (1);
@@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
112 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) 113 if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
113 return ptr; 114 return ptr;
114 if (!(++retries % 100)) 115 if (!(++retries % 100))
115 printk(KERN_ERR "XFS: possible memory allocation " 116 xfs_err(NULL,
116 "deadlock in %s (mode:0x%x)\n", 117 "possible memory allocation deadlock in %s (mode:0x%x)",
117 __func__, lflags); 118 __func__, lflags);
118 congestion_wait(BLK_RW_ASYNC, HZ/50); 119 congestion_wait(BLK_RW_ASYNC, HZ/50);
119 } while (1); 120 } while (1);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ec7bbb5645b6..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -413,8 +413,7 @@ xfs_submit_ioend_bio(
413 if (xfs_ioend_new_eof(ioend)) 413 if (xfs_ioend_new_eof(ioend))
414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); 414 xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
415 415
416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? 416 submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
417 WRITE_SYNC_PLUG : WRITE, bio);
418} 417}
419 418
420STATIC struct bio * 419STATIC struct bio *
@@ -854,7 +853,7 @@ xfs_aops_discard_page(
854 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 853 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
855 goto out_invalidate; 854 goto out_invalidate;
856 855
857 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 856 xfs_alert(ip->i_mount,
858 "page discard on page %p, inode 0x%llx, offset %llu.", 857 "page discard on page %p, inode 0x%llx, offset %llu.",
859 page, ip->i_ino, offset); 858 page, ip->i_ino, offset);
860 859
@@ -872,7 +871,7 @@ xfs_aops_discard_page(
872 if (error) { 871 if (error) {
873 /* something screwed, just bail */ 872 /* something screwed, just bail */
874 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 873 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
875 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 874 xfs_alert(ip->i_mount,
876 "page discard unable to remove delalloc mapping."); 875 "page discard unable to remove delalloc mapping.");
877 } 876 }
878 break; 877 break;
@@ -1296,7 +1295,7 @@ xfs_get_blocks_direct(
1296 * If the private argument is non-NULL __xfs_get_blocks signals us that we 1295 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1297 * need to issue a transaction to convert the range from unwritten to written 1296 * need to issue a transaction to convert the range from unwritten to written
1298 * extents. In case this is regular synchronous I/O we just call xfs_end_io 1297 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1299 * to do this and we are done. But in case this was a successfull AIO 1298 * to do this and we are done. But in case this was a successful AIO
1300 * request this handler is called from interrupt context, from which we 1299 * request this handler is called from interrupt context, from which we
1301 * can't start transactions. In that case offload the I/O completion to 1300 * can't start transactions. In that case offload the I/O completion to
1302 * the workqueues we also use for buffered I/O completion. 1301 * the workqueues we also use for buffered I/O completion.
@@ -1411,7 +1410,7 @@ xfs_vm_write_failed(
1411 if (error) { 1410 if (error) {
1412 /* something screwed, just bail */ 1411 /* something screwed, just bail */
1413 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 1412 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1414 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 1413 xfs_alert(ip->i_mount,
1415 "xfs_vm_write_failed: unable to clean up ino %lld", 1414 "xfs_vm_write_failed: unable to clean up ino %lld",
1416 ip->i_ino); 1415 ip->i_ino);
1417 } 1416 }
@@ -1495,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = {
1495 .readpages = xfs_vm_readpages, 1494 .readpages = xfs_vm_readpages,
1496 .writepage = xfs_vm_writepage, 1495 .writepage = xfs_vm_writepage,
1497 .writepages = xfs_vm_writepages, 1496 .writepages = xfs_vm_writepages,
1498 .sync_page = block_sync_page,
1499 .releasepage = xfs_vm_releasepage, 1497 .releasepage = xfs_vm_releasepage,
1500 .invalidatepage = xfs_vm_invalidatepage, 1498 .invalidatepage = xfs_vm_invalidatepage,
1501 .write_begin = xfs_vm_write_begin, 1499 .write_begin = xfs_vm_write_begin,
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ac1c7e8378dd..9ef9ed2cfe2e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -94,75 +94,6 @@ xfs_buf_vmap_len(
94} 94}
95 95
96/* 96/*
97 * Page Region interfaces.
98 *
99 * For pages in filesystems where the blocksize is smaller than the
100 * pagesize, we use the page->private field (long) to hold a bitmap
101 * of uptodate regions within the page.
102 *
103 * Each such region is "bytes per page / bits per long" bytes long.
104 *
105 * NBPPR == number-of-bytes-per-page-region
106 * BTOPR == bytes-to-page-region (rounded up)
107 * BTOPRT == bytes-to-page-region-truncated (rounded down)
108 */
109#if (BITS_PER_LONG == 32)
110#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
111#elif (BITS_PER_LONG == 64)
112#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
113#else
114#error BITS_PER_LONG must be 32 or 64
115#endif
116#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
117#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
118#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
119
120STATIC unsigned long
121page_region_mask(
122 size_t offset,
123 size_t length)
124{
125 unsigned long mask;
126 int first, final;
127
128 first = BTOPR(offset);
129 final = BTOPRT(offset + length - 1);
130 first = min(first, final);
131
132 mask = ~0UL;
133 mask <<= BITS_PER_LONG - (final - first);
134 mask >>= BITS_PER_LONG - (final);
135
136 ASSERT(offset + length <= PAGE_CACHE_SIZE);
137 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
138
139 return mask;
140}
141
142STATIC void
143set_page_region(
144 struct page *page,
145 size_t offset,
146 size_t length)
147{
148 set_page_private(page,
149 page_private(page) | page_region_mask(offset, length));
150 if (page_private(page) == ~0UL)
151 SetPageUptodate(page);
152}
153
154STATIC int
155test_page_region(
156 struct page *page,
157 size_t offset,
158 size_t length)
159{
160 unsigned long mask = page_region_mask(offset, length);
161
162 return (mask && (page_private(page) & mask) == mask);
163}
164
165/*
166 * xfs_buf_lru_add - add a buffer to the LRU. 97 * xfs_buf_lru_add - add a buffer to the LRU.
167 * 98 *
168 * The LRU takes a new reference to the buffer so that it will only be freed 99 * The LRU takes a new reference to the buffer so that it will only be freed
@@ -189,7 +120,7 @@ xfs_buf_lru_add(
189 * The unlocked check is safe here because it only occurs when there are not 120 * The unlocked check is safe here because it only occurs when there are not
190 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there 121 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
191 * to optimise the shrinker removing the buffer from the LRU and calling 122 * to optimise the shrinker removing the buffer from the LRU and calling
192 * xfs_buf_free(). i.e. it removes an unneccessary round trip on the 123 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
193 * bt_lru_lock. 124 * bt_lru_lock.
194 */ 125 */
195STATIC void 126STATIC void
@@ -332,7 +263,7 @@ xfs_buf_free(
332 263
333 ASSERT(list_empty(&bp->b_lru)); 264 ASSERT(list_empty(&bp->b_lru));
334 265
335 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 266 if (bp->b_flags & _XBF_PAGES) {
336 uint i; 267 uint i;
337 268
338 if (xfs_buf_is_vmapped(bp)) 269 if (xfs_buf_is_vmapped(bp))
@@ -342,56 +273,77 @@ xfs_buf_free(
342 for (i = 0; i < bp->b_page_count; i++) { 273 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 274 struct page *page = bp->b_pages[i];
344 275
345 if (bp->b_flags & _XBF_PAGE_CACHE) 276 __free_page(page);
346 ASSERT(!PagePrivate(page));
347 page_cache_release(page);
348 } 277 }
349 } 278 } else if (bp->b_flags & _XBF_KMEM)
279 kmem_free(bp->b_addr);
350 _xfs_buf_free_pages(bp); 280 _xfs_buf_free_pages(bp);
351 xfs_buf_deallocate(bp); 281 xfs_buf_deallocate(bp);
352} 282}
353 283
354/* 284/*
355 * Finds all pages for buffer in question and builds it's page list. 285 * Allocates all the pages for buffer in question and builds it's page list.
356 */ 286 */
357STATIC int 287STATIC int
358_xfs_buf_lookup_pages( 288xfs_buf_allocate_memory(
359 xfs_buf_t *bp, 289 xfs_buf_t *bp,
360 uint flags) 290 uint flags)
361{ 291{
362 struct address_space *mapping = bp->b_target->bt_mapping;
363 size_t blocksize = bp->b_target->bt_bsize;
364 size_t size = bp->b_count_desired; 292 size_t size = bp->b_count_desired;
365 size_t nbytes, offset; 293 size_t nbytes, offset;
366 gfp_t gfp_mask = xb_to_gfp(flags); 294 gfp_t gfp_mask = xb_to_gfp(flags);
367 unsigned short page_count, i; 295 unsigned short page_count, i;
368 pgoff_t first;
369 xfs_off_t end; 296 xfs_off_t end;
370 int error; 297 int error;
371 298
299 /*
300 * for buffers that are contained within a single page, just allocate
301 * the memory from the heap - there's no need for the complexity of
302 * page arrays to keep allocation down to order 0.
303 */
304 if (bp->b_buffer_length < PAGE_SIZE) {
305 bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
306 if (!bp->b_addr) {
307 /* low memory - use alloc_page loop instead */
308 goto use_alloc_page;
309 }
310
311 if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
312 PAGE_MASK) !=
313 ((unsigned long)bp->b_addr & PAGE_MASK)) {
314 /* b_addr spans two pages - use alloc_page instead */
315 kmem_free(bp->b_addr);
316 bp->b_addr = NULL;
317 goto use_alloc_page;
318 }
319 bp->b_offset = offset_in_page(bp->b_addr);
320 bp->b_pages = bp->b_page_array;
321 bp->b_pages[0] = virt_to_page(bp->b_addr);
322 bp->b_page_count = 1;
323 bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
324 return 0;
325 }
326
327use_alloc_page:
372 end = bp->b_file_offset + bp->b_buffer_length; 328 end = bp->b_file_offset + bp->b_buffer_length;
373 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 329 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
374
375 error = _xfs_buf_get_pages(bp, page_count, flags); 330 error = _xfs_buf_get_pages(bp, page_count, flags);
376 if (unlikely(error)) 331 if (unlikely(error))
377 return error; 332 return error;
378 bp->b_flags |= _XBF_PAGE_CACHE;
379 333
380 offset = bp->b_offset; 334 offset = bp->b_offset;
381 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 335 bp->b_flags |= _XBF_PAGES;
382 336
383 for (i = 0; i < bp->b_page_count; i++) { 337 for (i = 0; i < bp->b_page_count; i++) {
384 struct page *page; 338 struct page *page;
385 uint retries = 0; 339 uint retries = 0;
386 340retry:
387 retry: 341 page = alloc_page(gfp_mask);
388 page = find_or_create_page(mapping, first + i, gfp_mask);
389 if (unlikely(page == NULL)) { 342 if (unlikely(page == NULL)) {
390 if (flags & XBF_READ_AHEAD) { 343 if (flags & XBF_READ_AHEAD) {
391 bp->b_page_count = i; 344 bp->b_page_count = i;
392 for (i = 0; i < bp->b_page_count; i++) 345 error = ENOMEM;
393 unlock_page(bp->b_pages[i]); 346 goto out_free_pages;
394 return -ENOMEM;
395 } 347 }
396 348
397 /* 349 /*
@@ -401,9 +353,8 @@ _xfs_buf_lookup_pages(
401 * handle buffer allocation failures we can't do much. 353 * handle buffer allocation failures we can't do much.
402 */ 354 */
403 if (!(++retries % 100)) 355 if (!(++retries % 100))
404 printk(KERN_ERR 356 xfs_err(NULL,
405 "XFS: possible memory allocation " 357 "possible memory allocation deadlock in %s (mode:0x%x)",
406 "deadlock in %s (mode:0x%x)\n",
407 __func__, gfp_mask); 358 __func__, gfp_mask);
408 359
409 XFS_STATS_INC(xb_page_retries); 360 XFS_STATS_INC(xb_page_retries);
@@ -413,52 +364,44 @@ _xfs_buf_lookup_pages(
413 364
414 XFS_STATS_INC(xb_page_found); 365 XFS_STATS_INC(xb_page_found);
415 366
416 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 367 nbytes = min_t(size_t, size, PAGE_SIZE - offset);
417 size -= nbytes; 368 size -= nbytes;
418
419 ASSERT(!PagePrivate(page));
420 if (!PageUptodate(page)) {
421 page_count--;
422 if (blocksize >= PAGE_CACHE_SIZE) {
423 if (flags & XBF_READ)
424 bp->b_flags |= _XBF_PAGE_LOCKED;
425 } else if (!PagePrivate(page)) {
426 if (test_page_region(page, offset, nbytes))
427 page_count++;
428 }
429 }
430
431 bp->b_pages[i] = page; 369 bp->b_pages[i] = page;
432 offset = 0; 370 offset = 0;
433 } 371 }
372 return 0;
434 373
435 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 374out_free_pages:
436 for (i = 0; i < bp->b_page_count; i++) 375 for (i = 0; i < bp->b_page_count; i++)
437 unlock_page(bp->b_pages[i]); 376 __free_page(bp->b_pages[i]);
438 }
439
440 if (page_count == bp->b_page_count)
441 bp->b_flags |= XBF_DONE;
442
443 return error; 377 return error;
444} 378}
445 379
446/* 380/*
447 * Map buffer into kernel address-space if nessecary. 381 * Map buffer into kernel address-space if necessary.
448 */ 382 */
449STATIC int 383STATIC int
450_xfs_buf_map_pages( 384_xfs_buf_map_pages(
451 xfs_buf_t *bp, 385 xfs_buf_t *bp,
452 uint flags) 386 uint flags)
453{ 387{
454 /* A single page buffer is always mappable */ 388 ASSERT(bp->b_flags & _XBF_PAGES);
455 if (bp->b_page_count == 1) { 389 if (bp->b_page_count == 1) {
390 /* A single page buffer is always mappable */
456 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 391 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
457 bp->b_flags |= XBF_MAPPED; 392 bp->b_flags |= XBF_MAPPED;
458 } else if (flags & XBF_MAPPED) { 393 } else if (flags & XBF_MAPPED) {
459 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 394 int retried = 0;
460 -1, PAGE_KERNEL); 395
461 if (unlikely(bp->b_addr == NULL)) 396 do {
397 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
398 -1, PAGE_KERNEL);
399 if (bp->b_addr)
400 break;
401 vm_unmap_aliases();
402 } while (retried++ <= 1);
403
404 if (!bp->b_addr)
462 return -ENOMEM; 405 return -ENOMEM;
463 bp->b_addr += bp->b_offset; 406 bp->b_addr += bp->b_offset;
464 bp->b_flags |= XBF_MAPPED; 407 bp->b_flags |= XBF_MAPPED;
@@ -569,9 +512,14 @@ found:
569 } 512 }
570 } 513 }
571 514
515 /*
516 * if the buffer is stale, clear all the external state associated with
517 * it. We need to keep flags such as how we allocated the buffer memory
518 * intact here.
519 */
572 if (bp->b_flags & XBF_STALE) { 520 if (bp->b_flags & XBF_STALE) {
573 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 521 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
574 bp->b_flags &= XBF_MAPPED; 522 bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
575 } 523 }
576 524
577 trace_xfs_buf_find(bp, flags, _RET_IP_); 525 trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -592,7 +540,7 @@ xfs_buf_get(
592 xfs_buf_flags_t flags) 540 xfs_buf_flags_t flags)
593{ 541{
594 xfs_buf_t *bp, *new_bp; 542 xfs_buf_t *bp, *new_bp;
595 int error = 0, i; 543 int error = 0;
596 544
597 new_bp = xfs_buf_allocate(flags); 545 new_bp = xfs_buf_allocate(flags);
598 if (unlikely(!new_bp)) 546 if (unlikely(!new_bp))
@@ -600,7 +548,7 @@ xfs_buf_get(
600 548
601 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 549 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
602 if (bp == new_bp) { 550 if (bp == new_bp) {
603 error = _xfs_buf_lookup_pages(bp, flags); 551 error = xfs_buf_allocate_memory(bp, flags);
604 if (error) 552 if (error)
605 goto no_buffer; 553 goto no_buffer;
606 } else { 554 } else {
@@ -609,14 +557,11 @@ xfs_buf_get(
609 return NULL; 557 return NULL;
610 } 558 }
611 559
612 for (i = 0; i < bp->b_page_count; i++)
613 mark_page_accessed(bp->b_pages[i]);
614
615 if (!(bp->b_flags & XBF_MAPPED)) { 560 if (!(bp->b_flags & XBF_MAPPED)) {
616 error = _xfs_buf_map_pages(bp, flags); 561 error = _xfs_buf_map_pages(bp, flags);
617 if (unlikely(error)) { 562 if (unlikely(error)) {
618 printk(KERN_WARNING "%s: failed to map pages\n", 563 xfs_warn(target->bt_mount,
619 __func__); 564 "%s: failed to map pages\n", __func__);
620 goto no_buffer; 565 goto no_buffer;
621 } 566 }
622 } 567 }
@@ -710,10 +655,7 @@ xfs_buf_readahead(
710 xfs_off_t ioff, 655 xfs_off_t ioff,
711 size_t isize) 656 size_t isize)
712{ 657{
713 struct backing_dev_info *bdi; 658 if (bdi_read_congested(target->bt_bdi))
714
715 bdi = target->bt_mapping->backing_dev_info;
716 if (bdi_read_congested(bdi))
717 return; 659 return;
718 660
719 xfs_buf_read(target, ioff, isize, 661 xfs_buf_read(target, ioff, isize,
@@ -791,10 +733,10 @@ xfs_buf_associate_memory(
791 size_t buflen; 733 size_t buflen;
792 int page_count; 734 int page_count;
793 735
794 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 736 pageaddr = (unsigned long)mem & PAGE_MASK;
795 offset = (unsigned long)mem - pageaddr; 737 offset = (unsigned long)mem - pageaddr;
796 buflen = PAGE_CACHE_ALIGN(len + offset); 738 buflen = PAGE_ALIGN(len + offset);
797 page_count = buflen >> PAGE_CACHE_SHIFT; 739 page_count = buflen >> PAGE_SHIFT;
798 740
799 /* Free any previous set of page pointers */ 741 /* Free any previous set of page pointers */
800 if (bp->b_pages) 742 if (bp->b_pages)
@@ -811,13 +753,12 @@ xfs_buf_associate_memory(
811 753
812 for (i = 0; i < bp->b_page_count; i++) { 754 for (i = 0; i < bp->b_page_count; i++) {
813 bp->b_pages[i] = mem_to_page((void *)pageaddr); 755 bp->b_pages[i] = mem_to_page((void *)pageaddr);
814 pageaddr += PAGE_CACHE_SIZE; 756 pageaddr += PAGE_SIZE;
815 } 757 }
816 758
817 bp->b_count_desired = len; 759 bp->b_count_desired = len;
818 bp->b_buffer_length = buflen; 760 bp->b_buffer_length = buflen;
819 bp->b_flags |= XBF_MAPPED; 761 bp->b_flags |= XBF_MAPPED;
820 bp->b_flags &= ~_XBF_PAGE_LOCKED;
821 762
822 return 0; 763 return 0;
823} 764}
@@ -850,8 +791,8 @@ xfs_buf_get_uncached(
850 791
851 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 792 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
852 if (unlikely(error)) { 793 if (unlikely(error)) {
853 printk(KERN_WARNING "%s: failed to map pages\n", 794 xfs_warn(target->bt_mount,
854 __func__); 795 "%s: failed to map pages\n", __func__);
855 goto fail_free_mem; 796 goto fail_free_mem;
856 } 797 }
857 798
@@ -924,20 +865,7 @@ xfs_buf_rele(
924 865
925 866
926/* 867/*
927 * Mutual exclusion on buffers. Locking model: 868 * Lock a buffer object, if it is not already locked.
928 *
929 * Buffers associated with inodes for which buffer locking
930 * is not enabled are not protected by semaphores, and are
931 * assumed to be exclusively owned by the caller. There is a
932 * spinlock in the buffer, used by the caller when concurrent
933 * access is possible.
934 */
935
936/*
937 * Locks a buffer object, if it is not already locked. Note that this in
938 * no way locks the underlying pages, so it is only useful for
939 * synchronizing concurrent use of buffer objects, not for synchronizing
940 * independent access to the underlying pages.
941 * 869 *
942 * If we come across a stale, pinned, locked buffer, we know that we are 870 * If we come across a stale, pinned, locked buffer, we know that we are
943 * being asked to lock a buffer that has been reallocated. Because it is 871 * being asked to lock a buffer that has been reallocated. Because it is
@@ -971,10 +899,7 @@ xfs_buf_lock_value(
971} 899}
972 900
973/* 901/*
974 * Locks a buffer object. 902 * Lock a buffer object.
975 * Note that this in no way locks the underlying pages, so it is only
976 * useful for synchronizing concurrent use of buffer objects, not for
977 * synchronizing independent access to the underlying pages.
978 * 903 *
979 * If we come across a stale, pinned, locked buffer, we know that we 904 * If we come across a stale, pinned, locked buffer, we know that we
980 * are being asked to lock a buffer that has been reallocated. Because 905 * are being asked to lock a buffer that has been reallocated. Because
@@ -990,8 +915,6 @@ xfs_buf_lock(
990 915
991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 916 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
992 xfs_log_force(bp->b_target->bt_mount, 0); 917 xfs_log_force(bp->b_target->bt_mount, 0);
993 if (atomic_read(&bp->b_io_remaining))
994 blk_run_address_space(bp->b_target->bt_mapping);
995 down(&bp->b_sema); 918 down(&bp->b_sema);
996 XB_SET_OWNER(bp); 919 XB_SET_OWNER(bp);
997 920
@@ -1035,9 +958,7 @@ xfs_buf_wait_unpin(
1035 set_current_state(TASK_UNINTERRUPTIBLE); 958 set_current_state(TASK_UNINTERRUPTIBLE);
1036 if (atomic_read(&bp->b_pin_count) == 0) 959 if (atomic_read(&bp->b_pin_count) == 0)
1037 break; 960 break;
1038 if (atomic_read(&bp->b_io_remaining)) 961 io_schedule();
1039 blk_run_address_space(bp->b_target->bt_mapping);
1040 schedule();
1041 } 962 }
1042 remove_wait_queue(&bp->b_waiters, &wait); 963 remove_wait_queue(&bp->b_waiters, &wait);
1043 set_current_state(TASK_RUNNING); 964 set_current_state(TASK_RUNNING);
@@ -1249,10 +1170,8 @@ _xfs_buf_ioend(
1249 xfs_buf_t *bp, 1170 xfs_buf_t *bp,
1250 int schedule) 1171 int schedule)
1251{ 1172{
1252 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1173 if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1253 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1254 xfs_buf_ioend(bp, schedule); 1174 xfs_buf_ioend(bp, schedule);
1255 }
1256} 1175}
1257 1176
1258STATIC void 1177STATIC void
@@ -1261,35 +1180,12 @@ xfs_buf_bio_end_io(
1261 int error) 1180 int error)
1262{ 1181{
1263 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1182 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1264 unsigned int blocksize = bp->b_target->bt_bsize;
1265 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1266 1183
1267 xfs_buf_ioerror(bp, -error); 1184 xfs_buf_ioerror(bp, -error);
1268 1185
1269 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1186 if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1270 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1187 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1271 1188
1272 do {
1273 struct page *page = bvec->bv_page;
1274
1275 ASSERT(!PagePrivate(page));
1276 if (unlikely(bp->b_error)) {
1277 if (bp->b_flags & XBF_READ)
1278 ClearPageUptodate(page);
1279 } else if (blocksize >= PAGE_CACHE_SIZE) {
1280 SetPageUptodate(page);
1281 } else if (!PagePrivate(page) &&
1282 (bp->b_flags & _XBF_PAGE_CACHE)) {
1283 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1284 }
1285
1286 if (--bvec >= bio->bi_io_vec)
1287 prefetchw(&bvec->bv_page->flags);
1288
1289 if (bp->b_flags & _XBF_PAGE_LOCKED)
1290 unlock_page(page);
1291 } while (bvec >= bio->bi_io_vec);
1292
1293 _xfs_buf_ioend(bp, 1); 1189 _xfs_buf_ioend(bp, 1);
1294 bio_put(bio); 1190 bio_put(bio);
1295} 1191}
@@ -1303,7 +1199,6 @@ _xfs_buf_ioapply(
1303 int offset = bp->b_offset; 1199 int offset = bp->b_offset;
1304 int size = bp->b_count_desired; 1200 int size = bp->b_count_desired;
1305 sector_t sector = bp->b_bn; 1201 sector_t sector = bp->b_bn;
1306 unsigned int blocksize = bp->b_target->bt_bsize;
1307 1202
1308 total_nr_pages = bp->b_page_count; 1203 total_nr_pages = bp->b_page_count;
1309 map_i = 0; 1204 map_i = 0;
@@ -1324,29 +1219,6 @@ _xfs_buf_ioapply(
1324 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1219 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1325 } 1220 }
1326 1221
1327 /* Special code path for reading a sub page size buffer in --
1328 * we populate up the whole page, and hence the other metadata
1329 * in the same page. This optimization is only valid when the
1330 * filesystem block size is not smaller than the page size.
1331 */
1332 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1333 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1334 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1335 (blocksize >= PAGE_CACHE_SIZE)) {
1336 bio = bio_alloc(GFP_NOIO, 1);
1337
1338 bio->bi_bdev = bp->b_target->bt_bdev;
1339 bio->bi_sector = sector - (offset >> BBSHIFT);
1340 bio->bi_end_io = xfs_buf_bio_end_io;
1341 bio->bi_private = bp;
1342
1343 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1344 size = 0;
1345
1346 atomic_inc(&bp->b_io_remaining);
1347
1348 goto submit_io;
1349 }
1350 1222
1351next_chunk: 1223next_chunk:
1352 atomic_inc(&bp->b_io_remaining); 1224 atomic_inc(&bp->b_io_remaining);
@@ -1360,8 +1232,9 @@ next_chunk:
1360 bio->bi_end_io = xfs_buf_bio_end_io; 1232 bio->bi_end_io = xfs_buf_bio_end_io;
1361 bio->bi_private = bp; 1233 bio->bi_private = bp;
1362 1234
1235
1363 for (; size && nr_pages; nr_pages--, map_i++) { 1236 for (; size && nr_pages; nr_pages--, map_i++) {
1364 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1237 int rbytes, nbytes = PAGE_SIZE - offset;
1365 1238
1366 if (nbytes > size) 1239 if (nbytes > size)
1367 nbytes = size; 1240 nbytes = size;
@@ -1376,7 +1249,6 @@ next_chunk:
1376 total_nr_pages--; 1249 total_nr_pages--;
1377 } 1250 }
1378 1251
1379submit_io:
1380 if (likely(bio->bi_size)) { 1252 if (likely(bio->bi_size)) {
1381 if (xfs_buf_is_vmapped(bp)) { 1253 if (xfs_buf_is_vmapped(bp)) {
1382 flush_kernel_vmap_range(bp->b_addr, 1254 flush_kernel_vmap_range(bp->b_addr,
@@ -1386,18 +1258,7 @@ submit_io:
1386 if (size) 1258 if (size)
1387 goto next_chunk; 1259 goto next_chunk;
1388 } else { 1260 } else {
1389 /*
1390 * if we get here, no pages were added to the bio. However,
1391 * we can't just error out here - if the pages are locked then
1392 * we have to unlock them otherwise we can hang on a later
1393 * access to the page.
1394 */
1395 xfs_buf_ioerror(bp, EIO); 1261 xfs_buf_ioerror(bp, EIO);
1396 if (bp->b_flags & _XBF_PAGE_LOCKED) {
1397 int i;
1398 for (i = 0; i < bp->b_page_count; i++)
1399 unlock_page(bp->b_pages[i]);
1400 }
1401 bio_put(bio); 1262 bio_put(bio);
1402 } 1263 }
1403} 1264}
@@ -1442,8 +1303,6 @@ xfs_buf_iowait(
1442{ 1303{
1443 trace_xfs_buf_iowait(bp, _RET_IP_); 1304 trace_xfs_buf_iowait(bp, _RET_IP_);
1444 1305
1445 if (atomic_read(&bp->b_io_remaining))
1446 blk_run_address_space(bp->b_target->bt_mapping);
1447 wait_for_completion(&bp->b_iowait); 1306 wait_for_completion(&bp->b_iowait);
1448 1307
1449 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1308 trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1461,8 +1320,8 @@ xfs_buf_offset(
1461 return XFS_BUF_PTR(bp) + offset; 1320 return XFS_BUF_PTR(bp) + offset;
1462 1321
1463 offset += bp->b_offset; 1322 offset += bp->b_offset;
1464 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1323 page = bp->b_pages[offset >> PAGE_SHIFT];
1465 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1324 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1466} 1325}
1467 1326
1468/* 1327/*
@@ -1484,9 +1343,9 @@ xfs_buf_iomove(
1484 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1343 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1485 cpoff = xfs_buf_poff(boff + bp->b_offset); 1344 cpoff = xfs_buf_poff(boff + bp->b_offset);
1486 csize = min_t(size_t, 1345 csize = min_t(size_t,
1487 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1346 PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1488 1347
1489 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1348 ASSERT(((csize + cpoff) <= PAGE_SIZE));
1490 1349
1491 switch (mode) { 1350 switch (mode) {
1492 case XBRW_ZERO: 1351 case XBRW_ZERO:
@@ -1599,7 +1458,6 @@ xfs_free_buftarg(
1599 xfs_flush_buftarg(btp, 1); 1458 xfs_flush_buftarg(btp, 1);
1600 if (mp->m_flags & XFS_MOUNT_BARRIER) 1459 if (mp->m_flags & XFS_MOUNT_BARRIER)
1601 xfs_blkdev_issue_flush(btp); 1460 xfs_blkdev_issue_flush(btp);
1602 iput(btp->bt_mapping->host);
1603 1461
1604 kthread_stop(btp->bt_task); 1462 kthread_stop(btp->bt_task);
1605 kmem_free(btp); 1463 kmem_free(btp);
@@ -1617,21 +1475,12 @@ xfs_setsize_buftarg_flags(
1617 btp->bt_smask = sectorsize - 1; 1475 btp->bt_smask = sectorsize - 1;
1618 1476
1619 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1477 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1620 printk(KERN_WARNING 1478 xfs_warn(btp->bt_mount,
1621 "XFS: Cannot set_blocksize to %u on device %s\n", 1479 "Cannot set_blocksize to %u on device %s\n",
1622 sectorsize, XFS_BUFTARG_NAME(btp)); 1480 sectorsize, XFS_BUFTARG_NAME(btp));
1623 return EINVAL; 1481 return EINVAL;
1624 } 1482 }
1625 1483
1626 if (verbose &&
1627 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1628 printk(KERN_WARNING
1629 "XFS: %u byte sectors in use on device %s. "
1630 "This is suboptimal; %u or greater is ideal.\n",
1631 sectorsize, XFS_BUFTARG_NAME(btp),
1632 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1633 }
1634
1635 return 0; 1484 return 0;
1636} 1485}
1637 1486
@@ -1646,7 +1495,7 @@ xfs_setsize_buftarg_early(
1646 struct block_device *bdev) 1495 struct block_device *bdev)
1647{ 1496{
1648 return xfs_setsize_buftarg_flags(btp, 1497 return xfs_setsize_buftarg_flags(btp,
1649 PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); 1498 PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1650} 1499}
1651 1500
1652int 1501int
@@ -1659,41 +1508,6 @@ xfs_setsize_buftarg(
1659} 1508}
1660 1509
1661STATIC int 1510STATIC int
1662xfs_mapping_buftarg(
1663 xfs_buftarg_t *btp,
1664 struct block_device *bdev)
1665{
1666 struct backing_dev_info *bdi;
1667 struct inode *inode;
1668 struct address_space *mapping;
1669 static const struct address_space_operations mapping_aops = {
1670 .sync_page = block_sync_page,
1671 .migratepage = fail_migrate_page,
1672 };
1673
1674 inode = new_inode(bdev->bd_inode->i_sb);
1675 if (!inode) {
1676 printk(KERN_WARNING
1677 "XFS: Cannot allocate mapping inode for device %s\n",
1678 XFS_BUFTARG_NAME(btp));
1679 return ENOMEM;
1680 }
1681 inode->i_ino = get_next_ino();
1682 inode->i_mode = S_IFBLK;
1683 inode->i_bdev = bdev;
1684 inode->i_rdev = bdev->bd_dev;
1685 bdi = blk_get_backing_dev_info(bdev);
1686 if (!bdi)
1687 bdi = &default_backing_dev_info;
1688 mapping = &inode->i_data;
1689 mapping->a_ops = &mapping_aops;
1690 mapping->backing_dev_info = bdi;
1691 mapping_set_gfp_mask(mapping, GFP_NOFS);
1692 btp->bt_mapping = mapping;
1693 return 0;
1694}
1695
1696STATIC int
1697xfs_alloc_delwrite_queue( 1511xfs_alloc_delwrite_queue(
1698 xfs_buftarg_t *btp, 1512 xfs_buftarg_t *btp,
1699 const char *fsname) 1513 const char *fsname)
@@ -1721,12 +1535,14 @@ xfs_alloc_buftarg(
1721 btp->bt_mount = mp; 1535 btp->bt_mount = mp;
1722 btp->bt_dev = bdev->bd_dev; 1536 btp->bt_dev = bdev->bd_dev;
1723 btp->bt_bdev = bdev; 1537 btp->bt_bdev = bdev;
1538 btp->bt_bdi = blk_get_backing_dev_info(bdev);
1539 if (!btp->bt_bdi)
1540 goto error;
1541
1724 INIT_LIST_HEAD(&btp->bt_lru); 1542 INIT_LIST_HEAD(&btp->bt_lru);
1725 spin_lock_init(&btp->bt_lru_lock); 1543 spin_lock_init(&btp->bt_lru_lock);
1726 if (xfs_setsize_buftarg_early(btp, bdev)) 1544 if (xfs_setsize_buftarg_early(btp, bdev))
1727 goto error; 1545 goto error;
1728 if (xfs_mapping_buftarg(btp, bdev))
1729 goto error;
1730 if (xfs_alloc_delwrite_queue(btp, fsname)) 1546 if (xfs_alloc_delwrite_queue(btp, fsname))
1731 goto error; 1547 goto error;
1732 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1548 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
@@ -1923,8 +1739,8 @@ xfsbufd(
1923 do { 1739 do {
1924 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1740 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1925 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1741 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1926 int count = 0;
1927 struct list_head tmp; 1742 struct list_head tmp;
1743 struct blk_plug plug;
1928 1744
1929 if (unlikely(freezing(current))) { 1745 if (unlikely(freezing(current))) {
1930 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1746 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1940,16 +1756,15 @@ xfsbufd(
1940 1756
1941 xfs_buf_delwri_split(target, &tmp, age); 1757 xfs_buf_delwri_split(target, &tmp, age);
1942 list_sort(NULL, &tmp, xfs_buf_cmp); 1758 list_sort(NULL, &tmp, xfs_buf_cmp);
1759
1760 blk_start_plug(&plug);
1943 while (!list_empty(&tmp)) { 1761 while (!list_empty(&tmp)) {
1944 struct xfs_buf *bp; 1762 struct xfs_buf *bp;
1945 bp = list_first_entry(&tmp, struct xfs_buf, b_list); 1763 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1946 list_del_init(&bp->b_list); 1764 list_del_init(&bp->b_list);
1947 xfs_bdstrat_cb(bp); 1765 xfs_bdstrat_cb(bp);
1948 count++;
1949 } 1766 }
1950 if (count) 1767 blk_finish_plug(&plug);
1951 blk_run_address_space(target->bt_mapping);
1952
1953 } while (!kthread_should_stop()); 1768 } while (!kthread_should_stop());
1954 1769
1955 return 0; 1770 return 0;
@@ -1969,6 +1784,7 @@ xfs_flush_buftarg(
1969 int pincount = 0; 1784 int pincount = 0;
1970 LIST_HEAD(tmp_list); 1785 LIST_HEAD(tmp_list);
1971 LIST_HEAD(wait_list); 1786 LIST_HEAD(wait_list);
1787 struct blk_plug plug;
1972 1788
1973 xfs_buf_runall_queues(xfsconvertd_workqueue); 1789 xfs_buf_runall_queues(xfsconvertd_workqueue);
1974 xfs_buf_runall_queues(xfsdatad_workqueue); 1790 xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1983,6 +1799,8 @@ xfs_flush_buftarg(
1983 * we do that after issuing all the IO. 1799 * we do that after issuing all the IO.
1984 */ 1800 */
1985 list_sort(NULL, &tmp_list, xfs_buf_cmp); 1801 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1802
1803 blk_start_plug(&plug);
1986 while (!list_empty(&tmp_list)) { 1804 while (!list_empty(&tmp_list)) {
1987 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); 1805 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1988 ASSERT(target == bp->b_target); 1806 ASSERT(target == bp->b_target);
@@ -1993,10 +1811,10 @@ xfs_flush_buftarg(
1993 } 1811 }
1994 xfs_bdstrat_cb(bp); 1812 xfs_bdstrat_cb(bp);
1995 } 1813 }
1814 blk_finish_plug(&plug);
1996 1815
1997 if (wait) { 1816 if (wait) {
1998 /* Expedite and wait for IO to complete. */ 1817 /* Wait for IO to complete. */
1999 blk_run_address_space(target->bt_mapping);
2000 while (!list_empty(&wait_list)) { 1818 while (!list_empty(&wait_list)) {
2001 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 1819 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2002 1820
@@ -2022,11 +1840,12 @@ xfs_buf_init(void)
2022 if (!xfslogd_workqueue) 1840 if (!xfslogd_workqueue)
2023 goto out_free_buf_zone; 1841 goto out_free_buf_zone;
2024 1842
2025 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1843 xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
2026 if (!xfsdatad_workqueue) 1844 if (!xfsdatad_workqueue)
2027 goto out_destroy_xfslogd_workqueue; 1845 goto out_destroy_xfslogd_workqueue;
2028 1846
2029 xfsconvertd_workqueue = create_workqueue("xfsconvertd"); 1847 xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
1848 WQ_MEM_RECLAIM, 1);
2030 if (!xfsconvertd_workqueue) 1849 if (!xfsconvertd_workqueue)
2031 goto out_destroy_xfsdatad_workqueue; 1850 goto out_destroy_xfsdatad_workqueue;
2032 1851
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index cbe65950e524..a9a1c4512645 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ 61#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
62 62
63/* flags used only internally */ 63/* flags used only internally */
64#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
65#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ 64#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
66#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ 65#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
66#define _XBF_KMEM (1 << 20)/* backed by heap memory */
67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
68 68
69/*
70 * Special flag for supporting metadata blocks smaller than a FSB.
71 *
72 * In this case we can have multiple xfs_buf_t on a single page and
73 * need to lock out concurrent xfs_buf_t readers as they only
74 * serialise access to the buffer.
75 *
76 * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
77 * between reads of the page. Hence we can have one thread read the
78 * page and modify it, but then race with another thread that thinks
79 * the page is not up-to-date and hence reads it again.
80 *
81 * The result is that the first modifcation to the page is lost.
82 * This sort of AGF/AGI reading race can happen when unlinking inodes
83 * that require truncation and results in the AGI unlinked list
84 * modifications being lost.
85 */
86#define _XBF_PAGE_LOCKED (1 << 22)
87
88typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
89 70
90#define XFS_BUF_FLAGS \ 71#define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
100 { XBF_LOCK, "LOCK" }, /* should never be set */\ 81 { XBF_LOCK, "LOCK" }, /* should never be set */\
101 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ 82 { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
102 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ 83 { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
103 { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
104 { _XBF_PAGES, "PAGES" }, \ 84 { _XBF_PAGES, "PAGES" }, \
105 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ 85 { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
106 { _XBF_DELWRI_Q, "DELWRI_Q" }, \ 86 { _XBF_KMEM, "KMEM" }, \
107 { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
108
109 88
110typedef enum { 89typedef enum {
111 XBT_FORCE_SLEEP = 0, 90 XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
120typedef struct xfs_buftarg { 99typedef struct xfs_buftarg {
121 dev_t bt_dev; 100 dev_t bt_dev;
122 struct block_device *bt_bdev; 101 struct block_device *bt_bdev;
123 struct address_space *bt_mapping; 102 struct backing_dev_info *bt_bdi;
124 struct xfs_mount *bt_mount; 103 struct xfs_mount *bt_mount;
125 unsigned int bt_bsize; 104 unsigned int bt_bsize;
126 unsigned int bt_sshift; 105 unsigned int bt_sshift;
@@ -139,17 +118,6 @@ typedef struct xfs_buftarg {
139 unsigned int bt_lru_nr; 118 unsigned int bt_lru_nr;
140} xfs_buftarg_t; 119} xfs_buftarg_t;
141 120
142/*
143 * xfs_buf_t: Buffer structure for pagecache-based buffers
144 *
145 * This buffer structure is used by the pagecache buffer management routines
146 * to refer to an assembly of pages forming a logical buffer.
147 *
148 * The buffer structure is used on a temporary basis only, and discarded when
149 * released. The real data storage is recorded in the pagecache. Buffers are
150 * hashed to the block device on which the file system resides.
151 */
152
153struct xfs_buf; 121struct xfs_buf;
154typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 122typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
155 123
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
152 152
153 if (!capable(CAP_SYS_ADMIN)) 153 if (!capable(CAP_SYS_ADMIN))
154 return -XFS_ERROR(EPERM); 154 return -XFS_ERROR(EPERM);
155 if (!blk_queue_discard(q))
156 return -XFS_ERROR(EOPNOTSUPP);
155 if (copy_from_user(&range, urange, sizeof(range))) 157 if (copy_from_user(&range, urange, sizeof(range)))
156 return -XFS_ERROR(EFAULT); 158 return -XFS_ERROR(EFAULT);
157 159
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fdd..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
89 * seven combinations work. The real answer is "don't use v2". 89 * seven combinations work. The real answer is "don't use v2".
90 */ 90 */
91 len = xfs_fileid_length(fileid_type); 91 len = xfs_fileid_length(fileid_type);
92 if (*max_len < len) 92 if (*max_len < len) {
93 *max_len = len;
93 return 255; 94 return 255;
95 }
94 *max_len = len; 96 *max_len = len;
95 97
96 switch (fileid_type) { 98 switch (fileid_type) {
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index a55c1b46b219..f4213ba1ff85 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -381,7 +381,7 @@ xfs_aio_write_isize_update(
381 381
382/* 382/*
383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then 383 * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
384 * part of the I/O may have been written to disk before the error occured. In 384 * part of the I/O may have been written to disk before the error occurred. In
385 * this case the on-disk file size may have been adjusted beyond the in-memory 385 * this case the on-disk file size may have been adjusted beyond the in-memory
386 * file size and now needs to be truncated back. 386 * file size and now needs to be truncated back.
387 */ 387 */
@@ -896,6 +896,7 @@ xfs_file_fallocate(
896 xfs_flock64_t bf; 896 xfs_flock64_t bf;
897 xfs_inode_t *ip = XFS_I(inode); 897 xfs_inode_t *ip = XFS_I(inode);
898 int cmd = XFS_IOC_RESVSP; 898 int cmd = XFS_IOC_RESVSP;
899 int attr_flags = XFS_ATTR_NOLOCK;
899 900
900 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 901 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
901 return -EOPNOTSUPP; 902 return -EOPNOTSUPP;
@@ -918,7 +919,10 @@ xfs_file_fallocate(
918 goto out_unlock; 919 goto out_unlock;
919 } 920 }
920 921
921 error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); 922 if (file->f_flags & O_DSYNC)
923 attr_flags |= XFS_ATTR_SYNC;
924
925 error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
922 if (error) 926 if (error)
923 goto out_unlock; 927 goto out_unlock;
924 928
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8e..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -624,6 +624,10 @@ xfs_ioc_space(
624 624
625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 625 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
626 attr_flags |= XFS_ATTR_NONBLOCK; 626 attr_flags |= XFS_ATTR_NONBLOCK;
627
628 if (filp->f_flags & O_DSYNC)
629 attr_flags |= XFS_ATTR_SYNC;
630
627 if (ioflags & IO_INVIS) 631 if (ioflags & IO_INVIS)
628 attr_flags |= XFS_ATTR_DMI; 632 attr_flags |= XFS_ATTR_DMI;
629 633
@@ -695,14 +699,19 @@ xfs_ioc_fsgeometry_v1(
695 xfs_mount_t *mp, 699 xfs_mount_t *mp,
696 void __user *arg) 700 void __user *arg)
697{ 701{
698 xfs_fsop_geom_v1_t fsgeo; 702 xfs_fsop_geom_t fsgeo;
699 int error; 703 int error;
700 704
701 error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3); 705 error = xfs_fs_geometry(mp, &fsgeo, 3);
702 if (error) 706 if (error)
703 return -error; 707 return -error;
704 708
705 if (copy_to_user(arg, &fsgeo, sizeof(fsgeo))) 709 /*
710 * Caller should have passed an argument of type
711 * xfs_fsop_geom_v1_t. This is a proper subset of the
712 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
713 */
714 if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
706 return -XFS_ERROR(EFAULT); 715 return -XFS_ERROR(EFAULT);
707 return 0; 716 return 0;
708} 717}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index bd5727852fd6..dd21784525a8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -70,7 +70,7 @@ xfs_synchronize_times(
70 70
71/* 71/*
72 * If the linux inode is valid, mark it dirty. 72 * If the linux inode is valid, mark it dirty.
73 * Used when commiting a dirty inode into a transaction so that 73 * Used when committing a dirty inode into a transaction so that
74 * the inode will get written back by the linux code 74 * the inode will get written back by the linux code
75 */ 75 */
76void 76void
@@ -102,7 +102,8 @@ xfs_mark_inode_dirty(
102STATIC int 102STATIC int
103xfs_init_security( 103xfs_init_security(
104 struct inode *inode, 104 struct inode *inode,
105 struct inode *dir) 105 struct inode *dir,
106 const struct qstr *qstr)
106{ 107{
107 struct xfs_inode *ip = XFS_I(inode); 108 struct xfs_inode *ip = XFS_I(inode);
108 size_t length; 109 size_t length;
@@ -110,7 +111,7 @@ xfs_init_security(
110 unsigned char *name; 111 unsigned char *name;
111 int error; 112 int error;
112 113
113 error = security_inode_init_security(inode, dir, (char **)&name, 114 error = security_inode_init_security(inode, dir, qstr, (char **)&name,
114 &value, &length); 115 &value, &length);
115 if (error) { 116 if (error) {
116 if (error == -EOPNOTSUPP) 117 if (error == -EOPNOTSUPP)
@@ -194,7 +195,7 @@ xfs_vn_mknod(
194 195
195 inode = VFS_I(ip); 196 inode = VFS_I(ip);
196 197
197 error = xfs_init_security(inode, dir); 198 error = xfs_init_security(inode, dir, &dentry->d_name);
198 if (unlikely(error)) 199 if (unlikely(error))
199 goto out_cleanup_inode; 200 goto out_cleanup_inode;
200 201
@@ -367,7 +368,7 @@ xfs_vn_symlink(
367 368
368 inode = VFS_I(cip); 369 inode = VFS_I(cip);
369 370
370 error = xfs_init_security(inode, dir); 371 error = xfs_init_security(inode, dir, &dentry->d_name);
371 if (unlikely(error)) 372 if (unlikely(error))
372 goto out_cleanup_inode; 373 goto out_cleanup_inode;
373 374
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 096494997747..244be9cbfe78 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -39,7 +39,6 @@
39#include <mrlock.h> 39#include <mrlock.h>
40#include <time.h> 40#include <time.h>
41 41
42#include <support/debug.h>
43#include <support/uuid.h> 42#include <support/uuid.h>
44 43
45#include <linux/semaphore.h> 44#include <linux/semaphore.h>
@@ -86,6 +85,7 @@
86#include <xfs_aops.h> 85#include <xfs_aops.h>
87#include <xfs_super.h> 86#include <xfs_super.h>
88#include <xfs_buf.h> 87#include <xfs_buf.h>
88#include <xfs_message.h>
89 89
90/* 90/*
91 * Feature macros (disable/enable) 91 * Feature macros (disable/enable)
@@ -280,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
280#define __arch_pack 280#define __arch_pack
281#endif 281#endif
282 282
283#define ASSERT_ALWAYS(expr) \
284 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
285
286#ifndef DEBUG
287#define ASSERT(expr) ((void)0)
288
289#ifndef STATIC
290# define STATIC static noinline
291#endif
292
293#else /* DEBUG */
294
295#define ASSERT(expr) \
296 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
297
298#ifndef STATIC
299# define STATIC noinline
300#endif
301
302#endif /* DEBUG */
303
283#endif /* __XFS_LINUX__ */ 304#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
new file mode 100644
index 000000000000..3ca795609113
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -0,0 +1,124 @@
1/*
2 * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_log.h"
22#include "xfs_inum.h"
23#include "xfs_trans.h"
24#include "xfs_sb.h"
25#include "xfs_ag.h"
26#include "xfs_mount.h"
27
28/*
29 * XFS logging functions
30 */
31static void
32__xfs_printk(
33 const char *level,
34 const struct xfs_mount *mp,
35 struct va_format *vaf)
36{
37 if (mp && mp->m_fsname)
38 printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
39 printk("%sXFS: %pV\n", level, vaf);
40}
41
42void xfs_printk(
43 const char *level,
44 const struct xfs_mount *mp,
45 const char *fmt, ...)
46{
47 struct va_format vaf;
48 va_list args;
49
50 va_start(args, fmt);
51
52 vaf.fmt = fmt;
53 vaf.va = &args;
54
55 __xfs_printk(level, mp, &vaf);
56 va_end(args);
57}
58
59#define define_xfs_printk_level(func, kern_level) \
60void func(const struct xfs_mount *mp, const char *fmt, ...) \
61{ \
62 struct va_format vaf; \
63 va_list args; \
64 \
65 va_start(args, fmt); \
66 \
67 vaf.fmt = fmt; \
68 vaf.va = &args; \
69 \
70 __xfs_printk(kern_level, mp, &vaf); \
71 va_end(args); \
72} \
73
74define_xfs_printk_level(xfs_emerg, KERN_EMERG);
75define_xfs_printk_level(xfs_alert, KERN_ALERT);
76define_xfs_printk_level(xfs_crit, KERN_CRIT);
77define_xfs_printk_level(xfs_err, KERN_ERR);
78define_xfs_printk_level(xfs_warn, KERN_WARNING);
79define_xfs_printk_level(xfs_notice, KERN_NOTICE);
80define_xfs_printk_level(xfs_info, KERN_INFO);
81#ifdef DEBUG
82define_xfs_printk_level(xfs_debug, KERN_DEBUG);
83#endif
84
85void
86xfs_alert_tag(
87 const struct xfs_mount *mp,
88 int panic_tag,
89 const char *fmt, ...)
90{
91 struct va_format vaf;
92 va_list args;
93 int do_panic = 0;
94
95 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
96 xfs_printk(KERN_ALERT, mp,
97 "XFS: Transforming an alert into a BUG.");
98 do_panic = 1;
99 }
100
101 va_start(args, fmt);
102
103 vaf.fmt = fmt;
104 vaf.va = &args;
105
106 __xfs_printk(KERN_ALERT, mp, &vaf);
107 va_end(args);
108
109 BUG_ON(do_panic);
110}
111
112void
113assfail(char *expr, char *file, int line)
114{
115 xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
116 expr, file, line);
117 BUG();
118}
119
120void
121xfs_hex_dump(void *p, int length)
122{
123 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
124}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
new file mode 100644
index 000000000000..f1b3fc1b6c4e
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -0,0 +1,40 @@
1#ifndef __XFS_MESSAGE_H
2#define __XFS_MESSAGE_H 1
3
4struct xfs_mount;
5
6extern void xfs_printk(const char *level, const struct xfs_mount *mp,
7 const char *fmt, ...)
8 __attribute__ ((format (printf, 3, 4)));
9extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
10 __attribute__ ((format (printf, 2, 3)));
11extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
12 __attribute__ ((format (printf, 2, 3)));
13extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
14 const char *fmt, ...)
15 __attribute__ ((format (printf, 3, 4)));
16extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
17 __attribute__ ((format (printf, 2, 3)));
18extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
19 __attribute__ ((format (printf, 2, 3)));
20extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
21 __attribute__ ((format (printf, 2, 3)));
22extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
23 __attribute__ ((format (printf, 2, 3)));
24extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
25 __attribute__ ((format (printf, 2, 3)));
26
27#ifdef DEBUG
28extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
29 __attribute__ ((format (printf, 2, 3)));
30#else
31static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
32{
33}
34#endif
35
36extern void assfail(char *expr, char *f, int l);
37
38extern void xfs_hex_dump(void *p, int length);
39
40#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 9731898083ae..b38e58d02299 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -173,6 +173,15 @@ xfs_parseargs(
173 __uint8_t iosizelog = 0; 173 __uint8_t iosizelog = 0;
174 174
175 /* 175 /*
176 * set up the mount name first so all the errors will refer to the
177 * correct device.
178 */
179 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
180 if (!mp->m_fsname)
181 return ENOMEM;
182 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
183
184 /*
176 * Copy binary VFS mount flags we are interested in. 185 * Copy binary VFS mount flags we are interested in.
177 */ 186 */
178 if (sb->s_flags & MS_RDONLY) 187 if (sb->s_flags & MS_RDONLY)
@@ -189,6 +198,7 @@ xfs_parseargs(
189 mp->m_flags |= XFS_MOUNT_BARRIER; 198 mp->m_flags |= XFS_MOUNT_BARRIER;
190 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; 199 mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
191 mp->m_flags |= XFS_MOUNT_SMALL_INUMS; 200 mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
201 mp->m_flags |= XFS_MOUNT_DELAYLOG;
192 202
193 /* 203 /*
194 * These can be overridden by the mount option parsing. 204 * These can be overridden by the mount option parsing.
@@ -207,24 +217,21 @@ xfs_parseargs(
207 217
208 if (!strcmp(this_char, MNTOPT_LOGBUFS)) { 218 if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
209 if (!value || !*value) { 219 if (!value || !*value) {
210 cmn_err(CE_WARN, 220 xfs_warn(mp, "%s option requires an argument",
211 "XFS: %s option requires an argument",
212 this_char); 221 this_char);
213 return EINVAL; 222 return EINVAL;
214 } 223 }
215 mp->m_logbufs = simple_strtoul(value, &eov, 10); 224 mp->m_logbufs = simple_strtoul(value, &eov, 10);
216 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { 225 } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
217 if (!value || !*value) { 226 if (!value || !*value) {
218 cmn_err(CE_WARN, 227 xfs_warn(mp, "%s option requires an argument",
219 "XFS: %s option requires an argument",
220 this_char); 228 this_char);
221 return EINVAL; 229 return EINVAL;
222 } 230 }
223 mp->m_logbsize = suffix_strtoul(value, &eov, 10); 231 mp->m_logbsize = suffix_strtoul(value, &eov, 10);
224 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { 232 } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
225 if (!value || !*value) { 233 if (!value || !*value) {
226 cmn_err(CE_WARN, 234 xfs_warn(mp, "%s option requires an argument",
227 "XFS: %s option requires an argument",
228 this_char); 235 this_char);
229 return EINVAL; 236 return EINVAL;
230 } 237 }
@@ -232,14 +239,12 @@ xfs_parseargs(
232 if (!mp->m_logname) 239 if (!mp->m_logname)
233 return ENOMEM; 240 return ENOMEM;
234 } else if (!strcmp(this_char, MNTOPT_MTPT)) { 241 } else if (!strcmp(this_char, MNTOPT_MTPT)) {
235 cmn_err(CE_WARN, 242 xfs_warn(mp, "%s option not allowed on this system",
236 "XFS: %s option not allowed on this system",
237 this_char); 243 this_char);
238 return EINVAL; 244 return EINVAL;
239 } else if (!strcmp(this_char, MNTOPT_RTDEV)) { 245 } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
240 if (!value || !*value) { 246 if (!value || !*value) {
241 cmn_err(CE_WARN, 247 xfs_warn(mp, "%s option requires an argument",
242 "XFS: %s option requires an argument",
243 this_char); 248 this_char);
244 return EINVAL; 249 return EINVAL;
245 } 250 }
@@ -248,8 +253,7 @@ xfs_parseargs(
248 return ENOMEM; 253 return ENOMEM;
249 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { 254 } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
250 if (!value || !*value) { 255 if (!value || !*value) {
251 cmn_err(CE_WARN, 256 xfs_warn(mp, "%s option requires an argument",
252 "XFS: %s option requires an argument",
253 this_char); 257 this_char);
254 return EINVAL; 258 return EINVAL;
255 } 259 }
@@ -257,8 +261,7 @@ xfs_parseargs(
257 iosizelog = ffs(iosize) - 1; 261 iosizelog = ffs(iosize) - 1;
258 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { 262 } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
259 if (!value || !*value) { 263 if (!value || !*value) {
260 cmn_err(CE_WARN, 264 xfs_warn(mp, "%s option requires an argument",
261 "XFS: %s option requires an argument",
262 this_char); 265 this_char);
263 return EINVAL; 266 return EINVAL;
264 } 267 }
@@ -280,16 +283,14 @@ xfs_parseargs(
280 mp->m_flags |= XFS_MOUNT_SWALLOC; 283 mp->m_flags |= XFS_MOUNT_SWALLOC;
281 } else if (!strcmp(this_char, MNTOPT_SUNIT)) { 284 } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
282 if (!value || !*value) { 285 if (!value || !*value) {
283 cmn_err(CE_WARN, 286 xfs_warn(mp, "%s option requires an argument",
284 "XFS: %s option requires an argument",
285 this_char); 287 this_char);
286 return EINVAL; 288 return EINVAL;
287 } 289 }
288 dsunit = simple_strtoul(value, &eov, 10); 290 dsunit = simple_strtoul(value, &eov, 10);
289 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { 291 } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
290 if (!value || !*value) { 292 if (!value || !*value) {
291 cmn_err(CE_WARN, 293 xfs_warn(mp, "%s option requires an argument",
292 "XFS: %s option requires an argument",
293 this_char); 294 this_char);
294 return EINVAL; 295 return EINVAL;
295 } 296 }
@@ -297,8 +298,7 @@ xfs_parseargs(
297 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { 298 } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
298 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; 299 mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
299#if !XFS_BIG_INUMS 300#if !XFS_BIG_INUMS
300 cmn_err(CE_WARN, 301 xfs_warn(mp, "%s option not allowed on this system",
301 "XFS: %s option not allowed on this system",
302 this_char); 302 this_char);
303 return EINVAL; 303 return EINVAL;
304#endif 304#endif
@@ -356,20 +356,19 @@ xfs_parseargs(
356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { 356 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG; 357 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
358 } else if (!strcmp(this_char, "ihashsize")) { 358 } else if (!strcmp(this_char, "ihashsize")) {
359 cmn_err(CE_WARN, 359 xfs_warn(mp,
360 "XFS: ihashsize no longer used, option is deprecated."); 360 "ihashsize no longer used, option is deprecated.");
361 } else if (!strcmp(this_char, "osyncisdsync")) { 361 } else if (!strcmp(this_char, "osyncisdsync")) {
362 cmn_err(CE_WARN, 362 xfs_warn(mp,
363 "XFS: osyncisdsync has no effect, option is deprecated."); 363 "osyncisdsync has no effect, option is deprecated.");
364 } else if (!strcmp(this_char, "osyncisosync")) { 364 } else if (!strcmp(this_char, "osyncisosync")) {
365 cmn_err(CE_WARN, 365 xfs_warn(mp,
366 "XFS: osyncisosync has no effect, option is deprecated."); 366 "osyncisosync has no effect, option is deprecated.");
367 } else if (!strcmp(this_char, "irixsgid")) { 367 } else if (!strcmp(this_char, "irixsgid")) {
368 cmn_err(CE_WARN, 368 xfs_warn(mp,
369 "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); 369 "irixsgid is now a sysctl(2) variable, option is deprecated.");
370 } else { 370 } else {
371 cmn_err(CE_WARN, 371 xfs_warn(mp, "unknown mount option [%s].", this_char);
372 "XFS: unknown mount option [%s].", this_char);
373 return EINVAL; 372 return EINVAL;
374 } 373 }
375 } 374 }
@@ -379,40 +378,37 @@ xfs_parseargs(
379 */ 378 */
380 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && 379 if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
381 !(mp->m_flags & XFS_MOUNT_RDONLY)) { 380 !(mp->m_flags & XFS_MOUNT_RDONLY)) {
382 cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); 381 xfs_warn(mp, "no-recovery mounts must be read-only.");
383 return EINVAL; 382 return EINVAL;
384 } 383 }
385 384
386 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { 385 if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
387 cmn_err(CE_WARN, 386 xfs_warn(mp,
388 "XFS: sunit and swidth options incompatible with the noalign option"); 387 "sunit and swidth options incompatible with the noalign option");
389 return EINVAL; 388 return EINVAL;
390 } 389 }
391 390
392#ifndef CONFIG_XFS_QUOTA 391#ifndef CONFIG_XFS_QUOTA
393 if (XFS_IS_QUOTA_RUNNING(mp)) { 392 if (XFS_IS_QUOTA_RUNNING(mp)) {
394 cmn_err(CE_WARN, 393 xfs_warn(mp, "quota support not available in this kernel.");
395 "XFS: quota support not available in this kernel.");
396 return EINVAL; 394 return EINVAL;
397 } 395 }
398#endif 396#endif
399 397
400 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && 398 if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
401 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { 399 (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
402 cmn_err(CE_WARN, 400 xfs_warn(mp, "cannot mount with both project and group quota");
403 "XFS: cannot mount with both project and group quota");
404 return EINVAL; 401 return EINVAL;
405 } 402 }
406 403
407 if ((dsunit && !dswidth) || (!dsunit && dswidth)) { 404 if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
408 cmn_err(CE_WARN, 405 xfs_warn(mp, "sunit and swidth must be specified together");
409 "XFS: sunit and swidth must be specified together");
410 return EINVAL; 406 return EINVAL;
411 } 407 }
412 408
413 if (dsunit && (dswidth % dsunit != 0)) { 409 if (dsunit && (dswidth % dsunit != 0)) {
414 cmn_err(CE_WARN, 410 xfs_warn(mp,
415 "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", 411 "stripe width (%d) must be a multiple of the stripe unit (%d)",
416 dswidth, dsunit); 412 dswidth, dsunit);
417 return EINVAL; 413 return EINVAL;
418 } 414 }
@@ -438,8 +434,7 @@ done:
438 mp->m_logbufs != 0 && 434 mp->m_logbufs != 0 &&
439 (mp->m_logbufs < XLOG_MIN_ICLOGS || 435 (mp->m_logbufs < XLOG_MIN_ICLOGS ||
440 mp->m_logbufs > XLOG_MAX_ICLOGS)) { 436 mp->m_logbufs > XLOG_MAX_ICLOGS)) {
441 cmn_err(CE_WARN, 437 xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
442 "XFS: invalid logbufs value: %d [not %d-%d]",
443 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); 438 mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
444 return XFS_ERROR(EINVAL); 439 return XFS_ERROR(EINVAL);
445 } 440 }
@@ -448,22 +443,16 @@ done:
448 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || 443 (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
449 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || 444 mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
450 !is_power_of_2(mp->m_logbsize))) { 445 !is_power_of_2(mp->m_logbsize))) {
451 cmn_err(CE_WARN, 446 xfs_warn(mp,
452 "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", 447 "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
453 mp->m_logbsize); 448 mp->m_logbsize);
454 return XFS_ERROR(EINVAL); 449 return XFS_ERROR(EINVAL);
455 } 450 }
456 451
457 mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
458 if (!mp->m_fsname)
459 return ENOMEM;
460 mp->m_fsname_len = strlen(mp->m_fsname) + 1;
461
462 if (iosizelog) { 452 if (iosizelog) {
463 if (iosizelog > XFS_MAX_IO_LOG || 453 if (iosizelog > XFS_MAX_IO_LOG ||
464 iosizelog < XFS_MIN_IO_LOG) { 454 iosizelog < XFS_MIN_IO_LOG) {
465 cmn_err(CE_WARN, 455 xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
466 "XFS: invalid log iosize: %d [not %d-%d]",
467 iosizelog, XFS_MIN_IO_LOG, 456 iosizelog, XFS_MIN_IO_LOG,
468 XFS_MAX_IO_LOG); 457 XFS_MAX_IO_LOG);
469 return XFS_ERROR(EINVAL); 458 return XFS_ERROR(EINVAL);
@@ -610,7 +599,7 @@ xfs_blkdev_get(
610 mp); 599 mp);
611 if (IS_ERR(*bdevp)) { 600 if (IS_ERR(*bdevp)) {
612 error = PTR_ERR(*bdevp); 601 error = PTR_ERR(*bdevp);
613 printk("XFS: Invalid device [%s], error=%d\n", name, error); 602 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
614 } 603 }
615 604
616 return -error; 605 return -error;
@@ -664,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp)
664 int error; 653 int error;
665 654
666 if (mp->m_logdev_targp != mp->m_ddev_targp) { 655 if (mp->m_logdev_targp != mp->m_ddev_targp) {
667 xfs_fs_cmn_err(CE_NOTE, mp, 656 xfs_notice(mp,
668 "Disabling barriers, not supported with external log device"); 657 "Disabling barriers, not supported with external log device");
669 mp->m_flags &= ~XFS_MOUNT_BARRIER; 658 mp->m_flags &= ~XFS_MOUNT_BARRIER;
670 return; 659 return;
671 } 660 }
672 661
673 if (xfs_readonly_buftarg(mp->m_ddev_targp)) { 662 if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
674 xfs_fs_cmn_err(CE_NOTE, mp, 663 xfs_notice(mp,
675 "Disabling barriers, underlying device is readonly"); 664 "Disabling barriers, underlying device is readonly");
676 mp->m_flags &= ~XFS_MOUNT_BARRIER; 665 mp->m_flags &= ~XFS_MOUNT_BARRIER;
677 return; 666 return;
678 } 667 }
679 668
680 error = xfs_barrier_test(mp); 669 error = xfs_barrier_test(mp);
681 if (error) { 670 if (error) {
682 xfs_fs_cmn_err(CE_NOTE, mp, 671 xfs_notice(mp,
683 "Disabling barriers, trial barrier write failed"); 672 "Disabling barriers, trial barrier write failed");
684 mp->m_flags &= ~XFS_MOUNT_BARRIER; 673 mp->m_flags &= ~XFS_MOUNT_BARRIER;
685 return; 674 return;
686 } 675 }
@@ -743,8 +732,8 @@ xfs_open_devices(
743 goto out_close_logdev; 732 goto out_close_logdev;
744 733
745 if (rtdev == ddev || rtdev == logdev) { 734 if (rtdev == ddev || rtdev == logdev) {
746 cmn_err(CE_WARN, 735 xfs_warn(mp,
747 "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); 736 "Cannot mount filesystem with identical rtdev and ddev/logdev.");
748 error = EINVAL; 737 error = EINVAL;
749 goto out_close_rtdev; 738 goto out_close_rtdev;
750 } 739 }
@@ -827,75 +816,6 @@ xfs_setup_devices(
827 return 0; 816 return 0;
828} 817}
829 818
830/*
831 * XFS AIL push thread support
832 */
833void
834xfsaild_wakeup(
835 struct xfs_ail *ailp,
836 xfs_lsn_t threshold_lsn)
837{
838 /* only ever move the target forwards */
839 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
840 ailp->xa_target = threshold_lsn;
841 wake_up_process(ailp->xa_task);
842 }
843}
844
845STATIC int
846xfsaild(
847 void *data)
848{
849 struct xfs_ail *ailp = data;
850 xfs_lsn_t last_pushed_lsn = 0;
851 long tout = 0; /* milliseconds */
852
853 while (!kthread_should_stop()) {
854 /*
855 * for short sleeps indicating congestion, don't allow us to
856 * get woken early. Otherwise all we do is bang on the AIL lock
857 * without making progress.
858 */
859 if (tout && tout <= 20)
860 __set_current_state(TASK_KILLABLE);
861 else
862 __set_current_state(TASK_INTERRUPTIBLE);
863 schedule_timeout(tout ?
864 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
865
866 /* swsusp */
867 try_to_freeze();
868
869 ASSERT(ailp->xa_mount->m_log);
870 if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
871 continue;
872
873 tout = xfsaild_push(ailp, &last_pushed_lsn);
874 }
875
876 return 0;
877} /* xfsaild */
878
879int
880xfsaild_start(
881 struct xfs_ail *ailp)
882{
883 ailp->xa_target = 0;
884 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
885 ailp->xa_mount->m_fsname);
886 if (IS_ERR(ailp->xa_task))
887 return -PTR_ERR(ailp->xa_task);
888 return 0;
889}
890
891void
892xfsaild_stop(
893 struct xfs_ail *ailp)
894{
895 kthread_stop(ailp->xa_task);
896}
897
898
899/* Catch misguided souls that try to use this interface on XFS */ 819/* Catch misguided souls that try to use this interface on XFS */
900STATIC struct inode * 820STATIC struct inode *
901xfs_fs_alloc_inode( 821xfs_fs_alloc_inode(
@@ -1089,7 +1009,7 @@ xfs_fs_write_inode(
1089 error = 0; 1009 error = 0;
1090 goto out_unlock; 1010 goto out_unlock;
1091 } 1011 }
1092 error = xfs_iflush(ip, 0); 1012 error = xfs_iflush(ip, SYNC_TRYLOCK);
1093 } 1013 }
1094 1014
1095 out_unlock: 1015 out_unlock:
@@ -1202,22 +1122,12 @@ xfs_fs_sync_fs(
1202 return -error; 1122 return -error;
1203 1123
1204 if (laptop_mode) { 1124 if (laptop_mode) {
1205 int prev_sync_seq = mp->m_sync_seq;
1206
1207 /* 1125 /*
1208 * The disk must be active because we're syncing. 1126 * The disk must be active because we're syncing.
1209 * We schedule xfssyncd now (now that the disk is 1127 * We schedule xfssyncd now (now that the disk is
1210 * active) instead of later (when it might not be). 1128 * active) instead of later (when it might not be).
1211 */ 1129 */
1212 wake_up_process(mp->m_sync_task); 1130 flush_delayed_work_sync(&mp->m_sync_work);
1213 /*
1214 * We have to wait for the sync iteration to complete.
1215 * If we don't, the disk activity caused by the sync
1216 * will come after the sync is completed, and that
1217 * triggers another sync from laptop mode.
1218 */
1219 wait_event(mp->m_wait_single_sync_task,
1220 mp->m_sync_seq != prev_sync_seq);
1221 } 1131 }
1222 1132
1223 return 0; 1133 return 0;
@@ -1345,8 +1255,8 @@ xfs_fs_remount(
1345 * options that we can't actually change. 1255 * options that we can't actually change.
1346 */ 1256 */
1347#if 0 1257#if 0
1348 printk(KERN_INFO 1258 xfs_info(mp,
1349 "XFS: mount option \"%s\" not supported for remount\n", p); 1259 "mount option \"%s\" not supported for remount\n", p);
1350 return -EINVAL; 1260 return -EINVAL;
1351#else 1261#else
1352 break; 1262 break;
@@ -1367,8 +1277,7 @@ xfs_fs_remount(
1367 if (mp->m_update_flags) { 1277 if (mp->m_update_flags) {
1368 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1278 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1369 if (error) { 1279 if (error) {
1370 cmn_err(CE_WARN, 1280 xfs_warn(mp, "failed to write sb changes");
1371 "XFS: failed to write sb changes");
1372 return error; 1281 return error;
1373 } 1282 }
1374 mp->m_update_flags = 0; 1283 mp->m_update_flags = 0;
@@ -1452,15 +1361,15 @@ xfs_finish_flags(
1452 mp->m_logbsize = mp->m_sb.sb_logsunit; 1361 mp->m_logbsize = mp->m_sb.sb_logsunit;
1453 } else if (mp->m_logbsize > 0 && 1362 } else if (mp->m_logbsize > 0 &&
1454 mp->m_logbsize < mp->m_sb.sb_logsunit) { 1363 mp->m_logbsize < mp->m_sb.sb_logsunit) {
1455 cmn_err(CE_WARN, 1364 xfs_warn(mp,
1456 "XFS: logbuf size must be greater than or equal to log stripe size"); 1365 "logbuf size must be greater than or equal to log stripe size");
1457 return XFS_ERROR(EINVAL); 1366 return XFS_ERROR(EINVAL);
1458 } 1367 }
1459 } else { 1368 } else {
1460 /* Fail a mount if the logbuf is larger than 32K */ 1369 /* Fail a mount if the logbuf is larger than 32K */
1461 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { 1370 if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
1462 cmn_err(CE_WARN, 1371 xfs_warn(mp,
1463 "XFS: logbuf size for version 1 logs must be 16K or 32K"); 1372 "logbuf size for version 1 logs must be 16K or 32K");
1464 return XFS_ERROR(EINVAL); 1373 return XFS_ERROR(EINVAL);
1465 } 1374 }
1466 } 1375 }
@@ -1477,8 +1386,8 @@ xfs_finish_flags(
1477 * prohibit r/w mounts of read-only filesystems 1386 * prohibit r/w mounts of read-only filesystems
1478 */ 1387 */
1479 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { 1388 if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
1480 cmn_err(CE_WARN, 1389 xfs_warn(mp,
1481 "XFS: cannot mount a read-only filesystem as read-write"); 1390 "cannot mount a read-only filesystem as read-write");
1482 return XFS_ERROR(EROFS); 1391 return XFS_ERROR(EROFS);
1483 } 1392 }
1484 1393
@@ -1502,9 +1411,6 @@ xfs_fs_fill_super(
1502 spin_lock_init(&mp->m_sb_lock); 1411 spin_lock_init(&mp->m_sb_lock);
1503 mutex_init(&mp->m_growlock); 1412 mutex_init(&mp->m_growlock);
1504 atomic_set(&mp->m_active_trans, 0); 1413 atomic_set(&mp->m_active_trans, 0);
1505 INIT_LIST_HEAD(&mp->m_sync_list);
1506 spin_lock_init(&mp->m_sync_lock);
1507 init_waitqueue_head(&mp->m_wait_single_sync_task);
1508 1414
1509 mp->m_super = sb; 1415 mp->m_super = sb;
1510 sb->s_fs_info = mp; 1416 sb->s_fs_info = mp;
@@ -1551,10 +1457,14 @@ xfs_fs_fill_super(
1551 if (error) 1457 if (error)
1552 goto out_free_sb; 1458 goto out_free_sb;
1553 1459
1554 error = xfs_mountfs(mp); 1460 /*
1555 if (error) 1461 * we must configure the block size in the superblock before we run the
1556 goto out_filestream_unmount; 1462 * full mount process as the mount process can lookup and cache inodes.
1557 1463 * For the same reason we must also initialise the syncd and register
1464 * the inode cache shrinker so that inodes can be reclaimed during
1465 * operations like a quotacheck that iterate all inodes in the
1466 * filesystem.
1467 */
1558 sb->s_magic = XFS_SB_MAGIC; 1468 sb->s_magic = XFS_SB_MAGIC;
1559 sb->s_blocksize = mp->m_sb.sb_blocksize; 1469 sb->s_blocksize = mp->m_sb.sb_blocksize;
1560 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; 1470 sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1562,6 +1472,16 @@ xfs_fs_fill_super(
1562 sb->s_time_gran = 1; 1472 sb->s_time_gran = 1;
1563 set_posix_acl_flag(sb); 1473 set_posix_acl_flag(sb);
1564 1474
1475 error = xfs_syncd_init(mp);
1476 if (error)
1477 goto out_filestream_unmount;
1478
1479 xfs_inode_shrinker_register(mp);
1480
1481 error = xfs_mountfs(mp);
1482 if (error)
1483 goto out_syncd_stop;
1484
1565 root = igrab(VFS_I(mp->m_rootip)); 1485 root = igrab(VFS_I(mp->m_rootip));
1566 if (!root) { 1486 if (!root) {
1567 error = ENOENT; 1487 error = ENOENT;
@@ -1577,14 +1497,11 @@ xfs_fs_fill_super(
1577 goto fail_vnrele; 1497 goto fail_vnrele;
1578 } 1498 }
1579 1499
1580 error = xfs_syncd_init(mp);
1581 if (error)
1582 goto fail_vnrele;
1583
1584 xfs_inode_shrinker_register(mp);
1585
1586 return 0; 1500 return 0;
1587 1501
1502 out_syncd_stop:
1503 xfs_inode_shrinker_unregister(mp);
1504 xfs_syncd_stop(mp);
1588 out_filestream_unmount: 1505 out_filestream_unmount:
1589 xfs_filestream_unmount(mp); 1506 xfs_filestream_unmount(mp);
1590 out_free_sb: 1507 out_free_sb:
@@ -1608,6 +1525,9 @@ xfs_fs_fill_super(
1608 } 1525 }
1609 1526
1610 fail_unmount: 1527 fail_unmount:
1528 xfs_inode_shrinker_unregister(mp);
1529 xfs_syncd_stop(mp);
1530
1611 /* 1531 /*
1612 * Blow away any referenced inode in the filestreams cache. 1532 * Blow away any referenced inode in the filestreams cache.
1613 * This can and will cause log traffic as inodes go inactive 1533 * This can and will cause log traffic as inodes go inactive
@@ -1797,6 +1717,38 @@ xfs_destroy_zones(void)
1797} 1717}
1798 1718
1799STATIC int __init 1719STATIC int __init
1720xfs_init_workqueues(void)
1721{
1722 /*
1723 * max_active is set to 8 to give enough concurency to allow
1724 * multiple work operations on each CPU to run. This allows multiple
1725 * filesystems to be running sync work concurrently, and scales with
1726 * the number of CPUs in the system.
1727 */
1728 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1729 if (!xfs_syncd_wq)
1730 goto out;
1731
1732 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1733 if (!xfs_ail_wq)
1734 goto out_destroy_syncd;
1735
1736 return 0;
1737
1738out_destroy_syncd:
1739 destroy_workqueue(xfs_syncd_wq);
1740out:
1741 return -ENOMEM;
1742}
1743
1744STATIC void
1745xfs_destroy_workqueues(void)
1746{
1747 destroy_workqueue(xfs_ail_wq);
1748 destroy_workqueue(xfs_syncd_wq);
1749}
1750
1751STATIC int __init
1800init_xfs_fs(void) 1752init_xfs_fs(void)
1801{ 1753{
1802 int error; 1754 int error;
@@ -1811,10 +1763,14 @@ init_xfs_fs(void)
1811 if (error) 1763 if (error)
1812 goto out; 1764 goto out;
1813 1765
1814 error = xfs_mru_cache_init(); 1766 error = xfs_init_workqueues();
1815 if (error) 1767 if (error)
1816 goto out_destroy_zones; 1768 goto out_destroy_zones;
1817 1769
1770 error = xfs_mru_cache_init();
1771 if (error)
1772 goto out_destroy_wq;
1773
1818 error = xfs_filestream_init(); 1774 error = xfs_filestream_init();
1819 if (error) 1775 if (error)
1820 goto out_mru_cache_uninit; 1776 goto out_mru_cache_uninit;
@@ -1831,6 +1787,10 @@ init_xfs_fs(void)
1831 if (error) 1787 if (error)
1832 goto out_cleanup_procfs; 1788 goto out_cleanup_procfs;
1833 1789
1790 error = xfs_init_workqueues();
1791 if (error)
1792 goto out_sysctl_unregister;
1793
1834 vfs_initquota(); 1794 vfs_initquota();
1835 1795
1836 error = register_filesystem(&xfs_fs_type); 1796 error = register_filesystem(&xfs_fs_type);
@@ -1848,6 +1808,8 @@ init_xfs_fs(void)
1848 xfs_filestream_uninit(); 1808 xfs_filestream_uninit();
1849 out_mru_cache_uninit: 1809 out_mru_cache_uninit:
1850 xfs_mru_cache_uninit(); 1810 xfs_mru_cache_uninit();
1811 out_destroy_wq:
1812 xfs_destroy_workqueues();
1851 out_destroy_zones: 1813 out_destroy_zones:
1852 xfs_destroy_zones(); 1814 xfs_destroy_zones();
1853 out: 1815 out:
@@ -1864,6 +1826,7 @@ exit_xfs_fs(void)
1864 xfs_buf_terminate(); 1826 xfs_buf_terminate();
1865 xfs_filestream_uninit(); 1827 xfs_filestream_uninit();
1866 xfs_mru_cache_uninit(); 1828 xfs_mru_cache_uninit();
1829 xfs_destroy_workqueues();
1867 xfs_destroy_zones(); 1830 xfs_destroy_zones();
1868} 1831}
1869 1832
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index e22f0057d21f..e4f9c1b0836c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
22#include "xfs_log.h" 22#include "xfs_log.h"
23#include "xfs_inum.h" 23#include "xfs_inum.h"
24#include "xfs_trans.h" 24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
25#include "xfs_sb.h" 26#include "xfs_sb.h"
26#include "xfs_ag.h" 27#include "xfs_ag.h"
27#include "xfs_mount.h" 28#include "xfs_mount.h"
@@ -39,6 +40,8 @@
39#include <linux/kthread.h> 40#include <linux/kthread.h>
40#include <linux/freezer.h> 41#include <linux/freezer.h>
41 42
43struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
44
42/* 45/*
43 * The inode lookup is done in batches to keep the amount of lock traffic and 46 * The inode lookup is done in batches to keep the amount of lock traffic and
44 * radix tree lookups to a minimum. The batch size is a trade off between 47 * radix tree lookups to a minimum. The batch size is a trade off between
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
401/* 404/*
402 * Second stage of a quiesce. The data is already synced, now we have to take 405 * Second stage of a quiesce. The data is already synced, now we have to take
403 * care of the metadata. New transactions are already blocked, so we need to 406 * care of the metadata. New transactions are already blocked, so we need to
404 * wait for any remaining transactions to drain out before proceding. 407 * wait for any remaining transactions to drain out before proceeding.
405 */ 408 */
406void 409void
407xfs_quiesce_attr( 410xfs_quiesce_attr(
@@ -425,69 +428,18 @@ xfs_quiesce_attr(
425 /* Push the superblock and write an unmount record */ 428 /* Push the superblock and write an unmount record */
426 error = xfs_log_sbcount(mp, 1); 429 error = xfs_log_sbcount(mp, 1);
427 if (error) 430 if (error)
428 xfs_fs_cmn_err(CE_WARN, mp, 431 xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
429 "xfs_attr_quiesce: failed to log sb changes. "
430 "Frozen image may not be consistent."); 432 "Frozen image may not be consistent.");
431 xfs_log_unmount_write(mp); 433 xfs_log_unmount_write(mp);
432 xfs_unmountfs_writesb(mp); 434 xfs_unmountfs_writesb(mp);
433} 435}
434 436
435/* 437static void
436 * Enqueue a work item to be picked up by the vfs xfssyncd thread. 438xfs_syncd_queue_sync(
437 * Doing this has two advantages: 439 struct xfs_mount *mp)
438 * - It saves on stack space, which is tight in certain situations
439 * - It can be used (with care) as a mechanism to avoid deadlocks.
440 * Flushing while allocating in a full filesystem requires both.
441 */
442STATIC void
443xfs_syncd_queue_work(
444 struct xfs_mount *mp,
445 void *data,
446 void (*syncer)(struct xfs_mount *, void *),
447 struct completion *completion)
448{
449 struct xfs_sync_work *work;
450
451 work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
452 INIT_LIST_HEAD(&work->w_list);
453 work->w_syncer = syncer;
454 work->w_data = data;
455 work->w_mount = mp;
456 work->w_completion = completion;
457 spin_lock(&mp->m_sync_lock);
458 list_add_tail(&work->w_list, &mp->m_sync_list);
459 spin_unlock(&mp->m_sync_lock);
460 wake_up_process(mp->m_sync_task);
461}
462
463/*
464 * Flush delayed allocate data, attempting to free up reserved space
465 * from existing allocations. At this point a new allocation attempt
466 * has failed with ENOSPC and we are in the process of scratching our
467 * heads, looking about for more room...
468 */
469STATIC void
470xfs_flush_inodes_work(
471 struct xfs_mount *mp,
472 void *arg)
473{
474 struct inode *inode = arg;
475 xfs_sync_data(mp, SYNC_TRYLOCK);
476 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
477 iput(inode);
478}
479
480void
481xfs_flush_inodes(
482 xfs_inode_t *ip)
483{ 440{
484 struct inode *inode = VFS_I(ip); 441 queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
485 DECLARE_COMPLETION_ONSTACK(completion); 442 msecs_to_jiffies(xfs_syncd_centisecs * 10));
486
487 igrab(inode);
488 xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
489 wait_for_completion(&completion);
490 xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
491} 443}
492 444
493/* 445/*
@@ -497,9 +449,10 @@ xfs_flush_inodes(
497 */ 449 */
498STATIC void 450STATIC void
499xfs_sync_worker( 451xfs_sync_worker(
500 struct xfs_mount *mp, 452 struct work_struct *work)
501 void *unused)
502{ 453{
454 struct xfs_mount *mp = container_of(to_delayed_work(work),
455 struct xfs_mount, m_sync_work);
503 int error; 456 int error;
504 457
505 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { 458 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -509,73 +462,106 @@ xfs_sync_worker(
509 error = xfs_fs_log_dummy(mp); 462 error = xfs_fs_log_dummy(mp);
510 else 463 else
511 xfs_log_force(mp, 0); 464 xfs_log_force(mp, 0);
512 xfs_reclaim_inodes(mp, 0);
513 error = xfs_qm_sync(mp, SYNC_TRYLOCK); 465 error = xfs_qm_sync(mp, SYNC_TRYLOCK);
466
467 /* start pushing all the metadata that is currently dirty */
468 xfs_ail_push_all(mp->m_ail);
514 } 469 }
515 mp->m_sync_seq++; 470
516 wake_up(&mp->m_wait_single_sync_task); 471 /* queue us up again */
472 xfs_syncd_queue_sync(mp);
517} 473}
518 474
519STATIC int 475/*
520xfssyncd( 476 * Queue a new inode reclaim pass if there are reclaimable inodes and there
521 void *arg) 477 * isn't a reclaim pass already in progress. By default it runs every 5s based
478 * on the xfs syncd work default of 30s. Perhaps this should have it's own
479 * tunable, but that can be done if this method proves to be ineffective or too
480 * aggressive.
481 */
482static void
483xfs_syncd_queue_reclaim(
484 struct xfs_mount *mp)
522{ 485{
523 struct xfs_mount *mp = arg;
524 long timeleft;
525 xfs_sync_work_t *work, *n;
526 LIST_HEAD (tmp);
527
528 set_freezable();
529 timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
530 for (;;) {
531 if (list_empty(&mp->m_sync_list))
532 timeleft = schedule_timeout_interruptible(timeleft);
533 /* swsusp */
534 try_to_freeze();
535 if (kthread_should_stop() && list_empty(&mp->m_sync_list))
536 break;
537 486
538 spin_lock(&mp->m_sync_lock); 487 /*
539 /* 488 * We can have inodes enter reclaim after we've shut down the syncd
540 * We can get woken by laptop mode, to do a sync - 489 * workqueue during unmount, so don't allow reclaim work to be queued
541 * that's the (only!) case where the list would be 490 * during unmount.
542 * empty with time remaining. 491 */
543 */ 492 if (!(mp->m_super->s_flags & MS_ACTIVE))
544 if (!timeleft || list_empty(&mp->m_sync_list)) { 493 return;
545 if (!timeleft)
546 timeleft = xfs_syncd_centisecs *
547 msecs_to_jiffies(10);
548 INIT_LIST_HEAD(&mp->m_sync_work.w_list);
549 list_add_tail(&mp->m_sync_work.w_list,
550 &mp->m_sync_list);
551 }
552 list_splice_init(&mp->m_sync_list, &tmp);
553 spin_unlock(&mp->m_sync_lock);
554 494
555 list_for_each_entry_safe(work, n, &tmp, w_list) { 495 rcu_read_lock();
556 (*work->w_syncer)(mp, work->w_data); 496 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
557 list_del(&work->w_list); 497 queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
558 if (work == &mp->m_sync_work) 498 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
559 continue;
560 if (work->w_completion)
561 complete(work->w_completion);
562 kmem_free(work);
563 }
564 } 499 }
500 rcu_read_unlock();
501}
565 502
566 return 0; 503/*
504 * This is a fast pass over the inode cache to try to get reclaim moving on as
505 * many inodes as possible in a short period of time. It kicks itself every few
506 * seconds, as well as being kicked by the inode cache shrinker when memory
507 * goes low. It scans as quickly as possible avoiding locked inodes or those
508 * already being flushed, and once done schedules a future pass.
509 */
510STATIC void
511xfs_reclaim_worker(
512 struct work_struct *work)
513{
514 struct xfs_mount *mp = container_of(to_delayed_work(work),
515 struct xfs_mount, m_reclaim_work);
516
517 xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
518 xfs_syncd_queue_reclaim(mp);
519}
520
521/*
522 * Flush delayed allocate data, attempting to free up reserved space
523 * from existing allocations. At this point a new allocation attempt
524 * has failed with ENOSPC and we are in the process of scratching our
525 * heads, looking about for more room.
526 *
527 * Queue a new data flush if there isn't one already in progress and
528 * wait for completion of the flush. This means that we only ever have one
529 * inode flush in progress no matter how many ENOSPC events are occurring and
530 * so will prevent the system from bogging down due to every concurrent
531 * ENOSPC event scanning all the active inodes in the system for writeback.
532 */
533void
534xfs_flush_inodes(
535 struct xfs_inode *ip)
536{
537 struct xfs_mount *mp = ip->i_mount;
538
539 queue_work(xfs_syncd_wq, &mp->m_flush_work);
540 flush_work_sync(&mp->m_flush_work);
541}
542
543STATIC void
544xfs_flush_worker(
545 struct work_struct *work)
546{
547 struct xfs_mount *mp = container_of(work,
548 struct xfs_mount, m_flush_work);
549
550 xfs_sync_data(mp, SYNC_TRYLOCK);
551 xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
567} 552}
568 553
569int 554int
570xfs_syncd_init( 555xfs_syncd_init(
571 struct xfs_mount *mp) 556 struct xfs_mount *mp)
572{ 557{
573 mp->m_sync_work.w_syncer = xfs_sync_worker; 558 INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
574 mp->m_sync_work.w_mount = mp; 559 INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
575 mp->m_sync_work.w_completion = NULL; 560 INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
576 mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); 561
577 if (IS_ERR(mp->m_sync_task)) 562 xfs_syncd_queue_sync(mp);
578 return -PTR_ERR(mp->m_sync_task); 563 xfs_syncd_queue_reclaim(mp);
564
579 return 0; 565 return 0;
580} 566}
581 567
@@ -583,7 +569,9 @@ void
583xfs_syncd_stop( 569xfs_syncd_stop(
584 struct xfs_mount *mp) 570 struct xfs_mount *mp)
585{ 571{
586 kthread_stop(mp->m_sync_task); 572 cancel_delayed_work_sync(&mp->m_sync_work);
573 cancel_delayed_work_sync(&mp->m_reclaim_work);
574 cancel_work_sync(&mp->m_flush_work);
587} 575}
588 576
589void 577void
@@ -602,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
602 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 590 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
603 XFS_ICI_RECLAIM_TAG); 591 XFS_ICI_RECLAIM_TAG);
604 spin_unlock(&ip->i_mount->m_perag_lock); 592 spin_unlock(&ip->i_mount->m_perag_lock);
593
594 /* schedule periodic background inode reclaim */
595 xfs_syncd_queue_reclaim(ip->i_mount);
596
605 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 597 trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
606 -1, _RET_IP_); 598 -1, _RET_IP_);
607 } 599 }
@@ -762,8 +754,10 @@ xfs_reclaim_inode(
762 struct xfs_perag *pag, 754 struct xfs_perag *pag,
763 int sync_mode) 755 int sync_mode)
764{ 756{
765 int error = 0; 757 int error;
766 758
759restart:
760 error = 0;
767 xfs_ilock(ip, XFS_ILOCK_EXCL); 761 xfs_ilock(ip, XFS_ILOCK_EXCL);
768 if (!xfs_iflock_nowait(ip)) { 762 if (!xfs_iflock_nowait(ip)) {
769 if (!(sync_mode & SYNC_WAIT)) 763 if (!(sync_mode & SYNC_WAIT))
@@ -789,9 +783,31 @@ xfs_reclaim_inode(
789 if (xfs_inode_clean(ip)) 783 if (xfs_inode_clean(ip))
790 goto reclaim; 784 goto reclaim;
791 785
792 /* Now we have an inode that needs flushing */ 786 /*
793 error = xfs_iflush(ip, sync_mode); 787 * Now we have an inode that needs flushing.
788 *
789 * We do a nonblocking flush here even if we are doing a SYNC_WAIT
790 * reclaim as we can deadlock with inode cluster removal.
791 * xfs_ifree_cluster() can lock the inode buffer before it locks the
792 * ip->i_lock, and we are doing the exact opposite here. As a result,
793 * doing a blocking xfs_itobp() to get the cluster buffer will result
794 * in an ABBA deadlock with xfs_ifree_cluster().
795 *
796 * As xfs_ifree_cluser() must gather all inodes that are active in the
797 * cache to mark them stale, if we hit this case we don't actually want
798 * to do IO here - we want the inode marked stale so we can simply
799 * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
800 * just unlock the inode, back off and try again. Hopefully the next
801 * pass through will see the stale flag set on the inode.
802 */
803 error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
794 if (sync_mode & SYNC_WAIT) { 804 if (sync_mode & SYNC_WAIT) {
805 if (error == EAGAIN) {
806 xfs_iunlock(ip, XFS_ILOCK_EXCL);
807 /* backoff longer than in xfs_ifree_cluster */
808 delay(2);
809 goto restart;
810 }
795 xfs_iflock(ip); 811 xfs_iflock(ip);
796 goto reclaim; 812 goto reclaim;
797 } 813 }
@@ -806,7 +822,7 @@ xfs_reclaim_inode(
806 * pass on the error. 822 * pass on the error.
807 */ 823 */
808 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { 824 if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
809 xfs_fs_cmn_err(CE_WARN, ip->i_mount, 825 xfs_warn(ip->i_mount,
810 "inode 0x%llx background reclaim flush failed with %d", 826 "inode 0x%llx background reclaim flush failed with %d",
811 (long long)ip->i_ino, error); 827 (long long)ip->i_ino, error);
812 } 828 }
@@ -994,7 +1010,13 @@ xfs_reclaim_inodes(
994} 1010}
995 1011
996/* 1012/*
997 * Shrinker infrastructure. 1013 * Inode cache shrinker.
1014 *
1015 * When called we make sure that there is a background (fast) inode reclaim in
1016 * progress, while we will throttle the speed of reclaim via doiing synchronous
1017 * reclaim of inodes. That means if we come across dirty inodes, we wait for
1018 * them to be cleaned, which we hope will not be very long due to the
1019 * background walker having already kicked the IO off on those dirty inodes.
998 */ 1020 */
999static int 1021static int
1000xfs_reclaim_inode_shrink( 1022xfs_reclaim_inode_shrink(
@@ -1009,10 +1031,15 @@ xfs_reclaim_inode_shrink(
1009 1031
1010 mp = container_of(shrink, struct xfs_mount, m_inode_shrink); 1032 mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
1011 if (nr_to_scan) { 1033 if (nr_to_scan) {
1034 /* kick background reclaimer and push the AIL */
1035 xfs_syncd_queue_reclaim(mp);
1036 xfs_ail_push_all(mp->m_ail);
1037
1012 if (!(gfp_mask & __GFP_FS)) 1038 if (!(gfp_mask & __GFP_FS))
1013 return -1; 1039 return -1;
1014 1040
1015 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); 1041 xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
1042 &nr_to_scan);
1016 /* terminate if we don't exhaust the scan */ 1043 /* terminate if we don't exhaust the scan */
1017 if (nr_to_scan > 0) 1044 if (nr_to_scan > 0)
1018 return -1; 1045 return -1;
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 32ba6628290c..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */ 32#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ 33#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
34 34
35extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
36
35int xfs_syncd_init(struct xfs_mount *mp); 37int xfs_syncd_init(struct xfs_mount *mp);
36void xfs_syncd_stop(struct xfs_mount *mp); 38void xfs_syncd_stop(struct xfs_mount *mp);
37 39
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index ee3cee097e7e..ee2d2adaa438 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler(
37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); 37 ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
38 38
39 if (!ret && write && *valp) { 39 if (!ret && write && *valp) {
40 printk("XFS Clearing xfsstats\n"); 40 xfs_notice(NULL, "Clearing xfsstats");
41 for_each_possible_cpu(c) { 41 for_each_possible_cpu(c) {
42 preempt_disable(); 42 preempt_disable();
43 /* save vn_active, it's a universal truth! */ 43 /* save vn_active, it's a universal truth! */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index d22aa3103106..6fa214603819 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -544,9 +544,10 @@ xfs_qm_dqtobp(
544 /* 544 /*
545 * A simple sanity check in case we got a corrupted dquot... 545 * A simple sanity check in case we got a corrupted dquot...
546 */ 546 */
547 if (xfs_qm_dqcheck(ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, 547 error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), 548 flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
549 "dqtobp")) { 549 "dqtobp");
550 if (error) {
550 if (!(flags & XFS_QMOPT_DQREPAIR)) { 551 if (!(flags & XFS_QMOPT_DQREPAIR)) {
551 xfs_trans_brelse(tp, bp); 552 xfs_trans_brelse(tp, bp);
552 return XFS_ERROR(EIO); 553 return XFS_ERROR(EIO);
@@ -599,7 +600,7 @@ xfs_qm_dqread(
599 600
600 /* 601 /*
601 * Reservation counters are defined as reservation plus current usage 602 * Reservation counters are defined as reservation plus current usage
602 * to avoid having to add everytime. 603 * to avoid having to add every time.
603 */ 604 */
604 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount); 605 dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
605 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount); 606 dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
@@ -827,7 +828,7 @@ xfs_qm_dqget(
827 if (xfs_do_dqerror) { 828 if (xfs_do_dqerror) {
828 if ((xfs_dqerror_target == mp->m_ddev_targp) && 829 if ((xfs_dqerror_target == mp->m_ddev_targp) &&
829 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) { 830 (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
830 cmn_err(CE_DEBUG, "Returning error in dqget"); 831 xfs_debug(mp, "Returning error in dqget");
831 return (EIO); 832 return (EIO);
832 } 833 }
833 } 834 }
@@ -1207,8 +1208,9 @@ xfs_qm_dqflush(
1207 /* 1208 /*
1208 * A simple sanity check in case we got a corrupted dquot.. 1209 * A simple sanity check in case we got a corrupted dquot..
1209 */ 1210 */
1210 if (xfs_qm_dqcheck(&dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 1211 error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
1211 XFS_QMOPT_DOWARN, "dqflush (incore copy)")) { 1212 XFS_QMOPT_DOWARN, "dqflush (incore copy)");
1213 if (error) {
1212 xfs_buf_relse(bp); 1214 xfs_buf_relse(bp);
1213 xfs_dqfunlock(dqp); 1215 xfs_dqfunlock(dqp);
1214 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1216 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1391,8 +1393,8 @@ xfs_qm_dqpurge(
1391 */ 1393 */
1392 error = xfs_qm_dqflush(dqp, SYNC_WAIT); 1394 error = xfs_qm_dqflush(dqp, SYNC_WAIT);
1393 if (error) 1395 if (error)
1394 xfs_fs_cmn_err(CE_WARN, mp, 1396 xfs_warn(mp, "%s: dquot %p flush failed",
1395 "xfs_qm_dqpurge: dquot %p flush failed", dqp); 1397 __func__, dqp);
1396 xfs_dqflock(dqp); 1398 xfs_dqflock(dqp);
1397 } 1399 }
1398 ASSERT(atomic_read(&dqp->q_pincount) == 0); 1400 ASSERT(atomic_read(&dqp->q_pincount) == 0);
@@ -1425,36 +1427,38 @@ xfs_qm_dqpurge(
1425void 1427void
1426xfs_qm_dqprint(xfs_dquot_t *dqp) 1428xfs_qm_dqprint(xfs_dquot_t *dqp)
1427{ 1429{
1428 cmn_err(CE_DEBUG, "-----------KERNEL DQUOT----------------"); 1430 struct xfs_mount *mp = dqp->q_mount;
1429 cmn_err(CE_DEBUG, "---- dquotID = %d", 1431
1432 xfs_debug(mp, "-----------KERNEL DQUOT----------------");
1433 xfs_debug(mp, "---- dquotID = %d",
1430 (int)be32_to_cpu(dqp->q_core.d_id)); 1434 (int)be32_to_cpu(dqp->q_core.d_id));
1431 cmn_err(CE_DEBUG, "---- type = %s", DQFLAGTO_TYPESTR(dqp)); 1435 xfs_debug(mp, "---- type = %s", DQFLAGTO_TYPESTR(dqp));
1432 cmn_err(CE_DEBUG, "---- fs = 0x%p", dqp->q_mount); 1436 xfs_debug(mp, "---- fs = 0x%p", dqp->q_mount);
1433 cmn_err(CE_DEBUG, "---- blkno = 0x%x", (int) dqp->q_blkno); 1437 xfs_debug(mp, "---- blkno = 0x%x", (int) dqp->q_blkno);
1434 cmn_err(CE_DEBUG, "---- boffset = 0x%x", (int) dqp->q_bufoffset); 1438 xfs_debug(mp, "---- boffset = 0x%x", (int) dqp->q_bufoffset);
1435 cmn_err(CE_DEBUG, "---- blkhlimit = %Lu (0x%x)", 1439 xfs_debug(mp, "---- blkhlimit = %Lu (0x%x)",
1436 be64_to_cpu(dqp->q_core.d_blk_hardlimit), 1440 be64_to_cpu(dqp->q_core.d_blk_hardlimit),
1437 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit)); 1441 (int)be64_to_cpu(dqp->q_core.d_blk_hardlimit));
1438 cmn_err(CE_DEBUG, "---- blkslimit = %Lu (0x%x)", 1442 xfs_debug(mp, "---- blkslimit = %Lu (0x%x)",
1439 be64_to_cpu(dqp->q_core.d_blk_softlimit), 1443 be64_to_cpu(dqp->q_core.d_blk_softlimit),
1440 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit)); 1444 (int)be64_to_cpu(dqp->q_core.d_blk_softlimit));
1441 cmn_err(CE_DEBUG, "---- inohlimit = %Lu (0x%x)", 1445 xfs_debug(mp, "---- inohlimit = %Lu (0x%x)",
1442 be64_to_cpu(dqp->q_core.d_ino_hardlimit), 1446 be64_to_cpu(dqp->q_core.d_ino_hardlimit),
1443 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit)); 1447 (int)be64_to_cpu(dqp->q_core.d_ino_hardlimit));
1444 cmn_err(CE_DEBUG, "---- inoslimit = %Lu (0x%x)", 1448 xfs_debug(mp, "---- inoslimit = %Lu (0x%x)",
1445 be64_to_cpu(dqp->q_core.d_ino_softlimit), 1449 be64_to_cpu(dqp->q_core.d_ino_softlimit),
1446 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit)); 1450 (int)be64_to_cpu(dqp->q_core.d_ino_softlimit));
1447 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 1451 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
1448 be64_to_cpu(dqp->q_core.d_bcount), 1452 be64_to_cpu(dqp->q_core.d_bcount),
1449 (int)be64_to_cpu(dqp->q_core.d_bcount)); 1453 (int)be64_to_cpu(dqp->q_core.d_bcount));
1450 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 1454 xfs_debug(mp, "---- icount = %Lu (0x%x)",
1451 be64_to_cpu(dqp->q_core.d_icount), 1455 be64_to_cpu(dqp->q_core.d_icount),
1452 (int)be64_to_cpu(dqp->q_core.d_icount)); 1456 (int)be64_to_cpu(dqp->q_core.d_icount));
1453 cmn_err(CE_DEBUG, "---- btimer = %d", 1457 xfs_debug(mp, "---- btimer = %d",
1454 (int)be32_to_cpu(dqp->q_core.d_btimer)); 1458 (int)be32_to_cpu(dqp->q_core.d_btimer));
1455 cmn_err(CE_DEBUG, "---- itimer = %d", 1459 xfs_debug(mp, "---- itimer = %d",
1456 (int)be32_to_cpu(dqp->q_core.d_itimer)); 1460 (int)be32_to_cpu(dqp->q_core.d_itimer));
1457 cmn_err(CE_DEBUG, "---------------------------"); 1461 xfs_debug(mp, "---------------------------");
1458} 1462}
1459#endif 1463#endif
1460 1464
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 2a1f3dc10a02..9e0e2fa3f2c8 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -136,9 +136,8 @@ xfs_qm_dquot_logitem_push(
136 */ 136 */
137 error = xfs_qm_dqflush(dqp, 0); 137 error = xfs_qm_dqflush(dqp, 0);
138 if (error) 138 if (error)
139 xfs_fs_cmn_err(CE_WARN, dqp->q_mount, 139 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
140 "xfs_qm_dquot_logitem_push: push error %d on dqp %p", 140 __func__, error, dqp);
141 error, dqp);
142 xfs_dqunlock(dqp); 141 xfs_dqunlock(dqp);
143} 142}
144 143
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 206a2815ced6..69228aa8605a 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -80,7 +80,7 @@ xfs_qm_dquot_list_print(
80 int i = 0; 80 int i = 0;
81 81
82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) { 82 list_for_each_entry(dqp, &mp->m_quotainfo->qi_dqlist_lock, qi_mplist) {
83 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" " 83 xfs_debug(mp, " %d. \"%d (%s)\" "
84 "bcnt = %lld, icnt = %lld, refs = %d", 84 "bcnt = %lld, icnt = %lld, refs = %d",
85 i++, be32_to_cpu(dqp->q_core.d_id), 85 i++, be32_to_cpu(dqp->q_core.d_id),
86 DQFLAGTO_TYPESTR(dqp), 86 DQFLAGTO_TYPESTR(dqp),
@@ -205,7 +205,7 @@ xfs_qm_destroy(
205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) { 205 list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
206 xfs_dqlock(dqp); 206 xfs_dqlock(dqp);
207#ifdef QUOTADEBUG 207#ifdef QUOTADEBUG
208 cmn_err(CE_DEBUG, "FREELIST destroy 0x%p", dqp); 208 xfs_debug(dqp->q_mount, "FREELIST destroy 0x%p", dqp);
209#endif 209#endif
210 list_del_init(&dqp->q_freelist); 210 list_del_init(&dqp->q_freelist);
211 xfs_Gqm->qm_dqfrlist_cnt--; 211 xfs_Gqm->qm_dqfrlist_cnt--;
@@ -341,9 +341,7 @@ xfs_qm_mount_quotas(
341 * quotas immediately. 341 * quotas immediately.
342 */ 342 */
343 if (mp->m_sb.sb_rextents) { 343 if (mp->m_sb.sb_rextents) {
344 cmn_err(CE_NOTE, 344 xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
345 "Cannot turn on quotas for realtime filesystem %s",
346 mp->m_fsname);
347 mp->m_qflags = 0; 345 mp->m_qflags = 0;
348 goto write_changes; 346 goto write_changes;
349 } 347 }
@@ -402,14 +400,13 @@ xfs_qm_mount_quotas(
402 * off, but the on disk superblock doesn't know that ! 400 * off, but the on disk superblock doesn't know that !
403 */ 401 */
404 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); 402 ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
405 xfs_fs_cmn_err(CE_ALERT, mp, 403 xfs_alert(mp, "%s: Superblock update failed!",
406 "XFS mount_quotas: Superblock update failed!"); 404 __func__);
407 } 405 }
408 } 406 }
409 407
410 if (error) { 408 if (error) {
411 xfs_fs_cmn_err(CE_WARN, mp, 409 xfs_warn(mp, "Failed to initialize disk quotas.");
412 "Failed to initialize disk quotas.");
413 return; 410 return;
414 } 411 }
415 412
@@ -464,12 +461,10 @@ xfs_qm_dqflush_all(
464 struct xfs_quotainfo *q = mp->m_quotainfo; 461 struct xfs_quotainfo *q = mp->m_quotainfo;
465 int recl; 462 int recl;
466 struct xfs_dquot *dqp; 463 struct xfs_dquot *dqp;
467 int niters;
468 int error; 464 int error;
469 465
470 if (!q) 466 if (!q)
471 return 0; 467 return 0;
472 niters = 0;
473again: 468again:
474 mutex_lock(&q->qi_dqlist_lock); 469 mutex_lock(&q->qi_dqlist_lock);
475 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) { 470 list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
@@ -1230,13 +1225,6 @@ xfs_qm_qino_alloc(
1230 } 1225 }
1231 1226
1232 /* 1227 /*
1233 * Keep an extra reference to this quota inode. This inode is
1234 * locked exclusively and joined to the transaction already.
1235 */
1236 ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL));
1237 IHOLD(*ip);
1238
1239 /*
1240 * Make the changes in the superblock, and log those too. 1228 * Make the changes in the superblock, and log those too.
1241 * sbfields arg may contain fields other than *QUOTINO; 1229 * sbfields arg may contain fields other than *QUOTINO;
1242 * VERSIONNUM for example. 1230 * VERSIONNUM for example.
@@ -1264,7 +1252,7 @@ xfs_qm_qino_alloc(
1264 xfs_mod_sb(tp, sbfields); 1252 xfs_mod_sb(tp, sbfields);
1265 1253
1266 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 1254 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
1267 xfs_fs_cmn_err(CE_ALERT, mp, "XFS qino_alloc failed!"); 1255 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
1268 return error; 1256 return error;
1269 } 1257 }
1270 return 0; 1258 return 0;
@@ -1299,7 +1287,7 @@ xfs_qm_reset_dqcounts(
1299 * output any warnings because it's perfectly possible to 1287 * output any warnings because it's perfectly possible to
1300 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. 1288 * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
1301 */ 1289 */
1302 (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, 1290 (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
1303 "xfs_quotacheck"); 1291 "xfs_quotacheck");
1304 ddq->d_bcount = 0; 1292 ddq->d_bcount = 0;
1305 ddq->d_icount = 0; 1293 ddq->d_icount = 0;
@@ -1324,14 +1312,9 @@ xfs_qm_dqiter_bufs(
1324{ 1312{
1325 xfs_buf_t *bp; 1313 xfs_buf_t *bp;
1326 int error; 1314 int error;
1327 int notcommitted;
1328 int incr;
1329 int type; 1315 int type;
1330 1316
1331 ASSERT(blkcnt > 0); 1317 ASSERT(blkcnt > 0);
1332 notcommitted = 0;
1333 incr = (blkcnt > XFS_QM_MAX_DQCLUSTER_LOGSZ) ?
1334 XFS_QM_MAX_DQCLUSTER_LOGSZ : blkcnt;
1335 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 1318 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
1336 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP); 1319 (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
1337 error = 0; 1320 error = 0;
@@ -1676,7 +1659,7 @@ xfs_qm_quotacheck(
1676 */ 1659 */
1677 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist)); 1660 ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
1678 1661
1679 cmn_err(CE_NOTE, "XFS quotacheck %s: Please wait.", mp->m_fsname); 1662 xfs_notice(mp, "Quotacheck needed: Please wait.");
1680 1663
1681 /* 1664 /*
1682 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset 1665 * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
@@ -1754,9 +1737,9 @@ xfs_qm_quotacheck(
1754 1737
1755 error_return: 1738 error_return:
1756 if (error) { 1739 if (error) {
1757 cmn_err(CE_WARN, "XFS quotacheck %s: Unsuccessful (Error %d): " 1740 xfs_warn(mp,
1758 "Disabling quotas.", 1741 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
1759 mp->m_fsname, error); 1742 error);
1760 /* 1743 /*
1761 * We must turn off quotas. 1744 * We must turn off quotas.
1762 */ 1745 */
@@ -1764,12 +1747,11 @@ xfs_qm_quotacheck(
1764 ASSERT(xfs_Gqm != NULL); 1747 ASSERT(xfs_Gqm != NULL);
1765 xfs_qm_destroy_quotainfo(mp); 1748 xfs_qm_destroy_quotainfo(mp);
1766 if (xfs_mount_reset_sbqflags(mp)) { 1749 if (xfs_mount_reset_sbqflags(mp)) {
1767 cmn_err(CE_WARN, "XFS quotacheck %s: " 1750 xfs_warn(mp,
1768 "Failed to reset quota flags.", mp->m_fsname); 1751 "Quotacheck: Failed to reset quota flags.");
1769 } 1752 }
1770 } else { 1753 } else
1771 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1754 xfs_notice(mp, "Quotacheck: Done.");
1772 }
1773 return (error); 1755 return (error);
1774} 1756}
1775 1757
@@ -1937,8 +1919,8 @@ again:
1937 */ 1919 */
1938 error = xfs_qm_dqflush(dqp, 0); 1920 error = xfs_qm_dqflush(dqp, 0);
1939 if (error) { 1921 if (error) {
1940 xfs_fs_cmn_err(CE_WARN, mp, 1922 xfs_warn(mp, "%s: dquot %p flush failed",
1941 "xfs_qm_dqreclaim: dquot %p flush failed", dqp); 1923 __func__, dqp);
1942 } 1924 }
1943 goto dqunlock; 1925 goto dqunlock;
1944 } 1926 }
@@ -2115,7 +2097,7 @@ xfs_qm_write_sb_changes(
2115 int error; 2097 int error;
2116 2098
2117#ifdef QUOTADEBUG 2099#ifdef QUOTADEBUG
2118 cmn_err(CE_NOTE, "Writing superblock quota changes :%s", mp->m_fsname); 2100 xfs_notice(mp, "Writing superblock quota changes");
2119#endif 2101#endif
2120 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 2102 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
2121 if ((error = xfs_trans_reserve(tp, 0, 2103 if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index c9446f1c726d..567b29b9f1b3 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -65,11 +65,6 @@ extern kmem_zone_t *qm_dqtrxzone;
65 * block in the dquot/xqm code. 65 * block in the dquot/xqm code.
66 */ 66 */
67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1 67#define XFS_DQUOT_CLUSTER_SIZE_FSB (xfs_filblks_t)1
68/*
69 * When doing a quotacheck, we log dquot clusters of this many FSBs at most
70 * in a single transaction. We don't want to ask for too huge a log reservation.
71 */
72#define XFS_QM_MAX_DQCLUSTER_LOGSZ 3
73 68
74typedef xfs_dqhash_t xfs_dqlist_t; 69typedef xfs_dqhash_t xfs_dqlist_t;
75 70
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index 45b5cb1788ab..a0a829addca9 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -119,8 +119,7 @@ xfs_qm_newmount(
119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) || 119 (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) && 120 (!gquotaondisk && XFS_IS_OQUOTA_ON(mp))) &&
121 xfs_dev_is_read_only(mp, "changing quota state")) { 121 xfs_dev_is_read_only(mp, "changing quota state")) {
122 cmn_err(CE_WARN, 122 xfs_warn(mp, "please mount with%s%s%s%s.",
123 "XFS: please mount with%s%s%s%s.",
124 (!quotaondisk ? "out quota" : ""), 123 (!quotaondisk ? "out quota" : ""),
125 (uquotaondisk ? " usrquota" : ""), 124 (uquotaondisk ? " usrquota" : ""),
126 (pquotaondisk ? " prjquota" : ""), 125 (pquotaondisk ? " prjquota" : ""),
@@ -135,7 +134,7 @@ xfs_qm_newmount(
135 */ 134 */
136 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) { 135 if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
137 /* 136 /*
138 * If an error occured, qm_mount_quotas code 137 * If an error occurred, qm_mount_quotas code
139 * has already disabled quotas. So, just finish 138 * has already disabled quotas. So, just finish
140 * mounting, and get on with the boring life 139 * mounting, and get on with the boring life
141 * without disk quotas. 140 * without disk quotas.
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index bdebc183223e..2dadb15d5ca9 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -41,12 +41,6 @@
41#include "xfs_qm.h" 41#include "xfs_qm.h"
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44#ifdef DEBUG
45# define qdprintk(s, args...) cmn_err(CE_DEBUG, s, ## args)
46#else
47# define qdprintk(s, args...) do { } while (0)
48#endif
49
50STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); 44STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
51STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 45STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
52 uint); 46 uint);
@@ -178,7 +172,7 @@ xfs_qm_scall_quotaoff(
178 /* 172 /*
179 * Next we make the changes in the quota flag in the mount struct. 173 * Next we make the changes in the quota flag in the mount struct.
180 * This isn't protected by a particular lock directly, because we 174 * This isn't protected by a particular lock directly, because we
181 * don't want to take a mrlock everytime we depend on quotas being on. 175 * don't want to take a mrlock every time we depend on quotas being on.
182 */ 176 */
183 mp->m_qflags &= ~(flags); 177 mp->m_qflags &= ~(flags);
184 178
@@ -294,7 +288,8 @@ xfs_qm_scall_trunc_qfiles(
294 int error = 0, error2 = 0; 288 int error = 0, error2 = 0;
295 289
296 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 290 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
297 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 291 xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
292 __func__, flags, mp->m_qflags);
298 return XFS_ERROR(EINVAL); 293 return XFS_ERROR(EINVAL);
299 } 294 }
300 295
@@ -318,20 +313,19 @@ xfs_qm_scall_quotaon(
318{ 313{
319 int error; 314 int error;
320 uint qf; 315 uint qf;
321 uint accflags;
322 __int64_t sbflags; 316 __int64_t sbflags;
323 317
324 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 318 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
325 /* 319 /*
326 * Switching on quota accounting must be done at mount time. 320 * Switching on quota accounting must be done at mount time.
327 */ 321 */
328 accflags = flags & XFS_ALL_QUOTA_ACCT;
329 flags &= ~(XFS_ALL_QUOTA_ACCT); 322 flags &= ~(XFS_ALL_QUOTA_ACCT);
330 323
331 sbflags = 0; 324 sbflags = 0;
332 325
333 if (flags == 0) { 326 if (flags == 0) {
334 qdprintk("quotaon: zero flags, m_qflags=%x\n", mp->m_qflags); 327 xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
328 __func__, mp->m_qflags);
335 return XFS_ERROR(EINVAL); 329 return XFS_ERROR(EINVAL);
336 } 330 }
337 331
@@ -352,12 +346,13 @@ xfs_qm_scall_quotaon(
352 (flags & XFS_GQUOTA_ACCT) == 0 && 346 (flags & XFS_GQUOTA_ACCT) == 0 &&
353 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 && 347 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
354 (flags & XFS_OQUOTA_ENFD))) { 348 (flags & XFS_OQUOTA_ENFD))) {
355 qdprintk("Can't enforce without acct, flags=%x sbflags=%x\n", 349 xfs_debug(mp,
356 flags, mp->m_sb.sb_qflags); 350 "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
351 __func__, flags, mp->m_sb.sb_qflags);
357 return XFS_ERROR(EINVAL); 352 return XFS_ERROR(EINVAL);
358 } 353 }
359 /* 354 /*
360 * If everything's upto-date incore, then don't waste time. 355 * If everything's up to-date incore, then don't waste time.
361 */ 356 */
362 if ((mp->m_qflags & flags) == flags) 357 if ((mp->m_qflags & flags) == flags)
363 return XFS_ERROR(EEXIST); 358 return XFS_ERROR(EEXIST);
@@ -541,7 +536,7 @@ xfs_qm_scall_setqlim(
541 q->qi_bsoftlimit = soft; 536 q->qi_bsoftlimit = soft;
542 } 537 }
543 } else { 538 } else {
544 qdprintk("blkhard %Ld < blksoft %Ld\n", hard, soft); 539 xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
545 } 540 }
546 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 541 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
547 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 542 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
@@ -557,7 +552,7 @@ xfs_qm_scall_setqlim(
557 q->qi_rtbsoftlimit = soft; 552 q->qi_rtbsoftlimit = soft;
558 } 553 }
559 } else { 554 } else {
560 qdprintk("rtbhard %Ld < rtbsoft %Ld\n", hard, soft); 555 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
561 } 556 }
562 557
563 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 558 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
@@ -574,7 +569,7 @@ xfs_qm_scall_setqlim(
574 q->qi_isoftlimit = soft; 569 q->qi_isoftlimit = soft;
575 } 570 }
576 } else { 571 } else {
577 qdprintk("ihard %Ld < isoft %Ld\n", hard, soft); 572 xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
578 } 573 }
579 574
580 /* 575 /*
@@ -939,10 +934,11 @@ struct mutex qcheck_lock;
939#define DQTEST_LIST_PRINT(l, NXT, title) \ 934#define DQTEST_LIST_PRINT(l, NXT, title) \
940{ \ 935{ \
941 xfs_dqtest_t *dqp; int i = 0;\ 936 xfs_dqtest_t *dqp; int i = 0;\
942 cmn_err(CE_DEBUG, "%s (#%d)", title, (int) (l)->qh_nelems); \ 937 xfs_debug(NULL, "%s (#%d)", title, (int) (l)->qh_nelems); \
943 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \ 938 for (dqp = (xfs_dqtest_t *)(l)->qh_next; dqp != NULL; \
944 dqp = (xfs_dqtest_t *)dqp->NXT) { \ 939 dqp = (xfs_dqtest_t *)dqp->NXT) { \
945 cmn_err(CE_DEBUG, " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \ 940 xfs_debug(dqp->q_mount, \
941 " %d. \"%d (%s)\" bcnt = %d, icnt = %d", \
946 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \ 942 ++i, dqp->d_id, DQFLAGTO_TYPESTR(dqp), \
947 dqp->d_bcount, dqp->d_icount); } \ 943 dqp->d_bcount, dqp->d_icount); } \
948} 944}
@@ -966,16 +962,17 @@ xfs_qm_hashinsert(xfs_dqhash_t *h, xfs_dqtest_t *dqp)
966} 962}
967STATIC void 963STATIC void
968xfs_qm_dqtest_print( 964xfs_qm_dqtest_print(
969 xfs_dqtest_t *d) 965 struct xfs_mount *mp,
966 struct dqtest *d)
970{ 967{
971 cmn_err(CE_DEBUG, "-----------DQTEST DQUOT----------------"); 968 xfs_debug(mp, "-----------DQTEST DQUOT----------------");
972 cmn_err(CE_DEBUG, "---- dquot ID = %d", d->d_id); 969 xfs_debug(mp, "---- dquot ID = %d", d->d_id);
973 cmn_err(CE_DEBUG, "---- fs = 0x%p", d->q_mount); 970 xfs_debug(mp, "---- fs = 0x%p", d->q_mount);
974 cmn_err(CE_DEBUG, "---- bcount = %Lu (0x%x)", 971 xfs_debug(mp, "---- bcount = %Lu (0x%x)",
975 d->d_bcount, (int)d->d_bcount); 972 d->d_bcount, (int)d->d_bcount);
976 cmn_err(CE_DEBUG, "---- icount = %Lu (0x%x)", 973 xfs_debug(mp, "---- icount = %Lu (0x%x)",
977 d->d_icount, (int)d->d_icount); 974 d->d_icount, (int)d->d_icount);
978 cmn_err(CE_DEBUG, "---------------------------"); 975 xfs_debug(mp, "---------------------------");
979} 976}
980 977
981STATIC void 978STATIC void
@@ -989,12 +986,14 @@ xfs_qm_dqtest_failed(
989{ 986{
990 qmtest_nfails++; 987 qmtest_nfails++;
991 if (error) 988 if (error)
992 cmn_err(CE_DEBUG, "quotacheck failed id=%d, err=%d\nreason: %s", 989 xfs_debug(dqp->q_mount,
993 d->d_id, error, reason); 990 "quotacheck failed id=%d, err=%d\nreason: %s",
991 d->d_id, error, reason);
994 else 992 else
995 cmn_err(CE_DEBUG, "quotacheck failed id=%d (%s) [%d != %d]", 993 xfs_debug(dqp->q_mount,
996 d->d_id, reason, (int)a, (int)b); 994 "quotacheck failed id=%d (%s) [%d != %d]",
997 xfs_qm_dqtest_print(d); 995 d->d_id, reason, (int)a, (int)b);
996 xfs_qm_dqtest_print(dqp->q_mount, d);
998 if (dqp) 997 if (dqp)
999 xfs_qm_dqprint(dqp); 998 xfs_qm_dqprint(dqp);
1000} 999}
@@ -1021,9 +1020,9 @@ xfs_dqtest_cmp2(
1021 be64_to_cpu(dqp->q_core.d_bcount) >= 1020 be64_to_cpu(dqp->q_core.d_bcount) >=
1022 be64_to_cpu(dqp->q_core.d_blk_softlimit)) { 1021 be64_to_cpu(dqp->q_core.d_blk_softlimit)) {
1023 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) { 1022 if (!dqp->q_core.d_btimer && dqp->q_core.d_id) {
1024 cmn_err(CE_DEBUG, 1023 xfs_debug(dqp->q_mount,
1025 "%d [%s] [0x%p] BLK TIMER NOT STARTED", 1024 "%d [%s] BLK TIMER NOT STARTED",
1026 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1025 d->d_id, DQFLAGTO_TYPESTR(d));
1027 err++; 1026 err++;
1028 } 1027 }
1029 } 1028 }
@@ -1031,16 +1030,16 @@ xfs_dqtest_cmp2(
1031 be64_to_cpu(dqp->q_core.d_icount) >= 1030 be64_to_cpu(dqp->q_core.d_icount) >=
1032 be64_to_cpu(dqp->q_core.d_ino_softlimit)) { 1031 be64_to_cpu(dqp->q_core.d_ino_softlimit)) {
1033 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) { 1032 if (!dqp->q_core.d_itimer && dqp->q_core.d_id) {
1034 cmn_err(CE_DEBUG, 1033 xfs_debug(dqp->q_mount,
1035 "%d [%s] [0x%p] INO TIMER NOT STARTED", 1034 "%d [%s] INO TIMER NOT STARTED",
1036 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1035 d->d_id, DQFLAGTO_TYPESTR(d));
1037 err++; 1036 err++;
1038 } 1037 }
1039 } 1038 }
1040#ifdef QUOTADEBUG 1039#ifdef QUOTADEBUG
1041 if (!err) { 1040 if (!err) {
1042 cmn_err(CE_DEBUG, "%d [%s] [0x%p] qchecked", 1041 xfs_debug(dqp->q_mount, "%d [%s] qchecked",
1043 d->d_id, DQFLAGTO_TYPESTR(d), d->q_mount); 1042 d->d_id, DQFLAGTO_TYPESTR(d));
1044 } 1043 }
1045#endif 1044#endif
1046 return (err); 1045 return (err);
@@ -1137,8 +1136,8 @@ xfs_qm_internalqcheck_adjust(
1137 1136
1138 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) { 1137 if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
1139 *res = BULKSTAT_RV_NOTHING; 1138 *res = BULKSTAT_RV_NOTHING;
1140 qdprintk("internalqcheck: ino=%llu, uqino=%llu, gqino=%llu\n", 1139 xfs_debug(mp, "%s: ino=%llu, uqino=%llu, gqino=%llu\n",
1141 (unsigned long long) ino, 1140 __func__, (unsigned long long) ino,
1142 (unsigned long long) mp->m_sb.sb_uquotino, 1141 (unsigned long long) mp->m_sb.sb_uquotino,
1143 (unsigned long long) mp->m_sb.sb_gquotino); 1142 (unsigned long long) mp->m_sb.sb_gquotino);
1144 return XFS_ERROR(EINVAL); 1143 return XFS_ERROR(EINVAL);
@@ -1223,12 +1222,12 @@ xfs_qm_internalqcheck(
1223 xfs_qm_internalqcheck_adjust, 1222 xfs_qm_internalqcheck_adjust,
1224 0, NULL, &done); 1223 0, NULL, &done);
1225 if (error) { 1224 if (error) {
1226 cmn_err(CE_DEBUG, "Bulkstat returned error 0x%x", error); 1225 xfs_debug(mp, "Bulkstat returned error 0x%x", error);
1227 break; 1226 break;
1228 } 1227 }
1229 } while (!done); 1228 } while (!done);
1230 1229
1231 cmn_err(CE_DEBUG, "Checking results against system dquots"); 1230 xfs_debug(mp, "Checking results against system dquots");
1232 for (i = 0; i < qmtest_hashmask; i++) { 1231 for (i = 0; i < qmtest_hashmask; i++) {
1233 xfs_dqtest_t *d, *n; 1232 xfs_dqtest_t *d, *n;
1234 xfs_dqhash_t *h; 1233 xfs_dqhash_t *h;
@@ -1246,10 +1245,10 @@ xfs_qm_internalqcheck(
1246 } 1245 }
1247 1246
1248 if (qmtest_nfails) { 1247 if (qmtest_nfails) {
1249 cmn_err(CE_DEBUG, "******** quotacheck failed ********"); 1248 xfs_debug(mp, "******** quotacheck failed ********");
1250 cmn_err(CE_DEBUG, "failures = %d", qmtest_nfails); 1249 xfs_debug(mp, "failures = %d", qmtest_nfails);
1251 } else { 1250 } else {
1252 cmn_err(CE_DEBUG, "******** quotacheck successful! ********"); 1251 xfs_debug(mp, "******** quotacheck successful! ********");
1253 } 1252 }
1254 kmem_free(qmtest_udqtab); 1253 kmem_free(qmtest_udqtab);
1255 kmem_free(qmtest_gdqtab); 1254 kmem_free(qmtest_gdqtab);
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
index 7de91d1b75c0..2a3648731331 100644
--- a/fs/xfs/quota/xfs_trans_dquot.c
+++ b/fs/xfs/quota/xfs_trans_dquot.c
@@ -643,8 +643,9 @@ xfs_trans_dqresv(
643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) && 643 (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) { 644 (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
645#ifdef QUOTADEBUG 645#ifdef QUOTADEBUG
646 cmn_err(CE_DEBUG, "BLK Res: nblks=%ld + resbcount=%Ld" 646 xfs_debug(mp,
647 " > hardlimit=%Ld?", nblks, *resbcountp, hardlimit); 647 "BLK Res: nblks=%ld + resbcount=%Ld > hardlimit=%Ld?",
648 nblks, *resbcountp, hardlimit);
648#endif 649#endif
649 if (nblks > 0) { 650 if (nblks > 0) {
650 /* 651 /*
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
deleted file mode 100644
index 0df88897ef84..000000000000
--- a/fs/xfs/support/debug.c
+++ /dev/null
@@ -1,107 +0,0 @@
1/*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#include <xfs.h>
19#include "debug.h"
20
21/* xfs_mount.h drags a lot of crap in, sorry.. */
22#include "xfs_sb.h"
23#include "xfs_inum.h"
24#include "xfs_ag.h"
25#include "xfs_mount.h"
26#include "xfs_error.h"
27
28void
29cmn_err(
30 const char *lvl,
31 const char *fmt,
32 ...)
33{
34 struct va_format vaf;
35 va_list args;
36
37 va_start(args, fmt);
38 vaf.fmt = fmt;
39 vaf.va = &args;
40
41 printk("%s%pV", lvl, &vaf);
42 va_end(args);
43
44 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
45}
46
47void
48xfs_fs_cmn_err(
49 const char *lvl,
50 struct xfs_mount *mp,
51 const char *fmt,
52 ...)
53{
54 struct va_format vaf;
55 va_list args;
56
57 va_start(args, fmt);
58 vaf.fmt = fmt;
59 vaf.va = &args;
60
61 printk("%sFilesystem %s: %pV", lvl, mp->m_fsname, &vaf);
62 va_end(args);
63
64 BUG_ON(strncmp(lvl, KERN_EMERG, strlen(KERN_EMERG)) == 0);
65}
66
67/* All callers to xfs_cmn_err use CE_ALERT, so don't bother testing lvl */
68void
69xfs_cmn_err(
70 int panic_tag,
71 const char *lvl,
72 struct xfs_mount *mp,
73 const char *fmt,
74 ...)
75{
76 struct va_format vaf;
77 va_list args;
78 int do_panic = 0;
79
80 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
81 printk(KERN_ALERT "XFS: Transforming an alert into a BUG.");
82 do_panic = 1;
83 }
84
85 va_start(args, fmt);
86 vaf.fmt = fmt;
87 vaf.va = &args;
88
89 printk(KERN_ALERT "Filesystem %s: %pV", mp->m_fsname, &vaf);
90 va_end(args);
91
92 BUG_ON(do_panic);
93}
94
95void
96assfail(char *expr, char *file, int line)
97{
98 printk(KERN_CRIT "Assertion failed: %s, file: %s, line: %d\n", expr,
99 file, line);
100 BUG();
101}
102
103void
104xfs_hex_dump(void *p, int length)
105{
106 print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
107}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
deleted file mode 100644
index 05699f67d475..000000000000
--- a/fs/xfs/support/debug.h
+++ /dev/null
@@ -1,61 +0,0 @@
1/*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18#ifndef __XFS_SUPPORT_DEBUG_H__
19#define __XFS_SUPPORT_DEBUG_H__
20
21#include <stdarg.h>
22
23struct xfs_mount;
24
25#define CE_DEBUG KERN_DEBUG
26#define CE_CONT KERN_INFO
27#define CE_NOTE KERN_NOTICE
28#define CE_WARN KERN_WARNING
29#define CE_ALERT KERN_ALERT
30#define CE_PANIC KERN_EMERG
31
32void cmn_err(const char *lvl, const char *fmt, ...)
33 __attribute__ ((format (printf, 2, 3)));
34void xfs_fs_cmn_err( const char *lvl, struct xfs_mount *mp,
35 const char *fmt, ...) __attribute__ ((format (printf, 3, 4)));
36void xfs_cmn_err( int panic_tag, const char *lvl, struct xfs_mount *mp,
37 const char *fmt, ...) __attribute__ ((format (printf, 4, 5)));
38
39extern void assfail(char *expr, char *f, int l);
40
41#define ASSERT_ALWAYS(expr) \
42 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
43
44#ifndef DEBUG
45#define ASSERT(expr) ((void)0)
46
47#ifndef STATIC
48# define STATIC static noinline
49#endif
50
51#else /* DEBUG */
52
53#define ASSERT(expr) \
54 (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
55
56#ifndef STATIC
57# define STATIC noinline
58#endif
59
60#endif /* DEBUG */
61#endif /* __XFS_SUPPORT_DEBUG_H__ */
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index f3227984a9bf..27d64d752eab 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -147,10 +147,9 @@ xfs_alloc_get_rec(
147 */ 147 */
148STATIC void 148STATIC void
149xfs_alloc_compute_aligned( 149xfs_alloc_compute_aligned(
150 xfs_alloc_arg_t *args, /* allocation argument structure */
150 xfs_agblock_t foundbno, /* starting block in found extent */ 151 xfs_agblock_t foundbno, /* starting block in found extent */
151 xfs_extlen_t foundlen, /* length in found extent */ 152 xfs_extlen_t foundlen, /* length in found extent */
152 xfs_extlen_t alignment, /* alignment for allocation */
153 xfs_extlen_t minlen, /* minimum length for allocation */
154 xfs_agblock_t *resbno, /* result block number */ 153 xfs_agblock_t *resbno, /* result block number */
155 xfs_extlen_t *reslen) /* result length */ 154 xfs_extlen_t *reslen) /* result length */
156{ 155{
@@ -158,8 +157,8 @@ xfs_alloc_compute_aligned(
158 xfs_extlen_t diff; 157 xfs_extlen_t diff;
159 xfs_extlen_t len; 158 xfs_extlen_t len;
160 159
161 if (alignment > 1 && foundlen >= minlen) { 160 if (args->alignment > 1 && foundlen >= args->minlen) {
162 bno = roundup(foundbno, alignment); 161 bno = roundup(foundbno, args->alignment);
163 diff = bno - foundbno; 162 diff = bno - foundbno;
164 len = diff >= foundlen ? 0 : foundlen - diff; 163 len = diff >= foundlen ? 0 : foundlen - diff;
165 } else { 164 } else {
@@ -464,6 +463,27 @@ xfs_alloc_read_agfl(
464 return 0; 463 return 0;
465} 464}
466 465
466STATIC int
467xfs_alloc_update_counters(
468 struct xfs_trans *tp,
469 struct xfs_perag *pag,
470 struct xfs_buf *agbp,
471 long len)
472{
473 struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
474
475 pag->pagf_freeblks += len;
476 be32_add_cpu(&agf->agf_freeblks, len);
477
478 xfs_trans_agblocks_delta(tp, len);
479 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
480 be32_to_cpu(agf->agf_length)))
481 return EFSCORRUPTED;
482
483 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
484 return 0;
485}
486
467/* 487/*
468 * Allocation group level functions. 488 * Allocation group level functions.
469 */ 489 */
@@ -505,49 +525,44 @@ xfs_alloc_ag_vextent(
505 ASSERT(0); 525 ASSERT(0);
506 /* NOTREACHED */ 526 /* NOTREACHED */
507 } 527 }
508 if (error) 528
529 if (error || args->agbno == NULLAGBLOCK)
509 return error; 530 return error;
510 /*
511 * If the allocation worked, need to change the agf structure
512 * (and log it), and the superblock.
513 */
514 if (args->agbno != NULLAGBLOCK) {
515 xfs_agf_t *agf; /* allocation group freelist header */
516 long slen = (long)args->len;
517 531
518 ASSERT(args->len >= args->minlen && args->len <= args->maxlen); 532 ASSERT(args->len >= args->minlen);
519 ASSERT(!(args->wasfromfl) || !args->isfl); 533 ASSERT(args->len <= args->maxlen);
520 ASSERT(args->agbno % args->alignment == 0); 534 ASSERT(!args->wasfromfl || !args->isfl);
521 if (!(args->wasfromfl)) { 535 ASSERT(args->agbno % args->alignment == 0);
522 536
523 agf = XFS_BUF_TO_AGF(args->agbp); 537 if (!args->wasfromfl) {
524 be32_add_cpu(&agf->agf_freeblks, -(args->len)); 538 error = xfs_alloc_update_counters(args->tp, args->pag,
525 xfs_trans_agblocks_delta(args->tp, 539 args->agbp,
526 -((long)(args->len))); 540 -((long)(args->len)));
527 args->pag->pagf_freeblks -= args->len; 541 if (error)
528 ASSERT(be32_to_cpu(agf->agf_freeblks) <= 542 return error;
529 be32_to_cpu(agf->agf_length)); 543
530 xfs_alloc_log_agf(args->tp, args->agbp, 544 /*
531 XFS_AGF_FREEBLKS); 545 * Search the busylist for these blocks and mark the
532 /* 546 * transaction as synchronous if blocks are found. This
533 * Search the busylist for these blocks and mark the 547 * avoids the need to block due to a synchronous log
534 * transaction as synchronous if blocks are found. This 548 * force to ensure correct ordering as the synchronous
535 * avoids the need to block due to a synchronous log 549 * transaction will guarantee that for us.
536 * force to ensure correct ordering as the synchronous 550 */
537 * transaction will guarantee that for us. 551 if (xfs_alloc_busy_search(args->mp, args->agno,
538 */ 552 args->agbno, args->len))
539 if (xfs_alloc_busy_search(args->mp, args->agno, 553 xfs_trans_set_sync(args->tp);
540 args->agbno, args->len))
541 xfs_trans_set_sync(args->tp);
542 }
543 if (!args->isfl)
544 xfs_trans_mod_sb(args->tp,
545 args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
546 XFS_TRANS_SB_FDBLOCKS, -slen);
547 XFS_STATS_INC(xs_allocx);
548 XFS_STATS_ADD(xs_allocb, args->len);
549 } 554 }
550 return 0; 555
556 if (!args->isfl) {
557 xfs_trans_mod_sb(args->tp, args->wasdel ?
558 XFS_TRANS_SB_RES_FDBLOCKS :
559 XFS_TRANS_SB_FDBLOCKS,
560 -((long)(args->len)));
561 }
562
563 XFS_STATS_INC(xs_allocx);
564 XFS_STATS_ADD(xs_allocb, args->len);
565 return error;
551} 566}
552 567
553/* 568/*
@@ -693,8 +708,7 @@ xfs_alloc_find_best_extent(
693 if (error) 708 if (error)
694 goto error0; 709 goto error0;
695 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 710 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
696 xfs_alloc_compute_aligned(*sbno, *slen, args->alignment, 711 xfs_alloc_compute_aligned(args, *sbno, *slen, &bno, slena);
697 args->minlen, &bno, slena);
698 712
699 /* 713 /*
700 * The good extent is closer than this one. 714 * The good extent is closer than this one.
@@ -866,8 +880,8 @@ xfs_alloc_ag_vextent_near(
866 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 880 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
867 goto error0; 881 goto error0;
868 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 882 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
869 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 883 xfs_alloc_compute_aligned(args, ltbno, ltlen,
870 args->minlen, &ltbnoa, &ltlena); 884 &ltbnoa, &ltlena);
871 if (ltlena < args->minlen) 885 if (ltlena < args->minlen)
872 continue; 886 continue;
873 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 887 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
@@ -987,8 +1001,8 @@ xfs_alloc_ag_vextent_near(
987 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 1001 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
988 goto error0; 1002 goto error0;
989 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1003 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
990 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment, 1004 xfs_alloc_compute_aligned(args, ltbno, ltlen,
991 args->minlen, &ltbnoa, &ltlena); 1005 &ltbnoa, &ltlena);
992 if (ltlena >= args->minlen) 1006 if (ltlena >= args->minlen)
993 break; 1007 break;
994 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) 1008 if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
@@ -1003,8 +1017,8 @@ xfs_alloc_ag_vextent_near(
1003 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 1017 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
1004 goto error0; 1018 goto error0;
1005 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1019 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1006 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment, 1020 xfs_alloc_compute_aligned(args, gtbno, gtlen,
1007 args->minlen, &gtbnoa, &gtlena); 1021 &gtbnoa, &gtlena);
1008 if (gtlena >= args->minlen) 1022 if (gtlena >= args->minlen)
1009 break; 1023 break;
1010 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) 1024 if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
@@ -1183,8 +1197,7 @@ xfs_alloc_ag_vextent_size(
1183 * once aligned; if not, we search left for something better. 1197 * once aligned; if not, we search left for something better.
1184 * This can't happen in the second case above. 1198 * This can't happen in the second case above.
1185 */ 1199 */
1186 xfs_alloc_compute_aligned(fbno, flen, args->alignment, args->minlen, 1200 xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen);
1187 &rbno, &rlen);
1188 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1201 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1189 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1202 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1190 (rlen <= flen && rbno + rlen <= fbno + flen), error0); 1203 (rlen <= flen && rbno + rlen <= fbno + flen), error0);
@@ -1209,8 +1222,8 @@ xfs_alloc_ag_vextent_size(
1209 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1222 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1210 if (flen < bestrlen) 1223 if (flen < bestrlen)
1211 break; 1224 break;
1212 xfs_alloc_compute_aligned(fbno, flen, args->alignment, 1225 xfs_alloc_compute_aligned(args, fbno, flen,
1213 args->minlen, &rbno, &rlen); 1226 &rbno, &rlen);
1214 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1227 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1215 XFS_WANT_CORRUPTED_GOTO(rlen == 0 || 1228 XFS_WANT_CORRUPTED_GOTO(rlen == 0 ||
1216 (rlen <= flen && rbno + rlen <= fbno + flen), 1229 (rlen <= flen && rbno + rlen <= fbno + flen),
@@ -1388,6 +1401,7 @@ xfs_free_ag_extent(
1388 xfs_mount_t *mp; /* mount point struct for filesystem */ 1401 xfs_mount_t *mp; /* mount point struct for filesystem */
1389 xfs_agblock_t nbno; /* new starting block of freespace */ 1402 xfs_agblock_t nbno; /* new starting block of freespace */
1390 xfs_extlen_t nlen; /* new length of freespace */ 1403 xfs_extlen_t nlen; /* new length of freespace */
1404 xfs_perag_t *pag; /* per allocation group data */
1391 1405
1392 mp = tp->t_mountp; 1406 mp = tp->t_mountp;
1393 /* 1407 /*
@@ -1586,30 +1600,20 @@ xfs_free_ag_extent(
1586 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1600 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1587 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1601 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1588 cnt_cur = NULL; 1602 cnt_cur = NULL;
1603
1589 /* 1604 /*
1590 * Update the freespace totals in the ag and superblock. 1605 * Update the freespace totals in the ag and superblock.
1591 */ 1606 */
1592 { 1607 pag = xfs_perag_get(mp, agno);
1593 xfs_agf_t *agf; 1608 error = xfs_alloc_update_counters(tp, pag, agbp, len);
1594 xfs_perag_t *pag; /* per allocation group data */ 1609 xfs_perag_put(pag);
1595 1610 if (error)
1596 pag = xfs_perag_get(mp, agno); 1611 goto error0;
1597 pag->pagf_freeblks += len; 1612
1598 xfs_perag_put(pag); 1613 if (!isfl)
1599 1614 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1600 agf = XFS_BUF_TO_AGF(agbp); 1615 XFS_STATS_INC(xs_freex);
1601 be32_add_cpu(&agf->agf_freeblks, len); 1616 XFS_STATS_ADD(xs_freeb, len);
1602 xfs_trans_agblocks_delta(tp, len);
1603 XFS_WANT_CORRUPTED_GOTO(
1604 be32_to_cpu(agf->agf_freeblks) <=
1605 be32_to_cpu(agf->agf_length),
1606 error0);
1607 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
1608 if (!isfl)
1609 xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
1610 XFS_STATS_INC(xs_freex);
1611 XFS_STATS_ADD(xs_freeb, len);
1612 }
1613 1617
1614 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); 1618 trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
1615 1619
@@ -2391,17 +2395,33 @@ xfs_free_extent(
2391 memset(&args, 0, sizeof(xfs_alloc_arg_t)); 2395 memset(&args, 0, sizeof(xfs_alloc_arg_t));
2392 args.tp = tp; 2396 args.tp = tp;
2393 args.mp = tp->t_mountp; 2397 args.mp = tp->t_mountp;
2398
2399 /*
2400 * validate that the block number is legal - the enables us to detect
2401 * and handle a silent filesystem corruption rather than crashing.
2402 */
2394 args.agno = XFS_FSB_TO_AGNO(args.mp, bno); 2403 args.agno = XFS_FSB_TO_AGNO(args.mp, bno);
2395 ASSERT(args.agno < args.mp->m_sb.sb_agcount); 2404 if (args.agno >= args.mp->m_sb.sb_agcount)
2405 return EFSCORRUPTED;
2406
2396 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno); 2407 args.agbno = XFS_FSB_TO_AGBNO(args.mp, bno);
2408 if (args.agbno >= args.mp->m_sb.sb_agblocks)
2409 return EFSCORRUPTED;
2410
2397 args.pag = xfs_perag_get(args.mp, args.agno); 2411 args.pag = xfs_perag_get(args.mp, args.agno);
2398 if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING))) 2412 ASSERT(args.pag);
2413
2414 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
2415 if (error)
2399 goto error0; 2416 goto error0;
2400#ifdef DEBUG 2417
2401 ASSERT(args.agbp != NULL); 2418 /* validate the extent size is legal now we have the agf locked */
2402 ASSERT((args.agbno + len) <= 2419 if (args.agbno + len >
2403 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)); 2420 be32_to_cpu(XFS_BUF_TO_AGF(args.agbp)->agf_length)) {
2404#endif 2421 error = EFSCORRUPTED;
2422 goto error0;
2423 }
2424
2405 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0); 2425 error = xfs_free_ag_extent(tp, args.agbp, args.agno, args.agbno, len, 0);
2406error0: 2426error0:
2407 xfs_perag_put(args.pag); 2427 xfs_perag_put(args.pag);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index dc3afd7739ff..fa00788de2f5 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2365,6 +2365,13 @@ xfs_bmap_rtalloc(
2365 */ 2365 */
2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN) 2366 if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize; 2367 ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
2368
2369 /*
2370 * Lock out other modifications to the RT bitmap inode.
2371 */
2372 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2373 xfs_trans_ijoin_ref(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2374
2368 /* 2375 /*
2369 * If it's an allocation to an empty file at offset 0, 2376 * If it's an allocation to an empty file at offset 0,
2370 * pick an extent that will space things out in the rt area. 2377 * pick an extent that will space things out in the rt area.
@@ -3519,7 +3526,7 @@ xfs_bmap_search_extents(
3519 3526
3520 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && 3527 if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) &&
3521 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { 3528 !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) {
3522 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 3529 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
3523 "Access to block zero in inode %llu " 3530 "Access to block zero in inode %llu "
3524 "start_block: %llx start_off: %llx " 3531 "start_block: %llx start_off: %llx "
3525 "blkcnt: %llx extent-state: %x lastx: %x\n", 3532 "blkcnt: %llx extent-state: %x lastx: %x\n",
@@ -4193,12 +4200,11 @@ xfs_bmap_read_extents(
4193 num_recs = xfs_btree_get_numrecs(block); 4200 num_recs = xfs_btree_get_numrecs(block);
4194 if (unlikely(i + num_recs > room)) { 4201 if (unlikely(i + num_recs > room)) {
4195 ASSERT(i + num_recs <= room); 4202 ASSERT(i + num_recs <= room);
4196 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 4203 xfs_warn(ip->i_mount,
4197 "corrupt dinode %Lu, (btree extents).", 4204 "corrupt dinode %Lu, (btree extents).",
4198 (unsigned long long) ip->i_ino); 4205 (unsigned long long) ip->i_ino);
4199 XFS_ERROR_REPORT("xfs_bmap_read_extents(1)", 4206 XFS_CORRUPTION_ERROR("xfs_bmap_read_extents(1)",
4200 XFS_ERRLEVEL_LOW, 4207 XFS_ERRLEVEL_LOW, ip->i_mount, block);
4201 ip->i_mount);
4202 goto error0; 4208 goto error0;
4203 } 4209 }
4204 XFS_WANT_CORRUPTED_GOTO( 4210 XFS_WANT_CORRUPTED_GOTO(
@@ -5772,7 +5778,7 @@ xfs_check_block(
5772 else 5778 else
5773 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr); 5779 thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
5774 if (*thispa == *pp) { 5780 if (*thispa == *pp) {
5775 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 5781 xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld",
5776 __func__, j, i, 5782 __func__, j, i,
5777 (unsigned long long)be64_to_cpu(*thispa)); 5783 (unsigned long long)be64_to_cpu(*thispa));
5778 panic("%s: ptrs are equal in node\n", 5784 panic("%s: ptrs are equal in node\n",
@@ -5937,11 +5943,11 @@ xfs_bmap_check_leaf_extents(
5937 return; 5943 return;
5938 5944
5939error0: 5945error0:
5940 cmn_err(CE_WARN, "%s: at error0", __func__); 5946 xfs_warn(mp, "%s: at error0", __func__);
5941 if (bp_release) 5947 if (bp_release)
5942 xfs_trans_brelse(NULL, bp); 5948 xfs_trans_brelse(NULL, bp);
5943error_norelse: 5949error_norelse:
5944 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 5950 xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
5945 __func__, i); 5951 __func__, i);
5946 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); 5952 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
5947 return; 5953 return;
@@ -6144,7 +6150,7 @@ xfs_bmap_punch_delalloc_range(
6144 if (error) { 6150 if (error) {
6145 /* something screwed, just bail */ 6151 /* something screwed, just bail */
6146 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 6152 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
6147 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 6153 xfs_alert(ip->i_mount,
6148 "Failed delalloc mapping lookup ino %lld fsb %lld.", 6154 "Failed delalloc mapping lookup ino %lld fsb %lld.",
6149 ip->i_ino, start_fsb); 6155 ip->i_ino, start_fsb);
6150 } 6156 }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 6f8c21ce0d6d..7b7e005e3dcc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -130,10 +130,12 @@ xfs_buf_item_log_check(
130 orig = bip->bli_orig; 130 orig = bip->bli_orig;
131 buffer = XFS_BUF_PTR(bp); 131 buffer = XFS_BUF_PTR(bp);
132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 132 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 133 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) {
134 cmn_err(CE_PANIC, 134 xfs_emerg(bp->b_mount,
135 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 135 "%s: bip %x buffer %x orig %x index %d",
136 bip, bp, orig, x); 136 __func__, bip, bp, orig, x);
137 ASSERT(0);
138 }
137 } 139 }
138} 140}
139#else 141#else
@@ -983,15 +985,14 @@ xfs_buf_iodone_callbacks(
983 if (XFS_BUF_TARGET(bp) != lasttarg || 985 if (XFS_BUF_TARGET(bp) != lasttarg ||
984 time_after(jiffies, (lasttime + 5*HZ))) { 986 time_after(jiffies, (lasttime + 5*HZ))) {
985 lasttime = jiffies; 987 lasttime = jiffies;
986 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 988 xfs_alert(mp, "Device %s: metadata write error block 0x%llx",
987 " block 0x%llx in %s",
988 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 989 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
989 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 990 (__uint64_t)XFS_BUF_ADDR(bp));
990 } 991 }
991 lasttarg = XFS_BUF_TARGET(bp); 992 lasttarg = XFS_BUF_TARGET(bp);
992 993
993 /* 994 /*
994 * If the write was asynchronous then noone will be looking for the 995 * If the write was asynchronous then no one will be looking for the
995 * error. Clear the error state and write the buffer out again. 996 * error. Clear the error state and write the buffer out again.
996 * 997 *
997 * During sync or umount we'll write all pending buffers again 998 * During sync or umount we'll write all pending buffers again
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1c00bedb3175..6102ac6d1dff 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -1995,13 +1995,12 @@ xfs_da_do_buf(
1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED); 1995 error = mappedbno == -2 ? 0 : XFS_ERROR(EFSCORRUPTED);
1996 if (unlikely(error == EFSCORRUPTED)) { 1996 if (unlikely(error == EFSCORRUPTED)) {
1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) { 1997 if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
1998 cmn_err(CE_ALERT, "xfs_da_do_buf: bno %lld\n", 1998 xfs_alert(mp, "%s: bno %lld dir: inode %lld",
1999 (long long)bno); 1999 __func__, (long long)bno,
2000 cmn_err(CE_ALERT, "dir: inode %lld\n",
2001 (long long)dp->i_ino); 2000 (long long)dp->i_ino);
2002 for (i = 0; i < nmap; i++) { 2001 for (i = 0; i < nmap; i++) {
2003 cmn_err(CE_ALERT, 2002 xfs_alert(mp,
2004 "[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d\n", 2003"[%02d] br_startoff %lld br_startblock %lld br_blockcount %lld br_state %d",
2005 i, 2004 i,
2006 (long long)mapp[i].br_startoff, 2005 (long long)mapp[i].br_startoff,
2007 (long long)mapp[i].br_startblock, 2006 (long long)mapp[i].br_startblock,
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e60490bc00a6..be628677c288 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -270,9 +270,9 @@ xfs_swap_extents(
270 /* check inode formats now that data is flushed */ 270 /* check inode formats now that data is flushed */
271 error = xfs_swap_extents_check_format(ip, tip); 271 error = xfs_swap_extents_check_format(ip, tip);
272 if (error) { 272 if (error) {
273 xfs_fs_cmn_err(CE_NOTE, mp, 273 xfs_notice(mp,
274 "%s: inode 0x%llx format is incompatible for exchanging.", 274 "%s: inode 0x%llx format is incompatible for exchanging.",
275 __FILE__, ip->i_ino); 275 __func__, ip->i_ino);
276 goto out_unlock; 276 goto out_unlock;
277 } 277 }
278 278
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index a1321bc7f192..dba7a71cedf3 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -159,7 +159,7 @@ xfs_dir_ino_validate(
159 XFS_AGINO_TO_INO(mp, agno, agino) == ino; 159 XFS_AGINO_TO_INO(mp, agno, agino) == ino;
160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE, 160 if (unlikely(XFS_TEST_ERROR(!ino_ok, mp, XFS_ERRTAG_DIR_INO_VALIDATE,
161 XFS_RANDOM_DIR_INO_VALIDATE))) { 161 XFS_RANDOM_DIR_INO_VALIDATE))) {
162 xfs_fs_cmn_err(CE_WARN, mp, "Invalid inode number 0x%Lx", 162 xfs_warn(mp, "Invalid inode number 0x%Lx",
163 (unsigned long long) ino); 163 (unsigned long long) ino);
164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp); 164 XFS_ERROR_REPORT("xfs_dir_ino_validate", XFS_ERRLEVEL_LOW, mp);
165 return XFS_ERROR(EFSCORRUPTED); 165 return XFS_ERROR(EFSCORRUPTED);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index f9a0864b696a..a0aab7d3294f 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -899,10 +899,9 @@ xfs_dir2_leafn_rebalance(
899 if(blk2->index < 0) { 899 if(blk2->index < 0) {
900 state->inleaf = 1; 900 state->inleaf = 1;
901 blk2->index = 0; 901 blk2->index = 0;
902 cmn_err(CE_ALERT, 902 xfs_alert(args->dp->i_mount,
903 "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " 903 "%s: picked the wrong leaf? reverting original leaf: blk1->index %d\n",
904 "blk1->index %d\n", 904 __func__, blk1->index);
905 blk1->index);
906 } 905 }
907} 906}
908 907
@@ -1641,26 +1640,22 @@ xfs_dir2_node_addname_int(
1641 } 1640 }
1642 1641
1643 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) { 1642 if (unlikely(xfs_dir2_db_to_fdb(mp, dbno) != fbno)) {
1644 cmn_err(CE_ALERT, 1643 xfs_alert(mp,
1645 "xfs_dir2_node_addname_int: dir ino " 1644 "%s: dir ino " "%llu needed freesp block %lld for\n"
1646 "%llu needed freesp block %lld for\n" 1645 " data block %lld, got %lld ifbno %llu lastfbno %d",
1647 " data block %lld, got %lld\n" 1646 __func__, (unsigned long long)dp->i_ino,
1648 " ifbno %llu lastfbno %d\n",
1649 (unsigned long long)dp->i_ino,
1650 (long long)xfs_dir2_db_to_fdb(mp, dbno), 1647 (long long)xfs_dir2_db_to_fdb(mp, dbno),
1651 (long long)dbno, (long long)fbno, 1648 (long long)dbno, (long long)fbno,
1652 (unsigned long long)ifbno, lastfbno); 1649 (unsigned long long)ifbno, lastfbno);
1653 if (fblk) { 1650 if (fblk) {
1654 cmn_err(CE_ALERT, 1651 xfs_alert(mp,
1655 " fblk 0x%p blkno %llu " 1652 " fblk 0x%p blkno %llu index %d magic 0x%x",
1656 "index %d magic 0x%x\n",
1657 fblk, 1653 fblk,
1658 (unsigned long long)fblk->blkno, 1654 (unsigned long long)fblk->blkno,
1659 fblk->index, 1655 fblk->index,
1660 fblk->magic); 1656 fblk->magic);
1661 } else { 1657 } else {
1662 cmn_err(CE_ALERT, 1658 xfs_alert(mp, " ... fblk is NULL");
1663 " ... fblk is NULL\n");
1664 } 1659 }
1665 XFS_ERROR_REPORT("xfs_dir2_node_addname_int", 1660 XFS_ERROR_REPORT("xfs_dir2_node_addname_int",
1666 XFS_ERRLEVEL_LOW, mp); 1661 XFS_ERRLEVEL_LOW, mp);
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 4c7db74a05f7..39f06336b99d 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -48,7 +48,7 @@ xfs_error_trap(int e)
48 break; 48 break;
49 if (e != xfs_etrap[i]) 49 if (e != xfs_etrap[i])
50 continue; 50 continue;
51 cmn_err(CE_NOTE, "xfs_error_trap: error %d", e); 51 xfs_notice(NULL, "%s: error %d", __func__, e);
52 BUG(); 52 BUG();
53 break; 53 break;
54 } 54 }
@@ -74,7 +74,7 @@ xfs_error_test(int error_tag, int *fsidp, char *expression,
74 74
75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 75 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) { 76 if (xfs_etest[i] == error_tag && xfs_etest_fsid[i] == fsid) {
77 cmn_err(CE_WARN, 77 xfs_warn(NULL,
78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"", 78 "Injecting error (%s) at file %s, line %d, on filesystem \"%s\"",
79 expression, file, line, xfs_etest_fsname[i]); 79 expression, file, line, xfs_etest_fsname[i]);
80 return 1; 80 return 1;
@@ -95,14 +95,14 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
95 95
96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 96 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) { 97 if (xfs_etest_fsid[i] == fsid && xfs_etest[i] == error_tag) {
98 cmn_err(CE_WARN, "XFS error tag #%d on", error_tag); 98 xfs_warn(mp, "error tag #%d on", error_tag);
99 return 0; 99 return 0;
100 } 100 }
101 } 101 }
102 102
103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) { 103 for (i = 0; i < XFS_NUM_INJECT_ERROR; i++) {
104 if (xfs_etest[i] == 0) { 104 if (xfs_etest[i] == 0) {
105 cmn_err(CE_WARN, "Turned on XFS error tag #%d", 105 xfs_warn(mp, "Turned on XFS error tag #%d",
106 error_tag); 106 error_tag);
107 xfs_etest[i] = error_tag; 107 xfs_etest[i] = error_tag;
108 xfs_etest_fsid[i] = fsid; 108 xfs_etest_fsid[i] = fsid;
@@ -114,7 +114,7 @@ xfs_errortag_add(int error_tag, xfs_mount_t *mp)
114 } 114 }
115 } 115 }
116 116
117 cmn_err(CE_WARN, "error tag overflow, too many turned on"); 117 xfs_warn(mp, "error tag overflow, too many turned on");
118 118
119 return 1; 119 return 1;
120} 120}
@@ -133,7 +133,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) && 133 if ((fsid == 0LL || xfs_etest_fsid[i] == fsid) &&
134 xfs_etest[i] != 0) { 134 xfs_etest[i] != 0) {
135 cleared = 1; 135 cleared = 1;
136 cmn_err(CE_WARN, "Clearing XFS error tag #%d", 136 xfs_warn(mp, "Clearing XFS error tag #%d",
137 xfs_etest[i]); 137 xfs_etest[i]);
138 xfs_etest[i] = 0; 138 xfs_etest[i] = 0;
139 xfs_etest_fsid[i] = 0LL; 139 xfs_etest_fsid[i] = 0LL;
@@ -144,9 +144,7 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
144 } 144 }
145 145
146 if (loud || cleared) 146 if (loud || cleared)
147 cmn_err(CE_WARN, 147 xfs_warn(mp, "Cleared all XFS error tags for filesystem");
148 "Cleared all XFS error tags for filesystem \"%s\"",
149 mp->m_fsname);
150 148
151 return 0; 149 return 0;
152} 150}
@@ -162,9 +160,8 @@ xfs_error_report(
162 inst_t *ra) 160 inst_t *ra)
163{ 161{
164 if (level <= xfs_error_level) { 162 if (level <= xfs_error_level) {
165 xfs_cmn_err(XFS_PTAG_ERROR_REPORT, 163 xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT,
166 CE_ALERT, mp, 164 "Internal error %s at line %d of file %s. Caller 0x%p\n",
167 "XFS internal error %s at line %d of file %s. Caller 0x%p\n",
168 tag, linenum, filename, ra); 165 tag, linenum, filename, ra);
169 166
170 xfs_stack_trace(); 167 xfs_stack_trace();
@@ -184,4 +181,5 @@ xfs_corruption_error(
184 if (level <= xfs_error_level) 181 if (level <= xfs_error_level)
185 xfs_hex_dump(p, 16); 182 xfs_hex_dump(p, 16);
186 xfs_error_report(tag, level, mp, filename, linenum, ra); 183 xfs_error_report(tag, level, mp, filename, linenum, ra);
184 xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
187} 185}
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 10dce5475f02..079a367f44ee 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -145,10 +145,8 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
145#endif /* DEBUG */ 145#endif /* DEBUG */
146 146
147/* 147/*
148 * XFS panic tags -- allow a call to xfs_cmn_err() be turned into 148 * XFS panic tags -- allow a call to xfs_alert_tag() be turned into
149 * a panic by setting xfs_panic_mask in a 149 * a panic by setting xfs_panic_mask in a sysctl.
150 * sysctl. update xfs_max[XFS_PARAM] if
151 * more are added.
152 */ 150 */
153#define XFS_NO_PTAG 0 151#define XFS_NO_PTAG 0
154#define XFS_PTAG_IFLUSH 0x00000001 152#define XFS_PTAG_IFLUSH 0x00000001
@@ -160,17 +158,4 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp, int loud);
160#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040 158#define XFS_PTAG_SHUTDOWN_LOGERROR 0x00000040
161#define XFS_PTAG_FSBLOCK_ZERO 0x00000080 159#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
162 160
163struct xfs_mount;
164
165extern void xfs_hex_dump(void *p, int length);
166
167#define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \
168 xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args)
169
170#define xfs_fs_mount_cmn_err(f, fmt, args...) \
171 do { \
172 if (!(f & XFS_MFSI_QUIET)) \
173 cmn_err(CE_WARN, "XFS: " fmt, ## args); \
174 } while (0)
175
176#endif /* __XFS_ERROR_H__ */ 161#endif /* __XFS_ERROR_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..9153d2c77caf 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
53 xfs_fsop_geom_t *geo, 53 xfs_fsop_geom_t *geo,
54 int new_version) 54 int new_version)
55{ 55{
56
57 memset(geo, 0, sizeof(*geo));
58
56 geo->blocksize = mp->m_sb.sb_blocksize; 59 geo->blocksize = mp->m_sb.sb_blocksize;
57 geo->rtextsize = mp->m_sb.sb_rextsize; 60 geo->rtextsize = mp->m_sb.sb_rextsize;
58 geo->agblocks = mp->m_sb.sb_agblocks; 61 geo->agblocks = mp->m_sb.sb_agblocks;
@@ -382,8 +385,8 @@ xfs_growfs_data_private(
382 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), 385 XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
383 XFS_FSS_TO_BB(mp, 1), 0, &bp); 386 XFS_FSS_TO_BB(mp, 1), 0, &bp);
384 if (error) { 387 if (error) {
385 xfs_fs_cmn_err(CE_WARN, mp, 388 xfs_warn(mp,
386 "error %d reading secondary superblock for ag %d", 389 "error %d reading secondary superblock for ag %d",
387 error, agno); 390 error, agno);
388 break; 391 break;
389 } 392 }
@@ -396,7 +399,7 @@ xfs_growfs_data_private(
396 if (!(error = xfs_bwrite(mp, bp))) { 399 if (!(error = xfs_bwrite(mp, bp))) {
397 continue; 400 continue;
398 } else { 401 } else {
399 xfs_fs_cmn_err(CE_WARN, mp, 402 xfs_warn(mp,
400 "write error %d updating secondary superblock for ag %d", 403 "write error %d updating secondary superblock for ag %d",
401 error, agno); 404 error, agno);
402 break; /* no point in continuing */ 405 break; /* no point in continuing */
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 0626a32c3447..84ebeec16642 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1055,28 +1055,23 @@ xfs_difree(
1055 */ 1055 */
1056 agno = XFS_INO_TO_AGNO(mp, inode); 1056 agno = XFS_INO_TO_AGNO(mp, inode);
1057 if (agno >= mp->m_sb.sb_agcount) { 1057 if (agno >= mp->m_sb.sb_agcount) {
1058 cmn_err(CE_WARN, 1058 xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1059 "xfs_difree: agno >= mp->m_sb.sb_agcount (%d >= %d) on %s. Returning EINVAL.", 1059 __func__, agno, mp->m_sb.sb_agcount);
1060 agno, mp->m_sb.sb_agcount, mp->m_fsname);
1061 ASSERT(0); 1060 ASSERT(0);
1062 return XFS_ERROR(EINVAL); 1061 return XFS_ERROR(EINVAL);
1063 } 1062 }
1064 agino = XFS_INO_TO_AGINO(mp, inode); 1063 agino = XFS_INO_TO_AGINO(mp, inode);
1065 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) { 1064 if (inode != XFS_AGINO_TO_INO(mp, agno, agino)) {
1066 cmn_err(CE_WARN, 1065 xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1067 "xfs_difree: inode != XFS_AGINO_TO_INO() " 1066 __func__, (unsigned long long)inode,
1068 "(%llu != %llu) on %s. Returning EINVAL.", 1067 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1069 (unsigned long long)inode,
1070 (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino),
1071 mp->m_fsname);
1072 ASSERT(0); 1068 ASSERT(0);
1073 return XFS_ERROR(EINVAL); 1069 return XFS_ERROR(EINVAL);
1074 } 1070 }
1075 agbno = XFS_AGINO_TO_AGBNO(mp, agino); 1071 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1076 if (agbno >= mp->m_sb.sb_agblocks) { 1072 if (agbno >= mp->m_sb.sb_agblocks) {
1077 cmn_err(CE_WARN, 1073 xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1078 "xfs_difree: agbno >= mp->m_sb.sb_agblocks (%d >= %d) on %s. Returning EINVAL.", 1074 __func__, agbno, mp->m_sb.sb_agblocks);
1079 agbno, mp->m_sb.sb_agblocks, mp->m_fsname);
1080 ASSERT(0); 1075 ASSERT(0);
1081 return XFS_ERROR(EINVAL); 1076 return XFS_ERROR(EINVAL);
1082 } 1077 }
@@ -1085,9 +1080,8 @@ xfs_difree(
1085 */ 1080 */
1086 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1081 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1087 if (error) { 1082 if (error) {
1088 cmn_err(CE_WARN, 1083 xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1089 "xfs_difree: xfs_ialloc_read_agi() returned an error %d on %s. Returning error.", 1084 __func__, error);
1090 error, mp->m_fsname);
1091 return error; 1085 return error;
1092 } 1086 }
1093 agi = XFS_BUF_TO_AGI(agbp); 1087 agi = XFS_BUF_TO_AGI(agbp);
@@ -1106,17 +1100,15 @@ xfs_difree(
1106 * Look for the entry describing this inode. 1100 * Look for the entry describing this inode.
1107 */ 1101 */
1108 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { 1102 if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1109 cmn_err(CE_WARN, 1103 xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1110 "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", 1104 __func__, error);
1111 error, mp->m_fsname);
1112 goto error0; 1105 goto error0;
1113 } 1106 }
1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1107 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1115 error = xfs_inobt_get_rec(cur, &rec, &i); 1108 error = xfs_inobt_get_rec(cur, &rec, &i);
1116 if (error) { 1109 if (error) {
1117 cmn_err(CE_WARN, 1110 xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1118 "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", 1111 __func__, error);
1119 error, mp->m_fsname);
1120 goto error0; 1112 goto error0;
1121 } 1113 }
1122 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 1114 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
@@ -1157,8 +1149,8 @@ xfs_difree(
1157 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); 1149 xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1158 1150
1159 if ((error = xfs_btree_delete(cur, &i))) { 1151 if ((error = xfs_btree_delete(cur, &i))) {
1160 cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n", 1152 xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1161 error, mp->m_fsname); 1153 __func__, error);
1162 goto error0; 1154 goto error0;
1163 } 1155 }
1164 1156
@@ -1170,9 +1162,8 @@ xfs_difree(
1170 1162
1171 error = xfs_inobt_update(cur, &rec); 1163 error = xfs_inobt_update(cur, &rec);
1172 if (error) { 1164 if (error) {
1173 cmn_err(CE_WARN, 1165 xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1174 "xfs_difree: xfs_inobt_update returned an error %d on %s.", 1166 __func__, error);
1175 error, mp->m_fsname);
1176 goto error0; 1167 goto error0;
1177 } 1168 }
1178 1169
@@ -1218,10 +1209,9 @@ xfs_imap_lookup(
1218 1209
1219 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp); 1210 error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1220 if (error) { 1211 if (error) {
1221 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1212 xfs_alert(mp,
1222 "xfs_ialloc_read_agi() returned " 1213 "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1223 "error %d, agno %d", 1214 __func__, error, agno);
1224 error, agno);
1225 return error; 1215 return error;
1226 } 1216 }
1227 1217
@@ -1299,24 +1289,21 @@ xfs_imap(
1299 if (flags & XFS_IGET_UNTRUSTED) 1289 if (flags & XFS_IGET_UNTRUSTED)
1300 return XFS_ERROR(EINVAL); 1290 return XFS_ERROR(EINVAL);
1301 if (agno >= mp->m_sb.sb_agcount) { 1291 if (agno >= mp->m_sb.sb_agcount) {
1302 xfs_fs_cmn_err(CE_ALERT, mp, 1292 xfs_alert(mp,
1303 "xfs_imap: agno (%d) >= " 1293 "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1304 "mp->m_sb.sb_agcount (%d)", 1294 __func__, agno, mp->m_sb.sb_agcount);
1305 agno, mp->m_sb.sb_agcount);
1306 } 1295 }
1307 if (agbno >= mp->m_sb.sb_agblocks) { 1296 if (agbno >= mp->m_sb.sb_agblocks) {
1308 xfs_fs_cmn_err(CE_ALERT, mp, 1297 xfs_alert(mp,
1309 "xfs_imap: agbno (0x%llx) >= " 1298 "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1310 "mp->m_sb.sb_agblocks (0x%lx)", 1299 __func__, (unsigned long long)agbno,
1311 (unsigned long long) agbno, 1300 (unsigned long)mp->m_sb.sb_agblocks);
1312 (unsigned long) mp->m_sb.sb_agblocks);
1313 } 1301 }
1314 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) { 1302 if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1315 xfs_fs_cmn_err(CE_ALERT, mp, 1303 xfs_alert(mp,
1316 "xfs_imap: ino (0x%llx) != " 1304 "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1317 "XFS_AGINO_TO_INO(mp, agno, agino) " 1305 __func__, ino,
1318 "(0x%llx)", 1306 XFS_AGINO_TO_INO(mp, agno, agino));
1319 ino, XFS_AGINO_TO_INO(mp, agno, agino));
1320 } 1307 }
1321 xfs_stack_trace(); 1308 xfs_stack_trace();
1322#endif /* DEBUG */ 1309#endif /* DEBUG */
@@ -1388,10 +1375,9 @@ out_map:
1388 */ 1375 */
1389 if ((imap->im_blkno + imap->im_len) > 1376 if ((imap->im_blkno + imap->im_len) >
1390 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) { 1377 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1391 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " 1378 xfs_alert(mp,
1392 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > " 1379 "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1393 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)", 1380 __func__, (unsigned long long) imap->im_blkno,
1394 (unsigned long long) imap->im_blkno,
1395 (unsigned long long) imap->im_len, 1381 (unsigned long long) imap->im_len,
1396 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)); 1382 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1397 return XFS_ERROR(EINVAL); 1383 return XFS_ERROR(EINVAL);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index be7cf625421f..a37480a6e023 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -110,8 +110,8 @@ xfs_inobp_check(
110 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 110 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
111 i * mp->m_sb.sb_inodesize); 111 i * mp->m_sb.sb_inodesize);
112 if (!dip->di_next_unlinked) { 112 if (!dip->di_next_unlinked) {
113 xfs_fs_cmn_err(CE_ALERT, mp, 113 xfs_alert(mp,
114 "Detected a bogus zero next_unlinked field in incore inode buffer 0x%p. About to pop an ASSERT.", 114 "Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
115 bp); 115 bp);
116 ASSERT(dip->di_next_unlinked); 116 ASSERT(dip->di_next_unlinked);
117 } 117 }
@@ -142,10 +142,9 @@ xfs_imap_to_bp(
142 (int)imap->im_len, buf_flags, &bp); 142 (int)imap->im_len, buf_flags, &bp);
143 if (error) { 143 if (error) {
144 if (error != EAGAIN) { 144 if (error != EAGAIN) {
145 cmn_err(CE_WARN, 145 xfs_warn(mp,
146 "xfs_imap_to_bp: xfs_trans_read_buf()returned " 146 "%s: xfs_trans_read_buf() returned error %d.",
147 "an error %d on %s. Returning error.", 147 __func__, error);
148 error, mp->m_fsname);
149 } else { 148 } else {
150 ASSERT(buf_flags & XBF_TRYLOCK); 149 ASSERT(buf_flags & XBF_TRYLOCK);
151 } 150 }
@@ -180,12 +179,11 @@ xfs_imap_to_bp(
180 XFS_CORRUPTION_ERROR("xfs_imap_to_bp", 179 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
181 XFS_ERRLEVEL_HIGH, mp, dip); 180 XFS_ERRLEVEL_HIGH, mp, dip);
182#ifdef DEBUG 181#ifdef DEBUG
183 cmn_err(CE_PANIC, 182 xfs_emerg(mp,
184 "Device %s - bad inode magic/vsn " 183 "bad inode magic/vsn daddr %lld #%d (magic=%x)",
185 "daddr %lld #%d (magic=%x)",
186 XFS_BUFTARG_NAME(mp->m_ddev_targp),
187 (unsigned long long)imap->im_blkno, i, 184 (unsigned long long)imap->im_blkno, i,
188 be16_to_cpu(dip->di_magic)); 185 be16_to_cpu(dip->di_magic));
186 ASSERT(0);
189#endif 187#endif
190 xfs_trans_brelse(tp, bp); 188 xfs_trans_brelse(tp, bp);
191 return XFS_ERROR(EFSCORRUPTED); 189 return XFS_ERROR(EFSCORRUPTED);
@@ -317,7 +315,7 @@ xfs_iformat(
317 if (unlikely(be32_to_cpu(dip->di_nextents) + 315 if (unlikely(be32_to_cpu(dip->di_nextents) +
318 be16_to_cpu(dip->di_anextents) > 316 be16_to_cpu(dip->di_anextents) >
319 be64_to_cpu(dip->di_nblocks))) { 317 be64_to_cpu(dip->di_nblocks))) {
320 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 318 xfs_warn(ip->i_mount,
321 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.", 319 "corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
322 (unsigned long long)ip->i_ino, 320 (unsigned long long)ip->i_ino,
323 (int)(be32_to_cpu(dip->di_nextents) + 321 (int)(be32_to_cpu(dip->di_nextents) +
@@ -330,8 +328,7 @@ xfs_iformat(
330 } 328 }
331 329
332 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) { 330 if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
333 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 331 xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
334 "corrupt dinode %Lu, forkoff = 0x%x.",
335 (unsigned long long)ip->i_ino, 332 (unsigned long long)ip->i_ino,
336 dip->di_forkoff); 333 dip->di_forkoff);
337 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW, 334 XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
@@ -341,7 +338,7 @@ xfs_iformat(
341 338
342 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && 339 if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
343 !ip->i_mount->m_rtdev_targp)) { 340 !ip->i_mount->m_rtdev_targp)) {
344 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 341 xfs_warn(ip->i_mount,
345 "corrupt dinode %Lu, has realtime flag set.", 342 "corrupt dinode %Lu, has realtime flag set.",
346 ip->i_ino); 343 ip->i_ino);
347 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", 344 XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
@@ -373,9 +370,8 @@ xfs_iformat(
373 * no local regular files yet 370 * no local regular files yet
374 */ 371 */
375 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) { 372 if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
376 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 373 xfs_warn(ip->i_mount,
377 "corrupt inode %Lu " 374 "corrupt inode %Lu (local format for regular file).",
378 "(local format for regular file).",
379 (unsigned long long) ip->i_ino); 375 (unsigned long long) ip->i_ino);
380 XFS_CORRUPTION_ERROR("xfs_iformat(4)", 376 XFS_CORRUPTION_ERROR("xfs_iformat(4)",
381 XFS_ERRLEVEL_LOW, 377 XFS_ERRLEVEL_LOW,
@@ -385,9 +381,8 @@ xfs_iformat(
385 381
386 di_size = be64_to_cpu(dip->di_size); 382 di_size = be64_to_cpu(dip->di_size);
387 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) { 383 if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
388 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 384 xfs_warn(ip->i_mount,
389 "corrupt inode %Lu " 385 "corrupt inode %Lu (bad size %Ld for local inode).",
390 "(bad size %Ld for local inode).",
391 (unsigned long long) ip->i_ino, 386 (unsigned long long) ip->i_ino,
392 (long long) di_size); 387 (long long) di_size);
393 XFS_CORRUPTION_ERROR("xfs_iformat(5)", 388 XFS_CORRUPTION_ERROR("xfs_iformat(5)",
@@ -431,9 +426,8 @@ xfs_iformat(
431 size = be16_to_cpu(atp->hdr.totsize); 426 size = be16_to_cpu(atp->hdr.totsize);
432 427
433 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) { 428 if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
434 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 429 xfs_warn(ip->i_mount,
435 "corrupt inode %Lu " 430 "corrupt inode %Lu (bad attr fork size %Ld).",
436 "(bad attr fork size %Ld).",
437 (unsigned long long) ip->i_ino, 431 (unsigned long long) ip->i_ino,
438 (long long) size); 432 (long long) size);
439 XFS_CORRUPTION_ERROR("xfs_iformat(8)", 433 XFS_CORRUPTION_ERROR("xfs_iformat(8)",
@@ -488,9 +482,8 @@ xfs_iformat_local(
488 * kmem_alloc() or memcpy() below. 482 * kmem_alloc() or memcpy() below.
489 */ 483 */
490 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 484 if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
491 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 485 xfs_warn(ip->i_mount,
492 "corrupt inode %Lu " 486 "corrupt inode %Lu (bad size %d for local fork, size = %d).",
493 "(bad size %d for local fork, size = %d).",
494 (unsigned long long) ip->i_ino, size, 487 (unsigned long long) ip->i_ino, size,
495 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)); 488 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
496 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW, 489 XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
@@ -547,8 +540,7 @@ xfs_iformat_extents(
547 * kmem_alloc() or memcpy() below. 540 * kmem_alloc() or memcpy() below.
548 */ 541 */
549 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { 542 if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
550 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 543 xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
551 "corrupt inode %Lu ((a)extents = %d).",
552 (unsigned long long) ip->i_ino, nex); 544 (unsigned long long) ip->i_ino, nex);
553 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, 545 XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
554 ip->i_mount, dip); 546 ip->i_mount, dip);
@@ -623,11 +615,10 @@ xfs_iformat_btree(
623 || XFS_BMDR_SPACE_CALC(nrecs) > 615 || XFS_BMDR_SPACE_CALC(nrecs) >
624 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) 616 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
625 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) { 617 || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
626 xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, 618 xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
627 "corrupt inode %Lu (btree).",
628 (unsigned long long) ip->i_ino); 619 (unsigned long long) ip->i_ino);
629 XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW, 620 XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
630 ip->i_mount); 621 ip->i_mount, dip);
631 return XFS_ERROR(EFSCORRUPTED); 622 return XFS_ERROR(EFSCORRUPTED);
632 } 623 }
633 624
@@ -813,11 +804,9 @@ xfs_iread(
813 */ 804 */
814 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) { 805 if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
815#ifdef DEBUG 806#ifdef DEBUG
816 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 807 xfs_alert(mp,
817 "dip->di_magic (0x%x) != " 808 "%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
818 "XFS_DINODE_MAGIC (0x%x)", 809 __func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
819 be16_to_cpu(dip->di_magic),
820 XFS_DINODE_MAGIC);
821#endif /* DEBUG */ 810#endif /* DEBUG */
822 error = XFS_ERROR(EINVAL); 811 error = XFS_ERROR(EINVAL);
823 goto out_brelse; 812 goto out_brelse;
@@ -835,9 +824,8 @@ xfs_iread(
835 error = xfs_iformat(ip, dip); 824 error = xfs_iformat(ip, dip);
836 if (error) { 825 if (error) {
837#ifdef DEBUG 826#ifdef DEBUG
838 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: " 827 xfs_alert(mp, "%s: xfs_iformat() returned error %d",
839 "xfs_iformat() returned error %d", 828 __func__, error);
840 error);
841#endif /* DEBUG */ 829#endif /* DEBUG */
842 goto out_brelse; 830 goto out_brelse;
843 } 831 }
@@ -1016,8 +1004,8 @@ xfs_ialloc(
1016 * This is because we're setting fields here we need 1004 * This is because we're setting fields here we need
1017 * to prevent others from looking at until we're done. 1005 * to prevent others from looking at until we're done.
1018 */ 1006 */
1019 error = xfs_trans_iget(tp->t_mountp, tp, ino, 1007 error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
1020 XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip); 1008 XFS_ILOCK_EXCL, &ip);
1021 if (error) 1009 if (error)
1022 return error; 1010 return error;
1023 ASSERT(ip != NULL); 1011 ASSERT(ip != NULL);
@@ -1166,6 +1154,7 @@ xfs_ialloc(
1166 /* 1154 /*
1167 * Log the new values stuffed into the inode. 1155 * Log the new values stuffed into the inode.
1168 */ 1156 */
1157 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
1169 xfs_trans_log_inode(tp, ip, flags); 1158 xfs_trans_log_inode(tp, ip, flags);
1170 1159
1171 /* now that we have an i_mode we can setup inode ops and unlock */ 1160 /* now that we have an i_mode we can setup inode ops and unlock */
@@ -1820,9 +1809,8 @@ xfs_iunlink_remove(
1820 */ 1809 */
1821 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1810 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1822 if (error) { 1811 if (error) {
1823 cmn_err(CE_WARN, 1812 xfs_warn(mp, "%s: xfs_itobp() returned error %d.",
1824 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1813 __func__, error);
1825 error, mp->m_fsname);
1826 return error; 1814 return error;
1827 } 1815 }
1828 next_agino = be32_to_cpu(dip->di_next_unlinked); 1816 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -1867,9 +1855,9 @@ xfs_iunlink_remove(
1867 error = xfs_inotobp(mp, tp, next_ino, &last_dip, 1855 error = xfs_inotobp(mp, tp, next_ino, &last_dip,
1868 &last_ibp, &last_offset, 0); 1856 &last_ibp, &last_offset, 0);
1869 if (error) { 1857 if (error) {
1870 cmn_err(CE_WARN, 1858 xfs_warn(mp,
1871 "xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.", 1859 "%s: xfs_inotobp() returned error %d.",
1872 error, mp->m_fsname); 1860 __func__, error);
1873 return error; 1861 return error;
1874 } 1862 }
1875 next_agino = be32_to_cpu(last_dip->di_next_unlinked); 1863 next_agino = be32_to_cpu(last_dip->di_next_unlinked);
@@ -1882,9 +1870,8 @@ xfs_iunlink_remove(
1882 */ 1870 */
1883 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK); 1871 error = xfs_itobp(mp, tp, ip, &dip, &ibp, XBF_LOCK);
1884 if (error) { 1872 if (error) {
1885 cmn_err(CE_WARN, 1873 xfs_warn(mp, "%s: xfs_itobp(2) returned error %d.",
1886 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 1874 __func__, error);
1887 error, mp->m_fsname);
1888 return error; 1875 return error;
1889 } 1876 }
1890 next_agino = be32_to_cpu(dip->di_next_unlinked); 1877 next_agino = be32_to_cpu(dip->di_next_unlinked);
@@ -2802,7 +2789,7 @@ xfs_iflush(
2802 2789
2803 /* 2790 /*
2804 * We can't flush the inode until it is unpinned, so wait for it if we 2791 * We can't flush the inode until it is unpinned, so wait for it if we
2805 * are allowed to block. We know noone new can pin it, because we are 2792 * are allowed to block. We know no one new can pin it, because we are
2806 * holding the inode lock shared and you need to hold it exclusively to 2793 * holding the inode lock shared and you need to hold it exclusively to
2807 * pin the inode. 2794 * pin the inode.
2808 * 2795 *
@@ -2848,7 +2835,7 @@ xfs_iflush(
2848 * Get the buffer containing the on-disk inode. 2835 * Get the buffer containing the on-disk inode.
2849 */ 2836 */
2850 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 2837 error = xfs_itobp(mp, NULL, ip, &dip, &bp,
2851 (flags & SYNC_WAIT) ? XBF_LOCK : XBF_TRYLOCK); 2838 (flags & SYNC_TRYLOCK) ? XBF_TRYLOCK : XBF_LOCK);
2852 if (error || !bp) { 2839 if (error || !bp) {
2853 xfs_ifunlock(ip); 2840 xfs_ifunlock(ip);
2854 return error; 2841 return error;
@@ -2939,16 +2926,16 @@ xfs_iflush_int(
2939 2926
2940 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC, 2927 if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
2941 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { 2928 mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2942 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2929 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2943 "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p", 2930 "%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2944 ip->i_ino, be16_to_cpu(dip->di_magic), dip); 2931 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2945 goto corrupt_out; 2932 goto corrupt_out;
2946 } 2933 }
2947 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC, 2934 if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2948 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) { 2935 mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2949 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2936 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2950 "xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x", 2937 "%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2951 ip->i_ino, ip, ip->i_d.di_magic); 2938 __func__, ip->i_ino, ip, ip->i_d.di_magic);
2952 goto corrupt_out; 2939 goto corrupt_out;
2953 } 2940 }
2954 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { 2941 if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
@@ -2956,9 +2943,9 @@ xfs_iflush_int(
2956 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && 2943 (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2957 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), 2944 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2958 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) { 2945 mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2959 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2946 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2960 "xfs_iflush: Bad regular inode %Lu, ptr 0x%p", 2947 "%s: Bad regular inode %Lu, ptr 0x%p",
2961 ip->i_ino, ip); 2948 __func__, ip->i_ino, ip);
2962 goto corrupt_out; 2949 goto corrupt_out;
2963 } 2950 }
2964 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { 2951 } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
@@ -2967,28 +2954,28 @@ xfs_iflush_int(
2967 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && 2954 (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2968 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL), 2955 (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2969 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) { 2956 mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2970 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2957 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2971 "xfs_iflush: Bad directory inode %Lu, ptr 0x%p", 2958 "%s: Bad directory inode %Lu, ptr 0x%p",
2972 ip->i_ino, ip); 2959 __func__, ip->i_ino, ip);
2973 goto corrupt_out; 2960 goto corrupt_out;
2974 } 2961 }
2975 } 2962 }
2976 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents > 2963 if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2977 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5, 2964 ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2978 XFS_RANDOM_IFLUSH_5)) { 2965 XFS_RANDOM_IFLUSH_5)) {
2979 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2966 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2980 "xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p", 2967 "%s: detected corrupt incore inode %Lu, "
2981 ip->i_ino, 2968 "total extents = %d, nblocks = %Ld, ptr 0x%p",
2969 __func__, ip->i_ino,
2982 ip->i_d.di_nextents + ip->i_d.di_anextents, 2970 ip->i_d.di_nextents + ip->i_d.di_anextents,
2983 ip->i_d.di_nblocks, 2971 ip->i_d.di_nblocks, ip);
2984 ip);
2985 goto corrupt_out; 2972 goto corrupt_out;
2986 } 2973 }
2987 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize, 2974 if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2988 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) { 2975 mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2989 xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp, 2976 xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2990 "xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p", 2977 "%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2991 ip->i_ino, ip->i_d.di_forkoff, ip); 2978 __func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2992 goto corrupt_out; 2979 goto corrupt_out;
2993 } 2980 }
2994 /* 2981 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 5c95fa8ec11d..ff4e2a30227d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -111,7 +111,7 @@ struct xfs_imap {
111 * Generally, we do not want to hold the i_rlock while holding the 111 * Generally, we do not want to hold the i_rlock while holding the
112 * i_ilock. Hierarchy is i_iolock followed by i_rlock. 112 * i_ilock. Hierarchy is i_iolock followed by i_rlock.
113 * 113 *
114 * xfs_iptr_t contains all the inode fields upto and including the 114 * xfs_iptr_t contains all the inode fields up to and including the
115 * i_mnext and i_mprev fields, it is used as a marker in the inode 115 * i_mnext and i_mprev fields, it is used as a marker in the inode
116 * chain off the mount structure by xfs_sync calls. 116 * chain off the mount structure by xfs_sync calls.
117 */ 117 */
@@ -336,7 +336,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
336 336
337/* 337/*
338 * Project quota id helpers (previously projid was 16bit only 338 * Project quota id helpers (previously projid was 16bit only
339 * and using two 16bit values to hold new 32bit projid was choosen 339 * and using two 16bit values to hold new 32bit projid was chosen
340 * to retain compatibility with "old" filesystems). 340 * to retain compatibility with "old" filesystems).
341 */ 341 */
342static inline prid_t 342static inline prid_t
@@ -409,28 +409,35 @@ static inline void xfs_ifunlock(xfs_inode_t *ip)
409/* 409/*
410 * Flags for lockdep annotations. 410 * Flags for lockdep annotations.
411 * 411 *
412 * XFS_I[O]LOCK_PARENT - for operations that require locking two inodes 412 * XFS_LOCK_PARENT - for directory operations that require locking a
413 * (ie directory operations that require locking a directory inode and 413 * parent directory inode and a child entry inode. The parent gets locked
414 * an entry inode). The first inode gets locked with this flag so it 414 * with this flag so it gets a lockdep subclass of 1 and the child entry
415 * gets a lockdep subclass of 1 and the second lock will have a lockdep 415 * lock will have a lockdep subclass of 0.
416 * subclass of 0. 416 *
417 * XFS_LOCK_RTBITMAP/XFS_LOCK_RTSUM - the realtime device bitmap and summary
418 * inodes do not participate in the normal lock order, and thus have their
419 * own subclasses.
417 * 420 *
418 * XFS_LOCK_INUMORDER - for locking several inodes at the some time 421 * XFS_LOCK_INUMORDER - for locking several inodes at the some time
419 * with xfs_lock_inodes(). This flag is used as the starting subclass 422 * with xfs_lock_inodes(). This flag is used as the starting subclass
420 * and each subsequent lock acquired will increment the subclass by one. 423 * and each subsequent lock acquired will increment the subclass by one.
421 * So the first lock acquired will have a lockdep subclass of 2, the 424 * So the first lock acquired will have a lockdep subclass of 4, the
422 * second lock will have a lockdep subclass of 3, and so on. It is 425 * second lock will have a lockdep subclass of 5, and so on. It is
423 * the responsibility of the class builder to shift this to the correct 426 * the responsibility of the class builder to shift this to the correct
424 * portion of the lock_mode lockdep mask. 427 * portion of the lock_mode lockdep mask.
425 */ 428 */
426#define XFS_LOCK_PARENT 1 429#define XFS_LOCK_PARENT 1
427#define XFS_LOCK_INUMORDER 2 430#define XFS_LOCK_RTBITMAP 2
431#define XFS_LOCK_RTSUM 3
432#define XFS_LOCK_INUMORDER 4
428 433
429#define XFS_IOLOCK_SHIFT 16 434#define XFS_IOLOCK_SHIFT 16
430#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) 435#define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT)
431 436
432#define XFS_ILOCK_SHIFT 24 437#define XFS_ILOCK_SHIFT 24
433#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) 438#define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT)
439#define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT)
440#define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT)
434 441
435#define XFS_IOLOCK_DEP_MASK 0x00ff0000 442#define XFS_IOLOCK_DEP_MASK 0x00ff0000
436#define XFS_ILOCK_DEP_MASK 0xff000000 443#define XFS_ILOCK_DEP_MASK 0xff000000
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index fd4f398bd6f1..576fdfe81d60 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -198,6 +198,41 @@ xfs_inode_item_size(
198} 198}
199 199
200/* 200/*
201 * xfs_inode_item_format_extents - convert in-core extents to on-disk form
202 *
203 * For either the data or attr fork in extent format, we need to endian convert
204 * the in-core extent as we place them into the on-disk inode. In this case, we
205 * need to do this conversion before we write the extents into the log. Because
206 * we don't have the disk inode to write into here, we allocate a buffer and
207 * format the extents into it via xfs_iextents_copy(). We free the buffer in
208 * the unlock routine after the copy for the log has been made.
209 *
210 * In the case of the data fork, the in-core and on-disk fork sizes can be
211 * different due to delayed allocation extents. We only log on-disk extents
212 * here, so always use the physical fork size to determine the size of the
213 * buffer we need to allocate.
214 */
215STATIC void
216xfs_inode_item_format_extents(
217 struct xfs_inode *ip,
218 struct xfs_log_iovec *vecp,
219 int whichfork,
220 int type)
221{
222 xfs_bmbt_rec_t *ext_buffer;
223
224 ext_buffer = kmem_alloc(XFS_IFORK_SIZE(ip, whichfork), KM_SLEEP);
225 if (whichfork == XFS_DATA_FORK)
226 ip->i_itemp->ili_extents_buf = ext_buffer;
227 else
228 ip->i_itemp->ili_aextents_buf = ext_buffer;
229
230 vecp->i_addr = ext_buffer;
231 vecp->i_len = xfs_iextents_copy(ip, ext_buffer, whichfork);
232 vecp->i_type = type;
233}
234
235/*
201 * This is called to fill in the vector of log iovecs for the 236 * This is called to fill in the vector of log iovecs for the
202 * given inode log item. It fills the first item with an inode 237 * given inode log item. It fills the first item with an inode
203 * log format structure, the second with the on-disk inode structure, 238 * log format structure, the second with the on-disk inode structure,
@@ -213,7 +248,6 @@ xfs_inode_item_format(
213 struct xfs_inode *ip = iip->ili_inode; 248 struct xfs_inode *ip = iip->ili_inode;
214 uint nvecs; 249 uint nvecs;
215 size_t data_bytes; 250 size_t data_bytes;
216 xfs_bmbt_rec_t *ext_buffer;
217 xfs_mount_t *mp; 251 xfs_mount_t *mp;
218 252
219 vecp->i_addr = &iip->ili_format; 253 vecp->i_addr = &iip->ili_format;
@@ -320,22 +354,8 @@ xfs_inode_item_format(
320 } else 354 } else
321#endif 355#endif
322 { 356 {
323 /* 357 xfs_inode_item_format_extents(ip, vecp,
324 * There are delayed allocation extents 358 XFS_DATA_FORK, XLOG_REG_TYPE_IEXT);
325 * in the inode, or we need to convert
326 * the extents to on disk format.
327 * Use xfs_iextents_copy()
328 * to copy only the real extents into
329 * a separate buffer. We'll free the
330 * buffer in the unlock routine.
331 */
332 ext_buffer = kmem_alloc(ip->i_df.if_bytes,
333 KM_SLEEP);
334 iip->ili_extents_buf = ext_buffer;
335 vecp->i_addr = ext_buffer;
336 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
337 XFS_DATA_FORK);
338 vecp->i_type = XLOG_REG_TYPE_IEXT;
339 } 359 }
340 ASSERT(vecp->i_len <= ip->i_df.if_bytes); 360 ASSERT(vecp->i_len <= ip->i_df.if_bytes);
341 iip->ili_format.ilf_dsize = vecp->i_len; 361 iip->ili_format.ilf_dsize = vecp->i_len;
@@ -445,19 +465,12 @@ xfs_inode_item_format(
445 */ 465 */
446 vecp->i_addr = ip->i_afp->if_u1.if_extents; 466 vecp->i_addr = ip->i_afp->if_u1.if_extents;
447 vecp->i_len = ip->i_afp->if_bytes; 467 vecp->i_len = ip->i_afp->if_bytes;
468 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
448#else 469#else
449 ASSERT(iip->ili_aextents_buf == NULL); 470 ASSERT(iip->ili_aextents_buf == NULL);
450 /* 471 xfs_inode_item_format_extents(ip, vecp,
451 * Need to endian flip before logging 472 XFS_ATTR_FORK, XLOG_REG_TYPE_IATTR_EXT);
452 */
453 ext_buffer = kmem_alloc(ip->i_afp->if_bytes,
454 KM_SLEEP);
455 iip->ili_aextents_buf = ext_buffer;
456 vecp->i_addr = ext_buffer;
457 vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
458 XFS_ATTR_FORK);
459#endif 473#endif
460 vecp->i_type = XLOG_REG_TYPE_IATTR_EXT;
461 iip->ili_format.ilf_asize = vecp->i_len; 474 iip->ili_format.ilf_asize = vecp->i_len;
462 vecp++; 475 vecp++;
463 nvecs++; 476 nvecs++;
@@ -760,11 +773,11 @@ xfs_inode_item_push(
760 * Push the inode to it's backing buffer. This will not remove the 773 * Push the inode to it's backing buffer. This will not remove the
761 * inode from the AIL - a further push will be required to trigger a 774 * inode from the AIL - a further push will be required to trigger a
762 * buffer push. However, this allows all the dirty inodes to be pushed 775 * buffer push. However, this allows all the dirty inodes to be pushed
763 * to the buffer before it is pushed to disk. THe buffer IO completion 776 * to the buffer before it is pushed to disk. The buffer IO completion
764 * will pull th einode from the AIL, mark it clean and unlock the flush 777 * will pull the inode from the AIL, mark it clean and unlock the flush
765 * lock. 778 * lock.
766 */ 779 */
767 (void) xfs_iflush(ip, 0); 780 (void) xfs_iflush(ip, SYNC_TRYLOCK);
768 xfs_iunlock(ip, XFS_ILOCK_SHARED); 781 xfs_iunlock(ip, XFS_ILOCK_SHARED);
769} 782}
770 783
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8a0f044750c3..091d82b94c4d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -101,11 +101,11 @@ xfs_iomap_eof_align_last_fsb(
101} 101}
102 102
103STATIC int 103STATIC int
104xfs_cmn_err_fsblock_zero( 104xfs_alert_fsblock_zero(
105 xfs_inode_t *ip, 105 xfs_inode_t *ip,
106 xfs_bmbt_irec_t *imap) 106 xfs_bmbt_irec_t *imap)
107{ 107{
108 xfs_cmn_err(XFS_PTAG_FSBLOCK_ZERO, CE_ALERT, ip->i_mount, 108 xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO,
109 "Access to block zero in inode %llu " 109 "Access to block zero in inode %llu "
110 "start_block: %llx start_off: %llx " 110 "start_block: %llx start_off: %llx "
111 "blkcnt: %llx extent-state: %x\n", 111 "blkcnt: %llx extent-state: %x\n",
@@ -246,7 +246,7 @@ xfs_iomap_write_direct(
246 } 246 }
247 247
248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) { 248 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) {
249 error = xfs_cmn_err_fsblock_zero(ip, imap); 249 error = xfs_alert_fsblock_zero(ip, imap);
250 goto error_out; 250 goto error_out;
251 } 251 }
252 252
@@ -464,7 +464,7 @@ retry:
464 } 464 }
465 465
466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) 466 if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
467 return xfs_cmn_err_fsblock_zero(ip, &imap[0]); 467 return xfs_alert_fsblock_zero(ip, &imap[0]);
468 468
469 *ret_imap = imap[0]; 469 *ret_imap = imap[0];
470 return 0; 470 return 0;
@@ -614,7 +614,7 @@ xfs_iomap_write_allocate(
614 * covers at least part of the callers request 614 * covers at least part of the callers request
615 */ 615 */
616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip))) 616 if (!(imap->br_startblock || XFS_IS_REALTIME_INODE(ip)))
617 return xfs_cmn_err_fsblock_zero(ip, imap); 617 return xfs_alert_fsblock_zero(ip, imap);
618 618
619 if ((offset_fsb >= imap->br_startoff) && 619 if ((offset_fsb >= imap->br_startoff) &&
620 (offset_fsb < (imap->br_startoff + 620 (offset_fsb < (imap->br_startoff +
@@ -724,7 +724,7 @@ xfs_iomap_write_unwritten(
724 return XFS_ERROR(error); 724 return XFS_ERROR(error);
725 725
726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) 726 if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip)))
727 return xfs_cmn_err_fsblock_zero(ip, &imap); 727 return xfs_alert_fsblock_zero(ip, &imap);
728 728
729 if ((numblks_fsb = imap.br_blockcount) == 0) { 729 if ((numblks_fsb = imap.br_blockcount) == 0) {
730 /* 730 /*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index dc1882adaf54..751e94fe1f77 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -204,7 +204,6 @@ xfs_bulkstat(
204 xfs_agi_t *agi; /* agi header data */ 204 xfs_agi_t *agi; /* agi header data */
205 xfs_agino_t agino; /* inode # in allocation group */ 205 xfs_agino_t agino; /* inode # in allocation group */
206 xfs_agnumber_t agno; /* allocation group number */ 206 xfs_agnumber_t agno; /* allocation group number */
207 xfs_daddr_t bno; /* inode cluster start daddr */
208 int chunkidx; /* current index into inode chunk */ 207 int chunkidx; /* current index into inode chunk */
209 int clustidx; /* current index into inode cluster */ 208 int clustidx; /* current index into inode cluster */
210 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ 209 xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
@@ -463,7 +462,6 @@ xfs_bulkstat(
463 mp->m_sb.sb_inopblog); 462 mp->m_sb.sb_inopblog);
464 } 463 }
465 ino = XFS_AGINO_TO_INO(mp, agno, agino); 464 ino = XFS_AGINO_TO_INO(mp, agno, agino);
466 bno = XFS_AGB_TO_DADDR(mp, agno, agbno);
467 /* 465 /*
468 * Skip if this inode is free. 466 * Skip if this inode is free.
469 */ 467 */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ae6fef1ff563..b612ce4520ae 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -374,11 +374,10 @@ xfs_log_mount(
374 int error; 374 int error;
375 375
376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) 376 if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
377 cmn_err(CE_NOTE, "XFS mounting filesystem %s", mp->m_fsname); 377 xfs_notice(mp, "Mounting Filesystem");
378 else { 378 else {
379 cmn_err(CE_NOTE, 379 xfs_notice(mp,
380 "Mounting filesystem \"%s\" in no-recovery mode. Filesystem will be inconsistent.", 380"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent.");
381 mp->m_fsname);
382 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); 381 ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
383 } 382 }
384 383
@@ -393,7 +392,7 @@ xfs_log_mount(
393 */ 392 */
394 error = xfs_trans_ail_init(mp); 393 error = xfs_trans_ail_init(mp);
395 if (error) { 394 if (error) {
396 cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error); 395 xfs_warn(mp, "AIL initialisation failed: error %d", error);
397 goto out_free_log; 396 goto out_free_log;
398 } 397 }
399 mp->m_log->l_ailp = mp->m_ail; 398 mp->m_log->l_ailp = mp->m_ail;
@@ -413,7 +412,8 @@ xfs_log_mount(
413 if (readonly) 412 if (readonly)
414 mp->m_flags |= XFS_MOUNT_RDONLY; 413 mp->m_flags |= XFS_MOUNT_RDONLY;
415 if (error) { 414 if (error) {
416 cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); 415 xfs_warn(mp, "log mount/recovery failed: error %d",
416 error);
417 goto out_destroy_ail; 417 goto out_destroy_ail;
418 } 418 }
419 } 419 }
@@ -542,10 +542,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
542 */ 542 */
543 } 543 }
544 544
545 if (error) { 545 if (error)
546 xfs_fs_cmn_err(CE_ALERT, mp, 546 xfs_alert(mp, "%s: unmount record failed", __func__);
547 "xfs_log_unmount: unmount record failed");
548 }
549 547
550 548
551 spin_lock(&log->l_icloglock); 549 spin_lock(&log->l_icloglock);
@@ -763,7 +761,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
763 break; 761 break;
764 case XLOG_STATE_COVER_NEED: 762 case XLOG_STATE_COVER_NEED:
765 case XLOG_STATE_COVER_NEED2: 763 case XLOG_STATE_COVER_NEED2:
766 if (!xfs_trans_ail_tail(log->l_ailp) && 764 if (!xfs_ail_min_lsn(log->l_ailp) &&
767 xlog_iclogs_empty(log)) { 765 xlog_iclogs_empty(log)) {
768 if (log->l_covered_state == XLOG_STATE_COVER_NEED) 766 if (log->l_covered_state == XLOG_STATE_COVER_NEED)
769 log->l_covered_state = XLOG_STATE_COVER_DONE; 767 log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -803,7 +801,7 @@ xlog_assign_tail_lsn(
803 xfs_lsn_t tail_lsn; 801 xfs_lsn_t tail_lsn;
804 struct log *log = mp->m_log; 802 struct log *log = mp->m_log;
805 803
806 tail_lsn = xfs_trans_ail_tail(mp->m_ail); 804 tail_lsn = xfs_ail_min_lsn(mp->m_ail);
807 if (!tail_lsn) 805 if (!tail_lsn)
808 tail_lsn = atomic64_read(&log->l_last_sync_lsn); 806 tail_lsn = atomic64_read(&log->l_last_sync_lsn);
809 807
@@ -852,7 +850,7 @@ xlog_space_left(
852 * In this case we just want to return the size of the 850 * In this case we just want to return the size of the
853 * log as the amount of space left. 851 * log as the amount of space left.
854 */ 852 */
855 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 853 xfs_alert(log->l_mp,
856 "xlog_space_left: head behind tail\n" 854 "xlog_space_left: head behind tail\n"
857 " tail_cycle = %d, tail_bytes = %d\n" 855 " tail_cycle = %d, tail_bytes = %d\n"
858 " GH cycle = %d, GH bytes = %d", 856 " GH cycle = %d, GH bytes = %d",
@@ -1001,7 +999,7 @@ xlog_alloc_log(xfs_mount_t *mp,
1001 999
1002 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL); 1000 log = kmem_zalloc(sizeof(xlog_t), KM_MAYFAIL);
1003 if (!log) { 1001 if (!log) {
1004 xlog_warn("XFS: Log allocation failed: No memory!"); 1002 xfs_warn(mp, "Log allocation failed: No memory!");
1005 goto out; 1003 goto out;
1006 } 1004 }
1007 1005
@@ -1029,24 +1027,24 @@ xlog_alloc_log(xfs_mount_t *mp,
1029 if (xfs_sb_version_hassector(&mp->m_sb)) { 1027 if (xfs_sb_version_hassector(&mp->m_sb)) {
1030 log2_size = mp->m_sb.sb_logsectlog; 1028 log2_size = mp->m_sb.sb_logsectlog;
1031 if (log2_size < BBSHIFT) { 1029 if (log2_size < BBSHIFT) {
1032 xlog_warn("XFS: Log sector size too small " 1030 xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)",
1033 "(0x%x < 0x%x)", log2_size, BBSHIFT); 1031 log2_size, BBSHIFT);
1034 goto out_free_log; 1032 goto out_free_log;
1035 } 1033 }
1036 1034
1037 log2_size -= BBSHIFT; 1035 log2_size -= BBSHIFT;
1038 if (log2_size > mp->m_sectbb_log) { 1036 if (log2_size > mp->m_sectbb_log) {
1039 xlog_warn("XFS: Log sector size too large " 1037 xfs_warn(mp, "Log sector size too large (0x%x > 0x%x)",
1040 "(0x%x > 0x%x)", log2_size, mp->m_sectbb_log); 1038 log2_size, mp->m_sectbb_log);
1041 goto out_free_log; 1039 goto out_free_log;
1042 } 1040 }
1043 1041
1044 /* for larger sector sizes, must have v2 or external log */ 1042 /* for larger sector sizes, must have v2 or external log */
1045 if (log2_size && log->l_logBBstart > 0 && 1043 if (log2_size && log->l_logBBstart > 0 &&
1046 !xfs_sb_version_haslogv2(&mp->m_sb)) { 1044 !xfs_sb_version_haslogv2(&mp->m_sb)) {
1047 1045 xfs_warn(mp,
1048 xlog_warn("XFS: log sector size (0x%x) invalid " 1046 "log sector size (0x%x) invalid for configuration.",
1049 "for configuration.", log2_size); 1047 log2_size);
1050 goto out_free_log; 1048 goto out_free_log;
1051 } 1049 }
1052 } 1050 }
@@ -1241,7 +1239,7 @@ xlog_grant_push_ail(
1241 * the filesystem is shutting down. 1239 * the filesystem is shutting down.
1242 */ 1240 */
1243 if (!XLOG_FORCED_SHUTDOWN(log)) 1241 if (!XLOG_FORCED_SHUTDOWN(log))
1244 xfs_trans_ail_push(log->l_ailp, threshold_lsn); 1242 xfs_ail_push(log->l_ailp, threshold_lsn);
1245} 1243}
1246 1244
1247/* 1245/*
@@ -1563,38 +1561,36 @@ xlog_print_tic_res(
1563 "SWAPEXT" 1561 "SWAPEXT"
1564 }; 1562 };
1565 1563
1566 xfs_fs_cmn_err(CE_WARN, mp, 1564 xfs_warn(mp,
1567 "xfs_log_write: reservation summary:\n" 1565 "xfs_log_write: reservation summary:\n"
1568 " trans type = %s (%u)\n" 1566 " trans type = %s (%u)\n"
1569 " unit res = %d bytes\n" 1567 " unit res = %d bytes\n"
1570 " current res = %d bytes\n" 1568 " current res = %d bytes\n"
1571 " total reg = %u bytes (o/flow = %u bytes)\n" 1569 " total reg = %u bytes (o/flow = %u bytes)\n"
1572 " ophdrs = %u (ophdr space = %u bytes)\n" 1570 " ophdrs = %u (ophdr space = %u bytes)\n"
1573 " ophdr + reg = %u bytes\n" 1571 " ophdr + reg = %u bytes\n"
1574 " num regions = %u\n", 1572 " num regions = %u\n",
1575 ((ticket->t_trans_type <= 0 || 1573 ((ticket->t_trans_type <= 0 ||
1576 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 1574 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
1577 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 1575 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
1578 ticket->t_trans_type, 1576 ticket->t_trans_type,
1579 ticket->t_unit_res, 1577 ticket->t_unit_res,
1580 ticket->t_curr_res, 1578 ticket->t_curr_res,
1581 ticket->t_res_arr_sum, ticket->t_res_o_flow, 1579 ticket->t_res_arr_sum, ticket->t_res_o_flow,
1582 ticket->t_res_num_ophdrs, ophdr_spc, 1580 ticket->t_res_num_ophdrs, ophdr_spc,
1583 ticket->t_res_arr_sum + 1581 ticket->t_res_arr_sum +
1584 ticket->t_res_o_flow + ophdr_spc, 1582 ticket->t_res_o_flow + ophdr_spc,
1585 ticket->t_res_num); 1583 ticket->t_res_num);
1586 1584
1587 for (i = 0; i < ticket->t_res_num; i++) { 1585 for (i = 0; i < ticket->t_res_num; i++) {
1588 uint r_type = ticket->t_res_arr[i].r_type; 1586 uint r_type = ticket->t_res_arr[i].r_type;
1589 cmn_err(CE_WARN, 1587 xfs_warn(mp, "region[%u]: %s - %u bytes\n", i,
1590 "region[%u]: %s - %u bytes\n",
1591 i,
1592 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ? 1588 ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
1593 "bad-rtype" : res_type_str[r_type-1]), 1589 "bad-rtype" : res_type_str[r_type-1]),
1594 ticket->t_res_arr[i].r_len); 1590 ticket->t_res_arr[i].r_len);
1595 } 1591 }
1596 1592
1597 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp, 1593 xfs_alert_tag(mp, XFS_PTAG_LOGRES,
1598 "xfs_log_write: reservation ran out. Need to up reservation"); 1594 "xfs_log_write: reservation ran out. Need to up reservation");
1599 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 1595 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1600} 1596}
@@ -1682,7 +1678,7 @@ xlog_write_setup_ophdr(
1682 case XFS_LOG: 1678 case XFS_LOG:
1683 break; 1679 break;
1684 default: 1680 default:
1685 xfs_fs_cmn_err(CE_WARN, log->l_mp, 1681 xfs_warn(log->l_mp,
1686 "Bad XFS transaction clientid 0x%x in ticket 0x%p", 1682 "Bad XFS transaction clientid 0x%x in ticket 0x%p",
1687 ophdr->oh_clientid, ticket); 1683 ophdr->oh_clientid, ticket);
1688 return NULL; 1684 return NULL;
@@ -2264,7 +2260,7 @@ xlog_state_do_callback(
2264 if (repeats > 5000) { 2260 if (repeats > 5000) {
2265 flushcnt += repeats; 2261 flushcnt += repeats;
2266 repeats = 0; 2262 repeats = 0;
2267 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2263 xfs_warn(log->l_mp,
2268 "%s: possible infinite loop (%d iterations)", 2264 "%s: possible infinite loop (%d iterations)",
2269 __func__, flushcnt); 2265 __func__, flushcnt);
2270 } 2266 }
@@ -3052,10 +3048,8 @@ xfs_log_force(
3052 int error; 3048 int error;
3053 3049
3054 error = _xfs_log_force(mp, flags, NULL); 3050 error = _xfs_log_force(mp, flags, NULL);
3055 if (error) { 3051 if (error)
3056 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3052 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3057 "error %d returned.", error);
3058 }
3059} 3053}
3060 3054
3061/* 3055/*
@@ -3204,10 +3198,8 @@ xfs_log_force_lsn(
3204 int error; 3198 int error;
3205 3199
3206 error = _xfs_log_force_lsn(mp, lsn, flags, NULL); 3200 error = _xfs_log_force_lsn(mp, lsn, flags, NULL);
3207 if (error) { 3201 if (error)
3208 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: " 3202 xfs_warn(mp, "%s: error %d returned.", __func__, error);
3209 "error %d returned.", error);
3210 }
3211} 3203}
3212 3204
3213/* 3205/*
@@ -3412,9 +3404,20 @@ xlog_verify_dest_ptr(
3412 } 3404 }
3413 3405
3414 if (!good_ptr) 3406 if (!good_ptr)
3415 xlog_panic("xlog_verify_dest_ptr: invalid ptr"); 3407 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3416} 3408}
3417 3409
3410/*
3411 * Check to make sure the grant write head didn't just over lap the tail. If
3412 * the cycles are the same, we can't be overlapping. Otherwise, make sure that
3413 * the cycles differ by exactly one and check the byte count.
3414 *
3415 * This check is run unlocked, so can give false positives. Rather than assert
3416 * on failures, use a warn-once flag and a panic tag to allow the admin to
3417 * determine if they want to panic the machine when such an error occurs. For
3418 * debug kernels this will have the same effect as using an assert but, unlinke
3419 * an assert, it can be turned off at runtime.
3420 */
3418STATIC void 3421STATIC void
3419xlog_verify_grant_tail( 3422xlog_verify_grant_tail(
3420 struct log *log) 3423 struct log *log)
@@ -3422,17 +3425,22 @@ xlog_verify_grant_tail(
3422 int tail_cycle, tail_blocks; 3425 int tail_cycle, tail_blocks;
3423 int cycle, space; 3426 int cycle, space;
3424 3427
3425 /*
3426 * Check to make sure the grant write head didn't just over lap the
3427 * tail. If the cycles are the same, we can't be overlapping.
3428 * Otherwise, make sure that the cycles differ by exactly one and
3429 * check the byte count.
3430 */
3431 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space); 3428 xlog_crack_grant_head(&log->l_grant_write_head, &cycle, &space);
3432 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); 3429 xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks);
3433 if (tail_cycle != cycle) { 3430 if (tail_cycle != cycle) {
3434 ASSERT(cycle - 1 == tail_cycle); 3431 if (cycle - 1 != tail_cycle &&
3435 ASSERT(space <= BBTOB(tail_blocks)); 3432 !(log->l_flags & XLOG_TAIL_WARN)) {
3433 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3434 "%s: cycle - 1 != tail_cycle", __func__);
3435 log->l_flags |= XLOG_TAIL_WARN;
3436 }
3437
3438 if (space > BBTOB(tail_blocks) &&
3439 !(log->l_flags & XLOG_TAIL_WARN)) {
3440 xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
3441 "%s: space > BBTOB(tail_blocks)", __func__);
3442 log->l_flags |= XLOG_TAIL_WARN;
3443 }
3436 } 3444 }
3437} 3445}
3438 3446
@@ -3448,16 +3456,16 @@ xlog_verify_tail_lsn(xlog_t *log,
3448 blocks = 3456 blocks =
3449 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn)); 3457 log->l_logBBsize - (log->l_prev_block - BLOCK_LSN(tail_lsn));
3450 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize)) 3458 if (blocks < BTOBB(iclog->ic_offset)+BTOBB(log->l_iclog_hsize))
3451 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3459 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3452 } else { 3460 } else {
3453 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle); 3461 ASSERT(CYCLE_LSN(tail_lsn)+1 == log->l_prev_cycle);
3454 3462
3455 if (BLOCK_LSN(tail_lsn) == log->l_prev_block) 3463 if (BLOCK_LSN(tail_lsn) == log->l_prev_block)
3456 xlog_panic("xlog_verify_tail_lsn: tail wrapped"); 3464 xfs_emerg(log->l_mp, "%s: tail wrapped", __func__);
3457 3465
3458 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block; 3466 blocks = BLOCK_LSN(tail_lsn) - log->l_prev_block;
3459 if (blocks < BTOBB(iclog->ic_offset) + 1) 3467 if (blocks < BTOBB(iclog->ic_offset) + 1)
3460 xlog_panic("xlog_verify_tail_lsn: ran out of log space"); 3468 xfs_emerg(log->l_mp, "%s: ran out of log space", __func__);
3461 } 3469 }
3462} /* xlog_verify_tail_lsn */ 3470} /* xlog_verify_tail_lsn */
3463 3471
@@ -3497,22 +3505,23 @@ xlog_verify_iclog(xlog_t *log,
3497 icptr = log->l_iclog; 3505 icptr = log->l_iclog;
3498 for (i=0; i < log->l_iclog_bufs; i++) { 3506 for (i=0; i < log->l_iclog_bufs; i++) {
3499 if (icptr == NULL) 3507 if (icptr == NULL)
3500 xlog_panic("xlog_verify_iclog: invalid ptr"); 3508 xfs_emerg(log->l_mp, "%s: invalid ptr", __func__);
3501 icptr = icptr->ic_next; 3509 icptr = icptr->ic_next;
3502 } 3510 }
3503 if (icptr != log->l_iclog) 3511 if (icptr != log->l_iclog)
3504 xlog_panic("xlog_verify_iclog: corrupt iclog ring"); 3512 xfs_emerg(log->l_mp, "%s: corrupt iclog ring", __func__);
3505 spin_unlock(&log->l_icloglock); 3513 spin_unlock(&log->l_icloglock);
3506 3514
3507 /* check log magic numbers */ 3515 /* check log magic numbers */
3508 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM) 3516 if (be32_to_cpu(iclog->ic_header.h_magicno) != XLOG_HEADER_MAGIC_NUM)
3509 xlog_panic("xlog_verify_iclog: invalid magic num"); 3517 xfs_emerg(log->l_mp, "%s: invalid magic num", __func__);
3510 3518
3511 ptr = (xfs_caddr_t) &iclog->ic_header; 3519 ptr = (xfs_caddr_t) &iclog->ic_header;
3512 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; 3520 for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count;
3513 ptr += BBSIZE) { 3521 ptr += BBSIZE) {
3514 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM) 3522 if (be32_to_cpu(*(__be32 *)ptr) == XLOG_HEADER_MAGIC_NUM)
3515 xlog_panic("xlog_verify_iclog: unexpected magic num"); 3523 xfs_emerg(log->l_mp, "%s: unexpected magic num",
3524 __func__);
3516 } 3525 }
3517 3526
3518 /* check fields */ 3527 /* check fields */
@@ -3542,9 +3551,10 @@ xlog_verify_iclog(xlog_t *log,
3542 } 3551 }
3543 } 3552 }
3544 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG) 3553 if (clientid != XFS_TRANSACTION && clientid != XFS_LOG)
3545 cmn_err(CE_WARN, "xlog_verify_iclog: " 3554 xfs_warn(log->l_mp,
3546 "invalid clientid %d op 0x%p offset 0x%lx", 3555 "%s: invalid clientid %d op 0x%p offset 0x%lx",
3547 clientid, ophead, (unsigned long)field_offset); 3556 __func__, clientid, ophead,
3557 (unsigned long)field_offset);
3548 3558
3549 /* check length */ 3559 /* check length */
3550 field_offset = (__psint_t) 3560 field_offset = (__psint_t)
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index d5f8be8f4bf6..5864850e9e34 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -87,10 +87,6 @@ static inline uint xlog_get_client_id(__be32 i)
87 return be32_to_cpu(i) >> 24; 87 return be32_to_cpu(i) >> 24;
88} 88}
89 89
90#define xlog_panic(args...) cmn_err(CE_PANIC, ## args)
91#define xlog_exit(args...) cmn_err(CE_PANIC, ## args)
92#define xlog_warn(args...) cmn_err(CE_WARN, ## args)
93
94/* 90/*
95 * In core log state 91 * In core log state
96 */ 92 */
@@ -148,6 +144,7 @@ static inline uint xlog_get_client_id(__be32 i)
148#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 144#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
149#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 145#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
150 shutdown */ 146 shutdown */
147#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */
151 148
152#ifdef __KERNEL__ 149#ifdef __KERNEL__
153/* 150/*
@@ -574,7 +571,7 @@ int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
574 * When we crack an atomic LSN, we sample it first so that the value will not 571 * When we crack an atomic LSN, we sample it first so that the value will not
575 * change while we are cracking it into the component values. This means we 572 * change while we are cracking it into the component values. This means we
576 * will always get consistent component values to work from. This should always 573 * will always get consistent component values to work from. This should always
577 * be used to smaple and crack LSNs taht are stored and updated in atomic 574 * be used to sample and crack LSNs that are stored and updated in atomic
578 * variables. 575 * variables.
579 */ 576 */
580static inline void 577static inline void
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index aa0ebb776903..5cc464a17c93 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -92,7 +92,7 @@ xlog_get_bp(
92 int nbblks) 92 int nbblks)
93{ 93{
94 if (!xlog_buf_bbcount_valid(log, nbblks)) { 94 if (!xlog_buf_bbcount_valid(log, nbblks)) {
95 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 95 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
96 nbblks); 96 nbblks);
97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 97 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
98 return NULL; 98 return NULL;
@@ -101,7 +101,7 @@ xlog_get_bp(
101 /* 101 /*
102 * We do log I/O in units of log sectors (a power-of-2 102 * We do log I/O in units of log sectors (a power-of-2
103 * multiple of the basic block size), so we round up the 103 * multiple of the basic block size), so we round up the
104 * requested size to acommodate the basic blocks required 104 * requested size to accommodate the basic blocks required
105 * for complete log sectors. 105 * for complete log sectors.
106 * 106 *
107 * In addition, the buffer may be used for a non-sector- 107 * In addition, the buffer may be used for a non-sector-
@@ -112,7 +112,7 @@ xlog_get_bp(
112 * an issue. Nor will this be a problem if the log I/O is 112 * an issue. Nor will this be a problem if the log I/O is
113 * done in basic blocks (sector size 1). But otherwise we 113 * done in basic blocks (sector size 1). But otherwise we
114 * extend the buffer by one extra log sector to ensure 114 * extend the buffer by one extra log sector to ensure
115 * there's space to accomodate this possiblility. 115 * there's space to accommodate this possibility.
116 */ 116 */
117 if (nbblks > 1 && log->l_sectBBsize > 1) 117 if (nbblks > 1 && log->l_sectBBsize > 1)
118 nbblks += log->l_sectBBsize; 118 nbblks += log->l_sectBBsize;
@@ -160,7 +160,7 @@ xlog_bread_noalign(
160 int error; 160 int error;
161 161
162 if (!xlog_buf_bbcount_valid(log, nbblks)) { 162 if (!xlog_buf_bbcount_valid(log, nbblks)) {
163 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 163 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
164 nbblks); 164 nbblks);
165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 165 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
166 return EFSCORRUPTED; 166 return EFSCORRUPTED;
@@ -219,7 +219,7 @@ xlog_bwrite(
219 int error; 219 int error;
220 220
221 if (!xlog_buf_bbcount_valid(log, nbblks)) { 221 if (!xlog_buf_bbcount_valid(log, nbblks)) {
222 xlog_warn("XFS: Invalid block length (0x%x) given for buffer", 222 xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
223 nbblks); 223 nbblks);
224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); 224 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
225 return EFSCORRUPTED; 225 return EFSCORRUPTED;
@@ -254,9 +254,9 @@ xlog_header_check_dump(
254 xfs_mount_t *mp, 254 xfs_mount_t *mp,
255 xlog_rec_header_t *head) 255 xlog_rec_header_t *head)
256{ 256{
257 cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", 257 xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n",
258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT); 258 __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
259 cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", 259 xfs_debug(mp, " log : uuid = %pU, fmt = %d\n",
260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); 260 &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
261} 261}
262#else 262#else
@@ -279,15 +279,15 @@ xlog_header_check_recover(
279 * a dirty log created in IRIX. 279 * a dirty log created in IRIX.
280 */ 280 */
281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 281 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
282 xlog_warn( 282 xfs_warn(mp,
283 "XFS: dirty log written in incompatible format - can't recover"); 283 "dirty log written in incompatible format - can't recover");
284 xlog_header_check_dump(mp, head); 284 xlog_header_check_dump(mp, head);
285 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 285 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
286 XFS_ERRLEVEL_HIGH, mp); 286 XFS_ERRLEVEL_HIGH, mp);
287 return XFS_ERROR(EFSCORRUPTED); 287 return XFS_ERROR(EFSCORRUPTED);
288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 288 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
289 xlog_warn( 289 xfs_warn(mp,
290 "XFS: dirty log entry has mismatched uuid - can't recover"); 290 "dirty log entry has mismatched uuid - can't recover");
291 xlog_header_check_dump(mp, head); 291 xlog_header_check_dump(mp, head);
292 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 292 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
293 XFS_ERRLEVEL_HIGH, mp); 293 XFS_ERRLEVEL_HIGH, mp);
@@ -312,9 +312,9 @@ xlog_header_check_mount(
312 * h_fs_uuid is nil, we assume this log was last mounted 312 * h_fs_uuid is nil, we assume this log was last mounted
313 * by IRIX and continue. 313 * by IRIX and continue.
314 */ 314 */
315 xlog_warn("XFS: nil uuid in log - IRIX style log"); 315 xfs_warn(mp, "nil uuid in log - IRIX style log");
316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 316 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
317 xlog_warn("XFS: log has mismatched uuid - can't recover"); 317 xfs_warn(mp, "log has mismatched uuid - can't recover");
318 xlog_header_check_dump(mp, head); 318 xlog_header_check_dump(mp, head);
319 XFS_ERROR_REPORT("xlog_header_check_mount", 319 XFS_ERROR_REPORT("xlog_header_check_mount",
320 XFS_ERRLEVEL_HIGH, mp); 320 XFS_ERRLEVEL_HIGH, mp);
@@ -490,8 +490,8 @@ xlog_find_verify_log_record(
490 for (i = (*last_blk) - 1; i >= 0; i--) { 490 for (i = (*last_blk) - 1; i >= 0; i--) {
491 if (i < start_blk) { 491 if (i < start_blk) {
492 /* valid log record not found */ 492 /* valid log record not found */
493 xlog_warn( 493 xfs_warn(log->l_mp,
494 "XFS: Log inconsistent (didn't find previous header)"); 494 "Log inconsistent (didn't find previous header)");
495 ASSERT(0); 495 ASSERT(0);
496 error = XFS_ERROR(EIO); 496 error = XFS_ERROR(EIO);
497 goto out; 497 goto out;
@@ -591,12 +591,12 @@ xlog_find_head(
591 * mkfs etc write a dummy unmount record to a fresh 591 * mkfs etc write a dummy unmount record to a fresh
592 * log so we can store the uuid in there 592 * log so we can store the uuid in there
593 */ 593 */
594 xlog_warn("XFS: totally zeroed log"); 594 xfs_warn(log->l_mp, "totally zeroed log");
595 } 595 }
596 596
597 return 0; 597 return 0;
598 } else if (error) { 598 } else if (error) {
599 xlog_warn("XFS: empty log check failed"); 599 xfs_warn(log->l_mp, "empty log check failed");
600 return error; 600 return error;
601 } 601 }
602 602
@@ -819,7 +819,7 @@ validate_head:
819 xlog_put_bp(bp); 819 xlog_put_bp(bp);
820 820
821 if (error) 821 if (error)
822 xlog_warn("XFS: failed to find log head"); 822 xfs_warn(log->l_mp, "failed to find log head");
823 return error; 823 return error;
824} 824}
825 825
@@ -912,7 +912,7 @@ xlog_find_tail(
912 } 912 }
913 } 913 }
914 if (!found) { 914 if (!found) {
915 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 915 xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
916 ASSERT(0); 916 ASSERT(0);
917 return XFS_ERROR(EIO); 917 return XFS_ERROR(EIO);
918 } 918 }
@@ -1028,7 +1028,7 @@ done:
1028 xlog_put_bp(bp); 1028 xlog_put_bp(bp);
1029 1029
1030 if (error) 1030 if (error)
1031 xlog_warn("XFS: failed to locate log tail"); 1031 xfs_warn(log->l_mp, "failed to locate log tail");
1032 return error; 1032 return error;
1033} 1033}
1034 1034
@@ -1092,7 +1092,8 @@ xlog_find_zeroed(
1092 * the first block must be 1. If it's not, maybe we're 1092 * the first block must be 1. If it's not, maybe we're
1093 * not looking at a log... Bail out. 1093 * not looking at a log... Bail out.
1094 */ 1094 */
1095 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1095 xfs_warn(log->l_mp,
1096 "Log inconsistent or not a log (last==0, first!=1)");
1096 return XFS_ERROR(EINVAL); 1097 return XFS_ERROR(EINVAL);
1097 } 1098 }
1098 1099
@@ -1506,8 +1507,8 @@ xlog_recover_add_to_trans(
1506 if (list_empty(&trans->r_itemq)) { 1507 if (list_empty(&trans->r_itemq)) {
1507 /* we need to catch log corruptions here */ 1508 /* we need to catch log corruptions here */
1508 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1509 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1509 xlog_warn("XFS: xlog_recover_add_to_trans: " 1510 xfs_warn(log->l_mp, "%s: bad header magic number",
1510 "bad header magic number"); 1511 __func__);
1511 ASSERT(0); 1512 ASSERT(0);
1512 return XFS_ERROR(EIO); 1513 return XFS_ERROR(EIO);
1513 } 1514 }
@@ -1534,8 +1535,8 @@ xlog_recover_add_to_trans(
1534 if (item->ri_total == 0) { /* first region to be added */ 1535 if (item->ri_total == 0) { /* first region to be added */
1535 if (in_f->ilf_size == 0 || 1536 if (in_f->ilf_size == 0 ||
1536 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { 1537 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1537 xlog_warn( 1538 xfs_warn(log->l_mp,
1538 "XFS: bad number of regions (%d) in inode log format", 1539 "bad number of regions (%d) in inode log format",
1539 in_f->ilf_size); 1540 in_f->ilf_size);
1540 ASSERT(0); 1541 ASSERT(0);
1541 return XFS_ERROR(EIO); 1542 return XFS_ERROR(EIO);
@@ -1592,8 +1593,9 @@ xlog_recover_reorder_trans(
1592 list_move_tail(&item->ri_list, &trans->r_itemq); 1593 list_move_tail(&item->ri_list, &trans->r_itemq);
1593 break; 1594 break;
1594 default: 1595 default:
1595 xlog_warn( 1596 xfs_warn(log->l_mp,
1596 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1597 "%s: unrecognized type of log operation",
1598 __func__);
1597 ASSERT(0); 1599 ASSERT(0);
1598 return XFS_ERROR(EIO); 1600 return XFS_ERROR(EIO);
1599 } 1601 }
@@ -1803,8 +1805,9 @@ xlog_recover_do_inode_buffer(
1803 logged_nextp = item->ri_buf[item_index].i_addr + 1805 logged_nextp = item->ri_buf[item_index].i_addr +
1804 next_unlinked_offset - reg_buf_offset; 1806 next_unlinked_offset - reg_buf_offset;
1805 if (unlikely(*logged_nextp == 0)) { 1807 if (unlikely(*logged_nextp == 0)) {
1806 xfs_fs_cmn_err(CE_ALERT, mp, 1808 xfs_alert(mp,
1807 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1809 "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1810 "Trying to replay bad (0) inode di_next_unlinked field.",
1808 item, bp); 1811 item, bp);
1809 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1812 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1810 XFS_ERRLEVEL_LOW, mp); 1813 XFS_ERRLEVEL_LOW, mp);
@@ -1863,17 +1866,17 @@ xlog_recover_do_reg_buffer(
1863 if (buf_f->blf_flags & 1866 if (buf_f->blf_flags &
1864 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { 1867 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1865 if (item->ri_buf[i].i_addr == NULL) { 1868 if (item->ri_buf[i].i_addr == NULL) {
1866 cmn_err(CE_ALERT, 1869 xfs_alert(mp,
1867 "XFS: NULL dquot in %s.", __func__); 1870 "XFS: NULL dquot in %s.", __func__);
1868 goto next; 1871 goto next;
1869 } 1872 }
1870 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { 1873 if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1871 cmn_err(CE_ALERT, 1874 xfs_alert(mp,
1872 "XFS: dquot too small (%d) in %s.", 1875 "XFS: dquot too small (%d) in %s.",
1873 item->ri_buf[i].i_len, __func__); 1876 item->ri_buf[i].i_len, __func__);
1874 goto next; 1877 goto next;
1875 } 1878 }
1876 error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, 1879 error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1877 -1, 0, XFS_QMOPT_DOWARN, 1880 -1, 0, XFS_QMOPT_DOWARN,
1878 "dquot_buf_recover"); 1881 "dquot_buf_recover");
1879 if (error) 1882 if (error)
@@ -1898,6 +1901,7 @@ xlog_recover_do_reg_buffer(
1898 */ 1901 */
1899int 1902int
1900xfs_qm_dqcheck( 1903xfs_qm_dqcheck(
1904 struct xfs_mount *mp,
1901 xfs_disk_dquot_t *ddq, 1905 xfs_disk_dquot_t *ddq,
1902 xfs_dqid_t id, 1906 xfs_dqid_t id,
1903 uint type, /* used only when IO_dorepair is true */ 1907 uint type, /* used only when IO_dorepair is true */
@@ -1924,14 +1928,14 @@ xfs_qm_dqcheck(
1924 */ 1928 */
1925 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1929 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1926 if (flags & XFS_QMOPT_DOWARN) 1930 if (flags & XFS_QMOPT_DOWARN)
1927 cmn_err(CE_ALERT, 1931 xfs_alert(mp,
1928 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1932 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1929 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1933 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1930 errs++; 1934 errs++;
1931 } 1935 }
1932 if (ddq->d_version != XFS_DQUOT_VERSION) { 1936 if (ddq->d_version != XFS_DQUOT_VERSION) {
1933 if (flags & XFS_QMOPT_DOWARN) 1937 if (flags & XFS_QMOPT_DOWARN)
1934 cmn_err(CE_ALERT, 1938 xfs_alert(mp,
1935 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1939 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1936 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1940 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1937 errs++; 1941 errs++;
@@ -1941,7 +1945,7 @@ xfs_qm_dqcheck(
1941 ddq->d_flags != XFS_DQ_PROJ && 1945 ddq->d_flags != XFS_DQ_PROJ &&
1942 ddq->d_flags != XFS_DQ_GROUP) { 1946 ddq->d_flags != XFS_DQ_GROUP) {
1943 if (flags & XFS_QMOPT_DOWARN) 1947 if (flags & XFS_QMOPT_DOWARN)
1944 cmn_err(CE_ALERT, 1948 xfs_alert(mp,
1945 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1949 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1946 str, id, ddq->d_flags); 1950 str, id, ddq->d_flags);
1947 errs++; 1951 errs++;
@@ -1949,7 +1953,7 @@ xfs_qm_dqcheck(
1949 1953
1950 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1954 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1951 if (flags & XFS_QMOPT_DOWARN) 1955 if (flags & XFS_QMOPT_DOWARN)
1952 cmn_err(CE_ALERT, 1956 xfs_alert(mp,
1953 "%s : ondisk-dquot 0x%p, ID mismatch: " 1957 "%s : ondisk-dquot 0x%p, ID mismatch: "
1954 "0x%x expected, found id 0x%x", 1958 "0x%x expected, found id 0x%x",
1955 str, ddq, id, be32_to_cpu(ddq->d_id)); 1959 str, ddq, id, be32_to_cpu(ddq->d_id));
@@ -1962,9 +1966,8 @@ xfs_qm_dqcheck(
1962 be64_to_cpu(ddq->d_blk_softlimit)) { 1966 be64_to_cpu(ddq->d_blk_softlimit)) {
1963 if (!ddq->d_btimer) { 1967 if (!ddq->d_btimer) {
1964 if (flags & XFS_QMOPT_DOWARN) 1968 if (flags & XFS_QMOPT_DOWARN)
1965 cmn_err(CE_ALERT, 1969 xfs_alert(mp,
1966 "%s : Dquot ID 0x%x (0x%p) " 1970 "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
1967 "BLK TIMER NOT STARTED",
1968 str, (int)be32_to_cpu(ddq->d_id), ddq); 1971 str, (int)be32_to_cpu(ddq->d_id), ddq);
1969 errs++; 1972 errs++;
1970 } 1973 }
@@ -1974,9 +1977,8 @@ xfs_qm_dqcheck(
1974 be64_to_cpu(ddq->d_ino_softlimit)) { 1977 be64_to_cpu(ddq->d_ino_softlimit)) {
1975 if (!ddq->d_itimer) { 1978 if (!ddq->d_itimer) {
1976 if (flags & XFS_QMOPT_DOWARN) 1979 if (flags & XFS_QMOPT_DOWARN)
1977 cmn_err(CE_ALERT, 1980 xfs_alert(mp,
1978 "%s : Dquot ID 0x%x (0x%p) " 1981 "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
1979 "INODE TIMER NOT STARTED",
1980 str, (int)be32_to_cpu(ddq->d_id), ddq); 1982 str, (int)be32_to_cpu(ddq->d_id), ddq);
1981 errs++; 1983 errs++;
1982 } 1984 }
@@ -1986,9 +1988,8 @@ xfs_qm_dqcheck(
1986 be64_to_cpu(ddq->d_rtb_softlimit)) { 1988 be64_to_cpu(ddq->d_rtb_softlimit)) {
1987 if (!ddq->d_rtbtimer) { 1989 if (!ddq->d_rtbtimer) {
1988 if (flags & XFS_QMOPT_DOWARN) 1990 if (flags & XFS_QMOPT_DOWARN)
1989 cmn_err(CE_ALERT, 1991 xfs_alert(mp,
1990 "%s : Dquot ID 0x%x (0x%p) " 1992 "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
1991 "RTBLK TIMER NOT STARTED",
1992 str, (int)be32_to_cpu(ddq->d_id), ddq); 1993 str, (int)be32_to_cpu(ddq->d_id), ddq);
1993 errs++; 1994 errs++;
1994 } 1995 }
@@ -1999,7 +2000,7 @@ xfs_qm_dqcheck(
1999 return errs; 2000 return errs;
2000 2001
2001 if (flags & XFS_QMOPT_DOWARN) 2002 if (flags & XFS_QMOPT_DOWARN)
2002 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2003 xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2003 2004
2004 /* 2005 /*
2005 * Typically, a repair is only requested by quotacheck. 2006 * Typically, a repair is only requested by quotacheck.
@@ -2218,9 +2219,9 @@ xlog_recover_inode_pass2(
2218 */ 2219 */
2219 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2220 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2220 xfs_buf_relse(bp); 2221 xfs_buf_relse(bp);
2221 xfs_fs_cmn_err(CE_ALERT, mp, 2222 xfs_alert(mp,
2222 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2223 "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2223 dip, bp, in_f->ilf_ino); 2224 __func__, dip, bp, in_f->ilf_ino);
2224 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", 2225 XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2225 XFS_ERRLEVEL_LOW, mp); 2226 XFS_ERRLEVEL_LOW, mp);
2226 error = EFSCORRUPTED; 2227 error = EFSCORRUPTED;
@@ -2229,9 +2230,9 @@ xlog_recover_inode_pass2(
2229 dicp = item->ri_buf[1].i_addr; 2230 dicp = item->ri_buf[1].i_addr;
2230 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2231 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2231 xfs_buf_relse(bp); 2232 xfs_buf_relse(bp);
2232 xfs_fs_cmn_err(CE_ALERT, mp, 2233 xfs_alert(mp,
2233 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2234 "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2234 item, in_f->ilf_ino); 2235 __func__, item, in_f->ilf_ino);
2235 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", 2236 XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2236 XFS_ERRLEVEL_LOW, mp); 2237 XFS_ERRLEVEL_LOW, mp);
2237 error = EFSCORRUPTED; 2238 error = EFSCORRUPTED;
@@ -2263,9 +2264,10 @@ xlog_recover_inode_pass2(
2263 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", 2264 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2264 XFS_ERRLEVEL_LOW, mp, dicp); 2265 XFS_ERRLEVEL_LOW, mp, dicp);
2265 xfs_buf_relse(bp); 2266 xfs_buf_relse(bp);
2266 xfs_fs_cmn_err(CE_ALERT, mp, 2267 xfs_alert(mp,
2267 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2268 "%s: Bad regular inode log record, rec ptr 0x%p, "
2268 item, dip, bp, in_f->ilf_ino); 2269 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2270 __func__, item, dip, bp, in_f->ilf_ino);
2269 error = EFSCORRUPTED; 2271 error = EFSCORRUPTED;
2270 goto error; 2272 goto error;
2271 } 2273 }
@@ -2276,9 +2278,10 @@ xlog_recover_inode_pass2(
2276 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", 2278 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2277 XFS_ERRLEVEL_LOW, mp, dicp); 2279 XFS_ERRLEVEL_LOW, mp, dicp);
2278 xfs_buf_relse(bp); 2280 xfs_buf_relse(bp);
2279 xfs_fs_cmn_err(CE_ALERT, mp, 2281 xfs_alert(mp,
2280 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2282 "%s: Bad dir inode log record, rec ptr 0x%p, "
2281 item, dip, bp, in_f->ilf_ino); 2283 "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2284 __func__, item, dip, bp, in_f->ilf_ino);
2282 error = EFSCORRUPTED; 2285 error = EFSCORRUPTED;
2283 goto error; 2286 goto error;
2284 } 2287 }
@@ -2287,9 +2290,10 @@ xlog_recover_inode_pass2(
2287 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", 2290 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2288 XFS_ERRLEVEL_LOW, mp, dicp); 2291 XFS_ERRLEVEL_LOW, mp, dicp);
2289 xfs_buf_relse(bp); 2292 xfs_buf_relse(bp);
2290 xfs_fs_cmn_err(CE_ALERT, mp, 2293 xfs_alert(mp,
2291 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2294 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2292 item, dip, bp, in_f->ilf_ino, 2295 "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2296 __func__, item, dip, bp, in_f->ilf_ino,
2293 dicp->di_nextents + dicp->di_anextents, 2297 dicp->di_nextents + dicp->di_anextents,
2294 dicp->di_nblocks); 2298 dicp->di_nblocks);
2295 error = EFSCORRUPTED; 2299 error = EFSCORRUPTED;
@@ -2299,8 +2303,9 @@ xlog_recover_inode_pass2(
2299 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", 2303 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2300 XFS_ERRLEVEL_LOW, mp, dicp); 2304 XFS_ERRLEVEL_LOW, mp, dicp);
2301 xfs_buf_relse(bp); 2305 xfs_buf_relse(bp);
2302 xfs_fs_cmn_err(CE_ALERT, mp, 2306 xfs_alert(mp,
2303 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2307 "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2308 "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2304 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); 2309 item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2305 error = EFSCORRUPTED; 2310 error = EFSCORRUPTED;
2306 goto error; 2311 goto error;
@@ -2309,9 +2314,9 @@ xlog_recover_inode_pass2(
2309 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", 2314 XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2310 XFS_ERRLEVEL_LOW, mp, dicp); 2315 XFS_ERRLEVEL_LOW, mp, dicp);
2311 xfs_buf_relse(bp); 2316 xfs_buf_relse(bp);
2312 xfs_fs_cmn_err(CE_ALERT, mp, 2317 xfs_alert(mp,
2313 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2318 "%s: Bad inode log record length %d, rec ptr 0x%p",
2314 item->ri_buf[1].i_len, item); 2319 __func__, item->ri_buf[1].i_len, item);
2315 error = EFSCORRUPTED; 2320 error = EFSCORRUPTED;
2316 goto error; 2321 goto error;
2317 } 2322 }
@@ -2398,7 +2403,7 @@ xlog_recover_inode_pass2(
2398 break; 2403 break;
2399 2404
2400 default: 2405 default:
2401 xlog_warn("XFS: xlog_recover_inode_pass2: Invalid flag"); 2406 xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2402 ASSERT(0); 2407 ASSERT(0);
2403 xfs_buf_relse(bp); 2408 xfs_buf_relse(bp);
2404 error = EIO; 2409 error = EIO;
@@ -2467,13 +2472,11 @@ xlog_recover_dquot_pass2(
2467 2472
2468 recddq = item->ri_buf[1].i_addr; 2473 recddq = item->ri_buf[1].i_addr;
2469 if (recddq == NULL) { 2474 if (recddq == NULL) {
2470 cmn_err(CE_ALERT, 2475 xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2471 "XFS: NULL dquot in %s.", __func__);
2472 return XFS_ERROR(EIO); 2476 return XFS_ERROR(EIO);
2473 } 2477 }
2474 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { 2478 if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2475 cmn_err(CE_ALERT, 2479 xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2476 "XFS: dquot too small (%d) in %s.",
2477 item->ri_buf[1].i_len, __func__); 2480 item->ri_buf[1].i_len, __func__);
2478 return XFS_ERROR(EIO); 2481 return XFS_ERROR(EIO);
2479 } 2482 }
@@ -2498,12 +2501,10 @@ xlog_recover_dquot_pass2(
2498 */ 2501 */
2499 dq_f = item->ri_buf[0].i_addr; 2502 dq_f = item->ri_buf[0].i_addr;
2500 ASSERT(dq_f); 2503 ASSERT(dq_f);
2501 if ((error = xfs_qm_dqcheck(recddq, 2504 error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2502 dq_f->qlf_id, 2505 "xlog_recover_dquot_pass2 (log copy)");
2503 0, XFS_QMOPT_DOWARN, 2506 if (error)
2504 "xlog_recover_dquot_pass2 (log copy)"))) {
2505 return XFS_ERROR(EIO); 2507 return XFS_ERROR(EIO);
2506 }
2507 ASSERT(dq_f->qlf_len == 1); 2508 ASSERT(dq_f->qlf_len == 1);
2508 2509
2509 error = xfs_read_buf(mp, mp->m_ddev_targp, 2510 error = xfs_read_buf(mp, mp->m_ddev_targp,
@@ -2523,8 +2524,9 @@ xlog_recover_dquot_pass2(
2523 * was among a chunk of dquots created earlier, and we did some 2524 * was among a chunk of dquots created earlier, and we did some
2524 * minimal initialization then. 2525 * minimal initialization then.
2525 */ 2526 */
2526 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2527 error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2527 "xlog_recover_dquot_pass2")) { 2528 "xlog_recover_dquot_pass2");
2529 if (error) {
2528 xfs_buf_relse(bp); 2530 xfs_buf_relse(bp);
2529 return XFS_ERROR(EIO); 2531 return XFS_ERROR(EIO);
2530 } 2532 }
@@ -2676,9 +2678,8 @@ xlog_recover_commit_pass1(
2676 /* nothing to do in pass 1 */ 2678 /* nothing to do in pass 1 */
2677 return 0; 2679 return 0;
2678 default: 2680 default:
2679 xlog_warn( 2681 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2680 "XFS: invalid item type (%d) xlog_recover_commit_pass1", 2682 __func__, ITEM_TYPE(item));
2681 ITEM_TYPE(item));
2682 ASSERT(0); 2683 ASSERT(0);
2683 return XFS_ERROR(EIO); 2684 return XFS_ERROR(EIO);
2684 } 2685 }
@@ -2707,9 +2708,8 @@ xlog_recover_commit_pass2(
2707 /* nothing to do in pass2 */ 2708 /* nothing to do in pass2 */
2708 return 0; 2709 return 0;
2709 default: 2710 default:
2710 xlog_warn( 2711 xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2711 "XFS: invalid item type (%d) xlog_recover_commit_pass2", 2712 __func__, ITEM_TYPE(item));
2712 ITEM_TYPE(item));
2713 ASSERT(0); 2713 ASSERT(0);
2714 return XFS_ERROR(EIO); 2714 return XFS_ERROR(EIO);
2715 } 2715 }
@@ -2751,10 +2751,11 @@ xlog_recover_commit_trans(
2751 2751
2752STATIC int 2752STATIC int
2753xlog_recover_unmount_trans( 2753xlog_recover_unmount_trans(
2754 struct log *log,
2754 xlog_recover_t *trans) 2755 xlog_recover_t *trans)
2755{ 2756{
2756 /* Do nothing now */ 2757 /* Do nothing now */
2757 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2758 xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2758 return 0; 2759 return 0;
2759} 2760}
2760 2761
@@ -2797,8 +2798,8 @@ xlog_recover_process_data(
2797 dp += sizeof(xlog_op_header_t); 2798 dp += sizeof(xlog_op_header_t);
2798 if (ohead->oh_clientid != XFS_TRANSACTION && 2799 if (ohead->oh_clientid != XFS_TRANSACTION &&
2799 ohead->oh_clientid != XFS_LOG) { 2800 ohead->oh_clientid != XFS_LOG) {
2800 xlog_warn( 2801 xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2801 "XFS: xlog_recover_process_data: bad clientid"); 2802 __func__, ohead->oh_clientid);
2802 ASSERT(0); 2803 ASSERT(0);
2803 return (XFS_ERROR(EIO)); 2804 return (XFS_ERROR(EIO));
2804 } 2805 }
@@ -2811,8 +2812,8 @@ xlog_recover_process_data(
2811 be64_to_cpu(rhead->h_lsn)); 2812 be64_to_cpu(rhead->h_lsn));
2812 } else { 2813 } else {
2813 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2814 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2814 xlog_warn( 2815 xfs_warn(log->l_mp, "%s: bad length 0x%x",
2815 "XFS: xlog_recover_process_data: bad length"); 2816 __func__, be32_to_cpu(ohead->oh_len));
2816 WARN_ON(1); 2817 WARN_ON(1);
2817 return (XFS_ERROR(EIO)); 2818 return (XFS_ERROR(EIO));
2818 } 2819 }
@@ -2825,7 +2826,7 @@ xlog_recover_process_data(
2825 trans, pass); 2826 trans, pass);
2826 break; 2827 break;
2827 case XLOG_UNMOUNT_TRANS: 2828 case XLOG_UNMOUNT_TRANS:
2828 error = xlog_recover_unmount_trans(trans); 2829 error = xlog_recover_unmount_trans(log, trans);
2829 break; 2830 break;
2830 case XLOG_WAS_CONT_TRANS: 2831 case XLOG_WAS_CONT_TRANS:
2831 error = xlog_recover_add_to_cont_trans(log, 2832 error = xlog_recover_add_to_cont_trans(log,
@@ -2833,8 +2834,8 @@ xlog_recover_process_data(
2833 be32_to_cpu(ohead->oh_len)); 2834 be32_to_cpu(ohead->oh_len));
2834 break; 2835 break;
2835 case XLOG_START_TRANS: 2836 case XLOG_START_TRANS:
2836 xlog_warn( 2837 xfs_warn(log->l_mp, "%s: bad transaction",
2837 "XFS: xlog_recover_process_data: bad transaction"); 2838 __func__);
2838 ASSERT(0); 2839 ASSERT(0);
2839 error = XFS_ERROR(EIO); 2840 error = XFS_ERROR(EIO);
2840 break; 2841 break;
@@ -2844,8 +2845,8 @@ xlog_recover_process_data(
2844 dp, be32_to_cpu(ohead->oh_len)); 2845 dp, be32_to_cpu(ohead->oh_len));
2845 break; 2846 break;
2846 default: 2847 default:
2847 xlog_warn( 2848 xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2848 "XFS: xlog_recover_process_data: bad flag"); 2849 __func__, flags);
2849 ASSERT(0); 2850 ASSERT(0);
2850 error = XFS_ERROR(EIO); 2851 error = XFS_ERROR(EIO);
2851 break; 2852 break;
@@ -3030,8 +3031,7 @@ xlog_recover_clear_agi_bucket(
3030out_abort: 3031out_abort:
3031 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3032 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3032out_error: 3033out_error:
3033 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3034 xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3034 "failed to clear agi %d. Continuing.", agno);
3035 return; 3035 return;
3036} 3036}
3037 3037
@@ -3282,7 +3282,7 @@ xlog_valid_rec_header(
3282 if (unlikely( 3282 if (unlikely(
3283 (!rhead->h_version || 3283 (!rhead->h_version ||
3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3284 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3285 xlog_warn("XFS: %s: unrecognised log version (%d).", 3285 xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3286 __func__, be32_to_cpu(rhead->h_version)); 3286 __func__, be32_to_cpu(rhead->h_version));
3287 return XFS_ERROR(EIO); 3287 return XFS_ERROR(EIO);
3288 } 3288 }
@@ -3740,10 +3740,9 @@ xlog_recover(
3740 return error; 3740 return error;
3741 } 3741 }
3742 3742
3743 cmn_err(CE_NOTE, 3743 xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3744 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3744 log->l_mp->m_logname ? log->l_mp->m_logname
3745 log->l_mp->m_fsname, log->l_mp->m_logname ? 3745 : "internal");
3746 log->l_mp->m_logname : "internal");
3747 3746
3748 error = xlog_do_recover(log, head_blk, tail_blk); 3747 error = xlog_do_recover(log, head_blk, tail_blk);
3749 log->l_flags |= XLOG_RECOVERY_NEEDED; 3748 log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3776,9 +3775,7 @@ xlog_recover_finish(
3776 int error; 3775 int error;
3777 error = xlog_recover_process_efis(log); 3776 error = xlog_recover_process_efis(log);
3778 if (error) { 3777 if (error) {
3779 cmn_err(CE_ALERT, 3778 xfs_alert(log->l_mp, "Failed to recover EFIs");
3780 "Failed to recover EFIs on filesystem: %s",
3781 log->l_mp->m_fsname);
3782 return error; 3779 return error;
3783 } 3780 }
3784 /* 3781 /*
@@ -3793,15 +3790,12 @@ xlog_recover_finish(
3793 3790
3794 xlog_recover_check_summary(log); 3791 xlog_recover_check_summary(log);
3795 3792
3796 cmn_err(CE_NOTE, 3793 xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3797 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3794 log->l_mp->m_logname ? log->l_mp->m_logname
3798 log->l_mp->m_fsname, log->l_mp->m_logname ? 3795 : "internal");
3799 log->l_mp->m_logname : "internal");
3800 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3796 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3801 } else { 3797 } else {
3802 cmn_err(CE_DEBUG, 3798 xfs_info(log->l_mp, "Ending clean mount");
3803 "Ending clean XFS mount for filesystem: %s\n",
3804 log->l_mp->m_fsname);
3805 } 3799 }
3806 return 0; 3800 return 0;
3807} 3801}
@@ -3834,10 +3828,8 @@ xlog_recover_check_summary(
3834 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3828 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3835 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3829 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3836 if (error) { 3830 if (error) {
3837 xfs_fs_cmn_err(CE_ALERT, mp, 3831 xfs_alert(mp, "%s agf read failed agno %d error %d",
3838 "xlog_recover_check_summary(agf)" 3832 __func__, agno, error);
3839 "agf read failed agno %d error %d",
3840 agno, error);
3841 } else { 3833 } else {
3842 agfp = XFS_BUF_TO_AGF(agfbp); 3834 agfp = XFS_BUF_TO_AGF(agfbp);
3843 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3835 freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3846,7 +3838,10 @@ xlog_recover_check_summary(
3846 } 3838 }
3847 3839
3848 error = xfs_read_agi(mp, NULL, agno, &agibp); 3840 error = xfs_read_agi(mp, NULL, agno, &agibp);
3849 if (!error) { 3841 if (error) {
3842 xfs_alert(mp, "%s agi read failed agno %d error %d",
3843 __func__, agno, error);
3844 } else {
3850 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3845 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3851 3846
3852 itotal += be32_to_cpu(agi->agi_count); 3847 itotal += be32_to_cpu(agi->agi_count);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d447aef84bc3..bb3f9a7b24ed 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -133,9 +133,7 @@ xfs_uuid_mount(
133 return 0; 133 return 0;
134 134
135 if (uuid_is_nil(uuid)) { 135 if (uuid_is_nil(uuid)) {
136 cmn_err(CE_WARN, 136 xfs_warn(mp, "Filesystem has nil UUID - can't mount");
137 "XFS: Filesystem %s has nil UUID - can't mount",
138 mp->m_fsname);
139 return XFS_ERROR(EINVAL); 137 return XFS_ERROR(EINVAL);
140 } 138 }
141 139
@@ -163,8 +161,7 @@ xfs_uuid_mount(
163 161
164 out_duplicate: 162 out_duplicate:
165 mutex_unlock(&xfs_uuid_table_mutex); 163 mutex_unlock(&xfs_uuid_table_mutex);
166 cmn_err(CE_WARN, "XFS: Filesystem %s has duplicate UUID - can't mount", 164 xfs_warn(mp, "Filesystem has duplicate UUID - can't mount");
167 mp->m_fsname);
168 return XFS_ERROR(EINVAL); 165 return XFS_ERROR(EINVAL);
169} 166}
170 167
@@ -311,6 +308,8 @@ xfs_mount_validate_sb(
311 xfs_sb_t *sbp, 308 xfs_sb_t *sbp,
312 int flags) 309 int flags)
313{ 310{
311 int loud = !(flags & XFS_MFSI_QUIET);
312
314 /* 313 /*
315 * If the log device and data device have the 314 * If the log device and data device have the
316 * same device number, the log is internal. 315 * same device number, the log is internal.
@@ -319,28 +318,32 @@ xfs_mount_validate_sb(
319 * a volume filesystem in a non-volume manner. 318 * a volume filesystem in a non-volume manner.
320 */ 319 */
321 if (sbp->sb_magicnum != XFS_SB_MAGIC) { 320 if (sbp->sb_magicnum != XFS_SB_MAGIC) {
322 xfs_fs_mount_cmn_err(flags, "bad magic number"); 321 if (loud)
322 xfs_warn(mp, "bad magic number");
323 return XFS_ERROR(EWRONGFS); 323 return XFS_ERROR(EWRONGFS);
324 } 324 }
325 325
326 if (!xfs_sb_good_version(sbp)) { 326 if (!xfs_sb_good_version(sbp)) {
327 xfs_fs_mount_cmn_err(flags, "bad version"); 327 if (loud)
328 xfs_warn(mp, "bad version");
328 return XFS_ERROR(EWRONGFS); 329 return XFS_ERROR(EWRONGFS);
329 } 330 }
330 331
331 if (unlikely( 332 if (unlikely(
332 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { 333 sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
333 xfs_fs_mount_cmn_err(flags, 334 if (loud)
334 "filesystem is marked as having an external log; " 335 xfs_warn(mp,
335 "specify logdev on the\nmount command line."); 336 "filesystem is marked as having an external log; "
337 "specify logdev on the mount command line.");
336 return XFS_ERROR(EINVAL); 338 return XFS_ERROR(EINVAL);
337 } 339 }
338 340
339 if (unlikely( 341 if (unlikely(
340 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { 342 sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
341 xfs_fs_mount_cmn_err(flags, 343 if (loud)
342 "filesystem is marked as having an internal log; " 344 xfs_warn(mp,
343 "do not specify logdev on\nthe mount command line."); 345 "filesystem is marked as having an internal log; "
346 "do not specify logdev on the mount command line.");
344 return XFS_ERROR(EINVAL); 347 return XFS_ERROR(EINVAL);
345 } 348 }
346 349
@@ -369,7 +372,8 @@ xfs_mount_validate_sb(
369 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || 372 (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) ||
370 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || 373 (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) ||
371 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { 374 (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) {
372 xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); 375 if (loud)
376 xfs_warn(mp, "SB sanity check 1 failed");
373 return XFS_ERROR(EFSCORRUPTED); 377 return XFS_ERROR(EFSCORRUPTED);
374 } 378 }
375 379
@@ -382,7 +386,8 @@ xfs_mount_validate_sb(
382 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || 386 (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks ||
383 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * 387 sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) *
384 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { 388 sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) {
385 xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); 389 if (loud)
390 xfs_warn(mp, "SB sanity check 2 failed");
386 return XFS_ERROR(EFSCORRUPTED); 391 return XFS_ERROR(EFSCORRUPTED);
387 } 392 }
388 393
@@ -390,12 +395,12 @@ xfs_mount_validate_sb(
390 * Until this is fixed only page-sized or smaller data blocks work. 395 * Until this is fixed only page-sized or smaller data blocks work.
391 */ 396 */
392 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { 397 if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
393 xfs_fs_mount_cmn_err(flags, 398 if (loud) {
394 "file system with blocksize %d bytes", 399 xfs_warn(mp,
395 sbp->sb_blocksize); 400 "File system with blocksize %d bytes. "
396 xfs_fs_mount_cmn_err(flags, 401 "Only pagesize (%ld) or less will currently work.",
397 "only pagesize (%ld) or less will currently work.", 402 sbp->sb_blocksize, PAGE_SIZE);
398 PAGE_SIZE); 403 }
399 return XFS_ERROR(ENOSYS); 404 return XFS_ERROR(ENOSYS);
400 } 405 }
401 406
@@ -409,21 +414,23 @@ xfs_mount_validate_sb(
409 case 2048: 414 case 2048:
410 break; 415 break;
411 default: 416 default:
412 xfs_fs_mount_cmn_err(flags, 417 if (loud)
413 "inode size of %d bytes not supported", 418 xfs_warn(mp, "inode size of %d bytes not supported",
414 sbp->sb_inodesize); 419 sbp->sb_inodesize);
415 return XFS_ERROR(ENOSYS); 420 return XFS_ERROR(ENOSYS);
416 } 421 }
417 422
418 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || 423 if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
419 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 424 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
420 xfs_fs_mount_cmn_err(flags, 425 if (loud)
421 "file system too large to be mounted on this system."); 426 xfs_warn(mp,
427 "file system too large to be mounted on this system.");
422 return XFS_ERROR(EFBIG); 428 return XFS_ERROR(EFBIG);
423 } 429 }
424 430
425 if (unlikely(sbp->sb_inprogress)) { 431 if (unlikely(sbp->sb_inprogress)) {
426 xfs_fs_mount_cmn_err(flags, "file system busy"); 432 if (loud)
433 xfs_warn(mp, "file system busy");
427 return XFS_ERROR(EFSCORRUPTED); 434 return XFS_ERROR(EFSCORRUPTED);
428 } 435 }
429 436
@@ -431,8 +438,9 @@ xfs_mount_validate_sb(
431 * Version 1 directory format has never worked on Linux. 438 * Version 1 directory format has never worked on Linux.
432 */ 439 */
433 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { 440 if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
434 xfs_fs_mount_cmn_err(flags, 441 if (loud)
435 "file system using version 1 directory format"); 442 xfs_warn(mp,
443 "file system using version 1 directory format");
436 return XFS_ERROR(ENOSYS); 444 return XFS_ERROR(ENOSYS);
437 } 445 }
438 446
@@ -673,6 +681,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
673 unsigned int sector_size; 681 unsigned int sector_size;
674 xfs_buf_t *bp; 682 xfs_buf_t *bp;
675 int error; 683 int error;
684 int loud = !(flags & XFS_MFSI_QUIET);
676 685
677 ASSERT(mp->m_sb_bp == NULL); 686 ASSERT(mp->m_sb_bp == NULL);
678 ASSERT(mp->m_ddev_targp != NULL); 687 ASSERT(mp->m_ddev_targp != NULL);
@@ -688,7 +697,8 @@ reread:
688 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 697 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
689 XFS_SB_DADDR, sector_size, 0); 698 XFS_SB_DADDR, sector_size, 0);
690 if (!bp) { 699 if (!bp) {
691 xfs_fs_mount_cmn_err(flags, "SB buffer read failed"); 700 if (loud)
701 xfs_warn(mp, "SB buffer read failed");
692 return EIO; 702 return EIO;
693 } 703 }
694 704
@@ -699,7 +709,8 @@ reread:
699 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); 709 xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
700 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); 710 error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
701 if (error) { 711 if (error) {
702 xfs_fs_mount_cmn_err(flags, "SB validate failed"); 712 if (loud)
713 xfs_warn(mp, "SB validate failed");
703 goto release_buf; 714 goto release_buf;
704 } 715 }
705 716
@@ -707,9 +718,9 @@ reread:
707 * We must be able to do sector-sized and sector-aligned IO. 718 * We must be able to do sector-sized and sector-aligned IO.
708 */ 719 */
709 if (sector_size > mp->m_sb.sb_sectsize) { 720 if (sector_size > mp->m_sb.sb_sectsize) {
710 xfs_fs_mount_cmn_err(flags, 721 if (loud)
711 "device supports only %u byte sectors (not %u)", 722 xfs_warn(mp, "device supports %u byte sectors (not %u)",
712 sector_size, mp->m_sb.sb_sectsize); 723 sector_size, mp->m_sb.sb_sectsize);
713 error = ENOSYS; 724 error = ENOSYS;
714 goto release_buf; 725 goto release_buf;
715 } 726 }
@@ -853,8 +864,7 @@ xfs_update_alignment(xfs_mount_t *mp)
853 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) || 864 if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
854 (BBTOB(mp->m_swidth) & mp->m_blockmask)) { 865 (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
855 if (mp->m_flags & XFS_MOUNT_RETERR) { 866 if (mp->m_flags & XFS_MOUNT_RETERR) {
856 cmn_err(CE_WARN, 867 xfs_warn(mp, "alignment check 1 failed");
857 "XFS: alignment check 1 failed");
858 return XFS_ERROR(EINVAL); 868 return XFS_ERROR(EINVAL);
859 } 869 }
860 mp->m_dalign = mp->m_swidth = 0; 870 mp->m_dalign = mp->m_swidth = 0;
@@ -867,8 +877,9 @@ xfs_update_alignment(xfs_mount_t *mp)
867 if (mp->m_flags & XFS_MOUNT_RETERR) { 877 if (mp->m_flags & XFS_MOUNT_RETERR) {
868 return XFS_ERROR(EINVAL); 878 return XFS_ERROR(EINVAL);
869 } 879 }
870 xfs_fs_cmn_err(CE_WARN, mp, 880 xfs_warn(mp,
871"stripe alignment turned off: sunit(%d)/swidth(%d) incompatible with agsize(%d)", 881 "stripe alignment turned off: sunit(%d)/swidth(%d) "
882 "incompatible with agsize(%d)",
872 mp->m_dalign, mp->m_swidth, 883 mp->m_dalign, mp->m_swidth,
873 sbp->sb_agblocks); 884 sbp->sb_agblocks);
874 885
@@ -878,9 +889,9 @@ xfs_update_alignment(xfs_mount_t *mp)
878 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth); 889 mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
879 } else { 890 } else {
880 if (mp->m_flags & XFS_MOUNT_RETERR) { 891 if (mp->m_flags & XFS_MOUNT_RETERR) {
881 xfs_fs_cmn_err(CE_WARN, mp, 892 xfs_warn(mp,
882"stripe alignment turned off: sunit(%d) less than bsize(%d)", 893 "stripe alignment turned off: sunit(%d) less than bsize(%d)",
883 mp->m_dalign, 894 mp->m_dalign,
884 mp->m_blockmask +1); 895 mp->m_blockmask +1);
885 return XFS_ERROR(EINVAL); 896 return XFS_ERROR(EINVAL);
886 } 897 }
@@ -1026,14 +1037,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1026 1037
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 1038 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1039 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1029 cmn_err(CE_WARN, "XFS: filesystem size mismatch detected"); 1040 xfs_warn(mp, "filesystem size mismatch detected");
1030 return XFS_ERROR(EFBIG); 1041 return XFS_ERROR(EFBIG);
1031 } 1042 }
1032 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp, 1043 bp = xfs_buf_read_uncached(mp, mp->m_ddev_targp,
1033 d - XFS_FSS_TO_BB(mp, 1), 1044 d - XFS_FSS_TO_BB(mp, 1),
1034 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0); 1045 BBTOB(XFS_FSS_TO_BB(mp, 1)), 0);
1035 if (!bp) { 1046 if (!bp) {
1036 cmn_err(CE_WARN, "XFS: last sector read failed"); 1047 xfs_warn(mp, "last sector read failed");
1037 return EIO; 1048 return EIO;
1038 } 1049 }
1039 xfs_buf_relse(bp); 1050 xfs_buf_relse(bp);
@@ -1041,14 +1052,14 @@ xfs_check_sizes(xfs_mount_t *mp)
1041 if (mp->m_logdev_targp != mp->m_ddev_targp) { 1052 if (mp->m_logdev_targp != mp->m_ddev_targp) {
1042 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1053 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1043 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1054 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1044 cmn_err(CE_WARN, "XFS: log size mismatch detected"); 1055 xfs_warn(mp, "log size mismatch detected");
1045 return XFS_ERROR(EFBIG); 1056 return XFS_ERROR(EFBIG);
1046 } 1057 }
1047 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp, 1058 bp = xfs_buf_read_uncached(mp, mp->m_logdev_targp,
1048 d - XFS_FSB_TO_BB(mp, 1), 1059 d - XFS_FSB_TO_BB(mp, 1),
1049 XFS_FSB_TO_B(mp, 1), 0); 1060 XFS_FSB_TO_B(mp, 1), 0);
1050 if (!bp) { 1061 if (!bp) {
1051 cmn_err(CE_WARN, "XFS: log device read failed"); 1062 xfs_warn(mp, "log device read failed");
1052 return EIO; 1063 return EIO;
1053 } 1064 }
1054 xfs_buf_relse(bp); 1065 xfs_buf_relse(bp);
@@ -1086,7 +1097,7 @@ xfs_mount_reset_sbqflags(
1086 return 0; 1097 return 0;
1087 1098
1088#ifdef QUOTADEBUG 1099#ifdef QUOTADEBUG
1089 xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes"); 1100 xfs_notice(mp, "Writing superblock quota changes");
1090#endif 1101#endif
1091 1102
1092 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 1103 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
@@ -1094,8 +1105,7 @@ xfs_mount_reset_sbqflags(
1094 XFS_DEFAULT_LOG_COUNT); 1105 XFS_DEFAULT_LOG_COUNT);
1095 if (error) { 1106 if (error) {
1096 xfs_trans_cancel(tp, 0); 1107 xfs_trans_cancel(tp, 0);
1097 xfs_fs_cmn_err(CE_ALERT, mp, 1108 xfs_alert(mp, "%s: Superblock update failed!", __func__);
1098 "xfs_mount_reset_sbqflags: Superblock update failed!");
1099 return error; 1109 return error;
1100 } 1110 }
1101 1111
@@ -1161,8 +1171,7 @@ xfs_mountfs(
1161 * transaction subsystem is online. 1171 * transaction subsystem is online.
1162 */ 1172 */
1163 if (xfs_sb_has_mismatched_features2(sbp)) { 1173 if (xfs_sb_has_mismatched_features2(sbp)) {
1164 cmn_err(CE_WARN, 1174 xfs_warn(mp, "correcting sb_features alignment problem");
1165 "XFS: correcting sb_features alignment problem");
1166 sbp->sb_features2 |= sbp->sb_bad_features2; 1175 sbp->sb_features2 |= sbp->sb_bad_features2;
1167 sbp->sb_bad_features2 = sbp->sb_features2; 1176 sbp->sb_bad_features2 = sbp->sb_features2;
1168 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2; 1177 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
@@ -1241,7 +1250,7 @@ xfs_mountfs(
1241 */ 1250 */
1242 error = xfs_rtmount_init(mp); 1251 error = xfs_rtmount_init(mp);
1243 if (error) { 1252 if (error) {
1244 cmn_err(CE_WARN, "XFS: RT mount failed"); 1253 xfs_warn(mp, "RT mount failed");
1245 goto out_remove_uuid; 1254 goto out_remove_uuid;
1246 } 1255 }
1247 1256
@@ -1272,12 +1281,12 @@ xfs_mountfs(
1272 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC); 1281 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1273 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1282 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1274 if (error) { 1283 if (error) {
1275 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1284 xfs_warn(mp, "Failed per-ag init: %d", error);
1276 goto out_remove_uuid; 1285 goto out_remove_uuid;
1277 } 1286 }
1278 1287
1279 if (!sbp->sb_logblocks) { 1288 if (!sbp->sb_logblocks) {
1280 cmn_err(CE_WARN, "XFS: no log defined"); 1289 xfs_warn(mp, "no log defined");
1281 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp); 1290 XFS_ERROR_REPORT("xfs_mountfs", XFS_ERRLEVEL_LOW, mp);
1282 error = XFS_ERROR(EFSCORRUPTED); 1291 error = XFS_ERROR(EFSCORRUPTED);
1283 goto out_free_perag; 1292 goto out_free_perag;
@@ -1290,7 +1299,7 @@ xfs_mountfs(
1290 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart), 1299 XFS_FSB_TO_DADDR(mp, sbp->sb_logstart),
1291 XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); 1300 XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
1292 if (error) { 1301 if (error) {
1293 cmn_err(CE_WARN, "XFS: log mount failed"); 1302 xfs_warn(mp, "log mount failed");
1294 goto out_free_perag; 1303 goto out_free_perag;
1295 } 1304 }
1296 1305
@@ -1327,16 +1336,14 @@ xfs_mountfs(
1327 */ 1336 */
1328 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip); 1337 error = xfs_iget(mp, NULL, sbp->sb_rootino, 0, XFS_ILOCK_EXCL, &rip);
1329 if (error) { 1338 if (error) {
1330 cmn_err(CE_WARN, "XFS: failed to read root inode"); 1339 xfs_warn(mp, "failed to read root inode");
1331 goto out_log_dealloc; 1340 goto out_log_dealloc;
1332 } 1341 }
1333 1342
1334 ASSERT(rip != NULL); 1343 ASSERT(rip != NULL);
1335 1344
1336 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1345 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1337 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1346 xfs_warn(mp, "corrupted root inode %llu: not a directory",
1338 cmn_err(CE_WARN, "Device %s - root %llu is not a directory",
1339 XFS_BUFTARG_NAME(mp->m_ddev_targp),
1340 (unsigned long long)rip->i_ino); 1347 (unsigned long long)rip->i_ino);
1341 xfs_iunlock(rip, XFS_ILOCK_EXCL); 1348 xfs_iunlock(rip, XFS_ILOCK_EXCL);
1342 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW, 1349 XFS_ERROR_REPORT("xfs_mountfs_int(2)", XFS_ERRLEVEL_LOW,
@@ -1356,7 +1363,7 @@ xfs_mountfs(
1356 /* 1363 /*
1357 * Free up the root inode. 1364 * Free up the root inode.
1358 */ 1365 */
1359 cmn_err(CE_WARN, "XFS: failed to read RT inodes"); 1366 xfs_warn(mp, "failed to read RT inodes");
1360 goto out_rele_rip; 1367 goto out_rele_rip;
1361 } 1368 }
1362 1369
@@ -1368,7 +1375,7 @@ xfs_mountfs(
1368 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 1375 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1369 error = xfs_mount_log_sb(mp, mp->m_update_flags); 1376 error = xfs_mount_log_sb(mp, mp->m_update_flags);
1370 if (error) { 1377 if (error) {
1371 cmn_err(CE_WARN, "XFS: failed to write sb changes"); 1378 xfs_warn(mp, "failed to write sb changes");
1372 goto out_rtunmount; 1379 goto out_rtunmount;
1373 } 1380 }
1374 } 1381 }
@@ -1389,10 +1396,7 @@ xfs_mountfs(
1389 * quotachecked license. 1396 * quotachecked license.
1390 */ 1397 */
1391 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) { 1398 if (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT) {
1392 cmn_err(CE_NOTE, 1399 xfs_notice(mp, "resetting quota flags");
1393 "XFS: resetting qflags for filesystem %s",
1394 mp->m_fsname);
1395
1396 error = xfs_mount_reset_sbqflags(mp); 1400 error = xfs_mount_reset_sbqflags(mp);
1397 if (error) 1401 if (error)
1398 return error; 1402 return error;
@@ -1406,7 +1410,7 @@ xfs_mountfs(
1406 */ 1410 */
1407 error = xfs_log_mount_finish(mp); 1411 error = xfs_log_mount_finish(mp);
1408 if (error) { 1412 if (error) {
1409 cmn_err(CE_WARN, "XFS: log mount finish failed"); 1413 xfs_warn(mp, "log mount finish failed");
1410 goto out_rtunmount; 1414 goto out_rtunmount;
1411 } 1415 }
1412 1416
@@ -1435,8 +1439,8 @@ xfs_mountfs(
1435 resblks = xfs_default_resblks(mp); 1439 resblks = xfs_default_resblks(mp);
1436 error = xfs_reserve_blocks(mp, &resblks, NULL); 1440 error = xfs_reserve_blocks(mp, &resblks, NULL);
1437 if (error) 1441 if (error)
1438 cmn_err(CE_WARN, "XFS: Unable to allocate reserve " 1442 xfs_warn(mp,
1439 "blocks. Continuing without a reserve pool."); 1443 "Unable to allocate reserve blocks. Continuing without reserve pool.");
1440 } 1444 }
1441 1445
1442 return 0; 1446 return 0;
@@ -1525,12 +1529,12 @@ xfs_unmountfs(
1525 resblks = 0; 1529 resblks = 0;
1526 error = xfs_reserve_blocks(mp, &resblks, NULL); 1530 error = xfs_reserve_blocks(mp, &resblks, NULL);
1527 if (error) 1531 if (error)
1528 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. " 1532 xfs_warn(mp, "Unable to free reserved block pool. "
1529 "Freespace may not be correct on next mount."); 1533 "Freespace may not be correct on next mount.");
1530 1534
1531 error = xfs_log_sbcount(mp, 1); 1535 error = xfs_log_sbcount(mp, 1);
1532 if (error) 1536 if (error)
1533 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. " 1537 xfs_warn(mp, "Unable to update superblock counters. "
1534 "Freespace may not be correct on next mount."); 1538 "Freespace may not be correct on next mount.");
1535 xfs_unmountfs_writesb(mp); 1539 xfs_unmountfs_writesb(mp);
1536 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1540 xfs_unmountfs_wait(mp); /* wait for async bufs */
@@ -2013,10 +2017,8 @@ xfs_dev_is_read_only(
2013 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 2017 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
2014 xfs_readonly_buftarg(mp->m_logdev_targp) || 2018 xfs_readonly_buftarg(mp->m_logdev_targp) ||
2015 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 2019 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
2016 cmn_err(CE_NOTE, 2020 xfs_notice(mp, "%s required on read-only device.", message);
2017 "XFS: %s required on read-only device.", message); 2021 xfs_notice(mp, "write access unavailable, cannot proceed.");
2018 cmn_err(CE_NOTE,
2019 "XFS: write access unavailable, cannot proceed.");
2020 return EROFS; 2022 return EROFS;
2021 } 2023 }
2022 return 0; 2024 return 0;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a62e8971539d..19af0ab0d0c6 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -203,12 +203,9 @@ typedef struct xfs_mount {
203 struct mutex m_icsb_mutex; /* balancer sync lock */ 203 struct mutex m_icsb_mutex; /* balancer sync lock */
204#endif 204#endif
205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ 205 struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
206 struct task_struct *m_sync_task; /* generalised sync thread */ 206 struct delayed_work m_sync_work; /* background sync work */
207 xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ 207 struct delayed_work m_reclaim_work; /* background inode reclaim */
208 struct list_head m_sync_list; /* sync thread work item list */ 208 struct work_struct m_flush_work; /* background inode flush */
209 spinlock_t m_sync_lock; /* work item list lock */
210 int m_sync_seq; /* sync thread generation no. */
211 wait_queue_head_t m_wait_single_sync_task;
212 __int64_t m_update_flags; /* sb flags we need to update 209 __int64_t m_update_flags; /* sb flags we need to update
213 on the next remount,rw */ 210 on the next remount,rw */
214 struct shrinker m_inode_shrink; /* inode reclaim shrinker */ 211 struct shrinker m_inode_shrink; /* inode reclaim shrinker */
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index edfa178bafb6..4aff56395732 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -309,7 +309,7 @@ xfs_mru_cache_init(void)
309 if (!xfs_mru_elem_zone) 309 if (!xfs_mru_elem_zone)
310 goto out; 310 goto out;
311 311
312 xfs_mru_reap_wq = create_singlethread_workqueue("xfs_mru_cache"); 312 xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1);
313 if (!xfs_mru_reap_wq) 313 if (!xfs_mru_reap_wq)
314 goto out_destroy_mru_elem_zone; 314 goto out_destroy_mru_elem_zone;
315 315
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 9bb6eda4cd21..a595f29567fe 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -382,7 +382,8 @@ static inline int xfs_qm_sync(struct xfs_mount *mp, int flags)
382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \ 382 xfs_trans_reserve_quota_bydquots(tp, mp, ud, gd, nb, ni, \
383 f | XFS_QMOPT_RES_REGBLKS) 383 f | XFS_QMOPT_RES_REGBLKS)
384 384
385extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *); 385extern int xfs_qm_dqcheck(struct xfs_mount *, xfs_disk_dquot_t *,
386 xfs_dqid_t, uint, uint, char *);
386extern int xfs_mount_reset_sbqflags(struct xfs_mount *); 387extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
387 388
388#endif /* __KERNEL__ */ 389#endif /* __KERNEL__ */
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 12a191385310..8f76fdff4f46 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -76,7 +76,7 @@ xfs_growfs_rt_alloc(
76 xfs_mount_t *mp, /* file system mount point */ 76 xfs_mount_t *mp, /* file system mount point */
77 xfs_extlen_t oblocks, /* old count of blocks */ 77 xfs_extlen_t oblocks, /* old count of blocks */
78 xfs_extlen_t nblocks, /* new count of blocks */ 78 xfs_extlen_t nblocks, /* new count of blocks */
79 xfs_ino_t ino) /* inode number (bitmap/summary) */ 79 xfs_inode_t *ip) /* inode (bitmap/summary) */
80{ 80{
81 xfs_fileoff_t bno; /* block number in file */ 81 xfs_fileoff_t bno; /* block number in file */
82 xfs_buf_t *bp; /* temporary buffer for zeroing */ 82 xfs_buf_t *bp; /* temporary buffer for zeroing */
@@ -86,7 +86,6 @@ xfs_growfs_rt_alloc(
86 xfs_fsblock_t firstblock; /* first block allocated in xaction */ 86 xfs_fsblock_t firstblock; /* first block allocated in xaction */
87 xfs_bmap_free_t flist; /* list of freed blocks */ 87 xfs_bmap_free_t flist; /* list of freed blocks */
88 xfs_fsblock_t fsbno; /* filesystem block for bno */ 88 xfs_fsblock_t fsbno; /* filesystem block for bno */
89 xfs_inode_t *ip; /* pointer to incore inode */
90 xfs_bmbt_irec_t map; /* block map output */ 89 xfs_bmbt_irec_t map; /* block map output */
91 int nmap; /* number of block maps */ 90 int nmap; /* number of block maps */
92 int resblks; /* space reservation */ 91 int resblks; /* space reservation */
@@ -112,9 +111,9 @@ xfs_growfs_rt_alloc(
112 /* 111 /*
113 * Lock the inode. 112 * Lock the inode.
114 */ 113 */
115 if ((error = xfs_trans_iget(mp, tp, ino, 0, 114 xfs_ilock(ip, XFS_ILOCK_EXCL);
116 XFS_ILOCK_EXCL, &ip))) 115 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
117 goto error_cancel; 116
118 xfs_bmap_init(&flist, &firstblock); 117 xfs_bmap_init(&flist, &firstblock);
119 /* 118 /*
120 * Allocate blocks to the bitmap file. 119 * Allocate blocks to the bitmap file.
@@ -155,9 +154,8 @@ xfs_growfs_rt_alloc(
155 /* 154 /*
156 * Lock the bitmap inode. 155 * Lock the bitmap inode.
157 */ 156 */
158 if ((error = xfs_trans_iget(mp, tp, ino, 0, 157 xfs_ilock(ip, XFS_ILOCK_EXCL);
159 XFS_ILOCK_EXCL, &ip))) 158 xfs_trans_ijoin_ref(tp, ip, XFS_ILOCK_EXCL);
160 goto error_cancel;
161 /* 159 /*
162 * Get a buffer for the block. 160 * Get a buffer for the block.
163 */ 161 */
@@ -1854,7 +1852,6 @@ xfs_growfs_rt(
1854 xfs_rtblock_t bmbno; /* bitmap block number */ 1852 xfs_rtblock_t bmbno; /* bitmap block number */
1855 xfs_buf_t *bp; /* temporary buffer */ 1853 xfs_buf_t *bp; /* temporary buffer */
1856 int error; /* error return value */ 1854 int error; /* error return value */
1857 xfs_inode_t *ip; /* bitmap inode, used as lock */
1858 xfs_mount_t *nmp; /* new (fake) mount structure */ 1855 xfs_mount_t *nmp; /* new (fake) mount structure */
1859 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */ 1856 xfs_drfsbno_t nrblocks; /* new number of realtime blocks */
1860 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */ 1857 xfs_extlen_t nrbmblocks; /* new number of rt bitmap blocks */
@@ -1918,11 +1915,11 @@ xfs_growfs_rt(
1918 /* 1915 /*
1919 * Allocate space to the bitmap and summary files, as necessary. 1916 * Allocate space to the bitmap and summary files, as necessary.
1920 */ 1917 */
1921 if ((error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, 1918 error = xfs_growfs_rt_alloc(mp, rbmblocks, nrbmblocks, mp->m_rbmip);
1922 mp->m_sb.sb_rbmino))) 1919 if (error)
1923 return error; 1920 return error;
1924 if ((error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, 1921 error = xfs_growfs_rt_alloc(mp, rsumblocks, nrsumblocks, mp->m_rsumip);
1925 mp->m_sb.sb_rsumino))) 1922 if (error)
1926 return error; 1923 return error;
1927 /* 1924 /*
1928 * Allocate a new (fake) mount/sb. 1925 * Allocate a new (fake) mount/sb.
@@ -1972,10 +1969,8 @@ xfs_growfs_rt(
1972 /* 1969 /*
1973 * Lock out other callers by grabbing the bitmap inode lock. 1970 * Lock out other callers by grabbing the bitmap inode lock.
1974 */ 1971 */
1975 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 1972 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
1976 XFS_ILOCK_EXCL, &ip))) 1973 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
1977 goto error_cancel;
1978 ASSERT(ip == mp->m_rbmip);
1979 /* 1974 /*
1980 * Update the bitmap inode's size. 1975 * Update the bitmap inode's size.
1981 */ 1976 */
@@ -1986,10 +1981,8 @@ xfs_growfs_rt(
1986 /* 1981 /*
1987 * Get the summary inode into the transaction. 1982 * Get the summary inode into the transaction.
1988 */ 1983 */
1989 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, 1984 xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
1990 XFS_ILOCK_EXCL, &ip))) 1985 xfs_trans_ijoin_ref(tp, mp->m_rsumip, XFS_ILOCK_EXCL);
1991 goto error_cancel;
1992 ASSERT(ip == mp->m_rsumip);
1993 /* 1986 /*
1994 * Update the summary inode's size. 1987 * Update the summary inode's size.
1995 */ 1988 */
@@ -2075,15 +2068,15 @@ xfs_rtallocate_extent(
2075 xfs_extlen_t prod, /* extent product factor */ 2068 xfs_extlen_t prod, /* extent product factor */
2076 xfs_rtblock_t *rtblock) /* out: start block allocated */ 2069 xfs_rtblock_t *rtblock) /* out: start block allocated */
2077{ 2070{
2071 xfs_mount_t *mp = tp->t_mountp;
2078 int error; /* error value */ 2072 int error; /* error value */
2079 xfs_inode_t *ip; /* inode for bitmap file */
2080 xfs_mount_t *mp; /* file system mount structure */
2081 xfs_rtblock_t r; /* result allocated block */ 2073 xfs_rtblock_t r; /* result allocated block */
2082 xfs_fsblock_t sb; /* summary file block number */ 2074 xfs_fsblock_t sb; /* summary file block number */
2083 xfs_buf_t *sumbp; /* summary file block buffer */ 2075 xfs_buf_t *sumbp; /* summary file block buffer */
2084 2076
2077 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2085 ASSERT(minlen > 0 && minlen <= maxlen); 2078 ASSERT(minlen > 0 && minlen <= maxlen);
2086 mp = tp->t_mountp; 2079
2087 /* 2080 /*
2088 * If prod is set then figure out what to do to minlen and maxlen. 2081 * If prod is set then figure out what to do to minlen and maxlen.
2089 */ 2082 */
@@ -2099,12 +2092,7 @@ xfs_rtallocate_extent(
2099 return 0; 2092 return 0;
2100 } 2093 }
2101 } 2094 }
2102 /* 2095
2103 * Lock out other callers by grabbing the bitmap inode lock.
2104 */
2105 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
2106 XFS_ILOCK_EXCL, &ip)))
2107 return error;
2108 sumbp = NULL; 2096 sumbp = NULL;
2109 /* 2097 /*
2110 * Allocate by size, or near another block, or exactly at some block. 2098 * Allocate by size, or near another block, or exactly at some block.
@@ -2123,11 +2111,12 @@ xfs_rtallocate_extent(
2123 len, &sumbp, &sb, prod, &r); 2111 len, &sumbp, &sb, prod, &r);
2124 break; 2112 break;
2125 default: 2113 default:
2114 error = EIO;
2126 ASSERT(0); 2115 ASSERT(0);
2127 } 2116 }
2128 if (error) { 2117 if (error)
2129 return error; 2118 return error;
2130 } 2119
2131 /* 2120 /*
2132 * If it worked, update the superblock. 2121 * If it worked, update the superblock.
2133 */ 2122 */
@@ -2155,7 +2144,6 @@ xfs_rtfree_extent(
2155 xfs_extlen_t len) /* length of extent freed */ 2144 xfs_extlen_t len) /* length of extent freed */
2156{ 2145{
2157 int error; /* error value */ 2146 int error; /* error value */
2158 xfs_inode_t *ip; /* bitmap file inode */
2159 xfs_mount_t *mp; /* file system mount structure */ 2147 xfs_mount_t *mp; /* file system mount structure */
2160 xfs_fsblock_t sb; /* summary file block number */ 2148 xfs_fsblock_t sb; /* summary file block number */
2161 xfs_buf_t *sumbp; /* summary file block buffer */ 2149 xfs_buf_t *sumbp; /* summary file block buffer */
@@ -2164,9 +2152,9 @@ xfs_rtfree_extent(
2164 /* 2152 /*
2165 * Synchronize by locking the bitmap inode. 2153 * Synchronize by locking the bitmap inode.
2166 */ 2154 */
2167 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2155 xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
2168 XFS_ILOCK_EXCL, &ip))) 2156 xfs_trans_ijoin_ref(tp, mp->m_rbmip, XFS_ILOCK_EXCL);
2169 return error; 2157
2170#if defined(__KERNEL__) && defined(DEBUG) 2158#if defined(__KERNEL__) && defined(DEBUG)
2171 /* 2159 /*
2172 * Check to see that this whole range is currently allocated. 2160 * Check to see that this whole range is currently allocated.
@@ -2199,10 +2187,10 @@ xfs_rtfree_extent(
2199 */ 2187 */
2200 if (tp->t_frextents_delta + mp->m_sb.sb_frextents == 2188 if (tp->t_frextents_delta + mp->m_sb.sb_frextents ==
2201 mp->m_sb.sb_rextents) { 2189 mp->m_sb.sb_rextents) {
2202 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) 2190 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM))
2203 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM; 2191 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2204 *(__uint64_t *)&ip->i_d.di_atime = 0; 2192 *(__uint64_t *)&mp->m_rbmip->i_d.di_atime = 0;
2205 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2193 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2206 } 2194 }
2207 return 0; 2195 return 0;
2208} 2196}
@@ -2222,8 +2210,8 @@ xfs_rtmount_init(
2222 if (sbp->sb_rblocks == 0) 2210 if (sbp->sb_rblocks == 0)
2223 return 0; 2211 return 0;
2224 if (mp->m_rtdev_targp == NULL) { 2212 if (mp->m_rtdev_targp == NULL) {
2225 cmn_err(CE_WARN, 2213 xfs_warn(mp,
2226 "XFS: This filesystem has a realtime volume, use rtdev=device option"); 2214 "Filesystem has a realtime volume, use rtdev=device option");
2227 return XFS_ERROR(ENODEV); 2215 return XFS_ERROR(ENODEV);
2228 } 2216 }
2229 mp->m_rsumlevels = sbp->sb_rextslog + 1; 2217 mp->m_rsumlevels = sbp->sb_rextslog + 1;
@@ -2237,7 +2225,7 @@ xfs_rtmount_init(
2237 */ 2225 */
2238 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); 2226 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
2239 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { 2227 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) {
2240 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2228 xfs_warn(mp, "realtime mount -- %llu != %llu",
2241 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2229 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2242 (unsigned long long) mp->m_sb.sb_rblocks); 2230 (unsigned long long) mp->m_sb.sb_rblocks);
2243 return XFS_ERROR(EFBIG); 2231 return XFS_ERROR(EFBIG);
@@ -2246,7 +2234,7 @@ xfs_rtmount_init(
2246 d - XFS_FSB_TO_BB(mp, 1), 2234 d - XFS_FSB_TO_BB(mp, 1),
2247 XFS_FSB_TO_B(mp, 1), 0); 2235 XFS_FSB_TO_B(mp, 1), 0);
2248 if (!bp) { 2236 if (!bp) {
2249 cmn_err(CE_WARN, "XFS: realtime device size check failed"); 2237 xfs_warn(mp, "realtime device size check failed");
2250 return EIO; 2238 return EIO;
2251 } 2239 }
2252 xfs_buf_relse(bp); 2240 xfs_buf_relse(bp);
@@ -2306,20 +2294,16 @@ xfs_rtpick_extent(
2306 xfs_rtblock_t *pick) /* result rt extent */ 2294 xfs_rtblock_t *pick) /* result rt extent */
2307{ 2295{
2308 xfs_rtblock_t b; /* result block */ 2296 xfs_rtblock_t b; /* result block */
2309 int error; /* error return value */
2310 xfs_inode_t *ip; /* bitmap incore inode */
2311 int log2; /* log of sequence number */ 2297 int log2; /* log of sequence number */
2312 __uint64_t resid; /* residual after log removed */ 2298 __uint64_t resid; /* residual after log removed */
2313 __uint64_t seq; /* sequence number of file creation */ 2299 __uint64_t seq; /* sequence number of file creation */
2314 __uint64_t *seqp; /* pointer to seqno in inode */ 2300 __uint64_t *seqp; /* pointer to seqno in inode */
2315 2301
2316 if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, 2302 ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
2317 XFS_ILOCK_EXCL, &ip))) 2303
2318 return error; 2304 seqp = (__uint64_t *)&mp->m_rbmip->i_d.di_atime;
2319 ASSERT(ip == mp->m_rbmip); 2305 if (!(mp->m_rbmip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2320 seqp = (__uint64_t *)&ip->i_d.di_atime; 2306 mp->m_rbmip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2321 if (!(ip->i_d.di_flags & XFS_DIFLAG_NEWRTBM)) {
2322 ip->i_d.di_flags |= XFS_DIFLAG_NEWRTBM;
2323 *seqp = 0; 2307 *seqp = 0;
2324 } 2308 }
2325 seq = *seqp; 2309 seq = *seqp;
@@ -2335,7 +2319,7 @@ xfs_rtpick_extent(
2335 b = mp->m_sb.sb_rextents - len; 2319 b = mp->m_sb.sb_rextents - len;
2336 } 2320 }
2337 *seqp = seq + 1; 2321 *seqp = seq + 1;
2338 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2322 xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE);
2339 *pick = b; 2323 *pick = b;
2340 return 0; 2324 return 0;
2341} 2325}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index ff614c29b441..09e1f4f35e97 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -154,7 +154,7 @@ xfs_rtmount_init(
154 if (mp->m_sb.sb_rblocks == 0) 154 if (mp->m_sb.sb_rblocks == 0)
155 return 0; 155 return 0;
156 156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT"); 157 xfs_warn(mp, "Not built with CONFIG_XFS_RT");
158 return ENOSYS; 158 return ENOSYS;
159} 159}
160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 56861d5daaef..d6d6fdfe9422 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -49,9 +49,9 @@ xfs_do_force_shutdown(
49 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 49 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
50 50
51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 51 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
52 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 52 xfs_notice(mp,
53 "line %d of file %s. Return address = 0x%p", 53 "%s(0x%x) called from line %d of file %s. Return address = 0x%p",
54 mp->m_fsname, flags, lnnum, fname, __return_address); 54 __func__, flags, lnnum, fname, __return_address);
55 } 55 }
56 /* 56 /*
57 * No need to duplicate efforts. 57 * No need to duplicate efforts.
@@ -69,30 +69,25 @@ xfs_do_force_shutdown(
69 return; 69 return;
70 70
71 if (flags & SHUTDOWN_CORRUPT_INCORE) { 71 if (flags & SHUTDOWN_CORRUPT_INCORE) {
72 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 72 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
73 "Corruption of in-memory data detected. Shutting down filesystem: %s", 73 "Corruption of in-memory data detected. Shutting down filesystem");
74 mp->m_fsname); 74 if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
75 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
76 xfs_stack_trace(); 75 xfs_stack_trace();
77 }
78 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 76 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
79 if (logerror) { 77 if (logerror) {
80 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 78 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
81 "Log I/O Error Detected. Shutting down filesystem: %s", 79 "Log I/O Error Detected. Shutting down filesystem");
82 mp->m_fsname);
83 } else if (flags & SHUTDOWN_DEVICE_REQ) { 80 } else if (flags & SHUTDOWN_DEVICE_REQ) {
84 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 81 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
85 "All device paths lost. Shutting down filesystem: %s", 82 "All device paths lost. Shutting down filesystem");
86 mp->m_fsname);
87 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 83 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
88 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 84 xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
89 "I/O Error Detected. Shutting down filesystem: %s", 85 "I/O Error Detected. Shutting down filesystem");
90 mp->m_fsname);
91 } 86 }
92 } 87 }
93 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 88 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
94 cmn_err(CE_ALERT, "Please umount the filesystem, " 89 xfs_alert(mp,
95 "and rectify the problem(s)"); 90 "Please umount the filesystem and rectify the problem(s)");
96 } 91 }
97} 92}
98 93
@@ -106,10 +101,9 @@ xfs_ioerror_alert(
106 xfs_buf_t *bp, 101 xfs_buf_t *bp,
107 xfs_daddr_t blkno) 102 xfs_daddr_t blkno)
108{ 103{
109 cmn_err(CE_ALERT, 104 xfs_alert(mp,
110 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 105 "I/O error occurred: meta-data dev %s block 0x%llx"
111 " (\"%s\") error %d buf count %zd", 106 " (\"%s\") error %d buf count %zd",
112 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
113 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 107 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
114 (__uint64_t)blkno, func, 108 (__uint64_t)blkno, func,
115 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 109 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
@@ -173,17 +167,9 @@ xfs_extlen_t
173xfs_get_extsz_hint( 167xfs_get_extsz_hint(
174 struct xfs_inode *ip) 168 struct xfs_inode *ip)
175{ 169{
176 xfs_extlen_t extsz; 170 if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
177 171 return ip->i_d.di_extsize;
178 if (unlikely(XFS_IS_REALTIME_INODE(ip))) { 172 if (XFS_IS_REALTIME_INODE(ip))
179 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) 173 return ip->i_mount->m_sb.sb_rextsize;
180 ? ip->i_d.di_extsize 174 return 0;
181 : ip->i_mount->m_sb.sb_rextsize;
182 ASSERT(extsz);
183 } else {
184 extsz = (ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE)
185 ? ip->i_d.di_extsize : 0;
186 }
187
188 return extsz;
189} 175}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c2042b736b81..06a9759b6352 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -469,8 +469,6 @@ void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); 469void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); 470void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); 471void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
472int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *,
473 xfs_ino_t , uint, uint, struct xfs_inode **);
474void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); 472void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
475void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint); 473void xfs_trans_ijoin_ref(struct xfs_trans *, struct xfs_inode *, uint);
476void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *); 474void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c5bbbc45db91..acdb92f14d51 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,74 +28,138 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_splice(struct xfs_ail *, struct list_head *, xfs_lsn_t); 31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32STATIC void xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
34STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
35 32
36#ifdef DEBUG 33#ifdef DEBUG
37STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *); 34/*
38#else 35 * Check that the list is sorted as it should be.
36 */
37STATIC void
38xfs_ail_check(
39 struct xfs_ail *ailp,
40 xfs_log_item_t *lip)
41{
42 xfs_log_item_t *prev_lip;
43
44 if (list_empty(&ailp->xa_ail))
45 return;
46
47 /*
48 * Check the next and previous entries are valid.
49 */
50 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
51 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
52 if (&prev_lip->li_ail != &ailp->xa_ail)
53 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
54
55 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
56 if (&prev_lip->li_ail != &ailp->xa_ail)
57 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
58
59
60#ifdef XFS_TRANS_DEBUG
61 /*
62 * Walk the list checking lsn ordering, and that every entry has the
63 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
64 * when specifically debugging the transaction subsystem.
65 */
66 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
67 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
68 if (&prev_lip->li_ail != &ailp->xa_ail)
69 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
70 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
71 prev_lip = lip;
72 }
73#endif /* XFS_TRANS_DEBUG */
74}
75#else /* !DEBUG */
39#define xfs_ail_check(a,l) 76#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 77#endif /* DEBUG */
41 78
79/*
80 * Return a pointer to the first item in the AIL. If the AIL is empty, then
81 * return NULL.
82 */
83static xfs_log_item_t *
84xfs_ail_min(
85 struct xfs_ail *ailp)
86{
87 if (list_empty(&ailp->xa_ail))
88 return NULL;
89
90 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
91}
92
93 /*
94 * Return a pointer to the last item in the AIL. If the AIL is empty, then
95 * return NULL.
96 */
97static xfs_log_item_t *
98xfs_ail_max(
99 struct xfs_ail *ailp)
100{
101 if (list_empty(&ailp->xa_ail))
102 return NULL;
103
104 return list_entry(ailp->xa_ail.prev, xfs_log_item_t, li_ail);
105}
106
107/*
108 * Return a pointer to the item which follows the given item in the AIL. If
109 * the given item is the last item in the list, then return NULL.
110 */
111static xfs_log_item_t *
112xfs_ail_next(
113 struct xfs_ail *ailp,
114 xfs_log_item_t *lip)
115{
116 if (lip->li_ail.next == &ailp->xa_ail)
117 return NULL;
118
119 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
120}
42 121
43/* 122/*
44 * This is called by the log manager code to determine the LSN 123 * This is called by the log manager code to determine the LSN of the tail of
45 * of the tail of the log. This is exactly the LSN of the first 124 * the log. This is exactly the LSN of the first item in the AIL. If the AIL
46 * item in the AIL. If the AIL is empty, then this function 125 * is empty, then this function returns 0.
47 * returns 0.
48 * 126 *
49 * We need the AIL lock in order to get a coherent read of the 127 * We need the AIL lock in order to get a coherent read of the lsn of the last
50 * lsn of the last item in the AIL. 128 * item in the AIL.
51 */ 129 */
52xfs_lsn_t 130xfs_lsn_t
53xfs_trans_ail_tail( 131xfs_ail_min_lsn(
54 struct xfs_ail *ailp) 132 struct xfs_ail *ailp)
55{ 133{
56 xfs_lsn_t lsn; 134 xfs_lsn_t lsn = 0;
57 xfs_log_item_t *lip; 135 xfs_log_item_t *lip;
58 136
59 spin_lock(&ailp->xa_lock); 137 spin_lock(&ailp->xa_lock);
60 lip = xfs_ail_min(ailp); 138 lip = xfs_ail_min(ailp);
61 if (lip == NULL) { 139 if (lip)
62 lsn = (xfs_lsn_t)0;
63 } else {
64 lsn = lip->li_lsn; 140 lsn = lip->li_lsn;
65 }
66 spin_unlock(&ailp->xa_lock); 141 spin_unlock(&ailp->xa_lock);
67 142
68 return lsn; 143 return lsn;
69} 144}
70 145
71/* 146/*
72 * xfs_trans_push_ail 147 * Return the maximum lsn held in the AIL, or zero if the AIL is empty.
73 *
74 * This routine is called to move the tail of the AIL forward. It does this by
75 * trying to flush items in the AIL whose lsns are below the given
76 * threshold_lsn.
77 *
78 * the push is run asynchronously in a separate thread, so we return the tail
79 * of the log right now instead of the tail after the push. This means we will
80 * either continue right away, or we will sleep waiting on the async thread to
81 * do its work.
82 *
83 * We do this unlocked - we only need to know whether there is anything in the
84 * AIL at the time we are called. We don't need to access the contents of
85 * any of the objects, so the lock is not needed.
86 */ 148 */
87void 149static xfs_lsn_t
88xfs_trans_ail_push( 150xfs_ail_max_lsn(
89 struct xfs_ail *ailp, 151 struct xfs_ail *ailp)
90 xfs_lsn_t threshold_lsn)
91{ 152{
92 xfs_log_item_t *lip; 153 xfs_lsn_t lsn = 0;
154 xfs_log_item_t *lip;
93 155
94 lip = xfs_ail_min(ailp); 156 spin_lock(&ailp->xa_lock);
95 if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) { 157 lip = xfs_ail_max(ailp);
96 if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) 158 if (lip)
97 xfsaild_wakeup(ailp, threshold_lsn); 159 lsn = lip->li_lsn;
98 } 160 spin_unlock(&ailp->xa_lock);
161
162 return lsn;
99} 163}
100 164
101/* 165/*
@@ -236,16 +300,57 @@ out:
236} 300}
237 301
238/* 302/*
239 * xfsaild_push does the work of pushing on the AIL. Returning a timeout of 303 * splice the log item list into the AIL at the given LSN.
240 * zero indicates that the caller should sleep until woken.
241 */ 304 */
242long 305static void
243xfsaild_push( 306xfs_ail_splice(
244 struct xfs_ail *ailp, 307 struct xfs_ail *ailp,
245 xfs_lsn_t *last_lsn) 308 struct list_head *list,
309 xfs_lsn_t lsn)
246{ 310{
247 long tout = 0; 311 xfs_log_item_t *next_lip;
248 xfs_lsn_t last_pushed_lsn = *last_lsn; 312
313 /* If the list is empty, just insert the item. */
314 if (list_empty(&ailp->xa_ail)) {
315 list_splice(list, &ailp->xa_ail);
316 return;
317 }
318
319 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
320 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
321 break;
322 }
323
324 ASSERT(&next_lip->li_ail == &ailp->xa_ail ||
325 XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0);
326
327 list_splice_init(list, &next_lip->li_ail);
328}
329
330/*
331 * Delete the given item from the AIL. Return a pointer to the item.
332 */
333static void
334xfs_ail_delete(
335 struct xfs_ail *ailp,
336 xfs_log_item_t *lip)
337{
338 xfs_ail_check(ailp, lip);
339 list_del(&lip->li_ail);
340 xfs_trans_ail_cursor_clear(ailp, lip);
341}
342
343/*
344 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself
345 * to run at a later time if there is more work to do to complete the push.
346 */
347STATIC void
348xfs_ail_worker(
349 struct work_struct *work)
350{
351 struct xfs_ail *ailp = container_of(to_delayed_work(work),
352 struct xfs_ail, xa_work);
353 long tout;
249 xfs_lsn_t target = ailp->xa_target; 354 xfs_lsn_t target = ailp->xa_target;
250 xfs_lsn_t lsn; 355 xfs_lsn_t lsn;
251 xfs_log_item_t *lip; 356 xfs_log_item_t *lip;
@@ -256,15 +361,15 @@ xfsaild_push(
256 361
257 spin_lock(&ailp->xa_lock); 362 spin_lock(&ailp->xa_lock);
258 xfs_trans_ail_cursor_init(ailp, cur); 363 xfs_trans_ail_cursor_init(ailp, cur);
259 lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn); 364 lip = xfs_trans_ail_cursor_first(ailp, cur, ailp->xa_last_pushed_lsn);
260 if (!lip || XFS_FORCED_SHUTDOWN(mp)) { 365 if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
261 /* 366 /*
262 * AIL is empty or our push has reached the end. 367 * AIL is empty or our push has reached the end.
263 */ 368 */
264 xfs_trans_ail_cursor_done(ailp, cur); 369 xfs_trans_ail_cursor_done(ailp, cur);
265 spin_unlock(&ailp->xa_lock); 370 spin_unlock(&ailp->xa_lock);
266 *last_lsn = 0; 371 ailp->xa_last_pushed_lsn = 0;
267 return tout; 372 return;
268 } 373 }
269 374
270 XFS_STATS_INC(xs_push_ail); 375 XFS_STATS_INC(xs_push_ail);
@@ -301,13 +406,13 @@ xfsaild_push(
301 case XFS_ITEM_SUCCESS: 406 case XFS_ITEM_SUCCESS:
302 XFS_STATS_INC(xs_push_ail_success); 407 XFS_STATS_INC(xs_push_ail_success);
303 IOP_PUSH(lip); 408 IOP_PUSH(lip);
304 last_pushed_lsn = lsn; 409 ailp->xa_last_pushed_lsn = lsn;
305 break; 410 break;
306 411
307 case XFS_ITEM_PUSHBUF: 412 case XFS_ITEM_PUSHBUF:
308 XFS_STATS_INC(xs_push_ail_pushbuf); 413 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 414 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 415 ailp->xa_last_pushed_lsn = lsn;
311 push_xfsbufd = 1; 416 push_xfsbufd = 1;
312 break; 417 break;
313 418
@@ -319,7 +424,7 @@ xfsaild_push(
319 424
320 case XFS_ITEM_LOCKED: 425 case XFS_ITEM_LOCKED:
321 XFS_STATS_INC(xs_push_ail_locked); 426 XFS_STATS_INC(xs_push_ail_locked);
322 last_pushed_lsn = lsn; 427 ailp->xa_last_pushed_lsn = lsn;
323 stuck++; 428 stuck++;
324 break; 429 break;
325 430
@@ -374,9 +479,23 @@ xfsaild_push(
374 wake_up_process(mp->m_ddev_targp->bt_task); 479 wake_up_process(mp->m_ddev_targp->bt_task);
375 } 480 }
376 481
482 /* assume we have more work to do in a short while */
483 tout = 10;
377 if (!count) { 484 if (!count) {
378 /* We're past our target or empty, so idle */ 485 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 486 ailp->xa_last_pushed_lsn = 0;
487
488 /*
489 * Check for an updated push target before clearing the
490 * XFS_AIL_PUSHING_BIT. If the target changed, we've got more
491 * work to do. Wait a bit longer before starting that work.
492 */
493 smp_rmb();
494 if (ailp->xa_target == target) {
495 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
496 return;
497 }
498 tout = 50;
380 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 499 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
381 /* 500 /*
382 * We reached the target so wait a bit longer for I/O to 501 * We reached the target so wait a bit longer for I/O to
@@ -384,7 +503,7 @@ xfsaild_push(
384 * start the next scan from the start of the AIL. 503 * start the next scan from the start of the AIL.
385 */ 504 */
386 tout = 50; 505 tout = 50;
387 last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
388 } else if ((stuck * 100) / count > 90) { 507 } else if ((stuck * 100) / count > 90) {
389 /* 508 /*
390 * Either there is a lot of contention on the AIL or we 509 * Either there is a lot of contention on the AIL or we
@@ -396,14 +515,61 @@ xfsaild_push(
396 * continuing from where we were. 515 * continuing from where we were.
397 */ 516 */
398 tout = 20; 517 tout = 20;
399 } else {
400 /* more to do, but wait a short while before continuing */
401 tout = 10;
402 } 518 }
403 *last_lsn = last_pushed_lsn; 519
404 return tout; 520 /* There is more to do, requeue us. */
521 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work,
522 msecs_to_jiffies(tout));
523}
524
525/*
526 * This routine is called to move the tail of the AIL forward. It does this by
527 * trying to flush items in the AIL whose lsns are below the given
528 * threshold_lsn.
529 *
530 * The push is run asynchronously in a workqueue, which means the caller needs
531 * to handle waiting on the async flush for space to become available.
532 * We don't want to interrupt any push that is in progress, hence we only queue
533 * work if we set the pushing bit approriately.
534 *
535 * We do this unlocked - we only need to know whether there is anything in the
536 * AIL at the time we are called. We don't need to access the contents of
537 * any of the objects, so the lock is not needed.
538 */
539void
540xfs_ail_push(
541 struct xfs_ail *ailp,
542 xfs_lsn_t threshold_lsn)
543{
544 xfs_log_item_t *lip;
545
546 lip = xfs_ail_min(ailp);
547 if (!lip || XFS_FORCED_SHUTDOWN(ailp->xa_mount) ||
548 XFS_LSN_CMP(threshold_lsn, ailp->xa_target) <= 0)
549 return;
550
551 /*
552 * Ensure that the new target is noticed in push code before it clears
553 * the XFS_AIL_PUSHING_BIT.
554 */
555 smp_wmb();
556 ailp->xa_target = threshold_lsn;
557 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
558 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0);
405} 559}
406 560
561/*
562 * Push out all items in the AIL immediately
563 */
564void
565xfs_ail_push_all(
566 struct xfs_ail *ailp)
567{
568 xfs_lsn_t threshold_lsn = xfs_ail_max_lsn(ailp);
569
570 if (threshold_lsn)
571 xfs_ail_push(ailp, threshold_lsn);
572}
407 573
408/* 574/*
409 * This is to be called when an item is unlocked that may have 575 * This is to be called when an item is unlocked that may have
@@ -563,7 +729,7 @@ xfs_trans_ail_delete_bulk(
563 729
564 spin_unlock(&ailp->xa_lock); 730 spin_unlock(&ailp->xa_lock);
565 if (!XFS_FORCED_SHUTDOWN(mp)) { 731 if (!XFS_FORCED_SHUTDOWN(mp)) {
566 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 732 xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
567 "%s: attempting to delete a log item that is not in the AIL", 733 "%s: attempting to delete a log item that is not in the AIL",
568 __func__); 734 __func__);
569 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 735 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -615,7 +781,6 @@ xfs_trans_ail_init(
615 xfs_mount_t *mp) 781 xfs_mount_t *mp)
616{ 782{
617 struct xfs_ail *ailp; 783 struct xfs_ail *ailp;
618 int error;
619 784
620 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL); 785 ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
621 if (!ailp) 786 if (!ailp)
@@ -624,15 +789,9 @@ xfs_trans_ail_init(
624 ailp->xa_mount = mp; 789 ailp->xa_mount = mp;
625 INIT_LIST_HEAD(&ailp->xa_ail); 790 INIT_LIST_HEAD(&ailp->xa_ail);
626 spin_lock_init(&ailp->xa_lock); 791 spin_lock_init(&ailp->xa_lock);
627 error = xfsaild_start(ailp); 792 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker);
628 if (error)
629 goto out_free_ailp;
630 mp->m_ail = ailp; 793 mp->m_ail = ailp;
631 return 0; 794 return 0;
632
633out_free_ailp:
634 kmem_free(ailp);
635 return error;
636} 795}
637 796
638void 797void
@@ -641,124 +800,6 @@ xfs_trans_ail_destroy(
641{ 800{
642 struct xfs_ail *ailp = mp->m_ail; 801 struct xfs_ail *ailp = mp->m_ail;
643 802
644 xfsaild_stop(ailp); 803 cancel_delayed_work_sync(&ailp->xa_work);
645 kmem_free(ailp); 804 kmem_free(ailp);
646} 805}
647
648/*
649 * splice the log item list into the AIL at the given LSN.
650 */
651STATIC void
652xfs_ail_splice(
653 struct xfs_ail *ailp,
654 struct list_head *list,
655 xfs_lsn_t lsn)
656{
657 xfs_log_item_t *next_lip;
658
659 /*
660 * If the list is empty, just insert the item.
661 */
662 if (list_empty(&ailp->xa_ail)) {
663 list_splice(list, &ailp->xa_ail);
664 return;
665 }
666
667 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
668 if (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0)
669 break;
670 }
671
672 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
673 (XFS_LSN_CMP(next_lip->li_lsn, lsn) <= 0));
674
675 list_splice_init(list, &next_lip->li_ail);
676 return;
677}
678
679/*
680 * Delete the given item from the AIL. Return a pointer to the item.
681 */
682STATIC void
683xfs_ail_delete(
684 struct xfs_ail *ailp,
685 xfs_log_item_t *lip)
686{
687 xfs_ail_check(ailp, lip);
688 list_del(&lip->li_ail);
689 xfs_trans_ail_cursor_clear(ailp, lip);
690}
691
692/*
693 * Return a pointer to the first item in the AIL.
694 * If the AIL is empty, then return NULL.
695 */
696STATIC xfs_log_item_t *
697xfs_ail_min(
698 struct xfs_ail *ailp)
699{
700 if (list_empty(&ailp->xa_ail))
701 return NULL;
702
703 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
704}
705
706/*
707 * Return a pointer to the item which follows
708 * the given item in the AIL. If the given item
709 * is the last item in the list, then return NULL.
710 */
711STATIC xfs_log_item_t *
712xfs_ail_next(
713 struct xfs_ail *ailp,
714 xfs_log_item_t *lip)
715{
716 if (lip->li_ail.next == &ailp->xa_ail)
717 return NULL;
718
719 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
720}
721
722#ifdef DEBUG
723/*
724 * Check that the list is sorted as it should be.
725 */
726STATIC void
727xfs_ail_check(
728 struct xfs_ail *ailp,
729 xfs_log_item_t *lip)
730{
731 xfs_log_item_t *prev_lip;
732
733 if (list_empty(&ailp->xa_ail))
734 return;
735
736 /*
737 * Check the next and previous entries are valid.
738 */
739 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
740 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
741 if (&prev_lip->li_ail != &ailp->xa_ail)
742 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
743
744 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
745 if (&prev_lip->li_ail != &ailp->xa_ail)
746 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
747
748
749#ifdef XFS_TRANS_DEBUG
750 /*
751 * Walk the list checking lsn ordering, and that every entry has the
752 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
753 * when specifically debugging the transaction subsystem.
754 */
755 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
756 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
757 if (&prev_lip->li_ail != &ailp->xa_ail)
758 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
759 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
760 prev_lip = lip;
761 }
762#endif /* XFS_TRANS_DEBUG */
763}
764#endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c47918c302a5..03b3b7f85a3b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -305,7 +305,7 @@ xfs_trans_read_buf(
305 if (xfs_error_target == target) { 305 if (xfs_error_target == target) {
306 if (((xfs_req_num++) % xfs_error_mod) == 0) { 306 if (((xfs_req_num++) % xfs_error_mod) == 0) {
307 xfs_buf_relse(bp); 307 xfs_buf_relse(bp);
308 cmn_err(CE_DEBUG, "Returning error!\n"); 308 xfs_debug(mp, "Returning error!");
309 return XFS_ERROR(EIO); 309 return XFS_ERROR(EIO);
310 } 310 }
311 } 311 }
@@ -383,7 +383,8 @@ xfs_trans_read_buf(
383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK); 383 bp = xfs_buf_read(target, blkno, len, flags | XBF_DONT_BLOCK);
384 if (bp == NULL) { 384 if (bp == NULL) {
385 *bpp = NULL; 385 *bpp = NULL;
386 return 0; 386 return (flags & XBF_TRYLOCK) ?
387 0 : XFS_ERROR(ENOMEM);
387 } 388 }
388 if (XFS_BUF_GETERROR(bp) != 0) { 389 if (XFS_BUF_GETERROR(bp) != 0) {
389 XFS_BUF_SUPER_STALE(bp); 390 XFS_BUF_SUPER_STALE(bp);
@@ -403,7 +404,7 @@ xfs_trans_read_buf(
403 xfs_force_shutdown(tp->t_mountp, 404 xfs_force_shutdown(tp->t_mountp,
404 SHUTDOWN_META_IO_ERROR); 405 SHUTDOWN_META_IO_ERROR);
405 xfs_buf_relse(bp); 406 xfs_buf_relse(bp);
406 cmn_err(CE_DEBUG, "Returning trans error!\n"); 407 xfs_debug(mp, "Returning trans error!");
407 return XFS_ERROR(EIO); 408 return XFS_ERROR(EIO);
408 } 409 }
409 } 410 }
@@ -427,7 +428,7 @@ shutdown_abort:
427 */ 428 */
428#if defined(DEBUG) 429#if defined(DEBUG)
429 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp)) 430 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
430 cmn_err(CE_NOTE, "about to pop assert, bp == 0x%p", bp); 431 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
431#endif 432#endif
432 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) != 433 ASSERT((XFS_BUF_BFLAGS(bp) & (XBF_STALE|XBF_DELWRI)) !=
433 (XBF_STALE|XBF_DELWRI)); 434 (XBF_STALE|XBF_DELWRI));
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index ccb34532768b..048b0c689d3e 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -44,28 +44,6 @@ xfs_trans_inode_broot_debug(
44#endif 44#endif
45 45
46/* 46/*
47 * Get an inode and join it to the transaction.
48 */
49int
50xfs_trans_iget(
51 xfs_mount_t *mp,
52 xfs_trans_t *tp,
53 xfs_ino_t ino,
54 uint flags,
55 uint lock_flags,
56 xfs_inode_t **ipp)
57{
58 int error;
59
60 error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp);
61 if (!error && tp) {
62 xfs_trans_ijoin(tp, *ipp);
63 (*ipp)->i_itemp->ili_lock_flags = lock_flags;
64 }
65 return error;
66}
67
68/*
69 * Add a locked inode to the transaction. 47 * Add a locked inode to the transaction.
70 * 48 *
71 * The inode must be locked, and it cannot be associated with any transaction. 49 * The inode must be locked, and it cannot be associated with any transaction.
@@ -103,7 +81,7 @@ xfs_trans_ijoin(
103 * 81 *
104 * 82 *
105 * Grabs a reference to the inode which will be dropped when the transaction 83 * Grabs a reference to the inode which will be dropped when the transaction
106 * is commited. The inode will also be unlocked at that point. The inode 84 * is committed. The inode will also be unlocked at that point. The inode
107 * must be locked, and it cannot be associated with any transaction. 85 * must be locked, and it cannot be associated with any transaction.
108 */ 86 */
109void 87void
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 35162c238fa3..6b164e9e9a1f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -65,16 +65,22 @@ struct xfs_ail_cursor {
65struct xfs_ail { 65struct xfs_ail {
66 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
67 struct list_head xa_ail; 67 struct list_head xa_ail;
68 uint xa_gen;
69 struct task_struct *xa_task;
70 xfs_lsn_t xa_target; 68 xfs_lsn_t xa_target;
71 struct xfs_ail_cursor xa_cursors; 69 struct xfs_ail_cursor xa_cursors;
72 spinlock_t xa_lock; 70 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
73}; 74};
74 75
76#define XFS_AIL_PUSHING_BIT 0
77
75/* 78/*
76 * From xfs_trans_ail.c 79 * From xfs_trans_ail.c
77 */ 80 */
81
82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
83
78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, 84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
79 struct xfs_log_item **log_items, int nr_items, 85 struct xfs_log_item **log_items, int nr_items,
80 xfs_lsn_t lsn) __releases(ailp->xa_lock); 86 xfs_lsn_t lsn) __releases(ailp->xa_lock);
@@ -98,12 +104,13 @@ xfs_trans_ail_delete(
98 xfs_trans_ail_delete_bulk(ailp, &lip, 1); 104 xfs_trans_ail_delete_bulk(ailp, &lip, 1);
99} 105}
100 106
101void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t); 107void xfs_ail_push(struct xfs_ail *, xfs_lsn_t);
108void xfs_ail_push_all(struct xfs_ail *);
109xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp);
110
102void xfs_trans_unlocked_item(struct xfs_ail *, 111void xfs_trans_unlocked_item(struct xfs_ail *,
103 xfs_log_item_t *); 112 xfs_log_item_t *);
104 113
105xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
106
107struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp, 114struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
108 struct xfs_ail_cursor *cur, 115 struct xfs_ail_cursor *cur,
109 xfs_lsn_t lsn); 116 xfs_lsn_t lsn);
@@ -112,11 +119,6 @@ struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
112void xfs_trans_ail_cursor_done(struct xfs_ail *ailp, 119void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
113 struct xfs_ail_cursor *cur); 120 struct xfs_ail_cursor *cur);
114 121
115long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
116void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
117int xfsaild_start(struct xfs_ail *);
118void xfsaild_stop(struct xfs_ail *);
119
120#if BITS_PER_LONG != 64 122#if BITS_PER_LONG != 64
121static inline void 123static inline void
122xfs_trans_ail_copy_lsn( 124xfs_trans_ail_copy_lsn(
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index d8e6f8cd6f0c..b7a5fe7c52c8 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -953,7 +953,7 @@ xfs_release(
953 * If we previously truncated this file and removed old data 953 * If we previously truncated this file and removed old data
954 * in the process, we want to initiate "early" writeout on 954 * in the process, we want to initiate "early" writeout on
955 * the last close. This is an attempt to combat the notorious 955 * the last close. This is an attempt to combat the notorious
956 * NULL files problem which is particularly noticable from a 956 * NULL files problem which is particularly noticeable from a
957 * truncate down, buffered (re-)write (delalloc), followed by 957 * truncate down, buffered (re-)write (delalloc), followed by
958 * a crash. What we are effectively doing here is 958 * a crash. What we are effectively doing here is
959 * significantly reducing the time window where we'd otherwise 959 * significantly reducing the time window where we'd otherwise
@@ -982,7 +982,7 @@ xfs_release(
982 * 982 *
983 * Further, check if the inode is being opened, written and 983 * Further, check if the inode is being opened, written and
984 * closed frequently and we have delayed allocation blocks 984 * closed frequently and we have delayed allocation blocks
985 * oustanding (e.g. streaming writes from the NFS server), 985 * outstanding (e.g. streaming writes from the NFS server),
986 * truncating the blocks past EOF will cause fragmentation to 986 * truncating the blocks past EOF will cause fragmentation to
987 * occur. 987 * occur.
988 * 988 *
@@ -1189,9 +1189,8 @@ xfs_inactive(
1189 * inode might be lost for a long time or forever. 1189 * inode might be lost for a long time or forever.
1190 */ 1190 */
1191 if (!XFS_FORCED_SHUTDOWN(mp)) { 1191 if (!XFS_FORCED_SHUTDOWN(mp)) {
1192 cmn_err(CE_NOTE, 1192 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1193 "xfs_inactive: xfs_ifree() returned an error = %d on %s", 1193 __func__, error);
1194 error, mp->m_fsname);
1195 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1194 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1196 } 1195 }
1197 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); 1196 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
@@ -1208,12 +1207,12 @@ xfs_inactive(
1208 */ 1207 */
1209 error = xfs_bmap_finish(&tp, &free_list, &committed); 1208 error = xfs_bmap_finish(&tp, &free_list, &committed);
1210 if (error) 1209 if (error)
1211 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1210 xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1212 "xfs_bmap_finish() returned error %d", error); 1211 __func__, error);
1213 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1212 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1214 if (error) 1213 if (error)
1215 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: " 1214 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1216 "xfs_trans_commit() returned error %d", error); 1215 __func__, error);
1217 } 1216 }
1218 1217
1219 /* 1218 /*
@@ -1310,7 +1309,7 @@ xfs_create(
1310 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid, 1309 error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
1311 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp); 1310 XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp);
1312 if (error) 1311 if (error)
1313 goto std_return; 1312 return error;
1314 1313
1315 if (is_dir) { 1314 if (is_dir) {
1316 rdev = 0; 1315 rdev = 0;
@@ -1390,12 +1389,6 @@ xfs_create(
1390 } 1389 }
1391 1390
1392 /* 1391 /*
1393 * At this point, we've gotten a newly allocated inode.
1394 * It is locked (and joined to the transaction).
1395 */
1396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1397
1398 /*
1399 * Now we join the directory inode to the transaction. We do not do it 1392 * Now we join the directory inode to the transaction. We do not do it
1400 * earlier because xfs_dir_ialloc might commit the previous transaction 1393 * earlier because xfs_dir_ialloc might commit the previous transaction
1401 * (and release all the locks). An error from here on will result in 1394 * (and release all the locks). An error from here on will result in
@@ -1440,22 +1433,13 @@ xfs_create(
1440 */ 1433 */
1441 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp); 1434 xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp);
1442 1435
1443 /*
1444 * xfs_trans_commit normally decrements the vnode ref count
1445 * when it unlocks the inode. Since we want to return the
1446 * vnode to the caller, we bump the vnode ref count now.
1447 */
1448 IHOLD(ip);
1449
1450 error = xfs_bmap_finish(&tp, &free_list, &committed); 1436 error = xfs_bmap_finish(&tp, &free_list, &committed);
1451 if (error) 1437 if (error)
1452 goto out_abort_rele; 1438 goto out_bmap_cancel;
1453 1439
1454 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1440 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1455 if (error) { 1441 if (error)
1456 IRELE(ip); 1442 goto out_release_inode;
1457 goto out_dqrele;
1458 }
1459 1443
1460 xfs_qm_dqrele(udqp); 1444 xfs_qm_dqrele(udqp);
1461 xfs_qm_dqrele(gdqp); 1445 xfs_qm_dqrele(gdqp);
@@ -1469,27 +1453,21 @@ xfs_create(
1469 cancel_flags |= XFS_TRANS_ABORT; 1453 cancel_flags |= XFS_TRANS_ABORT;
1470 out_trans_cancel: 1454 out_trans_cancel:
1471 xfs_trans_cancel(tp, cancel_flags); 1455 xfs_trans_cancel(tp, cancel_flags);
1472 out_dqrele: 1456 out_release_inode:
1457 /*
1458 * Wait until after the current transaction is aborted to
1459 * release the inode. This prevents recursive transactions
1460 * and deadlocks from xfs_inactive.
1461 */
1462 if (ip)
1463 IRELE(ip);
1464
1473 xfs_qm_dqrele(udqp); 1465 xfs_qm_dqrele(udqp);
1474 xfs_qm_dqrele(gdqp); 1466 xfs_qm_dqrele(gdqp);
1475 1467
1476 if (unlock_dp_on_error) 1468 if (unlock_dp_on_error)
1477 xfs_iunlock(dp, XFS_ILOCK_EXCL); 1469 xfs_iunlock(dp, XFS_ILOCK_EXCL);
1478 std_return:
1479 return error; 1470 return error;
1480
1481 out_abort_rele:
1482 /*
1483 * Wait until after the current transaction is aborted to
1484 * release the inode. This prevents recursive transactions
1485 * and deadlocks from xfs_inactive.
1486 */
1487 xfs_bmap_cancel(&free_list);
1488 cancel_flags |= XFS_TRANS_ABORT;
1489 xfs_trans_cancel(tp, cancel_flags);
1490 IRELE(ip);
1491 unlock_dp_on_error = B_FALSE;
1492 goto out_dqrele;
1493} 1471}
1494 1472
1495#ifdef DEBUG 1473#ifdef DEBUG
@@ -2114,9 +2092,8 @@ xfs_symlink(
2114 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA, 2092 XFS_BMAPI_WRITE | XFS_BMAPI_METADATA,
2115 &first_block, resblks, mval, &nmaps, 2093 &first_block, resblks, mval, &nmaps,
2116 &free_list); 2094 &free_list);
2117 if (error) { 2095 if (error)
2118 goto error1; 2096 goto error2;
2119 }
2120 2097
2121 if (resblks) 2098 if (resblks)
2122 resblks -= fs_blocks; 2099 resblks -= fs_blocks;
@@ -2148,7 +2125,7 @@ xfs_symlink(
2148 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, 2125 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
2149 &first_block, &free_list, resblks); 2126 &first_block, &free_list, resblks);
2150 if (error) 2127 if (error)
2151 goto error1; 2128 goto error2;
2152 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2129 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2153 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2130 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2154 2131
@@ -2161,13 +2138,6 @@ xfs_symlink(
2161 xfs_trans_set_sync(tp); 2138 xfs_trans_set_sync(tp);
2162 } 2139 }
2163 2140
2164 /*
2165 * xfs_trans_commit normally decrements the vnode ref count
2166 * when it unlocks the inode. Since we want to return the
2167 * vnode to the caller, we bump the vnode ref count now.
2168 */
2169 IHOLD(ip);
2170
2171 error = xfs_bmap_finish(&tp, &free_list, &committed); 2141 error = xfs_bmap_finish(&tp, &free_list, &committed);
2172 if (error) { 2142 if (error) {
2173 goto error2; 2143 goto error2;
@@ -2861,7 +2831,8 @@ xfs_change_file_space(
2861 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC; 2831 ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
2862 2832
2863 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2833 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2864 xfs_trans_set_sync(tp); 2834 if (attr_flags & XFS_ATTR_SYNC)
2835 xfs_trans_set_sync(tp);
2865 2836
2866 error = xfs_trans_commit(tp, 0); 2837 error = xfs_trans_commit(tp, 0);
2867 2838
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index f6702927eee4..3bcd23353d6c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -18,6 +18,7 @@ int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */ 18#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */ 19#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */ 20#define XFS_ATTR_NOACL 0x08 /* Don't call xfs_acl_chmod */
21#define XFS_ATTR_SYNC 0x10 /* synchronous operation required */
21 22
22int xfs_readlink(struct xfs_inode *ip, char *link); 23int xfs_readlink(struct xfs_inode *ip, char *link);
23int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);