aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2010-06-16 12:08:13 -0400
committerJiri Kosina <jkosina@suse.cz>2010-06-16 12:08:13 -0400
commitf1bbbb6912662b9f6070c5bfc4ca9eb1f06a9d5b (patch)
treec2c130a74be25b0b2dff992e1a195e2728bdaadd /fs
parentfd0961ff67727482bb20ca7e8ea97b83e9de2ddb (diff)
parent7e27d6e778cd87b6f2415515d7127eba53fe5d02 (diff)
Merge branch 'master' into for-next
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_dir.c8
-rw-r--r--fs/9p/vfs_file.c17
-rw-r--r--fs/9p/vfs_inode.c111
-rw-r--r--fs/9p/vfs_super.c55
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/adfs/file.c2
-rw-r--r--fs/adfs/inode.c3
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/file.c4
-rw-r--r--fs/affs/namei.c2
-rw-r--r--fs/afs/dir.c6
-rw-r--r--fs/afs/file.c64
-rw-r--r--fs/afs/internal.h3
-rw-r--r--fs/afs/mntpt.c6
-rw-r--r--fs/afs/server.c5
-rw-r--r--fs/afs/write.c3
-rw-r--r--fs/aio.c71
-rw-r--r--fs/attr.c50
-rw-r--r--fs/autofs/root.c1
-rw-r--r--fs/autofs4/dev-ioctl.c18
-rw-r--r--fs/autofs4/root.c22
-rw-r--r--fs/bad_inode.c3
-rw-r--r--fs/bfs/dir.c6
-rw-r--r--fs/binfmt_elf_fdpic.c26
-rw-r--r--fs/binfmt_flat.c25
-rw-r--r--fs/block_dev.c380
-rw-r--r--fs/btrfs/acl.c12
-rw-r--r--fs/btrfs/async-thread.c1
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c109
-rw-r--r--fs/btrfs/ctree.h165
-rw-r--r--fs/btrfs/delayed-ref.c101
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/disk-io.c180
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c2258
-rw-r--r--fs/btrfs/extent_io.c85
-rw-r--r--fs/btrfs/extent_io.h14
-rw-r--r--fs/btrfs/file-item.c28
-rw-r--r--fs/btrfs/file.c181
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c1724
-rw-r--r--fs/btrfs/ioctl.c208
-rw-r--r--fs/btrfs/ordered-data.c82
-rw-r--r--fs/btrfs/ordered-data.h9
-rw-r--r--fs/btrfs/relocation.c1974
-rw-r--r--fs/btrfs/root-tree.c26
-rw-r--r--fs/btrfs/super.c41
-rw-r--r--fs/btrfs/transaction.c232
-rw-r--r--fs/btrfs/transaction.h24
-rw-r--r--fs/btrfs/tree-defrag.c7
-rw-r--r--fs/btrfs/tree-log.c241
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c17
-rw-r--r--fs/btrfs/xattr.c14
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/buffer.c149
-rw-r--r--fs/ceph/addr.c11
-rw-r--r--fs/ceph/auth.c8
-rw-r--r--fs/ceph/auth.h8
-rw-r--r--fs/ceph/auth_none.c9
-rw-r--r--fs/ceph/auth_x.c31
-rw-r--r--fs/ceph/caps.c121
-rw-r--r--fs/ceph/ceph_fs.h83
-rw-r--r--fs/ceph/ceph_strings.c16
-rw-r--r--fs/ceph/debugfs.c13
-rw-r--r--fs/ceph/dir.c52
-rw-r--r--fs/ceph/export.c16
-rw-r--r--fs/ceph/file.c21
-rw-r--r--fs/ceph/inode.c99
-rw-r--r--fs/ceph/ioctl.c2
-rw-r--r--fs/ceph/mds_client.c428
-rw-r--r--fs/ceph/mds_client.h12
-rw-r--r--fs/ceph/messenger.c97
-rw-r--r--fs/ceph/messenger.h11
-rw-r--r--fs/ceph/mon_client.c264
-rw-r--r--fs/ceph/mon_client.h27
-rw-r--r--fs/ceph/msgpool.c180
-rw-r--r--fs/ceph/msgpool.h12
-rw-r--r--fs/ceph/msgr.h21
-rw-r--r--fs/ceph/osd_client.c105
-rw-r--r--fs/ceph/osdmap.c2
-rw-r--r--fs/ceph/pagelist.c2
-rw-r--r--fs/ceph/rados.h23
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.c142
-rw-r--r--fs/ceph/super.h33
-rw-r--r--fs/ceph/xattr.c35
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/file.c5
-rw-r--r--fs/coda/coda_int.h3
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/coda/pioctl.c76
-rw-r--r--fs/coda/psdev.c5
-rw-r--r--fs/compat.c132
-rw-r--r--fs/configfs/inode.c14
-rw-r--r--fs/dcache.c20
-rw-r--r--fs/debugfs/file.c21
-rw-r--r--fs/devpts/inode.c9
-rw-r--r--fs/direct-io.c123
-rw-r--r--fs/drop_caches.c24
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/file.c6
-rw-r--r--fs/ecryptfs/inode.c52
-rw-r--r--fs/ecryptfs/main.c166
-rw-r--r--fs/ecryptfs/mmap.c19
-rw-r--r--fs/ecryptfs/read_write.c13
-rw-r--r--fs/ecryptfs/super.c22
-rw-r--r--fs/exec.c202
-rw-r--r--fs/exofs/dir.c2
-rw-r--r--fs/exofs/file.c7
-rw-r--r--fs/exofs/inode.c41
-rw-r--r--fs/ext2/acl.c4
-rw-r--r--fs/ext2/ext2.h3
-rw-r--r--fs/ext2/file.c7
-rw-r--r--fs/ext2/ialloc.c12
-rw-r--r--fs/ext2/inode.c153
-rw-r--r--fs/ext2/super.c20
-rw-r--r--fs/ext2/xattr.c10
-rw-r--r--fs/ext2/xattr.h12
-rw-r--r--fs/ext2/xattr_security.c2
-rw-r--r--fs/ext2/xattr_trusted.c2
-rw-r--r--fs/ext2/xattr_user.c2
-rw-r--r--fs/ext3/acl.c4
-rw-r--r--fs/ext3/dir.c2
-rw-r--r--fs/ext3/fsync.c7
-rw-r--r--fs/ext3/ialloc.c13
-rw-r--r--fs/ext3/super.c38
-rw-r--r--fs/ext3/xattr.c10
-rw-r--r--fs/ext3/xattr.h12
-rw-r--r--fs/ext3/xattr_security.c2
-rw-r--r--fs/ext3/xattr_trusted.c2
-rw-r--r--fs/ext3/xattr_user.c2
-rw-r--r--fs/ext4/acl.c4
-rw-r--r--fs/ext4/balloc.c5
-rw-r--r--fs/ext4/block_validity.c4
-rw-r--r--fs/ext4/dir.c26
-rw-r--r--fs/ext4/ext4.h169
-rw-r--r--fs/ext4/ext4_jbd2.h8
-rw-r--r--fs/ext4/extents.c417
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsync.c47
-rw-r--r--fs/ext4/ialloc.c101
-rw-r--r--fs/ext4/inode.c763
-rw-r--r--fs/ext4/ioctl.c27
-rw-r--r--fs/ext4/mballoc.c120
-rw-r--r--fs/ext4/migrate.c2
-rw-r--r--fs/ext4/move_extent.c16
-rw-r--r--fs/ext4/namei.c61
-rw-r--r--fs/ext4/resize.c3
-rw-r--r--fs/ext4/super.c117
-rw-r--r--fs/ext4/symlink.c2
-rw-r--r--fs/ext4/xattr.c49
-rw-r--r--fs/ext4/xattr.h12
-rw-r--r--fs/ext4/xattr_security.c2
-rw-r--r--fs/ext4/xattr_trusted.c2
-rw-r--r--fs/ext4/xattr_user.c2
-rw-r--r--fs/fat/cache.c13
-rw-r--r--fs/fat/dir.c28
-rw-r--r--fs/fat/fat.h22
-rw-r--r--fs/fat/file.c59
-rw-r--r--fs/fat/inode.c43
-rw-r--r--fs/fat/misc.c22
-rw-r--r--fs/fcntl.c12
-rw-r--r--fs/file_table.c21
-rw-r--r--fs/freevxfs/vxfs_lookup.c2
-rw-r--r--fs/fs-writeback.c44
-rw-r--r--fs/fscache/object-list.c2
-rw-r--r--fs/fscache/page.c36
-rw-r--r--fs/fuse/dev.c528
-rw-r--r--fs/fuse/dir.c5
-rw-r--r--fs/fuse/file.c48
-rw-r--r--fs/fuse/fuse_i.h6
-rw-r--r--fs/generic_acl.c4
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/acl.h2
-rw-r--r--fs/gfs2/aops.c8
-rw-r--r--fs/gfs2/file.c11
-rw-r--r--fs/gfs2/inode.c54
-rw-r--r--fs/gfs2/inode.h3
-rw-r--r--fs/gfs2/log.c2
-rw-r--r--fs/gfs2/log.h29
-rw-r--r--fs/gfs2/ops_inode.c5
-rw-r--r--fs/gfs2/rgrp.c25
-rw-r--r--fs/gfs2/super.h2
-rw-r--r--fs/gfs2/xattr.c6
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h3
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/ioctl.c12
-rw-r--r--fs/hostfs/hostfs_kern.c4
-rw-r--r--fs/hpfs/file.c4
-rw-r--r--fs/hpfs/hpfs_fn.h2
-rw-r--r--fs/hppfs/hppfs.c2
-rw-r--r--fs/hugetlbfs/inode.c2
-rw-r--r--fs/inode.c26
-rw-r--r--fs/internal.h2
-rw-r--r--fs/ioctl.c15
-rw-r--r--fs/isofs/dir.c1
-rw-r--r--fs/jbd2/checkpoint.c3
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/transaction.c5
-rw-r--r--fs/jffs2/acl.c7
-rw-r--r--fs/jffs2/acl.h4
-rw-r--r--fs/jffs2/dir.c127
-rw-r--r--fs/jffs2/file.c4
-rw-r--r--fs/jffs2/fs.c11
-rw-r--r--fs/jffs2/os-linux.h2
-rw-r--r--fs/jffs2/security.c2
-rw-r--r--fs/jffs2/xattr.c8
-rw-r--r--fs/jffs2/xattr.h8
-rw-r--r--fs/jffs2/xattr_trusted.c2
-rw-r--r--fs/jffs2/xattr_user.c2
-rw-r--r--fs/jfs/file.c4
-rw-r--r--fs/jfs/jfs_inode.c12
-rw-r--r--fs/jfs/jfs_inode.h2
-rw-r--r--fs/jfs/super.c16
-rw-r--r--fs/libfs.c111
-rw-r--r--fs/logfs/file.c4
-rw-r--r--fs/logfs/inode.c9
-rw-r--r--fs/logfs/logfs.h2
-rw-r--r--fs/minix/bitmap.c5
-rw-r--r--fs/minix/dir.c11
-rw-r--r--fs/minix/file.c2
-rw-r--r--fs/minix/itree_v2.c27
-rw-r--r--fs/minix/minix.h2
-rw-r--r--fs/minix/namei.c11
-rw-r--r--fs/namei.c7
-rw-r--r--fs/ncpfs/dir.c3
-rw-r--r--fs/ncpfs/file.c4
-rw-r--r--fs/ncpfs/ioctl.c27
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/file.c5
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/write.c20
-rw-r--r--fs/nfsd/nfs4recover.c87
-rw-r--r--fs/nfsd/nfs4state.c2
-rw-r--r--fs/nfsd/nfsctl.c4
-rw-r--r--fs/nfsd/vfs.c8
-rw-r--r--fs/nilfs2/btree.h2
-rw-r--r--fs/nilfs2/file.c4
-rw-r--r--fs/nilfs2/inode.c11
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/nilfs2/segbuf.h2
-rw-r--r--fs/nilfs2/segment.h2
-rw-r--r--fs/nilfs2/super.c8
-rw-r--r--fs/nilfs2/the_nilfs.c4
-rw-r--r--fs/notify/inotify/inotify.c88
-rw-r--r--fs/ntfs/dir.c5
-rw-r--r--fs/ntfs/file.c37
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/blockcheck.c4
-rw-r--r--fs/ocfs2/file.c15
-rw-r--r--fs/ocfs2/namei.c9
-rw-r--r--fs/ocfs2/super.c50
-rw-r--r--fs/ocfs2/xattr.c12
-rw-r--r--fs/ocfs2/xattr.h12
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/omfs/inode.c4
-rw-r--r--fs/open.c166
-rw-r--r--fs/partitions/acorn.c68
-rw-r--r--fs/partitions/acorn.h10
-rw-r--r--fs/partitions/amiga.c13
-rw-r--r--fs/partitions/amiga.h2
-rw-r--r--fs/partitions/atari.c8
-rw-r--r--fs/partitions/atari.h2
-rw-r--r--fs/partitions/check.c84
-rw-r--r--fs/partitions/check.h12
-rw-r--r--fs/partitions/efi.c93
-rw-r--r--fs/partitions/efi.h2
-rw-r--r--fs/partitions/ibm.c21
-rw-r--r--fs/partitions/ibm.h2
-rw-r--r--fs/partitions/karma.c4
-rw-r--r--fs/partitions/karma.h2
-rw-r--r--fs/partitions/ldm.c107
-rw-r--r--fs/partitions/ldm.h2
-rw-r--r--fs/partitions/mac.c13
-rw-r--r--fs/partitions/mac.h2
-rw-r--r--fs/partitions/msdos.c87
-rw-r--r--fs/partitions/msdos.h2
-rw-r--r--fs/partitions/osf.c4
-rw-r--r--fs/partitions/osf.h2
-rw-r--r--fs/partitions/sgi.c6
-rw-r--r--fs/partitions/sgi.h2
-rw-r--r--fs/partitions/sun.c6
-rw-r--r--fs/partitions/sun.h2
-rw-r--r--fs/partitions/sysv68.c6
-rw-r--r--fs/partitions/sysv68.h2
-rw-r--r--fs/partitions/ultrix.c4
-rw-r--r--fs/partitions/ultrix.h2
-rw-r--r--fs/pipe.c170
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c16
-rw-r--r--fs/proc/generic.c15
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/root.c1
-rw-r--r--fs/proc/task_mmu.c4
-rw-r--r--fs/qnx4/dir.c3
-rw-r--r--fs/quota/dquot.c201
-rw-r--r--fs/quota/quota.c36
-rw-r--r--fs/ramfs/file-mmu.c3
-rw-r--r--fs/ramfs/file-nommu.c9
-rw-r--r--fs/ramfs/inode.c20
-rw-r--r--fs/read_write.c17
-rw-r--r--fs/reiserfs/dir.c9
-rw-r--r--fs/reiserfs/file.c8
-rw-r--r--fs/reiserfs/namei.c18
-rw-r--r--fs/reiserfs/super.c48
-rw-r--r--fs/reiserfs/xattr.c16
-rw-r--r--fs/reiserfs/xattr_acl.c4
-rw-r--r--fs/reiserfs/xattr_security.c2
-rw-r--r--fs/reiserfs/xattr_trusted.c2
-rw-r--r--fs/reiserfs/xattr_user.c2
-rw-r--r--fs/smbfs/dir.c3
-rw-r--r--fs/smbfs/file.c5
-rw-r--r--fs/smbfs/inode.c2
-rw-r--r--fs/smbfs/ioctl.c10
-rw-r--r--fs/smbfs/proto.h2
-rw-r--r--fs/smbfs/symlink.c1
-rw-r--r--fs/splice.c153
-rw-r--r--fs/squashfs/Kconfig11
-rw-r--r--fs/squashfs/Makefile2
-rw-r--r--fs/squashfs/inode.c92
-rw-r--r--fs/squashfs/namei.c6
-rw-r--r--fs/squashfs/squashfs.h12
-rw-r--r--fs/squashfs/squashfs_fs.h76
-rw-r--r--fs/squashfs/squashfs_fs_i.h3
-rw-r--r--fs/squashfs/squashfs_fs_sb.h3
-rw-r--r--fs/squashfs/super.c30
-rw-r--r--fs/squashfs/symlink.c11
-rw-r--r--fs/squashfs/xattr.c323
-rw-r--r--fs/squashfs/xattr.h46
-rw-r--r--fs/squashfs/xattr_id.c100
-rw-r--r--fs/statfs.c196
-rw-r--r--fs/super.c334
-rw-r--r--fs/sync.c92
-rw-r--r--fs/sysfs/inode.c8
-rw-r--r--fs/sysv/dir.c2
-rw-r--r--fs/sysv/file.c2
-rw-r--r--fs/sysv/ialloc.c11
-rw-r--r--fs/sysv/inode.c1
-rw-r--r--fs/ubifs/dir.c9
-rw-r--r--fs/ubifs/file.c17
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/udf/balloc.c43
-rw-r--r--fs/udf/dir.c3
-rw-r--r--fs/udf/file.c28
-rw-r--r--fs/udf/ialloc.c32
-rw-r--r--fs/udf/inode.c5
-rw-r--r--fs/udf/namei.c30
-rw-r--r--fs/udf/super.c13
-rw-r--r--fs/udf/udfdecl.h1
-rw-r--r--fs/ufs/balloc.c24
-rw-r--r--fs/ufs/dir.c2
-rw-r--r--fs/ufs/file.c5
-rw-r--r--fs/ufs/ialloc.c23
-rw-r--r--fs/ufs/inode.c4
-rw-r--r--fs/ufs/namei.c16
-rw-r--r--fs/ufs/super.c112
-rw-r--r--fs/ufs/truncate.c20
-rw-r--r--fs/ufs/ufs_fs.h1
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/Makefile1
-rw-r--r--fs/xfs/linux-2.6/xfs_acl.c4
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c10
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c16
-rw-r--r--fs/xfs/linux-2.6/xfs_quotaops.c2
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c15
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c9
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h439
-rw-r--r--fs/xfs/linux-2.6/xfs_xattr.c8
-rw-r--r--fs/xfs/quota/xfs_dquot.c6
-rw-r--r--fs/xfs/quota/xfs_qm.c4
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_ag.h25
-rw-r--r--fs/xfs/xfs_alloc.c357
-rw-r--r--fs/xfs/xfs_alloc.h7
-rw-r--r--fs/xfs/xfs_alloc_btree.c2
-rw-r--r--fs/xfs/xfs_buf_item.c166
-rw-r--r--fs/xfs/xfs_buf_item.h18
-rw-r--r--fs/xfs/xfs_error.c2
-rw-r--r--fs/xfs/xfs_iget.c29
-rw-r--r--fs/xfs/xfs_inode.c144
-rw-r--r--fs/xfs/xfs_log.c120
-rw-r--r--fs/xfs/xfs_log.h14
-rw-r--r--fs/xfs/xfs_log_cil.c725
-rw-r--r--fs/xfs/xfs_log_priv.h118
-rw-r--r--fs/xfs/xfs_log_recover.c57
-rw-r--r--fs/xfs/xfs_log_recover.h2
-rw-r--r--fs/xfs/xfs_mount.c68
-rw-r--r--fs/xfs/xfs_mount.h1
-rw-r--r--fs/xfs/xfs_rtalloc.c4
-rw-r--r--fs/xfs/xfs_rtalloc.h11
-rw-r--r--fs/xfs/xfs_trans.c590
-rw-r--r--fs/xfs/xfs_trans.h455
-rw-r--r--fs/xfs/xfs_trans_buf.c46
-rw-r--r--fs/xfs/xfs_trans_item.c114
-rw-r--r--fs/xfs/xfs_trans_priv.h15
-rw-r--r--fs/xfs/xfs_types.h2
-rw-r--r--fs/xfs/xfs_vnodeops.c2
406 files changed, 14155 insertions, 8739 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
40extern struct file_system_type v9fs_fs_type; 40extern struct file_system_type v9fs_fs_type;
41extern const struct address_space_operations v9fs_addr_operations; 41extern const struct address_space_operations v9fs_addr_operations;
42extern const struct file_operations v9fs_file_operations; 42extern const struct file_operations v9fs_file_operations;
43extern const struct file_operations v9fs_file_operations_dotl;
43extern const struct file_operations v9fs_dir_operations; 44extern const struct file_operations v9fs_dir_operations;
45extern const struct file_operations v9fs_dir_operations_dotl;
44extern const struct dentry_operations v9fs_dentry_operations; 46extern const struct dentry_operations v9fs_dentry_operations;
45extern const struct dentry_operations v9fs_cached_dentry_operations; 47extern const struct dentry_operations v9fs_cached_dentry_operations;
46 48
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
203 .open = v9fs_file_open, 203 .open = v9fs_file_open,
204 .release = v9fs_dir_release, 204 .release = v9fs_dir_release,
205}; 205};
206
207const struct file_operations v9fs_dir_operations_dotl = {
208 .read = generic_read_dir,
209 .llseek = generic_file_llseek,
210 .readdir = v9fs_dir_readdir,
211 .open = v9fs_file_open,
212 .release = v9fs_dir_release,
213};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
257 return total; 257 return total;
258} 258}
259 259
260static int v9fs_file_fsync(struct file *filp, struct dentry *dentry, 260static int v9fs_file_fsync(struct file *filp, int datasync)
261 int datasync)
262{ 261{
263 struct p9_fid *fid; 262 struct p9_fid *fid;
264 struct p9_wstat wstat; 263 struct p9_wstat wstat;
265 int retval; 264 int retval;
266 265
267 P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp, 266 P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
268 dentry, datasync);
269 267
270 fid = filp->private_data; 268 fid = filp->private_data;
271 v9fs_blank_wstat(&wstat); 269 v9fs_blank_wstat(&wstat);
@@ -296,3 +294,14 @@ const struct file_operations v9fs_file_operations = {
296 .mmap = generic_file_readonly_mmap, 294 .mmap = generic_file_readonly_mmap,
297 .fsync = v9fs_file_fsync, 295 .fsync = v9fs_file_fsync,
298}; 296};
297
298const struct file_operations v9fs_file_operations_dotl = {
299 .llseek = generic_file_llseek,
300 .read = v9fs_file_read,
301 .write = v9fs_file_write,
302 .open = v9fs_file_open,
303 .release = v9fs_dir_release,
304 .lock = v9fs_file_lock,
305 .mmap = generic_file_readonly_mmap,
306 .fsync = v9fs_file_fsync,
307};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index f2434fc9d2c4..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
44#include "cache.h" 44#include "cache.h"
45 45
46static const struct inode_operations v9fs_dir_inode_operations; 46static const struct inode_operations v9fs_dir_inode_operations;
47static const struct inode_operations v9fs_dir_inode_operations_ext; 47static const struct inode_operations v9fs_dir_inode_operations_dotu;
48static const struct inode_operations v9fs_dir_inode_operations_dotl;
48static const struct inode_operations v9fs_file_inode_operations; 49static const struct inode_operations v9fs_file_inode_operations;
50static const struct inode_operations v9fs_file_inode_operations_dotl;
49static const struct inode_operations v9fs_symlink_inode_operations; 51static const struct inode_operations v9fs_symlink_inode_operations;
52static const struct inode_operations v9fs_symlink_inode_operations_dotl;
50 53
51/** 54/**
52 * unixmode2p9mode - convert unix mode bits to plan 9 55 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -253,9 +256,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
253 return ERR_PTR(-ENOMEM); 256 return ERR_PTR(-ENOMEM);
254 } 257 }
255 258
256 inode->i_mode = mode; 259 inode_init_owner(inode, NULL, mode);
257 inode->i_uid = current_fsuid();
258 inode->i_gid = current_fsgid();
259 inode->i_blocks = 0; 260 inode->i_blocks = 0;
260 inode->i_rdev = 0; 261 inode->i_rdev = 0;
261 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 262 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -275,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
275 init_special_inode(inode, inode->i_mode, inode->i_rdev); 276 init_special_inode(inode, inode->i_mode, inode->i_rdev);
276 break; 277 break;
277 case S_IFREG: 278 case S_IFREG:
278 inode->i_op = &v9fs_file_inode_operations; 279 if (v9fs_proto_dotl(v9ses)) {
279 inode->i_fop = &v9fs_file_operations; 280 inode->i_op = &v9fs_file_inode_operations_dotl;
281 inode->i_fop = &v9fs_file_operations_dotl;
282 } else {
283 inode->i_op = &v9fs_file_inode_operations;
284 inode->i_fop = &v9fs_file_operations;
285 }
286
280 break; 287 break;
288
281 case S_IFLNK: 289 case S_IFLNK:
282 if (!v9fs_proto_dotu(v9ses)) { 290 if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
283 P9_DPRINTK(P9_DEBUG_ERROR, 291 P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
284 "extended modes used w/o 9P2000.u\n"); 292 "legacy protocol.\n");
285 err = -EINVAL; 293 err = -EINVAL;
286 goto error; 294 goto error;
287 } 295 }
288 inode->i_op = &v9fs_symlink_inode_operations; 296
297 if (v9fs_proto_dotl(v9ses))
298 inode->i_op = &v9fs_symlink_inode_operations_dotl;
299 else
300 inode->i_op = &v9fs_symlink_inode_operations;
301
289 break; 302 break;
290 case S_IFDIR: 303 case S_IFDIR:
291 inc_nlink(inode); 304 inc_nlink(inode);
292 if (v9fs_proto_dotu(v9ses)) 305 if (v9fs_proto_dotl(v9ses))
293 inode->i_op = &v9fs_dir_inode_operations_ext; 306 inode->i_op = &v9fs_dir_inode_operations_dotl;
307 else if (v9fs_proto_dotu(v9ses))
308 inode->i_op = &v9fs_dir_inode_operations_dotu;
294 else 309 else
295 inode->i_op = &v9fs_dir_inode_operations; 310 inode->i_op = &v9fs_dir_inode_operations;
296 inode->i_fop = &v9fs_dir_operations; 311
312 if (v9fs_proto_dotl(v9ses))
313 inode->i_fop = &v9fs_dir_operations_dotl;
314 else
315 inode->i_fop = &v9fs_dir_operations;
316
297 break; 317 break;
298 default: 318 default:
299 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", 319 P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -434,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
434{ 454{
435 int retval; 455 int retval;
436 struct inode *file_inode; 456 struct inode *file_inode;
437 struct v9fs_session_info *v9ses;
438 struct p9_fid *v9fid; 457 struct p9_fid *v9fid;
439 458
440 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file, 459 P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
441 rmdir); 460 rmdir);
442 461
443 file_inode = file->d_inode; 462 file_inode = file->d_inode;
444 v9ses = v9fs_inode2v9ses(file_inode);
445 v9fid = v9fs_fid_clone(file); 463 v9fid = v9fs_fid_clone(file);
446 if (IS_ERR(v9fid)) 464 if (IS_ERR(v9fid))
447 return PTR_ERR(v9fid); 465 return PTR_ERR(v9fid);
@@ -484,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
484 ofid = NULL; 502 ofid = NULL;
485 fid = NULL; 503 fid = NULL;
486 name = (char *) dentry->d_name.name; 504 name = (char *) dentry->d_name.name;
487 dfid = v9fs_fid_clone(dentry->d_parent); 505 dfid = v9fs_fid_lookup(dentry->d_parent);
488 if (IS_ERR(dfid)) { 506 if (IS_ERR(dfid)) {
489 err = PTR_ERR(dfid); 507 err = PTR_ERR(dfid);
490 P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err); 508 P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
491 dfid = NULL; 509 return ERR_PTR(err);
492 goto error;
493 } 510 }
494 511
495 /* clone a fid to use for creation */ 512 /* clone a fid to use for creation */
@@ -497,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
497 if (IS_ERR(ofid)) { 514 if (IS_ERR(ofid)) {
498 err = PTR_ERR(ofid); 515 err = PTR_ERR(ofid);
499 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 516 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
500 ofid = NULL; 517 return ERR_PTR(err);
501 goto error;
502 } 518 }
503 519
504 err = p9_client_fcreate(ofid, name, perm, mode, extension); 520 err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -508,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
508 } 524 }
509 525
510 /* now walk from the parent so we can get unopened fid */ 526 /* now walk from the parent so we can get unopened fid */
511 fid = p9_client_walk(dfid, 1, &name, 0); 527 fid = p9_client_walk(dfid, 1, &name, 1);
512 if (IS_ERR(fid)) { 528 if (IS_ERR(fid)) {
513 err = PTR_ERR(fid); 529 err = PTR_ERR(fid);
514 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); 530 P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
515 fid = NULL; 531 fid = NULL;
516 goto error; 532 goto error;
517 } else 533 }
518 dfid = NULL;
519 534
520 /* instantiate inode and assign the unopened fid to the dentry */ 535 /* instantiate inode and assign the unopened fid to the dentry */
521 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb); 536 inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -538,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
538 return ofid; 553 return ofid;
539 554
540error: 555error:
541 if (dfid)
542 p9_client_clunk(dfid);
543
544 if (ofid) 556 if (ofid)
545 p9_client_clunk(ofid); 557 p9_client_clunk(ofid);
546 558
@@ -675,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
675 if (IS_ERR(fid)) { 687 if (IS_ERR(fid)) {
676 result = PTR_ERR(fid); 688 result = PTR_ERR(fid);
677 if (result == -ENOENT) { 689 if (result == -ENOENT) {
678 d_add(dentry, NULL); 690 inode = NULL;
679 return NULL; 691 goto inst_out;
680 } 692 }
681 693
682 return ERR_PTR(result); 694 return ERR_PTR(result);
@@ -693,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
693 if (result < 0) 705 if (result < 0)
694 goto error; 706 goto error;
695 707
696 if ((fid->qid.version) && (v9ses->cache)) 708inst_out:
709 if (v9ses->cache)
697 dentry->d_op = &v9fs_cached_dentry_operations; 710 dentry->d_op = &v9fs_cached_dentry_operations;
698 else 711 else
699 dentry->d_op = &v9fs_dentry_operations; 712 dentry->d_op = &v9fs_dentry_operations;
@@ -772,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
772 goto clunk_olddir; 785 goto clunk_olddir;
773 } 786 }
774 787
788 if (v9fs_proto_dotl(v9ses)) {
789 retval = p9_client_rename(oldfid, newdirfid,
790 (char *) new_dentry->d_name.name);
791 if (retval != -ENOSYS)
792 goto clunk_newdir;
793 }
794
775 /* 9P can only handle file rename in the same directory */ 795 /* 9P can only handle file rename in the same directory */
776 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) { 796 if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
777 P9_DPRINTK(P9_DEBUG_ERROR, 797 P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1197,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1197 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); 1217 sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
1198 else if (S_ISFIFO(mode)) 1218 else if (S_ISFIFO(mode))
1199 *name = 0; 1219 *name = 0;
1220 else if (S_ISSOCK(mode))
1221 *name = 0;
1200 else { 1222 else {
1201 __putname(name); 1223 __putname(name);
1202 return -EINVAL; 1224 return -EINVAL;
@@ -1208,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
1208 return retval; 1230 return retval;
1209} 1231}
1210 1232
1211static const struct inode_operations v9fs_dir_inode_operations_ext = { 1233static const struct inode_operations v9fs_dir_inode_operations_dotu = {
1234 .create = v9fs_vfs_create,
1235 .lookup = v9fs_vfs_lookup,
1236 .symlink = v9fs_vfs_symlink,
1237 .link = v9fs_vfs_link,
1238 .unlink = v9fs_vfs_unlink,
1239 .mkdir = v9fs_vfs_mkdir,
1240 .rmdir = v9fs_vfs_rmdir,
1241 .mknod = v9fs_vfs_mknod,
1242 .rename = v9fs_vfs_rename,
1243 .getattr = v9fs_vfs_getattr,
1244 .setattr = v9fs_vfs_setattr,
1245};
1246
1247static const struct inode_operations v9fs_dir_inode_operations_dotl = {
1212 .create = v9fs_vfs_create, 1248 .create = v9fs_vfs_create,
1213 .lookup = v9fs_vfs_lookup, 1249 .lookup = v9fs_vfs_lookup,
1214 .symlink = v9fs_vfs_symlink, 1250 .symlink = v9fs_vfs_symlink,
@@ -1239,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
1239 .setattr = v9fs_vfs_setattr, 1275 .setattr = v9fs_vfs_setattr,
1240}; 1276};
1241 1277
1278static const struct inode_operations v9fs_file_inode_operations_dotl = {
1279 .getattr = v9fs_vfs_getattr,
1280 .setattr = v9fs_vfs_setattr,
1281};
1282
1242static const struct inode_operations v9fs_symlink_inode_operations = { 1283static const struct inode_operations v9fs_symlink_inode_operations = {
1243 .readlink = generic_readlink, 1284 .readlink = generic_readlink,
1244 .follow_link = v9fs_vfs_follow_link, 1285 .follow_link = v9fs_vfs_follow_link,
@@ -1246,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
1246 .getattr = v9fs_vfs_getattr, 1287 .getattr = v9fs_vfs_getattr,
1247 .setattr = v9fs_vfs_setattr, 1288 .setattr = v9fs_vfs_setattr,
1248}; 1289};
1290
1291static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
1292 .readlink = generic_readlink,
1293 .follow_link = v9fs_vfs_follow_link,
1294 .put_link = v9fs_vfs_put_link,
1295 .getattr = v9fs_vfs_getattr,
1296 .setattr = v9fs_vfs_setattr,
1297};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
38#include <linux/idr.h> 38#include <linux/idr.h>
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/slab.h> 40#include <linux/slab.h>
41#include <linux/statfs.h>
41#include <net/9p/9p.h> 42#include <net/9p/9p.h>
42#include <net/9p/client.h> 43#include <net/9p/client.h>
43 44
@@ -45,7 +46,7 @@
45#include "v9fs_vfs.h" 46#include "v9fs_vfs.h"
46#include "fid.h" 47#include "fid.h"
47 48
48static const struct super_operations v9fs_super_ops; 49static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
49 50
50/** 51/**
51 * v9fs_set_super - set the superblock 52 * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
76 sb->s_blocksize_bits = fls(v9ses->maxdata - 1); 77 sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
77 sb->s_blocksize = 1 << sb->s_blocksize_bits; 78 sb->s_blocksize = 1 << sb->s_blocksize_bits;
78 sb->s_magic = V9FS_MAGIC; 79 sb->s_magic = V9FS_MAGIC;
79 sb->s_op = &v9fs_super_ops; 80 if (v9fs_proto_dotl(v9ses))
81 sb->s_op = &v9fs_super_ops_dotl;
82 else
83 sb->s_op = &v9fs_super_ops;
80 sb->s_bdi = &v9ses->bdi; 84 sb->s_bdi = &v9ses->bdi;
81 85
82 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | 86 sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
211 v9fs_session_begin_cancel(v9ses); 215 v9fs_session_begin_cancel(v9ses);
212} 216}
213 217
218static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
219{
220 struct v9fs_session_info *v9ses;
221 struct p9_fid *fid;
222 struct p9_rstatfs rs;
223 int res;
224
225 fid = v9fs_fid_lookup(dentry);
226 if (IS_ERR(fid)) {
227 res = PTR_ERR(fid);
228 goto done;
229 }
230
231 v9ses = v9fs_inode2v9ses(dentry->d_inode);
232 if (v9fs_proto_dotl(v9ses)) {
233 res = p9_client_statfs(fid, &rs);
234 if (res == 0) {
235 buf->f_type = rs.type;
236 buf->f_bsize = rs.bsize;
237 buf->f_blocks = rs.blocks;
238 buf->f_bfree = rs.bfree;
239 buf->f_bavail = rs.bavail;
240 buf->f_files = rs.files;
241 buf->f_ffree = rs.ffree;
242 buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
243 buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
244 buf->f_namelen = rs.namelen;
245 }
246 if (res != -ENOSYS)
247 goto done;
248 }
249 res = simple_statfs(dentry, buf);
250done:
251 return res;
252}
253
214static const struct super_operations v9fs_super_ops = { 254static const struct super_operations v9fs_super_ops = {
215#ifdef CONFIG_9P_FSCACHE 255#ifdef CONFIG_9P_FSCACHE
216 .alloc_inode = v9fs_alloc_inode, 256 .alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
222 .umount_begin = v9fs_umount_begin, 262 .umount_begin = v9fs_umount_begin,
223}; 263};
224 264
265static const struct super_operations v9fs_super_ops_dotl = {
266#ifdef CONFIG_9P_FSCACHE
267 .alloc_inode = v9fs_alloc_inode,
268 .destroy_inode = v9fs_destroy_inode,
269#endif
270 .statfs = v9fs_statfs,
271 .clear_inode = v9fs_clear_inode,
272 .show_options = generic_show_options,
273 .umount_begin = v9fs_umount_begin,
274};
275
225struct file_system_type v9fs_fs_type = { 276struct file_system_type v9fs_fs_type = {
226 .name = "9p", 277 .name = "9p",
227 .get_sb = v9fs_get_sb, 278 .get_sb = v9fs_get_sb,
diff --git a/fs/Makefile b/fs/Makefile
index 97f340f14ba2..e6ec1d309b1d 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
11 attr.o bad_inode.o file.o filesystems.o namespace.o \ 11 attr.o bad_inode.o file.o filesystems.o namespace.o \
12 seq_file.o xattr.o libfs.o fs-writeback.o \ 12 seq_file.o xattr.o libfs.o fs-writeback.o \
13 pnode.o drop_caches.o splice.o sync.o utimes.o \ 13 pnode.o drop_caches.o splice.o sync.o utimes.o \
14 stack.o fs_struct.o 14 stack.o fs_struct.o statfs.o
15 15
16ifeq ($(CONFIG_BLOCK),y) 16ifeq ($(CONFIG_BLOCK),y)
17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o 17obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
197 .read = generic_read_dir, 197 .read = generic_read_dir,
198 .llseek = generic_file_llseek, 198 .llseek = generic_file_llseek,
199 .readdir = adfs_readdir, 199 .readdir = adfs_readdir,
200 .fsync = simple_fsync, 200 .fsync = generic_file_fsync,
201}; 201};
202 202
203static int 203static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
26 .read = do_sync_read, 26 .read = do_sync_read,
27 .aio_read = generic_file_aio_read, 27 .aio_read = generic_file_aio_read,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .write = do_sync_write, 30 .write = do_sync_write,
31 .aio_write = generic_file_aio_write, 31 .aio_write = generic_file_aio_write,
32 .splice_read = generic_file_splice_read, 32 .splice_read = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
322 if (error) 322 if (error)
323 goto out; 323 goto out;
324 324
325 /* XXX: this is missing some actual on-disk truncation.. */
325 if (ia_valid & ATTR_SIZE) 326 if (ia_valid & ATTR_SIZE)
326 error = vmtruncate(inode, attr->ia_size); 327 error = simple_setsize(inode, attr->ia_size);
327 328
328 if (error) 329 if (error)
329 goto out; 330 goto out;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
183 183
184void affs_free_prealloc(struct inode *inode); 184void affs_free_prealloc(struct inode *inode);
185extern void affs_truncate(struct inode *); 185extern void affs_truncate(struct inode *);
186int affs_file_fsync(struct file *, struct dentry *, int); 186int affs_file_fsync(struct file *, int);
187 187
188/* dir.c */ 188/* dir.c */
189 189
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
916 affs_free_prealloc(inode); 916 affs_free_prealloc(inode);
917} 917}
918 918
919int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 919int affs_file_fsync(struct file *filp, int datasync)
920{ 920{
921 struct inode * inode = dentry->d_inode; 921 struct inode *inode = filp->f_mapping->host;
922 int ret, err; 922 int ret, err;
923 923
924 ret = write_inode_now(inode, 0); 924 ret = write_inode_now(inode, 0);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
224 affs_brelse(bh); 224 affs_brelse(bh);
225 inode = affs_iget(sb, ino); 225 inode = affs_iget(sb, ino);
226 if (IS_ERR(inode)) 226 if (IS_ERR(inode))
227 return ERR_PTR(PTR_ERR(inode)); 227 return ERR_CAST(inode);
228 } 228 }
229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations; 229 dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
230 d_add(dentry, inode); 230 d_add(dentry, inode);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index adc1cb771b57..b42d5cc1d6d2 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -189,13 +189,9 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
189 struct key *key) 189 struct key *key)
190{ 190{
191 struct page *page; 191 struct page *page;
192 struct file file = {
193 .private_data = key,
194 };
195
196 _enter("{%lu},%lu", dir->i_ino, index); 192 _enter("{%lu},%lu", dir->i_ino, index);
197 193
198 page = read_mapping_page(dir->i_mapping, index, &file); 194 page = read_cache_page(dir->i_mapping, index, afs_page_filler, key);
199 if (!IS_ERR(page)) { 195 if (!IS_ERR(page)) {
200 kmap(page); 196 kmap(page);
201 if (!PageChecked(page)) 197 if (!PageChecked(page))
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0df9bc2b724d..14d89fa58fee 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -121,34 +121,19 @@ static void afs_file_readpage_read_complete(struct page *page,
121#endif 121#endif
122 122
123/* 123/*
124 * AFS read page from file, directory or symlink 124 * read page from file, directory or symlink, given a key to use
125 */ 125 */
126static int afs_readpage(struct file *file, struct page *page) 126int afs_page_filler(void *data, struct page *page)
127{ 127{
128 struct afs_vnode *vnode; 128 struct inode *inode = page->mapping->host;
129 struct inode *inode; 129 struct afs_vnode *vnode = AFS_FS_I(inode);
130 struct key *key; 130 struct key *key = data;
131 size_t len; 131 size_t len;
132 off_t offset; 132 off_t offset;
133 int ret; 133 int ret;
134 134
135 inode = page->mapping->host;
136
137 if (file) {
138 key = file->private_data;
139 ASSERT(key != NULL);
140 } else {
141 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
142 if (IS_ERR(key)) {
143 ret = PTR_ERR(key);
144 goto error_nokey;
145 }
146 }
147
148 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); 135 _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
149 136
150 vnode = AFS_FS_I(inode);
151
152 BUG_ON(!PageLocked(page)); 137 BUG_ON(!PageLocked(page));
153 138
154 ret = -ESTALE; 139 ret = -ESTALE;
@@ -214,31 +199,56 @@ static int afs_readpage(struct file *file, struct page *page)
214 unlock_page(page); 199 unlock_page(page);
215 } 200 }
216 201
217 if (!file)
218 key_put(key);
219 _leave(" = 0"); 202 _leave(" = 0");
220 return 0; 203 return 0;
221 204
222error: 205error:
223 SetPageError(page); 206 SetPageError(page);
224 unlock_page(page); 207 unlock_page(page);
225 if (!file)
226 key_put(key);
227error_nokey:
228 _leave(" = %d", ret); 208 _leave(" = %d", ret);
229 return ret; 209 return ret;
230} 210}
231 211
232/* 212/*
213 * read page from file, directory or symlink, given a file to nominate the key
214 * to be used
215 */
216static int afs_readpage(struct file *file, struct page *page)
217{
218 struct key *key;
219 int ret;
220
221 if (file) {
222 key = file->private_data;
223 ASSERT(key != NULL);
224 ret = afs_page_filler(key, page);
225 } else {
226 struct inode *inode = page->mapping->host;
227 key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
228 if (IS_ERR(key)) {
229 ret = PTR_ERR(key);
230 } else {
231 ret = afs_page_filler(key, page);
232 key_put(key);
233 }
234 }
235 return ret;
236}
237
238/*
233 * read a set of pages 239 * read a set of pages
234 */ 240 */
235static int afs_readpages(struct file *file, struct address_space *mapping, 241static int afs_readpages(struct file *file, struct address_space *mapping,
236 struct list_head *pages, unsigned nr_pages) 242 struct list_head *pages, unsigned nr_pages)
237{ 243{
244 struct key *key = file->private_data;
238 struct afs_vnode *vnode; 245 struct afs_vnode *vnode;
239 int ret = 0; 246 int ret = 0;
240 247
241 _enter(",{%lu},,%d", mapping->host->i_ino, nr_pages); 248 _enter("{%d},{%lu},,%d",
249 key_serial(key), mapping->host->i_ino, nr_pages);
250
251 ASSERT(key != NULL);
242 252
243 vnode = AFS_FS_I(mapping->host); 253 vnode = AFS_FS_I(mapping->host);
244 if (vnode->flags & AFS_VNODE_DELETED) { 254 if (vnode->flags & AFS_VNODE_DELETED) {
@@ -279,7 +289,7 @@ static int afs_readpages(struct file *file, struct address_space *mapping,
279 } 289 }
280 290
281 /* load the missing pages from the network */ 291 /* load the missing pages from the network */
282 ret = read_cache_pages(mapping, pages, (void *) afs_readpage, file); 292 ret = read_cache_pages(mapping, pages, afs_page_filler, key);
283 293
284 _leave(" = %d [netting]", ret); 294 _leave(" = %d [netting]", ret);
285 return ret; 295 return ret;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index a10f2582844f..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -494,6 +494,7 @@ extern const struct file_operations afs_file_operations;
494 494
495extern int afs_open(struct inode *, struct file *); 495extern int afs_open(struct inode *, struct file *);
496extern int afs_release(struct inode *, struct file *); 496extern int afs_release(struct inode *, struct file *);
497extern int afs_page_filler(void *, struct page *);
497 498
498/* 499/*
499 * flock.c 500 * flock.c
@@ -739,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
739extern ssize_t afs_file_write(struct kiocb *, const struct iovec *, 740extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
740 unsigned long, loff_t); 741 unsigned long, loff_t);
741extern int afs_writeback_all(struct afs_vnode *); 742extern int afs_writeback_all(struct afs_vnode *);
742extern int afs_fsync(struct file *, struct dentry *, int); 743extern int afs_fsync(struct file *, int);
743 744
744 745
745/*****************************************************************************/ 746/*****************************************************************************/
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index b3feddc4f7d6..a9e23039ea34 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -49,9 +49,6 @@ static unsigned long afs_mntpt_expiry_timeout = 10 * 60;
49 */ 49 */
50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key) 50int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
51{ 51{
52 struct file file = {
53 .private_data = key,
54 };
55 struct page *page; 52 struct page *page;
56 size_t size; 53 size_t size;
57 char *buf; 54 char *buf;
@@ -61,7 +58,8 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
61 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique); 58 vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
62 59
63 /* read the contents of the symlink into the pagecache */ 60 /* read the contents of the symlink into the pagecache */
64 page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file); 61 page = read_cache_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0,
62 afs_page_filler, key);
65 if (IS_ERR(page)) { 63 if (IS_ERR(page)) {
66 ret = PTR_ERR(page); 64 ret = PTR_ERR(page);
67 goto out; 65 goto out;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
91 91
92 memcpy(&server->addr, addr, sizeof(struct in_addr)); 92 memcpy(&server->addr, addr, sizeof(struct in_addr));
93 server->addr.s_addr = addr->s_addr; 93 server->addr.s_addr = addr->s_addr;
94 _leave(" = %p{%d}", server, atomic_read(&server->usage));
95 } else {
96 _leave(" = NULL [nomem]");
94 } 97 }
95
96 _leave(" = %p{%d}", server, atomic_read(&server->usage));
97 return server; 98 return server;
98} 99}
99 100
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
701 * - the return status from this call provides a reliable indication of 701 * - the return status from this call provides a reliable indication of
702 * whether any write errors occurred for this process. 702 * whether any write errors occurred for this process.
703 */ 703 */
704int afs_fsync(struct file *file, struct dentry *dentry, int datasync) 704int afs_fsync(struct file *file, int datasync)
705{ 705{
706 struct dentry *dentry = file->f_path.dentry;
706 struct afs_writeback *wb, *xwb; 707 struct afs_writeback *wb, *xwb;
707 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 708 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
708 int ret; 709 int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
36#include <linux/blkdev.h> 36#include <linux/blkdev.h>
37#include <linux/mempool.h> 37#include <linux/mempool.h>
38#include <linux/hash.h> 38#include <linux/hash.h>
39#include <linux/compat.h>
39 40
40#include <asm/kmap_types.h> 41#include <asm/kmap_types.h>
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
526 527
527 /* Complete the fput(s) */ 528 /* Complete the fput(s) */
528 if (req->ki_filp != NULL) 529 if (req->ki_filp != NULL)
529 __fput(req->ki_filp); 530 fput(req->ki_filp);
530 531
531 /* Link the iocb into the context's free list */ 532 /* Link the iocb into the context's free list */
532 spin_lock_irq(&ctx->ctx_lock); 533 spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
559 560
560 /* 561 /*
561 * Try to optimize the aio and eventfd file* puts, by avoiding to 562 * Try to optimize the aio and eventfd file* puts, by avoiding to
562 * schedule work in case it is not __fput() time. In normal cases, 563 * schedule work in case it is not final fput() time. In normal cases,
563 * we would not be holding the last reference to the file*, so 564 * we would not be holding the last reference to the file*, so
564 * this function will be executed w/out any aio kthread wakeup. 565 * this function will be executed w/out any aio kthread wakeup.
565 */ 566 */
566 if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { 567 if (unlikely(!fput_atomic(req->ki_filp))) {
567 get_ioctx(ctx); 568 get_ioctx(ctx);
568 spin_lock(&fput_lock); 569 spin_lock(&fput_lock);
569 list_add(&req->ki_list, &fput_head); 570 list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
1384 return ret; 1385 return ret;
1385} 1386}
1386 1387
1387static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb) 1388static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1388{ 1389{
1389 ssize_t ret; 1390 ssize_t ret;
1390 1391
1391 ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf, 1392#ifdef CONFIG_COMPAT
1392 kiocb->ki_nbytes, 1, 1393 if (compat)
1393 &kiocb->ki_inline_vec, &kiocb->ki_iovec); 1394 ret = compat_rw_copy_check_uvector(type,
1395 (struct compat_iovec __user *)kiocb->ki_buf,
1396 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1397 &kiocb->ki_iovec);
1398 else
1399#endif
1400 ret = rw_copy_check_uvector(type,
1401 (struct iovec __user *)kiocb->ki_buf,
1402 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1403 &kiocb->ki_iovec);
1394 if (ret < 0) 1404 if (ret < 0)
1395 goto out; 1405 goto out;
1396 1406
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
1420 * Performs the initial checks and aio retry method 1430 * Performs the initial checks and aio retry method
1421 * setup for the kiocb at the time of io submission. 1431 * setup for the kiocb at the time of io submission.
1422 */ 1432 */
1423static ssize_t aio_setup_iocb(struct kiocb *kiocb) 1433static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
1424{ 1434{
1425 struct file *file = kiocb->ki_filp; 1435 struct file *file = kiocb->ki_filp;
1426 ssize_t ret = 0; 1436 ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1469 ret = security_file_permission(file, MAY_READ); 1479 ret = security_file_permission(file, MAY_READ);
1470 if (unlikely(ret)) 1480 if (unlikely(ret))
1471 break; 1481 break;
1472 ret = aio_setup_vectored_rw(READ, kiocb); 1482 ret = aio_setup_vectored_rw(READ, kiocb, compat);
1473 if (ret) 1483 if (ret)
1474 break; 1484 break;
1475 ret = -EINVAL; 1485 ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
1483 ret = security_file_permission(file, MAY_WRITE); 1493 ret = security_file_permission(file, MAY_WRITE);
1484 if (unlikely(ret)) 1494 if (unlikely(ret))
1485 break; 1495 break;
1486 ret = aio_setup_vectored_rw(WRITE, kiocb); 1496 ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
1487 if (ret) 1497 if (ret)
1488 break; 1498 break;
1489 ret = -EINVAL; 1499 ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
1548} 1558}
1549 1559
1550static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, 1560static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1551 struct iocb *iocb, struct hlist_head *batch_hash) 1561 struct iocb *iocb, struct hlist_head *batch_hash,
1562 bool compat)
1552{ 1563{
1553 struct kiocb *req; 1564 struct kiocb *req;
1554 struct file *file; 1565 struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1609 req->ki_left = req->ki_nbytes = iocb->aio_nbytes; 1620 req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
1610 req->ki_opcode = iocb->aio_lio_opcode; 1621 req->ki_opcode = iocb->aio_lio_opcode;
1611 1622
1612 ret = aio_setup_iocb(req); 1623 ret = aio_setup_iocb(req, compat);
1613 1624
1614 if (ret) 1625 if (ret)
1615 goto out_put_req; 1626 goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
1637 return ret; 1648 return ret;
1638} 1649}
1639 1650
1640/* sys_io_submit: 1651long do_io_submit(aio_context_t ctx_id, long nr,
1641 * Queue the nr iocbs pointed to by iocbpp for processing. Returns 1652 struct iocb __user *__user *iocbpp, bool compat)
1642 * the number of iocbs queued. May return -EINVAL if the aio_context
1643 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1644 * *iocbpp[0] is not properly initialized, if the operation specified
1645 * is invalid for the file descriptor in the iocb. May fail with
1646 * -EFAULT if any of the data structures point to invalid data. May
1647 * fail with -EBADF if the file descriptor specified in the first
1648 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1649 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1650 * fail with -ENOSYS if not implemented.
1651 */
1652SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1653 struct iocb __user * __user *, iocbpp)
1654{ 1653{
1655 struct kioctx *ctx; 1654 struct kioctx *ctx;
1656 long ret = 0; 1655 long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1687 break; 1686 break;
1688 } 1687 }
1689 1688
1690 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); 1689 ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
1691 if (ret) 1690 if (ret)
1692 break; 1691 break;
1693 } 1692 }
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1697 return i ? i : ret; 1696 return i ? i : ret;
1698} 1697}
1699 1698
1699/* sys_io_submit:
1700 * Queue the nr iocbs pointed to by iocbpp for processing. Returns
1701 * the number of iocbs queued. May return -EINVAL if the aio_context
1702 * specified by ctx_id is invalid, if nr is < 0, if the iocb at
1703 * *iocbpp[0] is not properly initialized, if the operation specified
1704 * is invalid for the file descriptor in the iocb. May fail with
1705 * -EFAULT if any of the data structures point to invalid data. May
1706 * fail with -EBADF if the file descriptor specified in the first
1707 * iocb is invalid. May fail with -EAGAIN if insufficient resources
1708 * are available to queue any iocbs. Will return 0 if nr is 0. Will
1709 * fail with -ENOSYS if not implemented.
1710 */
1711SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
1712 struct iocb __user * __user *, iocbpp)
1713{
1714 return do_io_submit(ctx_id, nr, iocbpp, 0);
1715}
1716
1700/* lookup_kiocb 1717/* lookup_kiocb
1701 * Finds a given iocb for cancellation. 1718 * Finds a given iocb for cancellation.
1702 */ 1719 */
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
67 * @offset: the new size to assign to the inode 67 * @offset: the new size to assign to the inode
68 * @Returns: 0 on success, -ve errno on failure 68 * @Returns: 0 on success, -ve errno on failure
69 * 69 *
70 * inode_newsize_ok must be called with i_mutex held.
71 *
70 * inode_newsize_ok will check filesystem limits and ulimits to check that the 72 * inode_newsize_ok will check filesystem limits and ulimits to check that the
71 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ 73 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
72 * when necessary. Caller must not proceed with inode size change if failure is 74 * when necessary. Caller must not proceed with inode size change if failure is
73 * returned. @inode must be a file (not directory), with appropriate 75 * returned. @inode must be a file (not directory), with appropriate
74 * permissions to allow truncate (inode_newsize_ok does NOT check these 76 * permissions to allow truncate (inode_newsize_ok does NOT check these
75 * conditions). 77 * conditions).
76 *
77 * inode_newsize_ok must be called with i_mutex held.
78 */ 78 */
79int inode_newsize_ok(const struct inode *inode, loff_t offset) 79int inode_newsize_ok(const struct inode *inode, loff_t offset)
80{ 80{
@@ -104,17 +104,25 @@ out_big:
104} 104}
105EXPORT_SYMBOL(inode_newsize_ok); 105EXPORT_SYMBOL(inode_newsize_ok);
106 106
107int inode_setattr(struct inode * inode, struct iattr * attr) 107/**
108 * generic_setattr - copy simple metadata updates into the generic inode
109 * @inode: the inode to be updated
110 * @attr: the new attributes
111 *
112 * generic_setattr must be called with i_mutex held.
113 *
114 * generic_setattr updates the inode's metadata with that specified
115 * in attr. Noticably missing is inode size update, which is more complex
116 * as it requires pagecache updates. See simple_setsize.
117 *
118 * The inode is not marked as dirty after this operation. The rationale is
119 * that for "simple" filesystems, the struct inode is the inode storage.
120 * The caller is free to mark the inode dirty afterwards if needed.
121 */
122void generic_setattr(struct inode *inode, const struct iattr *attr)
108{ 123{
109 unsigned int ia_valid = attr->ia_valid; 124 unsigned int ia_valid = attr->ia_valid;
110 125
111 if (ia_valid & ATTR_SIZE &&
112 attr->ia_size != i_size_read(inode)) {
113 int error = vmtruncate(inode, attr->ia_size);
114 if (error)
115 return error;
116 }
117
118 if (ia_valid & ATTR_UID) 126 if (ia_valid & ATTR_UID)
119 inode->i_uid = attr->ia_uid; 127 inode->i_uid = attr->ia_uid;
120 if (ia_valid & ATTR_GID) 128 if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
135 mode &= ~S_ISGID; 143 mode &= ~S_ISGID;
136 inode->i_mode = mode; 144 inode->i_mode = mode;
137 } 145 }
146}
147EXPORT_SYMBOL(generic_setattr);
148
149/*
150 * note this function is deprecated, the new truncate sequence should be
151 * used instead -- see eg. simple_setsize, generic_setattr.
152 */
153int inode_setattr(struct inode *inode, const struct iattr *attr)
154{
155 unsigned int ia_valid = attr->ia_valid;
156
157 if (ia_valid & ATTR_SIZE &&
158 attr->ia_size != i_size_read(inode)) {
159 int error;
160
161 error = vmtruncate(inode, attr->ia_size);
162 if (error)
163 return error;
164 }
165
166 generic_setattr(inode, attr);
167
138 mark_inode_dirty(inode); 168 mark_inode_dirty(inode);
139 169
140 return 0; 170 return 0;
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
29 29
30const struct file_operations autofs_root_operations = { 30const struct file_operations autofs_root_operations = {
31 .llseek = generic_file_llseek,
31 .read = generic_read_dir, 32 .read = generic_read_dir,
32 .readdir = autofs_root_readdir, 33 .readdir = autofs_root_readdir,
33 .ioctl = autofs_root_ioctl, 34 .ioctl = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
95 */ 95 */
96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) 96static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
97{ 97{
98 struct autofs_dev_ioctl tmp, *ads; 98 struct autofs_dev_ioctl tmp;
99 99
100 if (copy_from_user(&tmp, in, sizeof(tmp))) 100 if (copy_from_user(&tmp, in, sizeof(tmp)))
101 return ERR_PTR(-EFAULT); 101 return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
103 if (tmp.size < sizeof(tmp)) 103 if (tmp.size < sizeof(tmp))
104 return ERR_PTR(-EINVAL); 104 return ERR_PTR(-EINVAL);
105 105
106 ads = kmalloc(tmp.size, GFP_KERNEL); 106 return memdup_user(in, tmp.size);
107 if (!ads)
108 return ERR_PTR(-ENOMEM);
109
110 if (copy_from_user(ads, in, tmp.size)) {
111 kfree(ads);
112 return ERR_PTR(-EFAULT);
113 }
114
115 return ads;
116} 107}
117 108
118static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) 109static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
736}; 727};
737 728
738static struct miscdevice _autofs_dev_ioctl_misc = { 729static struct miscdevice _autofs_dev_ioctl_misc = {
739 .minor = MISC_DYNAMIC_MINOR, 730 .minor = AUTOFS_MINOR,
740 .name = AUTOFS_DEVICE_NAME, 731 .name = AUTOFS_DEVICE_NAME,
741 .fops = &_dev_ioctl_fops 732 .fops = &_dev_ioctl_fops
742}; 733};
743 734
735MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
736MODULE_ALIAS("devname:autofs");
737
744/* Register/deregister misc character device */ 738/* Register/deregister misc character device */
745int autofs_dev_ioctl_init(void) 739int autofs_dev_ioctl_init(void)
746{ 740{
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/param.h> 19#include <linux/param.h>
20#include <linux/time.h> 20#include <linux/time.h>
21#include <linux/smp_lock.h>
21#include "autofs_i.h" 22#include "autofs_i.h"
22 23
23static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *); 24static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
24static int autofs4_dir_unlink(struct inode *,struct dentry *); 25static int autofs4_dir_unlink(struct inode *,struct dentry *);
25static int autofs4_dir_rmdir(struct inode *,struct dentry *); 26static int autofs4_dir_rmdir(struct inode *,struct dentry *);
26static int autofs4_dir_mkdir(struct inode *,struct dentry *,int); 27static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
27static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long); 28static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
28static int autofs4_dir_open(struct inode *inode, struct file *file); 29static int autofs4_dir_open(struct inode *inode, struct file *file);
29static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *); 30static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
30static void *autofs4_follow_link(struct dentry *, struct nameidata *); 31static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
38 .read = generic_read_dir, 39 .read = generic_read_dir,
39 .readdir = dcache_readdir, 40 .readdir = dcache_readdir,
40 .llseek = dcache_dir_lseek, 41 .llseek = dcache_dir_lseek,
41 .ioctl = autofs4_root_ioctl, 42 .unlocked_ioctl = autofs4_root_ioctl,
42}; 43};
43 44
44const struct file_operations autofs4_dir_operations = { 45const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
902 * ioctl()'s on the root directory is the chief method for the daemon to 903 * ioctl()'s on the root directory is the chief method for the daemon to
903 * generate kernel reactions 904 * generate kernel reactions
904 */ 905 */
905static int autofs4_root_ioctl(struct inode *inode, struct file *filp, 906static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
906 unsigned int cmd, unsigned long arg) 907 unsigned int cmd, unsigned long arg)
907{ 908{
908 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); 909 struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
909 void __user *p = (void __user *)arg; 910 void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
947 return -ENOSYS; 948 return -ENOSYS;
948 } 949 }
949} 950}
951
952static long autofs4_root_ioctl(struct file *filp,
953 unsigned int cmd, unsigned long arg)
954{
955 long ret;
956 struct inode *inode = filp->f_dentry->d_inode;
957
958 lock_kernel();
959 ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
960 unlock_kernel();
961
962 return ret;
963}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
93 return -EIO; 93 return -EIO;
94} 94}
95 95
96static int bad_file_fsync(struct file *file, struct dentry *dentry, 96static int bad_file_fsync(struct file *file, int datasync)
97 int datasync)
98{ 97{
99 return -EIO; 98 return -EIO;
100} 99}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 1e41aadb1068..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
78const struct file_operations bfs_dir_operations = { 78const struct file_operations bfs_dir_operations = {
79 .read = generic_read_dir, 79 .read = generic_read_dir,
80 .readdir = bfs_readdir, 80 .readdir = bfs_readdir,
81 .fsync = simple_fsync, 81 .fsync = generic_file_fsync,
82 .llseek = generic_file_llseek, 82 .llseek = generic_file_llseek,
83}; 83};
84 84
@@ -105,14 +105,12 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode,
105 } 105 }
106 set_bit(ino, info->si_imap); 106 set_bit(ino, info->si_imap);
107 info->si_freei--; 107 info->si_freei--;
108 inode->i_uid = current_fsuid(); 108 inode_init_owner(inode, dir, mode);
109 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
110 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
111 inode->i_blocks = 0; 110 inode->i_blocks = 0;
112 inode->i_op = &bfs_file_inops; 111 inode->i_op = &bfs_file_inops;
113 inode->i_fop = &bfs_file_operations; 112 inode->i_fop = &bfs_file_operations;
114 inode->i_mapping->a_ops = &bfs_aops; 113 inode->i_mapping->a_ops = &bfs_aops;
115 inode->i_mode = mode;
116 inode->i_ino = ino; 114 inode->i_ino = ino;
117 BFS_I(inode)->i_dsk_ino = ino; 115 BFS_I(inode)->i_dsk_ino = ino;
118 BFS_I(inode)->i_sblock = 0; 116 BFS_I(inode)->i_sblock = 0;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
990 990
991 /* clear any space allocated but not loaded */ 991 /* clear any space allocated but not loaded */
992 if (phdr->p_filesz < phdr->p_memsz) { 992 if (phdr->p_filesz < phdr->p_memsz) {
993 ret = clear_user((void *) (seg->addr + phdr->p_filesz), 993 if (clear_user((void *) (seg->addr + phdr->p_filesz),
994 phdr->p_memsz - phdr->p_filesz); 994 phdr->p_memsz - phdr->p_filesz))
995 if (ret) 995 return -EFAULT;
996 return ret;
997 } 996 }
998 997
999 if (mm) { 998 if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1027 struct elf32_fdpic_loadseg *seg; 1026 struct elf32_fdpic_loadseg *seg;
1028 struct elf32_phdr *phdr; 1027 struct elf32_phdr *phdr;
1029 unsigned long load_addr, delta_vaddr; 1028 unsigned long load_addr, delta_vaddr;
1030 int loop, dvset, ret; 1029 int loop, dvset;
1031 1030
1032 load_addr = params->load_addr; 1031 load_addr = params->load_addr;
1033 delta_vaddr = 0; 1032 delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1127 * PT_LOAD */ 1126 * PT_LOAD */
1128 if (prot & PROT_WRITE && disp > 0) { 1127 if (prot & PROT_WRITE && disp > 0) {
1129 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp); 1128 kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
1130 ret = clear_user((void __user *) maddr, disp); 1129 if (clear_user((void __user *) maddr, disp))
1131 if (ret) 1130 return -EFAULT;
1132 return ret;
1133 maddr += disp; 1131 maddr += disp;
1134 } 1132 }
1135 1133
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
1164 if (prot & PROT_WRITE && excess1 > 0) { 1162 if (prot & PROT_WRITE && excess1 > 0) {
1165 kdebug("clear[%d] ad=%lx sz=%lx", 1163 kdebug("clear[%d] ad=%lx sz=%lx",
1166 loop, maddr + phdr->p_filesz, excess1); 1164 loop, maddr + phdr->p_filesz, excess1);
1167 ret = clear_user((void __user *) maddr + phdr->p_filesz, 1165 if (clear_user((void __user *) maddr + phdr->p_filesz,
1168 excess1); 1166 excess1))
1169 if (ret) 1167 return -EFAULT;
1170 return ret;
1171 } 1168 }
1172 1169
1173#else 1170#else
1174 if (excess > 0) { 1171 if (excess > 0) {
1175 kdebug("clear[%d] ad=%lx sz=%lx", 1172 kdebug("clear[%d] ad=%lx sz=%lx",
1176 loop, maddr + phdr->p_filesz, excess); 1173 loop, maddr + phdr->p_filesz, excess);
1177 ret = clear_user((void *) maddr + phdr->p_filesz, excess); 1174 if (clear_user((void *) maddr + phdr->p_filesz, excess))
1178 if (ret) 1175 return -EFAULT;
1179 return ret;
1180 } 1176 }
1181#endif 1177#endif
1182 1178
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..b6ab27ccf214 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,15 +56,22 @@
56#endif 56#endif
57 57
58/* 58/*
59 * User data (stack, data section and bss) needs to be aligned 59 * User data (data section and bss) needs to be aligned.
60 * for the same reasons as SLAB memory is, and to the same amount. 60 * We pick 0x20 here because it is the max value elf2flt has always
61 * Avoid duplicating architecture specific code by using the same 61 * used in producing FLAT files, and because it seems to be large
62 * macro as with SLAB allocation: 62 * enough to make all the gcc alignment related tests happy.
63 */
64#define FLAT_DATA_ALIGN (0x20)
65
66/*
67 * User data (stack) also needs to be aligned.
68 * Here we can be a bit looser than the data sections since this
69 * needs to only meet arch ABI requirements.
63 */ 70 */
64#ifdef ARCH_SLAB_MINALIGN 71#ifdef ARCH_SLAB_MINALIGN
65#define FLAT_DATA_ALIGN (ARCH_SLAB_MINALIGN) 72#define FLAT_STACK_ALIGN (ARCH_SLAB_MINALIGN)
66#else 73#else
67#define FLAT_DATA_ALIGN (sizeof(void *)) 74#define FLAT_STACK_ALIGN (sizeof(void *))
68#endif 75#endif
69 76
70#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ 77#define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */
@@ -129,7 +136,7 @@ static unsigned long create_flat_tables(
129 136
130 sp = (unsigned long *)p; 137 sp = (unsigned long *)p;
131 sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); 138 sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
132 sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN); 139 sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
133 argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0); 140 argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
134 envp = argv + (argc + 1); 141 envp = argv + (argc + 1);
135 142
@@ -589,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm,
589 if (IS_ERR_VALUE(result)) { 596 if (IS_ERR_VALUE(result)) {
590 printk("Unable to read data+bss, errno %d\n", (int)-result); 597 printk("Unable to read data+bss, errno %d\n", (int)-result);
591 do_munmap(current->mm, textpos, text_len); 598 do_munmap(current->mm, textpos, text_len);
592 do_munmap(current->mm, realdatastart, data_len + extra); 599 do_munmap(current->mm, realdatastart, len);
593 ret = result; 600 ret = result;
594 goto err; 601 goto err;
595 } 602 }
@@ -876,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
876 stack_len = TOP_OF_ARGS - bprm->p; /* the strings */ 883 stack_len = TOP_OF_ARGS - bprm->p; /* the strings */
877 stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */ 884 stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
878 stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ 885 stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
879 stack_len += FLAT_DATA_ALIGN - 1; /* reserve for upcoming alignment */ 886 stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
880 887
881 res = load_flat_file(bprm, &libinfo, 0, &stack_len); 888 res = load_flat_file(bprm, &libinfo, 0, &stack_len);
882 if (IS_ERR_VALUE(res)) 889 if (IS_ERR_VALUE(res))
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 6dcee88c2e5d..99d6af811747 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
172 struct file *file = iocb->ki_filp; 172 struct file *file = iocb->ki_filp;
173 struct inode *inode = file->f_mapping->host; 173 struct inode *inode = file->f_mapping->host;
174 174
175 return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode), 175 return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
176 iov, offset, nr_segs, blkdev_get_blocks, NULL); 176 I_BDEV(inode), iov, offset, nr_segs,
177 blkdev_get_blocks, NULL);
177} 178}
178 179
179int __sync_blockdev(struct block_device *bdev, int wait) 180int __sync_blockdev(struct block_device *bdev, int wait)
@@ -245,37 +246,14 @@ struct super_block *freeze_bdev(struct block_device *bdev)
245 sb = get_active_super(bdev); 246 sb = get_active_super(bdev);
246 if (!sb) 247 if (!sb)
247 goto out; 248 goto out;
248 if (sb->s_flags & MS_RDONLY) { 249 error = freeze_super(sb);
249 sb->s_frozen = SB_FREEZE_TRANS; 250 if (error) {
250 up_write(&sb->s_umount); 251 deactivate_super(sb);
252 bdev->bd_fsfreeze_count--;
251 mutex_unlock(&bdev->bd_fsfreeze_mutex); 253 mutex_unlock(&bdev->bd_fsfreeze_mutex);
252 return sb; 254 return ERR_PTR(error);
253 }
254
255 sb->s_frozen = SB_FREEZE_WRITE;
256 smp_wmb();
257
258 sync_filesystem(sb);
259
260 sb->s_frozen = SB_FREEZE_TRANS;
261 smp_wmb();
262
263 sync_blockdev(sb->s_bdev);
264
265 if (sb->s_op->freeze_fs) {
266 error = sb->s_op->freeze_fs(sb);
267 if (error) {
268 printk(KERN_ERR
269 "VFS:Filesystem freeze failed\n");
270 sb->s_frozen = SB_UNFROZEN;
271 deactivate_locked_super(sb);
272 bdev->bd_fsfreeze_count--;
273 mutex_unlock(&bdev->bd_fsfreeze_mutex);
274 return ERR_PTR(error);
275 }
276 } 255 }
277 up_write(&sb->s_umount); 256 deactivate_super(sb);
278
279 out: 257 out:
280 sync_blockdev(bdev); 258 sync_blockdev(bdev);
281 mutex_unlock(&bdev->bd_fsfreeze_mutex); 259 mutex_unlock(&bdev->bd_fsfreeze_mutex);
@@ -296,40 +274,22 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb)
296 274
297 mutex_lock(&bdev->bd_fsfreeze_mutex); 275 mutex_lock(&bdev->bd_fsfreeze_mutex);
298 if (!bdev->bd_fsfreeze_count) 276 if (!bdev->bd_fsfreeze_count)
299 goto out_unlock; 277 goto out;
300 278
301 error = 0; 279 error = 0;
302 if (--bdev->bd_fsfreeze_count > 0) 280 if (--bdev->bd_fsfreeze_count > 0)
303 goto out_unlock; 281 goto out;
304 282
305 if (!sb) 283 if (!sb)
306 goto out_unlock; 284 goto out;
307
308 BUG_ON(sb->s_bdev != bdev);
309 down_write(&sb->s_umount);
310 if (sb->s_flags & MS_RDONLY)
311 goto out_unfrozen;
312
313 if (sb->s_op->unfreeze_fs) {
314 error = sb->s_op->unfreeze_fs(sb);
315 if (error) {
316 printk(KERN_ERR
317 "VFS:Filesystem thaw failed\n");
318 sb->s_frozen = SB_FREEZE_TRANS;
319 bdev->bd_fsfreeze_count++;
320 mutex_unlock(&bdev->bd_fsfreeze_mutex);
321 return error;
322 }
323 }
324
325out_unfrozen:
326 sb->s_frozen = SB_UNFROZEN;
327 smp_wmb();
328 wake_up(&sb->s_wait_unfrozen);
329 285
330 if (sb) 286 error = thaw_super(sb);
331 deactivate_locked_super(sb); 287 if (error) {
332out_unlock: 288 bdev->bd_fsfreeze_count++;
289 mutex_unlock(&bdev->bd_fsfreeze_mutex);
290 return error;
291 }
292out:
333 mutex_unlock(&bdev->bd_fsfreeze_mutex); 293 mutex_unlock(&bdev->bd_fsfreeze_mutex);
334 return 0; 294 return 0;
335} 295}
@@ -350,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
350 struct page **pagep, void **fsdata) 310 struct page **pagep, void **fsdata)
351{ 311{
352 *pagep = NULL; 312 *pagep = NULL;
353 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 313 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
354 blkdev_get_block); 314 pagep, fsdata, blkdev_get_block);
355} 315}
356 316
357static int blkdev_write_end(struct file *file, struct address_space *mapping, 317static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -399,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
399 return retval; 359 return retval;
400} 360}
401 361
402/* 362int blkdev_fsync(struct file *filp, int datasync)
403 * Filp is never NULL; the only case when ->fsync() is called with
404 * NULL first argument is nfsd_sync_dir() and that's not a directory.
405 */
406
407int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
408{ 363{
409 struct inode *bd_inode = filp->f_mapping->host; 364 struct inode *bd_inode = filp->f_mapping->host;
410 struct block_device *bdev = I_BDEV(bd_inode); 365 struct block_device *bdev = I_BDEV(bd_inode);
@@ -417,7 +372,7 @@ int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
417 */ 372 */
418 mutex_unlock(&bd_inode->i_mutex); 373 mutex_unlock(&bd_inode->i_mutex);
419 374
420 error = blkdev_issue_flush(bdev, NULL); 375 error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
421 if (error == -EOPNOTSUPP) 376 if (error == -EOPNOTSUPP)
422 error = 0; 377 error = 0;
423 378
@@ -668,41 +623,233 @@ void bd_forget(struct inode *inode)
668 iput(bdev->bd_inode); 623 iput(bdev->bd_inode);
669} 624}
670 625
671int bd_claim(struct block_device *bdev, void *holder) 626/**
627 * bd_may_claim - test whether a block device can be claimed
628 * @bdev: block device of interest
629 * @whole: whole block device containing @bdev, may equal @bdev
630 * @holder: holder trying to claim @bdev
631 *
632 * Test whther @bdev can be claimed by @holder.
633 *
634 * CONTEXT:
635 * spin_lock(&bdev_lock).
636 *
637 * RETURNS:
638 * %true if @bdev can be claimed, %false otherwise.
639 */
640static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
641 void *holder)
672{ 642{
673 int res;
674 spin_lock(&bdev_lock);
675
676 /* first decide result */
677 if (bdev->bd_holder == holder) 643 if (bdev->bd_holder == holder)
678 res = 0; /* already a holder */ 644 return true; /* already a holder */
679 else if (bdev->bd_holder != NULL) 645 else if (bdev->bd_holder != NULL)
680 res = -EBUSY; /* held by someone else */ 646 return false; /* held by someone else */
681 else if (bdev->bd_contains == bdev) 647 else if (bdev->bd_contains == bdev)
682 res = 0; /* is a whole device which isn't held */ 648 return true; /* is a whole device which isn't held */
683 649
684 else if (bdev->bd_contains->bd_holder == bd_claim) 650 else if (whole->bd_holder == bd_claim)
685 res = 0; /* is a partition of a device that is being partitioned */ 651 return true; /* is a partition of a device that is being partitioned */
686 else if (bdev->bd_contains->bd_holder != NULL) 652 else if (whole->bd_holder != NULL)
687 res = -EBUSY; /* is a partition of a held device */ 653 return false; /* is a partition of a held device */
688 else 654 else
689 res = 0; /* is a partition of an un-held device */ 655 return true; /* is a partition of an un-held device */
656}
690 657
691 /* now impose change */ 658/**
692 if (res==0) { 659 * bd_prepare_to_claim - prepare to claim a block device
693 /* note that for a whole device bd_holders 660 * @bdev: block device of interest
694 * will be incremented twice, and bd_holder will 661 * @whole: the whole device containing @bdev, may equal @bdev
695 * be set to bd_claim before being set to holder 662 * @holder: holder trying to claim @bdev
696 */ 663 *
697 bdev->bd_contains->bd_holders ++; 664 * Prepare to claim @bdev. This function fails if @bdev is already
698 bdev->bd_contains->bd_holder = bd_claim; 665 * claimed by another holder and waits if another claiming is in
699 bdev->bd_holders++; 666 * progress. This function doesn't actually claim. On successful
700 bdev->bd_holder = holder; 667 * return, the caller has ownership of bd_claiming and bd_holder[s].
668 *
669 * CONTEXT:
670 * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
671 * it multiple times.
672 *
673 * RETURNS:
674 * 0 if @bdev can be claimed, -EBUSY otherwise.
675 */
676static int bd_prepare_to_claim(struct block_device *bdev,
677 struct block_device *whole, void *holder)
678{
679retry:
680 /* if someone else claimed, fail */
681 if (!bd_may_claim(bdev, whole, holder))
682 return -EBUSY;
683
684 /* if someone else is claiming, wait for it to finish */
685 if (whole->bd_claiming && whole->bd_claiming != holder) {
686 wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
687 DEFINE_WAIT(wait);
688
689 prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
690 spin_unlock(&bdev_lock);
691 schedule();
692 finish_wait(wq, &wait);
693 spin_lock(&bdev_lock);
694 goto retry;
695 }
696
697 /* yay, all mine */
698 return 0;
699}
700
701/**
702 * bd_start_claiming - start claiming a block device
703 * @bdev: block device of interest
704 * @holder: holder trying to claim @bdev
705 *
706 * @bdev is about to be opened exclusively. Check @bdev can be opened
707 * exclusively and mark that an exclusive open is in progress. Each
708 * successful call to this function must be matched with a call to
709 * either bd_finish_claiming() or bd_abort_claiming() (which do not
710 * fail).
711 *
712 * This function is used to gain exclusive access to the block device
713 * without actually causing other exclusive open attempts to fail. It
714 * should be used when the open sequence itself requires exclusive
715 * access but may subsequently fail.
716 *
717 * CONTEXT:
718 * Might sleep.
719 *
720 * RETURNS:
721 * Pointer to the block device containing @bdev on success, ERR_PTR()
722 * value on failure.
723 */
724static struct block_device *bd_start_claiming(struct block_device *bdev,
725 void *holder)
726{
727 struct gendisk *disk;
728 struct block_device *whole;
729 int partno, err;
730
731 might_sleep();
732
733 /*
734 * @bdev might not have been initialized properly yet, look up
735 * and grab the outer block device the hard way.
736 */
737 disk = get_gendisk(bdev->bd_dev, &partno);
738 if (!disk)
739 return ERR_PTR(-ENXIO);
740
741 whole = bdget_disk(disk, 0);
742 module_put(disk->fops->owner);
743 put_disk(disk);
744 if (!whole)
745 return ERR_PTR(-ENOMEM);
746
747 /* prepare to claim, if successful, mark claiming in progress */
748 spin_lock(&bdev_lock);
749
750 err = bd_prepare_to_claim(bdev, whole, holder);
751 if (err == 0) {
752 whole->bd_claiming = holder;
753 spin_unlock(&bdev_lock);
754 return whole;
755 } else {
756 spin_unlock(&bdev_lock);
757 bdput(whole);
758 return ERR_PTR(err);
701 } 759 }
760}
761
762/* releases bdev_lock */
763static void __bd_abort_claiming(struct block_device *whole, void *holder)
764{
765 BUG_ON(whole->bd_claiming != holder);
766 whole->bd_claiming = NULL;
767 wake_up_bit(&whole->bd_claiming, 0);
768
702 spin_unlock(&bdev_lock); 769 spin_unlock(&bdev_lock);
703 return res; 770 bdput(whole);
704} 771}
705 772
773/**
774 * bd_abort_claiming - abort claiming a block device
775 * @whole: whole block device returned by bd_start_claiming()
776 * @holder: holder trying to claim @bdev
777 *
778 * Abort a claiming block started by bd_start_claiming(). Note that
779 * @whole is not the block device to be claimed but the whole device
780 * returned by bd_start_claiming().
781 *
782 * CONTEXT:
783 * Grabs and releases bdev_lock.
784 */
785static void bd_abort_claiming(struct block_device *whole, void *holder)
786{
787 spin_lock(&bdev_lock);
788 __bd_abort_claiming(whole, holder); /* releases bdev_lock */
789}
790
791/* increment holders when we have a legitimate claim. requires bdev_lock */
792static void __bd_claim(struct block_device *bdev, struct block_device *whole,
793 void *holder)
794{
795 /* note that for a whole device bd_holders
796 * will be incremented twice, and bd_holder will
797 * be set to bd_claim before being set to holder
798 */
799 whole->bd_holders++;
800 whole->bd_holder = bd_claim;
801 bdev->bd_holders++;
802 bdev->bd_holder = holder;
803}
804
805/**
806 * bd_finish_claiming - finish claiming a block device
807 * @bdev: block device of interest (passed to bd_start_claiming())
808 * @whole: whole block device returned by bd_start_claiming()
809 * @holder: holder trying to claim @bdev
810 *
811 * Finish a claiming block started by bd_start_claiming().
812 *
813 * CONTEXT:
814 * Grabs and releases bdev_lock.
815 */
816static void bd_finish_claiming(struct block_device *bdev,
817 struct block_device *whole, void *holder)
818{
819 spin_lock(&bdev_lock);
820 BUG_ON(!bd_may_claim(bdev, whole, holder));
821 __bd_claim(bdev, whole, holder);
822 __bd_abort_claiming(whole, holder); /* not actually an abort */
823}
824
825/**
826 * bd_claim - claim a block device
827 * @bdev: block device to claim
828 * @holder: holder trying to claim @bdev
829 *
830 * Try to claim @bdev which must have been opened successfully.
831 *
832 * CONTEXT:
833 * Might sleep.
834 *
835 * RETURNS:
836 * 0 if successful, -EBUSY if @bdev is already claimed.
837 */
838int bd_claim(struct block_device *bdev, void *holder)
839{
840 struct block_device *whole = bdev->bd_contains;
841 int res;
842
843 might_sleep();
844
845 spin_lock(&bdev_lock);
846 res = bd_prepare_to_claim(bdev, whole, holder);
847 if (res == 0)
848 __bd_claim(bdev, whole, holder);
849 spin_unlock(&bdev_lock);
850
851 return res;
852}
706EXPORT_SYMBOL(bd_claim); 853EXPORT_SYMBOL(bd_claim);
707 854
708void bd_release(struct block_device *bdev) 855void bd_release(struct block_device *bdev)
@@ -1316,6 +1463,7 @@ EXPORT_SYMBOL(blkdev_get);
1316 1463
1317static int blkdev_open(struct inode * inode, struct file * filp) 1464static int blkdev_open(struct inode * inode, struct file * filp)
1318{ 1465{
1466 struct block_device *whole = NULL;
1319 struct block_device *bdev; 1467 struct block_device *bdev;
1320 int res; 1468 int res;
1321 1469
@@ -1338,22 +1486,25 @@ static int blkdev_open(struct inode * inode, struct file * filp)
1338 if (bdev == NULL) 1486 if (bdev == NULL)
1339 return -ENOMEM; 1487 return -ENOMEM;
1340 1488
1489 if (filp->f_mode & FMODE_EXCL) {
1490 whole = bd_start_claiming(bdev, filp);
1491 if (IS_ERR(whole)) {
1492 bdput(bdev);
1493 return PTR_ERR(whole);
1494 }
1495 }
1496
1341 filp->f_mapping = bdev->bd_inode->i_mapping; 1497 filp->f_mapping = bdev->bd_inode->i_mapping;
1342 1498
1343 res = blkdev_get(bdev, filp->f_mode); 1499 res = blkdev_get(bdev, filp->f_mode);
1344 if (res)
1345 return res;
1346 1500
1347 if (filp->f_mode & FMODE_EXCL) { 1501 if (whole) {
1348 res = bd_claim(bdev, filp); 1502 if (res == 0)
1349 if (res) 1503 bd_finish_claiming(bdev, whole, filp);
1350 goto out_blkdev_put; 1504 else
1505 bd_abort_claiming(whole, filp);
1351 } 1506 }
1352 1507
1353 return 0;
1354
1355 out_blkdev_put:
1356 blkdev_put(bdev, filp->f_mode);
1357 return res; 1508 return res;
1358} 1509}
1359 1510
@@ -1564,27 +1715,34 @@ EXPORT_SYMBOL(lookup_bdev);
1564 */ 1715 */
1565struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder) 1716struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
1566{ 1717{
1567 struct block_device *bdev; 1718 struct block_device *bdev, *whole;
1568 int error = 0; 1719 int error;
1569 1720
1570 bdev = lookup_bdev(path); 1721 bdev = lookup_bdev(path);
1571 if (IS_ERR(bdev)) 1722 if (IS_ERR(bdev))
1572 return bdev; 1723 return bdev;
1573 1724
1725 whole = bd_start_claiming(bdev, holder);
1726 if (IS_ERR(whole)) {
1727 bdput(bdev);
1728 return whole;
1729 }
1730
1574 error = blkdev_get(bdev, mode); 1731 error = blkdev_get(bdev, mode);
1575 if (error) 1732 if (error)
1576 return ERR_PTR(error); 1733 goto out_abort_claiming;
1734
1577 error = -EACCES; 1735 error = -EACCES;
1578 if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) 1736 if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
1579 goto blkdev_put; 1737 goto out_blkdev_put;
1580 error = bd_claim(bdev, holder);
1581 if (error)
1582 goto blkdev_put;
1583 1738
1739 bd_finish_claiming(bdev, whole, holder);
1584 return bdev; 1740 return bdev;
1585 1741
1586blkdev_put: 1742out_blkdev_put:
1587 blkdev_put(bdev, mode); 1743 blkdev_put(bdev, mode);
1744out_abort_claiming:
1745 bd_abort_claiming(whole, holder);
1588 return ERR_PTR(error); 1746 return ERR_PTR(error);
1589} 1747}
1590 1748
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6ef7b26724ec..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
60 size = __btrfs_getxattr(inode, name, value, size); 60 size = __btrfs_getxattr(inode, name, value, size);
61 if (size > 0) { 61 if (size > 0) {
62 acl = posix_acl_from_xattr(value, size); 62 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl))
64 return acl;
63 set_cached_acl(inode, type, acl); 65 set_cached_acl(inode, type, acl);
64 } 66 }
65 kfree(value); 67 kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
160 int ret; 162 int ret;
161 struct posix_acl *acl = NULL; 163 struct posix_acl *acl = NULL;
162 164
165 if (!is_owner_or_cap(dentry->d_inode))
166 return -EPERM;
167
168 if (!IS_POSIXACL(dentry->d_inode))
169 return -EOPNOTSUPP;
170
163 if (value) { 171 if (value) {
164 acl = posix_acl_from_xattr(value, size); 172 acl = posix_acl_from_xattr(value, size);
165 if (acl == NULL) { 173 if (acl == NULL) {
@@ -282,14 +290,14 @@ int btrfs_acl_chmod(struct inode *inode)
282 return ret; 290 return ret;
283} 291}
284 292
285struct xattr_handler btrfs_xattr_acl_default_handler = { 293const struct xattr_handler btrfs_xattr_acl_default_handler = {
286 .prefix = POSIX_ACL_XATTR_DEFAULT, 294 .prefix = POSIX_ACL_XATTR_DEFAULT,
287 .flags = ACL_TYPE_DEFAULT, 295 .flags = ACL_TYPE_DEFAULT,
288 .get = btrfs_xattr_acl_get, 296 .get = btrfs_xattr_acl_get,
289 .set = btrfs_xattr_acl_set, 297 .set = btrfs_xattr_acl_set,
290}; 298};
291 299
292struct xattr_handler btrfs_xattr_acl_access_handler = { 300const struct xattr_handler btrfs_xattr_acl_access_handler = {
293 .prefix = POSIX_ACL_XATTR_ACCESS, 301 .prefix = POSIX_ACL_XATTR_ACCESS,
294 .flags = ACL_TYPE_ACCESS, 302 .flags = ACL_TYPE_ACCESS,
295 .get = btrfs_xattr_acl_get, 303 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
377 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
378 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
379 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
380 goto again; 381 goto again;
381 } 382 }
382 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
156 /* 157 /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
281 struct btrfs_root *root, 281 struct btrfs_root *root,
282 struct extent_buffer *buf, 282 struct extent_buffer *buf,
283 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
284{ 285{
285 u64 refs; 286 u64 refs;
286 u64 owner; 287 u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
366 BUG_ON(ret); 367 BUG_ON(ret);
367 } 368 }
368 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
369 } 371 }
370 return 0; 372 return 0;
371} 373}
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
392 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
393 struct extent_buffer *cow; 395 struct extent_buffer *cow;
394 int level; 396 int level;
397 int last_ref = 0;
395 int unlock_orig = 0; 398 int unlock_orig = 0;
396 u64 parent_start; 399 u64 parent_start;
397 400
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
442 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
443 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
444 447
445 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
446 452
447 if (buf == root->node) { 453 if (buf == root->node) {
448 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
457 extent_buffer_get(cow); 463 extent_buffer_get(cow);
458 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
459 465
460 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
461 parent_start, root->root_key.objectid, level); 467 last_ref);
462 free_extent_buffer(buf); 468 free_extent_buffer(buf);
463 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
464 } else { 470 } else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
473 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
474 trans->transid); 480 trans->transid);
475 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
476 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
477 parent_start, root->root_key.objectid, level); 483 last_ref);
478 } 484 }
479 if (unlock_orig) 485 if (unlock_orig)
480 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
949 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
950} 956}
951 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
952/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
953 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
954 * NULL is returned on error. 976 * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1019 btrfs_tree_lock(child); 1041 btrfs_tree_lock(child);
1020 btrfs_set_lock_blocking(child); 1042 btrfs_set_lock_blocking(child);
1021 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1043 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1022 BUG_ON(ret); 1044 if (ret) {
1045 btrfs_tree_unlock(child);
1046 free_extent_buffer(child);
1047 goto enospc;
1048 }
1023 1049
1024 spin_lock(&root->node_lock); 1050 spin_lock(&root->node_lock);
1025 root->node = child; 1051 root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1034 btrfs_tree_unlock(mid); 1060 btrfs_tree_unlock(mid);
1035 /* once for the path */ 1061 /* once for the path */
1036 free_extent_buffer(mid); 1062 free_extent_buffer(mid);
1037 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1063
1038 0, root->root_key.objectid, level); 1064 root_sub_used(root, mid->len);
1065 btrfs_free_tree_block(trans, root, mid, 0, 1);
1039 /* once for the root ptr */ 1066 /* once for the root ptr */
1040 free_extent_buffer(mid); 1067 free_extent_buffer(mid);
1041 return ret; 1068 return 0;
1042 } 1069 }
1043 if (btrfs_header_nritems(mid) > 1070 if (btrfs_header_nritems(mid) >
1044 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1071 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1088 if (wret < 0 && wret != -ENOSPC) 1115 if (wret < 0 && wret != -ENOSPC)
1089 ret = wret; 1116 ret = wret;
1090 if (btrfs_header_nritems(right) == 0) { 1117 if (btrfs_header_nritems(right) == 0) {
1091 u64 bytenr = right->start;
1092 u32 blocksize = right->len;
1093
1094 clean_tree_block(trans, root, right); 1118 clean_tree_block(trans, root, right);
1095 btrfs_tree_unlock(right); 1119 btrfs_tree_unlock(right);
1096 free_extent_buffer(right);
1097 right = NULL;
1098 wret = del_ptr(trans, root, path, level + 1, pslot + 1120 wret = del_ptr(trans, root, path, level + 1, pslot +
1099 1); 1121 1);
1100 if (wret) 1122 if (wret)
1101 ret = wret; 1123 ret = wret;
1102 wret = btrfs_free_tree_block(trans, root, 1124 root_sub_used(root, right->len);
1103 bytenr, blocksize, 0, 1125 btrfs_free_tree_block(trans, root, right, 0, 1);
1104 root->root_key.objectid, 1126 free_extent_buffer(right);
1105 level); 1127 right = NULL;
1106 if (wret)
1107 ret = wret;
1108 } else { 1128 } else {
1109 struct btrfs_disk_key right_key; 1129 struct btrfs_disk_key right_key;
1110 btrfs_node_key(right, &right_key, 0); 1130 btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1136 BUG_ON(wret == 1); 1156 BUG_ON(wret == 1);
1137 } 1157 }
1138 if (btrfs_header_nritems(mid) == 0) { 1158 if (btrfs_header_nritems(mid) == 0) {
1139 /* we've managed to empty the middle node, drop it */
1140 u64 bytenr = mid->start;
1141 u32 blocksize = mid->len;
1142
1143 clean_tree_block(trans, root, mid); 1159 clean_tree_block(trans, root, mid);
1144 btrfs_tree_unlock(mid); 1160 btrfs_tree_unlock(mid);
1145 free_extent_buffer(mid);
1146 mid = NULL;
1147 wret = del_ptr(trans, root, path, level + 1, pslot); 1161 wret = del_ptr(trans, root, path, level + 1, pslot);
1148 if (wret) 1162 if (wret)
1149 ret = wret; 1163 ret = wret;
1150 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1164 root_sub_used(root, mid->len);
1151 0, root->root_key.objectid, level); 1165 btrfs_free_tree_block(trans, root, mid, 0, 1);
1152 if (wret) 1166 free_extent_buffer(mid);
1153 ret = wret; 1167 mid = NULL;
1154 } else { 1168 } else {
1155 /* update the parent key to reflect our changes */ 1169 /* update the parent key to reflect our changes */
1156 struct btrfs_disk_key mid_key; 1170 struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1590 btrfs_release_path(NULL, p); 1604 btrfs_release_path(NULL, p);
1591 1605
1592 ret = -EAGAIN; 1606 ret = -EAGAIN;
1593 tmp = read_tree_block(root, blocknr, blocksize, gen); 1607 tmp = read_tree_block(root, blocknr, blocksize, 0);
1594 if (tmp) { 1608 if (tmp) {
1595 /* 1609 /*
1596 * If the read above didn't mark this buffer up to date, 1610 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
1740 p->nodes[level + 1], 1754 p->nodes[level + 1],
1741 p->slots[level + 1], &b); 1755 p->slots[level + 1], &b);
1742 if (err) { 1756 if (err) {
1743 free_extent_buffer(b);
1744 ret = err; 1757 ret = err;
1745 goto done; 1758 goto done;
1746 } 1759 }
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2076 if (IS_ERR(c)) 2089 if (IS_ERR(c))
2077 return PTR_ERR(c); 2090 return PTR_ERR(c);
2078 2091
2092 root_add_used(root, root->nodesize);
2093
2079 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2094 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2080 btrfs_set_header_nritems(c, 1); 2095 btrfs_set_header_nritems(c, 1);
2081 btrfs_set_header_level(c, level); 2096 btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2134 int nritems; 2149 int nritems;
2135 2150
2136 BUG_ON(!path->nodes[level]); 2151 BUG_ON(!path->nodes[level]);
2152 btrfs_assert_tree_locked(path->nodes[level]);
2137 lower = path->nodes[level]; 2153 lower = path->nodes[level];
2138 nritems = btrfs_header_nritems(lower); 2154 nritems = btrfs_header_nritems(lower);
2139 BUG_ON(slot > nritems); 2155 BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2202 if (IS_ERR(split)) 2218 if (IS_ERR(split))
2203 return PTR_ERR(split); 2219 return PTR_ERR(split);
2204 2220
2221 root_add_used(root, root->nodesize);
2222
2205 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2223 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2206 btrfs_set_header_level(split, btrfs_header_level(c)); 2224 btrfs_set_header_level(split, btrfs_header_level(c));
2207 btrfs_set_header_bytenr(split, split->start); 2225 btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2415 2433
2416 if (left_nritems) 2434 if (left_nritems)
2417 btrfs_mark_buffer_dirty(left); 2435 btrfs_mark_buffer_dirty(left);
2436 else
2437 clean_tree_block(trans, root, left);
2438
2418 btrfs_mark_buffer_dirty(right); 2439 btrfs_mark_buffer_dirty(right);
2419 2440
2420 btrfs_item_key(right, &disk_key, 0); 2441 btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2660 btrfs_mark_buffer_dirty(left); 2681 btrfs_mark_buffer_dirty(left);
2661 if (right_nritems) 2682 if (right_nritems)
2662 btrfs_mark_buffer_dirty(right); 2683 btrfs_mark_buffer_dirty(right);
2684 else
2685 clean_tree_block(trans, root, right);
2663 2686
2664 btrfs_item_key(right, &disk_key, 0); 2687 btrfs_item_key(right, &disk_key, 0);
2665 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2688 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2669 /* then fixup the leaf pointer in the path */ 2692 /* then fixup the leaf pointer in the path */
2670 if (path->slots[0] < push_items) { 2693 if (path->slots[0] < push_items) {
2671 path->slots[0] += old_left_nritems; 2694 path->slots[0] += old_left_nritems;
2672 if (btrfs_header_nritems(path->nodes[0]) == 0)
2673 clean_tree_block(trans, root, path->nodes[0]);
2674 btrfs_tree_unlock(path->nodes[0]); 2695 btrfs_tree_unlock(path->nodes[0]);
2675 free_extent_buffer(path->nodes[0]); 2696 free_extent_buffer(path->nodes[0]);
2676 path->nodes[0] = left; 2697 path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
2932 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 2953 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2933 root->root_key.objectid, 2954 root->root_key.objectid,
2934 &disk_key, 0, l->start, 0); 2955 &disk_key, 0, l->start, 0);
2935 if (IS_ERR(right)) { 2956 if (IS_ERR(right))
2936 BUG_ON(1);
2937 return PTR_ERR(right); 2957 return PTR_ERR(right);
2938 } 2958
2959 root_add_used(root, root->leafsize);
2939 2960
2940 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 2961 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2941 btrfs_set_header_bytenr(right, right->start); 2962 btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3054 3075
3055 btrfs_set_path_blocking(path); 3076 btrfs_set_path_blocking(path);
3056 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3077 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3057 BUG_ON(ret); 3078 if (ret)
3079 goto err;
3058 3080
3059 path->keep_locks = 0; 3081 path->keep_locks = 0;
3060 btrfs_unlock_up_safe(path, 1); 3082 btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3796 */ 3818 */
3797 btrfs_unlock_up_safe(path, 0); 3819 btrfs_unlock_up_safe(path, 0);
3798 3820
3799 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3821 root_sub_used(root, leaf->len);
3800 0, root->root_key.objectid, 0); 3822
3801 return ret; 3823 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3824 return 0;
3802} 3825}
3803/* 3826/*
3804 * delete the item at the leaf level in path. If that empties 3827 * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3865 if (leaf == root->node) { 3888 if (leaf == root->node) {
3866 btrfs_set_header_level(leaf, 0); 3889 btrfs_set_header_level(leaf, 0);
3867 } else { 3890 } else {
3891 btrfs_set_path_blocking(path);
3892 clean_tree_block(trans, root, leaf);
3868 ret = btrfs_del_leaf(trans, root, path, leaf); 3893 ret = btrfs_del_leaf(trans, root, path, leaf);
3869 BUG_ON(ret); 3894 BUG_ON(ret);
3870 } 3895 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
34 34
35struct btrfs_trans_handle; 35struct btrfs_trans_handle;
36struct btrfs_transaction; 36struct btrfs_transaction;
37struct btrfs_pending_snapshot;
37extern struct kmem_cache *btrfs_trans_handle_cachep; 38extern struct kmem_cache *btrfs_trans_handle_cachep;
38extern struct kmem_cache *btrfs_transaction_cachep; 39extern struct kmem_cache *btrfs_transaction_cachep;
39extern struct kmem_cache *btrfs_bit_radix_cachep; 40extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
663#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 664#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
664#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 665#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
665#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 666#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
667#define BTRFS_NR_RAID_TYPES 5
666 668
667struct btrfs_block_group_item { 669struct btrfs_block_group_item {
668 __le64 used; 670 __le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
674 u64 flags; 676 u64 flags;
675 677
676 u64 total_bytes; /* total bytes in the space */ 678 u64 total_bytes; /* total bytes in the space */
677 u64 bytes_used; /* total bytes used on disk */ 679 u64 bytes_used; /* total bytes used,
680 this does't take mirrors into account */
678 u64 bytes_pinned; /* total bytes pinned, will be freed when the 681 u64 bytes_pinned; /* total bytes pinned, will be freed when the
679 transaction finishes */ 682 transaction finishes */
680 u64 bytes_reserved; /* total bytes the allocator has reserved for 683 u64 bytes_reserved; /* total bytes the allocator has reserved for
681 current allocations */ 684 current allocations */
682 u64 bytes_readonly; /* total bytes that are read only */ 685 u64 bytes_readonly; /* total bytes that are read only */
683 u64 bytes_super; /* total bytes reserved for the super blocks */ 686
684 u64 bytes_root; /* the number of bytes needed to commit a
685 transaction */
686 u64 bytes_may_use; /* number of bytes that may be used for 687 u64 bytes_may_use; /* number of bytes that may be used for
687 delalloc/allocations */ 688 delalloc/allocations */
688 u64 bytes_delalloc; /* number of bytes currently reserved for 689 u64 disk_used; /* total bytes used on disk */
689 delayed allocation */
690 690
691 int full; /* indicates that we cannot allocate any more 691 int full; /* indicates that we cannot allocate any more
692 chunks for this space */ 692 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 693 int force_alloc; /* set if we need to force a chunk alloc for
694 this space */ 694 this space */
695 int force_delalloc; /* make people start doing filemap_flush until
696 we're under a threshold */
697 695
698 struct list_head list; 696 struct list_head list;
699 697
700 /* for controlling how we free up space for allocations */
701 wait_queue_head_t allocate_wait;
702 wait_queue_head_t flush_wait;
703 int allocating_chunk;
704 int flushing;
705
706 /* for block groups in our same type */ 698 /* for block groups in our same type */
707 struct list_head block_groups; 699 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
708 spinlock_t lock; 700 spinlock_t lock;
709 struct rw_semaphore groups_sem; 701 struct rw_semaphore groups_sem;
710 atomic_t caching_threads; 702 atomic_t caching_threads;
711}; 703};
712 704
705struct btrfs_block_rsv {
706 u64 size;
707 u64 reserved;
708 u64 freed[2];
709 struct btrfs_space_info *space_info;
710 struct list_head list;
711 spinlock_t lock;
712 atomic_t usage;
713 unsigned int priority:8;
714 unsigned int durable:1;
715 unsigned int refill_used:1;
716 unsigned int full:1;
717};
718
713/* 719/*
714 * free clusters are used to claim free space in relatively large chunks, 720 * free clusters are used to claim free space in relatively large chunks,
715 * allowing us to do less seeky writes. They are used for all metadata 721 * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
760 spinlock_t lock; 766 spinlock_t lock;
761 u64 pinned; 767 u64 pinned;
762 u64 reserved; 768 u64 reserved;
769 u64 reserved_pinned;
763 u64 bytes_super; 770 u64 bytes_super;
764 u64 flags; 771 u64 flags;
765 u64 sectorsize; 772 u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
825 /* logical->physical extent mapping */ 832 /* logical->physical extent mapping */
826 struct btrfs_mapping_tree mapping_tree; 833 struct btrfs_mapping_tree mapping_tree;
827 834
835 /* block reservation for extent, checksum and root tree */
836 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv;
839 /* block reservation for metadata operations */
840 struct btrfs_block_rsv trans_block_rsv;
841 /* block reservation for chunk tree */
842 struct btrfs_block_rsv chunk_block_rsv;
843
844 struct btrfs_block_rsv empty_block_rsv;
845
846 /* list of block reservations that cross multiple transactions */
847 struct list_head durable_block_rsv_list;
848
849 struct mutex durable_block_rsv_mutex;
850
828 u64 generation; 851 u64 generation;
829 u64 last_trans_committed; 852 u64 last_trans_committed;
830 853
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
927 struct btrfs_workers endio_meta_write_workers; 950 struct btrfs_workers endio_meta_write_workers;
928 struct btrfs_workers endio_write_workers; 951 struct btrfs_workers endio_write_workers;
929 struct btrfs_workers submit_workers; 952 struct btrfs_workers submit_workers;
930 struct btrfs_workers enospc_workers;
931 /* 953 /*
932 * fixup workers take dirty pages that didn't properly go through 954 * fixup workers take dirty pages that didn't properly go through
933 * the cow mechanism and make them safe to write. It happens 955 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
943 int do_barriers; 965 int do_barriers;
944 int closing; 966 int closing;
945 int log_root_recovering; 967 int log_root_recovering;
968 int enospc_unlink;
946 969
947 u64 total_pinned; 970 u64 total_pinned;
948 971
@@ -1012,6 +1035,9 @@ struct btrfs_root {
1012 struct completion kobj_unregister; 1035 struct completion kobj_unregister;
1013 struct mutex objectid_mutex; 1036 struct mutex objectid_mutex;
1014 1037
1038 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv;
1040
1015 struct mutex log_mutex; 1041 struct mutex log_mutex;
1016 wait_queue_head_t log_writer_wait; 1042 wait_queue_head_t log_writer_wait;
1017 wait_queue_head_t log_commit_wait[2]; 1043 wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
1043 int ref_cows; 1069 int ref_cows;
1044 int track_dirty; 1070 int track_dirty;
1045 int in_radix; 1071 int in_radix;
1046 int clean_orphans;
1047 1072
1048 u64 defrag_trans_start; 1073 u64 defrag_trans_start;
1049 struct btrfs_key defrag_progress; 1074 struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
1057 1082
1058 struct list_head root_list; 1083 struct list_head root_list;
1059 1084
1060 spinlock_t list_lock; 1085 spinlock_t orphan_lock;
1061 struct list_head orphan_list; 1086 struct list_head orphan_list;
1087 struct btrfs_block_rsv *orphan_block_rsv;
1088 int orphan_item_inserted;
1089 int orphan_cleanup_state;
1062 1090
1063 spinlock_t inode_lock; 1091 spinlock_t inode_lock;
1064 /* red-black tree that keeps track of in-memory inodes */ 1092 /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1965int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1966 struct btrfs_root *root, unsigned long count); 1994 struct btrfs_root *root, unsigned long count);
1967int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 1995int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
1996int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1997 struct btrfs_root *root, u64 bytenr,
1998 u64 num_bytes, u64 *refs, u64 *flags);
1968int btrfs_pin_extent(struct btrfs_root *root, 1999int btrfs_pin_extent(struct btrfs_root *root,
1969 u64 bytenr, u64 num, int reserved); 2000 u64 bytenr, u64 num, int reserved);
1970int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1984 u64 parent, u64 root_objectid, 2015 u64 parent, u64 root_objectid,
1985 struct btrfs_disk_key *key, int level, 2016 struct btrfs_disk_key *key, int level,
1986 u64 hint, u64 empty_size); 2017 u64 hint, u64 empty_size);
1987int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2018void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1988 struct btrfs_root *root, 2019 struct btrfs_root *root,
1989 u64 bytenr, u32 blocksize, 2020 struct extent_buffer *buf,
1990 u64 parent, u64 root_objectid, int level); 2021 u64 parent, int last_ref);
1991struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2022struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1992 struct btrfs_root *root, 2023 struct btrfs_root *root,
1993 u64 bytenr, u32 blocksize, 2024 u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2041 u64 size); 2072 u64 size);
2042int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2043 struct btrfs_root *root, u64 group_start); 2074 struct btrfs_root *root, u64 group_start);
2044int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2045 struct btrfs_block_group_cache *group);
2046
2047u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2048void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2049void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2050 2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2051int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items); 2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2052int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items); 2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2053int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root, 2081 struct btrfs_root *root,
2054 struct inode *inode, int num_items); 2082 int num_items, int *retries);
2055int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root, 2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2056 struct inode *inode, int num_items); 2084 struct btrfs_root *root);
2057int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2058 u64 bytes); 2086 struct inode *inode);
2059void btrfs_free_reserved_data_space(struct btrfs_root *root, 2087void btrfs_orphan_release_metadata(struct inode *inode);
2060 struct inode *inode, u64 bytes); 2088int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2061void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, 2089 struct btrfs_pending_snapshot *pending);
2062 u64 bytes); 2090int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2063void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 2091void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2064 u64 bytes); 2092int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2093void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2094void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2095struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2096void btrfs_free_block_rsv(struct btrfs_root *root,
2097 struct btrfs_block_rsv *rsv);
2098void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2099 struct btrfs_block_rsv *rsv);
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv,
2107 u64 min_reserved, int min_factor);
2108int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2109 struct btrfs_block_rsv *dst_rsv,
2110 u64 num_bytes);
2111void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes);
2114int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache);
2065/* ctree.c */ 2118/* ctree.c */
2066int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2067 int level, int *slot); 2120 int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2152int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2205int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2206int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2154int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2207int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2155int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2208int btrfs_drop_snapshot(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv, int update_ref);
2156int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2210int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2157 struct btrfs_root *root, 2211 struct btrfs_root *root,
2158 struct extent_buffer *node, 2212 struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2245 struct btrfs_root *root, 2299 struct btrfs_root *root,
2246 const char *name, int name_len, 2300 const char *name, int name_len,
2247 u64 inode_objectid, u64 ref_objectid, u64 *index); 2301 u64 inode_objectid, u64 ref_objectid, u64 *index);
2302struct btrfs_inode_ref *
2303btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2304 struct btrfs_root *root,
2305 struct btrfs_path *path,
2306 const char *name, int name_len,
2307 u64 inode_objectid, u64 ref_objectid, int mod);
2248int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2308int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2249 struct btrfs_root *root, 2309 struct btrfs_root *root,
2250 struct btrfs_path *path, u64 objectid); 2310 struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2257 struct btrfs_root *root, u64 bytenr, u64 len); 2317 struct btrfs_root *root, u64 bytenr, u64 len);
2258int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2318int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2259 struct bio *bio, u32 *dst); 2319 struct bio *bio, u32 *dst);
2320int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2321 struct bio *bio, u64 logical_offset, u32 *dst);
2260int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2322int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2261 struct btrfs_root *root, 2323 struct btrfs_root *root,
2262 u64 objectid, u64 pos, 2324 u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2311 u32 min_type); 2373 u32 min_type);
2312 2374
2313int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2314int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2315 struct extent_state **cached_state); 2378 struct extent_state **cached_state);
2316int btrfs_writepages(struct address_space *mapping, 2379int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2349int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2350int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2351void btrfs_orphan_cleanup(struct btrfs_root *root); 2414void btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve);
2418void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root);
2352int btrfs_cont_expand(struct inode *inode, loff_t size); 2422int btrfs_cont_expand(struct inode *inode, loff_t size);
2353int btrfs_invalidate_inodes(struct btrfs_root *root); 2423int btrfs_invalidate_inodes(struct btrfs_root *root);
2354void btrfs_add_delayed_iput(struct inode *inode); 2424void btrfs_add_delayed_iput(struct inode *inode);
2355void btrfs_run_delayed_iputs(struct btrfs_root *root); 2425void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint);
2356extern const struct dentry_operations btrfs_dentry_operations; 2429extern const struct dentry_operations btrfs_dentry_operations;
2357 2430
2358/* ioctl.c */ 2431/* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
2361void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2362 2435
2363/* file.c */ 2436/* file.c */
2364int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2437int btrfs_sync_file(struct file *file, int datasync);
2365int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2366 int skip_pinned); 2439 int skip_pinned);
2367int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2409 struct btrfs_root *root); 2482 struct btrfs_root *root);
2410int btrfs_recover_relocation(struct btrfs_root *root); 2483int btrfs_recover_relocation(struct btrfs_root *root);
2411int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2484int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2485void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2486 struct btrfs_root *root, struct extent_buffer *buf,
2487 struct extent_buffer *cow);
2488void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2489 struct btrfs_pending_snapshot *pending,
2490 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending);
2412#endif 2493#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
319} 319}
320 320
321/* 321/*
322 * helper function to lookup reference count and flags of extent.
323 *
324 * the head node for delayed ref is used to store the sum of all the
325 * reference count modifications queued up in the rbtree. the head
326 * node may also store the extent flags to set. This way you can check
327 * to see what the reference count and extent flags would be if all of
328 * the delayed refs are not processed.
329 */
330int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
331 struct btrfs_root *root, u64 bytenr,
332 u64 num_bytes, u64 *refs, u64 *flags)
333{
334 struct btrfs_delayed_ref_node *ref;
335 struct btrfs_delayed_ref_head *head;
336 struct btrfs_delayed_ref_root *delayed_refs;
337 struct btrfs_path *path;
338 struct btrfs_extent_item *ei;
339 struct extent_buffer *leaf;
340 struct btrfs_key key;
341 u32 item_size;
342 u64 num_refs;
343 u64 extent_flags;
344 int ret;
345
346 path = btrfs_alloc_path();
347 if (!path)
348 return -ENOMEM;
349
350 key.objectid = bytenr;
351 key.type = BTRFS_EXTENT_ITEM_KEY;
352 key.offset = num_bytes;
353 delayed_refs = &trans->transaction->delayed_refs;
354again:
355 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
356 &key, path, 0, 0);
357 if (ret < 0)
358 goto out;
359
360 if (ret == 0) {
361 leaf = path->nodes[0];
362 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
363 if (item_size >= sizeof(*ei)) {
364 ei = btrfs_item_ptr(leaf, path->slots[0],
365 struct btrfs_extent_item);
366 num_refs = btrfs_extent_refs(leaf, ei);
367 extent_flags = btrfs_extent_flags(leaf, ei);
368 } else {
369#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
370 struct btrfs_extent_item_v0 *ei0;
371 BUG_ON(item_size != sizeof(*ei0));
372 ei0 = btrfs_item_ptr(leaf, path->slots[0],
373 struct btrfs_extent_item_v0);
374 num_refs = btrfs_extent_refs_v0(leaf, ei0);
375 /* FIXME: this isn't correct for data */
376 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
377#else
378 BUG();
379#endif
380 }
381 BUG_ON(num_refs == 0);
382 } else {
383 num_refs = 0;
384 extent_flags = 0;
385 ret = 0;
386 }
387
388 spin_lock(&delayed_refs->lock);
389 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
390 if (ref) {
391 head = btrfs_delayed_node_to_head(ref);
392 if (!mutex_trylock(&head->mutex)) {
393 atomic_inc(&ref->refs);
394 spin_unlock(&delayed_refs->lock);
395
396 btrfs_release_path(root->fs_info->extent_root, path);
397
398 mutex_lock(&head->mutex);
399 mutex_unlock(&head->mutex);
400 btrfs_put_delayed_ref(ref);
401 goto again;
402 }
403 if (head->extent_op && head->extent_op->update_flags)
404 extent_flags |= head->extent_op->flags_to_set;
405 else
406 BUG_ON(num_refs == 0);
407
408 num_refs += ref->ref_mod;
409 mutex_unlock(&head->mutex);
410 }
411 WARN_ON(num_refs == 0);
412 if (refs)
413 *refs = num_refs;
414 if (flags)
415 *flags = extent_flags;
416out:
417 spin_unlock(&delayed_refs->lock);
418 btrfs_free_path(path);
419 return ret;
420}
421
422/*
423 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
424 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
425 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
74 int rw; 74 int rw;
75 int mirror_num; 75 int mirror_num;
76 unsigned long bio_flags; 76 unsigned long bio_flags;
77 /*
78 * bio_offset is optional, can be used if the pages in the bio
79 * can't tell us where in the file the bio should go
80 */
81 u64 bio_offset;
77 struct btrfs_work work; 82 struct btrfs_work work;
78}; 83};
79 84
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
534 async = container_of(work, struct async_submit_bio, work); 539 async = container_of(work, struct async_submit_bio, work);
535 fs_info = BTRFS_I(async->inode)->root->fs_info; 540 fs_info = BTRFS_I(async->inode)->root->fs_info;
536 async->submit_bio_start(async->inode, async->rw, async->bio, 541 async->submit_bio_start(async->inode, async->rw, async->bio,
537 async->mirror_num, async->bio_flags); 542 async->mirror_num, async->bio_flags,
543 async->bio_offset);
538} 544}
539 545
540static void run_one_async_done(struct btrfs_work *work) 546static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
556 wake_up(&fs_info->async_submit_wait); 562 wake_up(&fs_info->async_submit_wait);
557 563
558 async->submit_bio_done(async->inode, async->rw, async->bio, 564 async->submit_bio_done(async->inode, async->rw, async->bio,
559 async->mirror_num, async->bio_flags); 565 async->mirror_num, async->bio_flags,
566 async->bio_offset);
560} 567}
561 568
562static void run_one_async_free(struct btrfs_work *work) 569static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
570int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 577int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
571 int rw, struct bio *bio, int mirror_num, 578 int rw, struct bio *bio, int mirror_num,
572 unsigned long bio_flags, 579 unsigned long bio_flags,
580 u64 bio_offset,
573 extent_submit_bio_hook_t *submit_bio_start, 581 extent_submit_bio_hook_t *submit_bio_start,
574 extent_submit_bio_hook_t *submit_bio_done) 582 extent_submit_bio_hook_t *submit_bio_done)
575{ 583{
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
592 600
593 async->work.flags = 0; 601 async->work.flags = 0;
594 async->bio_flags = bio_flags; 602 async->bio_flags = bio_flags;
603 async->bio_offset = bio_offset;
595 604
596 atomic_inc(&fs_info->nr_async_submits); 605 atomic_inc(&fs_info->nr_async_submits);
597 606
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
627 636
628static int __btree_submit_bio_start(struct inode *inode, int rw, 637static int __btree_submit_bio_start(struct inode *inode, int rw,
629 struct bio *bio, int mirror_num, 638 struct bio *bio, int mirror_num,
630 unsigned long bio_flags) 639 unsigned long bio_flags,
640 u64 bio_offset)
631{ 641{
632 /* 642 /*
633 * when we're called for a write, we're already in the async 643 * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
638} 648}
639 649
640static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 650static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
641 int mirror_num, unsigned long bio_flags) 651 int mirror_num, unsigned long bio_flags,
652 u64 bio_offset)
642{ 653{
643 /* 654 /*
644 * when we're called for a write, we're already in the async 655 * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
648} 659}
649 660
650static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 661static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
651 int mirror_num, unsigned long bio_flags) 662 int mirror_num, unsigned long bio_flags,
663 u64 bio_offset)
652{ 664{
653 int ret; 665 int ret;
654 666
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
671 */ 683 */
672 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 684 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
673 inode, rw, bio, mirror_num, 0, 685 inode, rw, bio, mirror_num, 0,
686 bio_offset,
674 __btree_submit_bio_start, 687 __btree_submit_bio_start,
675 __btree_submit_bio_done); 688 __btree_submit_bio_done);
676} 689}
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
894 root->ref_cows = 0; 907 root->ref_cows = 0;
895 root->track_dirty = 0; 908 root->track_dirty = 0;
896 root->in_radix = 0; 909 root->in_radix = 0;
897 root->clean_orphans = 0; 910 root->orphan_item_inserted = 0;
911 root->orphan_cleanup_state = 0;
898 912
899 root->fs_info = fs_info; 913 root->fs_info = fs_info;
900 root->objectid = objectid; 914 root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
903 root->name = NULL; 917 root->name = NULL;
904 root->in_sysfs = 0; 918 root->in_sysfs = 0;
905 root->inode_tree = RB_ROOT; 919 root->inode_tree = RB_ROOT;
920 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL;
906 922
907 INIT_LIST_HEAD(&root->dirty_list); 923 INIT_LIST_HEAD(&root->dirty_list);
908 INIT_LIST_HEAD(&root->orphan_list); 924 INIT_LIST_HEAD(&root->orphan_list);
909 INIT_LIST_HEAD(&root->root_list); 925 INIT_LIST_HEAD(&root->root_list);
910 spin_lock_init(&root->node_lock); 926 spin_lock_init(&root->node_lock);
911 spin_lock_init(&root->list_lock); 927 spin_lock_init(&root->orphan_lock);
912 spin_lock_init(&root->inode_lock); 928 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock);
913 mutex_init(&root->objectid_mutex); 930 mutex_init(&root->objectid_mutex);
914 mutex_init(&root->log_mutex); 931 mutex_init(&root->log_mutex);
915 init_waitqueue_head(&root->log_writer_wait); 932 init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
968 return 0; 985 return 0;
969} 986}
970 987
971int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
972 struct btrfs_fs_info *fs_info)
973{
974 struct extent_buffer *eb;
975 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
976 u64 start = 0;
977 u64 end = 0;
978 int ret;
979
980 if (!log_root_tree)
981 return 0;
982
983 while (1) {
984 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
985 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
986 if (ret)
987 break;
988
989 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
990 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
991 }
992 eb = fs_info->log_root_tree->node;
993
994 WARN_ON(btrfs_header_level(eb) != 0);
995 WARN_ON(btrfs_header_nritems(eb) != 0);
996
997 ret = btrfs_free_reserved_extent(fs_info->tree_root,
998 eb->start, eb->len);
999 BUG_ON(ret);
1000
1001 free_extent_buffer(eb);
1002 kfree(fs_info->log_root_tree);
1003 fs_info->log_root_tree = NULL;
1004 return 0;
1005}
1006
1007static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, 988static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
1008 struct btrfs_fs_info *fs_info) 989 struct btrfs_fs_info *fs_info)
1009{ 990{
@@ -1191,19 +1172,23 @@ again:
1191 if (root) 1172 if (root)
1192 return root; 1173 return root;
1193 1174
1194 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1195 if (ret == 0)
1196 ret = -ENOENT;
1197 if (ret < 0)
1198 return ERR_PTR(ret);
1199
1200 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1175 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1201 if (IS_ERR(root)) 1176 if (IS_ERR(root))
1202 return root; 1177 return root;
1203 1178
1204 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1205 set_anon_super(&root->anon_super, NULL); 1179 set_anon_super(&root->anon_super, NULL);
1206 1180
1181 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT;
1183 goto fail;
1184 }
1185
1186 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1187 if (ret < 0)
1188 goto fail;
1189 if (ret == 0)
1190 root->orphan_item_inserted = 1;
1191
1207 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1192 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1208 if (ret) 1193 if (ret)
1209 goto fail; 1194 goto fail;
@@ -1212,10 +1197,9 @@ again:
1212 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1197 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1213 (unsigned long)root->root_key.objectid, 1198 (unsigned long)root->root_key.objectid,
1214 root); 1199 root);
1215 if (ret == 0) { 1200 if (ret == 0)
1216 root->in_radix = 1; 1201 root->in_radix = 1;
1217 root->clean_orphans = 1; 1202
1218 }
1219 spin_unlock(&fs_info->fs_roots_radix_lock); 1203 spin_unlock(&fs_info->fs_roots_radix_lock);
1220 radix_tree_preload_end(); 1204 radix_tree_preload_end();
1221 if (ret) { 1205 if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
1461 struct btrfs_root *root = arg; 1445 struct btrfs_root *root = arg;
1462 1446
1463 do { 1447 do {
1464 smp_mb();
1465 if (root->fs_info->closing)
1466 break;
1467
1468 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1448 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1469 1449
1470 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1450 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
1477 if (freezing(current)) { 1457 if (freezing(current)) {
1478 refrigerator(); 1458 refrigerator();
1479 } else { 1459 } else {
1480 smp_mb();
1481 if (root->fs_info->closing)
1482 break;
1483 set_current_state(TASK_INTERRUPTIBLE); 1460 set_current_state(TASK_INTERRUPTIBLE);
1484 schedule(); 1461 if (!kthread_should_stop())
1462 schedule();
1485 __set_current_state(TASK_RUNNING); 1463 __set_current_state(TASK_RUNNING);
1486 } 1464 }
1487 } while (!kthread_should_stop()); 1465 } while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
1493 struct btrfs_root *root = arg; 1471 struct btrfs_root *root = arg;
1494 struct btrfs_trans_handle *trans; 1472 struct btrfs_trans_handle *trans;
1495 struct btrfs_transaction *cur; 1473 struct btrfs_transaction *cur;
1474 u64 transid;
1496 unsigned long now; 1475 unsigned long now;
1497 unsigned long delay; 1476 unsigned long delay;
1498 int ret; 1477 int ret;
1499 1478
1500 do { 1479 do {
1501 smp_mb();
1502 if (root->fs_info->closing)
1503 break;
1504
1505 delay = HZ * 30; 1480 delay = HZ * 30;
1506 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1507 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1482 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1508 1483
1509 mutex_lock(&root->fs_info->trans_mutex); 1484 spin_lock(&root->fs_info->new_trans_lock);
1510 cur = root->fs_info->running_transaction; 1485 cur = root->fs_info->running_transaction;
1511 if (!cur) { 1486 if (!cur) {
1512 mutex_unlock(&root->fs_info->trans_mutex); 1487 spin_unlock(&root->fs_info->new_trans_lock);
1513 goto sleep; 1488 goto sleep;
1514 } 1489 }
1515 1490
1516 now = get_seconds(); 1491 now = get_seconds();
1517 if (now < cur->start_time || now - cur->start_time < 30) { 1492 if (!cur->blocked &&
1518 mutex_unlock(&root->fs_info->trans_mutex); 1493 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock);
1519 delay = HZ * 5; 1495 delay = HZ * 5;
1520 goto sleep; 1496 goto sleep;
1521 } 1497 }
1522 mutex_unlock(&root->fs_info->trans_mutex); 1498 transid = cur->transid;
1523 trans = btrfs_start_transaction(root, 1); 1499 spin_unlock(&root->fs_info->new_trans_lock);
1524 ret = btrfs_commit_transaction(trans, root);
1525 1500
1501 trans = btrfs_join_transaction(root, 1);
1502 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret);
1505 } else {
1506 btrfs_end_transaction(trans, root);
1507 }
1526sleep: 1508sleep:
1527 wake_up_process(root->fs_info->cleaner_kthread); 1509 wake_up_process(root->fs_info->cleaner_kthread);
1528 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1510 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
1530 if (freezing(current)) { 1512 if (freezing(current)) {
1531 refrigerator(); 1513 refrigerator();
1532 } else { 1514 } else {
1533 if (root->fs_info->closing)
1534 break;
1535 set_current_state(TASK_INTERRUPTIBLE); 1515 set_current_state(TASK_INTERRUPTIBLE);
1536 schedule_timeout(delay); 1516 if (!kthread_should_stop() &&
1517 !btrfs_transaction_blocked(root->fs_info))
1518 schedule_timeout(delay);
1537 __set_current_state(TASK_RUNNING); 1519 __set_current_state(TASK_RUNNING);
1538 } 1520 }
1539 } while (!kthread_should_stop()); 1521 } while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1620 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1602 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1621 INIT_LIST_HEAD(&fs_info->space_info); 1603 INIT_LIST_HEAD(&fs_info->space_info);
1622 btrfs_mapping_init(&fs_info->mapping_tree); 1604 btrfs_mapping_init(&fs_info->mapping_tree);
1605 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1606 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1607 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1608 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1609 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1610 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
1611 mutex_init(&fs_info->durable_block_rsv_mutex);
1623 atomic_set(&fs_info->nr_async_submits, 0); 1612 atomic_set(&fs_info->nr_async_submits, 0);
1624 atomic_set(&fs_info->async_delalloc_pages, 0); 1613 atomic_set(&fs_info->async_delalloc_pages, 0);
1625 atomic_set(&fs_info->async_submit_draining, 0); 1614 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1759 min_t(u64, fs_devices->num_devices, 1748 min_t(u64, fs_devices->num_devices,
1760 fs_info->thread_pool_size), 1749 fs_info->thread_pool_size),
1761 &fs_info->generic_worker); 1750 &fs_info->generic_worker);
1762 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1763 fs_info->thread_pool_size,
1764 &fs_info->generic_worker);
1765 1751
1766 /* a higher idle thresh on the submit workers makes it much more 1752 /* a higher idle thresh on the submit workers makes it much more
1767 * likely that bios will be send down in a sane order to the 1753 * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1809 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1810 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1811 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1797 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1812 btrfs_start_workers(&fs_info->enospc_workers, 1);
1813 1798
1814 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1815 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1912 1897
1913 csum_root->track_dirty = 1; 1898 csum_root->track_dirty = 1;
1914 1899
1900 fs_info->generation = generation;
1901 fs_info->last_trans_committed = generation;
1902 fs_info->data_alloc_profile = (u64)-1;
1903 fs_info->metadata_alloc_profile = (u64)-1;
1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905
1915 ret = btrfs_read_block_groups(extent_root); 1906 ret = btrfs_read_block_groups(extent_root);
1916 if (ret) { 1907 if (ret) {
1917 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1918 goto fail_block_groups; 1909 goto fail_block_groups;
1919 } 1910 }
1920 1911
1921 fs_info->generation = generation;
1922 fs_info->last_trans_committed = generation;
1923 fs_info->data_alloc_profile = (u64)-1;
1924 fs_info->metadata_alloc_profile = (u64)-1;
1925 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1926 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1912 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1927 "btrfs-cleaner"); 1913 "btrfs-cleaner");
1928 if (IS_ERR(fs_info->cleaner_kthread)) 1914 if (IS_ERR(fs_info->cleaner_kthread))
@@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1955 btrfs_level_size(tree_root, 1941 btrfs_level_size(tree_root,
1956 btrfs_super_log_root_level(disk_super)); 1942 btrfs_super_log_root_level(disk_super));
1957 1943
1958 log_tree_root = kzalloc(sizeof(struct btrfs_root), 1944 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1959 GFP_NOFS); 1945 if (!log_tree_root) {
1946 err = -ENOMEM;
1947 goto fail_trans_kthread;
1948 }
1960 1949
1961 __setup_root(nodesize, leafsize, sectorsize, stripesize, 1950 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1962 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); 1951 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1977 BUG_ON(ret); 1966 BUG_ON(ret);
1978 1967
1979 if (!(sb->s_flags & MS_RDONLY)) { 1968 if (!(sb->s_flags & MS_RDONLY)) {
1969 ret = btrfs_cleanup_fs_roots(fs_info);
1970 BUG_ON(ret);
1971
1980 ret = btrfs_recover_relocation(tree_root); 1972 ret = btrfs_recover_relocation(tree_root);
1981 if (ret < 0) { 1973 if (ret < 0) {
1982 printk(KERN_WARNING 1974 printk(KERN_WARNING
@@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1993 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 1985 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1994 if (!fs_info->fs_root) 1986 if (!fs_info->fs_root)
1995 goto fail_trans_kthread; 1987 goto fail_trans_kthread;
1988 if (IS_ERR(fs_info->fs_root)) {
1989 err = PTR_ERR(fs_info->fs_root);
1990 goto fail_trans_kthread;
1991 }
1996 1992
1997 if (!(sb->s_flags & MS_RDONLY)) { 1993 if (!(sb->s_flags & MS_RDONLY)) {
1998 down_read(&fs_info->cleanup_work_sem); 1994 down_read(&fs_info->cleanup_work_sem);
@@ -2040,7 +2036,6 @@ fail_sb_buffer:
2040 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2041 btrfs_stop_workers(&fs_info->endio_write_workers); 2037 btrfs_stop_workers(&fs_info->endio_write_workers);
2042 btrfs_stop_workers(&fs_info->submit_workers); 2038 btrfs_stop_workers(&fs_info->submit_workers);
2043 btrfs_stop_workers(&fs_info->enospc_workers);
2044fail_iput: 2039fail_iput:
2045 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2046 iput(fs_info->btree_inode); 2041 iput(fs_info->btree_inode);
@@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
2405 down_write(&root->fs_info->cleanup_work_sem); 2400 down_write(&root->fs_info->cleanup_work_sem);
2406 up_write(&root->fs_info->cleanup_work_sem); 2401 up_write(&root->fs_info->cleanup_work_sem);
2407 2402
2408 trans = btrfs_start_transaction(root, 1); 2403 trans = btrfs_join_transaction(root, 1);
2409 ret = btrfs_commit_transaction(trans, root); 2404 ret = btrfs_commit_transaction(trans, root);
2410 BUG_ON(ret); 2405 BUG_ON(ret);
2411 /* run commit again to drop the original snapshot */ 2406 /* run commit again to drop the original snapshot */
2412 trans = btrfs_start_transaction(root, 1); 2407 trans = btrfs_join_transaction(root, 1);
2413 btrfs_commit_transaction(trans, root); 2408 btrfs_commit_transaction(trans, root);
2414 ret = btrfs_write_and_wait_transaction(NULL, root); 2409 ret = btrfs_write_and_wait_transaction(NULL, root);
2415 BUG_ON(ret); 2410 BUG_ON(ret);
@@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
2426 fs_info->closing = 1; 2421 fs_info->closing = 1;
2427 smp_mb(); 2422 smp_mb();
2428 2423
2429 kthread_stop(root->fs_info->transaction_kthread);
2430 kthread_stop(root->fs_info->cleaner_kthread);
2431
2432 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2433 ret = btrfs_commit_super(root); 2425 ret = btrfs_commit_super(root);
2434 if (ret) 2426 if (ret)
2435 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2427 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2436 } 2428 }
2437 2429
2430 kthread_stop(root->fs_info->transaction_kthread);
2431 kthread_stop(root->fs_info->cleaner_kthread);
2432
2438 fs_info->closing = 2; 2433 fs_info->closing = 2;
2439 smp_mb(); 2434 smp_mb();
2440 2435
@@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
2473 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2474 btrfs_stop_workers(&fs_info->endio_write_workers); 2469 btrfs_stop_workers(&fs_info->endio_write_workers);
2475 btrfs_stop_workers(&fs_info->submit_workers); 2470 btrfs_stop_workers(&fs_info->submit_workers);
2476 btrfs_stop_workers(&fs_info->enospc_workers);
2477 2471
2478 btrfs_close_devices(fs_info->fs_devices); 2472 btrfs_close_devices(fs_info->fs_devices);
2479 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2473 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 87 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
89 int rw, struct bio *bio, int mirror_num, 89 int rw, struct bio *bio, int mirror_num,
90 unsigned long bio_flags, 90 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 91 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 92 extent_submit_bio_hook_t *submit_bio_done);
93 93
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 96int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
98int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
99 struct btrfs_fs_info *fs_info);
100int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 98int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101 struct btrfs_fs_info *fs_info); 99 struct btrfs_fs_info *fs_info);
102int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 100int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b34d32fdaaec..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
35 35
36static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 37 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc);
39 int mark_free); 39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve, int sinfo);
41 u64 num_bytes, int reserve);
42static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
43 struct btrfs_root *root, 42 struct btrfs_root *root,
44 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
61static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
62 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
63 u64 flags, int force); 62 u64 flags, int force);
64static int pin_down_bytes(struct btrfs_trans_handle *trans,
65 struct btrfs_root *root,
66 struct btrfs_path *path,
67 u64 bytenr, u64 num_bytes,
68 int is_data, int reserved,
69 struct extent_buffer **must_clean);
70static int find_next_key(struct btrfs_path *path, int level, 63static int find_next_key(struct btrfs_path *path, int level,
71 struct btrfs_key *key); 64 struct btrfs_key *key);
72static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
91 84
92void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
93{ 86{
94 if (atomic_dec_and_test(&cache->count)) 87 if (atomic_dec_and_test(&cache->count)) {
88 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0);
95 kfree(cache); 91 kfree(cache);
92 }
96} 93}
97 94
98/* 95/*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
319 316
320 exclude_super_stripes(extent_root, block_group); 317 exclude_super_stripes(extent_root, block_group);
321 spin_lock(&block_group->space_info->lock); 318 spin_lock(&block_group->space_info->lock);
322 block_group->space_info->bytes_super += block_group->bytes_super; 319 block_group->space_info->bytes_readonly += block_group->bytes_super;
323 spin_unlock(&block_group->space_info->lock); 320 spin_unlock(&block_group->space_info->lock);
324 321
325 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
507 struct list_head *head = &info->space_info; 504 struct list_head *head = &info->space_info;
508 struct btrfs_space_info *found; 505 struct btrfs_space_info *found;
509 506
507 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
508 BTRFS_BLOCK_GROUP_METADATA;
509
510 rcu_read_lock(); 510 rcu_read_lock();
511 list_for_each_entry_rcu(found, head, list) { 511 list_for_each_entry_rcu(found, head, list) {
512 if (found->flags == flags) { 512 if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
610} 610}
611 611
612/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
719/*
613 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
614 * 721 *
615 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
@@ -1589,7 +1696,7 @@ static void btrfs_issue_discard(struct block_device *bdev,
1589 u64 start, u64 len) 1696 u64 start, u64 len)
1590{ 1697{
1591 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
1592 DISCARD_FL_BARRIER); 1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1593} 1700}
1594 1701
1595static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1871 return ret; 1978 return ret;
1872} 1979}
1873 1980
1874
1875/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1876static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1877 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1891 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1892 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1893 if (insert_reserved) { 1999 if (insert_reserved) {
1894 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1895 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1896
1897 ret = pin_down_bytes(trans, root, NULL,
1898 node->bytenr, node->num_bytes,
1899 head->is_data, 1, &must_clean);
1900 if (ret > 0)
1901 mark_free = 1;
1902
1903 if (must_clean) {
1904 clean_tree_block(NULL, root, must_clean);
1905 btrfs_tree_unlock(must_clean);
1906 free_extent_buffer(must_clean);
1907 }
1908 if (head->is_data) { 2002 if (head->is_data) {
1909 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1910 node->bytenr, 2004 node->bytenr,
1911 node->num_bytes); 2005 node->num_bytes);
1912 BUG_ON(ret); 2006 BUG_ON(ret);
1913 } 2007 }
1914 if (mark_free) {
1915 ret = btrfs_free_reserved_extent(root,
1916 node->bytenr,
1917 node->num_bytes);
1918 BUG_ON(ret);
1919 }
1920 } 2008 }
1921 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1922 return 0; 2010 return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2347 ret = 0; 2435 ret = 0;
2348out: 2436out:
2349 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2350 return ret; 2440 return ret;
2351} 2441}
2352 2442
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2660 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2661{ 2751{
2662 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2663 2761
2664 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2665 if (found) { 2763 if (found) {
2666 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2667 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2668 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2669 found->full = 0; 2768 found->full = 0;
2670 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2671 *space_info = found; 2770 *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2675 if (!found) 2774 if (!found)
2676 return -ENOMEM; 2775 return -ENOMEM;
2677 2776
2678 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2679 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2680 init_waitqueue_head(&found->flush_wait);
2681 init_waitqueue_head(&found->allocate_wait);
2682 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2683 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2684 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2685 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2686 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2687 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2688 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2689 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2690 found->full = 0; 2791 found->full = 0;
2691 found->force_alloc = 0; 2792 found->force_alloc = 0;
2692 *space_info = found; 2793 *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2711 } 2812 }
2712} 2813}
2713 2814
2714static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2715{
2716 spin_lock(&cache->space_info->lock);
2717 spin_lock(&cache->lock);
2718 if (!cache->ro) {
2719 cache->space_info->bytes_readonly += cache->key.offset -
2720 btrfs_block_group_used(&cache->item);
2721 cache->ro = 1;
2722 }
2723 spin_unlock(&cache->lock);
2724 spin_unlock(&cache->space_info->lock);
2725}
2726
2727u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2728{ 2816{
2729 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2752 return flags; 2840 return flags;
2753} 2841}
2754 2842
2755static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2756{
2757 struct btrfs_fs_info *info = root->fs_info;
2758 u64 alloc_profile;
2759
2760 if (data) {
2761 alloc_profile = info->avail_data_alloc_bits &
2762 info->data_alloc_profile;
2763 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2764 } else if (root == root->fs_info->chunk_root) {
2765 alloc_profile = info->avail_system_alloc_bits &
2766 info->system_alloc_profile;
2767 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2768 } else {
2769 alloc_profile = info->avail_metadata_alloc_bits &
2770 info->metadata_alloc_profile;
2771 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2772 }
2773
2774 return btrfs_reduce_alloc_profile(root, data);
2775}
2776
2777void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2778{
2779 u64 alloc_target;
2780
2781 alloc_target = btrfs_get_alloc_profile(root, 1);
2782 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2783 alloc_target);
2784}
2785
2786static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2787{
2788 u64 num_bytes;
2789 int level;
2790
2791 level = BTRFS_MAX_LEVEL - 2;
2792 /*
2793 * NOTE: these calculations are absolutely the worst possible case.
2794 * This assumes that _every_ item we insert will require a new leaf, and
2795 * that the tree has grown to its maximum level size.
2796 */
2797
2798 /*
2799 * for every item we insert we could insert both an extent item and a
2800 * extent ref item. Then for ever item we insert, we will need to cow
2801 * both the original leaf, plus the leaf to the left and right of it.
2802 *
2803 * Unless we are talking about the extent root, then we just want the
2804 * number of items * 2, since we just need the extent item plus its ref.
2805 */
2806 if (root == root->fs_info->extent_root)
2807 num_bytes = num_items * 2;
2808 else
2809 num_bytes = (num_items + (2 * num_items)) * 3;
2810
2811 /*
2812 * num_bytes is total number of leaves we could need times the leaf
2813 * size, and then for every leaf we could end up cow'ing 2 nodes per
2814 * level, down to the leaf level.
2815 */
2816 num_bytes = (num_bytes * root->leafsize) +
2817 (num_bytes * (level * 2)) * root->nodesize;
2818
2819 return num_bytes;
2820}
2821
2822/*
2823 * Unreserve metadata space for delalloc. If we have less reserved credits than
2824 * we have extents, this function does nothing.
2825 */
2826int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2827 struct inode *inode, int num_items)
2828{
2829 struct btrfs_fs_info *info = root->fs_info;
2830 struct btrfs_space_info *meta_sinfo;
2831 u64 num_bytes;
2832 u64 alloc_target;
2833 bool bug = false;
2834
2835 /* get the space info for where the metadata will live */
2836 alloc_target = btrfs_get_alloc_profile(root, 0);
2837 meta_sinfo = __find_space_info(info, alloc_target);
2838
2839 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2840 num_items);
2841
2842 spin_lock(&meta_sinfo->lock);
2843 spin_lock(&BTRFS_I(inode)->accounting_lock);
2844 if (BTRFS_I(inode)->reserved_extents <=
2845 BTRFS_I(inode)->outstanding_extents) {
2846 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2847 spin_unlock(&meta_sinfo->lock);
2848 return 0;
2849 }
2850 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2851
2852 BTRFS_I(inode)->reserved_extents -= num_items;
2853 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2854
2855 if (meta_sinfo->bytes_delalloc < num_bytes) {
2856 bug = true;
2857 meta_sinfo->bytes_delalloc = 0;
2858 } else {
2859 meta_sinfo->bytes_delalloc -= num_bytes;
2860 }
2861 spin_unlock(&meta_sinfo->lock);
2862
2863 BUG_ON(bug);
2864
2865 return 0;
2866}
2867
2868static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2869{ 2844{
2870 u64 thresh; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2871 2846 flags |= root->fs_info->avail_data_alloc_bits &
2872 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2847 root->fs_info->data_alloc_profile;
2873 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2874 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2849 flags |= root->fs_info->avail_system_alloc_bits &
2875 meta_sinfo->bytes_may_use; 2850 root->fs_info->system_alloc_profile;
2876 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2877 thresh = meta_sinfo->total_bytes - thresh; 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2878 thresh *= 80; 2853 root->fs_info->metadata_alloc_profile;
2879 do_div(thresh, 100); 2854 return btrfs_reduce_alloc_profile(root, flags);
2880 if (thresh <= meta_sinfo->bytes_delalloc)
2881 meta_sinfo->force_delalloc = 1;
2882 else
2883 meta_sinfo->force_delalloc = 0;
2884} 2855}
2885 2856
2886struct async_flush { 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2887 struct btrfs_root *root;
2888 struct btrfs_space_info *info;
2889 struct btrfs_work work;
2890};
2891
2892static noinline void flush_delalloc_async(struct btrfs_work *work)
2893{ 2858{
2894 struct async_flush *async; 2859 u64 flags;
2895 struct btrfs_root *root;
2896 struct btrfs_space_info *info;
2897
2898 async = container_of(work, struct async_flush, work);
2899 root = async->root;
2900 info = async->info;
2901
2902 btrfs_start_delalloc_inodes(root, 0);
2903 wake_up(&info->flush_wait);
2904 btrfs_wait_ordered_extents(root, 0, 0);
2905
2906 spin_lock(&info->lock);
2907 info->flushing = 0;
2908 spin_unlock(&info->lock);
2909 wake_up(&info->flush_wait);
2910
2911 kfree(async);
2912}
2913
2914static void wait_on_flush(struct btrfs_space_info *info)
2915{
2916 DEFINE_WAIT(wait);
2917 u64 used;
2918
2919 while (1) {
2920 prepare_to_wait(&info->flush_wait, &wait,
2921 TASK_UNINTERRUPTIBLE);
2922 spin_lock(&info->lock);
2923 if (!info->flushing) {
2924 spin_unlock(&info->lock);
2925 break;
2926 }
2927
2928 used = info->bytes_used + info->bytes_reserved +
2929 info->bytes_pinned + info->bytes_readonly +
2930 info->bytes_super + info->bytes_root +
2931 info->bytes_may_use + info->bytes_delalloc;
2932 if (used < info->total_bytes) {
2933 spin_unlock(&info->lock);
2934 break;
2935 }
2936 spin_unlock(&info->lock);
2937 schedule();
2938 }
2939 finish_wait(&info->flush_wait, &wait);
2940}
2941
2942static void flush_delalloc(struct btrfs_root *root,
2943 struct btrfs_space_info *info)
2944{
2945 struct async_flush *async;
2946 bool wait = false;
2947
2948 spin_lock(&info->lock);
2949 2860
2950 if (!info->flushing) 2861 if (data)
2951 info->flushing = 1; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2863 else if (root == root->fs_info->chunk_root)
2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2952 else 2865 else
2953 wait = true; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2954
2955 spin_unlock(&info->lock);
2956
2957 if (wait) {
2958 wait_on_flush(info);
2959 return;
2960 }
2961
2962 async = kzalloc(sizeof(*async), GFP_NOFS);
2963 if (!async)
2964 goto flush;
2965
2966 async->root = root;
2967 async->info = info;
2968 async->work.func = flush_delalloc_async;
2969
2970 btrfs_queue_worker(&root->fs_info->enospc_workers,
2971 &async->work);
2972 wait_on_flush(info);
2973 return;
2974
2975flush:
2976 btrfs_start_delalloc_inodes(root, 0);
2977 btrfs_wait_ordered_extents(root, 0, 0);
2978
2979 spin_lock(&info->lock);
2980 info->flushing = 0;
2981 spin_unlock(&info->lock);
2982 wake_up(&info->flush_wait);
2983}
2984
2985static int maybe_allocate_chunk(struct btrfs_root *root,
2986 struct btrfs_space_info *info)
2987{
2988 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2989 struct btrfs_trans_handle *trans;
2990 bool wait = false;
2991 int ret = 0;
2992 u64 min_metadata;
2993 u64 free_space;
2994
2995 free_space = btrfs_super_total_bytes(disk_super);
2996 /*
2997 * we allow the metadata to grow to a max of either 10gb or 5% of the
2998 * space in the volume.
2999 */
3000 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3001 div64_u64(free_space * 5, 100));
3002 if (info->total_bytes >= min_metadata) {
3003 spin_unlock(&info->lock);
3004 return 0;
3005 }
3006
3007 if (info->full) {
3008 spin_unlock(&info->lock);
3009 return 0;
3010 }
3011
3012 if (!info->allocating_chunk) {
3013 info->force_alloc = 1;
3014 info->allocating_chunk = 1;
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113 2867
3114 BTRFS_I(inode)->reserved_extents += num_items; 2868 return get_alloc_profile(root, flags);
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120
3121 return 0;
3122} 2869}
3123 2870
3124/* 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * oprations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worste case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. THe only time we don't do this is if we're reserving space
3171 * inside of a transaction, then we will just return -ENOSPC and it is the
3172 * callers responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
3238 u64 used; 2885 u64 used;
3239 int ret = 0, committed = 0, flushed = 0; 2886 int ret = 0, committed = 0;
3240 2887
3241 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
3242 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing an delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can cleanup the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing an delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
3157
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
3233
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
3304
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will re-add to the the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate size of global block reservation.
3469 * the desired value is sum of space used by extent tree,
3470 * checksum tree and root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccuracy, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
3515
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
3598
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting orphan item, one for updating inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit
3646 * transaction and use space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
3775
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517
3518 ret = btrfs_discard_extent(root, bytenr,
3519 num_bytes);
3520 WARN_ON(ret);
3521 3833
3522 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3523 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3524 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3573 3879
3574 set_extent_dirty(fs_info->pinned_extents, 3880/*
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3694 4049 &fs_info->durable_block_rsv_list, list) {
3695static int pin_down_bytes(struct btrfs_trans_handle *trans,
3696 struct btrfs_root *root,
3697 struct btrfs_path *path,
3698 u64 bytenr, u64 num_bytes,
3699 int is_data, int reserved,
3700 struct extent_buffer **must_clean)
3701{
3702 int err = 0;
3703 struct extent_buffer *buf;
3704 4050
3705 if (is_data) 4051 idx = trans->transid & 0x1;
3706 goto pinit; 4052 if (block_rsv->freed[idx] > 0) {
3707 4053 block_rsv_add_bytes(block_rsv,
3708 /* 4054 block_rsv->freed[idx], 0);
3709 * discard is sloooow, and so triggering discards on 4055 block_rsv->freed[idx] = 0;
3710 * individual btree blocks isn't a good plan. Just 4056 }
3711 * pin everything in discard mode. 4057 if (atomic_read(&block_rsv->usage) == 0) {
3712 */ 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free an block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4424 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4443 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4444 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4450 return ret;
4068} 4451}
4069 4452
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4454{
4087 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4502 return 0;
4135} 4503}
4136 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
4520
4137enum btrfs_loop_type { 4521enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4542 int data)
4160{ 4543{
4161 int ret = 0; 4544 int ret = 0;
@@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4170 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4171 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4237,6 +4621,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4239 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4240 goto have_block_group; 4625 goto have_block_group;
4241 } 4626 }
4242 } else if (block_group) { 4627 } else if (block_group) {
@@ -4245,7 +4630,8 @@ ideal_cache:
4245 } 4630 }
4246search: 4631search:
4247 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4249 u64 offset; 4635 u64 offset;
4250 int cached; 4636 int cached;
4251 4637
@@ -4436,23 +4822,22 @@ checks:
4436 goto loop; 4822 goto loop;
4437 } 4823 }
4438 4824
4439 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4827
4442 search_start = exclude_start + exclude_nr; 4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4443 4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4837 goto loop;
4454 } 4838 }
4455 4839
4840 /* we are all good, lets return */
4456 ins->objectid = search_start; 4841 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4458 4843
@@ -4460,18 +4845,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4846 search_start - offset);
4462 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4848 break;
4468loop: 4849loop:
4469 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4470 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4472 } 4854 }
4473 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4474 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * for them to make caching progress. Also 4861 * for them to make caching progress. Also
4477 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4485,6 +4870,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4875 found_uncached_bg = false;
4490 loop++; 4876 loop++;
@@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4953 int dump_block_groups)
4568{ 4954{
4569 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4570 4957
4571 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4962 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4589 4973
4590 if (!dump_block_groups) 4974 if (!dump_block_groups)
4591 return; 4975 return;
4592 4976
4593 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4605 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4606 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4607} 4994}
4608 4995
@@ -4628,9 +5015,8 @@ again:
4628 5015
4629 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5019 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5020
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4669 5055
4670 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4673 5059
4674 return ret; 5060 return ret;
@@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5118 btrfs_free_path(path);
4733 5119
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5121 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5178 btrfs_free_path(path);
4794 5179
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5181 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4870 } 5254 }
4871 5255
4872 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4876 return ret; 5261 return ret;
4877} 5262}
4878 5263
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912 BUG_ON(!extent_op);
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5265 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5299 return buf;
4975} 5300}
4976 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
5323 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 block_rsv->size, block_rsv->reserved,
5325 block_rsv->freed[0], block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
4977/* 5336/*
4978 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4979 * returns the tree buffer or NULL. 5341 * returns the tree buffer or NULL.
4980 */ 5342 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4986{ 5348{
4987 struct btrfs_key ins; 5349 struct btrfs_key ins;
4988 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
5354
4990 5355
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5356 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5362 if (ret) {
4994 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4996 } 5365 }
4997 5366
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
5000 return buf; 5397 return buf;
5001} 5398}
5002 5399
@@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5718 struct btrfs_path *path,
5322 struct walk_control *wc) 5719 struct walk_control *wc)
5323{ 5720{
5324 int ret = 0; 5721 int ret;
5325 int level = wc->level; 5722 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5724 u64 parent = 0;
@@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5797 }
5401 5798
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5800out:
5406 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5408 return ret; 5803 return 0;
5409} 5804}
5410 5805
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5879 * blocks are properly updated.
5485 */ 5880 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5883{
5488 struct btrfs_path *path; 5884 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5898 BUG_ON(!wc);
5503 5899
5504 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5505 5903
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5987 }
5590 5988
5591 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5992 &root->root_key,
5596 root_item); 5993 root_item);
5597 BUG_ON(ret); 5994 BUG_ON(ret);
5598 5995
5599 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5998 if (block_rsv)
5602 unsigned long update; 5999 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 6000 }
5609 } 6001 }
5610 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6024 kfree(root);
5633 } 6025 }
5634out: 6026out:
5635 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6028 kfree(wc);
5637 btrfs_free_path(path); 6029 btrfs_free_path(path);
5638 return err; 6030 return err;
@@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7620 return flags;
7229} 7621}
7230 7622
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7624{
7235 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7626 u64 num_bytes;
7237 u64 calc; 7627 int ret = -ENOSPC;
7238 7628
7239 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7631
7244 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7246 7650
7247 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7653
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7259 7658
7260 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7261 } else 7660
7262 spin_unlock(&shrink_block_group->lock); 7661 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7662 BUG_ON(IS_ERR(trans));
7264}
7265 7663
7664 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7667
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7668 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7269 7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7270{ 7683{
7271 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7273 return 0; 7697 return 0;
7274} 7698}
7275 7699
@@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7860 */
7437 synchronize_rcu(); 7861 synchronize_rcu();
7438 7862
7863 release_global_block_rsv(info);
7864
7439 while(!list_empty(&info->space_info)) { 7865 while(!list_empty(&info->space_info)) {
7440 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7867 struct btrfs_space_info,
7442 list); 7868 list);
7443 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7444 list_del(&space_info->list); 7874 list_del(&space_info->list);
7445 kfree(space_info); 7875 kfree(space_info);
7446 } 7876 }
7447 return 0; 7877 return 0;
7448} 7878}
7449 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
7450int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7891{
7452 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7908
7469 while (1) { 7909 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7911 if (ret > 0)
7472 ret = 0; 7912 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7913 if (ret != 0)
7476 goto error; 7914 goto error;
7477 7915
@@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7919 if (!cache) {
7482 ret = -ENOMEM; 7920 ret = -ENOMEM;
7483 break; 7921 goto error;
7484 } 7922 }
7485 7923
7486 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7975 BUG_ON(ret);
7538 cache->space_info = space_info; 7976 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7542 7980
7543 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7982
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7984 BUG_ON(ret);
7549 7985
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7553 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
7998 * avoid allocating from un-mirrored block group if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7554 ret = 0; 8008 ret = 0;
7555error: 8009error:
7556 btrfs_free_path(path); 8010 btrfs_free_path(path);
@@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8065 BUG_ON(ret);
7612 8066
7613 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7616 8070
7617 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8072
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/* flags for bio submission */
22#define EXTENT_BIO_COMPRESSED 1 24#define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
47 49
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 50typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 51 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 52 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 53struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 54 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 55 u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
69 struct extent_state *state); 71 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 72 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 73 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 74 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 75 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 76 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 77 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 78 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 79 struct extent_state *new,
78 struct extent_state *other); 80 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
176 u64 *start, u64 search_end, 178 u64 *start, u64 search_end,
177 u64 max_bytes, unsigned long bits); 179 u64 max_bytes, unsigned long bits);
178 180
181void free_extent_state(struct extent_state *state);
179int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
180 int bits, int filled, struct extent_state *cached_state); 183 int bits, int filled, struct extent_state *cached_state);
181int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 184int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
185 gfp_t mask); 188 gfp_t mask);
186int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 189int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
187 int bits, gfp_t mask); 190 int bits, gfp_t mask);
191int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask);
188int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 195 gfp_t mask);
190int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
149} 149}
150 150
151 151
152int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
153 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
154{ 155{
155 u32 sum; 156 u32 sum;
156 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
157 int bio_index = 0; 158 int bio_index = 0;
158 u64 offset; 159 u64 offset = 0;
159 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
160 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
161 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
174 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
175 176
176 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
177 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
178 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
179 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
180 if (ret == 0) 184 if (ret == 0)
181 goto found; 185 goto found;
@@ -238,6 +242,7 @@ found:
238 else 242 else
239 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
240 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
241 bio_index++; 246 bio_index++;
242 bvec++; 247 bvec++;
243 } 248 }
@@ -245,6 +250,18 @@ found:
245 return 0; 250 return 0;
246} 251}
247 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
248int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
249 struct list_head *list) 266 struct list_head *list)
250{ 267{
@@ -657,6 +674,9 @@ again:
657 goto found; 674 goto found;
658 } 675 }
659 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
660 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
661 u32 item_size; 681 u32 item_size;
662 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 47 int write_bytes,
48 struct page **prepared_pages, 48 struct page **prepared_pages,
49 const char __user *buf) 49 struct iov_iter *i)
50{ 50{
51 long page_fault = 0; 51 size_t copied;
52 int i; 52 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 53 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 54
55 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 55 while (write_bytes > 0) {
56 size_t count = min_t(size_t, 56 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 57 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[i]; 58 struct page *page = prepared_pages[pg];
59 fault_in_pages_readable(buf, count); 59again:
60 if (unlikely(iov_iter_fault_in_readable(i, count)))
61 return -EFAULT;
60 62
61 /* Copy data from userspace to the current page */ 63 /* Copy data from userspace to the current page */
62 kmap(page); 64 copied = iov_iter_copy_from_user(page, i, offset, count);
63 page_fault = __copy_from_user(page_address(page) + offset, 65
64 buf, count);
65 /* Flush processor's dcache for this page */ 66 /* Flush processor's dcache for this page */
66 flush_dcache_page(page); 67 flush_dcache_page(page);
67 kunmap(page); 68 iov_iter_advance(i, copied);
68 buf += count; 69 write_bytes -= copied;
69 write_bytes -= count;
70 70
71 if (page_fault) 71 if (unlikely(copied == 0)) {
72 break; 72 count = min_t(size_t, PAGE_CACHE_SIZE - offset,
73 iov_iter_single_seg_count(i));
74 goto again;
75 }
76
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied;
79 } else {
80 pg++;
81 offset = 0;
82 }
73 } 83 }
74 return page_fault ? -EFAULT : 0; 84 return 0;
75} 85}
76 86
77/* 87/*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
126 end_of_last_block = start_pos + num_bytes - 1; 136 end_of_last_block = start_pos + num_bytes - 1;
127 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
128 NULL); 138 NULL);
129 if (err) 139 BUG_ON(err);
130 return err;
131 140
132 for (i = 0; i < num_pages; i++) { 141 for (i = 0; i < num_pages; i++) {
133 struct page *p = pages[i]; 142 struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
142 * at this time. 151 * at this time.
143 */ 152 */
144 } 153 }
145 return err; 154 return 0;
146} 155}
147 156
148/* 157/*
@@ -823,45 +832,46 @@ again:
823 return 0; 832 return 0;
824} 833}
825 834
826static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 835static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
827 size_t count, loff_t *ppos) 836 const struct iovec *iov,
837 unsigned long nr_segs, loff_t pos)
828{ 838{
829 loff_t pos; 839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
830 loff_t start_pos; 846 loff_t start_pos;
831 ssize_t num_written = 0; 847 ssize_t num_written = 0;
832 ssize_t err = 0; 848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
833 int ret = 0; 851 int ret = 0;
834 struct inode *inode = fdentry(file)->d_inode;
835 struct btrfs_root *root = BTRFS_I(inode)->root;
836 struct page **pages = NULL;
837 int nrptrs; 852 int nrptrs;
838 struct page *pinned[2];
839 unsigned long first_index; 853 unsigned long first_index;
840 unsigned long last_index; 854 unsigned long last_index;
841 int will_write; 855 int will_write;
856 int buffered = 0;
842 857
843 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
844 (file->f_flags & O_DIRECT)); 859 (file->f_flags & O_DIRECT));
845 860
846 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
847 PAGE_CACHE_SIZE / (sizeof(struct page *)));
848 pinned[0] = NULL; 861 pinned[0] = NULL;
849 pinned[1] = NULL; 862 pinned[1] = NULL;
850 863
851 pos = *ppos;
852 start_pos = pos; 864 start_pos = pos;
853 865
854 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
855 867
856 /* do the reserve before the mutex lock in case we have to do some
857 * flushing. We wouldn't deadlock, but this is more polite.
858 */
859 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
860 if (err)
861 goto out_nolock;
862
863 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
864 869
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
871 if (err)
872 goto out;
873 count = ocount;
874
865 current->backing_dev_info = inode->i_mapping->backing_dev_info; 875 current->backing_dev_info = inode->i_mapping->backing_dev_info;
866 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
867 if (err) 877 if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
875 goto out; 885 goto out;
876 886
877 file_update_time(file); 887 file_update_time(file);
888 BTRFS_I(inode)->sequence++;
889
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /*
895 * the generic O_DIRECT will update in-memory i_size after the
896 * DIOs are done. But our endio handlers that update the on
897 * disk i_size never update past the in memory i_size. So we
898 * need one more update here to catch any additions to the
899 * file
900 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
903 mark_inode_dirty(inode);
904 }
878 905
906 if (num_written < 0) {
907 ret = num_written;
908 num_written = 0;
909 goto out;
910 } else if (num_written == count) {
911 /* pick up pos changes done by the generic code */
912 pos = *ppos;
913 goto out;
914 }
915 /*
916 * We are going to do buffered for the rest of the range, so we
917 * need to make sure to invalidate the buffered pages when we're
918 * done.
919 */
920 buffered = 1;
921 pos += num_written;
922 }
923
924 iov_iter_init(&i, iov, nr_segs, count, num_written);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
879 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
880 929
881 /* generic_write_checks can change our pos */ 930 /* generic_write_checks can change our pos */
882 start_pos = pos; 931 start_pos = pos;
883 932
884 BTRFS_I(inode)->sequence++;
885 first_index = pos >> PAGE_CACHE_SHIFT; 933 first_index = pos >> PAGE_CACHE_SHIFT;
886 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
887 935
888 /* 936 /*
889 * there are lots of better ways to do this, but this code 937 * there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
900 unlock_page(pinned[0]); 948 unlock_page(pinned[0]);
901 } 949 }
902 } 950 }
903 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
904 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 952 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
905 if (!PageUptodate(pinned[1])) { 953 if (!PageUptodate(pinned[1])) {
906 ret = btrfs_readpage(NULL, pinned[1]); 954 ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
911 } 959 }
912 } 960 }
913 961
914 while (count > 0) { 962 while (iov_iter_count(&i) > 0) {
915 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 963 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
916 size_t write_bytes = min(count, nrptrs * 964 size_t write_bytes = min(iov_iter_count(&i),
917 (size_t)PAGE_CACHE_SIZE - 965 nrptrs * (size_t)PAGE_CACHE_SIZE -
918 offset); 966 offset);
919 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
920 PAGE_CACHE_SHIFT; 968 PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 WARN_ON(num_pages > nrptrs); 970 WARN_ON(num_pages > nrptrs);
923 memset(pages, 0, sizeof(struct page *) * nrptrs); 971 memset(pages, 0, sizeof(struct page *) * nrptrs);
924 972
925 ret = btrfs_check_data_free_space(root, inode, write_bytes); 973 ret = btrfs_delalloc_reserve_space(inode, write_bytes);
926 if (ret) 974 if (ret)
927 goto out; 975 goto out;
928 976
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
930 pos, first_index, last_index, 978 pos, first_index, last_index,
931 write_bytes); 979 write_bytes);
932 if (ret) { 980 if (ret) {
933 btrfs_free_reserved_data_space(root, inode, 981 btrfs_delalloc_release_space(inode, write_bytes);
934 write_bytes);
935 goto out; 982 goto out;
936 } 983 }
937 984
938 ret = btrfs_copy_from_user(pos, num_pages, 985 ret = btrfs_copy_from_user(pos, num_pages,
939 write_bytes, pages, buf); 986 write_bytes, pages, &i);
940 if (ret) { 987 if (ret == 0) {
941 btrfs_free_reserved_data_space(root, inode, 988 dirty_and_release_pages(NULL, root, file, pages,
942 write_bytes); 989 num_pages, pos, write_bytes);
943 btrfs_drop_pages(pages, num_pages);
944 goto out;
945 } 990 }
946 991
947 ret = dirty_and_release_pages(NULL, root, file, pages,
948 num_pages, pos, write_bytes);
949 btrfs_drop_pages(pages, num_pages); 992 btrfs_drop_pages(pages, num_pages);
950 if (ret) { 993 if (ret) {
951 btrfs_free_reserved_data_space(root, inode, 994 btrfs_delalloc_release_space(inode, write_bytes);
952 write_bytes);
953 goto out; 995 goto out;
954 } 996 }
955 997
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
965 btrfs_throttle(root); 1007 btrfs_throttle(root);
966 } 1008 }
967 1009
968 buf += write_bytes;
969 count -= write_bytes;
970 pos += write_bytes; 1010 pos += write_bytes;
971 num_written += write_bytes; 1011 num_written += write_bytes;
972 1012
@@ -976,9 +1016,7 @@ out:
976 mutex_unlock(&inode->i_mutex); 1016 mutex_unlock(&inode->i_mutex);
977 if (ret) 1017 if (ret)
978 err = ret; 1018 err = ret;
979 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
980 1019
981out_nolock:
982 kfree(pages); 1020 kfree(pages);
983 if (pinned[0]) 1021 if (pinned[0])
984 page_cache_release(pinned[0]); 1022 page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
1008 num_written = err; 1046 num_written = err;
1009 1047
1010 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1011 trans = btrfs_start_transaction(root, 1); 1049 trans = btrfs_start_transaction(root, 0);
1012 ret = btrfs_log_dentry_safe(trans, root, 1050 ret = btrfs_log_dentry_safe(trans, root,
1013 file->f_dentry); 1051 file->f_dentry);
1014 if (ret == 0) { 1052 if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
1023 btrfs_end_transaction(trans, root); 1061 btrfs_end_transaction(trans, root);
1024 } 1062 }
1025 } 1063 }
1026 if (file->f_flags & O_DIRECT) { 1064 if (file->f_flags & O_DIRECT && buffered) {
1027 invalidate_mapping_pages(inode->i_mapping, 1065 invalidate_mapping_pages(inode->i_mapping,
1028 start_pos >> PAGE_CACHE_SHIFT, 1066 start_pos >> PAGE_CACHE_SHIFT,
1029 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1063 * important optimization for directories because holding the mutex prevents 1101 * important optimization for directories because holding the mutex prevents
1064 * new operations on the dir while we write to disk. 1102 * new operations on the dir while we write to disk.
1065 */ 1103 */
1066int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1104int btrfs_sync_file(struct file *file, int datasync)
1067{ 1105{
1106 struct dentry *dentry = file->f_path.dentry;
1068 struct inode *inode = dentry->d_inode; 1107 struct inode *inode = dentry->d_inode;
1069 struct btrfs_root *root = BTRFS_I(inode)->root; 1108 struct btrfs_root *root = BTRFS_I(inode)->root;
1070 int ret = 0; 1109 int ret = 0;
@@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1101 /* 1140 /*
1102 * ok we haven't committed the transaction yet, lets do a commit 1141 * ok we haven't committed the transaction yet, lets do a commit
1103 */ 1142 */
1104 if (file && file->private_data) 1143 if (file->private_data)
1105 btrfs_ioctl_trans_end(file); 1144 btrfs_ioctl_trans_end(file);
1106 1145
1107 trans = btrfs_start_transaction(root, 1); 1146 trans = btrfs_start_transaction(root, 0);
1108 if (!trans) { 1147 if (IS_ERR(trans)) {
1109 ret = -ENOMEM; 1148 ret = PTR_ERR(trans);
1110 goto out; 1149 goto out;
1111 } 1150 }
1112 1151
@@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
1151 1190
1152static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1191static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1153{ 1192{
1154 vma->vm_ops = &btrfs_file_vm_ops; 1193 struct address_space *mapping = filp->f_mapping;
1194
1195 if (!mapping->a_ops->readpage)
1196 return -ENOEXEC;
1197
1155 file_accessed(filp); 1198 file_accessed(filp);
1199 vma->vm_ops = &btrfs_file_vm_ops;
1200 vma->vm_flags |= VM_CAN_NONLINEAR;
1201
1156 return 0; 1202 return 0;
1157} 1203}
1158 1204
1159const struct file_operations btrfs_file_operations = { 1205const struct file_operations btrfs_file_operations = {
1160 .llseek = generic_file_llseek, 1206 .llseek = generic_file_llseek,
1161 .read = do_sync_read, 1207 .read = do_sync_read,
1208 .write = do_sync_write,
1162 .aio_read = generic_file_aio_read, 1209 .aio_read = generic_file_aio_read,
1163 .splice_read = generic_file_splice_read, 1210 .splice_read = generic_file_splice_read,
1164 .write = btrfs_file_write, 1211 .aio_write = btrfs_file_aio_write,
1165 .mmap = btrfs_file_mmap, 1212 .mmap = btrfs_file_mmap,
1166 .open = generic_file_open, 1213 .open = generic_file_open,
1167 .release = btrfs_release_file, 1214 .release = btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2bfdc641d4e3..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
252 inline_len, compressed_size, 252 inline_len, compressed_size,
253 compressed_pages); 253 compressed_pages);
254 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
255 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
256 return 0; 257 return 0;
257} 258}
@@ -414,6 +415,7 @@ again:
414 trans = btrfs_join_transaction(root, 1); 415 trans = btrfs_join_transaction(root, 1);
415 BUG_ON(!trans); 416 BUG_ON(!trans);
416 btrfs_set_trans_block_group(trans, inode); 417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
417 419
418 /* lets try to make an inline extent */ 420 /* lets try to make an inline extent */
419 if (ret || total_in < (actual_end - start)) { 421 if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
439 start, end, NULL, 441 start, end, NULL,
440 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 442 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
441 EXTENT_CLEAR_DELALLOC | 443 EXTENT_CLEAR_DELALLOC |
442 EXTENT_CLEAR_ACCOUNTING |
443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 444 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
444 445
445 btrfs_end_transaction(trans, root); 446 btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
697 return 0; 698 return 0;
698} 699}
699 700
701static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
702 u64 num_bytes)
703{
704 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
705 struct extent_map *em;
706 u64 alloc_hint = 0;
707
708 read_lock(&em_tree->lock);
709 em = search_extent_mapping(em_tree, start, num_bytes);
710 if (em) {
711 /*
712 * if block start isn't an actual block number then find the
713 * first block in this inode and use that as a hint. If that
714 * block is also bogus then just don't worry about it.
715 */
716 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
717 free_extent_map(em);
718 em = search_extent_mapping(em_tree, 0, 0);
719 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
720 alloc_hint = em->block_start;
721 if (em)
722 free_extent_map(em);
723 } else {
724 alloc_hint = em->block_start;
725 free_extent_map(em);
726 }
727 }
728 read_unlock(&em_tree->lock);
729
730 return alloc_hint;
731}
732
700/* 733/*
701 * when extent_io.c finds a delayed allocation range in the file, 734 * when extent_io.c finds a delayed allocation range in the file,
702 * the call backs end up in this code. The basic idea is to 735 * the call backs end up in this code. The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
734 trans = btrfs_join_transaction(root, 1); 767 trans = btrfs_join_transaction(root, 1);
735 BUG_ON(!trans); 768 BUG_ON(!trans);
736 btrfs_set_trans_block_group(trans, inode); 769 btrfs_set_trans_block_group(trans, inode);
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
737 771
738 actual_end = min_t(u64, isize, end + 1); 772 actual_end = min_t(u64, isize, end + 1);
739 773
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
753 EXTENT_CLEAR_UNLOCK_PAGE | 787 EXTENT_CLEAR_UNLOCK_PAGE |
754 EXTENT_CLEAR_UNLOCK | 788 EXTENT_CLEAR_UNLOCK |
755 EXTENT_CLEAR_DELALLOC | 789 EXTENT_CLEAR_DELALLOC |
756 EXTENT_CLEAR_ACCOUNTING |
757 EXTENT_CLEAR_DIRTY | 790 EXTENT_CLEAR_DIRTY |
758 EXTENT_SET_WRITEBACK | 791 EXTENT_SET_WRITEBACK |
759 EXTENT_END_WRITEBACK); 792 EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
769 BUG_ON(disk_num_bytes > 802 BUG_ON(disk_num_bytes >
770 btrfs_super_total_bytes(&root->fs_info->super_copy)); 803 btrfs_super_total_bytes(&root->fs_info->super_copy));
771 804
772 805 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
773 read_lock(&BTRFS_I(inode)->extent_tree.lock);
774 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
775 start, num_bytes);
776 if (em) {
777 /*
778 * if block start isn't an actual block number then find the
779 * first block in this inode and use that as a hint. If that
780 * block is also bogus then just don't worry about it.
781 */
782 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
783 free_extent_map(em);
784 em = search_extent_mapping(em_tree, 0, 0);
785 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
786 alloc_hint = em->block_start;
787 if (em)
788 free_extent_map(em);
789 } else {
790 alloc_hint = em->block_start;
791 free_extent_map(em);
792 }
793 }
794 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
795 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 806 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
796 807
797 while (disk_num_bytes > 0) { 808 while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
1174 num_bytes, num_bytes, type); 1185 num_bytes, num_bytes, type);
1175 BUG_ON(ret); 1186 BUG_ON(ret);
1176 1187
1188 if (root->root_key.objectid ==
1189 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1190 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1191 num_bytes);
1192 BUG_ON(ret);
1193 }
1194
1177 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1195 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1178 cur_offset, cur_offset + num_bytes - 1, 1196 cur_offset, cur_offset + num_bytes - 1,
1179 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1197 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1226} 1244}
1227 1245
1228static int btrfs_split_extent_hook(struct inode *inode, 1246static int btrfs_split_extent_hook(struct inode *inode,
1229 struct extent_state *orig, u64 split) 1247 struct extent_state *orig, u64 split)
1230{ 1248{
1249 /* not delalloc, ignore it */
1231 if (!(orig->state & EXTENT_DELALLOC)) 1250 if (!(orig->state & EXTENT_DELALLOC))
1232 return 0; 1251 return 0;
1233 1252
1234 spin_lock(&BTRFS_I(inode)->accounting_lock); 1253 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1235 BTRFS_I(inode)->outstanding_extents++;
1236 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1237
1238 return 0; 1254 return 0;
1239} 1255}
1240 1256
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1252 if (!(other->state & EXTENT_DELALLOC)) 1268 if (!(other->state & EXTENT_DELALLOC))
1253 return 0; 1269 return 0;
1254 1270
1255 spin_lock(&BTRFS_I(inode)->accounting_lock); 1271 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1256 BTRFS_I(inode)->outstanding_extents--;
1257 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1258
1259 return 0; 1272 return 0;
1260} 1273}
1261 1274
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 * bytes in this file, and to maintain the list of inodes that 1277 * bytes in this file, and to maintain the list of inodes that
1265 * have pending delalloc work to be done. 1278 * have pending delalloc work to be done.
1266 */ 1279 */
1267static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1280static int btrfs_set_bit_hook(struct inode *inode,
1268 unsigned long old, unsigned long bits) 1281 struct extent_state *state, int *bits)
1269{ 1282{
1270 1283
1271 /* 1284 /*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1273 * but in this case, we are only testeing for the DELALLOC 1286 * but in this case, we are only testeing for the DELALLOC
1274 * bit, which is only set or cleared with irqs on 1287 * bit, which is only set or cleared with irqs on
1275 */ 1288 */
1276 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1277 struct btrfs_root *root = BTRFS_I(inode)->root; 1290 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start;
1278 1292
1279 spin_lock(&BTRFS_I(inode)->accounting_lock); 1293 if (*bits & EXTENT_FIRST_DELALLOC)
1280 BTRFS_I(inode)->outstanding_extents++; 1294 *bits &= ~EXTENT_FIRST_DELALLOC;
1281 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1295 else
1282 btrfs_delalloc_reserve_space(root, inode, end - start + 1); 1296 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1283 1297
1284 spin_lock(&root->fs_info->delalloc_lock); 1298 spin_lock(&root->fs_info->delalloc_lock);
1285 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1299 BTRFS_I(inode)->delalloc_bytes += len;
1286 root->fs_info->delalloc_bytes += end - start + 1; 1300 root->fs_info->delalloc_bytes += len;
1287 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1288 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1289 &root->fs_info->delalloc_inodes); 1303 &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1297 * extent_io.c clear_bit_hook, see set_bit_hook for why 1311 * extent_io.c clear_bit_hook, see set_bit_hook for why
1298 */ 1312 */
1299static int btrfs_clear_bit_hook(struct inode *inode, 1313static int btrfs_clear_bit_hook(struct inode *inode,
1300 struct extent_state *state, unsigned long bits) 1314 struct extent_state *state, int *bits)
1301{ 1315{
1302 /* 1316 /*
1303 * set_bit and clear bit hooks normally require _irqsave/restore 1317 * set_bit and clear bit hooks normally require _irqsave/restore
1304 * but in this case, we are only testeing for the DELALLOC 1318 * but in this case, we are only testeing for the DELALLOC
1305 * bit, which is only set or cleared with irqs on 1319 * bit, which is only set or cleared with irqs on
1306 */ 1320 */
1307 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1308 struct btrfs_root *root = BTRFS_I(inode)->root; 1322 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start;
1309 1324
1310 if (bits & EXTENT_DO_ACCOUNTING) { 1325 if (*bits & EXTENT_FIRST_DELALLOC)
1311 spin_lock(&BTRFS_I(inode)->accounting_lock); 1326 *bits &= ~EXTENT_FIRST_DELALLOC;
1312 WARN_ON(!BTRFS_I(inode)->outstanding_extents); 1327 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1313 BTRFS_I(inode)->outstanding_extents--; 1328 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1314 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1329
1315 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1330 if (*bits & EXTENT_DO_ACCOUNTING)
1316 } 1331 btrfs_delalloc_release_metadata(inode, len);
1332
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
1334 btrfs_free_reserved_data_space(inode, len);
1317 1335
1318 spin_lock(&root->fs_info->delalloc_lock); 1336 spin_lock(&root->fs_info->delalloc_lock);
1319 if (state->end - state->start + 1 > 1337 root->fs_info->delalloc_bytes -= len;
1320 root->fs_info->delalloc_bytes) { 1338 BTRFS_I(inode)->delalloc_bytes -= len;
1321 printk(KERN_INFO "btrfs warning: delalloc account " 1339
1322 "%llu %llu\n",
1323 (unsigned long long)
1324 state->end - state->start + 1,
1325 (unsigned long long)
1326 root->fs_info->delalloc_bytes);
1327 btrfs_delalloc_free_space(root, inode, (u64)-1);
1328 root->fs_info->delalloc_bytes = 0;
1329 BTRFS_I(inode)->delalloc_bytes = 0;
1330 } else {
1331 btrfs_delalloc_free_space(root, inode,
1332 state->end -
1333 state->start + 1);
1334 root->fs_info->delalloc_bytes -= state->end -
1335 state->start + 1;
1336 BTRFS_I(inode)->delalloc_bytes -= state->end -
1337 state->start + 1;
1338 }
1339 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1340 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1340 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1341 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1384 */ 1385 */
1385static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1386static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1386 struct bio *bio, int mirror_num, 1387 struct bio *bio, int mirror_num,
1387 unsigned long bio_flags) 1388 unsigned long bio_flags,
1389 u64 bio_offset)
1388{ 1390{
1389 struct btrfs_root *root = BTRFS_I(inode)->root; 1391 struct btrfs_root *root = BTRFS_I(inode)->root;
1390 int ret = 0; 1392 int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1403 * are inserted into the btree 1405 * are inserted into the btree
1404 */ 1406 */
1405static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1407static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1406 int mirror_num, unsigned long bio_flags) 1408 int mirror_num, unsigned long bio_flags,
1409 u64 bio_offset)
1407{ 1410{
1408 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1409 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1412 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1414 * on write, or reading the csums from the tree before a read 1417 * on write, or reading the csums from the tree before a read
1415 */ 1418 */
1416static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1419static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1417 int mirror_num, unsigned long bio_flags) 1420 int mirror_num, unsigned long bio_flags,
1421 u64 bio_offset)
1418{ 1422{
1419 struct btrfs_root *root = BTRFS_I(inode)->root; 1423 struct btrfs_root *root = BTRFS_I(inode)->root;
1420 int ret = 0; 1424 int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1439 /* we're doing a write, do the async checksumming */ 1443 /* we're doing a write, do the async checksumming */
1440 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1444 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1441 inode, rw, bio, mirror_num, 1445 inode, rw, bio, mirror_num,
1442 bio_flags, __btrfs_submit_bio_start, 1446 bio_flags, bio_offset,
1447 __btrfs_submit_bio_start,
1443 __btrfs_submit_bio_done); 1448 __btrfs_submit_bio_done);
1444 } 1449 }
1445 1450
@@ -1520,6 +1525,7 @@ again:
1520 goto again; 1525 goto again;
1521 } 1526 }
1522 1527
1528 BUG();
1523 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state); 1529 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1524 ClearPageChecked(page); 1530 ClearPageChecked(page);
1525out: 1531out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1650static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1656static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1651{ 1657{
1652 struct btrfs_root *root = BTRFS_I(inode)->root; 1658 struct btrfs_root *root = BTRFS_I(inode)->root;
1653 struct btrfs_trans_handle *trans; 1659 struct btrfs_trans_handle *trans = NULL;
1654 struct btrfs_ordered_extent *ordered_extent = NULL; 1660 struct btrfs_ordered_extent *ordered_extent = NULL;
1655 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1656 struct extent_state *cached_state = NULL; 1662 struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1668 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1669 if (!ret) { 1675 if (!ret) {
1670 trans = btrfs_join_transaction(root, 1); 1676 trans = btrfs_join_transaction(root, 1);
1677 btrfs_set_trans_block_group(trans, inode);
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1671 ret = btrfs_update_inode(trans, root, inode); 1679 ret = btrfs_update_inode(trans, root, inode);
1672 BUG_ON(ret); 1680 BUG_ON(ret);
1673 btrfs_end_transaction(trans, root);
1674 } 1681 }
1675 goto out; 1682 goto out;
1676 } 1683 }
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1680 0, &cached_state, GFP_NOFS); 1687 0, &cached_state, GFP_NOFS);
1681 1688
1682 trans = btrfs_join_transaction(root, 1); 1689 trans = btrfs_join_transaction(root, 1);
1690 btrfs_set_trans_block_group(trans, inode);
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1683 1692
1684 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1685 compressed = 1; 1694 compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1711 add_pending_csums(trans, inode, ordered_extent->file_offset, 1720 add_pending_csums(trans, inode, ordered_extent->file_offset,
1712 &ordered_extent->list); 1721 &ordered_extent->list);
1713 1722
1714 /* this also removes the ordered extent from the tree */
1715 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1716 ret = btrfs_update_inode(trans, root, inode); 1724 ret = btrfs_update_inode(trans, root, inode);
1717 BUG_ON(ret); 1725 BUG_ON(ret);
1718 btrfs_end_transaction(trans, root);
1719out: 1726out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1728 if (trans)
1729 btrfs_end_transaction(trans, root);
1720 /* once for us */ 1730 /* once for us */
1721 btrfs_put_ordered_extent(ordered_extent); 1731 btrfs_put_ordered_extent(ordered_extent);
1722 /* once for the tree */ 1732 /* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1838 1848
1839 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1840 failrec->last_mirror, 1850 failrec->last_mirror,
1841 failrec->bio_flags); 1851 failrec->bio_flags, 0);
1842 return 0; 1852 return 0;
1843} 1853}
1844 1854
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1993} 2003}
1994 2004
1995/* 2005/*
2006 * calculate extra metadata reservation when snapshotting a subvolume
2007 * contains orphan files.
2008 */
2009void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2010 struct btrfs_pending_snapshot *pending,
2011 u64 *bytes_to_reserve)
2012{
2013 struct btrfs_root *root;
2014 struct btrfs_block_rsv *block_rsv;
2015 u64 num_bytes;
2016 int index;
2017
2018 root = pending->root;
2019 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2020 return;
2021
2022 block_rsv = root->orphan_block_rsv;
2023
2024 /* orphan block reservation for the snapshot */
2025 num_bytes = block_rsv->size;
2026
2027 /*
2028 * after the snapshot is created, COWing tree blocks may use more
2029 * space than it frees. So we should make sure there is enough
2030 * reserved space.
2031 */
2032 index = trans->transid & 0x1;
2033 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2034 num_bytes += block_rsv->size -
2035 (block_rsv->reserved + block_rsv->freed[index]);
2036 }
2037
2038 *bytes_to_reserve += num_bytes;
2039}
2040
2041void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2042 struct btrfs_pending_snapshot *pending)
2043{
2044 struct btrfs_root *root = pending->root;
2045 struct btrfs_root *snap = pending->snap;
2046 struct btrfs_block_rsv *block_rsv;
2047 u64 num_bytes;
2048 int index;
2049 int ret;
2050
2051 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2052 return;
2053
2054 /* refill source subvolume's orphan block reservation */
2055 block_rsv = root->orphan_block_rsv;
2056 index = trans->transid & 0x1;
2057 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2058 num_bytes = block_rsv->size -
2059 (block_rsv->reserved + block_rsv->freed[index]);
2060 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2061 root->orphan_block_rsv,
2062 num_bytes);
2063 BUG_ON(ret);
2064 }
2065
2066 /* setup orphan block reservation for the snapshot */
2067 block_rsv = btrfs_alloc_block_rsv(snap);
2068 BUG_ON(!block_rsv);
2069
2070 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2071 snap->orphan_block_rsv = block_rsv;
2072
2073 num_bytes = root->orphan_block_rsv->size;
2074 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2075 block_rsv, num_bytes);
2076 BUG_ON(ret);
2077
2078#if 0
2079 /* insert orphan item for the snapshot */
2080 WARN_ON(!root->orphan_item_inserted);
2081 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2082 snap->root_key.objectid);
2083 BUG_ON(ret);
2084 snap->orphan_item_inserted = 1;
2085#endif
2086}
2087
2088enum btrfs_orphan_cleanup_state {
2089 ORPHAN_CLEANUP_STARTED = 1,
2090 ORPHAN_CLEANUP_DONE = 2,
2091};
2092
2093/*
2094 * This is called in transaction commmit time. If there are no orphan
2095 * files in the subvolume, it removes orphan item and frees block_rsv
2096 * structure.
2097 */
2098void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2099 struct btrfs_root *root)
2100{
2101 int ret;
2102
2103 if (!list_empty(&root->orphan_list) ||
2104 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2105 return;
2106
2107 if (root->orphan_item_inserted &&
2108 btrfs_root_refs(&root->root_item) > 0) {
2109 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2110 root->root_key.objectid);
2111 BUG_ON(ret);
2112 root->orphan_item_inserted = 0;
2113 }
2114
2115 if (root->orphan_block_rsv) {
2116 WARN_ON(root->orphan_block_rsv->size > 0);
2117 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2118 root->orphan_block_rsv = NULL;
2119 }
2120}
2121
2122/*
1996 * This creates an orphan entry for the given inode in case something goes 2123 * This creates an orphan entry for the given inode in case something goes
1997 * wrong in the middle of an unlink/truncate. 2124 * wrong in the middle of an unlink/truncate.
2125 *
2126 * NOTE: caller of this function should reserve 5 units of metadata for
2127 * this function.
1998 */ 2128 */
1999int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2129int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2000{ 2130{
2001 struct btrfs_root *root = BTRFS_I(inode)->root; 2131 struct btrfs_root *root = BTRFS_I(inode)->root;
2002 int ret = 0; 2132 struct btrfs_block_rsv *block_rsv = NULL;
2133 int reserve = 0;
2134 int insert = 0;
2135 int ret;
2003 2136
2004 spin_lock(&root->list_lock); 2137 if (!root->orphan_block_rsv) {
2138 block_rsv = btrfs_alloc_block_rsv(root);
2139 BUG_ON(!block_rsv);
2140 }
2005 2141
2006 /* already on the orphan list, we're good */ 2142 spin_lock(&root->orphan_lock);
2007 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2143 if (!root->orphan_block_rsv) {
2008 spin_unlock(&root->list_lock); 2144 root->orphan_block_rsv = block_rsv;
2009 return 0; 2145 } else if (block_rsv) {
2146 btrfs_free_block_rsv(root, block_rsv);
2147 block_rsv = NULL;
2148 }
2149
2150 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2151 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2152#if 0
2153 /*
2154 * For proper ENOSPC handling, we should do orphan
2155 * cleanup when mounting. But this introduces backward
2156 * compatibility issue.
2157 */
2158 if (!xchg(&root->orphan_item_inserted, 1))
2159 insert = 2;
2160 else
2161 insert = 1;
2162#endif
2163 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2010 } 2166 }
2011 2167
2012 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2168 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2169 BTRFS_I(inode)->orphan_meta_reserved = 1;
2170 reserve = 1;
2171 }
2172 spin_unlock(&root->orphan_lock);
2013 2173
2014 spin_unlock(&root->list_lock); 2174 if (block_rsv)
2175 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2015 2176
2016 /* 2177 /* grab metadata reservation from transaction handle */
2017 * insert an orphan item to track this unlinked/truncated file 2178 if (reserve) {
2018 */ 2179 ret = btrfs_orphan_reserve_metadata(trans, inode);
2019 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2180 BUG_ON(ret);
2181 }
2020 2182
2021 return ret; 2183 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2186 BUG_ON(ret);
2187 }
2188
2189 /* insert an orphan item to track subvolume contains orphan files */
2190 if (insert >= 2) {
2191 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2192 root->root_key.objectid);
2193 BUG_ON(ret);
2194 }
2195 return 0;
2022} 2196}
2023 2197
2024/* 2198/*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2028int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2202int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2029{ 2203{
2030 struct btrfs_root *root = BTRFS_I(inode)->root; 2204 struct btrfs_root *root = BTRFS_I(inode)->root;
2205 int delete_item = 0;
2206 int release_rsv = 0;
2031 int ret = 0; 2207 int ret = 0;
2032 2208
2033 spin_lock(&root->list_lock); 2209 spin_lock(&root->orphan_lock);
2034 2210 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2035 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2211 list_del_init(&BTRFS_I(inode)->i_orphan);
2036 spin_unlock(&root->list_lock); 2212 delete_item = 1;
2037 return 0;
2038 } 2213 }
2039 2214
2040 list_del_init(&BTRFS_I(inode)->i_orphan); 2215 if (BTRFS_I(inode)->orphan_meta_reserved) {
2041 if (!trans) { 2216 BTRFS_I(inode)->orphan_meta_reserved = 0;
2042 spin_unlock(&root->list_lock); 2217 release_rsv = 1;
2043 return 0;
2044 } 2218 }
2219 spin_unlock(&root->orphan_lock);
2045 2220
2046 spin_unlock(&root->list_lock); 2221 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2223 BUG_ON(ret);
2224 }
2047 2225
2048 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2226 if (release_rsv)
2227 btrfs_orphan_release_metadata(inode);
2049 2228
2050 return ret; 2229 return 0;
2051} 2230}
2052 2231
2053/* 2232/*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2064 struct inode *inode; 2243 struct inode *inode;
2065 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2244 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2066 2245
2067 if (!xchg(&root->clean_orphans, 0)) 2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2068 return; 2247 return;
2069 2248
2070 path = btrfs_alloc_path(); 2249 path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2117 found_key.type = BTRFS_INODE_ITEM_KEY; 2296 found_key.type = BTRFS_INODE_ITEM_KEY;
2118 found_key.offset = 0; 2297 found_key.offset = 0;
2119 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2120 if (IS_ERR(inode)) 2299 BUG_ON(IS_ERR(inode));
2121 break;
2122 2300
2123 /* 2301 /*
2124 * add this inode to the orphan list so btrfs_orphan_del does 2302 * add this inode to the orphan list so btrfs_orphan_del does
2125 * the proper thing when we hit it 2303 * the proper thing when we hit it
2126 */ 2304 */
2127 spin_lock(&root->list_lock); 2305 spin_lock(&root->orphan_lock);
2128 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2306 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2129 spin_unlock(&root->list_lock); 2307 spin_unlock(&root->orphan_lock);
2130 2308
2131 /* 2309 /*
2132 * if this is a bad inode, means we actually succeeded in 2310 * if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2135 * do a destroy_inode 2313 * do a destroy_inode
2136 */ 2314 */
2137 if (is_bad_inode(inode)) { 2315 if (is_bad_inode(inode)) {
2138 trans = btrfs_start_transaction(root, 1); 2316 trans = btrfs_start_transaction(root, 0);
2139 btrfs_orphan_del(trans, inode); 2317 btrfs_orphan_del(trans, inode);
2140 btrfs_end_transaction(trans, root); 2318 btrfs_end_transaction(trans, root);
2141 iput(inode); 2319 iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2153 /* this will do delete_inode and everything for us */ 2331 /* this will do delete_inode and everything for us */
2154 iput(inode); 2332 iput(inode);
2155 } 2333 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337
2338 if (root->orphan_block_rsv)
2339 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2340 (u64)-1);
2341
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1);
2344 btrfs_end_transaction(trans, root);
2345 }
2156 2346
2157 if (nr_unlink) 2347 if (nr_unlink)
2158 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2159 if (nr_truncate) 2349 if (nr_truncate)
2160 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2161
2162 btrfs_free_path(path);
2163} 2351}
2164 2352
2165/* 2353/*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
2494 */ 2720 /* check if there is someone else holds reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret;
2498 2723
2499 trans = btrfs_start_transaction(root, 1); 2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2729
2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2503 } 2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841}
2842
2843static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845{
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851}
2852
2853static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854{
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521} 2882}
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587{ 2948{
2588 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2950 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2598 2958
2599 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2960 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2607 }
2608 2962
2609 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2610 2964
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2628out: 2982out:
2629 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2633 2986
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2987 return err;
2637} 2988}
2638 2989
@@ -3029,6 +3380,7 @@ out:
3029 if (pending_del_nr) { 3380 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3032 } 3384 }
3033 btrfs_free_path(path); 3385 btrfs_free_path(path);
3034 return err; 3386 return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3408
3057 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3410 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3412 if (ret)
3065 goto out; 3413 goto out;
3066 3414
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068again: 3416again:
3069 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3070 if (!page) { 3418 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3420 goto out;
3074 } 3421 }
3075 3422
@@ -3132,8 +3479,7 @@ again:
3132 3479
3133out_unlock: 3480out_unlock:
3134 if (ret) 3481 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3483 unlock_page(page);
3138 page_cache_release(page); 3484 page_cache_release(page);
3139out: 3485out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3494 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3185 3531
3186 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3188 break; 3535 break;
3189 3536 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3192 3538
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3551 last_byte - 1, 0);
3206 3552
3207 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3554 }
3210 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3211 cur_offset = last_byte; 3557 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3213 break; 3559 break;
3214 } 3560 }
3215 3561
3562 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3564 GFP_NOFS);
3218 return err; 3565 return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3586 }
3240 } 3587 }
3241 3588
3242 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3590 if (IS_ERR(trans))
3244 return ret; 3591 return PTR_ERR(trans);
3245 3592
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3248 3594
3249 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3597
3252 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3256 3601
3257 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3611
3267 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3269 3617
3270 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3346 3694
3347 while (1) { 3695 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3351 3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3353 break; 3712 break;
3354 3713
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3716 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3359 } 3719 }
3360 3720
3361 if (ret == 0) { 3721 if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
3596 return 0; 3956 return 0;
3597} 3957}
3598 3958
3599static noinline void init_btrfs_i(struct inode *inode)
3600{
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626}
3627
3628static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629{ 3960{
3630 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3965 return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4022 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3694 4024
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4279 int ret = 0;
3952 4280
3953 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4282 return 0;
3955 4283
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971{ 4299{
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
3974 4306
3975 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, lets try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: fail to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: fail to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
3978 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
3979} 4335}
3980 4336
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4092 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4093 * number 4449 * number
4094 */ 4450 */
4095 init_btrfs_i(inode);
4096 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4097 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4098 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4121,16 +4476,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4121 if (ret != 0) 4476 if (ret != 0)
4122 goto fail; 4477 goto fail;
4123 4478
4124 inode->i_uid = current_fsuid(); 4479 inode_init_owner(inode, dir, mode);
4125
4126 if (dir && (dir->i_mode & S_ISGID)) {
4127 inode->i_gid = dir->i_gid;
4128 if (S_ISDIR(mode))
4129 mode |= S_ISGID;
4130 } else
4131 inode->i_gid = current_fsgid();
4132
4133 inode->i_mode = mode;
4134 inode->i_ino = objectid; 4480 inode->i_ino = objectid;
4135 inode_set_bytes(inode, 0); 4481 inode_set_bytes(inode, 0);
4136 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4482 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4256,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4256 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4257 return -EINVAL; 4603 return -EINVAL;
4258 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4259 /* 4609 /*
4260 * 2 for inode item and ref 4610 * 2 for inode item and ref
4261 * 2 for dir items 4611 * 2 for dir items
4262 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4263 */ 4613 */
4264 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4265 if (err) 4615 if (IS_ERR(trans))
4266 return err; 4616 return PTR_ERR(trans);
4267 4617
4268 trans = btrfs_start_transaction(root, 1);
4269 if (!trans)
4270 goto fail;
4271 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4272 4619
4273 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4274 if (err) {
4275 err = -ENOSPC;
4276 goto out_unlock;
4277 }
4278
4279 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4280 dentry->d_name.len, 4621 dentry->d_name.len,
4281 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4304,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4304out_unlock: 4645out_unlock:
4305 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4306 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4307fail: 4648 btrfs_btree_balance_dirty(root, nr);
4308 btrfs_unreserve_metadata_space(root, 5);
4309 if (drop_inode) { 4649 if (drop_inode) {
4310 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4311 iput(inode); 4651 iput(inode);
4312 } 4652 }
4313 btrfs_btree_balance_dirty(root, nr);
4314 return err; 4653 return err;
4315} 4654}
4316 4655
@@ -4320,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4320 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4321 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4322 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4323 int err;
4324 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4325 unsigned long nr = 0; 4664 unsigned long nr = 0;
4326 u64 objectid; 4665 u64 objectid;
4327 u64 index = 0; 4666 u64 index = 0;
4328 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4329 /* 4671 /*
4330 * 2 for inode item and ref 4672 * 2 for inode item and ref
4331 * 2 for dir items 4673 * 2 for dir items
4332 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4333 */ 4675 */
4334 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4335 if (err) 4677 if (IS_ERR(trans))
4336 return err; 4678 return PTR_ERR(trans);
4337 4679
4338 trans = btrfs_start_transaction(root, 1);
4339 if (!trans)
4340 goto fail;
4341 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4342 4681
4343 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4344 if (err) {
4345 err = -ENOSPC;
4346 goto out_unlock;
4347 }
4348
4349 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4350 dentry->d_name.len, 4683 dentry->d_name.len,
4351 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4377,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4377out_unlock: 4710out_unlock:
4378 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4379 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4380fail:
4381 btrfs_unreserve_metadata_space(root, 5);
4382 if (drop_inode) { 4713 if (drop_inode) {
4383 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4384 iput(inode); 4715 iput(inode);
@@ -4405,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4405 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4406 return -EPERM; 4737 return -EPERM;
4407 4738
4408 /*
4409 * 1 item for inode ref
4410 * 2 items for dir items
4411 */
4412 err = btrfs_reserve_metadata_space(root, 3);
4413 if (err)
4414 return err;
4415
4416 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4417 4740
4418 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4419 if (err) 4742 if (err)
4420 goto fail; 4743 goto fail;
4421 4744
4422 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4423 4754
4424 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4425 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4438,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4438 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4439 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4440fail: 4771fail:
4441 btrfs_unreserve_metadata_space(root, 3);
4442 if (drop_inode) { 4772 if (drop_inode) {
4443 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4444 iput(inode); 4774 iput(inode);
@@ -4458,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4458 u64 index = 0; 4788 u64 index = 0;
4459 unsigned long nr = 1; 4789 unsigned long nr = 1;
4460 4790
4791 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4792 if (err)
4793 return err;
4794
4461 /* 4795 /*
4462 * 2 items for inode and ref 4796 * 2 items for inode and ref
4463 * 2 items for dir items 4797 * 2 items for dir items
4464 * 1 for xattr if selinux is on 4798 * 1 for xattr if selinux is on
4465 */ 4799 */
4466 err = btrfs_reserve_metadata_space(root, 5); 4800 trans = btrfs_start_transaction(root, 5);
4467 if (err) 4801 if (IS_ERR(trans))
4468 return err; 4802 return PTR_ERR(trans);
4469
4470 trans = btrfs_start_transaction(root, 1);
4471 if (!trans) {
4472 err = -ENOMEM;
4473 goto out_unlock;
4474 }
4475 btrfs_set_trans_block_group(trans, dir); 4803 btrfs_set_trans_block_group(trans, dir);
4476 4804
4477 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4478 if (err) {
4479 err = -ENOSPC;
4480 goto out_fail;
4481 }
4482
4483 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4484 dentry->d_name.len, 4806 dentry->d_name.len,
4485 dentry->d_parent->d_inode->i_ino, objectid, 4807 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4519,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4519out_fail: 4841out_fail:
4520 nr = trans->blocks_used; 4842 nr = trans->blocks_used;
4521 btrfs_end_transaction_throttle(trans, root); 4843 btrfs_end_transaction_throttle(trans, root);
4522
4523out_unlock:
4524 btrfs_unreserve_metadata_space(root, 5);
4525 if (drop_on_err) 4844 if (drop_on_err)
4526 iput(inode); 4845 iput(inode);
4527 btrfs_btree_balance_dirty(root, nr); 4846 btrfs_btree_balance_dirty(root, nr);
@@ -4779,6 +5098,7 @@ again:
4779 } 5098 }
4780 flush_dcache_page(page); 5099 flush_dcache_page(page);
4781 } else if (create && PageUptodate(page)) { 5100 } else if (create && PageUptodate(page)) {
5101 WARN_ON(1);
4782 if (!trans) { 5102 if (!trans) {
4783 kunmap(page); 5103 kunmap(page);
4784 free_extent_map(em); 5104 free_extent_map(em);
@@ -4875,11 +5195,651 @@ out:
4875 return em; 5195 return em;
4876} 5196}
4877 5197
5198static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5199 u64 start, u64 len)
5200{
5201 struct btrfs_root *root = BTRFS_I(inode)->root;
5202 struct btrfs_trans_handle *trans;
5203 struct extent_map *em;
5204 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5205 struct btrfs_key ins;
5206 u64 alloc_hint;
5207 int ret;
5208
5209 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5210
5211 trans = btrfs_join_transaction(root, 0);
5212 if (!trans)
5213 return ERR_PTR(-ENOMEM);
5214
5215 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5216
5217 alloc_hint = get_extent_allocation_hint(inode, start, len);
5218 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5219 alloc_hint, (u64)-1, &ins, 1);
5220 if (ret) {
5221 em = ERR_PTR(ret);
5222 goto out;
5223 }
5224
5225 em = alloc_extent_map(GFP_NOFS);
5226 if (!em) {
5227 em = ERR_PTR(-ENOMEM);
5228 goto out;
5229 }
5230
5231 em->start = start;
5232 em->orig_start = em->start;
5233 em->len = ins.offset;
5234
5235 em->block_start = ins.objectid;
5236 em->block_len = ins.offset;
5237 em->bdev = root->fs_info->fs_devices->latest_bdev;
5238 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5239
5240 while (1) {
5241 write_lock(&em_tree->lock);
5242 ret = add_extent_mapping(em_tree, em);
5243 write_unlock(&em_tree->lock);
5244 if (ret != -EEXIST)
5245 break;
5246 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5247 }
5248
5249 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5250 ins.offset, ins.offset, 0);
5251 if (ret) {
5252 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5253 em = ERR_PTR(ret);
5254 }
5255out:
5256 btrfs_end_transaction(trans, root);
5257 return em;
5258}
5259
5260/*
5261 * returns 1 when the nocow is safe, < 1 on error, 0 if the
5262 * block must be cow'd
5263 */
5264static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5265 struct inode *inode, u64 offset, u64 len)
5266{
5267 struct btrfs_path *path;
5268 int ret;
5269 struct extent_buffer *leaf;
5270 struct btrfs_root *root = BTRFS_I(inode)->root;
5271 struct btrfs_file_extent_item *fi;
5272 struct btrfs_key key;
5273 u64 disk_bytenr;
5274 u64 backref_offset;
5275 u64 extent_end;
5276 u64 num_bytes;
5277 int slot;
5278 int found_type;
5279
5280 path = btrfs_alloc_path();
5281 if (!path)
5282 return -ENOMEM;
5283
5284 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5285 offset, 0);
5286 if (ret < 0)
5287 goto out;
5288
5289 slot = path->slots[0];
5290 if (ret == 1) {
5291 if (slot == 0) {
5292 /* can't find the item, must cow */
5293 ret = 0;
5294 goto out;
5295 }
5296 slot--;
5297 }
5298 ret = 0;
5299 leaf = path->nodes[0];
5300 btrfs_item_key_to_cpu(leaf, &key, slot);
5301 if (key.objectid != inode->i_ino ||
5302 key.type != BTRFS_EXTENT_DATA_KEY) {
5303 /* not our file or wrong item type, must cow */
5304 goto out;
5305 }
5306
5307 if (key.offset > offset) {
5308 /* Wrong offset, must cow */
5309 goto out;
5310 }
5311
5312 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5313 found_type = btrfs_file_extent_type(leaf, fi);
5314 if (found_type != BTRFS_FILE_EXTENT_REG &&
5315 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5316 /* not a regular extent, must cow */
5317 goto out;
5318 }
5319 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5320 backref_offset = btrfs_file_extent_offset(leaf, fi);
5321
5322 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5323 if (extent_end < offset + len) {
5324 /* extent doesn't include our full range, must cow */
5325 goto out;
5326 }
5327
5328 if (btrfs_extent_readonly(root, disk_bytenr))
5329 goto out;
5330
5331 /*
5332 * look for other files referencing this extent, if we
5333 * find any we must cow
5334 */
5335 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5336 key.offset - backref_offset, disk_bytenr))
5337 goto out;
5338
5339 /*
5340 * adjust disk_bytenr and num_bytes to cover just the bytes
5341 * in this extent we are about to write. If there
5342 * are any csums in that range we have to cow in order
5343 * to keep the csums correct
5344 */
5345 disk_bytenr += backref_offset;
5346 disk_bytenr += offset - key.offset;
5347 num_bytes = min(offset + len, extent_end) - offset;
5348 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5349 goto out;
5350 /*
5351 * all of the above have passed, it is safe to overwrite this extent
5352 * without cow
5353 */
5354 ret = 1;
5355out:
5356 btrfs_free_path(path);
5357 return ret;
5358}
5359
5360static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5361 struct buffer_head *bh_result, int create)
5362{
5363 struct extent_map *em;
5364 struct btrfs_root *root = BTRFS_I(inode)->root;
5365 u64 start = iblock << inode->i_blkbits;
5366 u64 len = bh_result->b_size;
5367 struct btrfs_trans_handle *trans;
5368
5369 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5370 if (IS_ERR(em))
5371 return PTR_ERR(em);
5372
5373 /*
5374 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
5375 * io. INLINE is special, and we could probably kludge it in here, but
5376 * it's still buffered so for safety lets just fall back to the generic
5377 * buffered path.
5378 *
5379 * For COMPRESSED we _have_ to read the entire extent in so we can
5380 * decompress it, so there will be buffering required no matter what we
5381 * do, so go ahead and fallback to buffered.
5382 *
5383 * We return -ENOTBLK because thats what makes DIO go ahead and go back
5384 * to buffered IO. Don't blame me, this is the price we pay for using
5385 * the generic code.
5386 */
5387 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5388 em->block_start == EXTENT_MAP_INLINE) {
5389 free_extent_map(em);
5390 return -ENOTBLK;
5391 }
5392
5393 /* Just a good old fashioned hole, return */
5394 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5395 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5396 free_extent_map(em);
5397 /* DIO will do one hole at a time, so just unlock a sector */
5398 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5399 start + root->sectorsize - 1, GFP_NOFS);
5400 return 0;
5401 }
5402
5403 /*
5404 * We don't allocate a new extent in the following cases
5405 *
5406 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5407 * existing extent.
5408 * 2) The extent is marked as PREALLOC. We're good to go here and can
5409 * just use the extent.
5410 *
5411 */
5412 if (!create) {
5413 len = em->len - (start - em->start);
5414 goto map;
5415 }
5416
5417 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5418 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5419 em->block_start != EXTENT_MAP_HOLE)) {
5420 int type;
5421 int ret;
5422 u64 block_start;
5423
5424 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5425 type = BTRFS_ORDERED_PREALLOC;
5426 else
5427 type = BTRFS_ORDERED_NOCOW;
5428 len = min(len, em->len - (start - em->start));
5429 block_start = em->block_start + (start - em->start);
5430
5431 /*
5432 * we're not going to log anything, but we do need
5433 * to make sure the current transaction stays open
5434 * while we look for nocow cross refs
5435 */
5436 trans = btrfs_join_transaction(root, 0);
5437 if (!trans)
5438 goto must_cow;
5439
5440 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5441 ret = btrfs_add_ordered_extent_dio(inode, start,
5442 block_start, len, len, type);
5443 btrfs_end_transaction(trans, root);
5444 if (ret) {
5445 free_extent_map(em);
5446 return ret;
5447 }
5448 goto unlock;
5449 }
5450 btrfs_end_transaction(trans, root);
5451 }
5452must_cow:
5453 /*
5454 * this will cow the extent, reset the len in case we changed
5455 * it above
5456 */
5457 len = bh_result->b_size;
5458 free_extent_map(em);
5459 em = btrfs_new_extent_direct(inode, start, len);
5460 if (IS_ERR(em))
5461 return PTR_ERR(em);
5462 len = min(len, em->len - (start - em->start));
5463unlock:
5464 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5465 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5466 0, NULL, GFP_NOFS);
5467map:
5468 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5469 inode->i_blkbits;
5470 bh_result->b_size = len;
5471 bh_result->b_bdev = em->bdev;
5472 set_buffer_mapped(bh_result);
5473 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5474 set_buffer_new(bh_result);
5475
5476 free_extent_map(em);
5477
5478 return 0;
5479}
5480
5481struct btrfs_dio_private {
5482 struct inode *inode;
5483 u64 logical_offset;
5484 u64 disk_bytenr;
5485 u64 bytes;
5486 u32 *csums;
5487 void *private;
5488};
5489
5490static void btrfs_endio_direct_read(struct bio *bio, int err)
5491{
5492 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5493 struct bio_vec *bvec = bio->bi_io_vec;
5494 struct btrfs_dio_private *dip = bio->bi_private;
5495 struct inode *inode = dip->inode;
5496 struct btrfs_root *root = BTRFS_I(inode)->root;
5497 u64 start;
5498 u32 *private = dip->csums;
5499
5500 start = dip->logical_offset;
5501 do {
5502 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5503 struct page *page = bvec->bv_page;
5504 char *kaddr;
5505 u32 csum = ~(u32)0;
5506 unsigned long flags;
5507
5508 local_irq_save(flags);
5509 kaddr = kmap_atomic(page, KM_IRQ0);
5510 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5511 csum, bvec->bv_len);
5512 btrfs_csum_final(csum, (char *)&csum);
5513 kunmap_atomic(kaddr, KM_IRQ0);
5514 local_irq_restore(flags);
5515
5516 flush_dcache_page(bvec->bv_page);
5517 if (csum != *private) {
5518 printk(KERN_ERR "btrfs csum failed ino %lu off"
5519 " %llu csum %u private %u\n",
5520 inode->i_ino, (unsigned long long)start,
5521 csum, *private);
5522 err = -EIO;
5523 }
5524 }
5525
5526 start += bvec->bv_len;
5527 private++;
5528 bvec++;
5529 } while (bvec <= bvec_end);
5530
5531 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5532 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5533 bio->bi_private = dip->private;
5534
5535 kfree(dip->csums);
5536 kfree(dip);
5537 dio_end_io(bio, err);
5538}
5539
5540static void btrfs_endio_direct_write(struct bio *bio, int err)
5541{
5542 struct btrfs_dio_private *dip = bio->bi_private;
5543 struct inode *inode = dip->inode;
5544 struct btrfs_root *root = BTRFS_I(inode)->root;
5545 struct btrfs_trans_handle *trans;
5546 struct btrfs_ordered_extent *ordered = NULL;
5547 struct extent_state *cached_state = NULL;
5548 int ret;
5549
5550 if (err)
5551 goto out_done;
5552
5553 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5554 dip->logical_offset, dip->bytes);
5555 if (!ret)
5556 goto out_done;
5557
5558 BUG_ON(!ordered);
5559
5560 trans = btrfs_join_transaction(root, 1);
5561 if (!trans) {
5562 err = -ENOMEM;
5563 goto out;
5564 }
5565 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5566
5567 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5568 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5569 if (!ret)
5570 ret = btrfs_update_inode(trans, root, inode);
5571 err = ret;
5572 goto out;
5573 }
5574
5575 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5576 ordered->file_offset + ordered->len - 1, 0,
5577 &cached_state, GFP_NOFS);
5578
5579 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5580 ret = btrfs_mark_extent_written(trans, inode,
5581 ordered->file_offset,
5582 ordered->file_offset +
5583 ordered->len);
5584 if (ret) {
5585 err = ret;
5586 goto out_unlock;
5587 }
5588 } else {
5589 ret = insert_reserved_file_extent(trans, inode,
5590 ordered->file_offset,
5591 ordered->start,
5592 ordered->disk_len,
5593 ordered->len,
5594 ordered->len,
5595 0, 0, 0,
5596 BTRFS_FILE_EXTENT_REG);
5597 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5598 ordered->file_offset, ordered->len);
5599 if (ret) {
5600 err = ret;
5601 WARN_ON(1);
5602 goto out_unlock;
5603 }
5604 }
5605
5606 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5607 btrfs_ordered_update_i_size(inode, 0, ordered);
5608 btrfs_update_inode(trans, root, inode);
5609out_unlock:
5610 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5611 ordered->file_offset + ordered->len - 1,
5612 &cached_state, GFP_NOFS);
5613out:
5614 btrfs_delalloc_release_metadata(inode, ordered->len);
5615 btrfs_end_transaction(trans, root);
5616 btrfs_put_ordered_extent(ordered);
5617 btrfs_put_ordered_extent(ordered);
5618out_done:
5619 bio->bi_private = dip->private;
5620
5621 kfree(dip->csums);
5622 kfree(dip);
5623 dio_end_io(bio, err);
5624}
5625
5626static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5627 struct bio *bio, int mirror_num,
5628 unsigned long bio_flags, u64 offset)
5629{
5630 int ret;
5631 struct btrfs_root *root = BTRFS_I(inode)->root;
5632 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5633 BUG_ON(ret);
5634 return 0;
5635}
5636
5637static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5638 loff_t file_offset)
5639{
5640 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum;
5645 int write = rw & (1 << BIO_RW);
5646 int ret = 0;
5647
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5649
5650 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5651 if (!dip) {
5652 ret = -ENOMEM;
5653 goto free_ordered;
5654 }
5655 dip->csums = NULL;
5656
5657 if (!skip_sum) {
5658 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5659 if (!dip->csums) {
5660 ret = -ENOMEM;
5661 goto free_ordered;
5662 }
5663 }
5664
5665 dip->private = bio->bi_private;
5666 dip->inode = inode;
5667 dip->logical_offset = file_offset;
5668
5669 start = dip->logical_offset;
5670 dip->bytes = 0;
5671 do {
5672 dip->bytes += bvec->bv_len;
5673 bvec++;
5674 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5675
5676 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5677 bio->bi_private = dip;
5678
5679 if (write)
5680 bio->bi_end_io = btrfs_endio_direct_write;
5681 else
5682 bio->bi_end_io = btrfs_endio_direct_read;
5683
5684 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5685 if (ret)
5686 goto out_err;
5687
5688 if (write && !skip_sum) {
5689 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5690 inode, rw, bio, 0, 0,
5691 dip->logical_offset,
5692 __btrfs_submit_bio_start_direct_io,
5693 __btrfs_submit_bio_done);
5694 if (ret)
5695 goto out_err;
5696 return;
5697 } else if (!skip_sum)
5698 btrfs_lookup_bio_sums_dio(root, inode, bio,
5699 dip->logical_offset, dip->csums);
5700
5701 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5702 if (ret)
5703 goto out_err;
5704 return;
5705out_err:
5706 kfree(dip->csums);
5707 kfree(dip);
5708free_ordered:
5709 /*
5710 * If this is a write, we need to clean up the reserved space and kill
5711 * the ordered extent.
5712 */
5713 if (write) {
5714 struct btrfs_ordered_extent *ordered;
5715 ordered = btrfs_lookup_ordered_extent(inode,
5716 dip->logical_offset);
5717 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5718 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5719 btrfs_free_reserved_extent(root, ordered->start,
5720 ordered->disk_len);
5721 btrfs_put_ordered_extent(ordered);
5722 btrfs_put_ordered_extent(ordered);
5723 }
5724 bio_endio(bio, ret);
5725}
5726
5727static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5728 const struct iovec *iov, loff_t offset,
5729 unsigned long nr_segs)
5730{
5731 int seg;
5732 size_t size;
5733 unsigned long addr;
5734 unsigned blocksize_mask = root->sectorsize - 1;
5735 ssize_t retval = -EINVAL;
5736 loff_t end = offset;
5737
5738 if (offset & blocksize_mask)
5739 goto out;
5740
5741 /* Check the memory alignment. Blocks cannot straddle pages */
5742 for (seg = 0; seg < nr_segs; seg++) {
5743 addr = (unsigned long)iov[seg].iov_base;
5744 size = iov[seg].iov_len;
5745 end += size;
5746 if ((addr & blocksize_mask) || (size & blocksize_mask))
5747 goto out;
5748 }
5749 retval = 0;
5750out:
5751 return retval;
5752}
4878static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5753static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4879 const struct iovec *iov, loff_t offset, 5754 const struct iovec *iov, loff_t offset,
4880 unsigned long nr_segs) 5755 unsigned long nr_segs)
4881{ 5756{
4882 return -EINVAL; 5757 struct file *file = iocb->ki_filp;
5758 struct inode *inode = file->f_mapping->host;
5759 struct btrfs_ordered_extent *ordered;
5760 struct extent_state *cached_state = NULL;
5761 u64 lockstart, lockend;
5762 ssize_t ret;
5763 int writing = rw & WRITE;
5764 int write_bits = 0;
5765 size_t count = iov_length(iov, nr_segs);
5766
5767 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5768 offset, nr_segs)) {
5769 return 0;
5770 }
5771
5772 lockstart = offset;
5773 lockend = offset + count - 1;
5774
5775 if (writing) {
5776 ret = btrfs_delalloc_reserve_space(inode, count);
5777 if (ret)
5778 goto out;
5779 }
5780
5781 while (1) {
5782 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5783 0, &cached_state, GFP_NOFS);
5784 /*
5785 * We're concerned with the entire range that we're going to be
5786 * doing DIO to, so we need to make sure theres no ordered
5787 * extents in this range.
5788 */
5789 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5790 lockend - lockstart + 1);
5791 if (!ordered)
5792 break;
5793 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5794 &cached_state, GFP_NOFS);
5795 btrfs_start_ordered_extent(inode, ordered, 1);
5796 btrfs_put_ordered_extent(ordered);
5797 cond_resched();
5798 }
5799
5800 /*
5801 * we don't use btrfs_set_extent_delalloc because we don't want
5802 * the dirty or uptodate bits
5803 */
5804 if (writing) {
5805 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5806 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5807 EXTENT_DELALLOC, 0, NULL, &cached_state,
5808 GFP_NOFS);
5809 if (ret) {
5810 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5811 lockend, EXTENT_LOCKED | write_bits,
5812 1, 0, &cached_state, GFP_NOFS);
5813 goto out;
5814 }
5815 }
5816
5817 free_extent_state(cached_state);
5818 cached_state = NULL;
5819
5820 ret = __blockdev_direct_IO(rw, iocb, inode,
5821 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5822 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5823 btrfs_submit_direct, 0);
5824
5825 if (ret < 0 && ret != -EIOCBQUEUED) {
5826 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5827 offset + iov_length(iov, nr_segs) - 1,
5828 EXTENT_LOCKED | write_bits, 1, 0,
5829 &cached_state, GFP_NOFS);
5830 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5831 /*
5832 * We're falling back to buffered, unlock the section we didn't
5833 * do IO on.
5834 */
5835 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5836 offset + iov_length(iov, nr_segs) - 1,
5837 EXTENT_LOCKED | write_bits, 1, 0,
5838 &cached_state, GFP_NOFS);
5839 }
5840out:
5841 free_extent_state(cached_state);
5842 return ret;
4883} 5843}
4884 5844
4885static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5845static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5043,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5043 u64 page_start; 6003 u64 page_start;
5044 u64 page_end; 6004 u64 page_end;
5045 6005
5046 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6006 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5047 if (ret) { 6007 if (ret) {
5048 if (ret == -ENOMEM) 6008 if (ret == -ENOMEM)
5049 ret = VM_FAULT_OOM; 6009 ret = VM_FAULT_OOM;
@@ -5052,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5052 goto out; 6012 goto out;
5053 } 6013 }
5054 6014
5055 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5056 if (ret) {
5057 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5058 ret = VM_FAULT_SIGBUS;
5059 goto out;
5060 }
5061
5062 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6015 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5063again: 6016again:
5064 lock_page(page); 6017 lock_page(page);
@@ -5068,7 +6021,6 @@ again:
5068 6021
5069 if ((page->mapping != inode->i_mapping) || 6022 if ((page->mapping != inode->i_mapping) ||
5070 (page_start >= size)) { 6023 (page_start >= size)) {
5071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5072 /* page got truncated out from underneath us */ 6024 /* page got truncated out from underneath us */
5073 goto out_unlock; 6025 goto out_unlock;
5074 } 6026 }
@@ -5109,7 +6061,6 @@ again:
5109 unlock_extent_cached(io_tree, page_start, page_end, 6061 unlock_extent_cached(io_tree, page_start, page_end,
5110 &cached_state, GFP_NOFS); 6062 &cached_state, GFP_NOFS);
5111 ret = VM_FAULT_SIGBUS; 6063 ret = VM_FAULT_SIGBUS;
5112 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5113 goto out_unlock; 6064 goto out_unlock;
5114 } 6065 }
5115 ret = 0; 6066 ret = 0;
@@ -5136,10 +6087,10 @@ again:
5136 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6087 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5137 6088
5138out_unlock: 6089out_unlock:
5139 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5140 if (!ret) 6090 if (!ret)
5141 return VM_FAULT_LOCKED; 6091 return VM_FAULT_LOCKED;
5142 unlock_page(page); 6092 unlock_page(page);
6093 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5143out: 6094out:
5144 return ret; 6095 return ret;
5145} 6096}
@@ -5164,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
5164 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6115 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5165 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6116 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5166 6117
5167 trans = btrfs_start_transaction(root, 1); 6118 trans = btrfs_start_transaction(root, 0);
6119 BUG_ON(IS_ERR(trans));
5168 btrfs_set_trans_block_group(trans, inode); 6120 btrfs_set_trans_block_group(trans, inode);
6121 trans->block_rsv = root->orphan_block_rsv;
5169 6122
5170 /* 6123 /*
5171 * setattr is responsible for setting the ordered_data_close flag, 6124 * setattr is responsible for setting the ordered_data_close flag,
@@ -5188,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
5188 btrfs_add_ordered_operation(trans, root, inode); 6141 btrfs_add_ordered_operation(trans, root, inode);
5189 6142
5190 while (1) { 6143 while (1) {
6144 if (!trans) {
6145 trans = btrfs_start_transaction(root, 0);
6146 BUG_ON(IS_ERR(trans));
6147 btrfs_set_trans_block_group(trans, inode);
6148 trans->block_rsv = root->orphan_block_rsv;
6149 }
6150
6151 ret = btrfs_block_rsv_check(trans, root,
6152 root->orphan_block_rsv, 0, 5);
6153 if (ret) {
6154 BUG_ON(ret != -EAGAIN);
6155 ret = btrfs_commit_transaction(trans, root);
6156 BUG_ON(ret);
6157 trans = NULL;
6158 continue;
6159 }
6160
5191 ret = btrfs_truncate_inode_items(trans, root, inode, 6161 ret = btrfs_truncate_inode_items(trans, root, inode,
5192 inode->i_size, 6162 inode->i_size,
5193 BTRFS_EXTENT_DATA_KEY); 6163 BTRFS_EXTENT_DATA_KEY);
@@ -5199,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
5199 6169
5200 nr = trans->blocks_used; 6170 nr = trans->blocks_used;
5201 btrfs_end_transaction(trans, root); 6171 btrfs_end_transaction(trans, root);
6172 trans = NULL;
5202 btrfs_btree_balance_dirty(root, nr); 6173 btrfs_btree_balance_dirty(root, nr);
5203
5204 trans = btrfs_start_transaction(root, 1);
5205 btrfs_set_trans_block_group(trans, inode);
5206 } 6174 }
5207 6175
5208 if (ret == 0 && inode->i_nlink > 0) { 6176 if (ret == 0 && inode->i_nlink > 0) {
@@ -5263,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5263struct inode *btrfs_alloc_inode(struct super_block *sb) 6231struct inode *btrfs_alloc_inode(struct super_block *sb)
5264{ 6232{
5265 struct btrfs_inode *ei; 6233 struct btrfs_inode *ei;
6234 struct inode *inode;
5266 6235
5267 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6236 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5268 if (!ei) 6237 if (!ei)
5269 return NULL; 6238 return NULL;
6239
6240 ei->root = NULL;
6241 ei->space_info = NULL;
6242 ei->generation = 0;
6243 ei->sequence = 0;
5270 ei->last_trans = 0; 6244 ei->last_trans = 0;
5271 ei->last_sub_trans = 0; 6245 ei->last_sub_trans = 0;
5272 ei->logged_trans = 0; 6246 ei->logged_trans = 0;
5273 ei->outstanding_extents = 0; 6247 ei->delalloc_bytes = 0;
5274 ei->reserved_extents = 0; 6248 ei->reserved_bytes = 0;
5275 ei->root = NULL; 6249 ei->disk_i_size = 0;
6250 ei->flags = 0;
6251 ei->index_cnt = (u64)-1;
6252 ei->last_unlink_trans = 0;
6253
5276 spin_lock_init(&ei->accounting_lock); 6254 spin_lock_init(&ei->accounting_lock);
6255 atomic_set(&ei->outstanding_extents, 0);
6256 ei->reserved_extents = 0;
6257
6258 ei->ordered_data_close = 0;
6259 ei->orphan_meta_reserved = 0;
6260 ei->dummy_inode = 0;
6261 ei->force_compress = 0;
6262
6263 inode = &ei->vfs_inode;
6264 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6265 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6266 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6267 mutex_init(&ei->log_mutex);
5277 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6268 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5278 INIT_LIST_HEAD(&ei->i_orphan); 6269 INIT_LIST_HEAD(&ei->i_orphan);
6270 INIT_LIST_HEAD(&ei->delalloc_inodes);
5279 INIT_LIST_HEAD(&ei->ordered_operations); 6271 INIT_LIST_HEAD(&ei->ordered_operations);
5280 return &ei->vfs_inode; 6272 RB_CLEAR_NODE(&ei->rb_node);
6273
6274 return inode;
5281} 6275}
5282 6276
5283void btrfs_destroy_inode(struct inode *inode) 6277void btrfs_destroy_inode(struct inode *inode)
@@ -5287,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
5287 6281
5288 WARN_ON(!list_empty(&inode->i_dentry)); 6282 WARN_ON(!list_empty(&inode->i_dentry));
5289 WARN_ON(inode->i_data.nrpages); 6283 WARN_ON(inode->i_data.nrpages);
6284 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6285 WARN_ON(BTRFS_I(inode)->reserved_extents);
5290 6286
5291 /* 6287 /*
5292 * This can happen where we create an inode, but somebody else also 6288 * This can happen where we create an inode, but somebody else also
@@ -5307,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
5307 spin_unlock(&root->fs_info->ordered_extent_lock); 6303 spin_unlock(&root->fs_info->ordered_extent_lock);
5308 } 6304 }
5309 6305
5310 spin_lock(&root->list_lock); 6306 spin_lock(&root->orphan_lock);
5311 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5312 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5313 inode->i_ino); 6309 inode->i_ino);
5314 list_del_init(&BTRFS_I(inode)->i_orphan); 6310 list_del_init(&BTRFS_I(inode)->i_orphan);
5315 } 6311 }
5316 spin_unlock(&root->list_lock); 6312 spin_unlock(&root->orphan_lock);
5317 6313
5318 while (1) { 6314 while (1) {
5319 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6315 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5434,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5434 if (S_ISDIR(old_inode->i_mode) && new_inode && 6430 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5435 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6431 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5436 return -ENOTEMPTY; 6432 return -ENOTEMPTY;
5437
5438 /*
5439 * We want to reserve the absolute worst case amount of items. So if
5440 * both inodes are subvols and we need to unlink them then that would
5441 * require 4 item modifications, but if they are both normal inodes it
5442 * would require 5 item modifications, so we'll assume their normal
5443 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5444 * should cover the worst case number of items we'll modify.
5445 */
5446 ret = btrfs_reserve_metadata_space(root, 11);
5447 if (ret)
5448 return ret;
5449
5450 /* 6433 /*
5451 * we're using rename to replace one file with another. 6434 * we're using rename to replace one file with another.
5452 * and the replacement file is large. Start IO on it now so 6435 * and the replacement file is large. Start IO on it now so
@@ -5459,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5459 /* close the racy window with snapshot create/destroy ioctl */ 6442 /* close the racy window with snapshot create/destroy ioctl */
5460 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6443 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5461 down_read(&root->fs_info->subvol_sem); 6444 down_read(&root->fs_info->subvol_sem);
6445 /*
6446 * We want to reserve the absolute worst case amount of items. So if
6447 * both inodes are subvols and we need to unlink them then that would
6448 * require 4 item modifications, but if they are both normal inodes it
6449 * would require 5 item modifications, so we'll assume their normal
6450 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6451 * should cover the worst case number of items we'll modify.
6452 */
6453 trans = btrfs_start_transaction(root, 20);
6454 if (IS_ERR(trans))
6455 return PTR_ERR(trans);
5462 6456
5463 trans = btrfs_start_transaction(root, 1);
5464 btrfs_set_trans_block_group(trans, new_dir); 6457 btrfs_set_trans_block_group(trans, new_dir);
5465 6458
5466 if (dest != root) 6459 if (dest != root)
@@ -5559,7 +6552,6 @@ out_fail:
5559 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6552 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5560 up_read(&root->fs_info->subvol_sem); 6553 up_read(&root->fs_info->subvol_sem);
5561 6554
5562 btrfs_unreserve_metadata_space(root, 11);
5563 return ret; 6555 return ret;
5564} 6556}
5565 6557
@@ -5611,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5611 return 0; 6603 return 0;
5612} 6604}
5613 6605
6606int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6607{
6608 struct btrfs_inode *binode;
6609 struct inode *inode = NULL;
6610
6611 spin_lock(&root->fs_info->delalloc_lock);
6612 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6613 binode = list_entry(root->fs_info->delalloc_inodes.next,
6614 struct btrfs_inode, delalloc_inodes);
6615 inode = igrab(&binode->vfs_inode);
6616 if (inode) {
6617 list_move_tail(&binode->delalloc_inodes,
6618 &root->fs_info->delalloc_inodes);
6619 break;
6620 }
6621
6622 list_del_init(&binode->delalloc_inodes);
6623 cond_resched_lock(&root->fs_info->delalloc_lock);
6624 }
6625 spin_unlock(&root->fs_info->delalloc_lock);
6626
6627 if (inode) {
6628 write_inode_now(inode, 0);
6629 if (delay_iput)
6630 btrfs_add_delayed_iput(inode);
6631 else
6632 iput(inode);
6633 return 1;
6634 }
6635 return 0;
6636}
6637
5614static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6638static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5615 const char *symname) 6639 const char *symname)
5616{ 6640{
@@ -5634,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5634 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6658 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5635 return -ENAMETOOLONG; 6659 return -ENAMETOOLONG;
5636 6660
6661 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6662 if (err)
6663 return err;
5637 /* 6664 /*
5638 * 2 items for inode item and ref 6665 * 2 items for inode item and ref
5639 * 2 items for dir items 6666 * 2 items for dir items
5640 * 1 item for xattr if selinux is on 6667 * 1 item for xattr if selinux is on
5641 */ 6668 */
5642 err = btrfs_reserve_metadata_space(root, 5); 6669 trans = btrfs_start_transaction(root, 5);
5643 if (err) 6670 if (IS_ERR(trans))
5644 return err; 6671 return PTR_ERR(trans);
5645 6672
5646 trans = btrfs_start_transaction(root, 1);
5647 if (!trans)
5648 goto out_fail;
5649 btrfs_set_trans_block_group(trans, dir); 6673 btrfs_set_trans_block_group(trans, dir);
5650 6674
5651 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5652 if (err) {
5653 err = -ENOSPC;
5654 goto out_unlock;
5655 }
5656
5657 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6675 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5658 dentry->d_name.len, 6676 dentry->d_name.len,
5659 dentry->d_parent->d_inode->i_ino, objectid, 6677 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5725,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5725out_unlock: 6743out_unlock:
5726 nr = trans->blocks_used; 6744 nr = trans->blocks_used;
5727 btrfs_end_transaction_throttle(trans, root); 6745 btrfs_end_transaction_throttle(trans, root);
5728out_fail:
5729 btrfs_unreserve_metadata_space(root, 5);
5730 if (drop_inode) { 6746 if (drop_inode) {
5731 inode_dec_link_count(inode); 6747 inode_dec_link_count(inode);
5732 iput(inode); 6748 iput(inode);
@@ -5735,33 +6751,28 @@ out_fail:
5735 return err; 6751 return err;
5736} 6752}
5737 6753
5738static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6754int btrfs_prealloc_file_range(struct inode *inode, int mode,
5739 u64 alloc_hint, int mode, loff_t actual_len) 6755 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint)
5740{ 6757{
5741 struct btrfs_trans_handle *trans; 6758 struct btrfs_trans_handle *trans;
5742 struct btrfs_root *root = BTRFS_I(inode)->root; 6759 struct btrfs_root *root = BTRFS_I(inode)->root;
5743 struct btrfs_key ins; 6760 struct btrfs_key ins;
5744 u64 cur_offset = start; 6761 u64 cur_offset = start;
5745 u64 num_bytes = end - start;
5746 int ret = 0; 6762 int ret = 0;
5747 u64 i_size;
5748 6763
5749 while (num_bytes > 0) { 6764 while (num_bytes > 0) {
5750 trans = btrfs_start_transaction(root, 1); 6765 trans = btrfs_start_transaction(root, 3);
5751 6766 if (IS_ERR(trans)) {
5752 ret = btrfs_reserve_extent(trans, root, num_bytes, 6767 ret = PTR_ERR(trans);
5753 root->sectorsize, 0, alloc_hint, 6768 break;
5754 (u64)-1, &ins, 1);
5755 if (ret) {
5756 WARN_ON(1);
5757 goto stop_trans;
5758 } 6769 }
5759 6770
5760 ret = btrfs_reserve_metadata_space(root, 3); 6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1);
5761 if (ret) { 6773 if (ret) {
5762 btrfs_free_reserved_extent(root, ins.objectid, 6774 btrfs_end_transaction(trans, root);
5763 ins.offset); 6775 break;
5764 goto stop_trans;
5765 } 6776 }
5766 6777
5767 ret = insert_reserved_file_extent(trans, inode, 6778 ret = insert_reserved_file_extent(trans, inode,
@@ -5775,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5775 6786
5776 num_bytes -= ins.offset; 6787 num_bytes -= ins.offset;
5777 cur_offset += ins.offset; 6788 cur_offset += ins.offset;
5778 alloc_hint = ins.objectid + ins.offset; 6789 *alloc_hint = ins.objectid + ins.offset;
5779 6790
5780 inode->i_ctime = CURRENT_TIME; 6791 inode->i_ctime = CURRENT_TIME;
5781 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6792 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5782 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5783 (actual_len > inode->i_size) && 6794 (actual_len > inode->i_size) &&
5784 (cur_offset > inode->i_size)) { 6795 (cur_offset > inode->i_size)) {
5785
5786 if (cur_offset > actual_len) 6796 if (cur_offset > actual_len)
5787 i_size = actual_len; 6797 i_size_write(inode, actual_len);
5788 else 6798 else
5789 i_size = cur_offset; 6799 i_size_write(inode, cur_offset);
5790 i_size_write(inode, i_size); 6800 i_size_write(inode, cur_offset);
5791 btrfs_ordered_update_i_size(inode, i_size, NULL); 6801 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5792 } 6802 }
5793 6803
5794 ret = btrfs_update_inode(trans, root, inode); 6804 ret = btrfs_update_inode(trans, root, inode);
5795 BUG_ON(ret); 6805 BUG_ON(ret);
5796 6806
5797 btrfs_end_transaction(trans, root); 6807 btrfs_end_transaction(trans, root);
5798 btrfs_unreserve_metadata_space(root, 3);
5799 } 6808 }
5800 return ret; 6809 return ret;
5801
5802stop_trans:
5803 btrfs_end_transaction(trans, root);
5804 return ret;
5805
5806} 6810}
5807 6811
5808static long btrfs_fallocate(struct inode *inode, int mode, 6812static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5835,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5835 goto out; 6839 goto out;
5836 } 6840 }
5837 6841
5838 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6842 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5839 alloc_end - alloc_start);
5840 if (ret) 6843 if (ret)
5841 goto out; 6844 goto out;
5842 6845
@@ -5881,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5881 if (em->block_start == EXTENT_MAP_HOLE || 6884 if (em->block_start == EXTENT_MAP_HOLE ||
5882 (cur_offset >= inode->i_size && 6885 (cur_offset >= inode->i_size &&
5883 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6886 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5884 ret = prealloc_file_range(inode, 6887 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
5885 cur_offset, last_byte, 6888 last_byte - cur_offset,
5886 alloc_hint, mode, offset+len); 6889 1 << inode->i_blkbits,
6890 offset + len,
6891 &alloc_hint);
5887 if (ret < 0) { 6892 if (ret < 0) {
5888 free_extent_map(em); 6893 free_extent_map(em);
5889 break; 6894 break;
5890 } 6895 }
5891 } 6896 }
5892 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5893 alloc_hint = em->block_start;
5894 free_extent_map(em); 6897 free_extent_map(em);
5895 6898
5896 cur_offset = last_byte; 6899 cur_offset = last_byte;
@@ -5902,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5902 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6905 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5903 &cached_state, GFP_NOFS); 6906 &cached_state, GFP_NOFS);
5904 6907
5905 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6908 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5906 alloc_end - alloc_start);
5907out: 6909out:
5908 mutex_unlock(&inode->i_mutex); 6910 mutex_unlock(&inode->i_mutex);
5909 return ret; 6911 return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4dbaf89b1337 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 240 u64 index = 0;
241 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
242 /* 246 /*
243 * 1 - inode item 247 * 1 - inode item
244 * 2 - refs 248 * 2 - refs
245 * 1 - root item 249 * 1 - root item
246 * 2 - dir items 250 * 2 - dir items
247 */ 251 */
248 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
249 if (ret) 253 if (IS_ERR(trans))
250 return ret; 254 return PTR_ERR(trans);
251
252 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(!trans);
254
255 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256 0, &objectid);
257 if (ret)
258 goto fail;
259 255
260 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
345 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
346 if (err && !ret) 342 if (err && !ret)
347 ret = err; 343 ret = err;
348
349 btrfs_unreserve_metadata_space(root, 6);
350 return ret; 344 return ret;
351} 345}
352 346
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
354 char *name, int namelen)
355{ 348{
356 struct inode *inode; 349 struct inode *inode;
357 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
361 if (!root->ref_cows) 354 if (!root->ref_cows)
362 return -EINVAL; 355 return -EINVAL;
363 356
364 /*
365 * 1 - inode item
366 * 2 - refs
367 * 1 - root item
368 * 2 - dir items
369 */
370 ret = btrfs_reserve_metadata_space(root, 6);
371 if (ret)
372 goto fail;
373
374 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375 if (!pending_snapshot) { 358 if (!pending_snapshot)
376 ret = -ENOMEM; 359 return -ENOMEM;
377 btrfs_unreserve_metadata_space(root, 6); 360
378 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 }
380 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381 if (!pending_snapshot->name) {
382 ret = -ENOMEM;
383 kfree(pending_snapshot);
384 btrfs_unreserve_metadata_space(root, 6);
385 goto fail;
386 }
387 memcpy(pending_snapshot->name, name, namelen);
388 pending_snapshot->name[namelen] = '\0';
389 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
390 trans = btrfs_start_transaction(root, 1);
391 BUG_ON(!trans);
392 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
393 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
394 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
395 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
396 BUG_ON(ret); 377 BUG_ON(ret);
397 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
398 384
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
405 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
406 ret = 0; 392 ret = 0;
407fail: 393fail:
394 kfree(pending_snapshot);
408 return ret; 395 return ret;
409} 396}
410 397
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 goto out_up_read; 443 goto out_up_read;
457 444
458 if (snap_src) { 445 if (snap_src) {
459 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
460 name, namelen);
461 } else { 447 } else {
462 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
463 name, namelen); 449 name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1; 588 BTRFS_I(inode)->force_compress = 1;
603 589
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
605 if (ret) { 591 if (ret)
606 ret = -ENOSPC; 592 goto err_unlock;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
617again: 593again:
618 if (inode->i_size == 0 || 594 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
622 } 598 }
623 599
624 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
625 if (!page) 601 if (!page) {
602 ret = -ENOMEM;
626 goto err_reservations; 603 goto err_reservations;
604 }
627 605
628 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
629 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
631 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
632 unlock_page(page); 610 unlock_page(page);
633 page_cache_release(page); 611 page_cache_release(page);
612 ret = -EIO;
634 goto err_reservations; 613 goto err_reservations;
635 } 614 }
636 } 615 }
@@ -644,8 +623,7 @@ again:
644 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
645 624
646 if (PageDirty(page)) { 625 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode, 626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
648 PAGE_CACHE_SIZE);
649 goto loop_unlock; 627 goto loop_unlock;
650 } 628 }
651 629
@@ -683,7 +661,6 @@ loop_unlock:
683 page_cache_release(page); 661 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex); 662 mutex_unlock(&inode->i_mutex);
685 663
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++; 665 i++;
689 } 666 }
@@ -713,9 +690,9 @@ loop_unlock:
713 return 0; 690 return 0;
714 691
715err_reservations: 692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
716 mutex_unlock(&inode->i_mutex); 695 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret; 696 return ret;
720} 697}
721 698
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
811 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
812 789
813 if (new_size > old_size) { 790 if (new_size > old_size) {
814 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
815 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
816 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
817 } else { 794 } else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1300 if (err) 1277 if (err)
1301 goto out_up_write; 1278 goto out_up_write;
1302 1279
1303 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out_up_write;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
1304 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
1305 dest->root_key.objectid, 1288 dest->root_key.objectid,
1306 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1612 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1613 new_key.offset = key.offset + destoff - off;
1631 1614
1615 trans = btrfs_start_transaction(root, 1);
1616 if (IS_ERR(trans)) {
1617 ret = PTR_ERR(trans);
1618 goto out;
1619 }
1620
1632 if (type == BTRFS_FILE_EXTENT_REG || 1621 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1622 type == BTRFS_FILE_EXTENT_PREALLOC) {
1623 if (off > key.offset) {
1624 datao += off - key.offset;
1625 datal -= off - key.offset;
1626 }
1627
1628 if (key.offset + datal > off + len)
1629 datal = off + len - key.offset;
1630
1631 ret = btrfs_drop_extents(trans, inode,
1632 new_key.offset,
1633 new_key.offset + datal,
1634 &hint_byte, 1);
1635 BUG_ON(ret);
1636
1634 ret = btrfs_insert_empty_item(trans, root, path, 1637 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1638 &new_key, size);
1636 if (ret) 1639 BUG_ON(ret);
1637 goto out;
1638 1640
1639 leaf = path->nodes[0]; 1641 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1642 slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1647 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1648 struct btrfs_file_extent_item);
1647 1649
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1650 /* disko == 0 means it's a hole */
1657 if (!disko) 1651 if (!disko)
1658 datao = 0; 1652 datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1677
1684 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1686 goto out; 1681 goto out;
1687 } 1682 }
1688 size -= skip + trim; 1683 size -= skip + trim;
1689 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1690 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1693 &new_key, size);
1692 if (ret) 1694 BUG_ON(ret);
1693 goto out;
1694 1695
1695 if (skip) { 1696 if (skip) {
1696 u32 start = 1697 u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1709 }
1709 1710
1710 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1711 } 1712 btrfs_release_path(root, path);
1712 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1713next: 1723next:
1714 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1715 key.offset++; 1725 key.offset++;
@@ -1717,17 +1727,7 @@ next:
1717 ret = 0; 1727 ret = 0;
1718out: 1728out:
1719 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1731out_unlock:
1732 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
@@ -1845,7 +1845,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1); 1847 dir_id, "default", 7, 1);
1848 if (!di) { 1848 if (IS_ERR_OR_NULL(di)) {
1849 btrfs_free_path(path); 1849 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root); 1850 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, " 1851 printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
127/* 136/*
128 * look find the first ordered struct that has this offset, otherwise 137 * look find the first ordered struct that has this offset, otherwise
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner, can be not uptodate */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* is the block in non-reference counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref node. */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. transaction commit changes the extent tree.
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is reloc tree and it was created in previous
527 * transaction backref lookup can find the reloc tree,
528 * so backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,21 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
596 if (root) 788 root = find_tree_root(rc, eb, ref0);
597 cur->root = root; 789 if (root && !should_ignore_root(root))
598 else 790 cur->root = root;
599 cur->old_root = 1; 791 else
600 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
601 } 798 }
602#else 799#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +811,20 @@ again:
614 break; 811 break;
615 } 812 }
616 813
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
618 if (!edge) { 815 if (!edge) {
619 err = -ENOMEM; 816 err = -ENOMEM;
620 goto out; 817 goto out;
621 } 818 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 820 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
625 if (!upper) { 822 if (!upper) {
626 kfree(edge); 823 free_backref_edge(cache, edge);
627 err = -ENOMEM; 824 err = -ENOMEM;
628 goto out; 825 goto out;
629 } 826 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
634 /* 829 /*
635 * backrefs for the upper level block isn't 830 * backrefs for the upper level block isn't
@@ -639,11 +834,12 @@ again:
639 } else { 834 } else {
640 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 839 }
644 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
647 843
648 goto next; 844 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +853,17 @@ again:
657 goto out; 853 goto out;
658 } 854 }
659 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
660 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 860 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 862 cur->bytenr);
664 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
665 break; 867 break;
666 } 868 }
667 869
@@ -692,11 +894,14 @@ again:
692 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 896 lower->bytenr);
695 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
696 break; 901 break;
697 } 902 }
698 903
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
700 if (!edge) { 905 if (!edge) {
701 err = -ENOMEM; 906 err = -ENOMEM;
702 goto out; 907 goto out;
@@ -705,16 +910,17 @@ again:
705 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 912 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
709 if (!upper) { 914 if (!upper) {
710 kfree(edge); 915 free_backref_edge(cache, edge);
711 err = -ENOMEM; 916 err = -ENOMEM;
712 goto out; 917 goto out;
713 } 918 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
718 924
719 /* 925 /*
720 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -744,10 +950,12 @@ again:
744 rb_node); 950 rb_node);
745 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
747 } 955 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
751 959
752 if (rb_node) 960 if (rb_node)
753 break; 961 break;
@@ -785,8 +993,13 @@ next:
785 * into the cache. 993 * into the cache.
786 */ 994 */
787 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
789 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
790 1003
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1008,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
798 1019
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -807,25 +1028,69 @@ next:
807 } 1028 }
808 1029
809 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1032 if (!cowonly) {
812 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
813 1037
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1039
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
818 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookup.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
819out: 1079out:
820 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
822 if (err) { 1082 if (err) {
823 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
824 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
825 while (upper) { 1090 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1093 free_backref_node(cache, upper);
829 } 1094 }
830 1095
831 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -833,15 +1098,104 @@ out:
833 1098
834 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
837 kfree(edge); 1103 free_backref_edge(cache, edge);
838 } 1104 }
839 return ERR_PTR(err); 1105 return ERR_PTR(err);
840 } 1106 }
1107 BUG_ON(node && node->detached);
841 return node; 1108 return node;
842} 1109}
843 1110
844/* 1111/*
1112 * helper to add backref node for the newly created snapshot.
1113 * the backref node is created by cloning backref node that
1114 * corresponds to root of source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
1198/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1200 */
847static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1255 return 0;
902} 1256}
903 1257
904/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1260{
911 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
915 int ret; 1265 int ret;
916 1266
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
930 1269
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
934 1273
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree, all tree blocks
1286 * modified after it was created have RELOC flag
1287 * set in their headers. so it's OK to not update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
938 1294
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
946 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
947 1306
948 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1316 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. reloc tree is just a
1324 * snapshot of the fs tree with special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
960 1350
961 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
@@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
982 1372
983 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
985 del = 1; 1376 del = 1;
986 } 1377 }
@@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1493 goto out;
1103 } 1494 }
1104 1495
1105 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1497 ret = 0;
1108out: 1498out:
1109 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1114,19 +1504,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1115 * the new locations. 1505 * the new locations.
1116 */ 1506 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1118 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1509 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1121 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1122{ 1512{
1123 struct btrfs_key key; 1513 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1516 u64 parent;
1128 u64 bytenr; 1517 u64 bytenr;
1129 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1130 u64 num_bytes; 1519 u64 num_bytes;
1131 u64 end; 1520 u64 end;
1132 u32 nritems; 1521 u32 nritems;
@@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1167 */ 1556 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1558 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1560 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1564 }
1185 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1566 end = key.offset +
@@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1584
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1586 bytenr, num_bytes);
1207 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1208 continue; 1589 continue;
1590 }
1209 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1210 1592
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1607 }
1226 if (dirty) 1608 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1228 return 0; 1612 return 0;
1229} 1613}
1230 1614
@@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1250 */ 1634 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1256{ 1640{
1257 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1263 u64 new_ptr_gen; 1647 u64 new_ptr_gen;
1264 u64 last_snapshot; 1648 u64 last_snapshot;
1265 u32 blocksize; 1649 u32 blocksize;
1650 int cow = 0;
1266 int level; 1651 int level;
1267 int ret; 1652 int ret;
1268 int slot; 1653 int slot;
1269 1654
1270 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1656 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272 BUG_ON(lowest_level > 1 && leaf);
1273 1657
1274 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1658 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275 1659again:
1276 slot = path->slots[lowest_level]; 1660 slot = path->slots[lowest_level];
1277 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1661 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278 1662
@@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1286 return 0; 1670 return 0;
1287 } 1671 }
1288 1672
1289 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1673 if (cow) {
1290 BUG_ON(ret); 1674 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1675 BUG_ON(ret);
1676 }
1291 btrfs_set_lock_blocking(eb); 1677 btrfs_set_lock_blocking(eb);
1292 1678
1293 if (next_key) { 1679 if (next_key) {
@@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1331 1717
1332 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1718 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333 memcmp_node_keys(parent, slot, path, level)) { 1719 memcmp_node_keys(parent, slot, path, level)) {
1334 if (level <= lowest_level && !leaf) { 1720 if (level <= lowest_level) {
1335 ret = 0; 1721 ret = 0;
1336 break; 1722 break;
1337 } 1723 }
@@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1339 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1340 old_ptr_gen); 1726 old_ptr_gen);
1341 btrfs_tree_lock(eb); 1727 btrfs_tree_lock(eb);
1342 ret = btrfs_cow_block(trans, dest, eb, parent, 1728 if (cow) {
1343 slot, &eb); 1729 ret = btrfs_cow_block(trans, dest, eb, parent,
1344 BUG_ON(ret); 1730 slot, &eb);
1345 btrfs_set_lock_blocking(eb); 1731 BUG_ON(ret);
1346
1347 if (level <= lowest_level) {
1348 *leaf = eb;
1349 ret = 0;
1350 break;
1351 } 1732 }
1733 btrfs_set_lock_blocking(eb);
1352 1734
1353 btrfs_tree_unlock(parent); 1735 btrfs_tree_unlock(parent);
1354 free_extent_buffer(parent); 1736 free_extent_buffer(parent);
@@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1357 continue; 1739 continue;
1358 } 1740 }
1359 1741
1742 if (!cow) {
1743 btrfs_tree_unlock(parent);
1744 free_extent_buffer(parent);
1745 cow = 1;
1746 goto again;
1747 }
1748
1360 btrfs_node_key_to_cpu(path->nodes[level], &key, 1749 btrfs_node_key_to_cpu(path->nodes[level], &key,
1361 path->slots[level]); 1750 path->slots[level]);
1362 btrfs_release_path(src, path); 1751 btrfs_release_path(src, path);
@@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1562 return 0; 1951 return 0;
1563} 1952}
1564 1953
1565static void put_inodes(struct list_head *list)
1566{
1567 struct inodevec *ivec;
1568 while (!list_empty(list)) {
1569 ivec = list_entry(list->next, struct inodevec, list);
1570 list_del(&ivec->list);
1571 while (ivec->nr > 0) {
1572 ivec->nr--;
1573 iput(ivec->inode[ivec->nr]);
1574 }
1575 kfree(ivec);
1576 }
1577}
1578
1579static int find_next_key(struct btrfs_path *path, int level, 1954static int find_next_key(struct btrfs_path *path, int level,
1580 struct btrfs_key *key) 1955 struct btrfs_key *key)
1581 1956
@@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1608 struct btrfs_root *reloc_root; 1983 struct btrfs_root *reloc_root;
1609 struct btrfs_root_item *root_item; 1984 struct btrfs_root_item *root_item;
1610 struct btrfs_path *path; 1985 struct btrfs_path *path;
1611 struct extent_buffer *leaf = NULL; 1986 struct extent_buffer *leaf;
1612 unsigned long nr; 1987 unsigned long nr;
1613 int level; 1988 int level;
1614 int max_level; 1989 int max_level;
1615 int replaced = 0; 1990 int replaced = 0;
1616 int ret; 1991 int ret;
1617 int err = 0; 1992 int err = 0;
1993 u32 min_reserved;
1618 1994
1619 path = btrfs_alloc_path(); 1995 path = btrfs_alloc_path();
1620 if (!path) 1996 if (!path)
@@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1648 btrfs_unlock_up_safe(path, 0); 2024 btrfs_unlock_up_safe(path, 0);
1649 } 2025 }
1650 2026
1651 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2027 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1652 trans = btrfs_start_transaction(root, 1); 2028 memset(&next_key, 0, sizeof(next_key));
1653 2029
1654 leaf = path->nodes[0]; 2030 while (1) {
1655 btrfs_item_key_to_cpu(leaf, &key, 0); 2031 trans = btrfs_start_transaction(root, 0);
1656 btrfs_release_path(reloc_root, path); 2032 trans->block_rsv = rc->block_rsv;
1657 2033
1658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1659 if (ret < 0) { 2035 min_reserved, 0);
1660 err = ret; 2036 if (ret) {
1661 goto out; 2037 BUG_ON(ret != -EAGAIN);
2038 ret = btrfs_commit_transaction(trans, root);
2039 BUG_ON(ret);
2040 continue;
1662 } 2041 }
1663 2042
1664 leaf = path->nodes[0];
1665 btrfs_unlock_up_safe(path, 1);
1666 ret = replace_file_extents(trans, rc, root, leaf,
1667 &inode_list);
1668 if (ret < 0)
1669 err = ret;
1670 goto out;
1671 }
1672
1673 memset(&next_key, 0, sizeof(next_key));
1674
1675 while (1) {
1676 leaf = NULL;
1677 replaced = 0; 2043 replaced = 0;
1678 trans = btrfs_start_transaction(root, 1);
1679 max_level = level; 2044 max_level = level;
1680 2045
1681 ret = walk_down_reloc_tree(reloc_root, path, &level); 2046 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1689 if (!find_next_key(path, level, &key) && 2054 if (!find_next_key(path, level, &key) &&
1690 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2055 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691 ret = 0; 2056 ret = 0;
1692 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693 ret = replace_path(trans, root, reloc_root,
1694 path, &next_key, &leaf,
1695 level, max_level);
1696 } else { 2057 } else {
1697 ret = replace_path(trans, root, reloc_root, 2058 ret = replace_path(trans, root, reloc_root, path,
1698 path, &next_key, NULL, 2059 &next_key, level, max_level);
1699 level, max_level);
1700 } 2060 }
1701 if (ret < 0) { 2061 if (ret < 0) {
1702 err = ret; 2062 err = ret;
@@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1708 btrfs_node_key_to_cpu(path->nodes[level], &key, 2068 btrfs_node_key_to_cpu(path->nodes[level], &key,
1709 path->slots[level]); 2069 path->slots[level]);
1710 replaced = 1; 2070 replaced = 1;
1711 } else if (leaf) {
1712 /*
1713 * no block got replaced, try replacing file extents
1714 */
1715 btrfs_item_key_to_cpu(leaf, &key, 0);
1716 ret = replace_file_extents(trans, rc, root, leaf,
1717 &inode_list);
1718 btrfs_tree_unlock(leaf);
1719 free_extent_buffer(leaf);
1720 BUG_ON(ret < 0);
1721 } 2071 }
1722 2072
1723 ret = walk_up_reloc_tree(reloc_root, path, &level); 2073 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1734 root_item->drop_level = level; 2084 root_item->drop_level = level;
1735 2085
1736 nr = trans->blocks_used; 2086 nr = trans->blocks_used;
1737 btrfs_end_transaction(trans, root); 2087 btrfs_end_transaction_throttle(trans, root);
1738 2088
1739 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root, nr);
1740 2090
1741 /*
1742 * put inodes outside transaction, otherwise we may deadlock.
1743 */
1744 put_inodes(&inode_list);
1745
1746 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2091 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747 invalidate_extent_cache(root, &key, &next_key); 2092 invalidate_extent_cache(root, &key, &next_key);
1748 } 2093 }
@@ -1765,87 +2110,125 @@ out:
1765 sizeof(root_item->drop_progress)); 2110 sizeof(root_item->drop_progress));
1766 root_item->drop_level = 0; 2111 root_item->drop_level = 0;
1767 btrfs_set_root_refs(root_item, 0); 2112 btrfs_set_root_refs(root_item, 0);
2113 btrfs_update_reloc_root(trans, root);
1768 } 2114 }
1769 2115
1770 nr = trans->blocks_used; 2116 nr = trans->blocks_used;
1771 btrfs_end_transaction(trans, root); 2117 btrfs_end_transaction_throttle(trans, root);
1772 2118
1773 btrfs_btree_balance_dirty(root, nr); 2119 btrfs_btree_balance_dirty(root, nr);
1774 2120
1775 put_inodes(&inode_list);
1776
1777 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2121 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778 invalidate_extent_cache(root, &key, &next_key); 2122 invalidate_extent_cache(root, &key, &next_key);
1779 2123
1780 return err; 2124 return err;
1781} 2125}
1782 2126
1783/* 2127static noinline_for_stack
1784 * callback for the work threads. 2128int prepare_to_merge(struct reloc_control *rc, int err)
1785 * this function merges reloc tree with corresponding fs tree,
1786 * and then drops the reloc tree.
1787 */
1788static void merge_func(struct btrfs_work *work)
1789{ 2129{
1790 struct btrfs_trans_handle *trans; 2130 struct btrfs_root *root = rc->extent_root;
1791 struct btrfs_root *root;
1792 struct btrfs_root *reloc_root; 2131 struct btrfs_root *reloc_root;
1793 struct async_merge *async; 2132 struct btrfs_trans_handle *trans;
2133 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0;
2135 int ret;
2136 int retries = 0;
2137
2138 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex);
2142again:
2143 if (!err) {
2144 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries);
2147 if (ret)
2148 err = ret;
2149 }
2150
2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152
2153 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again;
2160 }
2161 }
1794 2162
1795 async = container_of(work, struct async_merge, work); 2163 rc->merge_reloc_tree = 1;
1796 reloc_root = async->root; 2164
2165 while (!list_empty(&rc->reloc_roots)) {
2166 reloc_root = list_entry(rc->reloc_roots.next,
2167 struct btrfs_root, root_list);
2168 list_del_init(&reloc_root->root_list);
1797 2169
1798 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799 root = read_fs_root(reloc_root->fs_info, 2170 root = read_fs_root(reloc_root->fs_info,
1800 reloc_root->root_key.offset); 2171 reloc_root->root_key.offset);
1801 BUG_ON(IS_ERR(root)); 2172 BUG_ON(IS_ERR(root));
1802 BUG_ON(root->reloc_root != reloc_root); 2173 BUG_ON(root->reloc_root != reloc_root);
1803 2174
1804 merge_reloc_root(async->rc, root); 2175 /*
1805 2176 * set reference count to 1, so btrfs_recover_relocation
1806 trans = btrfs_start_transaction(root, 1); 2177 * knows it should resumes merging
2178 */
2179 if (!err)
2180 btrfs_set_root_refs(&reloc_root->root_item, 1);
1807 btrfs_update_reloc_root(trans, root); 2181 btrfs_update_reloc_root(trans, root);
1808 btrfs_end_transaction(trans, root);
1809 }
1810 2182
1811 btrfs_drop_snapshot(reloc_root, 0); 2183 list_add(&reloc_root->root_list, &reloc_roots);
2184 }
1812 2185
1813 if (atomic_dec_and_test(async->num_pending)) 2186 list_splice(&reloc_roots, &rc->reloc_roots);
1814 complete(async->done);
1815 2187
1816 kfree(async); 2188 if (!err)
2189 btrfs_commit_transaction(trans, rc->extent_root);
2190 else
2191 btrfs_end_transaction(trans, rc->extent_root);
2192 return err;
1817} 2193}
1818 2194
1819static int merge_reloc_roots(struct reloc_control *rc) 2195static noinline_for_stack
2196int merge_reloc_roots(struct reloc_control *rc)
1820{ 2197{
1821 struct async_merge *async;
1822 struct btrfs_root *root; 2198 struct btrfs_root *root;
1823 struct completion done; 2199 struct btrfs_root *reloc_root;
1824 atomic_t num_pending; 2200 LIST_HEAD(reloc_roots);
2201 int found = 0;
2202 int ret;
2203again:
2204 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex);
1825 2208
1826 init_completion(&done); 2209 while (!list_empty(&reloc_roots)) {
1827 atomic_set(&num_pending, 1); 2210 found = 1;
2211 reloc_root = list_entry(reloc_roots.next,
2212 struct btrfs_root, root_list);
1828 2213
1829 while (!list_empty(&rc->reloc_roots)) { 2214 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1830 root = list_entry(rc->reloc_roots.next, 2215 root = read_fs_root(reloc_root->fs_info,
1831 struct btrfs_root, root_list); 2216 reloc_root->root_key.offset);
1832 list_del_init(&root->root_list); 2217 BUG_ON(IS_ERR(root));
2218 BUG_ON(root->reloc_root != reloc_root);
1833 2219
1834 async = kmalloc(sizeof(*async), GFP_NOFS); 2220 ret = merge_reloc_root(rc, root);
1835 BUG_ON(!async); 2221 BUG_ON(ret);
1836 async->work.func = merge_func; 2222 } else {
1837 async->work.flags = 0; 2223 list_del_init(&reloc_root->root_list);
1838 async->rc = rc; 2224 }
1839 async->root = root; 2225 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1840 async->done = &done;
1841 async->num_pending = &num_pending;
1842 atomic_inc(&num_pending);
1843 btrfs_queue_worker(&rc->workers, &async->work);
1844 } 2226 }
1845 2227
1846 if (!atomic_dec_and_test(&num_pending)) 2228 if (found) {
1847 wait_for_completion(&done); 2229 found = 0;
1848 2230 goto again;
2231 }
1849 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2232 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850 return 0; 2233 return 0;
1851} 2234}
@@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1876 return btrfs_record_root_in_trans(trans, root); 2259 return btrfs_record_root_in_trans(trans, root);
1877} 2260}
1878 2261
1879/* 2262static noinline_for_stack
1880 * select one tree from trees that references the block. 2263struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1881 * for blocks in refernce counted trees, we preper reloc tree. 2264 struct reloc_control *rc,
1882 * if no reloc tree found and reloc_only is true, NULL is returned. 2265 struct backref_node *node,
1883 */ 2266 struct backref_edge *edges[], int *nr)
1884static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885 struct backref_node *node,
1886 struct backref_edge *edges[],
1887 int *nr, int reloc_only)
1888{ 2267{
1889 struct backref_node *next; 2268 struct backref_node *next;
1890 struct btrfs_root *root; 2269 struct btrfs_root *root;
1891 int index; 2270 int index = 0;
1892 int loop = 0; 2271
1893again:
1894 index = 0;
1895 next = node; 2272 next = node;
1896 while (1) { 2273 while (1) {
1897 cond_resched(); 2274 cond_resched();
1898 next = walk_up_backref(next, edges, &index); 2275 next = walk_up_backref(next, edges, &index);
1899 root = next->root; 2276 root = next->root;
1900 if (!root) { 2277 BUG_ON(!root);
1901 BUG_ON(!node->old_root); 2278 BUG_ON(!root->ref_cows);
1902 goto skip;
1903 }
1904
1905 /* no other choice for non-refernce counted tree */
1906 if (!root->ref_cows) {
1907 BUG_ON(reloc_only);
1908 break;
1909 }
1910 2279
1911 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2280 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912 record_reloc_root_in_trans(trans, root); 2281 record_reloc_root_in_trans(trans, root);
1913 break; 2282 break;
1914 } 2283 }
1915 2284
1916 if (loop) { 2285 btrfs_record_root_in_trans(trans, root);
1917 btrfs_record_root_in_trans(trans, root); 2286 root = root->reloc_root;
2287
2288 if (next->new_bytenr != root->node->start) {
2289 BUG_ON(next->new_bytenr);
2290 BUG_ON(!list_empty(&next->list));
2291 next->new_bytenr = root->node->start;
2292 next->root = root;
2293 list_add_tail(&next->list,
2294 &rc->backref_cache.changed);
2295 __mark_block_processed(rc, next);
1918 break; 2296 break;
1919 } 2297 }
1920 2298
1921 if (reloc_only || next != node) { 2299 WARN_ON(1);
1922 if (!root->reloc_root)
1923 btrfs_record_root_in_trans(trans, root);
1924 root = root->reloc_root;
1925 /*
1926 * if the reloc tree was created in current
1927 * transation, there is no node in backref tree
1928 * corresponds to the root of the reloc tree.
1929 */
1930 if (btrfs_root_last_snapshot(&root->root_item) ==
1931 trans->transid - 1)
1932 break;
1933 }
1934skip:
1935 root = NULL; 2300 root = NULL;
1936 next = walk_down_backref(edges, &index); 2301 next = walk_down_backref(edges, &index);
1937 if (!next || next->level <= node->level) 2302 if (!next || next->level <= node->level)
1938 break; 2303 break;
1939 } 2304 }
2305 if (!root)
2306 return NULL;
1940 2307
1941 if (!root && !loop && !reloc_only) { 2308 *nr = index;
1942 loop = 1; 2309 next = node;
1943 goto again; 2310 /* setup backref node path for btrfs_reloc_cow_block */
2311 while (1) {
2312 rc->backref_cache.path[next->level] = next;
2313 if (--index < 0)
2314 break;
2315 next = edges[index]->node[UPPER];
1944 } 2316 }
1945
1946 if (root)
1947 *nr = index;
1948 else
1949 *nr = 0;
1950
1951 return root; 2317 return root;
1952} 2318}
1953 2319
2320/*
2321 * select a tree root for relocation. return NULL if the block
2322 * is reference counted. we should use do_relocation() in this
2323 * case. return a tree root pointer if the block isn't reference
2324 * counted. return -ENOENT if the block is root of reloc tree.
2325 */
1954static noinline_for_stack 2326static noinline_for_stack
1955struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2327struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956 struct backref_node *node) 2328 struct backref_node *node)
1957{ 2329{
2330 struct backref_node *next;
2331 struct btrfs_root *root;
2332 struct btrfs_root *fs_root = NULL;
1958 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2333 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959 int nr; 2334 int index = 0;
1960 return __select_one_root(trans, node, edges, &nr, 0); 2335
2336 next = node;
2337 while (1) {
2338 cond_resched();
2339 next = walk_up_backref(next, edges, &index);
2340 root = next->root;
2341 BUG_ON(!root);
2342
2343 /* no other choice for non-refernce counted tree */
2344 if (!root->ref_cows)
2345 return root;
2346
2347 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2348 fs_root = root;
2349
2350 if (next != node)
2351 return NULL;
2352
2353 next = walk_down_backref(edges, &index);
2354 if (!next || next->level <= node->level)
2355 break;
2356 }
2357
2358 if (!fs_root)
2359 return ERR_PTR(-ENOENT);
2360 return fs_root;
1961} 2361}
1962 2362
1963static noinline_for_stack 2363static noinline_for_stack
1964struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2364u64 calcu_metadata_size(struct reloc_control *rc,
1965 struct backref_node *node, 2365 struct backref_node *node, int reserve)
1966 struct backref_edge *edges[], int *nr)
1967{ 2366{
1968 return __select_one_root(trans, node, edges, nr, 1); 2367 struct backref_node *next = node;
2368 struct backref_edge *edge;
2369 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2370 u64 num_bytes = 0;
2371 int index = 0;
2372
2373 BUG_ON(reserve && node->processed);
2374
2375 while (next) {
2376 cond_resched();
2377 while (1) {
2378 if (next->processed && (reserve || next != node))
2379 break;
2380
2381 num_bytes += btrfs_level_size(rc->extent_root,
2382 next->level);
2383
2384 if (list_empty(&next->upper))
2385 break;
2386
2387 edge = list_entry(next->upper.next,
2388 struct backref_edge, list[LOWER]);
2389 edges[index++] = edge;
2390 next = edge->node[UPPER];
2391 }
2392 next = walk_down_backref(edges, &index);
2393 }
2394 return num_bytes;
1969} 2395}
1970 2396
1971static void grab_path_buffers(struct btrfs_path *path, 2397static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1972 struct backref_node *node, 2398 struct reloc_control *rc,
1973 struct backref_edge *edges[], int nr) 2399 struct backref_node *node)
1974{ 2400{
1975 int i = 0; 2401 struct btrfs_root *root = rc->extent_root;
1976 while (1) { 2402 u64 num_bytes;
1977 drop_node_buffer(node); 2403 int ret;
1978 node->eb = path->nodes[node->level]; 2404
1979 BUG_ON(!node->eb); 2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1980 if (path->locks[node->level])
1981 node->locked = 1;
1982 path->nodes[node->level] = NULL;
1983 path->locks[node->level] = 0;
1984
1985 if (i >= nr)
1986 break;
1987 2406
1988 edges[i]->blockptr = node->eb->start; 2407 trans->block_rsv = rc->block_rsv;
1989 node = edges[i]->node[UPPER]; 2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1990 i++; 2409 &rc->block_rsv_retries);
2410 if (ret) {
2411 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1;
2413 return ret;
1991 } 2414 }
2415
2416 rc->block_rsv_retries = 0;
2417 return 0;
2418}
2419
2420static void release_metadata_space(struct reloc_control *rc,
2421 struct backref_node *node)
2422{
2423 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2424 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1992} 2425}
1993 2426
1994/* 2427/*
@@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1999 * in that case this function just updates pointers. 2432 * in that case this function just updates pointers.
2000 */ 2433 */
2001static int do_relocation(struct btrfs_trans_handle *trans, 2434static int do_relocation(struct btrfs_trans_handle *trans,
2435 struct reloc_control *rc,
2002 struct backref_node *node, 2436 struct backref_node *node,
2003 struct btrfs_key *key, 2437 struct btrfs_key *key,
2004 struct btrfs_path *path, int lowest) 2438 struct btrfs_path *path, int lowest)
@@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2019 BUG_ON(lowest && node->eb); 2453 BUG_ON(lowest && node->eb);
2020 2454
2021 path->lowest_level = node->level + 1; 2455 path->lowest_level = node->level + 1;
2456 rc->backref_cache.path[node->level] = node;
2022 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2457 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023 cond_resched(); 2458 cond_resched();
2024 if (node->eb && node->eb->start == edge->blockptr)
2025 continue;
2026 2459
2027 upper = edge->node[UPPER]; 2460 upper = edge->node[UPPER];
2028 root = select_reloc_root(trans, upper, edges, &nr); 2461 root = select_reloc_root(trans, rc, upper, edges, &nr);
2029 if (!root) 2462 BUG_ON(!root);
2030 continue; 2463
2031 2464 if (upper->eb && !upper->locked) {
2032 if (upper->eb && !upper->locked) 2465 if (!lowest) {
2466 ret = btrfs_bin_search(upper->eb, key,
2467 upper->level, &slot);
2468 BUG_ON(ret);
2469 bytenr = btrfs_node_blockptr(upper->eb, slot);
2470 if (node->eb->start == bytenr)
2471 goto next;
2472 }
2033 drop_node_buffer(upper); 2473 drop_node_buffer(upper);
2474 }
2034 2475
2035 if (!upper->eb) { 2476 if (!upper->eb) {
2036 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2477 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2040 } 2481 }
2041 BUG_ON(ret > 0); 2482 BUG_ON(ret > 0);
2042 2483
2043 slot = path->slots[upper->level]; 2484 if (!upper->eb) {
2485 upper->eb = path->nodes[upper->level];
2486 path->nodes[upper->level] = NULL;
2487 } else {
2488 BUG_ON(upper->eb != path->nodes[upper->level]);
2489 }
2044 2490
2045 btrfs_unlock_up_safe(path, upper->level + 1); 2491 upper->locked = 1;
2046 grab_path_buffers(path, upper, edges, nr); 2492 path->locks[upper->level] = 0;
2047 2493
2494 slot = path->slots[upper->level];
2048 btrfs_release_path(NULL, path); 2495 btrfs_release_path(NULL, path);
2049 } else { 2496 } else {
2050 ret = btrfs_bin_search(upper->eb, key, upper->level, 2497 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2053 } 2500 }
2054 2501
2055 bytenr = btrfs_node_blockptr(upper->eb, slot); 2502 bytenr = btrfs_node_blockptr(upper->eb, slot);
2056 if (!lowest) { 2503 if (lowest) {
2057 if (node->eb->start == bytenr) { 2504 BUG_ON(bytenr != node->bytenr);
2058 btrfs_tree_unlock(upper->eb);
2059 upper->locked = 0;
2060 continue;
2061 }
2062 } else { 2505 } else {
2063 BUG_ON(node->bytenr != bytenr); 2506 if (node->eb->start == bytenr)
2507 goto next;
2064 } 2508 }
2065 2509
2066 blocksize = btrfs_level_size(root, node->level); 2510 blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2072 if (!node->eb) { 2516 if (!node->eb) {
2073 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2517 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074 slot, &eb); 2518 slot, &eb);
2519 btrfs_tree_unlock(eb);
2520 free_extent_buffer(eb);
2075 if (ret < 0) { 2521 if (ret < 0) {
2076 err = ret; 2522 err = ret;
2077 break; 2523 goto next;
2078 } 2524 }
2079 btrfs_set_lock_blocking(eb); 2525 BUG_ON(node->eb != eb);
2080 node->eb = eb;
2081 node->locked = 1;
2082 } else { 2526 } else {
2083 btrfs_set_node_blockptr(upper->eb, slot, 2527 btrfs_set_node_blockptr(upper->eb, slot,
2084 node->eb->start); 2528 node->eb->start);
@@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2096 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2540 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097 BUG_ON(ret); 2541 BUG_ON(ret);
2098 } 2542 }
2099 if (!lowest) { 2543next:
2100 btrfs_tree_unlock(upper->eb); 2544 if (!upper->pending)
2101 upper->locked = 0; 2545 drop_node_buffer(upper);
2102 } 2546 else
2547 unlock_node_buffer(upper);
2548 if (err)
2549 break;
2103 } 2550 }
2551
2552 if (!err && node->pending) {
2553 drop_node_buffer(node);
2554 list_move_tail(&node->list, &rc->backref_cache.changed);
2555 node->pending = 0;
2556 }
2557
2104 path->lowest_level = 0; 2558 path->lowest_level = 0;
2559 BUG_ON(err == -ENOSPC);
2105 return err; 2560 return err;
2106} 2561}
2107 2562
2108static int link_to_upper(struct btrfs_trans_handle *trans, 2563static int link_to_upper(struct btrfs_trans_handle *trans,
2564 struct reloc_control *rc,
2109 struct backref_node *node, 2565 struct backref_node *node,
2110 struct btrfs_path *path) 2566 struct btrfs_path *path)
2111{ 2567{
2112 struct btrfs_key key; 2568 struct btrfs_key key;
2113 if (!node->eb || list_empty(&node->upper))
2114 return 0;
2115 2569
2116 btrfs_node_key_to_cpu(node->eb, &key, 0); 2570 btrfs_node_key_to_cpu(node->eb, &key, 0);
2117 return do_relocation(trans, node, &key, path, 0); 2571 return do_relocation(trans, rc, node, &key, path, 0);
2118} 2572}
2119 2573
2120static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2574static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121 struct backref_cache *cache, 2575 struct reloc_control *rc,
2122 struct btrfs_path *path) 2576 struct btrfs_path *path, int err)
2123{ 2577{
2578 LIST_HEAD(list);
2579 struct backref_cache *cache = &rc->backref_cache;
2124 struct backref_node *node; 2580 struct backref_node *node;
2125 int level; 2581 int level;
2126 int ret; 2582 int ret;
2127 int err = 0;
2128 2583
2129 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2584 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130 while (!list_empty(&cache->pending[level])) { 2585 while (!list_empty(&cache->pending[level])) {
2131 node = list_entry(cache->pending[level].next, 2586 node = list_entry(cache->pending[level].next,
2132 struct backref_node, lower); 2587 struct backref_node, list);
2133 BUG_ON(node->level != level); 2588 list_move_tail(&node->list, &list);
2589 BUG_ON(!node->pending);
2134 2590
2135 ret = link_to_upper(trans, node, path); 2591 if (!err) {
2136 if (ret < 0) 2592 ret = link_to_upper(trans, rc, node, path);
2137 err = ret; 2593 if (ret < 0)
2138 /* 2594 err = ret;
2139 * this remove the node from the pending list and 2595 }
2140 * may add some other nodes to the level + 1
2141 * pending list
2142 */
2143 remove_backref_node(cache, node);
2144 } 2596 }
2597 list_splice_init(&list, &cache->pending[level]);
2145 } 2598 }
2146 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147 return err; 2599 return err;
2148} 2600}
2149 2601
2150static void mark_block_processed(struct reloc_control *rc, 2602static void mark_block_processed(struct reloc_control *rc,
2151 struct backref_node *node) 2603 u64 bytenr, u32 blocksize)
2604{
2605 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2606 EXTENT_DIRTY, GFP_NOFS);
2607}
2608
2609static void __mark_block_processed(struct reloc_control *rc,
2610 struct backref_node *node)
2152{ 2611{
2153 u32 blocksize; 2612 u32 blocksize;
2154 if (node->level == 0 || 2613 if (node->level == 0 ||
2155 in_block_group(node->bytenr, rc->block_group)) { 2614 in_block_group(node->bytenr, rc->block_group)) {
2156 blocksize = btrfs_level_size(rc->extent_root, node->level); 2615 blocksize = btrfs_level_size(rc->extent_root, node->level);
2157 set_extent_bits(&rc->processed_blocks, node->bytenr, 2616 mark_block_processed(rc, node->bytenr, blocksize);
2158 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159 GFP_NOFS);
2160 } 2617 }
2161 node->processed = 1; 2618 node->processed = 1;
2162} 2619}
@@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2179 if (next->processed) 2636 if (next->processed)
2180 break; 2637 break;
2181 2638
2182 mark_block_processed(rc, next); 2639 __mark_block_processed(rc, next);
2183 2640
2184 if (list_empty(&next->upper)) 2641 if (list_empty(&next->upper))
2185 break; 2642 break;
@@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2202 return 0; 2659 return 0;
2203} 2660}
2204 2661
2205/*
2206 * check if there are any file extent pointers in the leaf point to
2207 * data require processing
2208 */
2209static int check_file_extents(struct reloc_control *rc,
2210 u64 bytenr, u32 blocksize, u64 ptr_gen)
2211{
2212 struct btrfs_key found_key;
2213 struct btrfs_file_extent_item *fi;
2214 struct extent_buffer *leaf;
2215 u32 nritems;
2216 int i;
2217 int ret = 0;
2218
2219 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220
2221 nritems = btrfs_header_nritems(leaf);
2222 for (i = 0; i < nritems; i++) {
2223 cond_resched();
2224 btrfs_item_key_to_cpu(leaf, &found_key, i);
2225 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226 continue;
2227 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228 if (btrfs_file_extent_type(leaf, fi) ==
2229 BTRFS_FILE_EXTENT_INLINE)
2230 continue;
2231 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232 if (bytenr == 0)
2233 continue;
2234 if (in_block_group(bytenr, rc->block_group)) {
2235 ret = 1;
2236 break;
2237 }
2238 }
2239 free_extent_buffer(leaf);
2240 return ret;
2241}
2242
2243/*
2244 * scan child blocks of a given block to find blocks require processing
2245 */
2246static int add_child_blocks(struct btrfs_trans_handle *trans,
2247 struct reloc_control *rc,
2248 struct backref_node *node,
2249 struct rb_root *blocks)
2250{
2251 struct tree_block *block;
2252 struct rb_node *rb_node;
2253 u64 bytenr;
2254 u64 ptr_gen;
2255 u32 blocksize;
2256 u32 nritems;
2257 int i;
2258 int err = 0;
2259
2260 nritems = btrfs_header_nritems(node->eb);
2261 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262 for (i = 0; i < nritems; i++) {
2263 cond_resched();
2264 bytenr = btrfs_node_blockptr(node->eb, i);
2265 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266 if (ptr_gen == trans->transid)
2267 continue;
2268 if (!in_block_group(bytenr, rc->block_group) &&
2269 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270 continue;
2271 if (tree_block_processed(bytenr, blocksize, rc))
2272 continue;
2273
2274 readahead_tree_block(rc->extent_root,
2275 bytenr, blocksize, ptr_gen);
2276 }
2277
2278 for (i = 0; i < nritems; i++) {
2279 cond_resched();
2280 bytenr = btrfs_node_blockptr(node->eb, i);
2281 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282 if (ptr_gen == trans->transid)
2283 continue;
2284 if (!in_block_group(bytenr, rc->block_group) &&
2285 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286 continue;
2287 if (tree_block_processed(bytenr, blocksize, rc))
2288 continue;
2289 if (!in_block_group(bytenr, rc->block_group) &&
2290 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291 continue;
2292
2293 block = kmalloc(sizeof(*block), GFP_NOFS);
2294 if (!block) {
2295 err = -ENOMEM;
2296 break;
2297 }
2298 block->bytenr = bytenr;
2299 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300 block->level = node->level - 1;
2301 block->key_ready = 1;
2302 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303 BUG_ON(rb_node);
2304 }
2305 if (err)
2306 free_block_list(blocks);
2307 return err;
2308}
2309
2310/*
2311 * find adjacent blocks require processing
2312 */
2313static noinline_for_stack
2314int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315 struct reloc_control *rc,
2316 struct backref_cache *cache,
2317 struct rb_root *blocks, int level,
2318 struct backref_node **upper)
2319{
2320 struct backref_node *node;
2321 int ret = 0;
2322
2323 WARN_ON(!list_empty(&cache->pending[level]));
2324
2325 if (list_empty(&cache->pending[level + 1]))
2326 return 1;
2327
2328 node = list_entry(cache->pending[level + 1].next,
2329 struct backref_node, lower);
2330 if (node->eb)
2331 ret = add_child_blocks(trans, rc, node, blocks);
2332
2333 *upper = node;
2334 return ret;
2335}
2336
2337static int get_tree_block_key(struct reloc_control *rc, 2662static int get_tree_block_key(struct reloc_control *rc,
2338 struct tree_block *block) 2663 struct tree_block *block)
2339{ 2664{
@@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2371 struct btrfs_path *path) 2696 struct btrfs_path *path)
2372{ 2697{
2373 struct btrfs_root *root; 2698 struct btrfs_root *root;
2374 int ret; 2699 int release = 0;
2700 int ret = 0;
2375 2701
2702 if (!node)
2703 return 0;
2704
2705 BUG_ON(node->processed);
2376 root = select_one_root(trans, node); 2706 root = select_one_root(trans, node);
2377 if (unlikely(!root)) { 2707 if (root == ERR_PTR(-ENOENT)) {
2378 rc->found_old_snapshot = 1;
2379 update_processed_blocks(rc, node); 2708 update_processed_blocks(rc, node);
2380 return 0; 2709 goto out;
2381 } 2710 }
2382 2711
2383 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2712 if (!root || root->ref_cows) {
2384 ret = do_relocation(trans, node, key, path, 1); 2713 ret = reserve_metadata_space(trans, rc, node);
2385 if (ret < 0) 2714 if (ret)
2386 goto out;
2387 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388 ret = replace_file_extents(trans, rc, root,
2389 node->eb, NULL);
2390 if (ret < 0)
2391 goto out;
2392 }
2393 drop_node_buffer(node);
2394 } else if (!root->ref_cows) {
2395 path->lowest_level = node->level;
2396 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397 btrfs_release_path(root, path);
2398 if (ret < 0)
2399 goto out; 2715 goto out;
2400 } else if (root != node->root) { 2716 release = 1;
2401 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402 } 2717 }
2403 2718
2404 update_processed_blocks(rc, node); 2719 if (root) {
2405 ret = 0; 2720 if (root->ref_cows) {
2721 BUG_ON(node->new_bytenr);
2722 BUG_ON(!list_empty(&node->list));
2723 btrfs_record_root_in_trans(trans, root);
2724 root = root->reloc_root;
2725 node->new_bytenr = root->node->start;
2726 node->root = root;
2727 list_add_tail(&node->list, &rc->backref_cache.changed);
2728 } else {
2729 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path);
2732 if (ret > 0)
2733 ret = 0;
2734 }
2735 if (!ret)
2736 update_processed_blocks(rc, node);
2737 } else {
2738 ret = do_relocation(trans, rc, node, key, path, 1);
2739 }
2406out: 2740out:
2407 drop_node_buffer(node); 2741 if (ret || node->level == 0 || node->cowonly) {
2742 if (release)
2743 release_metadata_space(rc, node);
2744 remove_backref_node(&rc->backref_cache, node);
2745 }
2408 return ret; 2746 return ret;
2409} 2747}
2410 2748
@@ -2415,12 +2753,10 @@ static noinline_for_stack
2415int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2753int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416 struct reloc_control *rc, struct rb_root *blocks) 2754 struct reloc_control *rc, struct rb_root *blocks)
2417{ 2755{
2418 struct backref_cache *cache;
2419 struct backref_node *node; 2756 struct backref_node *node;
2420 struct btrfs_path *path; 2757 struct btrfs_path *path;
2421 struct tree_block *block; 2758 struct tree_block *block;
2422 struct rb_node *rb_node; 2759 struct rb_node *rb_node;
2423 int level = -1;
2424 int ret; 2760 int ret;
2425 int err = 0; 2761 int err = 0;
2426 2762
@@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2428 if (!path) 2764 if (!path)
2429 return -ENOMEM; 2765 return -ENOMEM;
2430 2766
2431 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432 if (!cache) {
2433 btrfs_free_path(path);
2434 return -ENOMEM;
2435 }
2436
2437 backref_cache_init(cache);
2438
2439 rb_node = rb_first(blocks); 2767 rb_node = rb_first(blocks);
2440 while (rb_node) { 2768 while (rb_node) {
2441 block = rb_entry(rb_node, struct tree_block, rb_node); 2769 block = rb_entry(rb_node, struct tree_block, rb_node);
2442 if (level == -1)
2443 level = block->level;
2444 else
2445 BUG_ON(level != block->level);
2446 if (!block->key_ready) 2770 if (!block->key_ready)
2447 reada_tree_block(rc, block); 2771 reada_tree_block(rc, block);
2448 rb_node = rb_next(rb_node); 2772 rb_node = rb_next(rb_node);
@@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2460 while (rb_node) { 2784 while (rb_node) {
2461 block = rb_entry(rb_node, struct tree_block, rb_node); 2785 block = rb_entry(rb_node, struct tree_block, rb_node);
2462 2786
2463 node = build_backref_tree(rc, cache, &block->key, 2787 node = build_backref_tree(rc, &block->key,
2464 block->level, block->bytenr); 2788 block->level, block->bytenr);
2465 if (IS_ERR(node)) { 2789 if (IS_ERR(node)) {
2466 err = PTR_ERR(node); 2790 err = PTR_ERR(node);
@@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2470 ret = relocate_tree_block(trans, rc, node, &block->key, 2794 ret = relocate_tree_block(trans, rc, node, &block->key,
2471 path); 2795 path);
2472 if (ret < 0) { 2796 if (ret < 0) {
2473 err = ret; 2797 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2798 err = ret;
2474 goto out; 2799 goto out;
2475 } 2800 }
2476 remove_backref_node(cache, node);
2477 rb_node = rb_next(rb_node); 2801 rb_node = rb_next(rb_node);
2478 } 2802 }
2479 2803out:
2480 if (level > 0)
2481 goto out;
2482
2483 free_block_list(blocks); 2804 free_block_list(blocks);
2805 err = finish_pending_nodes(trans, rc, path, err);
2484 2806
2485 /* 2807 btrfs_free_path(path);
2486 * now backrefs of some upper level tree blocks have been cached, 2808 return err;
2487 * try relocating blocks referenced by these upper level blocks. 2809}
2488 */
2489 while (1) {
2490 struct backref_node *upper = NULL;
2491 if (trans->transaction->in_commit ||
2492 trans->transaction->delayed_refs.flushing)
2493 break;
2494 2810
2495 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2811static noinline_for_stack
2496 &upper); 2812int prealloc_file_extent_cluster(struct inode *inode,
2497 if (ret < 0) 2813 struct file_extent_cluster *cluster)
2498 err = ret; 2814{
2499 if (ret != 0) 2815 u64 alloc_hint = 0;
2500 break; 2816 u64 start;
2817 u64 end;
2818 u64 offset = BTRFS_I(inode)->index_cnt;
2819 u64 num_bytes;
2820 int nr = 0;
2821 int ret = 0;
2501 2822
2502 rb_node = rb_first(blocks); 2823 BUG_ON(cluster->start != cluster->boundary[0]);
2503 while (rb_node) { 2824 mutex_lock(&inode->i_mutex);
2504 block = rb_entry(rb_node, struct tree_block, rb_node);
2505 if (trans->transaction->in_commit ||
2506 trans->transaction->delayed_refs.flushing)
2507 goto out;
2508 BUG_ON(!block->key_ready);
2509 node = build_backref_tree(rc, cache, &block->key,
2510 level, block->bytenr);
2511 if (IS_ERR(node)) {
2512 err = PTR_ERR(node);
2513 goto out;
2514 }
2515 2825
2516 ret = relocate_tree_block(trans, rc, node, 2826 ret = btrfs_check_data_free_space(inode, cluster->end +
2517 &block->key, path); 2827 1 - cluster->start);
2518 if (ret < 0) { 2828 if (ret)
2519 err = ret; 2829 goto out;
2520 goto out;
2521 }
2522 remove_backref_node(cache, node);
2523 rb_node = rb_next(rb_node);
2524 }
2525 free_block_list(blocks);
2526 2830
2527 if (upper) { 2831 while (nr < cluster->nr) {
2528 ret = link_to_upper(trans, upper, path); 2832 start = cluster->boundary[nr] - offset;
2529 if (ret < 0) { 2833 if (nr + 1 < cluster->nr)
2530 err = ret; 2834 end = cluster->boundary[nr + 1] - 1 - offset;
2531 break; 2835 else
2532 } 2836 end = cluster->end - offset;
2533 remove_backref_node(cache, upper); 2837
2534 } 2838 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2839 num_bytes = end + 1 - start;
2840 ret = btrfs_prealloc_file_range(inode, 0, start,
2841 num_bytes, num_bytes,
2842 end + 1, &alloc_hint);
2843 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2844 if (ret)
2845 break;
2846 nr++;
2535 } 2847 }
2848 btrfs_free_reserved_data_space(inode, cluster->end +
2849 1 - cluster->start);
2536out: 2850out:
2537 free_block_list(blocks); 2851 mutex_unlock(&inode->i_mutex);
2538 2852 return ret;
2539 ret = finish_pending_nodes(trans, cache, path);
2540 if (ret < 0)
2541 err = ret;
2542
2543 kfree(cache);
2544 btrfs_free_path(path);
2545 return err;
2546} 2853}
2547 2854
2548static noinline_for_stack 2855static noinline_for_stack
@@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2588 u64 offset = BTRFS_I(inode)->index_cnt; 2895 u64 offset = BTRFS_I(inode)->index_cnt;
2589 unsigned long index; 2896 unsigned long index;
2590 unsigned long last_index; 2897 unsigned long last_index;
2591 unsigned int dirty_page = 0;
2592 struct page *page; 2898 struct page *page;
2593 struct file_ra_state *ra; 2899 struct file_ra_state *ra;
2594 int nr = 0; 2900 int nr = 0;
@@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2601 if (!ra) 2907 if (!ra)
2602 return -ENOMEM; 2908 return -ENOMEM;
2603 2909
2604 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2910 ret = prealloc_file_extent_cluster(inode, cluster);
2605 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2911 if (ret)
2912 goto out;
2606 2913
2607 mutex_lock(&inode->i_mutex); 2914 file_ra_state_init(ra, inode->i_mapping);
2608 2915
2609 i_size_write(inode, cluster->end + 1 - offset);
2610 ret = setup_extent_mapping(inode, cluster->start - offset, 2916 ret = setup_extent_mapping(inode, cluster->start - offset,
2611 cluster->end - offset, cluster->start); 2917 cluster->end - offset, cluster->start);
2612 if (ret) 2918 if (ret)
2613 goto out_unlock; 2919 goto out;
2614
2615 file_ra_state_init(ra, inode->i_mapping);
2616 2920
2617 WARN_ON(cluster->start != cluster->boundary[0]); 2921 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2922 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2618 while (index <= last_index) { 2923 while (index <= last_index) {
2924 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2925 if (ret)
2926 goto out;
2927
2619 page = find_lock_page(inode->i_mapping, index); 2928 page = find_lock_page(inode->i_mapping, index);
2620 if (!page) { 2929 if (!page) {
2621 page_cache_sync_readahead(inode->i_mapping, 2930 page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2623 last_index + 1 - index); 2932 last_index + 1 - index);
2624 page = grab_cache_page(inode->i_mapping, index); 2933 page = grab_cache_page(inode->i_mapping, index);
2625 if (!page) { 2934 if (!page) {
2935 btrfs_delalloc_release_metadata(inode,
2936 PAGE_CACHE_SIZE);
2626 ret = -ENOMEM; 2937 ret = -ENOMEM;
2627 goto out_unlock; 2938 goto out;
2628 } 2939 }
2629 } 2940 }
2630 2941
@@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2640 if (!PageUptodate(page)) { 2951 if (!PageUptodate(page)) {
2641 unlock_page(page); 2952 unlock_page(page);
2642 page_cache_release(page); 2953 page_cache_release(page);
2954 btrfs_delalloc_release_metadata(inode,
2955 PAGE_CACHE_SIZE);
2643 ret = -EIO; 2956 ret = -EIO;
2644 goto out_unlock; 2957 goto out;
2645 } 2958 }
2646 } 2959 }
2647 2960
@@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2660 EXTENT_BOUNDARY, GFP_NOFS); 2973 EXTENT_BOUNDARY, GFP_NOFS);
2661 nr++; 2974 nr++;
2662 } 2975 }
2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 2976
2977 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2665 set_page_dirty(page); 2978 set_page_dirty(page);
2666 dirty_page++;
2667 2979
2668 unlock_extent(&BTRFS_I(inode)->io_tree, 2980 unlock_extent(&BTRFS_I(inode)->io_tree,
2669 page_start, page_end, GFP_NOFS); 2981 page_start, page_end, GFP_NOFS);
@@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2671 page_cache_release(page); 2983 page_cache_release(page);
2672 2984
2673 index++; 2985 index++;
2674 if (nr < cluster->nr && 2986 balance_dirty_pages_ratelimited(inode->i_mapping);
2675 page_end + 1 + offset == cluster->boundary[nr]) { 2987 btrfs_throttle(BTRFS_I(inode)->root);
2676 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677 dirty_page);
2678 dirty_page = 0;
2679 }
2680 }
2681 if (dirty_page) {
2682 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683 dirty_page);
2684 } 2988 }
2685 WARN_ON(nr != cluster->nr); 2989 WARN_ON(nr != cluster->nr);
2686out_unlock: 2990out:
2687 mutex_unlock(&inode->i_mutex);
2688 kfree(ra); 2991 kfree(ra);
2689 return ret; 2992 return ret;
2690} 2993}
@@ -2870,9 +3173,6 @@ out:
2870static int block_use_full_backref(struct reloc_control *rc, 3173static int block_use_full_backref(struct reloc_control *rc,
2871 struct extent_buffer *eb) 3174 struct extent_buffer *eb)
2872{ 3175{
2873 struct btrfs_path *path;
2874 struct btrfs_extent_item *ei;
2875 struct btrfs_key key;
2876 u64 flags; 3176 u64 flags;
2877 int ret; 3177 int ret;
2878 3178
@@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2880 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3180 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881 return 1; 3181 return 1;
2882 3182
2883 path = btrfs_alloc_path(); 3183 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2884 BUG_ON(!path); 3184 eb->start, eb->len, NULL, &flags);
2885
2886 key.objectid = eb->start;
2887 key.type = BTRFS_EXTENT_ITEM_KEY;
2888 key.offset = eb->len;
2889
2890 path->search_commit_root = 1;
2891 path->skip_locking = 1;
2892 ret = btrfs_search_slot(NULL, rc->extent_root,
2893 &key, path, 0, 0);
2894 BUG_ON(ret); 3185 BUG_ON(ret);
2895 3186
2896 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897 struct btrfs_extent_item);
2898 flags = btrfs_extent_flags(path->nodes[0], ei);
2899 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3187 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901 ret = 1; 3188 ret = 1;
2902 else 3189 else
2903 ret = 0; 3190 ret = 0;
2904 btrfs_free_path(path);
2905 return ret; 3191 return ret;
2906} 3192}
2907 3193
@@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
3074 struct btrfs_extent_inline_ref *iref; 3360 struct btrfs_extent_inline_ref *iref;
3075 unsigned long ptr; 3361 unsigned long ptr;
3076 unsigned long end; 3362 unsigned long end;
3077 u32 blocksize; 3363 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3078 int ret; 3364 int ret;
3079 int err = 0; 3365 int err = 0;
3080 3366
3081 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082 extent_key->offset);
3083 BUG_ON(ret < 0);
3084 if (ret > 0) {
3085 /* the relocated data is fragmented */
3086 rc->extents_skipped++;
3087 btrfs_release_path(rc->extent_root, path);
3088 return 0;
3089 }
3090
3091 blocksize = btrfs_level_size(rc->extent_root, 0);
3092
3093 eb = path->nodes[0]; 3367 eb = path->nodes[0];
3094 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3368 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3369 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
3170 */ 3444 */
3171static noinline_for_stack 3445static noinline_for_stack
3172int find_next_extent(struct btrfs_trans_handle *trans, 3446int find_next_extent(struct btrfs_trans_handle *trans,
3173 struct reloc_control *rc, struct btrfs_path *path) 3447 struct reloc_control *rc, struct btrfs_path *path,
3448 struct btrfs_key *extent_key)
3174{ 3449{
3175 struct btrfs_key key; 3450 struct btrfs_key key;
3176 struct extent_buffer *leaf; 3451 struct extent_buffer *leaf;
@@ -3225,6 +3500,7 @@ next:
3225 rc->search_start = end + 1; 3500 rc->search_start = end + 1;
3226 } else { 3501 } else {
3227 rc->search_start = key.objectid + key.offset; 3502 rc->search_start = key.objectid + key.offset;
3503 memcpy(extent_key, &key, sizeof(key));
3228 return 0; 3504 return 0;
3229 } 3505 }
3230 } 3506 }
@@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags)
3262 return 0; 3538 return 0;
3263} 3539}
3264 3540
3541static noinline_for_stack
3542int prepare_to_relocate(struct reloc_control *rc)
3543{
3544 struct btrfs_trans_handle *trans;
3545 int ret;
3546
3547 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3548 if (!rc->block_rsv)
3549 return -ENOMEM;
3550
3551 /*
3552 * reserve some space for creating reloc trees.
3553 * btrfs_init_reloc_root will use them when there
3554 * is no reservation in transaction handle.
3555 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256,
3558 &rc->block_rsv_retries);
3559 if (ret)
3560 return ret;
3561
3562 rc->block_rsv->refill_used = 1;
3563 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3564
3565 memset(&rc->cluster, 0, sizeof(rc->cluster));
3566 rc->search_start = rc->block_group->key.objectid;
3567 rc->extents_found = 0;
3568 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571
3572 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc);
3574
3575 trans = btrfs_join_transaction(rc->extent_root, 1);
3576 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0;
3578}
3265 3579
3266static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3580static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267{ 3581{
3268 struct rb_root blocks = RB_ROOT; 3582 struct rb_root blocks = RB_ROOT;
3269 struct btrfs_key key; 3583 struct btrfs_key key;
3270 struct file_extent_cluster *cluster;
3271 struct btrfs_trans_handle *trans = NULL; 3584 struct btrfs_trans_handle *trans = NULL;
3272 struct btrfs_path *path; 3585 struct btrfs_path *path;
3273 struct btrfs_extent_item *ei; 3586 struct btrfs_extent_item *ei;
@@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3277 int ret; 3590 int ret;
3278 int err = 0; 3591 int err = 0;
3279 3592
3280 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281 if (!cluster)
3282 return -ENOMEM;
3283
3284 path = btrfs_alloc_path(); 3593 path = btrfs_alloc_path();
3285 if (!path) { 3594 if (!path)
3286 kfree(cluster);
3287 return -ENOMEM; 3595 return -ENOMEM;
3288 }
3289
3290 rc->extents_found = 0;
3291 rc->extents_skipped = 0;
3292
3293 rc->search_start = rc->block_group->key.objectid;
3294 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295 GFP_NOFS);
3296
3297 rc->create_reloc_root = 1;
3298 set_reloc_control(rc);
3299 3596
3300 trans = btrfs_start_transaction(rc->extent_root, 1); 3597 ret = prepare_to_relocate(rc);
3301 btrfs_commit_transaction(trans, rc->extent_root); 3598 if (ret) {
3599 err = ret;
3600 goto out_free;
3601 }
3302 3602
3303 while (1) { 3603 while (1) {
3304 trans = btrfs_start_transaction(rc->extent_root, 1); 3604 trans = btrfs_start_transaction(rc->extent_root, 0);
3605
3606 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root);
3608 continue;
3609 }
3305 3610
3306 ret = find_next_extent(trans, rc, path); 3611 ret = find_next_extent(trans, rc, path, &key);
3307 if (ret < 0) 3612 if (ret < 0)
3308 err = ret; 3613 err = ret;
3309 if (ret != 0) 3614 if (ret != 0)
@@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3313 3618
3314 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3619 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315 struct btrfs_extent_item); 3620 struct btrfs_extent_item);
3316 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3621 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3317 item_size = btrfs_item_size_nr(path->nodes[0],
3318 path->slots[0]);
3319 if (item_size >= sizeof(*ei)) { 3622 if (item_size >= sizeof(*ei)) {
3320 flags = btrfs_extent_flags(path->nodes[0], ei); 3623 flags = btrfs_extent_flags(path->nodes[0], ei);
3321 ret = check_extent_flags(flags); 3624 ret = check_extent_flags(flags);
@@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3664 } else {
3362 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3666 ret = 0;
3364 } 3667 }
3365 if (ret < 0) { 3668 if (ret < 0) {
3366 err = 0; 3669 err = ret;
3367 break; 3670 break;
3368 } 3671 }
3369 3672
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3373 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3374 break; 3691 break;
3375 } 3692 }
3693 rc->commit_transaction = 1;
3376 } 3694 }
3377 3695
3378 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3380 trans = NULL; 3705 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3706
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3711 &key, &rc->cluster);
3388 if (ret < 0) { 3712 if (ret < 0) {
3389 err = ret; 3713 err = ret;
3390 break; 3714 break;
3391 } 3715 }
3392 } 3716 }
3393 } 3717 }
3394 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3395 3722
3396 if (trans) { 3723 if (trans) {
3397 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3727 }
3401 3728
3402 if (!err) { 3729 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3404 if (ret < 0) 3732 if (ret < 0)
3405 err = ret; 3733 err = ret;
3406 } 3734 }
3407 3735
3408 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3409 3738
3410 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3741
3413 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3743
3418 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3419 3745
3746 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3749
3422 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3425 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3426 return err; 3756 return err;
3427} 3757}
3428 3758
@@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3454out: 3785out:
@@ -3460,8 +3791,9 @@ out:
3460 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3462 */ 3793 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3465{ 3797{
3466 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3476 return ERR_CAST(root); 3808 return ERR_CAST(root);
3477 3809
3478 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3480 3813
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3815 if (err)
@@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3829out:
3497 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3833 if (err) {
3502 if (inode) 3834 if (inode)
@@ -3506,6 +3838,21 @@ out:
3506 return inode; 3838 return inode;
3507} 3839}
3508 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3509/* 3856/*
3510 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3511 */ 3858 */
@@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3862 struct reloc_control *rc;
3516 int ret; 3863 int ret;
3864 int rw = 0;
3517 int err = 0; 3865 int err = 0;
3518 3866
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3520 if (!rc) 3868 if (!rc)
3521 return -ENOMEM; 3869 return -ENOMEM;
3522 3870
3523 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3872
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3529 3875
3530 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3878 if (ret) {
3533 rc->extent_root = extent_root; 3879 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3535 3884
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3898
3550 while (1) { 3899 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3555 3901
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3906 if (ret < 0) {
3561 err = ret; 3907 err = ret;
3562 break; 3908 goto out;
3563 } 3909 }
3564 3910
3565 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3920 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3922 }
3589 } 3923 }
3590 3924
@@ -3597,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3936 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3938 kfree(rc);
3604 return err; 3939 return err;
@@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3610 int ret; 3945 int ret;
3611 3946
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3948
3614 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3703 goto out; 4038 goto out;
3704 4039
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3706 if (!rc) { 4041 if (!rc) {
3707 err = -ENOMEM; 4042 err = -ENOMEM;
3708 goto out; 4043 goto out;
3709 } 4044 }
3710 4045
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3716 4047
3717 set_reloc_control(rc); 4048 set_reloc_control(rc);
3718 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3719 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3736 } 4071 }
3737 4072
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3740 4074
3741 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3742 4076
3743 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3744 4078
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4081out:
3748 if (rc) { 4082 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3815 return 0; 4146 return 0;
3816} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * requried for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230 * tree is dropped. if we create snapshot, cowing the fs
4231 * tree may use more space than it frees. so we need
4232 * reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
4236
4237/*
4238 * called after snapshot is created. migrate block reservation
4239 * and create reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
363 if (!di) { 365 if (!di) {
364 /* 366 /*
365 * Ok the default dir item isn't there. This is weird since 367 * Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
390 location.offset = 0; 392 location.offset = 0;
391 393
392 inode = btrfs_iget(sb, &location, new_root, &new); 394 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode) 395 if (IS_ERR(inode))
394 return ERR_PTR(-ENOMEM); 396 return ERR_CAST(inode);
395 397
396 /* 398 /*
397 * If we're just mounting the root most subvol put the inode and return 399 * If we're just mounting the root most subvol put the inode and return
@@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
500 502
501 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
503 return ret; 505 return ret;
504} 506}
@@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 697 return -EINVAL;
696 698
697 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 700 WARN_ON(ret);
700 701
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 704 WARN_ON(ret);
703 705
704 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 716 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 717 struct btrfs_space_info *found;
716 u64 total_used = 0; 718 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 721
721 rcu_read_lock(); 722 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 723 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 724 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 725 rcu_read_unlock();
740 726
741 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 730 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
747 733
@@ -832,11 +818,14 @@ static const struct file_operations btrfs_ctl_fops = {
832}; 818};
833 819
834static struct miscdevice btrfs_misc = { 820static struct miscdevice btrfs_misc = {
835 .minor = MISC_DYNAMIC_MINOR, 821 .minor = BTRFS_MINOR,
836 .name = "btrfs-control", 822 .name = "btrfs-control",
837 .fops = &btrfs_ctl_fops 823 .fops = &btrfs_ctl_fops
838}; 824};
839 825
826MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
827MODULE_ALIAS("devname:btrfs-control");
828
840static int btrfs_interface_init(void) 829static int btrfs_interface_init(void)
841{ 830{
842 return misc_register(&btrfs_misc); 831 return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reseration for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -282,7 +276,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 276 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 277 * attributes are handled directly.
284 */ 278 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 279const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 280#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 281 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 282 &btrfs_xattr_acl_default_handler,
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/buffer.c b/fs/buffer.c
index c9c266db0624..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -275,6 +275,7 @@ void invalidate_bdev(struct block_device *bdev)
275 return; 275 return;
276 276
277 invalidate_bh_lrus(); 277 invalidate_bh_lrus();
278 lru_add_drain_all(); /* make sure all lru add caches are flushed */
278 invalidate_mapping_pages(mapping, 0, -1); 279 invalidate_mapping_pages(mapping, 0, -1);
279} 280}
280EXPORT_SYMBOL(invalidate_bdev); 281EXPORT_SYMBOL(invalidate_bdev);
@@ -560,26 +561,17 @@ repeat:
560 return err; 561 return err;
561} 562}
562 563
563static void do_thaw_all(struct work_struct *work) 564static void do_thaw_one(struct super_block *sb, void *unused)
564{ 565{
565 struct super_block *sb;
566 char b[BDEVNAME_SIZE]; 566 char b[BDEVNAME_SIZE];
567 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
568 printk(KERN_WARNING "Emergency Thaw on %s\n",
569 bdevname(sb->s_bdev, b));
570}
567 571
568 spin_lock(&sb_lock); 572static void do_thaw_all(struct work_struct *work)
569restart: 573{
570 list_for_each_entry(sb, &super_blocks, s_list) { 574 iterate_supers(do_thaw_one, NULL);
571 sb->s_count++;
572 spin_unlock(&sb_lock);
573 down_read(&sb->s_umount);
574 while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
575 printk(KERN_WARNING "Emergency Thaw on %s\n",
576 bdevname(sb->s_bdev, b));
577 up_read(&sb->s_umount);
578 spin_lock(&sb_lock);
579 if (__put_super_and_need_restart(sb))
580 goto restart;
581 }
582 spin_unlock(&sb_lock);
583 kfree(work); 575 kfree(work);
584 printk(KERN_WARNING "Emergency Thaw complete\n"); 576 printk(KERN_WARNING "Emergency Thaw complete\n");
585} 577}
@@ -1957,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1957} 1949}
1958 1950
1959/* 1951/*
1960 * block_write_begin takes care of the basic task of block allocation and 1952 * Filesystems implementing the new truncate sequence should use the
1961 * bringing partial write blocks uptodate first. 1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
1962 * 1954 * The filesystem needs to handle block truncation upon failure.
1963 * If *pagep is not NULL, then block_write_begin uses the locked page
1964 * at *pagep rather than allocating its own. In this case, the page will
1965 * not be unlocked or deallocated on failure.
1966 */ 1955 */
1967int block_write_begin(struct file *file, struct address_space *mapping, 1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1968 loff_t pos, unsigned len, unsigned flags, 1957 loff_t pos, unsigned len, unsigned flags,
1969 struct page **pagep, void **fsdata, 1958 struct page **pagep, void **fsdata,
1970 get_block_t *get_block) 1959 get_block_t *get_block)
@@ -2000,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
2000 unlock_page(page); 1989 unlock_page(page);
2001 page_cache_release(page); 1990 page_cache_release(page);
2002 *pagep = NULL; 1991 *pagep = NULL;
2003
2004 /*
2005 * prepare_write() may have instantiated a few blocks
2006 * outside i_size. Trim these off again. Don't need
2007 * i_size_read because we hold i_mutex.
2008 */
2009 if (pos + len > inode->i_size)
2010 vmtruncate(inode, inode->i_size);
2011 } 1992 }
2012 } 1993 }
2013 1994
2014out: 1995out:
2015 return status; 1996 return status;
2016} 1997}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2017EXPORT_SYMBOL(block_write_begin); 2036EXPORT_SYMBOL(block_write_begin);
2018 2037
2019int block_write_end(struct file *file, struct address_space *mapping, 2038int block_write_end(struct file *file, struct address_space *mapping,
@@ -2332,7 +2351,7 @@ out:
2332 * For moronic filesystems that do not allow holes in file. 2351 * For moronic filesystems that do not allow holes in file.
2333 * We may have to extend the file. 2352 * We may have to extend the file.
2334 */ 2353 */
2335int cont_write_begin(struct file *file, struct address_space *mapping, 2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2336 loff_t pos, unsigned len, unsigned flags, 2355 loff_t pos, unsigned len, unsigned flags,
2337 struct page **pagep, void **fsdata, 2356 struct page **pagep, void **fsdata,
2338 get_block_t *get_block, loff_t *bytes) 2357 get_block_t *get_block, loff_t *bytes)
@@ -2353,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2353 } 2372 }
2354 2373
2355 *pagep = NULL; 2374 *pagep = NULL;
2356 err = block_write_begin(file, mapping, pos, len, 2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2357 flags, pagep, fsdata, get_block); 2376 flags, pagep, fsdata, get_block);
2358out: 2377out:
2359 return err; 2378 return err;
2360} 2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398}
2361EXPORT_SYMBOL(cont_write_begin); 2399EXPORT_SYMBOL(cont_write_begin);
2362 2400
2363int block_prepare_write(struct page *page, unsigned from, unsigned to, 2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2389,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2389 * 2427 *
2390 * We are not allowed to take the i_mutex here so we have to play games to 2428 * We are not allowed to take the i_mutex here so we have to play games to
2391 * protect against truncate races as the page could now be beyond EOF. Because 2429 * protect against truncate races as the page could now be beyond EOF. Because
2392 * vmtruncate() writes the inode size before removing pages, once we have the 2430 * truncate writes the inode size before removing pages, once we have the
2393 * page lock we can determine safely if the page is beyond EOF. If it is not 2431 * page lock we can determine safely if the page is beyond EOF. If it is not
2394 * beyond EOF, then the page is guaranteed safe against truncation until we 2432 * beyond EOF, then the page is guaranteed safe against truncation until we
2395 * unlock the page. 2433 * unlock the page.
@@ -2472,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2472} 2510}
2473 2511
2474/* 2512/*
2475 * On entry, the page is fully not uptodate. 2513 * Filesystems implementing the new truncate sequence should use the
2476 * On exit the page is fully uptodate in the areas outside (from,to) 2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
2515 * The filesystem needs to handle block truncation upon failure.
2477 */ 2516 */
2478int nobh_write_begin(struct file *file, struct address_space *mapping, 2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2479 loff_t pos, unsigned len, unsigned flags, 2518 loff_t pos, unsigned len, unsigned flags,
2480 struct page **pagep, void **fsdata, 2519 struct page **pagep, void **fsdata,
2481 get_block_t *get_block) 2520 get_block_t *get_block)
@@ -2508,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2508 unlock_page(page); 2547 unlock_page(page);
2509 page_cache_release(page); 2548 page_cache_release(page);
2510 *pagep = NULL; 2549 *pagep = NULL;
2511 return block_write_begin(file, mapping, pos, len, flags, pagep, 2550 return block_write_begin_newtrunc(file, mapping, pos, len,
2512 fsdata, get_block); 2551 flags, pagep, fsdata, get_block);
2513 } 2552 }
2514 2553
2515 if (PageMappedToDisk(page)) 2554 if (PageMappedToDisk(page))
@@ -2613,8 +2652,34 @@ out_release:
2613 page_cache_release(page); 2652 page_cache_release(page);
2614 *pagep = NULL; 2653 *pagep = NULL;
2615 2654
2616 if (pos + len > inode->i_size) 2655 return ret;
2617 vmtruncate(inode, inode->i_size); 2656}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2618 2683
2619 return ret; 2684 return ret;
2620} 2685}
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0; 275 int rc = 0;
276 struct page **pages; 276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset; 277 loff_t offset;
279 u64 len; 278 u64 len;
280 279
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
297 if (rc < 0) 296 if (rc < 0)
298 goto out; 297 goto out;
299 298
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0; 299 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) { 300 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page = 301 struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
312 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 } 310 }
314 311
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page); 313 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n", 314 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page); 315 inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
323 flush_dcache_page(page); 320 flush_dcache_page(page);
324 SetPageUptodate(page); 321 SetPageUptodate(page);
325 unlock_page(page); 322 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0) 323 page_cache_release(page);
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 } 324 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0; 325 rc = 0;
331 326
332out: 327out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
568 ceph_release_pages(req->r_pages, req->r_num_pages); 563 ceph_release_pages(req->r_pages, req->r_num_pages);
569 if (req->r_pages_from_pool) 564 if (req->r_pages_from_pool)
570 mempool_free(req->r_pages, 565 mempool_free(req->r_pages,
571 ceph_client(inode->i_sb)->wb_pagevec_pool); 566 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
572 else 567 else
573 kfree(req->r_pages); 568 kfree(req->r_pages);
574 ceph_osdc_put_request(req); 569 ceph_osdc_put_request(req);
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h> 4#include <linux/err.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7 6
@@ -150,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
150 149
151 ret = ac->ops->build_request(ac, p + sizeof(u32), end); 150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
152 if (ret < 0) { 151 if (ret < 0) {
153 pr_err("error %d building request\n", ret); 152 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name);
154 return ret; 154 return ret;
155 } 155 }
156 dout(" built request %d bytes\n", ret); 156 dout(" built request %d bytes\n", ret);
@@ -229,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
229 if (ret == -EAGAIN) { 229 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 230 return ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 231 } else if (ret) {
232 pr_err("authentication error %d\n", ret); 232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret; 233 return ret;
234 } 234 }
235 return 0; 235 return 0;
@@ -246,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
246 if (!ac->protocol) 246 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
249 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
250 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
251 return 0; 251 return 0;
252} 252}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
15struct ceph_authorizer; 15struct ceph_authorizer;
16 16
17struct ceph_auth_client_ops { 17struct ceph_auth_client_ops {
18 const char *name;
19
18 /* 20 /*
19 * true if we are authenticated and can connect to 21 * true if we are authenticated and can connect to
20 * services. 22 * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
22 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
23 25
24 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
25 * build requests and process replies during monitor 33 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
31 return !xi->starting; 31 return !xi->starting;
32} 32}
33 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
34/* 41/*
35 * the generic auth code decode the global_id, and we carry no actual 42 * the generic auth code decode the global_id, and we carry no actual
36 * authenticate state, so nothing happens here. 43 * authenticate state, so nothing happens here.
@@ -94,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
94} 101}
95 102
96static const struct ceph_auth_client_ops ceph_auth_none_ops = { 103static const struct ceph_auth_client_ops ceph_auth_none_ops = {
104 .name = "none",
97 .reset = reset, 105 .reset = reset,
98 .destroy = destroy, 106 .destroy = destroy,
99 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
100 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28} 28}
29 29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
30static int ceph_x_encrypt_buflen(int ilen) 41static int ceph_x_encrypt_buflen(int ilen)
31{ 42{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -127,7 +138,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
127 int ret; 138 int ret;
128 char *dbuf; 139 char *dbuf;
129 char *ticket_buf; 140 char *ticket_buf;
130 u8 struct_v; 141 u8 reply_struct_v;
131 142
132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); 143 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
133 if (!dbuf) 144 if (!dbuf)
@@ -139,14 +150,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
139 goto out_dbuf; 150 goto out_dbuf;
140 151
141 ceph_decode_need(&p, end, 1 + sizeof(u32), bad); 152 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
142 struct_v = ceph_decode_8(&p); 153 reply_struct_v = ceph_decode_8(&p);
143 if (struct_v != 1) 154 if (reply_struct_v != 1)
144 goto bad; 155 goto bad;
145 num = ceph_decode_32(&p); 156 num = ceph_decode_32(&p);
146 dout("%d tickets\n", num); 157 dout("%d tickets\n", num);
147 while (num--) { 158 while (num--) {
148 int type; 159 int type;
149 u8 struct_v; 160 u8 tkt_struct_v, blob_struct_v;
150 struct ceph_x_ticket_handler *th; 161 struct ceph_x_ticket_handler *th;
151 void *dp, *dend; 162 void *dp, *dend;
152 int dlen; 163 int dlen;
@@ -165,8 +176,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
165 type = ceph_decode_32(&p); 176 type = ceph_decode_32(&p);
166 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 177 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
167 178
168 struct_v = ceph_decode_8(&p); 179 tkt_struct_v = ceph_decode_8(&p);
169 if (struct_v != 1) 180 if (tkt_struct_v != 1)
170 goto bad; 181 goto bad;
171 182
172 th = get_ticket_handler(ac, type); 183 th = get_ticket_handler(ac, type);
@@ -186,8 +197,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
186 dend = dbuf + dlen; 197 dend = dbuf + dlen;
187 dp = dbuf; 198 dp = dbuf;
188 199
189 struct_v = ceph_decode_8(&dp); 200 tkt_struct_v = ceph_decode_8(&dp);
190 if (struct_v != 1) 201 if (tkt_struct_v != 1)
191 goto bad; 202 goto bad;
192 203
193 memcpy(&old_key, &th->session_key, sizeof(old_key)); 204 memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +235,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
224 tpend = tp + dlen; 235 tpend = tp + dlen;
225 dout(" ticket blob is %d bytes\n", dlen); 236 dout(" ticket blob is %d bytes\n", dlen);
226 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad); 237 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
227 struct_v = ceph_decode_8(&tp); 238 blob_struct_v = ceph_decode_8(&tp);
228 new_secret_id = ceph_decode_64(&tp); 239 new_secret_id = ceph_decode_64(&tp);
229 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend); 240 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
230 if (ret) 241 if (ret)
@@ -618,7 +629,9 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
618 629
619 630
620static const struct ceph_auth_client_ops ceph_x_ops = { 631static const struct ceph_auth_client_ops ceph_x_ops = {
632 .name = "x",
621 .is_authenticated = ceph_x_is_authenticated, 633 .is_authenticated = ceph_x_is_authenticated,
634 .should_authenticate = ceph_x_should_authenticate,
622 .build_request = ceph_x_build_request, 635 .build_request = ceph_x_build_request,
623 .handle_reply = ceph_x_handle_reply, 636 .handle_reply = ceph_x_handle_reply,
624 .create_authorizer = ceph_x_create_authorizer, 637 .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..619b61655ee5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
867{ 867{
868 struct ceph_mds_session *session = cap->session; 868 struct ceph_mds_session *session = cap->session;
869 struct ceph_inode_info *ci = cap->ci; 869 struct ceph_inode_info *ci = cap->ci;
870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 870 struct ceph_mds_client *mdsc =
871 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0; 872 int removed = 0;
872 873
873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 874 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
937 seq, issue_seq, mseq, follows, size, max_size, 938 seq, issue_seq, mseq, follows, size, max_size,
938 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 939 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
939 940
940 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL); 941 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
941 if (IS_ERR(msg)) 942 if (!msg)
942 return PTR_ERR(msg); 943 return -ENOMEM;
943 944
944 msg->hdr.tid = cpu_to_le64(flush_tid); 945 msg->hdr.tid = cpu_to_le64(flush_tid);
945 946
@@ -980,6 +981,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
980 return 0; 981 return 0;
981} 982}
982 983
984static void __queue_cap_release(struct ceph_mds_session *session,
985 u64 ino, u64 cap_id, u32 migrate_seq,
986 u32 issue_seq)
987{
988 struct ceph_msg *msg;
989 struct ceph_mds_cap_release *head;
990 struct ceph_mds_cap_item *item;
991
992 spin_lock(&session->s_cap_lock);
993 BUG_ON(!session->s_num_cap_releases);
994 msg = list_first_entry(&session->s_cap_releases,
995 struct ceph_msg, list_head);
996
997 dout(" adding %llx release to mds%d msg %p (%d left)\n",
998 ino, session->s_mds, msg, session->s_num_cap_releases);
999
1000 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1001 head = msg->front.iov_base;
1002 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1003 item = msg->front.iov_base + msg->front.iov_len;
1004 item->ino = cpu_to_le64(ino);
1005 item->cap_id = cpu_to_le64(cap_id);
1006 item->migrate_seq = cpu_to_le32(migrate_seq);
1007 item->seq = cpu_to_le32(issue_seq);
1008
1009 session->s_num_cap_releases--;
1010
1011 msg->front.iov_len += sizeof(*item);
1012 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1013 dout(" release msg %p full\n", msg);
1014 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1015 } else {
1016 dout(" release msg %p at %d/%d (%d)\n", msg,
1017 (int)le32_to_cpu(head->num),
1018 (int)CEPH_CAPS_PER_RELEASE,
1019 (int)msg->front.iov_len);
1020 }
1021 spin_unlock(&session->s_cap_lock);
1022}
1023
983/* 1024/*
984 * Queue cap releases when an inode is dropped from our cache. Since 1025 * Queue cap releases when an inode is dropped from our cache. Since
985 * inode is about to be destroyed, there is no need for i_lock. 1026 * inode is about to be destroyed, there is no need for i_lock.
@@ -993,41 +1034,9 @@ void ceph_queue_caps_release(struct inode *inode)
993 while (p) { 1034 while (p) {
994 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1035 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
995 struct ceph_mds_session *session = cap->session; 1036 struct ceph_mds_session *session = cap->session;
996 struct ceph_msg *msg;
997 struct ceph_mds_cap_release *head;
998 struct ceph_mds_cap_item *item;
999 1037
1000 spin_lock(&session->s_cap_lock); 1038 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1001 BUG_ON(!session->s_num_cap_releases); 1039 cap->mseq, cap->issue_seq);
1002 msg = list_first_entry(&session->s_cap_releases,
1003 struct ceph_msg, list_head);
1004
1005 dout(" adding %p release to mds%d msg %p (%d left)\n",
1006 inode, session->s_mds, msg, session->s_num_cap_releases);
1007
1008 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1009 head = msg->front.iov_base;
1010 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1011 item = msg->front.iov_base + msg->front.iov_len;
1012 item->ino = cpu_to_le64(ceph_ino(inode));
1013 item->cap_id = cpu_to_le64(cap->cap_id);
1014 item->migrate_seq = cpu_to_le32(cap->mseq);
1015 item->seq = cpu_to_le32(cap->issue_seq);
1016
1017 session->s_num_cap_releases--;
1018
1019 msg->front.iov_len += sizeof(*item);
1020 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1021 dout(" release msg %p full\n", msg);
1022 list_move_tail(&msg->list_head,
1023 &session->s_cap_releases_done);
1024 } else {
1025 dout(" release msg %p at %d/%d (%d)\n", msg,
1026 (int)le32_to_cpu(head->num),
1027 (int)CEPH_CAPS_PER_RELEASE,
1028 (int)msg->front.iov_len);
1029 }
1030 spin_unlock(&session->s_cap_lock);
1031 p = rb_next(p); 1040 p = rb_next(p);
1032 __ceph_remove_cap(cap); 1041 __ceph_remove_cap(cap);
1033 } 1042 }
@@ -1298,7 +1307,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1298 */ 1307 */
1299void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1308void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1300{ 1309{
1301 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 1310 struct ceph_mds_client *mdsc =
1311 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1302 struct inode *inode = &ci->vfs_inode; 1312 struct inode *inode = &ci->vfs_inode;
1303 int was = ci->i_dirty_caps; 1313 int was = ci->i_dirty_caps;
1304 int dirty = 0; 1314 int dirty = 0;
@@ -1336,7 +1346,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1336static int __mark_caps_flushing(struct inode *inode, 1346static int __mark_caps_flushing(struct inode *inode,
1337 struct ceph_mds_session *session) 1347 struct ceph_mds_session *session)
1338{ 1348{
1339 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1349 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1340 struct ceph_inode_info *ci = ceph_inode(inode); 1350 struct ceph_inode_info *ci = ceph_inode(inode);
1341 int flushing; 1351 int flushing;
1342 1352
@@ -1663,7 +1673,7 @@ ack:
1663static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1673static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1664 unsigned *flush_tid) 1674 unsigned *flush_tid)
1665{ 1675{
1666 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1676 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
1667 struct ceph_inode_info *ci = ceph_inode(inode); 1677 struct ceph_inode_info *ci = ceph_inode(inode);
1668 int unlock_session = session ? 0 : 1; 1678 int unlock_session = session ? 0 : 1;
1669 int flushing = 0; 1679 int flushing = 0;
@@ -1716,10 +1726,9 @@ out_unlocked:
1716static int caps_are_flushed(struct inode *inode, unsigned tid) 1726static int caps_are_flushed(struct inode *inode, unsigned tid)
1717{ 1727{
1718 struct ceph_inode_info *ci = ceph_inode(inode); 1728 struct ceph_inode_info *ci = ceph_inode(inode);
1719 int dirty, i, ret = 1; 1729 int i, ret = 1;
1720 1730
1721 spin_lock(&inode->i_lock); 1731 spin_lock(&inode->i_lock);
1722 dirty = __ceph_caps_dirty(ci);
1723 for (i = 0; i < CEPH_CAP_BITS; i++) 1732 for (i = 0; i < CEPH_CAP_BITS; i++)
1724 if ((ci->i_flushing_caps & (1 << i)) && 1733 if ((ci->i_flushing_caps & (1 << i)) &&
1725 ci->i_cap_flush_tid[i] <= tid) { 1734 ci->i_cap_flush_tid[i] <= tid) {
@@ -1775,9 +1784,9 @@ out:
1775 spin_unlock(&ci->i_unsafe_lock); 1784 spin_unlock(&ci->i_unsafe_lock);
1776} 1785}
1777 1786
1778int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) 1787int ceph_fsync(struct file *file, int datasync)
1779{ 1788{
1780 struct inode *inode = dentry->d_inode; 1789 struct inode *inode = file->f_mapping->host;
1781 struct ceph_inode_info *ci = ceph_inode(inode); 1790 struct ceph_inode_info *ci = ceph_inode(inode);
1782 unsigned flush_tid; 1791 unsigned flush_tid;
1783 int ret; 1792 int ret;
@@ -1829,7 +1838,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1829 err = wait_event_interruptible(ci->i_cap_wq, 1838 err = wait_event_interruptible(ci->i_cap_wq,
1830 caps_are_flushed(inode, flush_tid)); 1839 caps_are_flushed(inode, flush_tid));
1831 } else { 1840 } else {
1832 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 1841 struct ceph_mds_client *mdsc =
1842 &ceph_sb_to_client(inode->i_sb)->mdsc;
1833 1843
1834 spin_lock(&inode->i_lock); 1844 spin_lock(&inode->i_lock);
1835 if (__ceph_caps_dirty(ci)) 1845 if (__ceph_caps_dirty(ci))
@@ -2411,7 +2421,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2411 __releases(inode->i_lock) 2421 __releases(inode->i_lock)
2412{ 2422{
2413 struct ceph_inode_info *ci = ceph_inode(inode); 2423 struct ceph_inode_info *ci = ceph_inode(inode);
2414 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 2424 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
2415 unsigned seq = le32_to_cpu(m->seq); 2425 unsigned seq = le32_to_cpu(m->seq);
2416 int dirty = le32_to_cpu(m->dirty); 2426 int dirty = le32_to_cpu(m->dirty);
2417 int cleaned = 0; 2427 int cleaned = 0;
@@ -2653,7 +2663,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2653 struct ceph_mds_caps *h; 2663 struct ceph_mds_caps *h;
2654 int mds = session->s_mds; 2664 int mds = session->s_mds;
2655 int op; 2665 int op;
2656 u32 seq; 2666 u32 seq, mseq;
2657 struct ceph_vino vino; 2667 struct ceph_vino vino;
2658 u64 cap_id; 2668 u64 cap_id;
2659 u64 size, max_size; 2669 u64 size, max_size;
@@ -2673,6 +2683,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2673 vino.snap = CEPH_NOSNAP; 2683 vino.snap = CEPH_NOSNAP;
2674 cap_id = le64_to_cpu(h->cap_id); 2684 cap_id = le64_to_cpu(h->cap_id);
2675 seq = le32_to_cpu(h->seq); 2685 seq = le32_to_cpu(h->seq);
2686 mseq = le32_to_cpu(h->migrate_seq);
2676 size = le64_to_cpu(h->size); 2687 size = le64_to_cpu(h->size);
2677 max_size = le64_to_cpu(h->max_size); 2688 max_size = le64_to_cpu(h->max_size);
2678 2689
@@ -2687,6 +2698,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2687 vino.snap, inode); 2698 vino.snap, inode);
2688 if (!inode) { 2699 if (!inode) {
2689 dout(" i don't have ino %llx\n", vino.ino); 2700 dout(" i don't have ino %llx\n", vino.ino);
2701
2702 if (op == CEPH_CAP_OP_IMPORT)
2703 __queue_cap_release(session, vino.ino, cap_id,
2704 mseq, seq);
2705
2706 /*
2707 * send any full release message to try to move things
2708 * along for the mds (who clearly thinks we still have this
2709 * cap).
2710 */
2711 ceph_add_cap_releases(mdsc, session, -1);
2712 ceph_send_cap_releases(mdsc, session);
2690 goto done; 2713 goto done;
2691 } 2714 }
2692 2715
@@ -2712,7 +2735,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2712 spin_lock(&inode->i_lock); 2735 spin_lock(&inode->i_lock);
2713 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2736 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2714 if (!cap) { 2737 if (!cap) {
2715 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", 2738 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2716 inode, ceph_ino(inode), ceph_snap(inode), mds); 2739 inode, ceph_ino(inode), ceph_snap(inode), mds);
2717 spin_unlock(&inode->i_lock); 2740 spin_unlock(&inode->i_lock);
2718 goto done; 2741 goto done;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
19 * Ceph release version 19 * Ceph release version
20 */ 20 */
21#define CEPH_VERSION_MAJOR 0 21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19 22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0 23#define CEPH_VERSION_PATCH 0
24 24
25#define _CEPH_STRINGIFY(x) #x 25#define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
36 * client-facing protocol. 36 * client-facing protocol.
37 */ 37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */ 38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */ 39#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */ 40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */ 41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */ 42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
@@ -53,8 +53,18 @@
53/* 53/*
54 * feature bits 54 * feature bits
55 */ 55 */
56#define CEPH_FEATURE_SUPPORTED 0 56#define CEPH_FEATURE_UID 1
57#define CEPH_FEATURE_REQUIRED 0 57#define CEPH_FEATURE_NOSRCADDR 2
58#define CEPH_FEATURE_FLOCK 4
59
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
58 68
59 69
60/* 70/*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
91#define CEPH_AUTH_NONE 0x1 101#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2 102#define CEPH_AUTH_CEPHX 0x2
93 103
104#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
105
94 106
95/********************************************* 107/*********************************************
96 * message layer 108 * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
128#define CEPH_MSG_CLIENT_SNAP 0x312 140#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313 141#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130 142
143/* pool ops */
144#define CEPH_MSG_POOLOP_REPLY 48
145#define CEPH_MSG_POOLOP 49
146
147
131/* osd */ 148/* osd */
132#define CEPH_MSG_OSD_MAP 41 149#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42 150#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43 151#define CEPH_MSG_OSD_OPREPLY 43
135 152
153/* pool operations */
154enum {
155 POOL_OP_CREATE = 0x01,
156 POOL_OP_DELETE = 0x02,
157 POOL_OP_AUID_CHANGE = 0x03,
158 POOL_OP_CREATE_SNAP = 0x11,
159 POOL_OP_DELETE_SNAP = 0x12,
160 POOL_OP_CREATE_UNMANAGED_SNAP = 0x21,
161 POOL_OP_DELETE_UNMANAGED_SNAP = 0x22,
162};
163
136struct ceph_mon_request_header { 164struct ceph_mon_request_header {
137 __le64 have_version; 165 __le64 have_version;
138 __le16 session_mon; 166 __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
155 struct ceph_statfs st; 183 struct ceph_statfs st;
156} __attribute__ ((packed)); 184} __attribute__ ((packed));
157 185
186const char *ceph_pool_op_name(int op);
187
188struct ceph_mon_poolop {
189 struct ceph_mon_request_header monhdr;
190 struct ceph_fsid fsid;
191 __le32 pool;
192 __le32 op;
193 __le64 auid;
194 __le64 snapid;
195 __le32 name_len;
196} __attribute__ ((packed));
197
198struct ceph_mon_poolop_reply {
199 struct ceph_mon_request_header monhdr;
200 struct ceph_fsid fsid;
201 __le32 reply_code;
202 __le32 epoch;
203 char has_data;
204 char data[0];
205} __attribute__ ((packed));
206
207struct ceph_mon_unmanaged_snap {
208 __le64 snapid;
209} __attribute__ ((packed));
210
158struct ceph_osd_getmap { 211struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr; 212 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid; 213 struct ceph_fsid fsid;
@@ -212,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
212 * - they also define the lock ordering by the MDS 265 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds 266 * - a few of these are internal to the mds
214 */ 267 */
215#define CEPH_LOCK_DN 1 268#define CEPH_LOCK_DVERSION 1
216#define CEPH_LOCK_ISNAP 2 269#define CEPH_LOCK_DN 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */ 270#define CEPH_LOCK_ISNAP 16
218#define CEPH_LOCK_IFILE 8 /* mds internal */ 271#define CEPH_LOCK_IVERSION 32 /* mds internal */
219#define CEPH_LOCK_IAUTH 32 272#define CEPH_LOCK_IFILE 64
220#define CEPH_LOCK_ILINK 64 273#define CEPH_LOCK_IAUTH 128
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */ 274#define CEPH_LOCK_ILINK 256
222#define CEPH_LOCK_INEST 256 /* mds internal */ 275#define CEPH_LOCK_IDFT 512 /* dir frag tree */
223#define CEPH_LOCK_IXATTR 512 276#define CEPH_LOCK_INEST 1024 /* mds internal */
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ 277#define CEPH_LOCK_IXATTR 2048
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
225 279
226/* client_session ops */ 280/* client_session ops */
227enum { 281enum {
@@ -308,6 +362,7 @@ union ceph_mds_request_args {
308 struct { 362 struct {
309 __le32 frag; /* which dir fragment */ 363 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */ 364 __le32 max_entries; /* how many dentries to grab */
365 __le32 max_bytes;
311 } __attribute__ ((packed)) readdir; 366 } __attribute__ ((packed)) readdir;
312 struct { 367 struct {
313 __le32 mode; 368 __le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
10 case CEPH_ENTITY_TYPE_OSD: return "osd"; 10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon"; 11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client"; 12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth"; 13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown"; 14 default: return "unknown";
16 } 15 }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs"; 44 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs"; 45 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr"; 46 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
47 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
48 48
49 case CEPH_OSD_OP_PULL: return "pull"; 49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push"; 50 case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
174 } 174 }
175 return "???"; 175 return "???";
176} 176}
177
178const char *ceph_pool_op_name(int op)
179{
180 switch (op) {
181 case POOL_OP_CREATE: return "create";
182 case POOL_OP_DELETE: return "delete";
183 case POOL_OP_AUID_CHANGE: return "auid change";
184 case POOL_OP_CREATE_SNAP: return "create snap";
185 case POOL_OP_DELETE_SNAP: return "delete snap";
186 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
187 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
188 }
189 return "???";
190}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
113static int monc_show(struct seq_file *s, void *p) 113static int monc_show(struct seq_file *s, void *p)
114{ 114{
115 struct ceph_client *client = s->private; 115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req; 116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc; 117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp; 118 struct rb_node *rp;
119 119
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
126 if (monc->want_next_osdmap) 126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n"); 127 seq_printf(s, "want next osdmap\n");
128 128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) { 129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node); 130 __u16 op;
131 seq_printf(s, "%lld statfs\n", req->tid); 131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
132 } 137 }
133 138
134 mutex_unlock(&monc->mutex); 139 mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
51 return -ENOMEM; /* oh well */ 51 return -ENOMEM; /* oh well */
52 52
53 spin_lock(&dentry->d_lock); 53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */ 54 if (dentry->d_fsdata) {
55 /* lost a race */
56 kmem_cache_free(ceph_dentry_cachep, di);
55 goto out_unlock; 57 goto out_unlock;
58 }
56 di->dentry = dentry; 59 di->dentry = dentry;
57 di->lease_session = NULL; 60 di->lease_session = NULL;
58 dentry->d_fsdata = di; 61 dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
125 dentry = list_entry(p, struct dentry, d_u.d_child); 128 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry); 129 di = ceph_dentry(dentry);
127 while (1) { 130 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next, 131 dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
132 d_unhashed(dentry) ? "!hashed" : "hashed",
129 parent->d_subdirs.prev, parent->d_subdirs.next); 133 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) { 134 if (p == &parent->d_subdirs) {
131 fi->at_end = 1; 135 fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
229 u32 ftype; 233 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo; 234 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir; 235 const int max_entries = client->mount_args->max_readdir;
236 const int max_bytes = client->mount_args->max_readdir_bytes;
232 237
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 238 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end) 239 if (fi->at_end)
@@ -312,6 +317,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 317 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 318 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 319 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
320 req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
315 req->r_num_caps = max_entries + 1; 321 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 322 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 323 if (err < 0) {
@@ -335,7 +341,7 @@ more:
335 if (req->r_reply_info.dir_end) { 341 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name); 342 kfree(fi->last_name);
337 fi->last_name = NULL; 343 fi->last_name = NULL;
338 fi->next_offset = 0; 344 fi->next_offset = 2;
339 } else { 345 } else {
340 rinfo = &req->r_reply_info; 346 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi, 347 err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 484struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err) 485 struct dentry *dentry, int err)
480{ 486{
481 struct ceph_client *client = ceph_client(dentry->d_sb); 487 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode; 488 struct inode *parent = dentry->d_parent->d_inode;
483 489
484 /* .snap dir? */ 490 /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
568 !is_root_ceph_dentry(dir, dentry) && 574 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 575 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { 576 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock); 577 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir); 578 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL); 579 d_add(dentry, NULL);
@@ -582,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req)) 589 if (IS_ERR(req))
585 return ERR_PTR(PTR_ERR(req)); 590 return ERR_CAST(req);
586 req->r_dentry = dget(dentry); 591 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2; 592 req->r_num_caps = 2;
588 /* we only need inode linkage */ 593 /* we only need inode linkage */
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
888 893
889 /* ensure target dentry is invalidated, despite 894 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */ 895 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies; 896 ceph_invalidate_dentry_lease(new_dentry);
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
893 } 897 }
894 ceph_mdsc_put_request(req); 898 ceph_mdsc_put_request(req);
895 return err; 899 return err;
896} 900}
897 901
902/*
903 * Ensure a dentry lease will no longer revalidate.
904 */
905void ceph_invalidate_dentry_lease(struct dentry *dentry)
906{
907 spin_lock(&dentry->d_lock);
908 dentry->d_time = jiffies;
909 ceph_dentry(dentry)->lease_shared_gen = 0;
910 spin_unlock(&dentry->d_lock);
911}
898 912
899/* 913/*
900 * Check if dentry lease is valid. If not, delete the lease. Try to 914 * Check if dentry lease is valid. If not, delete the lease. Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
972{ 986{
973 struct inode *dir = dentry->d_parent->d_inode; 987 struct inode *dir = dentry->d_parent->d_inode;
974 988
975 dout("d_revalidate %p '%.*s' inode %p\n", dentry, 989 dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
976 dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 990 dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
991 ceph_dentry(dentry)->offset);
977 992
978 /* always trust cached snapped dentries, snapdir dentry */ 993 /* always trust cached snapped dentries, snapdir dentry */
979 if (ceph_snap(dir) != CEPH_NOSNAP) { 994 if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1050 struct ceph_inode_info *ci = ceph_inode(inode); 1065 struct ceph_inode_info *ci = ceph_inode(inode);
1051 int left; 1066 int left;
1052 1067
1053 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT)) 1068 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1054 return -EISDIR; 1069 return -EISDIR;
1055 1070
1056 if (!cf->dir_info) { 1071 if (!cf->dir_info) {
@@ -1092,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1092 * an fsync() on a dir will wait for any uncommitted directory 1107 * an fsync() on a dir will wait for any uncommitted directory
1093 * operations to commit. 1108 * operations to commit.
1094 */ 1109 */
1095static int ceph_dir_fsync(struct file *file, struct dentry *dentry, 1110static int ceph_dir_fsync(struct file *file, int datasync)
1096 int datasync)
1097{ 1111{
1098 struct inode *inode = dentry->d_inode; 1112 struct inode *inode = file->f_path.dentry->d_inode;
1099 struct ceph_inode_info *ci = ceph_inode(inode); 1113 struct ceph_inode_info *ci = ceph_inode(inode);
1100 struct list_head *head = &ci->i_unsafe_dirops; 1114 struct list_head *head = &ci->i_unsafe_dirops;
1101 struct ceph_mds_request *req; 1115 struct ceph_mds_request *req;
@@ -1152,7 +1166,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1152 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1166 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1153 dn->d_name.len, dn->d_name.name); 1167 dn->d_name.len, dn->d_name.name);
1154 if (di) { 1168 if (di) {
1155 mdsc = &ceph_client(dn->d_sb)->mdsc; 1169 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1156 spin_lock(&mdsc->dentry_lru_lock); 1170 spin_lock(&mdsc->dentry_lru_lock);
1157 list_add_tail(&di->lru, &mdsc->dentry_lru); 1171 list_add_tail(&di->lru, &mdsc->dentry_lru);
1158 mdsc->num_dentry++; 1172 mdsc->num_dentry++;
@@ -1165,10 +1179,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1165 struct ceph_dentry_info *di = ceph_dentry(dn); 1179 struct ceph_dentry_info *di = ceph_dentry(dn);
1166 struct ceph_mds_client *mdsc; 1180 struct ceph_mds_client *mdsc;
1167 1181
1168 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn, 1182 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1169 dn->d_name.len, dn->d_name.name); 1183 dn->d_name.len, dn->d_name.name, di->offset);
1170 if (di) { 1184 if (di) {
1171 mdsc = &ceph_client(dn->d_sb)->mdsc; 1185 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1172 spin_lock(&mdsc->dentry_lru_lock); 1186 spin_lock(&mdsc->dentry_lru_lock);
1173 list_move_tail(&di->lru, &mdsc->dentry_lru); 1187 list_move_tail(&di->lru, &mdsc->dentry_lru);
1174 spin_unlock(&mdsc->dentry_lru_lock); 1188 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1197,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1183 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1197 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1184 dn->d_name.len, dn->d_name.name); 1198 dn->d_name.len, dn->d_name.name);
1185 if (di) { 1199 if (di) {
1186 mdsc = &ceph_client(dn->d_sb)->mdsc; 1200 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
1187 spin_lock(&mdsc->dentry_lru_lock); 1201 spin_lock(&mdsc->dentry_lru_lock);
1188 list_del_init(&di->lru); 1202 list_del_init(&di->lru);
1189 mdsc->num_dentry--; 1203 mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
93 return ERR_PTR(-ESTALE); 93 return ERR_PTR(-ESTALE);
94 94
95 dentry = d_obtain_alias(inode); 95 dentry = d_obtain_alias(inode);
96 if (!dentry) { 96 if (IS_ERR(dentry)) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n", 97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode); 98 fh->ino, inode);
99 iput(inode); 99 iput(inode);
100 return ERR_PTR(-ENOMEM); 100 return dentry;
101 } 101 }
102 err = ceph_init_dentry(dentry); 102 err = ceph_init_dentry(dentry);
103 103
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
115static struct dentry *__cfh_to_dentry(struct super_block *sb, 115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh) 116 struct ceph_nfs_confh *cfh)
117{ 117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc; 118 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
119 struct inode *inode; 119 struct inode *inode;
120 struct dentry *dentry; 120 struct dentry *dentry;
121 struct ceph_vino vino; 121 struct ceph_vino vino;
@@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS); 134 USE_ANY_MDS);
135 if (IS_ERR(req)) 135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req)); 136 return ERR_CAST(req);
137 137
138 req->r_ino1 = vino; 138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino; 139 req->r_ino2.ino = cfh->parent_ino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
149 } 149 }
150 150
151 dentry = d_obtain_alias(inode); 151 dentry = d_obtain_alias(inode);
152 if (!dentry) { 152 if (IS_ERR(dentry)) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n", 153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode); 154 cfh->ino, inode);
155 iput(inode); 155 iput(inode);
156 return ERR_PTR(-ENOMEM); 156 return dentry;
157 } 157 }
158 err = ceph_init_dentry(dentry); 158 err = ceph_init_dentry(dentry);
159 if (err < 0) { 159 if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
202 return ERR_PTR(-ESTALE); 202 return ERR_PTR(-ESTALE);
203 203
204 dentry = d_obtain_alias(inode); 204 dentry = d_obtain_alias(inode);
205 if (!dentry) { 205 if (IS_ERR(dentry)) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n", 206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode); 207 cfh->ino, inode);
208 iput(inode); 208 iput(inode);
209 return ERR_PTR(-ENOMEM); 209 return dentry;
210 } 210 }
211 err = ceph_init_dentry(dentry); 211 err = ceph_init_dentry(dentry);
212 if (err < 0) { 212 if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ed6f19721d6e..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
230 /* do the open */ 230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req)) 232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2; 235 req->r_num_caps = 2;
236 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320static struct page **alloc_page_vector(int num_pages) 320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
324 324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); 325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages) 326 if (!pages)
327 return ERR_PTR(-ENOMEM); 327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) { 328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS); 329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) { 330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i); 331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM); 332 return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
540 * in sequence. 540 * in sequence.
541 */ 541 */
542 } else { 542 } else {
543 pages = alloc_page_vector(num_pages); 543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 544 }
545 if (IS_ERR(pages)) 545 if (IS_ERR(pages))
546 return PTR_ERR(pages); 546 return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
649 do_sync, 649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 651 &mtime, false, 2);
652 if (IS_ERR(req)) 652 if (!req)
653 return PTR_ERR(req); 653 return -ENOMEM;
654 654
655 num_pages = calc_pages_for(pos, len); 655 num_pages = calc_pages_for(pos, len);
656 656
@@ -668,7 +668,7 @@ more:
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = alloc_page_vector(num_pages); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
672 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
673 ret = PTR_ERR(pages); 673 ret = PTR_ERR(pages);
674 goto out; 674 goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
809 struct file *file = iocb->ki_filp; 809 struct file *file = iocb->ki_filp;
810 struct inode *inode = file->f_dentry->d_inode; 810 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 811 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 813 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 814 int got = 0;
815 int ret, err; 815 int ret, err;
@@ -844,8 +844,7 @@ retry_snap:
844 if ((ret >= 0 || ret == -EIOCBQUEUED) && 844 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) 845 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { 846 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
847 err = vfs_fsync_range(file, file->f_path.dentry, 847 err = vfs_fsync_range(file, pos, pos + ret - 1, 1);
848 pos, pos + ret - 1, 1);
849 if (err < 0) 848 if (err < 0)
850 ret = err; 849 ret = err;
851 } 850 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..ab47f46ca282 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 384 */
385 if (ci->i_snap_realm) { 385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc; 387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 389
390 dout(" dropping residual ref to snap realm %p\n", realm); 390 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
619 memcpy(ci->i_xattrs.blob->vec.iov_base, 619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len); 620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version); 621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 xattr_blob = NULL;
622 } 623 }
623 624
624 inode->i_mapping->a_ops = &ceph_aops; 625 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info = 626 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info; 627 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
627 628
628 switch (inode->i_mode & S_IFMT) { 629 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO: 630 case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
674 /* set dir completion flag? */ 675 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 && 676 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP && 677 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) { 678 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
679 (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
678 dout(" marking %p complete (empty)\n", inode); 680 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE; 681 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2; 682 ci->i_max_offset = 2;
681 } 683 }
682 684
683 /* it may be better to set st_size in getattr instead? */ 685 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES)) 686 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes; 687 inode->i_size = ci->i_rbytes;
686 break; 688 break;
687 default: 689 default:
@@ -802,6 +804,37 @@ out_unlock:
802} 804}
803 805
804/* 806/*
807 * Set dentry's directory position based on the current dir's max, and
808 * order it in d_subdirs, so that dcache_readdir behaves.
809 */
810static void ceph_set_dentry_offset(struct dentry *dn)
811{
812 struct dentry *dir = dn->d_parent;
813 struct inode *inode = dn->d_parent->d_inode;
814 struct ceph_dentry_info *di;
815
816 BUG_ON(!inode);
817
818 di = ceph_dentry(dn);
819
820 spin_lock(&inode->i_lock);
821 if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
822 spin_unlock(&inode->i_lock);
823 return;
824 }
825 di->offset = ceph_inode(inode)->i_max_offset++;
826 spin_unlock(&inode->i_lock);
827
828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock);
830 list_move(&dn->d_u.d_child, &dir->d_subdirs);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock);
834 spin_unlock(&dcache_lock);
835}
836
837/*
805 * splice a dentry to an inode. 838 * splice a dentry to an inode.
806 * caller must hold directory i_mutex for this to be safe. 839 * caller must hold directory i_mutex for this to be safe.
807 * 840 *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
814{ 847{
815 struct dentry *realdn; 848 struct dentry *realdn;
816 849
850 BUG_ON(dn->d_inode);
851
817 /* dn must be unhashed */ 852 /* dn must be unhashed */
818 if (!d_unhashed(dn)) 853 if (!d_unhashed(dn))
819 d_drop(dn); 854 d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
835 dn = realdn; 870 dn = realdn;
836 } else { 871 } else {
837 BUG_ON(!ceph_dentry(dn)); 872 BUG_ON(!ceph_dentry(dn));
838
839 dout("dn %p attached to %p ino %llx.%llx\n", 873 dout("dn %p attached to %p ino %llx.%llx\n",
840 dn, dn->d_inode, ceph_vinop(dn->d_inode)); 874 dn, dn->d_inode, ceph_vinop(dn->d_inode));
841 } 875 }
842 if ((!prehash || *prehash) && d_unhashed(dn)) 876 if ((!prehash || *prehash) && d_unhashed(dn))
843 d_rehash(dn); 877 d_rehash(dn);
878 ceph_set_dentry_offset(dn);
844out: 879out:
845 return dn; 880 return dn;
846} 881}
847 882
848/* 883/*
849 * Set dentry's directory position based on the current dir's max, and
850 * order it in d_subdirs, so that dcache_readdir behaves.
851 */
852static void ceph_set_dentry_offset(struct dentry *dn)
853{
854 struct dentry *dir = dn->d_parent;
855 struct inode *inode = dn->d_parent->d_inode;
856 struct ceph_dentry_info *di;
857
858 BUG_ON(!inode);
859
860 di = ceph_dentry(dn);
861
862 spin_lock(&inode->i_lock);
863 di->offset = ceph_inode(inode)->i_max_offset++;
864 spin_unlock(&inode->i_lock);
865
866 spin_lock(&dcache_lock);
867 spin_lock(&dn->d_lock);
868 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
869 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
870 dn->d_u.d_child.prev, dn->d_u.d_child.next);
871 spin_unlock(&dn->d_lock);
872 spin_unlock(&dcache_lock);
873}
874
875/*
876 * Incorporate results into the local cache. This is either just 884 * Incorporate results into the local cache. This is either just
877 * one inode, or a directory, dentry, and possibly linked-to inode (e.g., 885 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
878 * after a lookup). 886 * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
933 941
934 if (!rinfo->head->is_target && !rinfo->head->is_dentry) { 942 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
935 dout("fill_trace reply is empty!\n"); 943 dout("fill_trace reply is empty!\n");
936 if (rinfo->head->result == 0 && req->r_locked_dir) { 944 if (rinfo->head->result == 0 && req->r_locked_dir)
937 struct ceph_inode_info *ci = 945 ceph_invalidate_dir_request(req);
938 ceph_inode(req->r_locked_dir);
939 dout(" clearing %p complete (empty trace)\n",
940 req->r_locked_dir);
941 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
942 ci->i_release_count++;
943 }
944 return 0; 946 return 0;
945 } 947 }
946 948
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1011 req->r_old_dentry->d_name.len, 1013 req->r_old_dentry->d_name.len,
1012 req->r_old_dentry->d_name.name, 1014 req->r_old_dentry->d_name.name,
1013 dn, dn->d_name.len, dn->d_name.name); 1015 dn, dn->d_name.len, dn->d_name.name);
1016
1014 /* ensure target dentry is invalidated, despite 1017 /* ensure target dentry is invalidated, despite
1015 rehashing bug in vfs_rename_dir */ 1018 rehashing bug in vfs_rename_dir */
1016 dn->d_time = jiffies; 1019 ceph_invalidate_dentry_lease(dn);
1017 ceph_dentry(dn)->lease_shared_gen = 0; 1020
1018 /* take overwritten dentry's readdir offset */ 1021 /* take overwritten dentry's readdir offset */
1022 dout("dn %p gets %p offset %lld (old offset %lld)\n",
1023 req->r_old_dentry, dn, ceph_dentry(dn)->offset,
1024 ceph_dentry(req->r_old_dentry)->offset);
1019 ceph_dentry(req->r_old_dentry)->offset = 1025 ceph_dentry(req->r_old_dentry)->offset =
1020 ceph_dentry(dn)->offset; 1026 ceph_dentry(dn)->offset;
1027
1021 dn = req->r_old_dentry; /* use old_dentry */ 1028 dn = req->r_old_dentry; /* use old_dentry */
1022 in = dn->d_inode; 1029 in = dn->d_inode;
1023 } 1030 }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1059 goto done; 1066 goto done;
1060 } 1067 }
1061 req->r_dentry = dn; /* may have spliced */ 1068 req->r_dentry = dn; /* may have spliced */
1062 ceph_set_dentry_offset(dn);
1063 igrab(in); 1069 igrab(in);
1064 } else if (ceph_ino(in) == vino.ino && 1070 } else if (ceph_ino(in) == vino.ino &&
1065 ceph_snap(in) == vino.snap) { 1071 ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1102 err = PTR_ERR(dn); 1108 err = PTR_ERR(dn);
1103 goto done; 1109 goto done;
1104 } 1110 }
1105 ceph_set_dentry_offset(dn);
1106 req->r_dentry = dn; /* may have spliced */ 1111 req->r_dentry = dn; /* may have spliced */
1107 igrab(in); 1112 igrab(in);
1108 rinfo->head->is_dentry = 1; /* fool notrace handlers */ 1113 rinfo->head->is_dentry = 1; /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
1429{ 1434{
1430 struct ceph_inode_info *ci = ceph_inode(inode); 1435 struct ceph_inode_info *ci = ceph_inode(inode);
1431 1436
1432 if (queue_work(ceph_client(inode->i_sb)->trunc_wq, 1437 if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
1433 &ci->i_vmtruncate_work)) { 1438 &ci->i_vmtruncate_work)) {
1434 dout("ceph_queue_vmtruncate %p\n", inode); 1439 dout("ceph_queue_vmtruncate %p\n", inode);
1435 igrab(inode); 1440 igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1518 struct inode *parent_inode = dentry->d_parent->d_inode; 1523 struct inode *parent_inode = dentry->d_parent->d_inode;
1519 const unsigned int ia_valid = attr->ia_valid; 1524 const unsigned int ia_valid = attr->ia_valid;
1520 struct ceph_mds_request *req; 1525 struct ceph_mds_request *req;
1521 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc; 1526 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
1522 int issued; 1527 int issued;
1523 int release = 0, dirtied = 0; 1528 int release = 0, dirtied = 0;
1524 int mask = 0; 1529 int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc; 101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
102 u64 len = 1, olen; 102 u64 len = 1, olen;
103 u64 tmp; 103 u64 tmp;
104 struct ceph_object_layout ol; 104 struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..1766947fc07a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
40static void __wake_requests(struct ceph_mds_client *mdsc, 40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 41 struct list_head *head);
42 42
43const static struct ceph_connection_operations mds_con_ops; 43static const struct ceph_connection_operations mds_con_ops;
44 44
45 45
46/* 46/*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
665 struct ceph_msg *msg; 665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h; 666 struct ceph_mds_session_head *h;
667 667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL); 668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
669 if (IS_ERR(msg)) { 669 if (!msg) {
670 pr_err("create_session_msg ENOMEM creating msg\n"); 670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg)); 671 return NULL;
672 } 672 }
673 h = msg->front.iov_base; 673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op); 674 h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
687 struct ceph_msg *msg; 687 struct ceph_msg *msg;
688 int mstate; 688 int mstate;
689 int mds = session->s_mds; 689 int mds = session->s_mds;
690 int err = 0;
691 690
692 /* wait for mds to go active? */ 691 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds); 692 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
698 697
699 /* send connect message */ 698 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); 699 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) { 700 if (!msg)
702 err = PTR_ERR(msg); 701 return -ENOMEM;
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg); 702 ceph_con_send(&session->s_con, msg);
706
707out:
708 return 0; 703 return 0;
709} 704}
710 705
@@ -804,12 +799,49 @@ out:
804} 799}
805 800
806static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, 801static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
807 void *arg) 802 void *arg)
808{ 803{
809 struct ceph_inode_info *ci = ceph_inode(inode); 804 struct ceph_inode_info *ci = ceph_inode(inode);
805 int drop = 0;
806
810 dout("removing cap %p, ci is %p, inode is %p\n", 807 dout("removing cap %p, ci is %p, inode is %p\n",
811 cap, ci, &ci->vfs_inode); 808 cap, ci, &ci->vfs_inode);
812 ceph_remove_cap(cap); 809 spin_lock(&inode->i_lock);
810 __ceph_remove_cap(cap);
811 if (!__ceph_is_any_real_caps(ci)) {
812 struct ceph_mds_client *mdsc =
813 &ceph_sb_to_client(inode->i_sb)->mdsc;
814
815 spin_lock(&mdsc->cap_dirty_lock);
816 if (!list_empty(&ci->i_dirty_item)) {
817 pr_info(" dropping dirty %s state for %p %lld\n",
818 ceph_cap_string(ci->i_dirty_caps),
819 inode, ceph_ino(inode));
820 ci->i_dirty_caps = 0;
821 list_del_init(&ci->i_dirty_item);
822 drop = 1;
823 }
824 if (!list_empty(&ci->i_flushing_item)) {
825 pr_info(" dropping dirty+flushing %s state for %p %lld\n",
826 ceph_cap_string(ci->i_flushing_caps),
827 inode, ceph_ino(inode));
828 ci->i_flushing_caps = 0;
829 list_del_init(&ci->i_flushing_item);
830 mdsc->num_cap_flushing--;
831 drop = 1;
832 }
833 if (drop && ci->i_wrbuffer_ref) {
834 pr_info(" dropping dirty data for %p %lld\n",
835 inode, ceph_ino(inode));
836 ci->i_wrbuffer_ref = 0;
837 ci->i_wrbuffer_ref_head = 0;
838 drop++;
839 }
840 spin_unlock(&mdsc->cap_dirty_lock);
841 }
842 spin_unlock(&inode->i_lock);
843 while (drop--)
844 iput(inode);
813 return 0; 845 return 0;
814} 846}
815 847
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
821 dout("remove_session_caps on %p\n", session); 853 dout("remove_session_caps on %p\n", session);
822 iterate_session_caps(session, remove_session_caps_cb, NULL); 854 iterate_session_caps(session, remove_session_caps_cb, NULL);
823 BUG_ON(session->s_nr_caps > 0); 855 BUG_ON(session->s_nr_caps > 0);
856 BUG_ON(!list_empty(&session->s_cap_flushing));
824 cleanup_cap_releases(session); 857 cleanup_cap_releases(session);
825} 858}
826 859
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
883 ceph_mds_state_name(state)); 916 ceph_mds_state_name(state));
884 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, 917 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
885 ++session->s_renew_seq); 918 ++session->s_renew_seq);
886 if (IS_ERR(msg)) 919 if (!msg)
887 return PTR_ERR(msg); 920 return -ENOMEM;
888 ceph_con_send(&session->s_con, msg); 921 ceph_con_send(&session->s_con, msg);
889 return 0; 922 return 0;
890} 923}
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
931 struct ceph_mds_session *session) 964 struct ceph_mds_session *session)
932{ 965{
933 struct ceph_msg *msg; 966 struct ceph_msg *msg;
934 int err = 0;
935 967
936 dout("request_close_session mds%d state %s seq %lld\n", 968 dout("request_close_session mds%d state %s seq %lld\n",
937 session->s_mds, session_state_name(session->s_state), 969 session->s_mds, session_state_name(session->s_state),
938 session->s_seq); 970 session->s_seq);
939 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); 971 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
940 if (IS_ERR(msg)) 972 if (!msg)
941 err = PTR_ERR(msg); 973 return -ENOMEM;
942 else 974 ceph_con_send(&session->s_con, msg);
943 ceph_con_send(&session->s_con, msg); 975 return 0;
944 return err;
945} 976}
946 977
947/* 978/*
@@ -1035,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1035 * 1066 *
1036 * Called under s_mutex. 1067 * Called under s_mutex.
1037 */ 1068 */
1038static int add_cap_releases(struct ceph_mds_client *mdsc, 1069int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session, 1070 struct ceph_mds_session *session,
1040 int extra) 1071 int extra)
1041{ 1072{
1042 struct ceph_msg *msg; 1073 struct ceph_msg *msg;
1043 struct ceph_mds_cap_release *head; 1074 struct ceph_mds_cap_release *head;
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
1059 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1060 spin_unlock(&session->s_cap_lock); 1091 spin_unlock(&session->s_cap_lock);
1061 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1062 0, 0, NULL); 1093 GFP_NOFS);
1063 if (!msg) 1094 if (!msg)
1064 goto out_unlocked; 1095 goto out_unlocked;
1065 dout("add_cap_releases %p msg %p now %d\n", session, msg, 1096 dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1145,16 +1176,14 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1145/* 1176/*
1146 * called under s_mutex 1177 * called under s_mutex
1147 */ 1178 */
1148static void send_cap_releases(struct ceph_mds_client *mdsc, 1179void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1149 struct ceph_mds_session *session) 1180 struct ceph_mds_session *session)
1150{ 1181{
1151 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1152 1183
1153 dout("send_cap_releases mds%d\n", session->s_mds); 1184 dout("send_cap_releases mds%d\n", session->s_mds);
1154 while (1) { 1185 spin_lock(&session->s_cap_lock);
1155 spin_lock(&session->s_cap_lock); 1186 while (!list_empty(&session->s_cap_releases_done)) {
1156 if (list_empty(&session->s_cap_releases_done))
1157 break;
1158 msg = list_first_entry(&session->s_cap_releases_done, 1187 msg = list_first_entry(&session->s_cap_releases_done,
1159 struct ceph_msg, list_head); 1188 struct ceph_msg, list_head);
1160 list_del_init(&msg->list_head); 1189 list_del_init(&msg->list_head);
@@ -1162,7 +1191,46 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
1162 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1191 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1163 dout("send_cap_releases mds%d %p\n", session->s_mds, msg); 1192 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1164 ceph_con_send(&session->s_con, msg); 1193 ceph_con_send(&session->s_con, msg);
1194 spin_lock(&session->s_cap_lock);
1195 }
1196 spin_unlock(&session->s_cap_lock);
1197}
1198
1199static void discard_cap_releases(struct ceph_mds_client *mdsc,
1200 struct ceph_mds_session *session)
1201{
1202 struct ceph_msg *msg;
1203 struct ceph_mds_cap_release *head;
1204 unsigned num;
1205
1206 dout("discard_cap_releases mds%d\n", session->s_mds);
1207 spin_lock(&session->s_cap_lock);
1208
1209 /* zero out the in-progress message */
1210 msg = list_first_entry(&session->s_cap_releases,
1211 struct ceph_msg, list_head);
1212 head = msg->front.iov_base;
1213 num = le32_to_cpu(head->num);
1214 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, num);
1215 head->num = cpu_to_le32(0);
1216 session->s_num_cap_releases += num;
1217
1218 /* requeue completed messages */
1219 while (!list_empty(&session->s_cap_releases_done)) {
1220 msg = list_first_entry(&session->s_cap_releases_done,
1221 struct ceph_msg, list_head);
1222 list_del_init(&msg->list_head);
1223
1224 head = msg->front.iov_base;
1225 num = le32_to_cpu(head->num);
1226 dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
1227 num);
1228 session->s_num_cap_releases += num;
1229 head->num = cpu_to_le32(0);
1230 msg->front.iov_len = sizeof(*head);
1231 list_add(&msg->list_head, &session->s_cap_releases);
1165 } 1232 }
1233
1166 spin_unlock(&session->s_cap_lock); 1234 spin_unlock(&session->s_cap_lock);
1167} 1235}
1168 1236
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1181 if (!req) 1249 if (!req)
1182 return ERR_PTR(-ENOMEM); 1250 return ERR_PTR(-ENOMEM);
1183 1251
1252 mutex_init(&req->r_fill_mutex);
1184 req->r_started = jiffies; 1253 req->r_started = jiffies;
1185 req->r_resend_mds = -1; 1254 req->r_resend_mds = -1;
1186 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
1251 len += 1 + temp->d_name.len; 1320 len += 1 + temp->d_name.len;
1252 temp = temp->d_parent; 1321 temp = temp->d_parent;
1253 if (temp == NULL) { 1322 if (temp == NULL) {
1254 pr_err("build_path_dentry corrupt dentry %p\n", dentry); 1323 pr_err("build_path corrupt dentry %p\n", dentry);
1255 return ERR_PTR(-EINVAL); 1324 return ERR_PTR(-EINVAL);
1256 } 1325 }
1257 } 1326 }
@@ -1267,7 +1336,7 @@ retry:
1267 struct inode *inode = temp->d_inode; 1336 struct inode *inode = temp->d_inode;
1268 1337
1269 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { 1338 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1270 dout("build_path_dentry path+%d: %p SNAPDIR\n", 1339 dout("build_path path+%d: %p SNAPDIR\n",
1271 pos, temp); 1340 pos, temp);
1272 } else if (stop_on_nosnap && inode && 1341 } else if (stop_on_nosnap && inode &&
1273 ceph_snap(inode) == CEPH_NOSNAP) { 1342 ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
1278 break; 1347 break;
1279 strncpy(path + pos, temp->d_name.name, 1348 strncpy(path + pos, temp->d_name.name,
1280 temp->d_name.len); 1349 temp->d_name.len);
1281 dout("build_path_dentry path+%d: %p '%.*s'\n",
1282 pos, temp, temp->d_name.len, path + pos);
1283 } 1350 }
1284 if (pos) 1351 if (pos)
1285 path[--pos] = '/'; 1352 path[--pos] = '/';
1286 temp = temp->d_parent; 1353 temp = temp->d_parent;
1287 if (temp == NULL) { 1354 if (temp == NULL) {
1288 pr_err("build_path_dentry corrupt dentry\n"); 1355 pr_err("build_path corrupt dentry\n");
1289 kfree(path); 1356 kfree(path);
1290 return ERR_PTR(-EINVAL); 1357 return ERR_PTR(-EINVAL);
1291 } 1358 }
1292 } 1359 }
1293 if (pos != 0) { 1360 if (pos != 0) {
1294 pr_err("build_path_dentry did not end path lookup where " 1361 pr_err("build_path did not end path lookup where "
1295 "expected, namelen is %d, pos is %d\n", len, pos); 1362 "expected, namelen is %d, pos is %d\n", len, pos);
1296 /* presumably this is only possible if racing with a 1363 /* presumably this is only possible if racing with a
1297 rename of one of the parent directories (we can not 1364 rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
1303 1370
1304 *base = ceph_ino(temp->d_inode); 1371 *base = ceph_ino(temp->d_inode);
1305 *plen = len; 1372 *plen = len;
1306 dout("build_path_dentry on %p %d built %llx '%.*s'\n", 1373 dout("build_path on %p %d built %llx '%.*s'\n",
1307 dentry, atomic_read(&dentry->d_count), *base, len, path); 1374 dentry, atomic_read(&dentry->d_count), *base, len, path);
1308 return path; 1375 return path;
1309} 1376}
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1426 if (req->r_old_dentry_drop) 1493 if (req->r_old_dentry_drop)
1427 len += req->r_old_dentry->d_name.len; 1494 len += req->r_old_dentry->d_name.len;
1428 1495
1429 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL); 1496 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
1430 if (IS_ERR(msg)) 1497 if (!msg) {
1498 msg = ERR_PTR(-ENOMEM);
1431 goto out_free2; 1499 goto out_free2;
1500 }
1432 1501
1433 msg->hdr.tid = cpu_to_le64(req->r_tid); 1502 msg->hdr.tid = cpu_to_le64(req->r_tid);
1434 1503
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1517 } 1586 }
1518 msg = create_request_message(mdsc, req, mds); 1587 msg = create_request_message(mdsc, req, mds);
1519 if (IS_ERR(msg)) { 1588 if (IS_ERR(msg)) {
1520 req->r_reply = ERR_PTR(PTR_ERR(msg)); 1589 req->r_err = PTR_ERR(msg);
1521 complete_request(mdsc, req); 1590 complete_request(mdsc, req);
1522 return -PTR_ERR(msg); 1591 return PTR_ERR(msg);
1523 } 1592 }
1524 req->r_request = msg; 1593 req->r_request = msg;
1525 1594
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
1552 int mds = -1; 1621 int mds = -1;
1553 int err = -EAGAIN; 1622 int err = -EAGAIN;
1554 1623
1555 if (req->r_reply) 1624 if (req->r_err || req->r_got_result)
1556 goto out; 1625 goto out;
1557 1626
1558 if (req->r_timeout && 1627 if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
1609 return err; 1678 return err;
1610 1679
1611finish: 1680finish:
1612 req->r_reply = ERR_PTR(err); 1681 req->r_err = err;
1613 complete_request(mdsc, req); 1682 complete_request(mdsc, req);
1614 goto out; 1683 goto out;
1615} 1684}
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
1630 1699
1631/* 1700/*
1632 * Wake up threads with requests pending for @mds, so that they can 1701 * Wake up threads with requests pending for @mds, so that they can
1633 * resubmit their requests to a possibly different mds. If @all is set, 1702 * resubmit their requests to a possibly different mds.
1634 * wake up if their requests has been forwarded to @mds, too.
1635 */ 1703 */
1636static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all) 1704static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1637{ 1705{
1638 struct ceph_mds_request *req; 1706 struct ceph_mds_request *req;
1639 struct rb_node *p; 1707 struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1689 __register_request(mdsc, req, dir); 1757 __register_request(mdsc, req, dir);
1690 __do_request(mdsc, req); 1758 __do_request(mdsc, req);
1691 1759
1692 /* wait */ 1760 if (req->r_err) {
1693 if (!req->r_reply) { 1761 err = req->r_err;
1694 mutex_unlock(&mdsc->mutex); 1762 __unregister_request(mdsc, req);
1695 if (req->r_timeout) { 1763 dout("do_request early error %d\n", err);
1696 err = (long)wait_for_completion_interruptible_timeout( 1764 goto out;
1697 &req->r_completion, req->r_timeout);
1698 if (err == 0)
1699 req->r_reply = ERR_PTR(-EIO);
1700 else if (err < 0)
1701 req->r_reply = ERR_PTR(err);
1702 } else {
1703 err = wait_for_completion_interruptible(
1704 &req->r_completion);
1705 if (err)
1706 req->r_reply = ERR_PTR(err);
1707 }
1708 mutex_lock(&mdsc->mutex);
1709 } 1765 }
1710 1766
1711 if (IS_ERR(req->r_reply)) { 1767 /* wait */
1712 err = PTR_ERR(req->r_reply); 1768 mutex_unlock(&mdsc->mutex);
1713 req->r_reply = NULL; 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_killable_timeout(
1772 &req->r_completion, req->r_timeout);
1773 if (err == 0)
1774 err = -EIO;
1775 } else {
1776 err = wait_for_completion_killable(&req->r_completion);
1777 }
1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex);
1714 1780
1715 if (err == -ERESTARTSYS) { 1781 /* only abort if we didn't race with a real reply */
1716 /* aborted */ 1782 if (req->r_got_result) {
1717 req->r_aborted = true; 1783 err = le32_to_cpu(req->r_reply_info.head->result);
1784 } else if (err < 0) {
1785 dout("aborted request %lld with %d\n", req->r_tid, err);
1718 1786
1719 if (req->r_locked_dir && 1787 /*
1720 (req->r_op & CEPH_MDS_OP_WRITE)) { 1788 * ensure we aren't running concurrently with
1721 struct ceph_inode_info *ci = 1789 * ceph_fill_trace or ceph_readdir_prepopulate, which
1722 ceph_inode(req->r_locked_dir); 1790 * rely on locks (dir mutex) held by our caller.
1791 */
1792 mutex_lock(&req->r_fill_mutex);
1793 req->r_err = err;
1794 req->r_aborted = true;
1795 mutex_unlock(&req->r_fill_mutex);
1723 1796
1724 dout("aborted, clearing I_COMPLETE on %p\n", 1797 if (req->r_locked_dir &&
1725 req->r_locked_dir); 1798 (req->r_op & CEPH_MDS_OP_WRITE))
1726 spin_lock(&req->r_locked_dir->i_lock); 1799 ceph_invalidate_dir_request(req);
1727 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1728 ci->i_release_count++;
1729 spin_unlock(&req->r_locked_dir->i_lock);
1730 }
1731 } else {
1732 /* clean up this request */
1733 __unregister_request(mdsc, req);
1734 if (!list_empty(&req->r_unsafe_item))
1735 list_del_init(&req->r_unsafe_item);
1736 complete(&req->r_safe_completion);
1737 }
1738 } else if (req->r_err) {
1739 err = req->r_err;
1740 } else { 1800 } else {
1741 err = le32_to_cpu(req->r_reply_info.head->result); 1801 err = req->r_err;
1742 } 1802 }
1743 mutex_unlock(&mdsc->mutex);
1744 1803
1804out:
1805 mutex_unlock(&mdsc->mutex);
1745 dout("do_request %p done, result %d\n", req, err); 1806 dout("do_request %p done, result %d\n", req, err);
1746 return err; 1807 return err;
1747} 1808}
1748 1809
1749/* 1810/*
1811 * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
1812 * namespace request.
1813 */
1814void ceph_invalidate_dir_request(struct ceph_mds_request *req)
1815{
1816 struct inode *inode = req->r_locked_dir;
1817 struct ceph_inode_info *ci = ceph_inode(inode);
1818
1819 dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", inode);
1820 spin_lock(&inode->i_lock);
1821 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1822 ci->i_release_count++;
1823 spin_unlock(&inode->i_lock);
1824
1825 if (req->r_dentry)
1826 ceph_invalidate_dentry_lease(req->r_dentry);
1827 if (req->r_old_dentry)
1828 ceph_invalidate_dentry_lease(req->r_old_dentry);
1829}
1830
1831/*
1750 * Handle mds reply. 1832 * Handle mds reply.
1751 * 1833 *
1752 * We take the session mutex and parse and process the reply immediately. 1834 * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1797 mutex_unlock(&mdsc->mutex); 1879 mutex_unlock(&mdsc->mutex);
1798 goto out; 1880 goto out;
1799 } 1881 }
1882 if (req->r_got_safe && !head->safe) {
1883 pr_warning("got unsafe after safe on %llu from mds%d\n",
1884 tid, mds);
1885 mutex_unlock(&mdsc->mutex);
1886 goto out;
1887 }
1800 1888
1801 result = le32_to_cpu(head->result); 1889 result = le32_to_cpu(head->result);
1802 1890
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1838 mutex_unlock(&mdsc->mutex); 1926 mutex_unlock(&mdsc->mutex);
1839 goto out; 1927 goto out;
1840 } 1928 }
1841 } 1929 } else {
1842
1843 BUG_ON(req->r_reply);
1844
1845 if (!head->safe) {
1846 req->r_got_unsafe = true; 1930 req->r_got_unsafe = true;
1847 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe); 1931 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1848 } 1932 }
@@ -1871,23 +1955,32 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1871 } 1955 }
1872 1956
1873 /* insert trace into our cache */ 1957 /* insert trace into our cache */
1958 mutex_lock(&req->r_fill_mutex);
1874 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 1959 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1875 if (err == 0) { 1960 if (err == 0) {
1876 if (result == 0 && rinfo->dir_nr) 1961 if (result == 0 && rinfo->dir_nr)
1877 ceph_readdir_prepopulate(req, req->r_session); 1962 ceph_readdir_prepopulate(req, req->r_session);
1878 ceph_unreserve_caps(&req->r_caps_reservation); 1963 ceph_unreserve_caps(&req->r_caps_reservation);
1879 } 1964 }
1965 mutex_unlock(&req->r_fill_mutex);
1880 1966
1881 up_read(&mdsc->snap_rwsem); 1967 up_read(&mdsc->snap_rwsem);
1882out_err: 1968out_err:
1883 if (err) { 1969 mutex_lock(&mdsc->mutex);
1884 req->r_err = err; 1970 if (!req->r_aborted) {
1971 if (err) {
1972 req->r_err = err;
1973 } else {
1974 req->r_reply = msg;
1975 ceph_msg_get(msg);
1976 req->r_got_result = true;
1977 }
1885 } else { 1978 } else {
1886 req->r_reply = msg; 1979 dout("reply arrived after request %lld was aborted\n", tid);
1887 ceph_msg_get(msg);
1888 } 1980 }
1981 mutex_unlock(&mdsc->mutex);
1889 1982
1890 add_cap_releases(mdsc, req->r_session, -1); 1983 ceph_add_cap_releases(mdsc, req->r_session, -1);
1891 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
1892 1985
1893 /* kick calling process */ 1986 /* kick calling process */
@@ -1921,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
1921 mutex_lock(&mdsc->mutex); 2014 mutex_lock(&mdsc->mutex);
1922 req = __lookup_request(mdsc, tid); 2015 req = __lookup_request(mdsc, tid);
1923 if (!req) { 2016 if (!req) {
1924 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2017 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
1925 goto out; /* dup reply? */ 2018 goto out; /* dup reply? */
1926 } 2019 }
1927 2020
1928 if (fwd_seq <= req->r_num_fwd) { 2021 if (req->r_aborted) {
1929 dout("forward %llu to mds%d - old seq %d <= %d\n", 2022 dout("forward tid %llu aborted, unregistering\n", tid);
2023 __unregister_request(mdsc, req);
2024 } else if (fwd_seq <= req->r_num_fwd) {
2025 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
1930 tid, next_mds, req->r_num_fwd, fwd_seq); 2026 tid, next_mds, req->r_num_fwd, fwd_seq);
1931 } else { 2027 } else {
1932 /* resend. forward race not possible; mds would drop */ 2028 /* resend. forward race not possible; mds would drop */
1933 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2029 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2030 BUG_ON(req->r_err);
2031 BUG_ON(req->r_got_result);
1934 req->r_num_fwd = fwd_seq; 2032 req->r_num_fwd = fwd_seq;
1935 req->r_resend_mds = next_mds; 2033 req->r_resend_mds = next_mds;
1936 put_request_session(req); 2034 put_request_session(req);
@@ -1984,6 +2082,8 @@ static void handle_session(struct ceph_mds_session *session,
1984 2082
1985 switch (op) { 2083 switch (op) {
1986 case CEPH_SESSION_OPEN: 2084 case CEPH_SESSION_OPEN:
2085 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2086 pr_info("mds%d reconnect success\n", session->s_mds);
1987 session->s_state = CEPH_MDS_SESSION_OPEN; 2087 session->s_state = CEPH_MDS_SESSION_OPEN;
1988 renewed_caps(mdsc, session, 0); 2088 renewed_caps(mdsc, session, 0);
1989 wake = 1; 2089 wake = 1;
@@ -1997,10 +2097,12 @@ static void handle_session(struct ceph_mds_session *session,
1997 break; 2097 break;
1998 2098
1999 case CEPH_SESSION_CLOSE: 2099 case CEPH_SESSION_CLOSE:
2100 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
2101 pr_info("mds%d reconnect denied\n", session->s_mds);
2000 remove_session_caps(session); 2102 remove_session_caps(session);
2001 wake = 1; /* for good measure */ 2103 wake = 1; /* for good measure */
2002 complete(&mdsc->session_close_waiters); 2104 complete(&mdsc->session_close_waiters);
2003 kick_requests(mdsc, mds, 0); /* cur only */ 2105 kick_requests(mdsc, mds);
2004 break; 2106 break;
2005 2107
2006 case CEPH_SESSION_STALE: 2108 case CEPH_SESSION_STALE:
@@ -2132,54 +2234,44 @@ out:
2132 * 2234 *
2133 * called with mdsc->mutex held. 2235 * called with mdsc->mutex held.
2134 */ 2236 */
2135static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) 2237static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2238 struct ceph_mds_session *session)
2136{ 2239{
2137 struct ceph_mds_session *session = NULL;
2138 struct ceph_msg *reply; 2240 struct ceph_msg *reply;
2139 struct rb_node *p; 2241 struct rb_node *p;
2242 int mds = session->s_mds;
2140 int err = -ENOMEM; 2243 int err = -ENOMEM;
2141 struct ceph_pagelist *pagelist; 2244 struct ceph_pagelist *pagelist;
2142 2245
2143 pr_info("reconnect to recovering mds%d\n", mds); 2246 pr_info("mds%d reconnect start\n", mds);
2144 2247
2145 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 2248 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2146 if (!pagelist) 2249 if (!pagelist)
2147 goto fail_nopagelist; 2250 goto fail_nopagelist;
2148 ceph_pagelist_init(pagelist); 2251 ceph_pagelist_init(pagelist);
2149 2252
2150 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); 2253 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
2151 if (IS_ERR(reply)) { 2254 if (!reply)
2152 err = PTR_ERR(reply);
2153 goto fail_nomsg; 2255 goto fail_nomsg;
2154 }
2155
2156 /* find session */
2157 session = __ceph_lookup_mds_session(mdsc, mds);
2158 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2159
2160 if (session) {
2161 mutex_lock(&session->s_mutex);
2162 2256
2163 session->s_state = CEPH_MDS_SESSION_RECONNECTING; 2257 mutex_lock(&session->s_mutex);
2164 session->s_seq = 0; 2258 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2259 session->s_seq = 0;
2165 2260
2166 ceph_con_open(&session->s_con, 2261 ceph_con_open(&session->s_con,
2167 ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); 2262 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2168 2263
2169 /* replay unsafe requests */ 2264 /* replay unsafe requests */
2170 replay_unsafe_requests(mdsc, session); 2265 replay_unsafe_requests(mdsc, session);
2171 } else {
2172 dout("no session for mds%d, will send short reconnect\n",
2173 mds);
2174 }
2175 2266
2176 down_read(&mdsc->snap_rwsem); 2267 down_read(&mdsc->snap_rwsem);
2177 2268
2178 if (!session)
2179 goto send;
2180 dout("session %p state %s\n", session, 2269 dout("session %p state %s\n", session,
2181 session_state_name(session->s_state)); 2270 session_state_name(session->s_state));
2182 2271
2272 /* drop old cap expires; we're about to reestablish that state */
2273 discard_cap_releases(mdsc, session);
2274
2183 /* traverse this session's caps */ 2275 /* traverse this session's caps */
2184 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2276 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2185 if (err) 2277 if (err)
@@ -2208,36 +2300,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2208 goto fail; 2300 goto fail;
2209 } 2301 }
2210 2302
2211send:
2212 reply->pagelist = pagelist; 2303 reply->pagelist = pagelist;
2213 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2304 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2214 reply->nr_pages = calc_pages_for(0, pagelist->length); 2305 reply->nr_pages = calc_pages_for(0, pagelist->length);
2215 ceph_con_send(&session->s_con, reply); 2306 ceph_con_send(&session->s_con, reply);
2216 2307
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 mutex_unlock(&session->s_mutex); 2308 mutex_unlock(&session->s_mutex);
2219 2309
2220 mutex_lock(&mdsc->mutex); 2310 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting); 2311 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex); 2312 mutex_unlock(&mdsc->mutex);
2223 2313
2224 ceph_put_mds_session(session);
2225
2226 up_read(&mdsc->snap_rwsem); 2314 up_read(&mdsc->snap_rwsem);
2227 mutex_lock(&mdsc->mutex);
2228 return; 2315 return;
2229 2316
2230fail: 2317fail:
2231 ceph_msg_put(reply); 2318 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem); 2319 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex); 2320 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2235fail_nomsg: 2321fail_nomsg:
2236 ceph_pagelist_release(pagelist); 2322 ceph_pagelist_release(pagelist);
2237 kfree(pagelist); 2323 kfree(pagelist);
2238fail_nopagelist: 2324fail_nopagelist:
2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds); 2325 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2240 mutex_lock(&mdsc->mutex);
2241 return; 2326 return;
2242} 2327}
2243 2328
@@ -2290,7 +2375,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2290 } 2375 }
2291 2376
2292 /* kick any requests waiting on the recovering mds */ 2377 /* kick any requests waiting on the recovering mds */
2293 kick_requests(mdsc, i, 1); 2378 kick_requests(mdsc, i);
2294 } else if (oldstate == newstate) { 2379 } else if (oldstate == newstate) {
2295 continue; /* nothing new with this mds */ 2380 continue; /* nothing new with this mds */
2296 } 2381 }
@@ -2299,22 +2384,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2299 * send reconnect? 2384 * send reconnect?
2300 */ 2385 */
2301 if (s->s_state == CEPH_MDS_SESSION_RESTARTING && 2386 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2302 newstate >= CEPH_MDS_STATE_RECONNECT) 2387 newstate >= CEPH_MDS_STATE_RECONNECT) {
2303 send_mds_reconnect(mdsc, i); 2388 mutex_unlock(&mdsc->mutex);
2389 send_mds_reconnect(mdsc, s);
2390 mutex_lock(&mdsc->mutex);
2391 }
2304 2392
2305 /* 2393 /*
2306 * kick requests on any mds that has gone active. 2394 * kick request on any mds that has gone active.
2307 *
2308 * kick requests on cur or forwarder: we may have sent
2309 * the request to mds1, mds1 told us it forwarded it
2310 * to mds2, but then we learn mds1 failed and can't be
2311 * sure it successfully forwarded our request before
2312 * it died.
2313 */ 2395 */
2314 if (oldstate < CEPH_MDS_STATE_ACTIVE && 2396 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2315 newstate >= CEPH_MDS_STATE_ACTIVE) { 2397 newstate >= CEPH_MDS_STATE_ACTIVE) {
2316 pr_info("mds%d reconnect completed\n", s->s_mds); 2398 if (oldstate != CEPH_MDS_STATE_CREATING &&
2317 kick_requests(mdsc, i, 1); 2399 oldstate != CEPH_MDS_STATE_STARTING)
2400 pr_info("mds%d recovery completed\n", s->s_mds);
2401 kick_requests(mdsc, i);
2318 ceph_kick_flushing_caps(mdsc, s); 2402 ceph_kick_flushing_caps(mdsc, s);
2319 wake_up_session_caps(s, 1); 2403 wake_up_session_caps(s, 1);
2320 } 2404 }
@@ -2349,6 +2433,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2349 struct ceph_dentry_info *di; 2433 struct ceph_dentry_info *di;
2350 int mds = session->s_mds; 2434 int mds = session->s_mds;
2351 struct ceph_mds_lease *h = msg->front.iov_base; 2435 struct ceph_mds_lease *h = msg->front.iov_base;
2436 u32 seq;
2352 struct ceph_vino vino; 2437 struct ceph_vino vino;
2353 int mask; 2438 int mask;
2354 struct qstr dname; 2439 struct qstr dname;
@@ -2362,6 +2447,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2362 vino.ino = le64_to_cpu(h->ino); 2447 vino.ino = le64_to_cpu(h->ino);
2363 vino.snap = CEPH_NOSNAP; 2448 vino.snap = CEPH_NOSNAP;
2364 mask = le16_to_cpu(h->mask); 2449 mask = le16_to_cpu(h->mask);
2450 seq = le32_to_cpu(h->seq);
2365 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2451 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2366 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2452 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2367 if (dname.len != get_unaligned_le32(h+1)) 2453 if (dname.len != get_unaligned_le32(h+1))
@@ -2372,8 +2458,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2372 2458
2373 /* lookup inode */ 2459 /* lookup inode */
2374 inode = ceph_find_inode(sb, vino); 2460 inode = ceph_find_inode(sb, vino);
2375 dout("handle_lease '%s', mask %d, ino %llx %p\n", 2461 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
2376 ceph_lease_op_name(h->action), mask, vino.ino, inode); 2462 ceph_lease_op_name(h->action), mask, vino.ino, inode,
2463 dname.len, dname.name);
2377 if (inode == NULL) { 2464 if (inode == NULL) {
2378 dout("handle_lease no inode %llx\n", vino.ino); 2465 dout("handle_lease no inode %llx\n", vino.ino);
2379 goto release; 2466 goto release;
@@ -2398,7 +2485,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2398 switch (h->action) { 2485 switch (h->action) {
2399 case CEPH_MDS_LEASE_REVOKE: 2486 case CEPH_MDS_LEASE_REVOKE:
2400 if (di && di->lease_session == session) { 2487 if (di && di->lease_session == session) {
2401 h->seq = cpu_to_le32(di->lease_seq); 2488 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2489 h->seq = cpu_to_le32(di->lease_seq);
2402 __ceph_mdsc_drop_dentry_lease(dentry); 2490 __ceph_mdsc_drop_dentry_lease(dentry);
2403 } 2491 }
2404 release = 1; 2492 release = 1;
@@ -2412,7 +2500,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2412 unsigned long duration = 2500 unsigned long duration =
2413 le32_to_cpu(h->duration_ms) * HZ / 1000; 2501 le32_to_cpu(h->duration_ms) * HZ / 1000;
2414 2502
2415 di->lease_seq = le32_to_cpu(h->seq); 2503 di->lease_seq = seq;
2416 dentry->d_time = di->lease_renew_from + duration; 2504 dentry->d_time = di->lease_renew_from + duration;
2417 di->lease_renew_after = di->lease_renew_from + 2505 di->lease_renew_after = di->lease_renew_from +
2418 (duration >> 1); 2506 (duration >> 1);
@@ -2457,12 +2545,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2457 dnamelen = dentry->d_name.len; 2545 dnamelen = dentry->d_name.len;
2458 len += dnamelen; 2546 len += dnamelen;
2459 2547
2460 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL); 2548 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
2461 if (IS_ERR(msg)) 2549 if (!msg)
2462 return; 2550 return;
2463 lease = msg->front.iov_base; 2551 lease = msg->front.iov_base;
2464 lease->action = action; 2552 lease->action = action;
2465 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2553 lease->mask = cpu_to_le16(1);
2466 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2554 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2467 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2555 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2468 lease->seq = cpu_to_le32(seq); 2556 lease->seq = cpu_to_le32(seq);
@@ -2492,7 +2580,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2492 2580
2493 BUG_ON(inode == NULL); 2581 BUG_ON(inode == NULL);
2494 BUG_ON(dentry == NULL); 2582 BUG_ON(dentry == NULL);
2495 BUG_ON(mask != CEPH_LOCK_DN); 2583 BUG_ON(mask == 0);
2496 2584
2497 /* is dentry lease valid? */ 2585 /* is dentry lease valid? */
2498 spin_lock(&dentry->d_lock); 2586 spin_lock(&dentry->d_lock);
@@ -2602,8 +2690,10 @@ static void delayed_work(struct work_struct *work)
2602 send_renew_caps(mdsc, s); 2690 send_renew_caps(mdsc, s);
2603 else 2691 else
2604 ceph_con_keepalive(&s->s_con); 2692 ceph_con_keepalive(&s->s_con);
2605 add_cap_releases(mdsc, s, -1); 2693 ceph_add_cap_releases(mdsc, s, -1);
2606 send_cap_releases(mdsc, s); 2694 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2695 s->s_state == CEPH_MDS_SESSION_HUNG)
2696 ceph_send_cap_releases(mdsc, s);
2607 mutex_unlock(&s->s_mutex); 2697 mutex_unlock(&s->s_mutex);
2608 ceph_put_mds_session(s); 2698 ceph_put_mds_session(s);
2609 2699
@@ -2620,6 +2710,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2620 mdsc->client = client; 2710 mdsc->client = client;
2621 mutex_init(&mdsc->mutex); 2711 mutex_init(&mdsc->mutex);
2622 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2712 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2713 if (mdsc->mdsmap == NULL)
2714 return -ENOMEM;
2715
2623 init_completion(&mdsc->safe_umount_waiters); 2716 init_completion(&mdsc->safe_umount_waiters);
2624 init_completion(&mdsc->session_close_waiters); 2717 init_completion(&mdsc->session_close_waiters);
2625 INIT_LIST_HEAD(&mdsc->waiting_for_map); 2718 INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2738,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2645 init_waitqueue_head(&mdsc->cap_flushing_wq); 2738 init_waitqueue_head(&mdsc->cap_flushing_wq);
2646 spin_lock_init(&mdsc->dentry_lru_lock); 2739 spin_lock_init(&mdsc->dentry_lru_lock);
2647 INIT_LIST_HEAD(&mdsc->dentry_lru); 2740 INIT_LIST_HEAD(&mdsc->dentry_lru);
2741
2648 return 0; 2742 return 0;
2649} 2743}
2650 2744
@@ -2740,6 +2834,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2740{ 2834{
2741 u64 want_tid, want_flush; 2835 u64 want_tid, want_flush;
2742 2836
2837 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
2838 return;
2839
2743 dout("sync\n"); 2840 dout("sync\n");
2744 mutex_lock(&mdsc->mutex); 2841 mutex_lock(&mdsc->mutex);
2745 want_tid = mdsc->last_tid; 2842 want_tid = mdsc->last_tid;
@@ -2922,9 +3019,10 @@ static void con_put(struct ceph_connection *con)
2922static void peer_reset(struct ceph_connection *con) 3019static void peer_reset(struct ceph_connection *con)
2923{ 3020{
2924 struct ceph_mds_session *s = con->private; 3021 struct ceph_mds_session *s = con->private;
3022 struct ceph_mds_client *mdsc = s->s_mdsc;
2925 3023
2926 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n", 3024 pr_warning("mds%d closed our session\n", s->s_mds);
2927 s->s_mds); 3025 send_mds_reconnect(mdsc, s);
2928} 3026}
2929 3027
2930static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 3028static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3129,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
3031 return ceph_monc_validate_auth(&mdsc->client->monc); 3129 return ceph_monc_validate_auth(&mdsc->client->monc);
3032} 3130}
3033 3131
3034const static struct ceph_connection_operations mds_con_ops = { 3132static const struct ceph_connection_operations mds_con_ops = {
3035 .get = con_get, 3133 .get = con_get,
3036 .put = con_put, 3134 .put = con_put,
3037 .dispatch = dispatch, 3135 .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..b292fa42a66d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */ 165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */ 166 struct inode *r_target_inode; /* resulting inode */
167 167
168 struct mutex r_fill_mutex;
169
168 union ceph_mds_request_args r_args; 170 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */ 171 int r_fmode; /* file mode, if expecting cap */
170 172
@@ -213,7 +215,7 @@ struct ceph_mds_request {
213 struct completion r_safe_completion; 215 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback; 216 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */ 217 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe; 218 bool r_got_unsafe, r_got_safe, r_got_result;
217 219
218 bool r_did_prepopulate; 220 bool r_did_prepopulate;
219 u32 r_readdir_offset; 221 u32 r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode, 303 struct inode *inode,
302 struct dentry *dn, int mask); 304 struct dentry *dn, int mask);
303 305
306extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
307
304extern struct ceph_mds_request * 308extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); 309ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, 310extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
@@ -318,6 +322,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
318 kref_put(&req->r_kref, ceph_mdsc_release_request); 322 kref_put(&req->r_kref, ceph_mdsc_release_request);
319} 323}
320 324
325extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
326 struct ceph_mds_session *session,
327 int extra);
328extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
329 struct ceph_mds_session *session);
330
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 331extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322 332
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 333extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con); 40static void ceph_fault(struct ceph_connection *con);
41 41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/* 42/*
55 * nicely render a sockaddr as a string. 43 * nicely render a sockaddr as a string.
56 */ 44 */
@@ -132,6 +120,12 @@ void ceph_msgr_exit(void)
132 destroy_workqueue(ceph_msgr_wq); 120 destroy_workqueue(ceph_msgr_wq);
133} 121}
134 122
123void ceph_msgr_flush()
124{
125 flush_workqueue(ceph_msgr_wq);
126}
127
128
135/* 129/*
136 * socket callback functions 130 * socket callback functions
137 */ 131 */
@@ -340,6 +334,7 @@ static void reset_connection(struct ceph_connection *con)
340 ceph_msg_put(con->out_msg); 334 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL; 335 con->out_msg = NULL;
342 } 336 }
337 con->out_keepalive_pending = false;
343 con->in_seq = 0; 338 con->in_seq = 0;
344 con->in_seq_acked = 0; 339 con->in_seq_acked = 0;
345} 340}
@@ -357,6 +352,7 @@ void ceph_con_close(struct ceph_connection *con)
357 clear_bit(WRITE_PENDING, &con->state); 352 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex); 353 mutex_lock(&con->mutex);
359 reset_connection(con); 354 reset_connection(con);
355 con->peer_global_seq = 0;
360 cancel_delayed_work(&con->work); 356 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex); 357 mutex_unlock(&con->mutex);
362 queue_con(con); 358 queue_con(con);
@@ -661,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
661 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 657 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
662 con->connect_seq, global_seq, proto); 658 con->connect_seq, global_seq, proto);
663 659
664 con->out_connect.features = CEPH_FEATURE_SUPPORTED; 660 con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
665 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 661 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
666 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 662 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
667 con->out_connect.global_seq = cpu_to_le32(global_seq); 663 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1120,8 @@ static void fail_protocol(struct ceph_connection *con)
1124 1120
1125static int process_connect(struct ceph_connection *con) 1121static int process_connect(struct ceph_connection *con)
1126{ 1122{
1127 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1123 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
1128 u64 req_feat = CEPH_FEATURE_REQUIRED; 1124 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
1129 u64 server_feat = le64_to_cpu(con->in_reply.features); 1125 u64 server_feat = le64_to_cpu(con->in_reply.features);
1130 1126
1131 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1127 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1229,7 @@ static int process_connect(struct ceph_connection *con)
1233 clear_bit(CONNECTING, &con->state); 1229 clear_bit(CONNECTING, &con->state);
1234 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1230 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1235 con->connect_seq++; 1231 con->connect_seq++;
1232 con->peer_features = server_feat;
1236 dout("process_connect got READY gseq %d cseq %d (%d)\n", 1233 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1237 con->peer_global_seq, 1234 con->peer_global_seq,
1238 le32_to_cpu(con->in_reply.connect_seq), 1235 le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1399,17 @@ static int read_partial_message(struct ceph_connection *con)
1402 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1399 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1403 if (skip) { 1400 if (skip) {
1404 /* skip this message */ 1401 /* skip this message */
1405 dout("alloc_msg returned NULL, skipping message\n"); 1402 dout("alloc_msg said skip message\n");
1406 con->in_base_pos = -front_len - middle_len - data_len - 1403 con->in_base_pos = -front_len - middle_len - data_len -
1407 sizeof(m->footer); 1404 sizeof(m->footer);
1408 con->in_tag = CEPH_MSGR_TAG_READY; 1405 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++; 1406 con->in_seq++;
1410 return 0; 1407 return 0;
1411 } 1408 }
1412 if (IS_ERR(con->in_msg)) { 1409 if (!con->in_msg) {
1413 ret = PTR_ERR(con->in_msg);
1414 con->in_msg = NULL;
1415 con->error_msg = 1410 con->error_msg =
1416 "error allocating memory for incoming message"; 1411 "error allocating memory for incoming message";
1417 return ret; 1412 return -ENOMEM;
1418 } 1413 }
1419 m = con->in_msg; 1414 m = con->in_msg;
1420 m->front.iov_len = 0; /* haven't read it yet */ 1415 m->front.iov_len = 0; /* haven't read it yet */
@@ -1514,14 +1509,14 @@ static void process_message(struct ceph_connection *con)
1514 1509
1515 /* if first message, set peer_name */ 1510 /* if first message, set peer_name */
1516 if (con->peer_name.type == 0) 1511 if (con->peer_name.type == 0)
1517 con->peer_name = msg->hdr.src.name; 1512 con->peer_name = msg->hdr.src;
1518 1513
1519 con->in_seq++; 1514 con->in_seq++;
1520 mutex_unlock(&con->mutex); 1515 mutex_unlock(&con->mutex);
1521 1516
1522 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", 1517 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1523 msg, le64_to_cpu(msg->hdr.seq), 1518 msg, le64_to_cpu(msg->hdr.seq),
1524 ENTITY_NAME(msg->hdr.src.name), 1519 ENTITY_NAME(msg->hdr.src),
1525 le16_to_cpu(msg->hdr.type), 1520 le16_to_cpu(msg->hdr.type),
1526 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), 1521 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1527 le32_to_cpu(msg->hdr.front_len), 1522 le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1541,6 @@ static int try_write(struct ceph_connection *con)
1546 dout("try_write start %p state %lu nref %d\n", con, con->state, 1541 dout("try_write start %p state %lu nref %d\n", con, con->state,
1547 atomic_read(&con->nref)); 1542 atomic_read(&con->nref));
1548 1543
1549 mutex_lock(&con->mutex);
1550more: 1544more:
1551 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 1545 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1552 1546
@@ -1639,7 +1633,6 @@ do_next:
1639done: 1633done:
1640 ret = 0; 1634 ret = 0;
1641out: 1635out:
1642 mutex_unlock(&con->mutex);
1643 dout("try_write done on %p\n", con); 1636 dout("try_write done on %p\n", con);
1644 return ret; 1637 return ret;
1645} 1638}
@@ -1651,7 +1644,6 @@ out:
1651 */ 1644 */
1652static int try_read(struct ceph_connection *con) 1645static int try_read(struct ceph_connection *con)
1653{ 1646{
1654 struct ceph_messenger *msgr;
1655 int ret = -1; 1647 int ret = -1;
1656 1648
1657 if (!con->sock) 1649 if (!con->sock)
@@ -1661,9 +1653,6 @@ static int try_read(struct ceph_connection *con)
1661 return 0; 1653 return 0;
1662 1654
1663 dout("try_read start on %p\n", con); 1655 dout("try_read start on %p\n", con);
1664 msgr = con->msgr;
1665
1666 mutex_lock(&con->mutex);
1667 1656
1668more: 1657more:
1669 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 1658 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1747,6 @@ more:
1758done: 1747done:
1759 ret = 0; 1748 ret = 0;
1760out: 1749out:
1761 mutex_unlock(&con->mutex);
1762 dout("try_read done on %p\n", con); 1750 dout("try_read done on %p\n", con);
1763 return ret; 1751 return ret;
1764 1752
@@ -1830,6 +1818,8 @@ more:
1830 dout("con_work %p start, clearing QUEUED\n", con); 1818 dout("con_work %p start, clearing QUEUED\n", con);
1831 clear_bit(QUEUED, &con->state); 1819 clear_bit(QUEUED, &con->state);
1832 1820
1821 mutex_lock(&con->mutex);
1822
1833 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 1823 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1834 dout("con_work CLOSED\n"); 1824 dout("con_work CLOSED\n");
1835 con_close_socket(con); 1825 con_close_socket(con);
@@ -1844,11 +1834,16 @@ more:
1844 if (test_and_clear_bit(SOCK_CLOSED, &con->state) || 1834 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1845 try_read(con) < 0 || 1835 try_read(con) < 0 ||
1846 try_write(con) < 0) { 1836 try_write(con) < 0) {
1837 mutex_unlock(&con->mutex);
1847 backoff = 1; 1838 backoff = 1;
1848 ceph_fault(con); /* error/fault path */ 1839 ceph_fault(con); /* error/fault path */
1840 goto done_unlocked;
1849 } 1841 }
1850 1842
1851done: 1843done:
1844 mutex_unlock(&con->mutex);
1845
1846done_unlocked:
1852 clear_bit(BUSY, &con->state); 1847 clear_bit(BUSY, &con->state);
1853 dout("con->state=%lu\n", con->state); 1848 dout("con->state=%lu\n", con->state);
1854 if (test_bit(QUEUED, &con->state)) { 1849 if (test_bit(QUEUED, &con->state)) {
@@ -1947,7 +1942,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1947 1942
1948 /* the zero page is needed if a request is "canceled" while the message 1943 /* the zero page is needed if a request is "canceled" while the message
1949 * is being written over the socket */ 1944 * is being written over the socket */
1950 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 1945 msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
1951 if (!msgr->zero_page) { 1946 if (!msgr->zero_page) {
1952 kfree(msgr); 1947 kfree(msgr);
1953 return ERR_PTR(-ENOMEM); 1948 return ERR_PTR(-ENOMEM);
@@ -1987,9 +1982,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1987 } 1982 }
1988 1983
1989 /* set src+dst */ 1984 /* set src+dst */
1990 msg->hdr.src.name = con->msgr->inst.name; 1985 msg->hdr.src = con->msgr->inst.name;
1991 msg->hdr.src.addr = con->msgr->my_enc_addr;
1992 msg->hdr.orig_src = msg->hdr.src;
1993 1986
1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1987 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1995 1988
@@ -2083,12 +2076,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
2083 * construct a new message with given type, size 2076 * construct a new message with given type, size
2084 * the new msg has a ref count of 1. 2077 * the new msg has a ref count of 1.
2085 */ 2078 */
2086struct ceph_msg *ceph_msg_new(int type, int front_len, 2079struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2087 int page_len, int page_off, struct page **pages)
2088{ 2080{
2089 struct ceph_msg *m; 2081 struct ceph_msg *m;
2090 2082
2091 m = kmalloc(sizeof(*m), GFP_NOFS); 2083 m = kmalloc(sizeof(*m), flags);
2092 if (m == NULL) 2084 if (m == NULL)
2093 goto out; 2085 goto out;
2094 kref_init(&m->kref); 2086 kref_init(&m->kref);
@@ -2100,8 +2092,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2100 m->hdr.version = 0; 2092 m->hdr.version = 0;
2101 m->hdr.front_len = cpu_to_le32(front_len); 2093 m->hdr.front_len = cpu_to_le32(front_len);
2102 m->hdr.middle_len = 0; 2094 m->hdr.middle_len = 0;
2103 m->hdr.data_len = cpu_to_le32(page_len); 2095 m->hdr.data_len = 0;
2104 m->hdr.data_off = cpu_to_le16(page_off); 2096 m->hdr.data_off = 0;
2105 m->hdr.reserved = 0; 2097 m->hdr.reserved = 0;
2106 m->footer.front_crc = 0; 2098 m->footer.front_crc = 0;
2107 m->footer.middle_crc = 0; 2099 m->footer.middle_crc = 0;
@@ -2115,11 +2107,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2115 /* front */ 2107 /* front */
2116 if (front_len) { 2108 if (front_len) {
2117 if (front_len > PAGE_CACHE_SIZE) { 2109 if (front_len > PAGE_CACHE_SIZE) {
2118 m->front.iov_base = __vmalloc(front_len, GFP_NOFS, 2110 m->front.iov_base = __vmalloc(front_len, flags,
2119 PAGE_KERNEL); 2111 PAGE_KERNEL);
2120 m->front_is_vmalloc = true; 2112 m->front_is_vmalloc = true;
2121 } else { 2113 } else {
2122 m->front.iov_base = kmalloc(front_len, GFP_NOFS); 2114 m->front.iov_base = kmalloc(front_len, flags);
2123 } 2115 }
2124 if (m->front.iov_base == NULL) { 2116 if (m->front.iov_base == NULL) {
2125 pr_err("msg_new can't allocate %d bytes\n", 2117 pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2127,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2135 m->middle = NULL; 2127 m->middle = NULL;
2136 2128
2137 /* data */ 2129 /* data */
2138 m->nr_pages = calc_pages_for(page_off, page_len); 2130 m->nr_pages = 0;
2139 m->pages = pages; 2131 m->pages = NULL;
2140 m->pagelist = NULL; 2132 m->pagelist = NULL;
2141 2133
2142 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len, 2134 dout("ceph_msg_new %p front %d\n", m, front_len);
2143 m->nr_pages);
2144 return m; 2135 return m;
2145 2136
2146out2: 2137out2:
2147 ceph_msg_put(m); 2138 ceph_msg_put(m);
2148out: 2139out:
2149 pr_err("msg_new can't create type %d len %d\n", type, front_len); 2140 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2150 return ERR_PTR(-ENOMEM); 2141 return NULL;
2151} 2142}
2152 2143
2153/* 2144/*
@@ -2190,29 +2181,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2190 mutex_unlock(&con->mutex); 2181 mutex_unlock(&con->mutex);
2191 msg = con->ops->alloc_msg(con, hdr, skip); 2182 msg = con->ops->alloc_msg(con, hdr, skip);
2192 mutex_lock(&con->mutex); 2183 mutex_lock(&con->mutex);
2193 if (IS_ERR(msg)) 2184 if (!msg || *skip)
2194 return msg;
2195
2196 if (*skip)
2197 return NULL; 2185 return NULL;
2198 } 2186 }
2199 if (!msg) { 2187 if (!msg) {
2200 *skip = 0; 2188 *skip = 0;
2201 msg = ceph_msg_new(type, front_len, 0, 0, NULL); 2189 msg = ceph_msg_new(type, front_len, GFP_NOFS);
2202 if (!msg) { 2190 if (!msg) {
2203 pr_err("unable to allocate msg type %d len %d\n", 2191 pr_err("unable to allocate msg type %d len %d\n",
2204 type, front_len); 2192 type, front_len);
2205 return ERR_PTR(-ENOMEM); 2193 return NULL;
2206 } 2194 }
2207 } 2195 }
2208 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2196 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2209 2197
2210 if (middle_len) { 2198 if (middle_len && !msg->middle) {
2211 ret = ceph_alloc_middle(con, msg); 2199 ret = ceph_alloc_middle(con, msg);
2212
2213 if (ret < 0) { 2200 if (ret < 0) {
2214 ceph_msg_put(msg); 2201 ceph_msg_put(msg);
2215 return msg; 2202 return NULL;
2216 } 2203 }
2217 } 2204 }
2218 2205
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
49 int *skip); 49 int *skip);
50}; 50};
51 51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */ 52/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num) 53#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
56 54
57struct ceph_messenger { 55struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */ 56 struct ceph_entity_inst inst; /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
144 struct ceph_entity_addr peer_addr; /* peer address */ 142 struct ceph_entity_addr peer_addr; /* peer address */
145 struct ceph_entity_name peer_name; /* peer name */ 143 struct ceph_entity_name peer_name; /* peer name */
146 struct ceph_entity_addr peer_addr_for_me; 144 struct ceph_entity_addr peer_addr_for_me;
145 unsigned peer_features;
147 u32 connect_seq; /* identify the most recent connection 146 u32 connect_seq; /* identify the most recent connection
148 attempt for this connection, client */ 147 attempt for this connection, client */
149 u32 peer_global_seq; /* peer's global seq for this connection */ 148 u32 peer_global_seq; /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
158 struct list_head out_queue; 157 struct list_head out_queue;
159 struct list_head out_sent; /* sending or sent but unacked */ 158 struct list_head out_sent; /* sending or sent but unacked */
160 u64 out_seq; /* last message queued for send */ 159 u64 out_seq; /* last message queued for send */
161 u64 out_seq_sent; /* last message sent */
162 bool out_keepalive_pending; 160 bool out_keepalive_pending;
163 161
164 u64 in_seq, in_seq_acked; /* last message received, acked */ 162 u64 in_seq, in_seq_acked; /* last message received, acked */
@@ -215,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
215 213
216extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
217extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
218 217
219extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
220 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
@@ -234,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
234extern struct ceph_connection *ceph_con_get(struct ceph_connection *con); 233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
235extern void ceph_con_put(struct ceph_connection *con); 234extern void ceph_con_put(struct ceph_connection *con);
236 235
237extern struct ceph_msg *ceph_msg_new(int type, int front_len, 236extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
238 int page_len, int page_off,
239 struct page **pages);
240extern void ceph_msg_kfree(struct ceph_msg *m); 237extern void ceph_msg_kfree(struct ceph_msg *m);
241 238
242 239
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..07a539906e67 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
28 * resend any outstanding requests. 28 * resend any outstanding requests.
29 */ 29 */
30 30
31const static struct ceph_connection_operations mon_con_ops; 31static const struct ceph_connection_operations mon_con_ops;
32 32
33static int __validate_auth(struct ceph_mon_client *monc); 33static int __validate_auth(struct ceph_mon_client *monc);
34 34
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
104 monc->pending_auth = 1; 104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len; 105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len); 106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_con_revoke(monc->con, monc->m_auth);
107 ceph_msg_get(monc->m_auth); /* keep our ref */ 108 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth); 109 ceph_con_send(monc->con, monc->m_auth);
109} 110}
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
187 monc->want_next_osdmap); 188 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) || 189 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) { 190 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg; 191 struct ceph_msg *msg = monc->m_subscribe;
191 struct ceph_mon_subscribe_item *i; 192 struct ceph_mon_subscribe_item *i;
192 void *p, *end; 193 void *p, *end;
193 194
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195 if (!msg)
196 return;
197
198 p = msg->front.iov_base; 195 p = msg->front.iov_base;
199 end = p + msg->front.iov_len; 196 end = p + msg->front_max;
200 197
201 dout("__send_subscribe to 'mdsmap' %u+\n", 198 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap); 199 (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 223
227 msg->front.iov_len = p - msg->front.iov_base; 224 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 225 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg); 226 ceph_con_revoke(monc->con, msg);
227 ceph_con_send(monc->con, ceph_msg_get(msg));
230 228
231 monc->sub_sent = jiffies | 1; /* never 0 */ 229 monc->sub_sent = jiffies | 1; /* never 0 */
232 } 230 }
@@ -353,14 +351,14 @@ out:
353/* 351/*
354 * statfs 352 * statfs
355 */ 353 */
356static struct ceph_mon_statfs_request *__lookup_statfs( 354static struct ceph_mon_generic_request *__lookup_generic_req(
357 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
358{ 356{
359 struct ceph_mon_statfs_request *req; 357 struct ceph_mon_generic_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node; 358 struct rb_node *n = monc->generic_request_tree.rb_node;
361 359
362 while (n) { 360 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node); 361 req = rb_entry(n, struct ceph_mon_generic_request, node);
364 if (tid < req->tid) 362 if (tid < req->tid)
365 n = n->rb_left; 363 n = n->rb_left;
366 else if (tid > req->tid) 364 else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
371 return NULL; 369 return NULL;
372} 370}
373 371
374static void __insert_statfs(struct ceph_mon_client *monc, 372static void __insert_generic_request(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new) 373 struct ceph_mon_generic_request *new)
376{ 374{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node; 375 struct rb_node **p = &monc->generic_request_tree.rb_node;
378 struct rb_node *parent = NULL; 376 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL; 377 struct ceph_mon_generic_request *req = NULL;
380 378
381 while (*p) { 379 while (*p) {
382 parent = *p; 380 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node); 381 req = rb_entry(parent, struct ceph_mon_generic_request, node);
384 if (new->tid < req->tid) 382 if (new->tid < req->tid)
385 p = &(*p)->rb_left; 383 p = &(*p)->rb_left;
386 else if (new->tid > req->tid) 384 else if (new->tid > req->tid)
@@ -390,113 +388,159 @@ static void __insert_statfs(struct ceph_mon_client *monc,
390 } 388 }
391 389
392 rb_link_node(&new->node, parent, p); 390 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree); 391 rb_insert_color(&new->node, &monc->generic_request_tree);
392}
393
394static void release_generic_request(struct kref *kref)
395{
396 struct ceph_mon_generic_request *req =
397 container_of(kref, struct ceph_mon_generic_request, kref);
398
399 if (req->reply)
400 ceph_msg_put(req->reply);
401 if (req->request)
402 ceph_msg_put(req->request);
403
404 kfree(req);
405}
406
407static void put_generic_request(struct ceph_mon_generic_request *req)
408{
409 kref_put(&req->kref, release_generic_request);
410}
411
412static void get_generic_request(struct ceph_mon_generic_request *req)
413{
414 kref_get(&req->kref);
415}
416
417static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
418 struct ceph_msg_header *hdr,
419 int *skip)
420{
421 struct ceph_mon_client *monc = con->private;
422 struct ceph_mon_generic_request *req;
423 u64 tid = le64_to_cpu(hdr->tid);
424 struct ceph_msg *m;
425
426 mutex_lock(&monc->mutex);
427 req = __lookup_generic_req(monc, tid);
428 if (!req) {
429 dout("get_generic_reply %lld dne\n", tid);
430 *skip = 1;
431 m = NULL;
432 } else {
433 dout("get_generic_reply %lld got %p\n", tid, req->reply);
434 m = ceph_msg_get(req->reply);
435 /*
436 * we don't need to track the connection reading into
437 * this reply because we only have one open connection
438 * at a time, ever.
439 */
440 }
441 mutex_unlock(&monc->mutex);
442 return m;
394} 443}
395 444
396static void handle_statfs_reply(struct ceph_mon_client *monc, 445static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg) 446 struct ceph_msg *msg)
398{ 447{
399 struct ceph_mon_statfs_request *req; 448 struct ceph_mon_generic_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 449 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid; 450 u64 tid = le64_to_cpu(msg->hdr.tid);
402 451
403 if (msg->front.iov_len != sizeof(*reply)) 452 if (msg->front.iov_len != sizeof(*reply))
404 goto bad; 453 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 454 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407 455
408 mutex_lock(&monc->mutex); 456 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid); 457 req = __lookup_generic_req(monc, tid);
410 if (req) { 458 if (req) {
411 *req->buf = reply->st; 459 *(struct ceph_statfs *)req->buf = reply->st;
412 req->result = 0; 460 req->result = 0;
461 get_generic_request(req);
413 } 462 }
414 mutex_unlock(&monc->mutex); 463 mutex_unlock(&monc->mutex);
415 if (req) 464 if (req) {
416 complete(&req->completion); 465 complete(&req->completion);
466 put_generic_request(req);
467 }
417 return; 468 return;
418 469
419bad: 470bad:
420 pr_err("corrupt statfs reply, no tid\n"); 471 pr_err("corrupt generic reply, no tid\n");
421 ceph_msg_dump(msg); 472 ceph_msg_dump(msg);
422} 473}
423 474
424/* 475/*
425 * (re)send a statfs request 476 * Do a synchronous statfs().
426 */ 477 */
427static int send_statfs(struct ceph_mon_client *monc, 478int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
428 struct ceph_mon_statfs_request *req)
429{ 479{
430 struct ceph_msg *msg; 480 struct ceph_mon_generic_request *req;
431 struct ceph_mon_statfs *h; 481 struct ceph_mon_statfs *h;
482 int err;
483
484 req = kzalloc(sizeof(*req), GFP_NOFS);
485 if (!req)
486 return -ENOMEM;
487
488 kref_init(&req->kref);
489 req->buf = buf;
490 init_completion(&req->completion);
491
492 err = -ENOMEM;
493 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
494 if (!req->request)
495 goto out;
496 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
497 if (!req->reply)
498 goto out;
432 499
433 dout("send_statfs tid %llu\n", req->tid); 500 /* fill out request */
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL); 501 h = req->request->front.iov_base;
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0; 502 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1); 503 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0; 504 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid; 505 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463 506
464 /* register request */ 507 /* register request */
465 mutex_lock(&monc->mutex); 508 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid; 509 req->tid = ++monc->last_tid;
467 req.last_attempt = jiffies; 510 req->request->hdr.tid = cpu_to_le64(req->tid);
468 req.delay = BASE_DELAY_INTERVAL; 511 __insert_generic_request(monc, req);
469 __insert_statfs(monc, &req); 512 monc->num_generic_requests++;
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex); 513 mutex_unlock(&monc->mutex);
472 514
473 /* send request and wait */ 515 /* send request and wait */
474 err = send_statfs(monc, &req); 516 ceph_con_send(monc->con, ceph_msg_get(req->request));
475 if (!err) 517 err = wait_for_completion_interruptible(&req->completion);
476 err = wait_for_completion_interruptible(&req.completion);
477 518
478 mutex_lock(&monc->mutex); 519 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree); 520 rb_erase(&req->node, &monc->generic_request_tree);
480 monc->num_statfs_requests--; 521 monc->num_generic_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex); 522 mutex_unlock(&monc->mutex);
483 523
484 if (!err) 524 if (!err)
485 err = req.result; 525 err = req->result;
526
527out:
528 kref_put(&req->kref, release_generic_request);
486 return err; 529 return err;
487} 530}
488 531
489/* 532/*
490 * Resend pending statfs requests. 533 * Resend pending statfs requests.
491 */ 534 */
492static void __resend_statfs(struct ceph_mon_client *monc) 535static void __resend_generic_request(struct ceph_mon_client *monc)
493{ 536{
494 struct ceph_mon_statfs_request *req; 537 struct ceph_mon_generic_request *req;
495 struct rb_node *p; 538 struct rb_node *p;
496 539
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) { 540 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node); 541 req = rb_entry(p, struct ceph_mon_generic_request, node);
499 send_statfs(monc, req); 542 ceph_con_revoke(monc->con, req->request);
543 ceph_con_send(monc->con, ceph_msg_get(req->request));
500 } 544 }
501} 545}
502 546
@@ -586,26 +630,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 630 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 631 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588 632
589 /* msg pools */ 633 /* msgs */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack, 634 err = -ENOMEM;
591 sizeof(struct ceph_mon_subscribe_ack), 1, false); 635 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
592 if (err < 0) 636 sizeof(struct ceph_mon_subscribe_ack),
637 GFP_NOFS);
638 if (!monc->m_subscribe_ack)
593 goto out_monmap; 639 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply, 640
595 sizeof(struct ceph_mon_statfs_reply), 0, false); 641 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
596 if (err < 0) 642 if (!monc->m_subscribe)
597 goto out_pool1; 643 goto out_subscribe_ack;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false); 644
599 if (err < 0) 645 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
600 goto out_pool2; 646 if (!monc->m_auth_reply)
601 647 goto out_subscribe;
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL); 648
649 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
603 monc->pending_auth = 0; 650 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) { 651 if (!monc->m_auth)
605 err = PTR_ERR(monc->m_auth); 652 goto out_auth_reply;
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609 653
610 monc->cur_mon = -1; 654 monc->cur_mon = -1;
611 monc->hunting = true; 655 monc->hunting = true;
@@ -613,8 +657,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
613 monc->sub_sent = 0; 657 monc->sub_sent = 0;
614 658
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 659 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT; 660 monc->generic_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0; 661 monc->num_generic_requests = 0;
618 monc->last_tid = 0; 662 monc->last_tid = 0;
619 663
620 monc->have_mdsmap = 0; 664 monc->have_mdsmap = 0;
@@ -622,12 +666,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
622 monc->want_next_osdmap = 1; 666 monc->want_next_osdmap = 1;
623 return 0; 667 return 0;
624 668
625out_pool3: 669out_auth_reply:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 670 ceph_msg_put(monc->m_auth_reply);
627out_pool2: 671out_subscribe:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 672 ceph_msg_put(monc->m_subscribe);
629out_pool1: 673out_subscribe_ack:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 674 ceph_msg_put(monc->m_subscribe_ack);
631out_monmap: 675out_monmap:
632 kfree(monc->monmap); 676 kfree(monc->monmap);
633out: 677out:
@@ -651,9 +695,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
651 ceph_auth_destroy(monc->auth); 695 ceph_auth_destroy(monc->auth);
652 696
653 ceph_msg_put(monc->m_auth); 697 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack); 698 ceph_msg_put(monc->m_auth_reply);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply); 699 ceph_msg_put(monc->m_subscribe);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply); 700 ceph_msg_put(monc->m_subscribe_ack);
657 701
658 kfree(monc->monmap); 702 kfree(monc->monmap);
659} 703}
@@ -662,8 +706,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg) 706 struct ceph_msg *msg)
663{ 707{
664 int ret; 708 int ret;
709 int was_auth = 0;
665 710
666 mutex_lock(&monc->mutex); 711 mutex_lock(&monc->mutex);
712 if (monc->auth->ops)
713 was_auth = monc->auth->ops->is_authenticated(monc->auth);
667 monc->pending_auth = 0; 714 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 715 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len, 716 msg->front.iov_len,
@@ -674,14 +721,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
674 wake_up(&monc->client->auth_wq); 721 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) { 722 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret); 723 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 724 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n"); 725 dout("authenticated, starting session\n");
679 726
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 727 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681 monc->client->msgr->inst.name.num = monc->auth->global_id; 728 monc->client->msgr->inst.name.num = monc->auth->global_id;
682 729
683 __send_subscribe(monc); 730 __send_subscribe(monc);
684 __resend_statfs(monc); 731 __resend_generic_request(monc);
685 } 732 }
686 mutex_unlock(&monc->mutex); 733 mutex_unlock(&monc->mutex);
687} 734}
@@ -770,18 +817,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
770 817
771 switch (type) { 818 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK: 819 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len); 820 m = ceph_msg_get(monc->m_subscribe_ack);
774 break; 821 break;
775 case CEPH_MSG_STATFS_REPLY: 822 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len); 823 return get_generic_reply(con, hdr, skip);
777 break;
778 case CEPH_MSG_AUTH_REPLY: 824 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len); 825 m = ceph_msg_get(monc->m_auth_reply);
780 break; 826 break;
781 case CEPH_MSG_MON_MAP: 827 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP: 828 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP: 829 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL); 830 m = ceph_msg_new(type, front_len, GFP_NOFS);
785 break; 831 break;
786 } 832 }
787 833
@@ -826,7 +872,7 @@ out:
826 mutex_unlock(&monc->mutex); 872 mutex_unlock(&monc->mutex);
827} 873}
828 874
829const static struct ceph_connection_operations mon_con_ops = { 875static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get, 876 .get = ceph_con_get,
831 .put = ceph_con_put, 877 .put = ceph_con_put,
832 .dispatch = dispatch, 878 .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
2#define _FS_CEPH_MON_CLIENT_H 2#define _FS_CEPH_MON_CLIENT_H
3 3
4#include <linux/completion.h> 4#include <linux/completion.h>
5#include <linux/kref.h>
5#include <linux/rbtree.h> 6#include <linux/rbtree.h>
6 7
7#include "messenger.h" 8#include "messenger.h"
8#include "msgpool.h"
9 9
10struct ceph_client; 10struct ceph_client;
11struct ceph_mount_args; 11struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
22}; 22};
23 23
24struct ceph_mon_client; 24struct ceph_mon_client;
25struct ceph_mon_statfs_request; 25struct ceph_mon_generic_request;
26 26
27 27
28/* 28/*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
40}; 40};
41 41
42/* 42/*
43 * statfs() is done a bit differently because we need to get data back 43 * ceph_mon_generic_request is being used for the statfs and poolop requests
44 * which are bening done a bit differently because we need to get data back
44 * to the caller 45 * to the caller
45 */ 46 */
46struct ceph_mon_statfs_request { 47struct ceph_mon_generic_request {
48 struct kref kref;
47 u64 tid; 49 u64 tid;
48 struct rb_node node; 50 struct rb_node node;
49 int result; 51 int result;
50 struct ceph_statfs *buf; 52 void *buf;
51 struct completion completion; 53 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */ 54 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */
54}; 56};
55 57
56struct ceph_mon_client { 58struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
61 struct delayed_work delayed_work; 63 struct delayed_work delayed_work;
62 64
63 struct ceph_auth_client *auth; 65 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth; 66 struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
65 int pending_auth; 67 int pending_auth;
66 68
67 bool hunting; 69 bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
70 struct ceph_connection *con; 72 struct ceph_connection *con;
71 bool have_fsid; 73 bool have_fsid;
72 74
73 /* msg pools */ 75 /* pending generic requests */
74 struct ceph_msgpool msgpool_subscribe_ack; 76 struct rb_root generic_request_tree;
75 struct ceph_msgpool msgpool_statfs_reply; 77 int num_generic_requests;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid; 78 u64 last_tid;
82 79
83 /* mds/osd map */ 80 /* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
7 7
8#include "msgpool.h" 8#include "msgpool.h"
9 9
10/* 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11 * We use msg pools to preallocate memory for messages we expect to 11{
12 * receive over the wire, to avoid getting ourselves into OOM 12 struct ceph_msgpool *pool = arg;
13 * conditions at unexpected times. We take use a few different 13 void *p;
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31 14
15 p = ceph_msg_new(0, pool->front_len, gfp_mask);
16 if (!p)
17 pr_err("msgpool %s alloc failed\n", pool->name);
18 return p;
19}
32 20
33/* 21static void free_fn(void *element, void *arg)
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{ 22{
38 struct ceph_msg *msg; 23 ceph_msg_put(element);
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61} 24}
62 25
63int ceph_msgpool_init(struct ceph_msgpool *pool, 26int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking) 27 int front_len, int size, bool blocking, const char *name)
65{ 28{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len; 29 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs); 30 pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
72 pool->num = 0; 31 if (!pool->pool)
73 pool->min = min; 32 return -ENOMEM;
74 pool->blocking = blocking; 33 pool->name = name;
75 init_waitqueue_head(&pool->wait); 34 return 0;
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81} 35}
82 36
83void ceph_msgpool_destroy(struct ceph_msgpool *pool) 37void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{ 38{
85 dout("msgpool_destroy %p\n", pool); 39 mempool_destroy(pool->pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90} 40}
91 41
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta) 42struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
43 int front_len)
93{ 44{
94 int ret; 45 if (front_len > pool->front_len) {
95 46 pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
96 spin_lock(&pool->lock); 47 pool->name, front_len, pool->front_len);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1); 48 WARN_ON(1);
113 49
114 /* try to alloc a fresh message */ 50 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL); 51 return ceph_msg_new(0, front_len, GFP_NOFS);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 } 52 }
129 53
130 while (1) { 54 return mempool_alloc(pool->pool, GFP_NOFS);
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163} 55}
164 56
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) 57void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{ 58{
167 spin_lock(&pool->lock); 59 /* reset msg front_len; user may have changed it */
168 if (pool->num < pool->min) { 60 msg->front.iov_len = pool->front_len;
169 /* reset msg front_len; user may have changed it */ 61 msg->hdr.front_len = cpu_to_le32(pool->front_len);
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172 62
173 kref_set(&msg->kref, 1); /* retake a single ref */ 63 kref_init(&msg->kref); /* retake single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186} 64}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
1#ifndef _FS_CEPH_MSGPOOL 1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL 2#define _FS_CEPH_MSGPOOL
3 3
4#include <linux/mempool.h>
4#include "messenger.h" 5#include "messenger.h"
5 6
6/* 7/*
@@ -8,18 +9,15 @@
8 * avoid unexpected OOM conditions. 9 * avoid unexpected OOM conditions.
9 */ 10 */
10struct ceph_msgpool { 11struct ceph_msgpool {
11 spinlock_t lock; 12 const char *name;
13 mempool_t *pool;
12 int front_len; /* preallocated payload size */ 14 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17}; 15};
18 16
19extern int ceph_msgpool_init(struct ceph_msgpool *pool, 17extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking); 18 int front_len, int size, bool blocking,
19 const char *name);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); 20extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, 21extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len); 22 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); 23extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
50#define CEPH_ENTITY_TYPE_MDS 0x02 50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04 51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08 52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20 53#define CEPH_ENTITY_TYPE_AUTH 0x20
55 54
56#define CEPH_ENTITY_TYPE_ANY 0xFF 55#define CEPH_ENTITY_TYPE_ANY 0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
120/* 119/*
121 * message header 120 * message header
122 */ 121 */
123struct ceph_msg_header { 122struct ceph_msg_header_old {
124 __le64 seq; /* message seq# for this session */ 123 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */ 124 __le64 tid; /* transaction id */
126 __le16 type; /* message type */ 125 __le16 type; /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
138 __le32 crc; /* header crc32c */ 137 __le32 crc; /* header crc32c */
139} __attribute__ ((packed)); 138} __attribute__ ((packed));
140 139
140struct ceph_msg_header {
141 __le64 seq; /* message seq# for this session */
142 __le64 tid; /* transaction id */
143 __le16 type; /* message type */
144 __le16 priority; /* priority. higher value == higher priority */
145 __le16 version; /* version of message encoding */
146
147 __le32 front_len; /* bytes in main payload */
148 __le32 middle_len;/* bytes in middle payload */
149 __le32 data_len; /* bytes of data payload */
150 __le16 data_off; /* sender: include full offset;
151 receiver: mask against ~PAGE_MASK */
152
153 struct ceph_entity_name src;
154 __le32 reserved;
155 __le32 crc; /* header crc32c */
156} __attribute__ ((packed));
157
141#define CEPH_MSG_PRIO_LOW 64 158#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127 159#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196 160#define CEPH_MSG_PRIO_HIGH 196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
16#define OSD_OP_FRONT_LEN 4096 16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 17#define OSD_OPREPLY_FRONT_LEN 512
18 18
19const static struct ceph_connection_operations osd_con_ops; 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc, 20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd); 21 struct ceph_osd *kickosd);
22 22
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
147 req = kzalloc(sizeof(*req), GFP_NOFS); 147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 } 148 }
149 if (req == NULL) 149 if (req == NULL)
150 return ERR_PTR(-ENOMEM); 150 return NULL;
151 151
152 req->r_osdc = osdc; 152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 153 req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL); 167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
168 if (IS_ERR(msg)) { 168 if (!msg) {
169 ceph_osdc_put_request(req); 169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg)); 170 return NULL;
171 } 171 }
172 req->r_reply = msg; 172 req->r_reply = msg;
173 173
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL); 181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
182 if (IS_ERR(msg)) { 182 if (!msg) {
183 ceph_osdc_put_request(req); 183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg)); 184 return NULL;
185 } 185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 187 memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
368/* 373/*
@@ -715,7 +720,7 @@ static void handle_timeout(struct work_struct *work)
715 * should mark the osd as failed and we should find out about 720 * should mark the osd as failed and we should find out about
716 * it from an updated osd map. 721 * it from an updated osd map.
717 */ 722 */
718 while (!list_empty(&osdc->req_lru)) { 723 while (timeout && !list_empty(&osdc->req_lru)) {
719 req = list_entry(osdc->req_lru.next, struct ceph_osd_request, 724 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
720 r_req_lru_item); 725 r_req_lru_item);
721 726
@@ -1078,6 +1083,7 @@ done:
1078 if (newmap) 1083 if (newmap)
1079 kick_requests(osdc, NULL); 1084 kick_requests(osdc, NULL);
1080 up_read(&osdc->map_sem); 1085 up_read(&osdc->map_sem);
1086 wake_up(&osdc->client->auth_wq);
1081 return; 1087 return;
1082 1088
1083bad: 1089bad:
@@ -1087,45 +1093,6 @@ bad:
1087 return; 1093 return;
1088} 1094}
1089 1095
1090
1091/*
1092 * A read request prepares specific pages that data is to be read into.
1093 * When a message is being read off the wire, we call prepare_pages to
1094 * find those pages.
1095 * 0 = success, -1 failure.
1096 */
1097static int __prepare_pages(struct ceph_connection *con,
1098 struct ceph_msg_header *hdr,
1099 struct ceph_osd_request *req,
1100 u64 tid,
1101 struct ceph_msg *m)
1102{
1103 struct ceph_osd *osd = con->private;
1104 struct ceph_osd_client *osdc;
1105 int ret = -1;
1106 int data_len = le32_to_cpu(hdr->data_len);
1107 unsigned data_off = le16_to_cpu(hdr->data_off);
1108
1109 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1110
1111 if (!osd)
1112 return -1;
1113
1114 osdc = osd->o_osdc;
1115
1116 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1117 tid, req->r_num_pages, want);
1118 if (unlikely(req->r_num_pages < want))
1119 goto out;
1120 m->pages = req->r_pages;
1121 m->nr_pages = req->r_num_pages;
1122 ret = 0; /* success */
1123out:
1124 BUG_ON(ret < 0 || m->nr_pages < want);
1125
1126 return ret;
1127}
1128
1129/* 1096/*
1130 * Register request, send initial attempt. 1097 * Register request, send initial attempt.
1131 */ 1098 */
@@ -1252,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1252 if (!osdc->req_mempool) 1219 if (!osdc->req_mempool)
1253 goto out; 1220 goto out;
1254 1221
1255 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true); 1222 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
1223 "osd_op");
1256 if (err < 0) 1224 if (err < 0)
1257 goto out_mempool; 1225 goto out_mempool;
1258 err = ceph_msgpool_init(&osdc->msgpool_op_reply, 1226 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1259 OSD_OPREPLY_FRONT_LEN, 10, true); 1227 OSD_OPREPLY_FRONT_LEN, 10, true,
1228 "osd_op_reply");
1260 if (err < 0) 1229 if (err < 0)
1261 goto out_msgpool; 1230 goto out_msgpool;
1262 return 0; 1231 return 0;
@@ -1302,8 +1271,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1302 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1271 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1303 NULL, 0, truncate_seq, truncate_size, NULL, 1272 NULL, 0, truncate_seq, truncate_size, NULL,
1304 false, 1); 1273 false, 1);
1305 if (IS_ERR(req)) 1274 if (!req)
1306 return PTR_ERR(req); 1275 return -ENOMEM;
1307 1276
1308 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1309 req->r_pages = pages; 1278 req->r_pages = pages;
@@ -1345,8 +1314,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1345 snapc, do_sync, 1314 snapc, do_sync,
1346 truncate_seq, truncate_size, mtime, 1315 truncate_seq, truncate_size, mtime,
1347 nofail, 1); 1316 nofail, 1);
1348 if (IS_ERR(req)) 1317 if (!req)
1349 return PTR_ERR(req); 1318 return -ENOMEM;
1350 1319
1351 /* it may be a short write due to an object boundary */ 1320 /* it may be a short write due to an object boundary */
1352 req->r_pages = pages; 1321 req->r_pages = pages;
@@ -1394,7 +1363,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1394} 1363}
1395 1364
1396/* 1365/*
1397 * lookup and return message for incoming reply 1366 * lookup and return message for incoming reply. set up reply message
1367 * pages.
1398 */ 1368 */
1399static struct ceph_msg *get_reply(struct ceph_connection *con, 1369static struct ceph_msg *get_reply(struct ceph_connection *con,
1400 struct ceph_msg_header *hdr, 1370 struct ceph_msg_header *hdr,
@@ -1407,7 +1377,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1407 int front = le32_to_cpu(hdr->front_len); 1377 int front = le32_to_cpu(hdr->front_len);
1408 int data_len = le32_to_cpu(hdr->data_len); 1378 int data_len = le32_to_cpu(hdr->data_len);
1409 u64 tid; 1379 u64 tid;
1410 int err;
1411 1380
1412 tid = le64_to_cpu(hdr->tid); 1381 tid = le64_to_cpu(hdr->tid);
1413 mutex_lock(&osdc->request_mutex); 1382 mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1394,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1425 req->r_reply, req->r_con_filling_msg); 1394 req->r_reply, req->r_con_filling_msg);
1426 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply); 1395 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1427 ceph_con_put(req->r_con_filling_msg); 1396 ceph_con_put(req->r_con_filling_msg);
1397 req->r_con_filling_msg = NULL;
1428 } 1398 }
1429 1399
1430 if (front > req->r_reply->front.iov_len) { 1400 if (front > req->r_reply->front.iov_len) {
1431 pr_warning("get_reply front %d > preallocated %d\n", 1401 pr_warning("get_reply front %d > preallocated %d\n",
1432 front, (int)req->r_reply->front.iov_len); 1402 front, (int)req->r_reply->front.iov_len);
1433 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL); 1403 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
1434 if (IS_ERR(m)) 1404 if (!m)
1435 goto out; 1405 goto out;
1436 ceph_msg_put(req->r_reply); 1406 ceph_msg_put(req->r_reply);
1437 req->r_reply = m; 1407 req->r_reply = m;
@@ -1439,12 +1409,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1439 m = ceph_msg_get(req->r_reply); 1409 m = ceph_msg_get(req->r_reply);
1440 1410
1441 if (data_len > 0) { 1411 if (data_len > 0) {
1442 err = __prepare_pages(con, hdr, req, tid, m); 1412 unsigned data_off = le16_to_cpu(hdr->data_off);
1443 if (err < 0) { 1413 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1414
1415 if (unlikely(req->r_num_pages < want)) {
1416 pr_warning("tid %lld reply %d > expected %d pages\n",
1417 tid, want, m->nr_pages);
1444 *skip = 1; 1418 *skip = 1;
1445 ceph_msg_put(m); 1419 ceph_msg_put(m);
1446 m = ERR_PTR(err); 1420 m = NULL;
1421 goto out;
1447 } 1422 }
1423 m->pages = req->r_pages;
1424 m->nr_pages = req->r_num_pages;
1448 } 1425 }
1449 *skip = 0; 1426 *skip = 0;
1450 req->r_con_filling_msg = ceph_con_get(con); 1427 req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1443,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1466 1443
1467 switch (type) { 1444 switch (type) {
1468 case CEPH_MSG_OSD_MAP: 1445 case CEPH_MSG_OSD_MAP:
1469 return ceph_msg_new(type, front, 0, 0, NULL); 1446 return ceph_msg_new(type, front, GFP_NOFS);
1470 case CEPH_MSG_OSD_OPREPLY: 1447 case CEPH_MSG_OSD_OPREPLY:
1471 return get_reply(con, hdr, skip); 1448 return get_reply(con, hdr, skip);
1472 default: 1449 default:
@@ -1552,7 +1529,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
1552 return ceph_monc_validate_auth(&osdc->client->monc); 1529 return ceph_monc_validate_auth(&osdc->client->monc);
1553} 1530}
1554 1531
1555const static struct ceph_connection_operations osd_con_ops = { 1532static const struct ceph_connection_operations osd_con_ops = {
1556 .get = get_osd_con, 1533 .get = get_osd_con,
1557 .put = put_osd_con, 1534 .put = put_osd_con,
1558 .dispatch = dispatch, 1535 .dispatch = dispatch,
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -706,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 len, *p, end); 706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end)); 707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 708 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush)); 709 return ERR_CAST(newcrush);
710 } 710 }
711 711
712 /* new flags? */ 712 /* new flags? */
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
20 20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl) 21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{ 22{
23 struct page *page = alloc_page(GFP_NOFS); 23 struct page *page = __page_cache_alloc(GFP_NOFS);
24 if (!page) 24 if (!page)
25 return -ENOMEM; 25 return -ENOMEM;
26 pl->room += PAGE_SIZE; 26 pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
101 __le64 snap_seq; /* seq for per-pool snapshot */ 101 __le64 snap_seq; /* seq for per-pool snapshot */
102 __le32 snap_epoch; /* epoch of last snap */ 102 __le32 snap_epoch; /* epoch of last snap */
103 __le32 num_snaps; 103 __le32 num_snaps;
104 __le32 num_removed_snap_intervals; 104 __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
105 __le64 uid; 105 __le64 auid; /* who owns the pg */
106} __attribute__ ((packed)); 106} __attribute__ ((packed));
107 107
108/* 108/*
@@ -208,6 +208,7 @@ enum {
208 /* read */ 208 /* read */
209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1, 209 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2, 210 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
211 CEPH_OSD_OP_CMPXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
211 212
212 /* write */ 213 /* write */
213 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1, 214 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
305#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ 306#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
306#define EBLACKLISTED ESHUTDOWN /* blacklisted */ 307#define EBLACKLISTED ESHUTDOWN /* blacklisted */
307 308
309/* xattr comparison */
310enum {
311 CEPH_OSD_CMPXATTR_OP_NOP = 0,
312 CEPH_OSD_CMPXATTR_OP_EQ = 1,
313 CEPH_OSD_CMPXATTR_OP_NE = 2,
314 CEPH_OSD_CMPXATTR_OP_GT = 3,
315 CEPH_OSD_CMPXATTR_OP_GTE = 4,
316 CEPH_OSD_CMPXATTR_OP_LT = 5,
317 CEPH_OSD_CMPXATTR_OP_LTE = 6
318};
319
320enum {
321 CEPH_OSD_CMPXATTR_MODE_STRING = 1,
322 CEPH_OSD_CMPXATTR_MODE_U64 = 2
323};
324
308/* 325/*
309 * an individual object operation. each may be accompanied by some data 326 * an individual object operation. each may be accompanied by some data
310 * payload 327 * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
321 struct { 338 struct {
322 __le32 name_len; 339 __le32 name_len;
323 __le32 value_len; 340 __le32 value_len;
341 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
342 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
324 } __attribute__ ((packed)) xattr; 343 } __attribute__ ((packed)) xattr;
325 struct { 344 struct {
326 __u8 class_len; 345 __u8 class_len;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap) 512 struct ceph_cap_snap *capsnap)
513{ 513{
514 struct inode *inode = &ci->vfs_inode; 514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc; 515 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
516 516
517 BUG_ON(capsnap->writing); 517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size; 518 capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 110857ba9269..fa87f51e38e1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/parser.h> 10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/seq_file.h> 12#include <linux/seq_file.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/statfs.h> 14#include <linux/statfs.h>
16#include <linux/string.h> 15#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19 16
20#include "decode.h" 17#include "decode.h"
21#include "super.h" 18#include "super.h"
@@ -92,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
92 89
93 buf->f_files = le64_to_cpu(st.num_objects); 90 buf->f_files = le64_to_cpu(st.num_objects);
94 buf->f_ffree = -1; 91 buf->f_ffree = -1;
95 buf->f_namelen = PATH_MAX; 92 buf->f_namelen = NAME_MAX;
96 buf->f_frsize = PAGE_CACHE_SIZE; 93 buf->f_frsize = PAGE_CACHE_SIZE;
97 94
98 /* leave fsid little-endian, regardless of host endianness */ 95 /* leave fsid little-endian, regardless of host endianness */
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
107static int ceph_syncfs(struct super_block *sb, int wait) 104static int ceph_syncfs(struct super_block *sb, int wait)
108{ 105{
109 dout("sync_fs %d\n", wait); 106 dout("sync_fs %d\n", wait);
110 ceph_osdc_sync(&ceph_client(sb)->osdc); 107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
111 ceph_mdsc_sync(&ceph_client(sb)->mdsc); 108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
112 dout("sync_fs %d done\n", wait); 109 dout("sync_fs %d done\n", wait);
113 return 0; 110 return 0;
114} 111}
115 112
113static int default_congestion_kb(void)
114{
115 int congestion_kb;
116
117 /*
118 * Copied from NFS
119 *
120 * congestion size, scale with available memory.
121 *
122 * 64MB: 8192k
123 * 128MB: 11585k
124 * 256MB: 16384k
125 * 512MB: 23170k
126 * 1GB: 32768k
127 * 2GB: 46340k
128 * 4GB: 65536k
129 * 8GB: 92681k
130 * 16GB: 131072k
131 *
132 * This allows larger machines to have larger/more transfers.
133 * Limit the default to 256M
134 */
135 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
136 if (congestion_kb > 256*1024)
137 congestion_kb = 256*1024;
138
139 return congestion_kb;
140}
116 141
117/** 142/**
118 * ceph_show_options - Show mount options in /proc/mounts 143 * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
138 seq_puts(m, ",nocrc"); 163 seq_puts(m, ",nocrc");
139 if (args->flags & CEPH_OPT_NOASYNCREADDIR) 164 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
140 seq_puts(m, ",noasyncreaddir"); 165 seq_puts(m, ",noasyncreaddir");
166
167 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
168 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
169 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
170 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
171 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
172 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
173 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
174 seq_printf(m, ",osdkeepalivetimeout=%d",
175 args->osd_keepalive_timeout);
176 if (args->wsize)
177 seq_printf(m, ",wsize=%d", args->wsize);
178 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
179 seq_printf(m, ",rsize=%d", args->rsize);
180 if (args->congestion_kb != default_congestion_kb())
181 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
182 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
183 seq_printf(m, ",caps_wanted_delay_min=%d",
184 args->caps_wanted_delay_min);
185 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
186 seq_printf(m, ",caps_wanted_delay_max=%d",
187 args->caps_wanted_delay_max);
188 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
189 seq_printf(m, ",cap_release_safety=%d",
190 args->cap_release_safety);
191 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
192 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
193 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
194 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
141 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) 195 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
142 seq_printf(m, ",snapdirname=%s", args->snapdir_name); 196 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
143 if (args->name) 197 if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
161 inode_init_once(&ci->vfs_inode); 215 inode_init_once(&ci->vfs_inode);
162} 216}
163 217
164static int default_congestion_kb(void)
165{
166 int congestion_kb;
167
168 /*
169 * Copied from NFS
170 *
171 * congestion size, scale with available memory.
172 *
173 * 64MB: 8192k
174 * 128MB: 11585k
175 * 256MB: 16384k
176 * 512MB: 23170k
177 * 1GB: 32768k
178 * 2GB: 46340k
179 * 4GB: 65536k
180 * 8GB: 92681k
181 * 16GB: 131072k
182 *
183 * This allows larger machines to have larger/more transfers.
184 * Limit the default to 256M
185 */
186 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
187 if (congestion_kb > 256*1024)
188 congestion_kb = 256*1024;
189
190 return congestion_kb;
191}
192
193static int __init init_caches(void) 218static int __init init_caches(void)
194{ 219{
195 ceph_inode_cachep = kmem_cache_create("ceph_inode_info", 220 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
308 Opt_osd_idle_ttl, 333 Opt_osd_idle_ttl,
309 Opt_caps_wanted_delay_min, 334 Opt_caps_wanted_delay_min,
310 Opt_caps_wanted_delay_max, 335 Opt_caps_wanted_delay_max,
336 Opt_cap_release_safety,
311 Opt_readdir_max_entries, 337 Opt_readdir_max_entries,
338 Opt_readdir_max_bytes,
312 Opt_congestion_kb, 339 Opt_congestion_kb,
313 Opt_last_int, 340 Opt_last_int,
314 /* int args above */ 341 /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
339 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, 366 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
340 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 367 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
341 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 368 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
369 {Opt_cap_release_safety, "cap_release_safety=%d"},
342 {Opt_readdir_max_entries, "readdir_max_entries=%d"}, 370 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
343 {Opt_congestion_kb, "write_congestion_kb=%d"}, 372 {Opt_congestion_kb, "write_congestion_kb=%d"},
344 /* int args above */ 373 /* int args above */
345 {Opt_snapdirname, "snapdirname=%s"}, 374 {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
388 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; 417 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
389 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT; 418 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
390 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); 419 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
391 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4; 420 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
392 args->max_readdir = 1024; 421 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
422 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
393 args->congestion_kb = default_congestion_kb(); 423 args->congestion_kb = default_congestion_kb();
394 424
395 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */ 425 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
497 case Opt_readdir_max_entries: 527 case Opt_readdir_max_entries:
498 args->max_readdir = intval; 528 args->max_readdir = intval;
499 break; 529 break;
530 case Opt_readdir_max_bytes:
531 args->max_readdir_bytes = intval;
532 break;
500 case Opt_congestion_kb: 533 case Opt_congestion_kb:
501 args->congestion_kb = intval; 534 args->congestion_kb = intval;
502 break; 535 break;
@@ -636,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
636 669
637 /* unmount */ 670 /* unmount */
638 ceph_mdsc_stop(&client->mdsc); 671 ceph_mdsc_stop(&client->mdsc);
639 ceph_monc_stop(&client->monc);
640 ceph_osdc_stop(&client->osdc); 672 ceph_osdc_stop(&client->osdc);
641 673
674 /*
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682
642 ceph_adjust_min_caps(-client->min_caps); 683 ceph_adjust_min_caps(-client->min_caps);
643 684
644 ceph_debugfs_client_cleanup(client); 685 ceph_debugfs_client_cleanup(client);
@@ -682,9 +723,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
682/* 723/*
683 * true if we have the mon map (and have thus joined the cluster) 724 * true if we have the mon map (and have thus joined the cluster)
684 */ 725 */
685static int have_mon_map(struct ceph_client *client) 726static int have_mon_and_osd_map(struct ceph_client *client)
686{ 727{
687 return client->monc.monmap && client->monc.monmap->epoch; 728 return client->monc.monmap && client->monc.monmap->epoch &&
729 client->osdc.osdmap && client->osdc.osdmap->epoch;
688} 730}
689 731
690/* 732/*
@@ -704,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
704 dout("open_root_inode opening '%s'\n", path); 746 dout("open_root_inode opening '%s'\n", path);
705 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 747 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
706 if (IS_ERR(req)) 748 if (IS_ERR(req))
707 return ERR_PTR(PTR_ERR(req)); 749 return ERR_CAST(req);
708 req->r_path1 = kstrdup(path, GFP_NOFS); 750 req->r_path1 = kstrdup(path, GFP_NOFS);
709 req->r_ino1.ino = CEPH_INO_ROOT; 751 req->r_ino1.ino = CEPH_INO_ROOT;
710 req->r_ino1.snap = CEPH_NOSNAP; 752 req->r_ino1.snap = CEPH_NOSNAP;
@@ -762,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
762 if (err < 0) 804 if (err < 0)
763 goto out; 805 goto out;
764 806
765 while (!have_mon_map(client)) { 807 while (!have_mon_and_osd_map(client)) {
766 err = -EIO; 808 err = -EIO;
767 if (timeout && time_after_eq(jiffies, started + timeout)) 809 if (timeout && time_after_eq(jiffies, started + timeout))
768 goto out; 810 goto out;
@@ -770,8 +812,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
770 /* wait */ 812 /* wait */
771 dout("mount waiting for mon_map\n"); 813 dout("mount waiting for mon_map\n");
772 err = wait_event_interruptible_timeout(client->auth_wq, 814 err = wait_event_interruptible_timeout(client->auth_wq,
773 have_mon_map(client) || (client->auth_err < 0), 815 have_mon_and_osd_map(client) || (client->auth_err < 0),
774 timeout); 816 timeout);
775 if (err == -EINTR || err == -ERESTARTSYS) 817 if (err == -EINTR || err == -ERESTARTSYS)
776 goto out; 818 goto out;
777 if (client->auth_err < 0) { 819 if (client->auth_err < 0) {
@@ -884,6 +926,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
884/* 926/*
885 * construct our own bdi so we can control readahead, etc. 927 * construct our own bdi so we can control readahead, etc.
886 */ 928 */
929static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
930
887static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
888{ 932{
889 int err; 933 int err;
@@ -893,7 +937,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
893 client->backing_dev_info.ra_pages = 937 client->backing_dev_info.ra_pages =
894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 938 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
895 >> PAGE_SHIFT; 939 >> PAGE_SHIFT;
896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 940 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
941 atomic_long_inc_return(&bdi_seq));
897 if (!err) 942 if (!err)
898 sb->s_bdi = &client->backing_dev_info; 943 sb->s_bdi = &client->backing_dev_info;
899 return err; 944 return err;
@@ -932,9 +977,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
932 goto out; 977 goto out;
933 } 978 }
934 979
935 if (ceph_client(sb) != client) { 980 if (ceph_sb_to_client(sb) != client) {
936 ceph_destroy_client(client); 981 ceph_destroy_client(client);
937 client = ceph_client(sb); 982 client = ceph_sb_to_client(sb);
938 dout("get_sb got existing client %p\n", client); 983 dout("get_sb got existing client %p\n", client);
939 } else { 984 } else {
940 dout("get_sb using new client %p\n", client); 985 dout("get_sb using new client %p\n", client);
@@ -952,8 +997,7 @@ static int ceph_get_sb(struct file_system_type *fs_type,
952 997
953out_splat: 998out_splat:
954 ceph_mdsc_close_sessions(&client->mdsc); 999 ceph_mdsc_close_sessions(&client->mdsc);
955 up_write(&sb->s_umount); 1000 deactivate_locked_super(sb);
956 deactivate_super(sb);
957 goto out_final; 1001 goto out_final;
958 1002
959out: 1003out:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h> 13#include <linux/wait.h>
15#include <linux/writeback.h> 14#include <linux/writeback.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -52,24 +51,25 @@
52 51
53struct ceph_mount_args { 52struct ceph_mount_args {
54 int sb_flags; 53 int sb_flags;
54 int flags;
55 struct ceph_fsid fsid;
56 struct ceph_entity_addr my_addr;
55 int num_mon; 57 int num_mon;
56 struct ceph_entity_addr *mon_addr; 58 struct ceph_entity_addr *mon_addr;
57 int flags;
58 int mount_timeout; 59 int mount_timeout;
59 int osd_idle_ttl; 60 int osd_idle_ttl;
60 int caps_wanted_delay_min, caps_wanted_delay_max;
61 struct ceph_fsid fsid;
62 struct ceph_entity_addr my_addr;
63 int wsize;
64 int rsize; /* max readahead */
65 int max_readdir; /* max readdir size */
66 int congestion_kb; /* max readdir size */
67 int osd_timeout; 61 int osd_timeout;
68 int osd_keepalive_timeout; 62 int osd_keepalive_timeout;
63 int wsize;
64 int rsize; /* max readahead */
65 int congestion_kb; /* max writeback in flight */
66 int caps_wanted_delay_min, caps_wanted_delay_max;
67 int cap_release_safety;
68 int max_readdir; /* max readdir result (entires) */
69 int max_readdir_bytes; /* max readdir result (bytes) */
69 char *snapdir_name; /* default ".snap" */ 70 char *snapdir_name; /* default ".snap" */
70 char *name; 71 char *name;
71 char *secret; 72 char *secret;
72 int cap_release_safety;
73}; 73};
74 74
75/* 75/*
@@ -80,13 +80,14 @@ struct ceph_mount_args {
80#define CEPH_OSD_KEEPALIVE_DEFAULT 5 80#define CEPH_OSD_KEEPALIVE_DEFAULT 5
81#define CEPH_OSD_IDLE_TTL_DEFAULT 60 81#define CEPH_OSD_IDLE_TTL_DEFAULT 60
82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */ 82#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
83#define CEPH_MAX_READDIR_DEFAULT 1024
84#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
83 85
84#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 86#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
85#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) 87#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
86 88
87#define CEPH_SNAPDIRNAME_DEFAULT ".snap" 89#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
88#define CEPH_AUTH_NAME_DEFAULT "guest" 90#define CEPH_AUTH_NAME_DEFAULT "guest"
89
90/* 91/*
91 * Delay telling the MDS we no longer want caps, in case we reopen 92 * Delay telling the MDS we no longer want caps, in case we reopen
92 * the file. Delay a minimum amount of time, even if we send a cap 93 * the file. Delay a minimum amount of time, even if we send a cap
@@ -96,6 +97,7 @@ struct ceph_mount_args {
96#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ 97#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
97#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ 98#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
98 99
100#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
99 101
100/* mount state */ 102/* mount state */
101enum { 103enum {
@@ -160,12 +162,6 @@ struct ceph_client {
160#endif 162#endif
161}; 163};
162 164
163static inline struct ceph_client *ceph_client(struct super_block *sb)
164{
165 return sb->s_fs_info;
166}
167
168
169/* 165/*
170 * File i/o capability. This tracks shared state with the metadata 166 * File i/o capability. This tracks shared state with the metadata
171 * server that allows us to cache or writeback attributes or to read 167 * server that allows us to cache or writeback attributes or to read
@@ -814,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
814 810
815extern void ceph_queue_caps_release(struct inode *inode); 811extern void ceph_queue_caps_release(struct inode *inode);
816extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
817extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 813extern int ceph_fsync(struct file *file, int datasync);
818extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
819 struct ceph_mds_session *session); 815 struct ceph_mds_session *session);
820extern int ceph_get_cap_mds(struct inode *inode); 816extern int ceph_get_cap_mds(struct inode *inode);
@@ -871,6 +867,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
871extern void ceph_dentry_lru_add(struct dentry *dn); 867extern void ceph_dentry_lru_add(struct dentry *dn);
872extern void ceph_dentry_lru_touch(struct dentry *dn); 868extern void ceph_dentry_lru_touch(struct dentry *dn);
873extern void ceph_dentry_lru_del(struct dentry *dn); 869extern void ceph_dentry_lru_del(struct dentry *dn);
870extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
874 871
875/* 872/*
876 * our d_ops vary depending on whether the inode is live, 873 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
7 7
8static bool ceph_is_valid_xattr(const char *name) 8static bool ceph_is_valid_xattr(const char *name)
9{ 9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX, 10 return !strncmp(name, "ceph.", 5) ||
11 !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) || 12 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 13 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 14 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
76} 77}
77 78
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = { 79static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries}, 80 { true, "ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files}, 81 { true, "ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs}, 82 { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries}, 83 { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles}, 84 { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs}, 85 { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes}, 86 { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime}, 87 { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL } 88 { true, NULL, NULL }
88}; 89};
89 90
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
107} 108}
108 109
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = { 110static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout}, 111 { true, "ceph.layout", ceph_vxattrcb_layout},
111 { NULL, NULL } 112 { NULL, NULL }
112}; 113};
113 114
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
186 ci->i_xattrs.names_size -= xattr->name_len; 187 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len; 188 ci->i_xattrs.vals_size -= xattr->val_len;
188 } 189 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 xattr->val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len; 190 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len; 191 ci->i_xattrs.vals_size += val_len;
197 if (val) 192 if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
574 ci->i_xattrs.version, ci->i_xattrs.index_version); 569 ci->i_xattrs.version, ci->i_xattrs.index_version);
575 570
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && 571 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version > ci->i_xattrs.version)) { 572 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr; 573 goto list_xattr;
579 } else { 574 } else {
580 spin_unlock(&inode->i_lock); 575 spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 617static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags) 618 const char *value, size_t size, int flags)
624{ 619{
625 struct ceph_client *client = ceph_client(dentry->d_sb); 620 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode; 621 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode); 622 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode; 623 struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
641 return -ENOMEM; 636 return -ENOMEM;
642 err = -ENOMEM; 637 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) { 638 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS); 639 pages[i] = __page_cache_alloc(GFP_NOFS);
645 if (!pages[i]) { 640 if (!pages[i]) {
646 nr_pages = i; 641 nr_pages = i;
647 goto out; 642 goto out;
@@ -779,7 +774,7 @@ out:
779 774
780static int ceph_send_removexattr(struct dentry *dentry, const char *name) 775static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{ 776{
782 struct ceph_client *client = ceph_client(dentry->d_sb); 777 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc; 778 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode; 779 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode; 780 struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0242ff9cbf41..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a83541ec9713..75541af4b3db 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1676 return rc; 1676 return rc;
1677} 1677}
1678 1678
1679int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1679int cifs_fsync(struct file *file, int datasync)
1680{ 1680{
1681 int xid; 1681 int xid;
1682 int rc = 0; 1682 int rc = 0;
@@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1688 xid = GetXid(); 1688 xid = GetXid();
1689 1689
1690 cFYI(1, "Sync file - name: %s datasync: 0x%x", 1690 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1691 dentry->d_name.name, datasync); 1691 file->f_path.dentry->d_name.name, datasync);
1692 1692
1693 rc = filemap_write_and_wait(inode->i_mapping); 1693 rc = filemap_write_and_wait(inode->i_mapping);
1694 if (rc == 0) { 1694 if (rc == 0) {
@@ -1952,6 +1952,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1952 bytes_read -= PAGE_CACHE_SIZE; 1952 bytes_read -= PAGE_CACHE_SIZE;
1953 continue; 1953 continue;
1954 } 1954 }
1955 page_cache_release(page);
1955 1956
1956 target = kmap_atomic(page, KM_USER0); 1957 target = kmap_atomic(page, KM_USER0);
1957 1958
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, 14int coda_fsync(struct file *coda_file, int datasync);
15 int datasync);
16void coda_sysctl_init(void); 15void coda_sysctl_init(void);
17void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
18 17
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 4c813f2cdc52..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
202 return 0; 202 return 0;
203} 203}
204 204
205int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 205int coda_fsync(struct file *coda_file, int datasync)
206{ 206{
207 struct file *host_file; 207 struct file *host_file;
208 struct inode *coda_inode = coda_dentry->d_inode; 208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 209 struct coda_file_info *cfi;
210 int err = 0; 210 int err = 0;
211 211
@@ -217,7 +217,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); 217 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
218 host_file = cfi->cfi_container; 218 host_file = cfi->cfi_container;
219 219
220 err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); 220 err = vfs_fsync(host_file, datasync);
221 if ( !err && !datasync ) { 221 if ( !err && !datasync ) {
222 lock_kernel(); 222 lock_kernel();
223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); 223 err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Pioctl operations for Coda. 2 * Pioctl operations for Coda.
3 * Original version: (C) 1996 Peter Braam 3 * Original version: (C) 1996 Peter Braam
4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University 4 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
5 * 5 *
6 * Carnegie Mellon encourages users of this code to contribute improvements 6 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
23#include <linux/coda_fs_i.h> 23#include <linux/coda_fs_i.h>
24#include <linux/coda_psdev.h> 24#include <linux/coda_psdev.h>
25 25
26#include <linux/smp_lock.h>
27
26/* pioctl ops */ 28/* pioctl ops */
27static int coda_ioctl_permission(struct inode *inode, int mask); 29static int coda_ioctl_permission(struct inode *inode, int mask);
28static int coda_pioctl(struct inode * inode, struct file * filp, 30static long coda_pioctl(struct file *filp, unsigned int cmd,
29 unsigned int cmd, unsigned long user_data); 31 unsigned long user_data);
30 32
31/* exported from this file */ 33/* exported from this file */
32const struct inode_operations coda_ioctl_inode_operations = 34const struct inode_operations coda_ioctl_inode_operations = {
33{
34 .permission = coda_ioctl_permission, 35 .permission = coda_ioctl_permission,
35 .setattr = coda_setattr, 36 .setattr = coda_setattr,
36}; 37};
37 38
38const struct file_operations coda_ioctl_operations = { 39const struct file_operations coda_ioctl_operations = {
39 .owner = THIS_MODULE, 40 .owner = THIS_MODULE,
40 .ioctl = coda_pioctl, 41 .unlocked_ioctl = coda_pioctl,
41}; 42};
42 43
43/* the coda pioctl inode ops */ 44/* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
46 return (mask & MAY_EXEC) ? -EACCES : 0; 47 return (mask & MAY_EXEC) ? -EACCES : 0;
47} 48}
48 49
49static int coda_pioctl(struct inode * inode, struct file * filp, 50static long coda_pioctl(struct file *filp, unsigned int cmd,
50 unsigned int cmd, unsigned long user_data) 51 unsigned long user_data)
51{ 52{
52 struct path path; 53 struct path path;
53 int error; 54 int error;
54 struct PioctlData data; 55 struct PioctlData data;
55 struct inode *target_inode = NULL; 56 struct inode *inode = filp->f_dentry->d_inode;
56 struct coda_inode_info *cnp; 57 struct inode *target_inode = NULL;
58 struct coda_inode_info *cnp;
57 59
58 /* get the Pioctl data arguments from user space */ 60 lock_kernel();
59 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) { 61
60 return -EINVAL; 62 /* get the Pioctl data arguments from user space */
61 } 63 if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
62 64 error = -EINVAL;
63 /* 65 goto out;
64 * Look up the pathname. Note that the pathname is in
65 * user memory, and namei takes care of this
66 */
67 if (data.follow) {
68 error = user_path(data.path, &path);
69 } else {
70 error = user_lpath(data.path, &path);
71 } 66 }
72 67
73 if ( error ) { 68 /*
74 return error; 69 * Look up the pathname. Note that the pathname is in
75 } else { 70 * user memory, and namei takes care of this
71 */
72 if (data.follow)
73 error = user_path(data.path, &path);
74 else
75 error = user_lpath(data.path, &path);
76
77 if (error)
78 goto out;
79 else
76 target_inode = path.dentry->d_inode; 80 target_inode = path.dentry->d_inode;
77 } 81
78
79 /* return if it is not a Coda inode */ 82 /* return if it is not a Coda inode */
80 if ( target_inode->i_sb != inode->i_sb ) { 83 if (target_inode->i_sb != inode->i_sb) {
81 path_put(&path); 84 path_put(&path);
82 return -EINVAL; 85 error = -EINVAL;
86 goto out;
83 } 87 }
84 88
85 /* now proceed to make the upcall */ 89 /* now proceed to make the upcall */
86 cnp = ITOC(target_inode); 90 cnp = ITOC(target_inode);
87 91
88 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data); 92 error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
89 93
90 path_put(&path); 94 path_put(&path);
91 return error;
92}
93 95
96out:
97 unlock_kernel();
98 return error;
99}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
73 return mask; 73 return mask;
74} 74}
75 75
76static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 76static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
77 unsigned int cmd, unsigned long arg)
78{ 77{
79 unsigned int data; 78 unsigned int data;
80 79
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
344 .read = coda_psdev_read, 343 .read = coda_psdev_read,
345 .write = coda_psdev_write, 344 .write = coda_psdev_write,
346 .poll = coda_psdev_poll, 345 .poll = coda_psdev_poll,
347 .ioctl = coda_psdev_ioctl, 346 .unlocked_ioctl = coda_psdev_ioctl,
348 .open = coda_psdev_open, 347 .open = coda_psdev_open,
349 .release = coda_psdev_release, 348 .release = coda_psdev_release,
350}; 349};
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..6490d2134ff3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
568 return ret; 568 return ret;
569} 569}
570 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should -EINVAL if an element length is not >= 0 and fitting an
608 * ssize_t. The total length is fitting an ssize_t
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
571static inline long 644static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 646{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 675 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 677 return ret;
605} 678}
606 679
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1150{
1078 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1081 ssize_t ret; 1154 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1155 io_fn_t fn;
1084 iov_fn_t fnv; 1156 iov_fn_t fnv;
1085 1157
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1158 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1159 if (!file->f_op)
1103 goto out; 1160 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1161
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1162 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1164 goto out;
1113 1165
1114 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1168 if (tot_len == 0) {
1145 ret = 0; 1169 ret = 0;
1146 goto out; 1170 goto out;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
73 return -EINVAL; 73 return -EINVAL;
74 74
75 sd_iattr = sd->s_iattr; 75 sd_iattr = sd->s_iattr;
76
77 error = inode_change_ok(inode, iattr);
78 if (error)
79 return error;
80
81 error = inode_setattr(inode, iattr);
82 if (error)
83 return error;
84
85 if (!sd_iattr) { 76 if (!sd_iattr) {
86 /* setting attributes for the first time, allocate now */ 77 /* setting attributes for the first time, allocate now */
87 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 78 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
94 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 85 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
95 sd->s_iattr = sd_iattr; 86 sd->s_iattr = sd_iattr;
96 } 87 }
97
98 /* attributes were changed atleast once in past */ 88 /* attributes were changed atleast once in past */
99 89
90 error = simple_setattr(dentry, iattr);
91 if (error)
92 return error;
93
100 if (ia_valid & ATTR_UID) 94 if (ia_valid & ATTR_UID)
101 sd_iattr->ia_uid = iattr->ia_uid; 95 sd_iattr->ia_uid = iattr->ia_uid;
102 if (ia_valid & ATTR_GID) 96 if (ia_valid & ATTR_GID)
diff --git a/fs/dcache.c b/fs/dcache.c
index f1358e5c3a59..d96047b4a633 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -536,7 +536,7 @@ restart:
536 */ 536 */
537static void prune_dcache(int count) 537static void prune_dcache(int count)
538{ 538{
539 struct super_block *sb; 539 struct super_block *sb, *n;
540 int w_count; 540 int w_count;
541 int unused = dentry_stat.nr_unused; 541 int unused = dentry_stat.nr_unused;
542 int prune_ratio; 542 int prune_ratio;
@@ -545,13 +545,14 @@ static void prune_dcache(int count)
545 if (unused == 0 || count == 0) 545 if (unused == 0 || count == 0)
546 return; 546 return;
547 spin_lock(&dcache_lock); 547 spin_lock(&dcache_lock);
548restart:
549 if (count >= unused) 548 if (count >= unused)
550 prune_ratio = 1; 549 prune_ratio = 1;
551 else 550 else
552 prune_ratio = unused / count; 551 prune_ratio = unused / count;
553 spin_lock(&sb_lock); 552 spin_lock(&sb_lock);
554 list_for_each_entry(sb, &super_blocks, s_list) { 553 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
554 if (list_empty(&sb->s_instances))
555 continue;
555 if (sb->s_nr_dentry_unused == 0) 556 if (sb->s_nr_dentry_unused == 0)
556 continue; 557 continue;
557 sb->s_count++; 558 sb->s_count++;
@@ -590,14 +591,10 @@ restart:
590 } 591 }
591 spin_lock(&sb_lock); 592 spin_lock(&sb_lock);
592 count -= pruned; 593 count -= pruned;
593 /* 594 __put_super(sb);
594 * restart only when sb is no longer on the list and 595 /* more work left to do? */
595 * we have more work to do. 596 if (count <= 0)
596 */ 597 break;
597 if (__put_super_and_need_restart(sb) && count > 0) {
598 spin_unlock(&sb_lock);
599 goto restart;
600 }
601 } 598 }
602 spin_unlock(&sb_lock); 599 spin_unlock(&sb_lock);
603 spin_unlock(&dcache_lock); 600 spin_unlock(&dcache_lock);
@@ -1529,6 +1526,7 @@ void d_delete(struct dentry * dentry)
1529 spin_lock(&dentry->d_lock); 1526 spin_lock(&dentry->d_lock);
1530 isdir = S_ISDIR(dentry->d_inode->i_mode); 1527 isdir = S_ISDIR(dentry->d_inode->i_mode);
1531 if (atomic_read(&dentry->d_count) == 1) { 1528 if (atomic_read(&dentry->d_count) == 1) {
1529 dentry->d_flags &= ~DCACHE_CANT_MOUNT;
1532 dentry_iput(dentry); 1530 dentry_iput(dentry);
1533 fsnotify_nameremove(dentry, isdir); 1531 fsnotify_nameremove(dentry, isdir);
1534 return; 1532 return;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
369 * @value: a pointer to the variable that the file should read to and write
370 * from.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 0120247b41c0..8b3ffd5b5235 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -384,18 +384,15 @@ static int devpts_get_sb(struct file_system_type *fs_type,
384 s->s_flags |= MS_ACTIVE; 384 s->s_flags |= MS_ACTIVE;
385 } 385 }
386 386
387 simple_set_mnt(mnt, s);
388
389 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts)); 387 memcpy(&(DEVPTS_SB(s))->mount_opts, &opts, sizeof(opts));
390 388
391 error = mknod_ptmx(s); 389 error = mknod_ptmx(s);
392 if (error) 390 if (error)
393 goto out_dput; 391 goto out_undo_sget;
394 392
395 return 0; 393 simple_set_mnt(mnt, s);
396 394
397out_dput: 395 return 0;
398 dput(s->s_root); /* undo dget() in simple_set_mnt() */
399 396
400out_undo_sget: 397out_undo_sget:
401 deactivate_locked_super(s); 398 deactivate_locked_super(s);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submition function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
308 * @bio: The direct io bio thats being completed
309 * @error: Error if there was one
310 *
311 * This is meant to be called by any filesystem that uses their own dio_submit_t
312 * so that the DIO specific endio actions are dealt with after the filesystem
313 * has done it's completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
642 * Btrfs cannot handl having logically non-contiguous requests
643 * submitted. For exmple if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
646 * Phyiscal: [0-4095] [4096-8181]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1087 return ret; 1134 return ret;
1088} 1135}
1089 1136
1090/*
1091 * This is a library function for use by filesystem drivers.
1092 *
1093 * The locking rules are governed by the flags parameter:
1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1102 *
1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1104 * internal locking but rather rely on the filesystem to synchronize
1105 * direct I/O reads/writes versus each other and truncate.
1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1107 * entry and are never taken.
1108 */
1109ssize_t 1137ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1139 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1141 dio_submit_t submit_io, int flags)
1114{ 1142{
1115 int seg; 1143 int seg;
1116 size_t size; 1144 size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1225 (end > i_size_read(inode)));
1198 1226
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1227 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1228 nr_segs, blkbits, get_block, end_io,
1229 submit_io, dio);
1230
1231out:
1232 return retval;
1233}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1201 1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1202 /* 1265 /*
1203 * In case of error extending write may have instantiated a few 1266 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1205 * 1271 *
1206 * NOTE: filesystems with their own locking have to handle this 1272 * NOTE: filesystems with their own locking have to handle this
1207 * on their own. 1273 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 if (flags & DIO_LOCKING) { 1275 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) { 1276 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode); 1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1212 if (end > isize) 1280 if (end > isize)
1213 vmtruncate(inode, isize); 1281 vmtruncate(inode, isize);
1214 } 1282 }
1215 } 1283 }
1216 1284
1217out:
1218 return retval; 1285 return retval;
1219} 1286}
1220EXPORT_SYMBOL(__blockdev_direct_IO); 1287EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 31f4b0e6d72c..83c4f600786a 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -12,7 +12,7 @@
12/* A global variable is a bit ugly, but it keeps the code simple */ 12/* A global variable is a bit ugly, but it keeps the code simple */
13int sysctl_drop_caches; 13int sysctl_drop_caches;
14 14
15static void drop_pagecache_sb(struct super_block *sb) 15static void drop_pagecache_sb(struct super_block *sb, void *unused)
16{ 16{
17 struct inode *inode, *toput_inode = NULL; 17 struct inode *inode, *toput_inode = NULL;
18 18
@@ -33,26 +33,6 @@ static void drop_pagecache_sb(struct super_block *sb)
33 iput(toput_inode); 33 iput(toput_inode);
34} 34}
35 35
36static void drop_pagecache(void)
37{
38 struct super_block *sb;
39
40 spin_lock(&sb_lock);
41restart:
42 list_for_each_entry(sb, &super_blocks, s_list) {
43 sb->s_count++;
44 spin_unlock(&sb_lock);
45 down_read(&sb->s_umount);
46 if (sb->s_root)
47 drop_pagecache_sb(sb);
48 up_read(&sb->s_umount);
49 spin_lock(&sb_lock);
50 if (__put_super_and_need_restart(sb))
51 goto restart;
52 }
53 spin_unlock(&sb_lock);
54}
55
56static void drop_slab(void) 36static void drop_slab(void)
57{ 37{
58 int nr_objects; 38 int nr_objects;
@@ -68,7 +48,7 @@ int drop_caches_sysctl_handler(ctl_table *table, int write,
68 proc_dointvec_minmax(table, write, buffer, length, ppos); 48 proc_dointvec_minmax(table, write, buffer, length, ppos);
69 if (write) { 49 if (write) {
70 if (sysctl_drop_caches & 1) 50 if (sysctl_drop_caches & 1)
71 drop_pagecache(); 51 iterate_supers(drop_pagecache_sb, NULL);
72 if (sysctl_drop_caches & 2) 52 if (sysctl_drop_caches & 2)
73 drop_slab(); 53 drop_slab();
74 } 54 }
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index bfc2e0f78f00..0032a9f5a3a9 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -731,15 +731,14 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, 731int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
732 struct page *page_for_lower, 732 struct page *page_for_lower,
733 size_t offset_in_page, size_t size); 733 size_t offset_in_page, size_t size);
734int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 734int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
735 size_t size);
736int ecryptfs_read_lower(char *data, loff_t offset, size_t size, 735int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
737 struct inode *ecryptfs_inode); 736 struct inode *ecryptfs_inode);
738int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, 737int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
739 pgoff_t page_index, 738 pgoff_t page_index,
740 size_t offset_in_page, size_t size, 739 size_t offset_in_page, size_t size,
741 struct inode *ecryptfs_inode); 740 struct inode *ecryptfs_inode);
742struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); 741struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
743int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); 742int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon);
744int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, 743int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid,
745 struct user_namespace *user_ns); 744 struct user_namespace *user_ns);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index e7440a6f5ebf..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -274,11 +274,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
274} 274}
275 275
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280 ecryptfs_dentry_to_lower(dentry),
281 datasync);
282} 280}
283 281
284static int ecryptfs_fasync(int fd, struct file *file, int flag) 282static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2d4418affac..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -142,19 +142,10 @@ out:
142static int grow_file(struct dentry *ecryptfs_dentry) 142static int grow_file(struct dentry *ecryptfs_dentry)
143{ 143{
144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; 144 struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode;
145 struct file fake_file;
146 struct ecryptfs_file_info tmp_file_info;
147 char zero_virt[] = { 0x00 }; 145 char zero_virt[] = { 0x00 };
148 int rc = 0; 146 int rc = 0;
149 147
150 memset(&fake_file, 0, sizeof(fake_file)); 148 rc = ecryptfs_write(ecryptfs_inode, zero_virt, 0, 1);
151 fake_file.f_path.dentry = ecryptfs_dentry;
152 memset(&tmp_file_info, 0, sizeof(tmp_file_info));
153 ecryptfs_set_file_private(&fake_file, &tmp_file_info);
154 ecryptfs_set_file_lower(
155 &fake_file,
156 ecryptfs_inode_to_private(ecryptfs_inode)->lower_file);
157 rc = ecryptfs_write(&fake_file, zero_virt, 0, 1);
158 i_size_write(ecryptfs_inode, 0); 149 i_size_write(ecryptfs_inode, 0);
159 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode); 150 rc = ecryptfs_write_inode_size_to_metadata(ecryptfs_inode);
160 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |= 151 ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat.flags |=
@@ -784,8 +775,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
784{ 775{
785 int rc = 0; 776 int rc = 0;
786 struct inode *inode = dentry->d_inode; 777 struct inode *inode = dentry->d_inode;
787 struct dentry *lower_dentry;
788 struct file fake_ecryptfs_file;
789 struct ecryptfs_crypt_stat *crypt_stat; 778 struct ecryptfs_crypt_stat *crypt_stat;
790 loff_t i_size = i_size_read(inode); 779 loff_t i_size = i_size_read(inode);
791 loff_t lower_size_before_truncate; 780 loff_t lower_size_before_truncate;
@@ -796,23 +785,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
796 goto out; 785 goto out;
797 } 786 }
798 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; 787 crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat;
799 /* Set up a fake ecryptfs file, this is used to interface with
800 * the file in the underlying filesystem so that the
801 * truncation has an effect there as well. */
802 memset(&fake_ecryptfs_file, 0, sizeof(fake_ecryptfs_file));
803 fake_ecryptfs_file.f_path.dentry = dentry;
804 /* Released at out_free: label */
805 ecryptfs_set_file_private(&fake_ecryptfs_file,
806 kmem_cache_alloc(ecryptfs_file_info_cache,
807 GFP_KERNEL));
808 if (unlikely(!ecryptfs_file_to_private(&fake_ecryptfs_file))) {
809 rc = -ENOMEM;
810 goto out;
811 }
812 lower_dentry = ecryptfs_dentry_to_lower(dentry);
813 ecryptfs_set_file_lower(
814 &fake_ecryptfs_file,
815 ecryptfs_inode_to_private(dentry->d_inode)->lower_file);
816 /* Switch on growing or shrinking file */ 788 /* Switch on growing or shrinking file */
817 if (ia->ia_size > i_size) { 789 if (ia->ia_size > i_size) {
818 char zero[] = { 0x00 }; 790 char zero[] = { 0x00 };
@@ -822,7 +794,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
822 * this triggers code that will fill in 0's throughout 794 * this triggers code that will fill in 0's throughout
823 * the intermediate portion of the previous end of the 795 * the intermediate portion of the previous end of the
824 * file and the new and of the file */ 796 * file and the new and of the file */
825 rc = ecryptfs_write(&fake_ecryptfs_file, zero, 797 rc = ecryptfs_write(inode, zero,
826 (ia->ia_size - 1), 1); 798 (ia->ia_size - 1), 1);
827 } else { /* ia->ia_size < i_size_read(inode) */ 799 } else { /* ia->ia_size < i_size_read(inode) */
828 /* We're chopping off all the pages down to the page 800 /* We're chopping off all the pages down to the page
@@ -833,12 +805,12 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
833 - (ia->ia_size & ~PAGE_CACHE_MASK)); 805 - (ia->ia_size & ~PAGE_CACHE_MASK));
834 806
835 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
836 rc = vmtruncate(inode, ia->ia_size); 808 rc = simple_setsize(inode, ia->ia_size);
837 if (rc) 809 if (rc)
838 goto out_free; 810 goto out;
839 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
840 lower_ia->ia_valid |= ATTR_SIZE; 812 lower_ia->ia_valid |= ATTR_SIZE;
841 goto out_free; 813 goto out;
842 } 814 }
843 if (num_zeros) { 815 if (num_zeros) {
844 char *zeros_virt; 816 char *zeros_virt;
@@ -846,25 +818,25 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
846 zeros_virt = kzalloc(num_zeros, GFP_KERNEL); 818 zeros_virt = kzalloc(num_zeros, GFP_KERNEL);
847 if (!zeros_virt) { 819 if (!zeros_virt) {
848 rc = -ENOMEM; 820 rc = -ENOMEM;
849 goto out_free; 821 goto out;
850 } 822 }
851 rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, 823 rc = ecryptfs_write(inode, zeros_virt,
852 ia->ia_size, num_zeros); 824 ia->ia_size, num_zeros);
853 kfree(zeros_virt); 825 kfree(zeros_virt);
854 if (rc) { 826 if (rc) {
855 printk(KERN_ERR "Error attempting to zero out " 827 printk(KERN_ERR "Error attempting to zero out "
856 "the remainder of the end page on " 828 "the remainder of the end page on "
857 "reducing truncate; rc = [%d]\n", rc); 829 "reducing truncate; rc = [%d]\n", rc);
858 goto out_free; 830 goto out;
859 } 831 }
860 } 832 }
861 vmtruncate(inode, ia->ia_size); 833 simple_setsize(inode, ia->ia_size);
862 rc = ecryptfs_write_inode_size_to_metadata(inode); 834 rc = ecryptfs_write_inode_size_to_metadata(inode);
863 if (rc) { 835 if (rc) {
864 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
865 "ecryptfs_write_inode_size_to_metadata; " 837 "ecryptfs_write_inode_size_to_metadata; "
866 "rc = [%d]\n", rc); 838 "rc = [%d]\n", rc);
867 goto out_free; 839 goto out;
868 } 840 }
869 /* We are reducing the size of the ecryptfs file, and need to 841 /* We are reducing the size of the ecryptfs file, and need to
870 * know if we need to reduce the size of the lower file. */ 842 * know if we need to reduce the size of the lower file. */
@@ -878,10 +850,6 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
878 } else 850 } else
879 lower_ia->ia_valid &= ~ATTR_SIZE; 851 lower_ia->ia_valid &= ~ATTR_SIZE;
880 } 852 }
881out_free:
882 if (ecryptfs_file_to_private(&fake_ecryptfs_file))
883 kmem_cache_free(ecryptfs_file_info_cache,
884 ecryptfs_file_to_private(&fake_ecryptfs_file));
885out: 853out:
886 return rc; 854 return rc;
887} 855}
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 760983d0f25e..cbd4e18adb20 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -281,7 +281,7 @@ static void ecryptfs_init_mount_crypt_stat(
281 * 281 *
282 * Returns zero on success; non-zero on error 282 * Returns zero on success; non-zero on error
283 */ 283 */
284static int ecryptfs_parse_options(struct super_block *sb, char *options) 284static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options)
285{ 285{
286 char *p; 286 char *p;
287 int rc = 0; 287 int rc = 0;
@@ -293,7 +293,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options)
293 int fn_cipher_key_bytes; 293 int fn_cipher_key_bytes;
294 int fn_cipher_key_bytes_set = 0; 294 int fn_cipher_key_bytes_set = 0;
295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat = 295 struct ecryptfs_mount_crypt_stat *mount_crypt_stat =
296 &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; 296 &sbi->mount_crypt_stat;
297 substring_t args[MAX_OPT_ARGS]; 297 substring_t args[MAX_OPT_ARGS];
298 int token; 298 int token;
299 char *sig_src; 299 char *sig_src;
@@ -483,68 +483,7 @@ out:
483} 483}
484 484
485struct kmem_cache *ecryptfs_sb_info_cache; 485struct kmem_cache *ecryptfs_sb_info_cache;
486 486static struct file_system_type ecryptfs_fs_type;
487/**
488 * ecryptfs_fill_super
489 * @sb: The ecryptfs super block
490 * @raw_data: The options passed to mount
491 * @silent: Not used but required by function prototype
492 *
493 * Sets up what we can of the sb, rest is done in ecryptfs_read_super
494 *
495 * Returns zero on success; non-zero otherwise
496 */
497static int
498ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent)
499{
500 struct ecryptfs_sb_info *esi;
501 int rc = 0;
502
503 /* Released in ecryptfs_put_super() */
504 ecryptfs_set_superblock_private(sb,
505 kmem_cache_zalloc(ecryptfs_sb_info_cache,
506 GFP_KERNEL));
507 esi = ecryptfs_superblock_to_private(sb);
508 if (!esi) {
509 ecryptfs_printk(KERN_WARNING, "Out of memory\n");
510 rc = -ENOMEM;
511 goto out;
512 }
513
514 rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
515 if (rc)
516 goto out;
517
518 sb->s_bdi = &esi->bdi;
519 sb->s_op = &ecryptfs_sops;
520 /* Released through deactivate_super(sb) from get_sb_nodev */
521 sb->s_root = d_alloc(NULL, &(const struct qstr) {
522 .hash = 0,.name = "/",.len = 1});
523 if (!sb->s_root) {
524 ecryptfs_printk(KERN_ERR, "d_alloc failed\n");
525 rc = -ENOMEM;
526 goto out;
527 }
528 sb->s_root->d_op = &ecryptfs_dops;
529 sb->s_root->d_sb = sb;
530 sb->s_root->d_parent = sb->s_root;
531 /* Released in d_release when dput(sb->s_root) is called */
532 /* through deactivate_super(sb) from get_sb_nodev() */
533 ecryptfs_set_dentry_private(sb->s_root,
534 kmem_cache_zalloc(ecryptfs_dentry_info_cache,
535 GFP_KERNEL));
536 if (!ecryptfs_dentry_to_private(sb->s_root)) {
537 ecryptfs_printk(KERN_ERR,
538 "dentry_info_cache alloc failed\n");
539 rc = -ENOMEM;
540 goto out;
541 }
542 rc = 0;
543out:
544 /* Should be able to rely on deactivate_super called from
545 * get_sb_nodev */
546 return rc;
547}
548 487
549/** 488/**
550 * ecryptfs_read_super 489 * ecryptfs_read_super
@@ -565,6 +504,13 @@ static int ecryptfs_read_super(struct super_block *sb, const char *dev_name)
565 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n"); 504 ecryptfs_printk(KERN_WARNING, "path_lookup() failed\n");
566 goto out; 505 goto out;
567 } 506 }
507 if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) {
508 rc = -EINVAL;
509 printk(KERN_ERR "Mount on filesystem of type "
510 "eCryptfs explicitly disallowed due to "
511 "known incompatibilities\n");
512 goto out_free;
513 }
568 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb); 514 ecryptfs_set_superblock_lower(sb, path.dentry->d_sb);
569 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes; 515 sb->s_maxbytes = path.dentry->d_sb->s_maxbytes;
570 sb->s_blocksize = path.dentry->d_sb->s_blocksize; 516 sb->s_blocksize = path.dentry->d_sb->s_blocksize;
@@ -588,11 +534,8 @@ out:
588 * @dev_name: The path to mount over 534 * @dev_name: The path to mount over
589 * @raw_data: The options passed into the kernel 535 * @raw_data: The options passed into the kernel
590 * 536 *
591 * The whole ecryptfs_get_sb process is broken into 4 functions: 537 * The whole ecryptfs_get_sb process is broken into 3 functions:
592 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any 538 * ecryptfs_parse_options(): handle options passed to ecryptfs, if any
593 * ecryptfs_fill_super(): used by get_sb_nodev, fills out the super_block
594 * with as much information as it can before needing
595 * the lower filesystem.
596 * ecryptfs_read_super(): this accesses the lower filesystem and uses 539 * ecryptfs_read_super(): this accesses the lower filesystem and uses
597 * ecryptfs_interpose to perform most of the linking 540 * ecryptfs_interpose to perform most of the linking
598 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) 541 * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c)
@@ -601,30 +544,78 @@ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags,
601 const char *dev_name, void *raw_data, 544 const char *dev_name, void *raw_data,
602 struct vfsmount *mnt) 545 struct vfsmount *mnt)
603{ 546{
547 struct super_block *s;
548 struct ecryptfs_sb_info *sbi;
549 struct ecryptfs_dentry_info *root_info;
550 const char *err = "Getting sb failed";
604 int rc; 551 int rc;
605 struct super_block *sb;
606 552
607 rc = get_sb_nodev(fs_type, flags, raw_data, ecryptfs_fill_super, mnt); 553 sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL);
608 if (rc < 0) { 554 if (!sbi) {
609 printk(KERN_ERR "Getting sb failed; rc = [%d]\n", rc); 555 rc = -ENOMEM;
610 goto out; 556 goto out;
611 } 557 }
612 sb = mnt->mnt_sb; 558
613 rc = ecryptfs_parse_options(sb, raw_data); 559 rc = ecryptfs_parse_options(sbi, raw_data);
614 if (rc) { 560 if (rc) {
615 printk(KERN_ERR "Error parsing options; rc = [%d]\n", rc); 561 err = "Error parsing options";
616 goto out_abort; 562 goto out;
563 }
564
565 s = sget(fs_type, NULL, set_anon_super, NULL);
566 if (IS_ERR(s)) {
567 rc = PTR_ERR(s);
568 goto out;
617 } 569 }
618 rc = ecryptfs_read_super(sb, dev_name); 570
571 s->s_flags = flags;
572 rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
619 if (rc) { 573 if (rc) {
620 printk(KERN_ERR "Reading sb failed; rc = [%d]\n", rc); 574 deactivate_locked_super(s);
621 goto out_abort; 575 goto out;
622 } 576 }
623 goto out; 577
624out_abort: 578 ecryptfs_set_superblock_private(s, sbi);
625 dput(sb->s_root); /* aka mnt->mnt_root, as set by get_sb_nodev() */ 579 s->s_bdi = &sbi->bdi;
626 deactivate_locked_super(sb); 580
581 /* ->kill_sb() will take care of sbi after that point */
582 sbi = NULL;
583 s->s_op = &ecryptfs_sops;
584
585 rc = -ENOMEM;
586 s->s_root = d_alloc(NULL, &(const struct qstr) {
587 .hash = 0,.name = "/",.len = 1});
588 if (!s->s_root) {
589 deactivate_locked_super(s);
590 goto out;
591 }
592 s->s_root->d_op = &ecryptfs_dops;
593 s->s_root->d_sb = s;
594 s->s_root->d_parent = s->s_root;
595
596 root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
597 if (!root_info) {
598 deactivate_locked_super(s);
599 goto out;
600 }
601 /* ->kill_sb() will take care of root_info */
602 ecryptfs_set_dentry_private(s->s_root, root_info);
603 s->s_flags |= MS_ACTIVE;
604 rc = ecryptfs_read_super(s, dev_name);
605 if (rc) {
606 deactivate_locked_super(s);
607 err = "Reading sb failed";
608 goto out;
609 }
610 simple_set_mnt(mnt, s);
611 return 0;
612
627out: 613out:
614 if (sbi) {
615 ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat);
616 kmem_cache_free(ecryptfs_sb_info_cache, sbi);
617 }
618 printk(KERN_ERR "%s; rc = [%d]\n", err, rc);
628 return rc; 619 return rc;
629} 620}
630 621
@@ -633,11 +624,16 @@ out:
633 * @sb: The ecryptfs super block 624 * @sb: The ecryptfs super block
634 * 625 *
635 * Used to bring the superblock down and free the private data. 626 * Used to bring the superblock down and free the private data.
636 * Private data is free'd in ecryptfs_put_super()
637 */ 627 */
638static void ecryptfs_kill_block_super(struct super_block *sb) 628static void ecryptfs_kill_block_super(struct super_block *sb)
639{ 629{
640 generic_shutdown_super(sb); 630 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
631 kill_anon_super(sb);
632 if (!sb_info)
633 return;
634 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
635 bdi_destroy(&sb_info->bdi);
636 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
641} 637}
642 638
643static struct file_system_type ecryptfs_fs_type = { 639static struct file_system_type ecryptfs_fs_type = {
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 2ee9a3a7b68c..b1d82756544b 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -44,17 +44,9 @@
44 * Returns locked and up-to-date page (if ok), with increased 44 * Returns locked and up-to-date page (if ok), with increased
45 * refcnt. 45 * refcnt.
46 */ 46 */
47struct page *ecryptfs_get_locked_page(struct file *file, loff_t index) 47struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
48{ 48{
49 struct dentry *dentry; 49 struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
50 struct inode *inode;
51 struct address_space *mapping;
52 struct page *page;
53
54 dentry = file->f_path.dentry;
55 inode = dentry->d_inode;
56 mapping = inode->i_mapping;
57 page = read_mapping_page(mapping, index, (void *)file);
58 if (!IS_ERR(page)) 50 if (!IS_ERR(page))
59 lock_page(page); 51 lock_page(page);
60 return page; 52 return page;
@@ -198,7 +190,7 @@ out:
198static int ecryptfs_readpage(struct file *file, struct page *page) 190static int ecryptfs_readpage(struct file *file, struct page *page)
199{ 191{
200 struct ecryptfs_crypt_stat *crypt_stat = 192 struct ecryptfs_crypt_stat *crypt_stat =
201 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 193 &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
202 int rc = 0; 194 int rc = 0;
203 195
204 if (!crypt_stat 196 if (!crypt_stat
@@ -300,8 +292,7 @@ static int ecryptfs_write_begin(struct file *file,
300 292
301 if (!PageUptodate(page)) { 293 if (!PageUptodate(page)) {
302 struct ecryptfs_crypt_stat *crypt_stat = 294 struct ecryptfs_crypt_stat *crypt_stat =
303 &ecryptfs_inode_to_private( 295 &ecryptfs_inode_to_private(mapping->host)->crypt_stat;
304 file->f_path.dentry->d_inode)->crypt_stat;
305 296
306 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) 297 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)
307 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { 298 || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) {
@@ -487,7 +478,7 @@ static int ecryptfs_write_end(struct file *file,
487 unsigned to = from + copied; 478 unsigned to = from + copied;
488 struct inode *ecryptfs_inode = mapping->host; 479 struct inode *ecryptfs_inode = mapping->host;
489 struct ecryptfs_crypt_stat *crypt_stat = 480 struct ecryptfs_crypt_stat *crypt_stat =
490 &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; 481 &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat;
491 int rc; 482 int rc;
492 483
493 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { 484 if (crypt_stat->flags & ECRYPTFS_NEW_FILE) {
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0cc4fafd6552..db184ef15d3d 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -93,7 +93,7 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
93 93
94/** 94/**
95 * ecryptfs_write 95 * ecryptfs_write
96 * @ecryptfs_file: The eCryptfs file into which to write 96 * @ecryptfs_inode: The eCryptfs file into which to write
97 * @data: Virtual address where data to write is located 97 * @data: Virtual address where data to write is located
98 * @offset: Offset in the eCryptfs file at which to begin writing the 98 * @offset: Offset in the eCryptfs file at which to begin writing the
99 * data from @data 99 * data from @data
@@ -109,12 +109,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
109 * 109 *
110 * Returns zero on success; non-zero otherwise 110 * Returns zero on success; non-zero otherwise
111 */ 111 */
112int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, 112int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
113 size_t size) 113 size_t size)
114{ 114{
115 struct page *ecryptfs_page; 115 struct page *ecryptfs_page;
116 struct ecryptfs_crypt_stat *crypt_stat; 116 struct ecryptfs_crypt_stat *crypt_stat;
117 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
118 char *ecryptfs_page_virt; 117 char *ecryptfs_page_virt;
119 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode); 118 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
120 loff_t data_offset = 0; 119 loff_t data_offset = 0;
@@ -145,7 +144,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset,
145 if (num_bytes > total_remaining_zeros) 144 if (num_bytes > total_remaining_zeros)
146 num_bytes = total_remaining_zeros; 145 num_bytes = total_remaining_zeros;
147 } 146 }
148 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 147 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
149 ecryptfs_page_idx); 148 ecryptfs_page_idx);
150 if (IS_ERR(ecryptfs_page)) { 149 if (IS_ERR(ecryptfs_page)) {
151 rc = PTR_ERR(ecryptfs_page); 150 rc = PTR_ERR(ecryptfs_page);
@@ -302,10 +301,10 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
302int ecryptfs_read(char *data, loff_t offset, size_t size, 301int ecryptfs_read(char *data, loff_t offset, size_t size,
303 struct file *ecryptfs_file) 302 struct file *ecryptfs_file)
304{ 303{
304 struct inode *ecryptfs_inode = ecryptfs_file->f_dentry->d_inode;
305 struct page *ecryptfs_page; 305 struct page *ecryptfs_page;
306 char *ecryptfs_page_virt; 306 char *ecryptfs_page_virt;
307 loff_t ecryptfs_file_size = 307 loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
308 i_size_read(ecryptfs_file->f_dentry->d_inode);
309 loff_t data_offset = 0; 308 loff_t data_offset = 0;
310 loff_t pos; 309 loff_t pos;
311 int rc = 0; 310 int rc = 0;
@@ -327,7 +326,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size,
327 326
328 if (num_bytes > total_remaining_bytes) 327 if (num_bytes > total_remaining_bytes)
329 num_bytes = total_remaining_bytes; 328 num_bytes = total_remaining_bytes;
330 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_file, 329 ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
331 ecryptfs_page_idx); 330 ecryptfs_page_idx);
332 if (IS_ERR(ecryptfs_page)) { 331 if (IS_ERR(ecryptfs_page)) {
333 rc = PTR_ERR(ecryptfs_page); 332 rc = PTR_ERR(ecryptfs_page);
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0c0ae491d231..0435886e4a9f 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -109,27 +109,6 @@ void ecryptfs_init_inode(struct inode *inode, struct inode *lower_inode)
109} 109}
110 110
111/** 111/**
112 * ecryptfs_put_super
113 * @sb: Pointer to the ecryptfs super block
114 *
115 * Final actions when unmounting a file system.
116 * This will handle deallocation and release of our private data.
117 */
118static void ecryptfs_put_super(struct super_block *sb)
119{
120 struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
121
122 lock_kernel();
123
124 ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
125 bdi_destroy(&sb_info->bdi);
126 kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
127 ecryptfs_set_superblock_private(sb, NULL);
128
129 unlock_kernel();
130}
131
132/**
133 * ecryptfs_statfs 112 * ecryptfs_statfs
134 * @sb: The ecryptfs super block 113 * @sb: The ecryptfs super block
135 * @buf: The struct kstatfs to fill in with stats 114 * @buf: The struct kstatfs to fill in with stats
@@ -203,7 +182,6 @@ const struct super_operations ecryptfs_sops = {
203 .alloc_inode = ecryptfs_alloc_inode, 182 .alloc_inode = ecryptfs_alloc_inode,
204 .destroy_inode = ecryptfs_destroy_inode, 183 .destroy_inode = ecryptfs_destroy_inode,
205 .drop_inode = generic_delete_inode, 184 .drop_inode = generic_delete_inode,
206 .put_super = ecryptfs_put_super,
207 .statfs = ecryptfs_statfs, 185 .statfs = ecryptfs_statfs,
208 .remount_fs = NULL, 186 .remount_fs = NULL,
209 .clear_inode = ecryptfs_clear_inode, 187 .clear_inode = ecryptfs_clear_inode,
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
242 * use STACK_TOP because that can depend on attributes which aren't 242 * use STACK_TOP because that can depend on attributes which aren't
243 * configured yet. 243 * configured yet.
244 */ 244 */
245 BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
245 vma->vm_end = STACK_TOP_MAX; 246 vma->vm_end = STACK_TOP_MAX;
246 vma->vm_start = vma->vm_end - PAGE_SIZE; 247 vma->vm_start = vma->vm_end - PAGE_SIZE;
247 vma->vm_flags = VM_STACK_FLAGS; 248 vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
248 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 249 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
249 INIT_LIST_HEAD(&vma->anon_vma_chain); 250 INIT_LIST_HEAD(&vma->anon_vma_chain);
250 err = insert_vm_struct(mm, vma); 251 err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
616 else if (executable_stack == EXSTACK_DISABLE_X) 617 else if (executable_stack == EXSTACK_DISABLE_X)
617 vm_flags &= ~VM_EXEC; 618 vm_flags &= ~VM_EXEC;
618 vm_flags |= mm->def_flags; 619 vm_flags |= mm->def_flags;
620 vm_flags |= VM_STACK_INCOMPLETE_SETUP;
619 621
620 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end, 622 ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
621 vm_flags); 623 vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
630 goto out_unlock; 632 goto out_unlock;
631 } 633 }
632 634
635 /* mprotect_fixup is overkill to remove the temporary stack flags */
636 vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
637
633 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ 638 stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
634 stack_size = vma->vm_end - vma->vm_start; 639 stack_size = vma->vm_end - vma->vm_start;
635 /* 640 /*
@@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
763 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
764 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
765 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
766 int count;
767 771
768 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
769 goto no_thread_group; 773 goto no_thread_group;
@@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
780 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
781 return -EAGAIN; 785 return -EAGAIN;
782 } 786 }
787
783 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
784 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
785 792
786 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
787 count = thread_group_leader(tsk) ? 1 : 2;
788 sig->notify_count = count;
789 while (atomic_read(&sig->count) > count) {
790 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
791 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
792 schedule(); 796 schedule();
@@ -1657,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1657 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1658 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1659 struct completion *vfork_done; 1663 struct completion *vfork_done;
1660 int core_waiters; 1664 int core_waiters = -EBUSY;
1661 1665
1662 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1663 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1664 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1665 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1666 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1667 1674
1668 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1782,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1782} 1789}
1783 1790
1784 1791
1792/*
1793 * uhm_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or
1798 * PTR_ERR on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
1785void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1786{ 1837{
1787 struct core_state core_state; 1838 struct core_state core_state;
1788 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1789 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1790 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1791 struct inode * inode;
1792 const struct cred *old_cred; 1842 const struct cred *old_cred;
1793 struct cred *cred; 1843 struct cred *cred;
1794 int retval = 0; 1844 int retval = 0;
1795 int flag = 0; 1845 int flag = 0;
1796 int ispipe = 0; 1846 int ispipe;
1797 char **helper_argv = NULL;
1798 int helper_argc = 0;
1799 int dump_count = 0;
1800 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1801 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1802 .signr = signr, 1849 .signr = signr,
@@ -1815,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1815 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1816 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1817 goto fail; 1864 goto fail;
1818 1865 if (!__get_dumpable(cprm.mm_flags))
1819 cred = prepare_creds();
1820 if (!cred) {
1821 retval = -ENOMEM;
1822 goto fail; 1866 goto fail;
1823 }
1824 1867
1825 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1826 /* 1869 if (!cred)
1827 * If another thread got here first, or we are not dumpable, bail out.
1828 */
1829 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1830 up_write(&mm->mmap_sem);
1831 put_cred(cred);
1832 goto fail; 1870 goto fail;
1833 }
1834
1835 /* 1871 /*
1836 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1837 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1844,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1844 } 1880 }
1845 1881
1846 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1847 if (retval < 0) { 1883 if (retval < 0)
1848 put_cred(cred); 1884 goto fail_creds;
1849 goto fail;
1850 }
1851 1885
1852 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1853 1887
@@ -1865,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1865 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1866 unlock_kernel(); 1900 unlock_kernel();
1867 1901
1868 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1869 goto fail_unlock;
1870
1871 if (ispipe) { 1902 if (ispipe) {
1872 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1873 /* 1907 /*
1874 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1875 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1876 * cprm.limit of 0 here as a speacial value. Any 1910 * cprm.limit of 1 here as a speacial value. Any
1877 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1878 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1879 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1880 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1881 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1882 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1883 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1885,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1885 * core_pattern process dies. 1919 * core_pattern process dies.
1886 */ 1920 */
1887 printk(KERN_WARNING 1921 printk(KERN_WARNING
1888 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1889 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1890 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1891 goto fail_unlock; 1925 goto fail_unlock;
1892 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1893 1928
1894 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1895 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1899,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1899 goto fail_dropcount; 1934 goto fail_dropcount;
1900 } 1935 }
1901 1936
1902 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1903 if (!helper_argv) { 1938 if (!helper_argv) {
1904 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1905 __func__); 1940 __func__);
1906 goto fail_dropcount; 1941 goto fail_dropcount;
1907 } 1942 }
1908 1943
1909 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1910 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1911 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1912 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1913 &cprm.file)) { 1948 if (retval) {
1914 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1915 corename); 1950 corename);
1916 goto fail_dropcount; 1951 goto close_fail;
1917 } 1952 }
1918 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1919 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1920 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1921 0600); 1961 0600);
1922 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1923 goto fail_dropcount; 1963 goto fail_unlock;
1924 inode = cprm.file->f_path.dentry->d_inode;
1925 if (inode->i_nlink > 1)
1926 goto close_fail; /* multiple links - don't dump */
1927 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1928 goto close_fail;
1929
1930 /* AK: actually i see no reason to not allow this for named pipes etc.,
1931 but keep the previous behaviour for now. */
1932 if (!ispipe && !S_ISREG(inode->i_mode))
1933 goto close_fail;
1934 /*
1935 * Dont allow local users get cute and trick others to coredump
1936 * into their pre-created files:
1937 * Note, this is not relevant for pipes
1938 */
1939 if (!ispipe && (inode->i_uid != current_fsuid()))
1940 goto close_fail;
1941 if (!cprm.file->f_op)
1942 goto close_fail;
1943 if (!cprm.file->f_op->write)
1944 goto close_fail;
1945 if (!ispipe &&
1946 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1947 goto close_fail;
1948 1964
1949 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually i see no reason to not allow this for named
1972 * pipes etc, but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Dont allow local users get cute and trick others to coredump
1978 * into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1950 1987
1988 retval = binfmt->core_dump(&cprm);
1951 if (retval) 1989 if (retval)
1952 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1953close_fail: 1991
1954 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1955 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1956 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1957fail_dropcount: 1997fail_dropcount:
1958 if (dump_count) 1998 if (ispipe)
1959 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1960fail_unlock: 2000fail_unlock:
1961 if (helper_argv) 2001 coredump_finish(mm);
1962 argv_free(helper_argv);
1963
1964 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1965 put_cred(cred); 2004 put_cred(cred);
1966 coredump_finish(mm);
1967fail: 2005fail:
1968 return; 2006 return;
1969} 2007}
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
608 de->inode_no = cpu_to_le64(parent->i_ino); 608 de->inode_no = cpu_to_le64(parent->i_ino);
609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR)); 609 memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
610 exofs_set_de_type(de, inode); 610 exofs_set_de_type(de, inode);
611 kunmap_atomic(page, KM_USER0); 611 kunmap_atomic(kaddr, KM_USER0);
612 err = exofs_commit_chunk(page, 0, chunk_size); 612 err = exofs_commit_chunk(page, 0, chunk_size);
613fail: 613fail:
614 page_cache_release(page); 614 page_cache_release(page);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 40 return 0;
41} 41}
42 42
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 43static int exofs_file_fsync(struct file *filp, int datasync)
44 int datasync)
45{ 44{
46 int ret; 45 int ret;
47 struct address_space *mapping = filp->f_mapping; 46 struct address_space *mapping = filp->f_mapping;
48 struct inode *inode = dentry->d_inode; 47 struct inode *inode = mapping->host;
49 struct super_block *sb; 48 struct super_block *sb;
50 49
51 ret = filemap_write_and_wait(mapping); 50 ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 65
67static int exofs_flush(struct file *file, fl_owner_t id) 66static int exofs_flush(struct file *file, fl_owner_t id)
68{ 67{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 68 exofs_file_fsync(file, 1);
70 /* TODO: Flush the OSD target */ 69 /* TODO: Flush the OSD target */
71 return 0; 70 return 0;
72} 71}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 76d2a79ef93e..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
755 return ret; 755 return ret;
756} 756}
757 757
758static int exofs_releasepage(struct page *page, gfp_t gfp)
759{
760 EXOFS_DBGMSG("page 0x%lx\n", page->index);
761 WARN_ON(1);
762 return try_to_free_buffers(page);
763}
764
765static void exofs_invalidatepage(struct page *page, unsigned long offset)
766{
767 EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
768 WARN_ON(1);
769
770 block_invalidatepage(page, offset);
771}
772
758const struct address_space_operations exofs_aops = { 773const struct address_space_operations exofs_aops = {
759 .readpage = exofs_readpage, 774 .readpage = exofs_readpage,
760 .readpages = exofs_readpages, 775 .readpages = exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
762 .writepages = exofs_writepages, 777 .writepages = exofs_writepages,
763 .write_begin = exofs_write_begin_export, 778 .write_begin = exofs_write_begin_export,
764 .write_end = exofs_write_end, 779 .write_end = exofs_write_end,
780 .releasepage = exofs_releasepage,
781 .set_page_dirty = __set_page_dirty_nobuffers,
782 .invalidatepage = exofs_invalidatepage,
783
784 /* Not implemented Yet */
785 .bmap = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
786 .direct_IO = NULL, /* TODO: Should be trivial to do */
787
788 /* With these NULL has special meaning or default is not exported */
789 .sync_page = NULL,
790 .get_xip_mem = NULL,
791 .migratepage = NULL,
792 .launder_page = NULL,
793 .is_partially_uptodate = NULL,
794 .error_remove_page = NULL,
765}; 795};
766 796
767/****************************************************************************** 797/******************************************************************************
@@ -1123,16 +1153,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1123 sbi = sb->s_fs_info; 1153 sbi = sb->s_fs_info;
1124 1154
1125 sb->s_dirt = 1; 1155 sb->s_dirt = 1;
1126 inode->i_uid = current->cred->fsuid; 1156 inode_init_owner(inode, dir, mode);
1127 if (dir->i_mode & S_ISGID) {
1128 inode->i_gid = dir->i_gid;
1129 if (S_ISDIR(mode))
1130 mode |= S_ISGID;
1131 } else {
1132 inode->i_gid = current->cred->fsgid;
1133 }
1134 inode->i_mode = mode;
1135
1136 inode->i_ino = sbi->s_nextid++; 1157 inode->i_ino = sbi->s_nextid++;
1137 inode->i_blkbits = EXOFS_BLKSHIFT; 1158 inode->i_blkbits = EXOFS_BLKSHIFT;
1138 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 1159 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index a99e54318c3d..ca7e2a0ed98a 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -420,7 +420,7 @@ release_and_out:
420 return error; 420 return error;
421} 421}
422 422
423struct xattr_handler ext2_xattr_acl_access_handler = { 423const struct xattr_handler ext2_xattr_acl_access_handler = {
424 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
425 .flags = ACL_TYPE_ACCESS, 425 .flags = ACL_TYPE_ACCESS,
426 .list = ext2_xattr_list_acl_access, 426 .list = ext2_xattr_list_acl_access,
@@ -428,7 +428,7 @@ struct xattr_handler ext2_xattr_acl_access_handler = {
428 .set = ext2_xattr_set_acl, 428 .set = ext2_xattr_set_acl,
429}; 429};
430 430
431struct xattr_handler ext2_xattr_acl_default_handler = { 431const struct xattr_handler ext2_xattr_acl_default_handler = {
432 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
433 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
434 .list = ext2_xattr_list_acl_default, 434 .list = ext2_xattr_list_acl_default,
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern void ext2_truncate (struct inode *);
126extern int ext2_setattr (struct dentry *, struct iattr *); 125extern int ext2_setattr (struct dentry *, struct iattr *);
127extern void ext2_set_inode_flags(struct inode *inode); 126extern void ext2_set_inode_flags(struct inode *inode);
128extern void ext2_get_inode_flags(struct ext2_inode_info *); 127extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 154extern const struct file_operations ext2_dir_operations;
156 155
157/* file.c */ 156/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); 157extern int ext2_fsync(struct file *file, int datasync);
159extern const struct inode_operations ext2_file_inode_operations; 158extern const struct inode_operations ext2_file_inode_operations;
160extern const struct file_operations ext2_file_operations; 159extern const struct file_operations ext2_file_operations;
161extern const struct file_operations ext2_xip_file_operations; 160extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
40 return 0; 40 return 0;
41} 41}
42 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) 43int ext2_fsync(struct file *file, int datasync)
44{ 44{
45 int ret; 45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb; 46 struct super_block *sb = file->f_mapping->host->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48 48
49 ret = simple_fsync(file, dentry, datasync); 49 ret = generic_file_fsync(file, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { 50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */ 51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__, 52 ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
95#endif 95#endif
96 96
97const struct inode_operations ext2_file_inode_operations = { 97const struct inode_operations ext2_file_inode_operations = {
98 .truncate = ext2_truncate,
99#ifdef CONFIG_EXT2_FS_XATTR 98#ifdef CONFIG_EXT2_FS_XATTR
100 .setxattr = generic_setxattr, 99 .setxattr = generic_setxattr,
101 .getxattr = generic_getxattr, 100 .getxattr = generic_getxattr,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index f0c5286f9342..938dbc739d00 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -549,16 +549,12 @@ got:
549 549
550 sb->s_dirt = 1; 550 sb->s_dirt = 1;
551 mark_buffer_dirty(bh2); 551 mark_buffer_dirty(bh2);
552 inode->i_uid = current_fsuid(); 552 if (test_opt(sb, GRPID)) {
553 if (test_opt (sb, GRPID)) 553 inode->i_mode = mode;
554 inode->i_uid = current_fsuid();
554 inode->i_gid = dir->i_gid; 555 inode->i_gid = dir->i_gid;
555 else if (dir->i_mode & S_ISGID) {
556 inode->i_gid = dir->i_gid;
557 if (S_ISDIR(mode))
558 mode |= S_ISGID;
559 } else 556 } else
560 inode->i_gid = current_fsgid(); 557 inode_init_owner(inode, dir, mode);
561 inode->i_mode = mode;
562 558
563 inode->i_ino = ino; 559 inode->i_ino = ino;
564 inode->i_blocks = 0; 560 inode->i_blocks = 0;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 527c46d9bc1f..3675088cb88c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
54 inode->i_blocks - ea_blocks == 0); 54 inode->i_blocks - ea_blocks == 0);
55} 55}
56 56
57static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
58
59static void ext2_write_failed(struct address_space *mapping, loff_t to)
60{
61 struct inode *inode = mapping->host;
62
63 if (to > inode->i_size) {
64 truncate_pagecache(inode, to, inode->i_size);
65 ext2_truncate_blocks(inode, inode->i_size);
66 }
67}
68
57/* 69/*
58 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
59 */ 71 */
@@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
71 83
72 inode->i_size = 0; 84 inode->i_size = 0;
73 if (inode->i_blocks) 85 if (inode->i_blocks)
74 ext2_truncate (inode); 86 ext2_truncate_blocks(inode, 0);
75 ext2_free_inode (inode); 87 ext2_free_inode (inode);
76 88
77 return; 89 return;
@@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
757 loff_t pos, unsigned len, unsigned flags, 769 loff_t pos, unsigned len, unsigned flags,
758 struct page **pagep, void **fsdata) 770 struct page **pagep, void **fsdata)
759{ 771{
760 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
761 ext2_get_block); 773 pagep, fsdata, ext2_get_block);
762} 774}
763 775
764static int 776static int
@@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
766 loff_t pos, unsigned len, unsigned flags, 778 loff_t pos, unsigned len, unsigned flags,
767 struct page **pagep, void **fsdata) 779 struct page **pagep, void **fsdata)
768{ 780{
781 int ret;
782
769 *pagep = NULL; 783 *pagep = NULL;
770 return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); 784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
785 if (ret < 0)
786 ext2_write_failed(mapping, pos + len);
787 return ret;
788}
789
790static int ext2_write_end(struct file *file, struct address_space *mapping,
791 loff_t pos, unsigned len, unsigned copied,
792 struct page *page, void *fsdata)
793{
794 int ret;
795
796 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
797 if (ret < len)
798 ext2_write_failed(mapping, pos + len);
799 return ret;
771} 800}
772 801
773static int 802static int
@@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
775 loff_t pos, unsigned len, unsigned flags, 804 loff_t pos, unsigned len, unsigned flags,
776 struct page **pagep, void **fsdata) 805 struct page **pagep, void **fsdata)
777{ 806{
807 int ret;
808
778 /* 809 /*
779 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
780 * directory handling code to pass around offsets rather than struct 811 * directory handling code to pass around offsets rather than struct
781 * pages in order to make this work easily. 812 * pages in order to make this work easily.
782 */ 813 */
783 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
784 ext2_get_block); 815 fsdata, ext2_get_block);
816 if (ret < 0)
817 ext2_write_failed(mapping, pos + len);
818 return ret;
785} 819}
786 820
787static int ext2_nobh_writepage(struct page *page, 821static int ext2_nobh_writepage(struct page *page,
@@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
800 loff_t offset, unsigned long nr_segs) 834 loff_t offset, unsigned long nr_segs)
801{ 835{
802 struct file *file = iocb->ki_filp; 836 struct file *file = iocb->ki_filp;
803 struct inode *inode = file->f_mapping->host; 837 struct address_space *mapping = file->f_mapping;
804 838 struct inode *inode = mapping->host;
805 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 839 ssize_t ret;
806 offset, nr_segs, ext2_get_block, NULL); 840
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
845 return ret;
807} 846}
808 847
809static int 848static int
@@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = {
818 .writepage = ext2_writepage, 857 .writepage = ext2_writepage,
819 .sync_page = block_sync_page, 858 .sync_page = block_sync_page,
820 .write_begin = ext2_write_begin, 859 .write_begin = ext2_write_begin,
821 .write_end = generic_write_end, 860 .write_end = ext2_write_end,
822 .bmap = ext2_bmap, 861 .bmap = ext2_bmap,
823 .direct_IO = ext2_direct_IO, 862 .direct_IO = ext2_direct_IO,
824 .writepages = ext2_writepages, 863 .writepages = ext2_writepages,
@@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
1027 ext2_free_data(inode, p, q); 1066 ext2_free_data(inode, p, q);
1028} 1067}
1029 1068
1030void ext2_truncate(struct inode *inode) 1069static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1031{ 1070{
1032 __le32 *i_data = EXT2_I(inode)->i_data; 1071 __le32 *i_data = EXT2_I(inode)->i_data;
1033 struct ext2_inode_info *ei = EXT2_I(inode); 1072 struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
1039 int n; 1078 int n;
1040 long iblock; 1079 long iblock;
1041 unsigned blocksize; 1080 unsigned blocksize;
1042
1043 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1044 S_ISLNK(inode->i_mode)))
1045 return;
1046 if (ext2_inode_is_fast_symlink(inode))
1047 return;
1048 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1049 return;
1050
1051 blocksize = inode->i_sb->s_blocksize; 1081 blocksize = inode->i_sb->s_blocksize;
1052 iblock = (inode->i_size + blocksize-1) 1082 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1053 >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1054
1055 if (mapping_is_xip(inode->i_mapping))
1056 xip_truncate_page(inode->i_mapping, inode->i_size);
1057 else if (test_opt(inode->i_sb, NOBH))
1058 nobh_truncate_page(inode->i_mapping,
1059 inode->i_size, ext2_get_block);
1060 else
1061 block_truncate_page(inode->i_mapping,
1062 inode->i_size, ext2_get_block);
1063 1083
1064 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1084 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1065 if (n == 0) 1085 if (n == 0)
@@ -1127,6 +1147,62 @@ do_indirects:
1127 ext2_discard_reservation(inode); 1147 ext2_discard_reservation(inode);
1128 1148
1129 mutex_unlock(&ei->truncate_mutex); 1149 mutex_unlock(&ei->truncate_mutex);
1150}
1151
1152static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1153{
1154 /*
1155 * XXX: it seems like a bug here that we don't allow
1156 * IS_APPEND inode to have blocks-past-i_size trimmed off.
1157 * review and fix this.
1158 *
1159 * Also would be nice to be able to handle IO errors and such,
1160 * but that's probably too much to ask.
1161 */
1162 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1163 S_ISLNK(inode->i_mode)))
1164 return;
1165 if (ext2_inode_is_fast_symlink(inode))
1166 return;
1167 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1168 return;
1169 __ext2_truncate_blocks(inode, offset);
1170}
1171
1172int ext2_setsize(struct inode *inode, loff_t newsize)
1173{
1174 loff_t oldsize;
1175 int error;
1176
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL;
1184 if (ext2_inode_is_fast_symlink(inode))
1185 return -EINVAL;
1186 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1187 return -EPERM;
1188
1189 if (mapping_is_xip(inode->i_mapping))
1190 error = xip_truncate_page(inode->i_mapping, newsize);
1191 else if (test_opt(inode->i_sb, NOBH))
1192 error = nobh_truncate_page(inode->i_mapping,
1193 newsize, ext2_get_block);
1194 else
1195 error = block_truncate_page(inode->i_mapping,
1196 newsize, ext2_get_block);
1197 if (error)
1198 return error;
1199
1200 oldsize = inode->i_size;
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize);
1205
1130 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1131 if (inode_needs_sync(inode)) { 1207 if (inode_needs_sync(inode)) {
1132 sync_mapping_buffers(inode->i_mapping); 1208 sync_mapping_buffers(inode->i_mapping);
@@ -1134,6 +1210,8 @@ do_indirects:
1134 } else { 1210 } else {
1135 mark_inode_dirty(inode); 1211 mark_inode_dirty(inode);
1136 } 1212 }
1213
1214 return 0;
1137} 1215}
1138 1216
1139static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1217static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1474 if (error) 1552 if (error)
1475 return error; 1553 return error;
1476 } 1554 }
1477 error = inode_setattr(inode, iattr); 1555 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
1478 if (!error && (iattr->ia_valid & ATTR_MODE)) 1556 error = ext2_setsize(inode, iattr->ia_size);
1557 if (error)
1558 return error;
1559 }
1560 generic_setattr(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE)
1479 error = ext2_acl_chmod(inode); 1562 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode);
1564
1480 return error; 1565 return error;
1481} 1566}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 71e9eb1fa696..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb)
119 int i; 119 int i;
120 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
121 121
122 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
123
122 if (sb->s_dirt) 124 if (sb->s_dirt)
123 ext2_write_super(sb); 125 ext2_write_super(sb);
124 126
@@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1063 sb->s_op = &ext2_sops; 1065 sb->s_op = &ext2_sops;
1064 sb->s_export_op = &ext2_export_ops; 1066 sb->s_export_op = &ext2_export_ops;
1065 sb->s_xattr = ext2_xattr_handlers; 1067 sb->s_xattr = ext2_xattr_handlers;
1068
1069#ifdef CONFIG_QUOTA
1070 sb->dq_op = &dquot_operations;
1071 sb->s_qcop = &dquot_quotactl_ops;
1072#endif
1073
1066 root = ext2_iget(sb, EXT2_ROOT_INO); 1074 root = ext2_iget(sb, EXT2_ROOT_INO);
1067 if (IS_ERR(root)) { 1075 if (IS_ERR(root)) {
1068 ret = PTR_ERR(root); 1076 ret = PTR_ERR(root);
@@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1241 spin_unlock(&sbi->s_lock); 1249 spin_unlock(&sbi->s_lock);
1242 return 0; 1250 return 0;
1243 } 1251 }
1252
1244 /* 1253 /*
1245 * OK, we are remounting a valid rw partition rdonly, so set 1254 * OK, we are remounting a valid rw partition rdonly, so set
1246 * the rdonly flag and then mark the partition as valid again. 1255 * the rdonly flag and then mark the partition as valid again.
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1248 es->s_state = cpu_to_le16(sbi->s_mount_state); 1257 es->s_state = cpu_to_le16(sbi->s_mount_state);
1249 es->s_mtime = cpu_to_le32(get_seconds()); 1258 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock); 1259 spin_unlock(&sbi->s_lock);
1260
1261 err = dquot_suspend(sb, -1);
1262 if (err < 0) {
1263 spin_lock(&sbi->s_lock);
1264 goto restore_opts;
1265 }
1266
1251 ext2_sync_super(sb, es, 1); 1267 ext2_sync_super(sb, es, 1);
1252 } else { 1268 } else {
1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1269 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 if (!ext2_setup_super (sb, es, 0)) 1285 if (!ext2_setup_super (sb, es, 0))
1270 sb->s_flags &= ~MS_RDONLY; 1286 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock); 1287 spin_unlock(&sbi->s_lock);
1288
1272 ext2_write_super(sb); 1289 ext2_write_super(sb);
1290
1291 dquot_resume(sb, -1);
1273 } 1292 }
1293
1274 return 0; 1294 return 0;
1275restore_opts: 1295restore_opts:
1276 sbi->s_mount_opt = old_opts.s_mount_opt; 1296 sbi->s_mount_opt = old_opts.s_mount_opt;
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 3b96045a00ce..7c3915780b19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -101,7 +101,7 @@ static void ext2_xattr_rehash(struct ext2_xattr_header *,
101 101
102static struct mb_cache *ext2_xattr_cache; 102static struct mb_cache *ext2_xattr_cache;
103 103
104static struct xattr_handler *ext2_xattr_handler_map[] = { 104static const struct xattr_handler *ext2_xattr_handler_map[] = {
105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, 105 [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler,
106#ifdef CONFIG_EXT2_FS_POSIX_ACL 106#ifdef CONFIG_EXT2_FS_POSIX_ACL
107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler, 107 [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext2_xattr_acl_access_handler,
@@ -113,7 +113,7 @@ static struct xattr_handler *ext2_xattr_handler_map[] = {
113#endif 113#endif
114}; 114};
115 115
116struct xattr_handler *ext2_xattr_handlers[] = { 116const struct xattr_handler *ext2_xattr_handlers[] = {
117 &ext2_xattr_user_handler, 117 &ext2_xattr_user_handler,
118 &ext2_xattr_trusted_handler, 118 &ext2_xattr_trusted_handler,
119#ifdef CONFIG_EXT2_FS_POSIX_ACL 119#ifdef CONFIG_EXT2_FS_POSIX_ACL
@@ -126,10 +126,10 @@ struct xattr_handler *ext2_xattr_handlers[] = {
126 NULL 126 NULL
127}; 127};
128 128
129static inline struct xattr_handler * 129static inline const struct xattr_handler *
130ext2_xattr_handler(int name_index) 130ext2_xattr_handler(int name_index)
131{ 131{
132 struct xattr_handler *handler = NULL; 132 const struct xattr_handler *handler = NULL;
133 133
134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map)) 134 if (name_index > 0 && name_index < ARRAY_SIZE(ext2_xattr_handler_map))
135 handler = ext2_xattr_handler_map[name_index]; 135 handler = ext2_xattr_handler_map[name_index];
@@ -298,7 +298,7 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
298 /* list the attribute names */ 298 /* list the attribute names */
299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry); 299 for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
300 entry = EXT2_XATTR_NEXT(entry)) { 300 entry = EXT2_XATTR_NEXT(entry)) {
301 struct xattr_handler *handler = 301 const struct xattr_handler *handler =
302 ext2_xattr_handler(entry->e_name_index); 302 ext2_xattr_handler(entry->e_name_index);
303 303
304 if (handler) { 304 if (handler) {
diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h
index bf8175b2ced9..a1a1c2184616 100644
--- a/fs/ext2/xattr.h
+++ b/fs/ext2/xattr.h
@@ -55,11 +55,11 @@ struct ext2_xattr_entry {
55 55
56# ifdef CONFIG_EXT2_FS_XATTR 56# ifdef CONFIG_EXT2_FS_XATTR
57 57
58extern struct xattr_handler ext2_xattr_user_handler; 58extern const struct xattr_handler ext2_xattr_user_handler;
59extern struct xattr_handler ext2_xattr_trusted_handler; 59extern const struct xattr_handler ext2_xattr_trusted_handler;
60extern struct xattr_handler ext2_xattr_acl_access_handler; 60extern const struct xattr_handler ext2_xattr_acl_access_handler;
61extern struct xattr_handler ext2_xattr_acl_default_handler; 61extern const struct xattr_handler ext2_xattr_acl_default_handler;
62extern struct xattr_handler ext2_xattr_security_handler; 62extern const struct xattr_handler ext2_xattr_security_handler;
63 63
64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t); 64extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
65 65
@@ -72,7 +72,7 @@ extern void ext2_xattr_put_super(struct super_block *);
72extern int init_ext2_xattr(void); 72extern int init_ext2_xattr(void);
73extern void exit_ext2_xattr(void); 73extern void exit_ext2_xattr(void);
74 74
75extern struct xattr_handler *ext2_xattr_handlers[]; 75extern const struct xattr_handler *ext2_xattr_handlers[];
76 76
77# else /* CONFIG_EXT2_FS_XATTR */ 77# else /* CONFIG_EXT2_FS_XATTR */
78 78
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index b118c6383c6d..3004e15d5da5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -67,7 +67,7 @@ ext2_init_security(struct inode *inode, struct inode *dir)
67 return err; 67 return err;
68} 68}
69 69
70struct xattr_handler ext2_xattr_security_handler = { 70const struct xattr_handler ext2_xattr_security_handler = {
71 .prefix = XATTR_SECURITY_PREFIX, 71 .prefix = XATTR_SECURITY_PREFIX,
72 .list = ext2_xattr_security_list, 72 .list = ext2_xattr_security_list,
73 .get = ext2_xattr_security_get, 73 .get = ext2_xattr_security_get,
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 2a26d71f4771..667e46a8d62d 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -50,7 +50,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
50 value, size, flags); 50 value, size, flags);
51} 51}
52 52
53struct xattr_handler ext2_xattr_trusted_handler = { 53const struct xattr_handler ext2_xattr_trusted_handler = {
54 .prefix = XATTR_TRUSTED_PREFIX, 54 .prefix = XATTR_TRUSTED_PREFIX,
55 .list = ext2_xattr_trusted_list, 55 .list = ext2_xattr_trusted_list,
56 .get = ext2_xattr_trusted_get, 56 .get = ext2_xattr_trusted_get,
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 3f6caf3684b4..099d20f47163 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -54,7 +54,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext2_xattr_user_handler = { 57const struct xattr_handler ext2_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext2_xattr_user_list, 59 .list = ext2_xattr_user_list,
60 .get = ext2_xattr_user_get, 60 .get = ext2_xattr_user_get,
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 82ba34158661..01552abbca3c 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -456,7 +456,7 @@ release_and_out:
456 return error; 456 return error;
457} 457}
458 458
459struct xattr_handler ext3_xattr_acl_access_handler = { 459const struct xattr_handler ext3_xattr_acl_access_handler = {
460 .prefix = POSIX_ACL_XATTR_ACCESS, 460 .prefix = POSIX_ACL_XATTR_ACCESS,
461 .flags = ACL_TYPE_ACCESS, 461 .flags = ACL_TYPE_ACCESS,
462 .list = ext3_xattr_list_acl_access, 462 .list = ext3_xattr_list_acl_access,
@@ -464,7 +464,7 @@ struct xattr_handler ext3_xattr_acl_access_handler = {
464 .set = ext3_xattr_set_acl, 464 .set = ext3_xattr_set_acl,
465}; 465};
466 466
467struct xattr_handler ext3_xattr_acl_default_handler = { 467const struct xattr_handler ext3_xattr_acl_default_handler = {
468 .prefix = POSIX_ACL_XATTR_DEFAULT, 468 .prefix = POSIX_ACL_XATTR_DEFAULT,
469 .flags = ACL_TYPE_DEFAULT, 469 .flags = ACL_TYPE_DEFAULT,
470 .list = ext3_xattr_list_acl_default, 470 .list = ext3_xattr_list_acl_default,
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
297 kfree (old); 297 kfree (old);
298 } 298 }
299 if (!parent) 299 if (!parent)
300 root->rb_node = NULL; 300 *root = RB_ROOT;
301 else if (parent->rb_left == n) 301 else if (parent->rb_left == n)
302 parent->rb_left = NULL; 302 parent->rb_left = NULL;
303 else if (parent->rb_right == n) 303 else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 26289e8f4163..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,9 +43,9 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret, needs_barrier = 0; 51 int ret, needs_barrier = 0;
@@ -90,6 +90,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
90 * storage 90 * storage
91 */ 91 */
92 if (needs_barrier) 92 if (needs_barrier)
93 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 93 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
94 BLKDEV_IFL_WAIT);
94 return ret; 95 return ret;
95} 96}
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 0d0e97ed3ff6..498021eb88fb 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -538,16 +538,13 @@ got:
538 if (S_ISDIR(mode)) 538 if (S_ISDIR(mode))
539 percpu_counter_inc(&sbi->s_dirs_counter); 539 percpu_counter_inc(&sbi->s_dirs_counter);
540 540
541 inode->i_uid = current_fsuid(); 541
542 if (test_opt (sb, GRPID)) 542 if (test_opt(sb, GRPID)) {
543 inode->i_gid = dir->i_gid; 543 inode->i_mode = mode;
544 else if (dir->i_mode & S_ISGID) { 544 inode->i_uid = current_fsuid();
545 inode->i_gid = dir->i_gid; 545 inode->i_gid = dir->i_gid;
546 if (S_ISDIR(mode))
547 mode |= S_ISGID;
548 } else 546 } else
549 inode->i_gid = current_fsgid(); 547 inode_init_owner(inode, dir, mode);
550 inode->i_mode = mode;
551 548
552 inode->i_ino = ino; 549 inode->i_ino = ino;
553 /* This is the optimal IO size (for stat), not the fs block size */ 550 /* This is the optimal IO size (for stat), not the fs block size */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0fc1293d0e96..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
748static int ext3_mark_dquot_dirty(struct dquot *dquot); 750static int ext3_mark_dquot_dirty(struct dquot *dquot);
749static int ext3_write_info(struct super_block *sb, int type); 751static int ext3_write_info(struct super_block *sb, int type);
750static int ext3_quota_on(struct super_block *sb, int type, int format_id, 752static int ext3_quota_on(struct super_block *sb, int type, int format_id,
751 char *path, int remount); 753 char *path);
752static int ext3_quota_on_mount(struct super_block *sb, int type); 754static int ext3_quota_on_mount(struct super_block *sb, int type);
753static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 755static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
754 size_t len, loff_t off); 756 size_t len, loff_t off);
@@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
767 769
768static const struct quotactl_ops ext3_qctl_operations = { 770static const struct quotactl_ops ext3_qctl_operations = {
769 .quota_on = ext3_quota_on, 771 .quota_on = ext3_quota_on,
770 .quota_off = vfs_quota_off, 772 .quota_off = dquot_quota_off,
771 .quota_sync = vfs_quota_sync, 773 .quota_sync = dquot_quota_sync,
772 .get_info = vfs_get_dqinfo, 774 .get_info = dquot_get_dqinfo,
773 .set_info = vfs_set_dqinfo, 775 .set_info = dquot_set_dqinfo,
774 .get_dqblk = vfs_get_dqblk, 776 .get_dqblk = dquot_get_dqblk,
775 .set_dqblk = vfs_set_dqblk 777 .set_dqblk = dquot_set_dqblk
776}; 778};
777#endif 779#endif
778 780
@@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1527 /* Turn quotas off */ 1529 /* Turn quotas off */
1528 for (i = 0; i < MAXQUOTAS; i++) { 1530 for (i = 0; i < MAXQUOTAS; i++) {
1529 if (sb_dqopt(sb)->files[i]) 1531 if (sb_dqopt(sb)->files[i])
1530 vfs_quota_off(sb, i, 0); 1532 dquot_quota_off(sb, i);
1531 } 1533 }
1532#endif 1534#endif
1533 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1535 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2551 ext3_fsblk_t n_blocks_count = 0; 2553 ext3_fsblk_t n_blocks_count = 0;
2552 unsigned long old_sb_flags; 2554 unsigned long old_sb_flags;
2553 struct ext3_mount_options old_opts; 2555 struct ext3_mount_options old_opts;
2556 int enable_quota = 0;
2554 int err; 2557 int err;
2555#ifdef CONFIG_QUOTA 2558#ifdef CONFIG_QUOTA
2556 int i; 2559 int i;
@@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2597 } 2600 }
2598 2601
2599 if (*flags & MS_RDONLY) { 2602 if (*flags & MS_RDONLY) {
2603 err = dquot_suspend(sb, -1);
2604 if (err < 0)
2605 goto restore_opts;
2606
2600 /* 2607 /*
2601 * First of all, the unconditional stuff we have to do 2608 * First of all, the unconditional stuff we have to do
2602 * to disable replay of the journal when we next remount 2609 * to disable replay of the journal when we next remount
@@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2651 goto restore_opts; 2658 goto restore_opts;
2652 if (!ext3_setup_super (sb, es, 0)) 2659 if (!ext3_setup_super (sb, es, 0))
2653 sb->s_flags &= ~MS_RDONLY; 2660 sb->s_flags &= ~MS_RDONLY;
2661 enable_quota = 1;
2654 } 2662 }
2655 } 2663 }
2656#ifdef CONFIG_QUOTA 2664#ifdef CONFIG_QUOTA
@@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2662#endif 2670#endif
2663 unlock_super(sb); 2671 unlock_super(sb);
2664 unlock_kernel(); 2672 unlock_kernel();
2673
2674 if (enable_quota)
2675 dquot_resume(sb, -1);
2665 return 0; 2676 return 0;
2666restore_opts: 2677restore_opts:
2667 sb->s_flags = old_sb_flags; 2678 sb->s_flags = old_sb_flags;
@@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2851 */ 2862 */
2852static int ext3_quota_on_mount(struct super_block *sb, int type) 2863static int ext3_quota_on_mount(struct super_block *sb, int type)
2853{ 2864{
2854 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2865 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2855 EXT3_SB(sb)->s_jquota_fmt, type); 2866 EXT3_SB(sb)->s_jquota_fmt, type);
2856} 2867}
2857 2868
2858/* 2869/*
2859 * Standard function to be called on quota_on 2870 * Standard function to be called on quota_on
2860 */ 2871 */
2861static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2872static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2862 char *name, int remount) 2873 char *name)
2863{ 2874{
2864 int err; 2875 int err;
2865 struct path path; 2876 struct path path;
2866 2877
2867 if (!test_opt(sb, QUOTA)) 2878 if (!test_opt(sb, QUOTA))
2868 return -EINVAL; 2879 return -EINVAL;
2869 /* When remounting, no checks are needed and in fact, name is NULL */
2870 if (remount)
2871 return vfs_quota_on(sb, type, format_id, name, remount);
2872 2880
2873 err = kern_path(name, LOOKUP_FOLLOW, &path); 2881 err = kern_path(name, LOOKUP_FOLLOW, &path);
2874 if (err) 2882 if (err)
@@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2906 } 2914 }
2907 } 2915 }
2908 2916
2909 err = vfs_quota_on_path(sb, type, format_id, &path); 2917 err = dquot_quota_on_path(sb, type, format_id, &path);
2910 path_put(&path); 2918 path_put(&path);
2911 return err; 2919 return err;
2912} 2920}
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 534a94c3a933..71fb8d65e54c 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -104,7 +104,7 @@ static int ext3_xattr_list(struct dentry *dentry, char *buffer,
104 104
105static struct mb_cache *ext3_xattr_cache; 105static struct mb_cache *ext3_xattr_cache;
106 106
107static struct xattr_handler *ext3_xattr_handler_map[] = { 107static const struct xattr_handler *ext3_xattr_handler_map[] = {
108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler, 108 [EXT3_XATTR_INDEX_USER] = &ext3_xattr_user_handler,
109#ifdef CONFIG_EXT3_FS_POSIX_ACL 109#ifdef CONFIG_EXT3_FS_POSIX_ACL
110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler, 110 [EXT3_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext3_xattr_acl_access_handler,
@@ -116,7 +116,7 @@ static struct xattr_handler *ext3_xattr_handler_map[] = {
116#endif 116#endif
117}; 117};
118 118
119struct xattr_handler *ext3_xattr_handlers[] = { 119const struct xattr_handler *ext3_xattr_handlers[] = {
120 &ext3_xattr_user_handler, 120 &ext3_xattr_user_handler,
121 &ext3_xattr_trusted_handler, 121 &ext3_xattr_trusted_handler,
122#ifdef CONFIG_EXT3_FS_POSIX_ACL 122#ifdef CONFIG_EXT3_FS_POSIX_ACL
@@ -129,10 +129,10 @@ struct xattr_handler *ext3_xattr_handlers[] = {
129 NULL 129 NULL
130}; 130};
131 131
132static inline struct xattr_handler * 132static inline const struct xattr_handler *
133ext3_xattr_handler(int name_index) 133ext3_xattr_handler(int name_index)
134{ 134{
135 struct xattr_handler *handler = NULL; 135 const struct xattr_handler *handler = NULL;
136 136
137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map)) 137 if (name_index > 0 && name_index < ARRAY_SIZE(ext3_xattr_handler_map))
138 handler = ext3_xattr_handler_map[name_index]; 138 handler = ext3_xattr_handler_map[name_index];
@@ -338,7 +338,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry,
338 size_t rest = buffer_size; 338 size_t rest = buffer_size;
339 339
340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) { 340 for (; !IS_LAST_ENTRY(entry); entry = EXT3_XATTR_NEXT(entry)) {
341 struct xattr_handler *handler = 341 const struct xattr_handler *handler =
342 ext3_xattr_handler(entry->e_name_index); 342 ext3_xattr_handler(entry->e_name_index);
343 343
344 if (handler) { 344 if (handler) {
diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h
index 148a4dfc82ab..377fe7201169 100644
--- a/fs/ext3/xattr.h
+++ b/fs/ext3/xattr.h
@@ -58,11 +58,11 @@ struct ext3_xattr_entry {
58 58
59# ifdef CONFIG_EXT3_FS_XATTR 59# ifdef CONFIG_EXT3_FS_XATTR
60 60
61extern struct xattr_handler ext3_xattr_user_handler; 61extern const struct xattr_handler ext3_xattr_user_handler;
62extern struct xattr_handler ext3_xattr_trusted_handler; 62extern const struct xattr_handler ext3_xattr_trusted_handler;
63extern struct xattr_handler ext3_xattr_acl_access_handler; 63extern const struct xattr_handler ext3_xattr_acl_access_handler;
64extern struct xattr_handler ext3_xattr_acl_default_handler; 64extern const struct xattr_handler ext3_xattr_acl_default_handler;
65extern struct xattr_handler ext3_xattr_security_handler; 65extern const struct xattr_handler ext3_xattr_security_handler;
66 66
67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); 67extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
68 68
@@ -76,7 +76,7 @@ extern void ext3_xattr_put_super(struct super_block *);
76extern int init_ext3_xattr(void); 76extern int init_ext3_xattr(void);
77extern void exit_ext3_xattr(void); 77extern void exit_ext3_xattr(void);
78 78
79extern struct xattr_handler *ext3_xattr_handlers[]; 79extern const struct xattr_handler *ext3_xattr_handlers[];
80 80
81# else /* CONFIG_EXT3_FS_XATTR */ 81# else /* CONFIG_EXT3_FS_XATTR */
82 82
diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c
index 3af91f476dff..03a99bfc59f9 100644
--- a/fs/ext3/xattr_security.c
+++ b/fs/ext3/xattr_security.c
@@ -69,7 +69,7 @@ ext3_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext3_xattr_security_handler = { 72const struct xattr_handler ext3_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext3_xattr_security_list, 74 .list = ext3_xattr_security_list,
75 .get = ext3_xattr_security_get, 75 .get = ext3_xattr_security_get,
diff --git a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c
index e5562845ed96..dc8edda9ffe0 100644
--- a/fs/ext3/xattr_trusted.c
+++ b/fs/ext3/xattr_trusted.c
@@ -51,7 +51,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name,
51 value, size, flags); 51 value, size, flags);
52} 52}
53 53
54struct xattr_handler ext3_xattr_trusted_handler = { 54const struct xattr_handler ext3_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext3_xattr_trusted_list, 56 .list = ext3_xattr_trusted_list,
57 .get = ext3_xattr_trusted_get, 57 .get = ext3_xattr_trusted_get,
diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c
index 3bcfe9ee0a68..7a321974d584 100644
--- a/fs/ext3/xattr_user.c
+++ b/fs/ext3/xattr_user.c
@@ -54,7 +54,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext3_xattr_user_handler = { 57const struct xattr_handler ext3_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext3_xattr_user_list, 59 .list = ext3_xattr_user_list,
60 .get = ext3_xattr_user_get, 60 .get = ext3_xattr_user_get,
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
index 8a2a29d35a6f..feaf498feaa6 100644
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -454,7 +454,7 @@ release_and_out:
454 return error; 454 return error;
455} 455}
456 456
457struct xattr_handler ext4_xattr_acl_access_handler = { 457const struct xattr_handler ext4_xattr_acl_access_handler = {
458 .prefix = POSIX_ACL_XATTR_ACCESS, 458 .prefix = POSIX_ACL_XATTR_ACCESS,
459 .flags = ACL_TYPE_ACCESS, 459 .flags = ACL_TYPE_ACCESS,
460 .list = ext4_xattr_list_acl_access, 460 .list = ext4_xattr_list_acl_access,
@@ -462,7 +462,7 @@ struct xattr_handler ext4_xattr_acl_access_handler = {
462 .set = ext4_xattr_set_acl, 462 .set = ext4_xattr_set_acl,
463}; 463};
464 464
465struct xattr_handler ext4_xattr_acl_default_handler = { 465const struct xattr_handler ext4_xattr_acl_default_handler = {
466 .prefix = POSIX_ACL_XATTR_DEFAULT, 466 .prefix = POSIX_ACL_XATTR_DEFAULT,
467 .flags = ACL_TYPE_DEFAULT, 467 .flags = ACL_TYPE_DEFAULT,
468 .list = ext4_xattr_list_acl_default, 468 .list = ext4_xattr_list_acl_default,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
596 * fail EDQUOT for metdata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
78 * flag field is exported via the traceport interface 81 * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
398 * It's not paranoia if the Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1519extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1520
1400/* fsync.c */ 1521/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1522extern int ext4_sync_file(struct file *, int);
1402 1523
1403/* hash.c */ 1524/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1525extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 146f1f6a9203..bf029c7d5518 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
2615 * extent into multiple extents (upto three - one initialized and two 2616 * extent into multiple extents (upto three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2620 * c> Splits in three extents: Somone is writing in middle of the extent 2621 * c> Splits in three extents: Somone is writing in middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
2663 * It is safe to convert extent to initialized via explicit
2664 * zeroout only if extent is fully insde i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
2654 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ 2671 /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
2682 /* ex3: to ee_block + ee_len : uninitialised */ 2699 /* ex3: to ee_block + ee_len : uninitialised */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
2685 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ 2702 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
2688 * iblock == ee_block is handled by the zerouout 2705 * map->m_lblk == ee_block is handled by the zerouout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
2807 * to insert a extent in the middle zerout directly 2826 * to insert a extent in the middle zerout directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when DIO to write 2927 * ext4_get_blocks_dio_write() when DIO to write
2909 * to an uninitialized extent. 2928 * to an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert extent to initialized via explicit
2988 * zeroout only if extent is fully insde i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submit the IO, split the extent */ 3203 /* get_block() before submit the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode(non aio case) or end_io struct (aio case) 3208 * Flag the inode(non aio case) or end_io struct (aio case)
3177 * that this IO needs to convertion to written when IO is 3209 * that this IO needs to convertion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert*/ 3249 /* buffered write, writepage time, convert*/
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because 3704 * We can't just convert len to max_blocks because
3666 * If blocksize = 4096 offset = 3072 and len = 2048 3705 * If blocksize = 4096 offset = 3072 and len = 2048
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, block, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because 3786 * We can't just convert len to max_blocks because
3746 * If blocksize = 4096 offset = 3072 and len = 2048 3787 * If blocksize = 4096 offset = 3072 and len = 2048
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3937 int error = 0;
3899 3938
3900 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3942 ext4_get_block);
3904 3943
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 0d0c3239c1cd..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -100,9 +127,11 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
100 if (ext4_should_writeback_data(inode) && 127 if (ext4_should_writeback_data(inode) &&
101 (journal->j_fs_dev != journal->j_dev) && 128 (journal->j_fs_dev != journal->j_dev) &&
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 jbd2_log_wait_commit(journal, commit_tid); 131 NULL, BLKDEV_IFL_WAIT);
132 ret = jbd2_log_wait_commit(journal, commit_tid);
105 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
106 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
135 BLKDEV_IFL_WAIT);
107 return ret; 136 return ret;
108} 137}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 57f6eef6ccd6..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -979,16 +972,12 @@ got:
979 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes); 972 atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
980 } 973 }
981 974
982 inode->i_uid = current_fsuid(); 975 if (test_opt(sb, GRPID)) {
983 if (test_opt(sb, GRPID)) 976 inode->i_mode = mode;
984 inode->i_gid = dir->i_gid; 977 inode->i_uid = current_fsuid();
985 else if (dir->i_mode & S_ISGID) {
986 inode->i_gid = dir->i_gid; 978 inode->i_gid = dir->i_gid;
987 if (S_ISDIR(mode))
988 mode |= S_ISGID;
989 } else 979 } else
990 inode->i_gid = current_fsgid(); 980 inode_init_owner(inode, dir, mode);
991 inode->i_mode = mode;
992 981
993 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); 982 inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
994 /* This is the optimal IO size (for stat), not the fs block size */ 983 /* This is the optimal IO size (for stat), not the fs block size */
@@ -1045,7 +1034,7 @@ got:
1045 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1046 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1047 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1048 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1049 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1050 } 1039 }
1051 } 1040 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..42272d67955a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 348 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 350 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
352 "invalid block reference %u " 352 "invalid block reference %u", blk);
353 "in inode #%lu", blk, inode->i_ino);
354 return -EIO; 353 return -EIO;
355 } 354 }
356 } 355 }
@@ -785,7 +784,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
788 /* 787 /*
789 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 874
876err_out: 875err_out:
877 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
878 /* 877 /*
879 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
890} 889}
891 890
892/* 891/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
896 * 895 *
897 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 917 * blocks.
919 */ 918 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 921 int flags)
924{ 922{
925 int err = -EIO; 923 int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 931 int count = 0;
934 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
935 933
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 937 &blocks_to_boundary);
940 938
941 if (depth == 0) 939 if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 945 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 947 count++;
951 /*map more blocks*/ 948 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
954 951
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 966 /*
970 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
971 */ 968 */
972 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 970
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
980 */ 977 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
983 /* 980 /*
984 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
985 */ 982 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 984 &count, goal,
988 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
989 986
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 993 */
997 if (!err) 994 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 996 partial, indirect_blks, count);
1000 if (err) 997 if (err)
1001 goto cleanup; 998 goto cleanup;
1002 999
1003 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1004 1001
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1003got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1008 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1009 err = count;
1011 /* Clean up and exit */ 1010 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
1016 brelse(partial->bh); 1015 brelse(partial->bh);
1017 partial--; 1016 partial--;
1018 } 1017 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1018out:
1021 return err; 1019 return err;
1022} 1020}
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1059 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1061{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1064
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1074{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1077
1081 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1088
1092 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1095
1100 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1097 /*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1104 * allocation blocks. 1100 * allocation blocks.
1105 */ 1101 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1106 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1108
1113 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1110 if (quota_claim)
1115 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1116 if (mdb_free) 1112 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1113 /*
1120 * We did fallocate with an offset that is already delayed 1114 * We did fallocate with an offset that is already delayed
1121 * allocated. So on delayed allocated writeback we should 1115 * allocated. So on delayed allocated writeback we should
1122 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1117 */
1127 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1119 }
1131 1120
1132 /* 1121 /*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1140} 1129}
1141 1130
1142static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1144{ 1133{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1148 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1149 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1151 return -EIO; 1140 return -EIO;
1152 } 1141 }
1153 return 0; 1142 return 0;
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1201}
1213 1202
1214/* 1203/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1205 * and returns if the blocks are already mapped.
1217 * 1206 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocate blocks 1207 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
1219 * and store the allocated blocks in the result buffer head and mark it 1208 * and store the allocated blocks in the result buffer head and mark it
1220 * mapped. 1209 * mapped.
1221 * 1210 *
1222 * If file type is extents based, it will call ext4_ext_get_blocks(), 1211 * If file type is extents based, it will call ext4_ext_map_blocks(),
1223 * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping 1212 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1213 * based files
1225 * 1214 *
1226 * On success, it returns the number of blocks being mapped or allocate. 1215 * On success, it returns the number of blocks being mapped or allocate.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1222 *
1234 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1235 */ 1224 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1227{
1240 int retval; 1228 int retval;
1241 1229
1242 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1234 /*
1249 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1250 * file system block. 1236 * file system block.
1251 */ 1237 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1241 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1243 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1261 1245
1262 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1248 if (ret != 0)
1266 return ret; 1249 return ret;
1267 } 1250 }
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * ext4_ext_get_block() returns th create = 0 1260 * ext4_ext_get_block() returns th create = 0
1278 * with buffer head unmapped. 1261 * with buffer head unmapped.
1279 */ 1262 */
1280 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1264 return retval;
1282 1265
1283 /* 1266 /*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1274 * set on the buffer_head.
1292 */ 1275 */
1293 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1277
1295 /* 1278 /*
1296 * New blocks allocate and/or writing to uninitialized extent 1279 * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1296 * could have changed the inode type in between
1314 */ 1297 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1300 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1302
1322 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1304 /*
1324 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1324
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1347 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1348 block, bh->b_blocknr, retval); 1329 map);
1349 if (ret != 0) 1330 if (ret != 0)
1350 return ret; 1331 return ret;
1351 } 1332 }
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1357 1338
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1360{ 1341{
1361 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1345 int dio_credits;
1365 1346
1366 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1367 /* Direct IO write... */ 1351 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1374 goto out; 1358 return ret;
1375 } 1359 }
1376 started = 1; 1360 started = 1;
1377 } 1361 }
1378 1362
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1364 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1368 ret = 0;
1384 } 1369 }
1385 if (started) 1370 if (started)
1386 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1387out:
1388 return ret; 1372 return ret;
1389} 1373}
1390 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
1381
1391/* 1382/*
1392 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1393 */ 1384 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1396{ 1387{
1397 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1398 int fatal = 0, err; 1390 int fatal = 0, err;
1399 int flags = 0;
1400 1391
1401 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1402 1393
1403 dummy.b_state = 0; 1394 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1409 /* 1400 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1411 * case of a HOLE. 1402 return NULL;
1412 */ 1403 *errp = 0;
1413 if (err > 0) { 1404
1414 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1406 if (!bh) {
1416 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1417 } 1409 }
1418 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1413
1430 /* 1414 /*
1431 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1419 * problem.
1436 */ 1420 */
1437 lock_buffer(bh); 1421 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1427 }
1457 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1435 }
1459err: 1436 if (fatal) {
1460 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1461} 1442}
1462 1443
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1841 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1864 int ret; 1845 int ret;
1865 1846
1866 /* 1847 /*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1851 */
1871repeat: 1852repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1877 1857
1878 /* 1858 /*
1879 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1881 * time. 1861 * a small amount in the end. Here we just reserve for data.
1882 */ 1862 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1864 if (ret)
1885 return ret; 1865 return ret;
1886 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1873 yield();
1891 goto repeat; 1874 goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1893
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1895
1896 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1898 /*
1915 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1933 * allocation blocks. 1917 * allocation blocks.
1934 */ 1918 */
1935 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1938 } 1923 }
1939 1924
1940 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1927
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2027/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2029 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 * the function goes through all passed space and put actual disk 2030 * the function goes through all passed space and put actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2032 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2055{ 2035{
2056 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2041 pgoff_t index, end;
2062 struct pagevec pvec; 2042 struct pagevec pvec;
2063 int nr_pages, i; 2043 int nr_pages, i;
2064 2044
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2048
2069 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2070
2091 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2092 do { 2072 do {
2093 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2094 break; 2074 break;
2095 cur_logical++; 2075 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2097 2077
2098 do { 2078 do {
2099 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2080 break;
2101 2081
2102 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2083
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2085
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2121 2100
2122 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2124 cur_logical++; 2103 cur_logical++;
2125 pblock++; 2104 pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2109}
2131 2110
2132 2111
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2150{ 2114{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2171{
2208 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2173 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2249 */ 2213 */
2250 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2221
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2223 if (blks < 0) {
2260 err = blks; 2224 err = blks;
2261 /* 2225 /*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2283 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2284 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2285 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2286 (unsigned long long) next, 2250 (unsigned long long) next,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2288 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2261 }
2298 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2299 2263
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2301 2267
2302 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2304 2271
2305 /* 2272 /*
2306 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2275 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2312 2279
2313 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2316 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
2352 /* check if thereserved journal credits might overflow */ 2328 /* check if thereserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2331 /*
2356 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2424 sector_t logical; 2400 sector_t logical;
2425 2401
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the page in the page_vec
2429 * redirty then and skip then. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2402 /*
2438 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2439 */ 2404 */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2493 * initialized properly.
2529 */ 2494 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2532{ 2497{
2498 struct ext4_map_blocks map;
2533 int ret = 0; 2499 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2501
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2503 invalid_block = ~0;
2538 2504
2539 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2541 2510
2542 /* 2511 /*
2543 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should treated 2513 * preallocated blocks are unmapped but should treated
2545 * the same as allocated blocks. 2514 * the same as allocated blocks.
2546 */ 2515 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2550 /* 2522 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2524 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2557 return ret; 2529 return ret;
2558 2530
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2534 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2535 }
2577 2536
2578 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2579} 2551}
2580 2552
2581/* 2553/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2599{ 2571{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2574}
2616 2575
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2823 */ 2782 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2786
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2788}
2830 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
2845 * real expectation of this data interity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
2907
2831static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
2832 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
2833{ 2910{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2916 int pages_written = 0;
2841 long pages_skipped; 2917 long pages_skipped;
2842 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2918 2994
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2926 2996
2927retry: 2997retry:
@@ -2941,7 +3011,7 @@ retry:
2941 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3016 goto out_writepages;
2947 } 3017 }
@@ -2963,8 +3033,7 @@ retry:
2963 mpd.io_done = 0; 3033 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2965 mpd.retval = 0; 3035 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3037 /*
2969 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3021 3090
3022 /* Update index */ 3091 /* Update index */
@@ -3030,8 +3099,6 @@ retry:
3030 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3031 3100
3032out_writepages: 3101out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3078{ 3145{
3079 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3080 struct page *page; 3147 struct page *page;
3081 pgoff_t index; 3148 pgoff_t index;
3082 unsigned from, to; 3149 unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
3135 3202
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3204 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3205out:
3155 return ret; 3206 return ret;
3156} 3207}
@@ -3546,46 +3597,18 @@ out:
3546 return ret; 3597 return ret;
3547} 3598}
3548 3599
3600/*
3601 * ext4_get_block used when preparing for a DIO write or buffer write.
3602 * We allocate an uinitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3551{ 3607{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3609 inode->i_ino, create);
3560 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3612}
3590 3613
3591static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3975 3998
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4001
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4325
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4327 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4331 return 1;
4310 } 4332 }
4311 4333
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4434 else
4413 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4436 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4437 "block %llu",
4416 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4439 }
4419} 4440}
4420 4441
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4473
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4475 nr, 1)) {
4455 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4458 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4480 break;
4461 } 4481 }
4462 4482
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4488 * (should be rare).
4469 */ 4489 */
4470 if (!bh) { 4490 if (!bh) {
4471 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4472 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4473 inode->i_ino, nr); 4493 (unsigned long long) nr);
4474 continue; 4494 continue;
4475 } 4495 }
4476 4496
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4613 return; 4633 return;
4614 4634
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4636
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4639
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4622 return; 4642 return;
4623 } 4643 }
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4805
4786 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4787 if (!bh) { 4807 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4790 return -EIO; 4810 return -EIO;
4791 } 4811 }
4792 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4888 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4889 brelse(bh); 4909 brelse(bh);
4890 return -EIO; 4910 return -EIO;
4891 } 4911 }
@@ -4922,20 +4942,26 @@ void ext4_set_inode_flags(struct inode *inode)
4922/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4942/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4923void ext4_get_inode_flags(struct ext4_inode_info *ei) 4943void ext4_get_inode_flags(struct ext4_inode_info *ei)
4924{ 4944{
4925 unsigned int flags = ei->vfs_inode.i_flags; 4945 unsigned int vfs_fl;
4926 4946 unsigned long old_fl, new_fl;
4927 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4947
4928 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4948 do {
4929 if (flags & S_SYNC) 4949 vfs_fl = ei->vfs_inode.i_flags;
4930 ei->i_flags |= EXT4_SYNC_FL; 4950 old_fl = ei->i_flags;
4931 if (flags & S_APPEND) 4951 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4932 ei->i_flags |= EXT4_APPEND_FL; 4952 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4933 if (flags & S_IMMUTABLE) 4953 EXT4_DIRSYNC_FL);
4934 ei->i_flags |= EXT4_IMMUTABLE_FL; 4954 if (vfs_fl & S_SYNC)
4935 if (flags & S_NOATIME) 4955 new_fl |= EXT4_SYNC_FL;
4936 ei->i_flags |= EXT4_NOATIME_FL; 4956 if (vfs_fl & S_APPEND)
4937 if (flags & S_DIRSYNC) 4957 new_fl |= EXT4_APPEND_FL;
4938 ei->i_flags |= EXT4_DIRSYNC_FL; 4958 if (vfs_fl & S_IMMUTABLE)
4959 new_fl |= EXT4_IMMUTABLE_FL;
4960 if (vfs_fl & S_NOATIME)
4961 new_fl |= EXT4_NOATIME_FL;
4962 if (vfs_fl & S_DIRSYNC)
4963 new_fl |= EXT4_DIRSYNC_FL;
4964 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4939} 4965}
4940 4966
4941static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4967static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5096,8 +5122,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5122 ret = 0;
5097 if (ei->i_file_acl && 5123 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5124 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5125 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5126 ei->i_file_acl);
5101 ret = -EIO; 5127 ret = -EIO;
5102 goto bad_inode; 5128 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5129 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5168,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5168 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5169 } else {
5144 ret = -EIO; 5170 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5171 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5172 goto bad_inode;
5148 } 5173 }
5149 brelse(iloc.bh); 5174 brelse(iloc.bh);
@@ -5172,7 +5197,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
5172 */ 5197 */
5173 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5198 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5174 raw_inode->i_blocks_high = 0; 5199 raw_inode->i_blocks_high = 0;
5175 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5200 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5176 return 0; 5201 return 0;
5177 } 5202 }
5178 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5203 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5185,9 +5210,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
5185 */ 5210 */
5186 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5211 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5187 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5212 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5188 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5213 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5189 } else { 5214 } else {
5190 ei->i_flags |= EXT4_HUGE_FILE_FL; 5215 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5191 /* i_block is stored in file system block size */ 5216 /* i_block is stored in file system block size */
5192 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5217 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5193 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5218 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5381,9 +5406,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5406 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5407 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5408 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5409 EXT4_ERROR_INODE(inode,
5385 "inode=%lu, block=%llu", inode->i_ino, 5410 "IO error syncing inode (block=%llu)",
5386 (unsigned long long)iloc.bh->b_blocknr); 5411 (unsigned long long) iloc.bh->b_blocknr);
5387 err = -EIO; 5412 err = -EIO;
5388 } 5413 }
5389 brelse(iloc.bh); 5414 brelse(iloc.bh);
@@ -5455,7 +5480,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5480 }
5456 5481
5457 if (attr->ia_valid & ATTR_SIZE) { 5482 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5483 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5484 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5485
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5486 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5493,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5468 if (S_ISREG(inode->i_mode) && 5493 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5494 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5495 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5496 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5497 handle_t *handle;
5473 5498
5474 handle = ext4_journal_start(inode, 3); 5499 handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5525,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5525 }
5501 } 5526 }
5502 /* ext4_truncate will clear the flag */ 5527 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5528 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5529 ext4_truncate(inode);
5505 } 5530 }
5506 5531
@@ -5576,7 +5601,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5601
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5602static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5603{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5604 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5605 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5606 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5607}
@@ -5911,9 +5936,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5936 */
5912 5937
5913 if (val) 5938 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5939 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5940 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5941 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5942 ext4_set_aops(inode);
5918 5943
5919 jbd2_journal_unlock_updates(journal); 5944 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 658 }
659} 659}
660 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
661static noinline_for_stack 682static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
700 */ 721 */
701 grp->bb_free = free; 722 grp->bb_free = free;
702 } 723 }
724 mb_set_largest_free_order(sb, grp);
703 725
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 727
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
725 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 747 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
726 * So it can have information regarding groups_per_page which 748 * So it can have information regarding groups_per_page which
727 * is blocks_per_page/2 749 * is blocks_per_page/2
750 *
751 * Locking note: This routine takes the block group lock of all groups
752 * for this page; do not hold this lock when calling this routine!
728 */ 753 */
729 754
730static int ext4_mb_init_cache(struct page *page, char *incore) 755static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
865 BUG_ON(incore == NULL); 890 BUG_ON(incore == NULL);
866 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 891 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
867 group, page->index, i * blocksize); 892 group, page->index, i * blocksize);
893 trace_ext4_mb_buddy_bitmap_load(sb, group);
868 grinfo = ext4_get_group_info(sb, group); 894 grinfo = ext4_get_group_info(sb, group);
869 grinfo->bb_fragments = 0; 895 grinfo->bb_fragments = 0;
870 memset(grinfo->bb_counters, 0, 896 memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
882 BUG_ON(incore != NULL); 908 BUG_ON(incore != NULL);
883 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 909 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
884 group, page->index, i * blocksize); 910 group, page->index, i * blocksize);
911 trace_ext4_mb_bitmap_load(sb, group);
885 912
886 /* see comments in ext4_mb_put_pa() */ 913 /* see comments in ext4_mb_put_pa() */
887 ext4_lock_group(sb, group); 914 ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
910 return err; 937 return err;
911} 938}
912 939
940/*
941 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
942 * block group lock of all groups for this page; do not hold the BG lock when
943 * calling this routine!
944 */
913static noinline_for_stack 945static noinline_for_stack
914int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 946int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
915{ 947{
@@ -1004,6 +1036,11 @@ err:
1004 return ret; 1036 return ret;
1005} 1037}
1006 1038
1039/*
1040 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1041 * block group lock of all groups for this page; do not hold the BG lock when
1042 * calling this routine!
1043 */
1007static noinline_for_stack int 1044static noinline_for_stack int
1008ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1045ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1009 struct ext4_buddy *e4b) 1046 struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
1150 return ret; 1187 return ret;
1151} 1188}
1152 1189
1153static void ext4_mb_release_desc(struct ext4_buddy *e4b) 1190static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1154{ 1191{
1155 if (e4b->bd_bitmap_page) 1192 if (e4b->bd_bitmap_page)
1156 page_cache_release(e4b->bd_bitmap_page); 1193 page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1299 buddy = buddy2; 1336 buddy = buddy2;
1300 } while (1); 1337 } while (1);
1301 } 1338 }
1339 mb_set_largest_free_order(sb, e4b->bd_info);
1302 mb_check_buddy(e4b); 1340 mb_check_buddy(e4b);
1303} 1341}
1304 1342
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1427 e4b->bd_info->bb_counters[ord]++; 1465 e4b->bd_info->bb_counters[ord]++;
1428 e4b->bd_info->bb_counters[ord]++; 1466 e4b->bd_info->bb_counters[ord]++;
1429 } 1467 }
1468 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1430 1469
1431 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1470 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1432 mb_check_buddy(e4b); 1471 mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1617 } 1656 }
1618 1657
1619 ext4_unlock_group(ac->ac_sb, group); 1658 ext4_unlock_group(ac->ac_sb, group);
1620 ext4_mb_release_desc(e4b); 1659 ext4_mb_unload_buddy(e4b);
1621 1660
1622 return 0; 1661 return 0;
1623} 1662}
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1672 ext4_mb_use_best_found(ac, e4b); 1711 ext4_mb_use_best_found(ac, e4b);
1673 } 1712 }
1674 ext4_unlock_group(ac->ac_sb, group); 1713 ext4_unlock_group(ac->ac_sb, group);
1675 ext4_mb_release_desc(e4b); 1714 ext4_mb_unload_buddy(e4b);
1676 1715
1677 return 0; 1716 return 0;
1678} 1717}
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1821 } 1860 }
1822} 1861}
1823 1862
1863/* This is now called BEFORE we load the buddy bitmap. */
1824static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1864static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1825 ext4_group_t group, int cr) 1865 ext4_group_t group, int cr)
1826{ 1866{
1827 unsigned free, fragments; 1867 unsigned free, fragments;
1828 unsigned i, bits;
1829 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1868 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1830 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1869 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1831 1870
1832 BUG_ON(cr < 0 || cr >= 4); 1871 BUG_ON(cr < 0 || cr >= 4);
1833 BUG_ON(EXT4_MB_GRP_NEED_INIT(grp)); 1872
1873 /* We only do this if the grp has never been initialized */
1874 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1875 int ret = ext4_mb_init_group(ac->ac_sb, group);
1876 if (ret)
1877 return 0;
1878 }
1834 1879
1835 free = grp->bb_free; 1880 free = grp->bb_free;
1836 fragments = grp->bb_fragments; 1881 fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1843 case 0: 1888 case 0:
1844 BUG_ON(ac->ac_2order == 0); 1889 BUG_ON(ac->ac_2order == 0);
1845 1890
1891 if (grp->bb_largest_free_order < ac->ac_2order)
1892 return 0;
1893
1846 /* Avoid using the first bg of a flexgroup for data files */ 1894 /* Avoid using the first bg of a flexgroup for data files */
1847 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1895 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1848 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1896 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1849 ((group % flex_size) == 0)) 1897 ((group % flex_size) == 0))
1850 return 0; 1898 return 0;
1851 1899
1852 bits = ac->ac_sb->s_blocksize_bits + 1; 1900 return 1;
1853 for (i = ac->ac_2order; i <= bits; i++)
1854 if (grp->bb_counters[i] > 0)
1855 return 1;
1856 break;
1857 case 1: 1901 case 1:
1858 if ((free / fragments) >= ac->ac_g_ex.fe_len) 1902 if ((free / fragments) >= ac->ac_g_ex.fe_len)
1859 return 1; 1903 return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1964 sbi = EXT4_SB(sb); 2008 sbi = EXT4_SB(sb);
1965 ngroups = ext4_get_groups_count(sb); 2009 ngroups = ext4_get_groups_count(sb);
1966 /* non-extent files are limited to low blocks/groups */ 2010 /* non-extent files are limited to low blocks/groups */
1967 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) 2011 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1968 ngroups = sbi->s_blockfile_groups; 2012 ngroups = sbi->s_blockfile_groups;
1969 2013
1970 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2014 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
2024 group = ac->ac_g_ex.fe_group; 2068 group = ac->ac_g_ex.fe_group;
2025 2069
2026 for (i = 0; i < ngroups; group++, i++) { 2070 for (i = 0; i < ngroups; group++, i++) {
2027 struct ext4_group_info *grp;
2028 struct ext4_group_desc *desc;
2029
2030 if (group == ngroups) 2071 if (group == ngroups)
2031 group = 0; 2072 group = 0;
2032 2073
2033 /* quick check to skip empty groups */ 2074 /* This now checks without needing the buddy page */
2034 grp = ext4_get_group_info(sb, group); 2075 if (!ext4_mb_good_group(ac, group, cr))
2035 if (grp->bb_free == 0)
2036 continue; 2076 continue;
2037 2077
2038 err = ext4_mb_load_buddy(sb, group, &e4b); 2078 err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
2040 goto out; 2080 goto out;
2041 2081
2042 ext4_lock_group(sb, group); 2082 ext4_lock_group(sb, group);
2083
2084 /*
2085 * We need to check again after locking the
2086 * block group
2087 */
2043 if (!ext4_mb_good_group(ac, group, cr)) { 2088 if (!ext4_mb_good_group(ac, group, cr)) {
2044 /* someone did allocation from this group */
2045 ext4_unlock_group(sb, group); 2089 ext4_unlock_group(sb, group);
2046 ext4_mb_release_desc(&e4b); 2090 ext4_mb_unload_buddy(&e4b);
2047 continue; 2091 continue;
2048 } 2092 }
2049 2093
2050 ac->ac_groups_scanned++; 2094 ac->ac_groups_scanned++;
2051 desc = ext4_get_group_desc(sb, group, NULL);
2052 if (cr == 0) 2095 if (cr == 0)
2053 ext4_mb_simple_scan_group(ac, &e4b); 2096 ext4_mb_simple_scan_group(ac, &e4b);
2054 else if (cr == 1 && 2097 else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
2058 ext4_mb_complex_scan_group(ac, &e4b); 2101 ext4_mb_complex_scan_group(ac, &e4b);
2059 2102
2060 ext4_unlock_group(sb, group); 2103 ext4_unlock_group(sb, group);
2061 ext4_mb_release_desc(&e4b); 2104 ext4_mb_unload_buddy(&e4b);
2062 2105
2063 if (ac->ac_status != AC_STATUS_CONTINUE) 2106 if (ac->ac_status != AC_STATUS_CONTINUE)
2064 break; 2107 break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2148 ext4_lock_group(sb, group); 2191 ext4_lock_group(sb, group);
2149 memcpy(&sg, ext4_get_group_info(sb, group), i); 2192 memcpy(&sg, ext4_get_group_info(sb, group), i);
2150 ext4_unlock_group(sb, group); 2193 ext4_unlock_group(sb, group);
2151 ext4_mb_release_desc(&e4b); 2194 ext4_mb_unload_buddy(&e4b);
2152 2195
2153 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2196 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2154 sg.info.bb_fragments, sg.info.bb_first_free); 2197 sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2255 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2298 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2256 init_rwsem(&meta_group_info[i]->alloc_sem); 2299 init_rwsem(&meta_group_info[i]->alloc_sem);
2257 meta_group_info[i]->bb_free_root = RB_ROOT; 2300 meta_group_info[i]->bb_free_root = RB_ROOT;
2301 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2258 2302
2259#ifdef DOUBLE_CHECK 2303#ifdef DOUBLE_CHECK
2260 { 2304 {
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2536 entry->count, entry->group, entry); 2580 entry->count, entry->group, entry);
2537 2581
2538 if (test_opt(sb, DISCARD)) { 2582 if (test_opt(sb, DISCARD)) {
2583 int ret;
2539 ext4_fsblk_t discard_block; 2584 ext4_fsblk_t discard_block;
2540 2585
2541 discard_block = entry->start_blk + 2586 discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2543 trace_ext4_discard_blocks(sb, 2588 trace_ext4_discard_blocks(sb,
2544 (unsigned long long)discard_block, 2589 (unsigned long long)discard_block,
2545 entry->count); 2590 entry->count);
2546 sb_issue_discard(sb, discard_block, entry->count); 2591 ret = sb_issue_discard(sb, discard_block, entry->count);
2592 if (ret == EOPNOTSUPP) {
2593 ext4_warning(sb,
2594 "discard not supported, disabling");
2595 clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
2596 }
2547 } 2597 }
2548 2598
2549 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2599 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2568 } 2618 }
2569 ext4_unlock_group(sb, entry->group); 2619 ext4_unlock_group(sb, entry->group);
2570 kmem_cache_free(ext4_free_ext_cachep, entry); 2620 kmem_cache_free(ext4_free_ext_cachep, entry);
2571 ext4_mb_release_desc(&e4b); 2621 ext4_mb_unload_buddy(&e4b);
2572 } 2622 }
2573 2623
2574 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2624 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
2641 2691
2642void exit_ext4_mballoc(void) 2692void exit_ext4_mballoc(void)
2643{ 2693{
2644 /* 2694 /*
2645 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2695 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2646 * before destroying the slab cache. 2696 * before destroying the slab cache.
2647 */ 2697 */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
2981 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3031 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
2982 atomic_inc(&sbi->s_bal_reqs); 3032 atomic_inc(&sbi->s_bal_reqs);
2983 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3033 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
2984 if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len) 3034 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
2985 atomic_inc(&sbi->s_bal_success); 3035 atomic_inc(&sbi->s_bal_success);
2986 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3036 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
2987 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3037 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3123 continue; 3173 continue;
3124 3174
3125 /* non-extent files can't have physical blocks past 2^32 */ 3175 /* non-extent files can't have physical blocks past 2^32 */
3126 if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && 3176 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3127 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3177 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3128 continue; 3178 continue;
3129 3179
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3280 spin_unlock(&pa->pa_lock); 3330 spin_unlock(&pa->pa_lock);
3281 3331
3282 grp_blk = pa->pa_pstart; 3332 grp_blk = pa->pa_pstart;
3283 /* 3333 /*
3284 * If doing group-based preallocation, pa_pstart may be in the 3334 * If doing group-based preallocation, pa_pstart may be in the
3285 * next group when pa is used up 3335 * next group when pa is used up
3286 */ 3336 */
@@ -3697,7 +3747,7 @@ out:
3697 ext4_unlock_group(sb, group); 3747 ext4_unlock_group(sb, group);
3698 if (ac) 3748 if (ac)
3699 kmem_cache_free(ext4_ac_cachep, ac); 3749 kmem_cache_free(ext4_ac_cachep, ac);
3700 ext4_mb_release_desc(&e4b); 3750 ext4_mb_unload_buddy(&e4b);
3701 put_bh(bitmap_bh); 3751 put_bh(bitmap_bh);
3702 return free; 3752 return free;
3703} 3753}
@@ -3801,7 +3851,7 @@ repeat:
3801 if (bitmap_bh == NULL) { 3851 if (bitmap_bh == NULL) {
3802 ext4_error(sb, "Error reading block bitmap for %u", 3852 ext4_error(sb, "Error reading block bitmap for %u",
3803 group); 3853 group);
3804 ext4_mb_release_desc(&e4b); 3854 ext4_mb_unload_buddy(&e4b);
3805 continue; 3855 continue;
3806 } 3856 }
3807 3857
@@ -3810,7 +3860,7 @@ repeat:
3810 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); 3860 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
3811 ext4_unlock_group(sb, group); 3861 ext4_unlock_group(sb, group);
3812 3862
3813 ext4_mb_release_desc(&e4b); 3863 ext4_mb_unload_buddy(&e4b);
3814 put_bh(bitmap_bh); 3864 put_bh(bitmap_bh);
3815 3865
3816 list_del(&pa->u.pa_tmp_list); 3866 list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
4074 ext4_mb_release_group_pa(&e4b, pa, ac); 4124 ext4_mb_release_group_pa(&e4b, pa, ac);
4075 ext4_unlock_group(sb, group); 4125 ext4_unlock_group(sb, group);
4076 4126
4077 ext4_mb_release_desc(&e4b); 4127 ext4_mb_unload_buddy(&e4b);
4078 list_del(&pa->u.pa_tmp_list); 4128 list_del(&pa->u.pa_tmp_list);
4079 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4129 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4080 } 4130 }
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4484 if (!bh) 4534 if (!bh)
4485 tbh = sb_find_get_block(inode->i_sb, 4535 tbh = sb_find_get_block(inode->i_sb,
4486 block + i); 4536 block + i);
4487 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4537 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4488 inode, tbh, block + i); 4538 inode, tbh, block + i);
4489 } 4539 }
4490 } 4540 }
4491 4541
4492 /* 4542 /*
4493 * We need to make sure we don't reuse the freed block until 4543 * We need to make sure we don't reuse the freed block until
4494 * after the transaction is committed, which we can do by 4544 * after the transaction is committed, which we can do by
4495 * treating the block as metadata, below. We make an 4545 * treating the block as metadata, below. We make an
@@ -4610,7 +4660,7 @@ do_more:
4610 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4660 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4611 } 4661 }
4612 4662
4613 ext4_mb_release_desc(&e4b); 4663 ext4_mb_unload_buddy(&e4b);
4614 4664
4615 freed += count; 4665 freed += count;
4616 4666
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
475 */ 475 */
476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, 476 if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
477 EXT4_FEATURE_INCOMPAT_EXTENTS) || 477 EXT4_FEATURE_INCOMPAT_EXTENTS) ||
478 (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 478 (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
479 return -EINVAL; 479 return -EINVAL;
480 480
481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) 481 if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..52abfa12762a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
482 int depth = ext_depth(orig_inode); 482 int depth = ext_depth(orig_inode);
483 int ret; 483 int ret;
484 484
485 start_ext.ee_block = end_ext.ee_block = 0;
485 o_start = o_end = oext = orig_path[depth].p_ext; 486 o_start = o_end = oext = orig_path[depth].p_ext;
486 oext_alen = ext4_ext_get_actual_len(oext); 487 oext_alen = ext4_ext_get_actual_len(oext);
487 start_ext.ee_len = end_ext.ee_len = 0; 488 start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
529 * new_ext |-------| 530 * new_ext |-------|
530 */ 531 */
531 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { 532 if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
532 ext4_error(orig_inode->i_sb, 533 EXT4_ERROR_INODE(orig_inode,
533 "new_ext_end(%u) should be less than or equal to " 534 "new_ext_end(%u) should be less than or equal to "
534 "oext->ee_block(%u) + oext_alen(%d) - 1", 535 "oext->ee_block(%u) + oext_alen(%d) - 1",
535 new_ext_end, le32_to_cpu(oext->ee_block), 536 new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
692 while (1) { 693 while (1) {
693 /* The extent for donor must be found. */ 694 /* The extent for donor must be found. */
694 if (!dext) { 695 if (!dext) {
695 ext4_error(donor_inode->i_sb, 696 EXT4_ERROR_INODE(donor_inode,
696 "The extent for donor must be found"); 697 "The extent for donor must be found");
697 *err = -EIO; 698 *err = -EIO;
698 goto out; 699 goto out;
699 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 700 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
700 ext4_error(donor_inode->i_sb, 701 EXT4_ERROR_INODE(donor_inode,
701 "Donor offset(%u) and the first block of donor " 702 "Donor offset(%u) and the first block of donor "
702 "extent(%u) should be equal", 703 "extent(%u) should be equal",
703 donor_off, 704 donor_off,
@@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
959 return -EINVAL; 960 return -EINVAL;
960 } 961 }
961 962
963 if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
964 return -EPERM;
965
962 /* Ext4 move extent does not support swapfile */ 966 /* Ext4 move extent does not support swapfile */
963 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { 967 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
964 ext4_debug("ext4 move extent: The argument files should " 968 ext4_debug("ext4 move extent: The argument files should "
@@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
976 } 980 }
977 981
978 /* Ext4 move extent supports only extent based file */ 982 /* Ext4 move extent supports only extent based file */
979 if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) { 983 if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
980 ext4_debug("ext4 move extent: orig file is not extents " 984 ext4_debug("ext4 move extent: orig file is not extents "
981 "based file [ino:orig %lu]\n", orig_inode->i_ino); 985 "based file [ino:orig %lu]\n", orig_inode->i_ino);
982 return -EOPNOTSUPP; 986 return -EOPNOTSUPP;
983 } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) { 987 } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
984 ext4_debug("ext4 move extent: donor file is not extents " 988 ext4_debug("ext4 move extent: donor file is not extents "
985 "based file [ino:donor %lu]\n", donor_inode->i_ino); 989 "based file [ino:donor %lu]\n", donor_inode->i_ino);
986 return -EOPNOTSUPP; 990 return -EOPNOTSUPP;
@@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1354 if (ret1 < 0) 1358 if (ret1 < 0)
1355 break; 1359 break;
1356 if (*moved_len > len) { 1360 if (*moved_len > len) {
1357 ext4_error(orig_inode->i_sb, 1361 EXT4_ERROR_INODE(orig_inode,
1358 "We replaced blocks too much! " 1362 "We replaced blocks too much! "
1359 "sum of replaced: %llu requested: %llu", 1363 "sum of replaced: %llu requested: %llu",
1360 *moved_len, len); 1364 *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
187 return blocksize; 187 return blocksize;
188 return (len & 65532) | ((len & 3) << 16); 188 return (len & 65532) | ((len & 3) << 16);
189} 189}
190 190
191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) 191__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
192{ 192{
193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) 193 if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
197 if (len == blocksize) { 197 if (len == blocksize) {
198 if (blocksize == 65536) 198 if (blocksize == 65536)
199 return cpu_to_le16(EXT4_MAX_REC_LEN); 199 return cpu_to_le16(EXT4_MAX_REC_LEN);
200 else 200 else
201 return cpu_to_le16(0); 201 return cpu_to_le16(0);
202 } 202 }
203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); 203 return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
349 brelse(bh); 349 brelse(bh);
350 } 350 }
351 if (bcount) 351 if (bcount)
352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 352 printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
353 levels ? "" : " ", names, space/bcount, 353 levels ? "" : " ", names, space/bcount,
354 (space/bcount)*100/blocksize); 354 (space/bcount)*100/blocksize);
355 return (struct stats) { names, space, bcount}; 355 return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
653 int ret, err; 653 int ret, err;
654 __u32 hashval; 654 __u32 hashval;
655 655
656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 656 dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
657 start_hash, start_minor_hash)); 657 start_hash, start_minor_hash));
658 dir = dir_file->f_path.dentry->d_inode; 658 dir = dir_file->f_path.dentry->d_inode;
659 if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { 659 if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; 660 hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
661 if (hinfo.hash_version <= DX_HASH_TEA) 661 if (hinfo.hash_version <= DX_HASH_TEA)
662 hinfo.hash_version += 662 hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
801{ 801{
802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 802 if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
803 EXT4_FEATURE_COMPAT_DIR_INDEX)) 803 EXT4_FEATURE_COMPAT_DIR_INDEX))
804 EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL; 804 ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
805} 805}
806 806
807/* 807/*
@@ -943,8 +943,8 @@ restart:
943 wait_on_buffer(bh); 943 wait_on_buffer(bh);
944 if (!buffer_uptodate(bh)) { 944 if (!buffer_uptodate(bh)) {
945 /* read error, skip block & hope for the best */ 945 /* read error, skip block & hope for the best */
946 ext4_error(sb, "reading directory #%lu offset %lu", 946 EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
947 dir->i_ino, (unsigned long)block); 947 (unsigned long) block);
948 brelse(bh); 948 brelse(bh);
949 goto next; 949 goto next;
950 } 950 }
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
1066 __u32 ino = le32_to_cpu(de->inode); 1066 __u32 ino = le32_to_cpu(de->inode);
1067 brelse(bh); 1067 brelse(bh);
1068 if (!ext4_valid_inum(dir->i_sb, ino)) { 1068 if (!ext4_valid_inum(dir->i_sb, ino)) {
1069 ext4_error(dir->i_sb, "bad inode number: %u", ino); 1069 EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
1070 return ERR_PTR(-EIO); 1070 return ERR_PTR(-EIO);
1071 } 1071 }
1072 inode = ext4_iget(dir->i_sb, ino); 1072 inode = ext4_iget(dir->i_sb, ino);
1073 if (unlikely(IS_ERR(inode))) { 1073 if (unlikely(IS_ERR(inode))) {
1074 if (PTR_ERR(inode) == -ESTALE) { 1074 if (PTR_ERR(inode) == -ESTALE) {
1075 ext4_error(dir->i_sb, 1075 EXT4_ERROR_INODE(dir,
1076 "deleted inode referenced: %u", 1076 "deleted inode referenced: %u",
1077 ino); 1077 ino);
1078 return ERR_PTR(-EIO); 1078 return ERR_PTR(-EIO);
1079 } else { 1079 } else {
1080 return ERR_CAST(inode); 1080 return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
1104 brelse(bh); 1104 brelse(bh);
1105 1105
1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { 1106 if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
1107 ext4_error(child->d_inode->i_sb, 1107 EXT4_ERROR_INODE(child->d_inode,
1108 "bad inode number: %u", ino); 1108 "bad parent inode number: %u", ino);
1109 return ERR_PTR(-EIO); 1109 return ERR_PTR(-EIO);
1110 } 1110 }
1111 1111
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
1141 unsigned rec_len = 0; 1141 unsigned rec_len = 0;
1142 1142
1143 while (count--) { 1143 while (count--) {
1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 1144 struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
1145 (from + (map->offs<<2)); 1145 (from + (map->offs<<2));
1146 rec_len = EXT4_DIR_REC_LEN(de->name_len); 1146 rec_len = EXT4_DIR_REC_LEN(de->name_len);
1147 memcpy (to, de, rec_len); 1147 memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1404 de = (struct ext4_dir_entry_2 *)((char *)fde + 1404 de = (struct ext4_dir_entry_2 *)((char *)fde +
1405 ext4_rec_len_from_disk(fde->rec_len, blocksize)); 1405 ext4_rec_len_from_disk(fde->rec_len, blocksize));
1406 if ((char *) de >= (((char *) root) + blocksize)) { 1406 if ((char *) de >= (((char *) root) + blocksize)) {
1407 ext4_error(dir->i_sb, 1407 EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
1408 "invalid rec_len for '..' in inode %lu",
1409 dir->i_ino);
1410 brelse(bh); 1408 brelse(bh);
1411 return -EIO; 1409 return -EIO;
1412 } 1410 }
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1418 brelse(bh); 1416 brelse(bh);
1419 return retval; 1417 return retval;
1420 } 1418 }
1421 EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; 1419 ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
1422 data1 = bh2->b_data; 1420 data1 = bh2->b_data;
1423 1421
1424 memcpy (data1, de, len); 1422 memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1491 retval = ext4_dx_add_entry(handle, dentry, inode); 1489 retval = ext4_dx_add_entry(handle, dentry, inode);
1492 if (!retval || (retval != ERR_BAD_DX_DIR)) 1490 if (!retval || (retval != ERR_BAD_DX_DIR))
1493 return retval; 1491 return retval;
1494 EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL; 1492 ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
1495 dx_fallback++; 1493 dx_fallback++;
1496 ext4_mark_inode_dirty(handle, dir); 1494 ext4_mark_inode_dirty(handle, dir);
1497 } 1495 }
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1519 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1517 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1520 retval = add_dirent_to_buf(handle, dentry, inode, de, bh); 1518 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1521 brelse(bh); 1519 brelse(bh);
1520 if (retval == 0)
1521 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
1522 return retval; 1522 return retval;
1523} 1523}
1524 1524
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || 1915 if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { 1916 !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
1917 if (err) 1917 if (err)
1918 ext4_error(inode->i_sb, 1918 EXT4_ERROR_INODE(inode,
1919 "error %d reading directory #%lu offset 0", 1919 "error %d reading directory lblock 0", err);
1920 err, inode->i_ino);
1921 else 1920 else
1922 ext4_warning(inode->i_sb, 1921 ext4_warning(inode->i_sb,
1923 "bad directory (dir #%lu) - no data block", 1922 "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
1941 de = ext4_next_entry(de1, sb->s_blocksize); 1940 de = ext4_next_entry(de1, sb->s_blocksize);
1942 while (offset < inode->i_size) { 1941 while (offset < inode->i_size) {
1943 if (!bh || 1942 if (!bh ||
1944 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { 1943 (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
1944 unsigned int lblock;
1945 err = 0; 1945 err = 0;
1946 brelse(bh); 1946 brelse(bh);
1947 bh = ext4_bread(NULL, inode, 1947 lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
1948 offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); 1948 bh = ext4_bread(NULL, inode, lblock, 0, &err);
1949 if (!bh) { 1949 if (!bh) {
1950 if (err) 1950 if (err)
1951 ext4_error(sb, 1951 EXT4_ERROR_INODE(inode,
1952 "error %d reading directory" 1952 "error %d reading directory "
1953 " #%lu offset %u", 1953 "lblock %u", err, lblock);
1954 err, inode->i_ino, offset);
1955 offset += sb->s_blocksize; 1954 offset += sb->s_blocksize;
1956 continue; 1955 continue;
1957 } 1956 }
@@ -2297,7 +2296,7 @@ retry:
2297 } 2296 }
2298 } else { 2297 } else {
2299 /* clear the extent format for fast symlink */ 2298 /* clear the extent format for fast symlink */
2300 EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; 2299 ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
2301 inode->i_op = &ext4_fast_symlink_inode_operations; 2300 inode->i_op = &ext4_fast_symlink_inode_operations;
2302 memcpy((char *)&EXT4_I(inode)->i_data, symname, l); 2301 memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
2303 inode->i_size = l-1; 2302 inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
911 percpu_counter_add(&sbi->s_freeinodes_counter, 911 percpu_counter_add(&sbi->s_freeinodes_counter,
912 EXT4_INODES_PER_GROUP(sb)); 912 EXT4_INODES_PER_GROUP(sb));
913 913
914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { 914 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
915 sbi->s_log_groups_per_flex) {
915 ext4_group_t flex_group; 916 ext4_group_t flex_group;
916 flex_group = ext4_flex_group(sbi, input->group); 917 flex_group = ext4_flex_group(sbi, input->group);
917 atomic_add(input->free_blocks_count, 918 atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 8d7539c9d778..e72d3235b2fd 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
241 if (sb->s_flags & MS_RDONLY) 241 if (sb->s_flags & MS_RDONLY)
242 return ERR_PTR(-EROFS); 242 return ERR_PTR(-EROFS);
243 243
244 vfs_check_frozen(sb, SB_FREEZE_WRITE);
244 /* Special case here: if the journal has aborted behind our 245 /* Special case here: if the journal has aborted behind our
245 * backs (eg. EIO in the commit thread), then we still need to 246 * backs (eg. EIO in the commit thread), then we still need to
246 * take the FS itself readonly cleanly. */ 247 * take the FS itself readonly cleanly. */
@@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
645 struct ext4_super_block *es = sbi->s_es; 646 struct ext4_super_block *es = sbi->s_es;
646 int i, err; 647 int i, err;
647 648
649 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
650
648 flush_workqueue(sbi->dio_unwritten_wq); 651 flush_workqueue(sbi->dio_unwritten_wq);
649 destroy_workqueue(sbi->dio_unwritten_wq); 652 destroy_workqueue(sbi->dio_unwritten_wq);
650 653
@@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
941 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0"); 944 seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
942 if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 945 if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
943 seq_puts(seq, ",journal_async_commit"); 946 seq_puts(seq, ",journal_async_commit");
947 else if (test_opt(sb, JOURNAL_CHECKSUM))
948 seq_puts(seq, ",journal_checksum");
944 if (test_opt(sb, NOBH)) 949 if (test_opt(sb, NOBH))
945 seq_puts(seq, ",nobh"); 950 seq_puts(seq, ",nobh");
946 if (test_opt(sb, I_VERSION)) 951 if (test_opt(sb, I_VERSION))
@@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
1059static int ext4_mark_dquot_dirty(struct dquot *dquot); 1064static int ext4_mark_dquot_dirty(struct dquot *dquot);
1060static int ext4_write_info(struct super_block *sb, int type); 1065static int ext4_write_info(struct super_block *sb, int type);
1061static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1066static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1062 char *path, int remount); 1067 char *path);
1063static int ext4_quota_on_mount(struct super_block *sb, int type); 1068static int ext4_quota_on_mount(struct super_block *sb, int type);
1064static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1069static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1065 size_t len, loff_t off); 1070 size_t len, loff_t off);
@@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
1081 1086
1082static const struct quotactl_ops ext4_qctl_operations = { 1087static const struct quotactl_ops ext4_qctl_operations = {
1083 .quota_on = ext4_quota_on, 1088 .quota_on = ext4_quota_on,
1084 .quota_off = vfs_quota_off, 1089 .quota_off = dquot_quota_off,
1085 .quota_sync = vfs_quota_sync, 1090 .quota_sync = dquot_quota_sync,
1086 .get_info = vfs_get_dqinfo, 1091 .get_info = dquot_get_dqinfo,
1087 .set_info = vfs_set_dqinfo, 1092 .set_info = dquot_set_dqinfo,
1088 .get_dqblk = vfs_get_dqblk, 1093 .get_dqblk = dquot_get_dqblk,
1089 .set_dqblk = vfs_set_dqblk 1094 .set_dqblk = dquot_set_dqblk
1090}; 1095};
1091#endif 1096#endif
1092 1097
@@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2051 /* Turn quotas off */ 2056 /* Turn quotas off */
2052 for (i = 0; i < MAXQUOTAS; i++) { 2057 for (i = 0; i < MAXQUOTAS; i++) {
2053 if (sb_dqopt(sb)->files[i]) 2058 if (sb_dqopt(sb)->files[i])
2054 vfs_quota_off(sb, i, 0); 2059 dquot_quota_off(sb, i);
2055 } 2060 }
2056#endif 2061#endif
2057 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 2062 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
2213struct ext4_attr { 2218struct ext4_attr {
2214 struct attribute attr; 2219 struct attribute attr;
2215 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2220 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2216 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2221 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2217 const char *, size_t); 2222 const char *, size_t);
2218 int offset; 2223 int offset;
2219}; 2224};
@@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2430 __releases(kernel_lock) 2435 __releases(kernel_lock)
2431 __acquires(kernel_lock) 2436 __acquires(kernel_lock)
2432{ 2437{
2438 char *orig_data = kstrdup(data, GFP_KERNEL);
2433 struct buffer_head *bh; 2439 struct buffer_head *bh;
2434 struct ext4_super_block *es = NULL; 2440 struct ext4_super_block *es = NULL;
2435 struct ext4_sb_info *sbi; 2441 struct ext4_sb_info *sbi;
@@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2793 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 2799 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
2794 spin_lock_init(&sbi->s_next_gen_lock); 2800 spin_lock_init(&sbi->s_next_gen_lock);
2795 2801
2796 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2797 ext4_count_free_blocks(sb));
2798 if (!err) {
2799 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2800 ext4_count_free_inodes(sb));
2801 }
2802 if (!err) {
2803 err = percpu_counter_init(&sbi->s_dirs_counter,
2804 ext4_count_dirs(sb));
2805 }
2806 if (!err) {
2807 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2808 }
2809 if (err) {
2810 ext4_msg(sb, KERN_ERR, "insufficient memory");
2811 goto failed_mount3;
2812 }
2813
2814 sbi->s_stripe = ext4_get_stripe_size(sbi); 2802 sbi->s_stripe = ext4_get_stripe_size(sbi);
2815 sbi->s_max_writeback_mb_bump = 128; 2803 sbi->s_max_writeback_mb_bump = 128;
2816 2804
@@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2910 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); 2898 set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
2911 2899
2912no_journal: 2900no_journal:
2901 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2902 ext4_count_free_blocks(sb));
2903 if (!err)
2904 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2905 ext4_count_free_inodes(sb));
2906 if (!err)
2907 err = percpu_counter_init(&sbi->s_dirs_counter,
2908 ext4_count_dirs(sb));
2909 if (!err)
2910 err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
2911 if (err) {
2912 ext4_msg(sb, KERN_ERR, "insufficient memory");
2913 goto failed_mount_wq;
2914 }
2913 if (test_opt(sb, NOBH)) { 2915 if (test_opt(sb, NOBH)) {
2914 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { 2916 if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
2915 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " 2917 ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3003,7 @@ no_journal:
3001 err = ext4_setup_system_zone(sb); 3003 err = ext4_setup_system_zone(sb);
3002 if (err) { 3004 if (err) {
3003 ext4_msg(sb, KERN_ERR, "failed to initialize system " 3005 ext4_msg(sb, KERN_ERR, "failed to initialize system "
3004 "zone (%d)\n", err); 3006 "zone (%d)", err);
3005 goto failed_mount4; 3007 goto failed_mount4;
3006 } 3008 }
3007 3009
@@ -3040,9 +3042,11 @@ no_journal:
3040 } else 3042 } else
3041 descr = "out journal"; 3043 descr = "out journal";
3042 3044
3043 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr); 3045 ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
3046 "Opts: %s", descr, orig_data);
3044 3047
3045 lock_kernel(); 3048 lock_kernel();
3049 kfree(orig_data);
3046 return 0; 3050 return 0;
3047 3051
3048cantfind_ext4: 3052cantfind_ext4:
@@ -3059,6 +3063,10 @@ failed_mount_wq:
3059 jbd2_journal_destroy(sbi->s_journal); 3063 jbd2_journal_destroy(sbi->s_journal);
3060 sbi->s_journal = NULL; 3064 sbi->s_journal = NULL;
3061 } 3065 }
3066 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3067 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3068 percpu_counter_destroy(&sbi->s_dirs_counter);
3069 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3062failed_mount3: 3070failed_mount3:
3063 if (sbi->s_flex_groups) { 3071 if (sbi->s_flex_groups) {
3064 if (is_vmalloc_addr(sbi->s_flex_groups)) 3072 if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3074,6 @@ failed_mount3:
3066 else 3074 else
3067 kfree(sbi->s_flex_groups); 3075 kfree(sbi->s_flex_groups);
3068 } 3076 }
3069 percpu_counter_destroy(&sbi->s_freeblocks_counter);
3070 percpu_counter_destroy(&sbi->s_freeinodes_counter);
3071 percpu_counter_destroy(&sbi->s_dirs_counter);
3072 percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
3073failed_mount2: 3077failed_mount2:
3074 for (i = 0; i < db_count; i++) 3078 for (i = 0; i < db_count; i++)
3075 brelse(sbi->s_group_desc[i]); 3079 brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3093,7 @@ out_fail:
3089 kfree(sbi->s_blockgroup_lock); 3093 kfree(sbi->s_blockgroup_lock);
3090 kfree(sbi); 3094 kfree(sbi);
3091 lock_kernel(); 3095 lock_kernel();
3096 kfree(orig_data);
3092 return ret; 3097 return ret;
3093} 3098}
3094 3099
@@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
3380 if (!(sb->s_flags & MS_RDONLY)) 3385 if (!(sb->s_flags & MS_RDONLY))
3381 es->s_wtime = cpu_to_le32(get_seconds()); 3386 es->s_wtime = cpu_to_le32(get_seconds());
3382 es->s_kbytes_written = 3387 es->s_kbytes_written =
3383 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 3388 cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
3384 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - 3389 ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
3385 EXT4_SB(sb)->s_sectors_written_start) >> 1)); 3390 EXT4_SB(sb)->s_sectors_written_start) >> 1));
3386 ext4_free_blocks_count_set(es, percpu_counter_sum_positive( 3391 ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
3485 return 0; 3490 return 0;
3486 3491
3487 journal = EXT4_SB(sb)->s_journal; 3492 journal = EXT4_SB(sb)->s_journal;
3488 if (journal) 3493 if (journal) {
3494 vfs_check_frozen(sb, SB_FREEZE_WRITE);
3489 ret = ext4_journal_force_commit(journal); 3495 ret = ext4_journal_force_commit(journal);
3496 }
3490 3497
3491 return ret; 3498 return ret;
3492} 3499}
@@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
3535 * the journal. 3542 * the journal.
3536 */ 3543 */
3537 error = jbd2_journal_flush(journal); 3544 error = jbd2_journal_flush(journal);
3538 if (error < 0) { 3545 if (error < 0)
3539 out: 3546 goto out;
3540 jbd2_journal_unlock_updates(journal);
3541 return error;
3542 }
3543 3547
3544 /* Journal blocked and flushed, clear needs_recovery flag. */ 3548 /* Journal blocked and flushed, clear needs_recovery flag. */
3545 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3549 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3546 error = ext4_commit_super(sb, 1); 3550 error = ext4_commit_super(sb, 1);
3547 if (error) 3551out:
3548 goto out; 3552 /* we rely on s_frozen to stop further updates */
3549 return 0; 3553 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3554 return error;
3550} 3555}
3551 3556
3552/* 3557/*
@@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
3563 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 3568 EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
3564 ext4_commit_super(sb, 1); 3569 ext4_commit_super(sb, 1);
3565 unlock_super(sb); 3570 unlock_super(sb);
3566 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
3567 return 0; 3571 return 0;
3568} 3572}
3569 3573
@@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3574 ext4_fsblk_t n_blocks_count = 0; 3578 ext4_fsblk_t n_blocks_count = 0;
3575 unsigned long old_sb_flags; 3579 unsigned long old_sb_flags;
3576 struct ext4_mount_options old_opts; 3580 struct ext4_mount_options old_opts;
3581 int enable_quota = 0;
3577 ext4_group_t g; 3582 ext4_group_t g;
3578 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; 3583 unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
3579 int err; 3584 int err;
3580#ifdef CONFIG_QUOTA 3585#ifdef CONFIG_QUOTA
3581 int i; 3586 int i;
3582#endif 3587#endif
3588 char *orig_data = kstrdup(data, GFP_KERNEL);
3583 3589
3584 lock_kernel(); 3590 lock_kernel();
3585 3591
@@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3630 } 3636 }
3631 3637
3632 if (*flags & MS_RDONLY) { 3638 if (*flags & MS_RDONLY) {
3639 err = dquot_suspend(sb, -1);
3640 if (err < 0)
3641 goto restore_opts;
3642
3633 /* 3643 /*
3634 * First of all, the unconditional stuff we have to do 3644 * First of all, the unconditional stuff we have to do
3635 * to disable replay of the journal when we next remount 3645 * to disable replay of the journal when we next remount
@@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3698 goto restore_opts; 3708 goto restore_opts;
3699 if (!ext4_setup_super(sb, es, 0)) 3709 if (!ext4_setup_super(sb, es, 0))
3700 sb->s_flags &= ~MS_RDONLY; 3710 sb->s_flags &= ~MS_RDONLY;
3711 enable_quota = 1;
3701 } 3712 }
3702 } 3713 }
3703 ext4_setup_system_zone(sb); 3714 ext4_setup_system_zone(sb);
@@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
3713#endif 3724#endif
3714 unlock_super(sb); 3725 unlock_super(sb);
3715 unlock_kernel(); 3726 unlock_kernel();
3727 if (enable_quota)
3728 dquot_resume(sb, -1);
3729
3730 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
3731 kfree(orig_data);
3716 return 0; 3732 return 0;
3717 3733
3718restore_opts: 3734restore_opts:
@@ -3734,6 +3750,7 @@ restore_opts:
3734#endif 3750#endif
3735 unlock_super(sb); 3751 unlock_super(sb);
3736 unlock_kernel(); 3752 unlock_kernel();
3753 kfree(orig_data);
3737 return err; 3754 return err;
3738} 3755}
3739 3756
@@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
3906 */ 3923 */
3907static int ext4_quota_on_mount(struct super_block *sb, int type) 3924static int ext4_quota_on_mount(struct super_block *sb, int type)
3908{ 3925{
3909 return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type], 3926 return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
3910 EXT4_SB(sb)->s_jquota_fmt, type); 3927 EXT4_SB(sb)->s_jquota_fmt, type);
3911} 3928}
3912 3929
3913/* 3930/*
3914 * Standard function to be called on quota_on 3931 * Standard function to be called on quota_on
3915 */ 3932 */
3916static int ext4_quota_on(struct super_block *sb, int type, int format_id, 3933static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3917 char *name, int remount) 3934 char *name)
3918{ 3935{
3919 int err; 3936 int err;
3920 struct path path; 3937 struct path path;
3921 3938
3922 if (!test_opt(sb, QUOTA)) 3939 if (!test_opt(sb, QUOTA))
3923 return -EINVAL; 3940 return -EINVAL;
3924 /* When remounting, no checks are needed and in fact, name is NULL */
3925 if (remount)
3926 return vfs_quota_on(sb, type, format_id, name, remount);
3927 3941
3928 err = kern_path(name, LOOKUP_FOLLOW, &path); 3942 err = kern_path(name, LOOKUP_FOLLOW, &path);
3929 if (err) 3943 if (err)
@@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
3962 } 3976 }
3963 } 3977 }
3964 3978
3965 err = vfs_quota_on_path(sb, type, format_id, &path); 3979 err = dquot_quota_on_path(sb, type, format_id, &path);
3966 path_put(&path); 3980 path_put(&path);
3967 return err; 3981 return err;
3968} 3982}
@@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void)
4141{ 4155{
4142 int err; 4156 int err;
4143 4157
4158 ext4_check_flag_values();
4144 err = init_ext4_system_zone(); 4159 err = init_ext4_system_zone();
4145 if (err) 4160 if (err)
4146 return err; 4161 return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
34 .readlink = generic_readlink, 34 .readlink = generic_readlink,
35 .follow_link = page_follow_link_light, 35 .follow_link = page_follow_link_light,
36 .put_link = page_put_link, 36 .put_link = page_put_link,
37 .setattr = ext4_setattr,
37#ifdef CONFIG_EXT4_FS_XATTR 38#ifdef CONFIG_EXT4_FS_XATTR
38 .setxattr = generic_setxattr, 39 .setxattr = generic_setxattr,
39 .getxattr = generic_getxattr, 40 .getxattr = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
45const struct inode_operations ext4_fast_symlink_inode_operations = { 46const struct inode_operations ext4_fast_symlink_inode_operations = {
46 .readlink = generic_readlink, 47 .readlink = generic_readlink,
47 .follow_link = ext4_follow_link, 48 .follow_link = ext4_follow_link,
49 .setattr = ext4_setattr,
48#ifdef CONFIG_EXT4_FS_XATTR 50#ifdef CONFIG_EXT4_FS_XATTR
49 .setxattr = generic_setxattr, 51 .setxattr = generic_setxattr,
50 .getxattr = generic_getxattr, 52 .getxattr = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index b4c5aa8489d8..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -97,7 +97,7 @@ static int ext4_xattr_list(struct dentry *dentry, char *buffer,
97 97
98static struct mb_cache *ext4_xattr_cache; 98static struct mb_cache *ext4_xattr_cache;
99 99
100static struct xattr_handler *ext4_xattr_handler_map[] = { 100static const struct xattr_handler *ext4_xattr_handler_map[] = {
101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, 101 [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler,
102#ifdef CONFIG_EXT4_FS_POSIX_ACL 102#ifdef CONFIG_EXT4_FS_POSIX_ACL
103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, 103 [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler,
@@ -109,7 +109,7 @@ static struct xattr_handler *ext4_xattr_handler_map[] = {
109#endif 109#endif
110}; 110};
111 111
112struct xattr_handler *ext4_xattr_handlers[] = { 112const struct xattr_handler *ext4_xattr_handlers[] = {
113 &ext4_xattr_user_handler, 113 &ext4_xattr_user_handler,
114 &ext4_xattr_trusted_handler, 114 &ext4_xattr_trusted_handler,
115#ifdef CONFIG_EXT4_FS_POSIX_ACL 115#ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -122,10 +122,10 @@ struct xattr_handler *ext4_xattr_handlers[] = {
122 NULL 122 NULL
123}; 123};
124 124
125static inline struct xattr_handler * 125static inline const struct xattr_handler *
126ext4_xattr_handler(int name_index) 126ext4_xattr_handler(int name_index)
127{ 127{
128 struct xattr_handler *handler = NULL; 128 const struct xattr_handler *handler = NULL;
129 129
130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) 130 if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map))
131 handler = ext4_xattr_handler_map[name_index]; 131 handler = ext4_xattr_handler_map[name_index];
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 228 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
229 if (ext4_xattr_check_block(bh)) { 229 if (ext4_xattr_check_block(bh)) {
230bad_block: 230bad_block:
231 ext4_error(inode->i_sb, 231 EXT4_ERROR_INODE(inode, "bad block %llu",
232 "inode %lu: bad block %llu", inode->i_ino, 232 EXT4_I(inode)->i_file_acl);
233 EXT4_I(inode)->i_file_acl);
234 error = -EIO; 233 error = -EIO;
235 goto cleanup; 234 goto cleanup;
236 } 235 }
@@ -332,7 +331,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
332 size_t rest = buffer_size; 331 size_t rest = buffer_size;
333 332
334 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { 333 for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
335 struct xattr_handler *handler = 334 const struct xattr_handler *handler =
336 ext4_xattr_handler(entry->e_name_index); 335 ext4_xattr_handler(entry->e_name_index);
337 336
338 if (handler) { 337 if (handler) {
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
372 ea_bdebug(bh, "b_count=%d, refcount=%d", 371 ea_bdebug(bh, "b_count=%d, refcount=%d",
373 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); 372 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
374 if (ext4_xattr_check_block(bh)) { 373 if (ext4_xattr_check_block(bh)) {
375 ext4_error(inode->i_sb, 374 EXT4_ERROR_INODE(inode, "bad block %llu",
376 "inode %lu: bad block %llu", inode->i_ino, 375 EXT4_I(inode)->i_file_acl);
377 EXT4_I(inode)->i_file_acl);
378 error = -EIO; 376 error = -EIO;
379 goto cleanup; 377 goto cleanup;
380 } 378 }
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
666 atomic_read(&(bs->bh->b_count)), 664 atomic_read(&(bs->bh->b_count)),
667 le32_to_cpu(BHDR(bs->bh)->h_refcount)); 665 le32_to_cpu(BHDR(bs->bh)->h_refcount));
668 if (ext4_xattr_check_block(bs->bh)) { 666 if (ext4_xattr_check_block(bs->bh)) {
669 ext4_error(sb, "inode %lu: bad block %llu", 667 EXT4_ERROR_INODE(inode, "bad block %llu",
670 inode->i_ino, EXT4_I(inode)->i_file_acl); 668 EXT4_I(inode)->i_file_acl);
671 error = -EIO; 669 error = -EIO;
672 goto cleanup; 670 goto cleanup;
673 } 671 }
@@ -820,7 +818,7 @@ inserted:
820 EXT4_I(inode)->i_block_group); 818 EXT4_I(inode)->i_block_group);
821 819
822 /* non-extent files can't have physical blocks past 2^32 */ 820 /* non-extent files can't have physical blocks past 2^32 */
823 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 821 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
824 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; 822 goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
825 823
826 block = ext4_new_meta_blocks(handle, inode, 824 block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
828 if (error) 826 if (error)
829 goto cleanup; 827 goto cleanup;
830 828
831 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 829 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
832 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); 830 BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
833 831
834 ea_idebug(inode, "creating block %d", block); 832 ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
880 goto cleanup; 878 goto cleanup;
881 879
882bad_block: 880bad_block:
883 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 881 EXT4_ERROR_INODE(inode, "bad block %llu",
884 inode->i_ino, EXT4_I(inode)->i_file_acl); 882 EXT4_I(inode)->i_file_acl);
885 goto cleanup; 883 goto cleanup;
886 884
887#undef header 885#undef header
@@ -1194,8 +1192,8 @@ retry:
1194 if (!bh) 1192 if (!bh)
1195 goto cleanup; 1193 goto cleanup;
1196 if (ext4_xattr_check_block(bh)) { 1194 if (ext4_xattr_check_block(bh)) {
1197 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1195 EXT4_ERROR_INODE(inode, "bad block %llu",
1198 inode->i_ino, EXT4_I(inode)->i_file_acl); 1196 EXT4_I(inode)->i_file_acl);
1199 error = -EIO; 1197 error = -EIO;
1200 goto cleanup; 1198 goto cleanup;
1201 } 1199 }
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
1372 goto cleanup; 1370 goto cleanup;
1373 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); 1371 bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
1374 if (!bh) { 1372 if (!bh) {
1375 ext4_error(inode->i_sb, "inode %lu: block %llu read error", 1373 EXT4_ERROR_INODE(inode, "block %llu read error",
1376 inode->i_ino, EXT4_I(inode)->i_file_acl); 1374 EXT4_I(inode)->i_file_acl);
1377 goto cleanup; 1375 goto cleanup;
1378 } 1376 }
1379 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || 1377 if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
1380 BHDR(bh)->h_blocks != cpu_to_le32(1)) { 1378 BHDR(bh)->h_blocks != cpu_to_le32(1)) {
1381 ext4_error(inode->i_sb, "inode %lu: bad block %llu", 1379 EXT4_ERROR_INODE(inode, "bad block %llu",
1382 inode->i_ino, EXT4_I(inode)->i_file_acl); 1380 EXT4_I(inode)->i_file_acl);
1383 goto cleanup; 1381 goto cleanup;
1384 } 1382 }
1385 ext4_xattr_release_block(handle, inode, bh); 1383 ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
1504 } 1502 }
1505 bh = sb_bread(inode->i_sb, ce->e_block); 1503 bh = sb_bread(inode->i_sb, ce->e_block);
1506 if (!bh) { 1504 if (!bh) {
1507 ext4_error(inode->i_sb, 1505 EXT4_ERROR_INODE(inode, "block %lu read error",
1508 "inode %lu: block %lu read error", 1506 (unsigned long) ce->e_block);
1509 inode->i_ino, (unsigned long) ce->e_block);
1510 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= 1507 } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
1511 EXT4_XATTR_REFCOUNT_MAX) { 1508 EXT4_XATTR_REFCOUNT_MAX) {
1512 ea_idebug(inode, "block %lu refcount %d>=%d", 1509 ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 8ede88b18c29..518e96e43905 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -65,11 +65,11 @@ struct ext4_xattr_entry {
65 65
66# ifdef CONFIG_EXT4_FS_XATTR 66# ifdef CONFIG_EXT4_FS_XATTR
67 67
68extern struct xattr_handler ext4_xattr_user_handler; 68extern const struct xattr_handler ext4_xattr_user_handler;
69extern struct xattr_handler ext4_xattr_trusted_handler; 69extern const struct xattr_handler ext4_xattr_trusted_handler;
70extern struct xattr_handler ext4_xattr_acl_access_handler; 70extern const struct xattr_handler ext4_xattr_acl_access_handler;
71extern struct xattr_handler ext4_xattr_acl_default_handler; 71extern const struct xattr_handler ext4_xattr_acl_default_handler;
72extern struct xattr_handler ext4_xattr_security_handler; 72extern const struct xattr_handler ext4_xattr_security_handler;
73 73
74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); 74extern ssize_t ext4_listxattr(struct dentry *, char *, size_t);
75 75
@@ -86,7 +86,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
86extern int init_ext4_xattr(void); 86extern int init_ext4_xattr(void);
87extern void exit_ext4_xattr(void); 87extern void exit_ext4_xattr(void);
88 88
89extern struct xattr_handler *ext4_xattr_handlers[]; 89extern const struct xattr_handler *ext4_xattr_handlers[];
90 90
91# else /* CONFIG_EXT4_FS_XATTR */ 91# else /* CONFIG_EXT4_FS_XATTR */
92 92
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 8b145e98df07..9b21268e121c 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -69,7 +69,7 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir)
69 return err; 69 return err;
70} 70}
71 71
72struct xattr_handler ext4_xattr_security_handler = { 72const struct xattr_handler ext4_xattr_security_handler = {
73 .prefix = XATTR_SECURITY_PREFIX, 73 .prefix = XATTR_SECURITY_PREFIX,
74 .list = ext4_xattr_security_list, 74 .list = ext4_xattr_security_list,
75 .get = ext4_xattr_security_get, 75 .get = ext4_xattr_security_get,
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 15b50edc6587..37e6ebca2cc3 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -51,7 +51,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
51 name, value, size, flags); 51 name, value, size, flags);
52} 52}
53 53
54struct xattr_handler ext4_xattr_trusted_handler = { 54const struct xattr_handler ext4_xattr_trusted_handler = {
55 .prefix = XATTR_TRUSTED_PREFIX, 55 .prefix = XATTR_TRUSTED_PREFIX,
56 .list = ext4_xattr_trusted_list, 56 .list = ext4_xattr_trusted_list,
57 .get = ext4_xattr_trusted_get, 57 .get = ext4_xattr_trusted_get,
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index c4ce05746ce1..98c375352d0e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -54,7 +54,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name,
54 name, value, size, flags); 54 name, value, size, flags);
55} 55}
56 56
57struct xattr_handler ext4_xattr_user_handler = { 57const struct xattr_handler ext4_xattr_user_handler = {
58 .prefix = XATTR_USER_PREFIX, 58 .prefix = XATTR_USER_PREFIX,
59 .list = ext4_xattr_user_list, 59 .list = ext4_xattr_user_list,
60 .get = ext4_xattr_user_get, 60 .get = ext4_xattr_user_get,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
242 while (*fclus < cluster) { 242 while (*fclus < cluster) {
243 /* prevent the infinite loop of cluster chain */ 243 /* prevent the infinite loop of cluster chain */
244 if (*fclus > limit) { 244 if (*fclus > limit) {
245 fat_fs_error(sb, "%s: detected the cluster chain loop" 245 fat_fs_error_ratelimit(sb,
246 " (i_pos %lld)", __func__, 246 "%s: detected the cluster chain loop"
247 MSDOS_I(inode)->i_pos); 247 " (i_pos %lld)", __func__,
248 MSDOS_I(inode)->i_pos);
248 nr = -EIO; 249 nr = -EIO;
249 goto out; 250 goto out;
250 } 251 }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
253 if (nr < 0) 254 if (nr < 0)
254 goto out; 255 goto out;
255 else if (nr == FAT_ENT_FREE) { 256 else if (nr == FAT_ENT_FREE) {
256 fat_fs_error(sb, "%s: invalid cluster chain" 257 fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
257 " (i_pos %lld)", __func__, 258 " (i_pos %lld)", __func__,
258 MSDOS_I(inode)->i_pos); 259 MSDOS_I(inode)->i_pos);
259 nr = -EIO; 260 nr = -EIO;
260 goto out; 261 goto out;
261 } else if (nr == FAT_ENT_EOF) { 262 } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
19#include <linux/buffer_head.h> 19#include <linux/buffer_head.h>
20#include <linux/compat.h> 20#include <linux/compat.h>
21#include <asm/uaccess.h> 21#include <asm/uaccess.h>
22#include <linux/kernel.h>
22#include "fat.h" 23#include "fat.h"
23 24
24/* 25/*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
140{ 141{
141 const wchar_t *ip; 142 const wchar_t *ip;
142 wchar_t ec; 143 wchar_t ec;
143 unsigned char *op, nc; 144 unsigned char *op;
144 int charlen; 145 int charlen;
145 int k;
146 146
147 ip = uni; 147 ip = uni;
148 op = ascii; 148 op = ascii;
149 149
150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { 150 while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
151 ec = *ip++; 151 ec = *ip++;
152 if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { 152 if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
153 op += charlen; 153 op += charlen;
154 len -= charlen; 154 len -= charlen;
155 } else { 155 } else {
156 if (uni_xlate == 1) { 156 if (uni_xlate == 1) {
157 *op = ':'; 157 *op++ = ':';
158 for (k = 4; k > 0; k--) { 158 op = pack_hex_byte(op, ec >> 8);
159 nc = ec & 0xF; 159 op = pack_hex_byte(op, ec);
160 op[k] = nc > 9 ? nc + ('a' - 10)
161 : nc + '0';
162 ec >>= 4;
163 }
164 op += 5;
165 len -= 5; 160 len -= 5;
166 } else { 161 } else {
167 *op++ = '?'; 162 *op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
758 return ret; 753 return ret;
759} 754}
760 755
761static int fat_dir_ioctl(struct inode *inode, struct file *filp, 756static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
762 unsigned int cmd, unsigned long arg) 757 unsigned long arg)
763{ 758{
759 struct inode *inode = filp->f_path.dentry->d_inode;
764 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; 760 struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
765 int short_only, both; 761 int short_only, both;
766 762
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
774 both = 1; 770 both = 1;
775 break; 771 break;
776 default: 772 default:
777 return fat_generic_ioctl(inode, filp, cmd, arg); 773 return fat_generic_ioctl(filp, cmd, arg);
778 } 774 }
779 775
780 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2]))) 776 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
814 both = 1; 810 both = 1;
815 break; 811 break;
816 default: 812 default:
817 return -ENOIOCTLCMD; 813 return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
818 } 814 }
819 815
820 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2]))) 816 if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
836 .llseek = generic_file_llseek, 832 .llseek = generic_file_llseek,
837 .read = generic_read_dir, 833 .read = generic_read_dir,
838 .readdir = fat_readdir, 834 .readdir = fat_readdir,
839 .ioctl = fat_dir_ioctl, 835 .unlocked_ioctl = fat_dir_ioctl,
840#ifdef CONFIG_COMPAT 836#ifdef CONFIG_COMPAT
841 .compat_ioctl = fat_compat_dir_ioctl, 837 .compat_ioctl = fat_compat_dir_ioctl,
842#endif 838#endif
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
6#include <linux/nls.h> 6#include <linux/nls.h>
7#include <linux/fs.h> 7#include <linux/fs.h>
8#include <linux/mutex.h> 8#include <linux/mutex.h>
9#include <linux/ratelimit.h>
9#include <linux/msdos_fs.h> 10#include <linux/msdos_fs.h>
10 11
11/* 12/*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
82 struct fatent_operations *fatent_ops; 83 struct fatent_operations *fatent_ops;
83 struct inode *fat_inode; 84 struct inode *fat_inode;
84 85
86 struct ratelimit_state ratelimit;
87
85 spinlock_t inode_hash_lock; 88 spinlock_t inode_hash_lock;
86 struct hlist_head inode_hashtable[FAT_HASH_SIZE]; 89 struct hlist_head inode_hashtable[FAT_HASH_SIZE];
87}; 90};
@@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
298extern int fat_count_free_clusters(struct super_block *sb); 301extern int fat_count_free_clusters(struct super_block *sb);
299 302
300/* fat/file.c */ 303/* fat/file.c */
301extern int fat_generic_ioctl(struct inode *inode, struct file *filp, 304extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
302 unsigned int cmd, unsigned long arg); 305 unsigned long arg);
303extern const struct file_operations fat_file_operations; 306extern const struct file_operations fat_file_operations;
304extern const struct inode_operations fat_file_inode_operations; 307extern const struct inode_operations fat_file_inode_operations;
305extern int fat_setattr(struct dentry * dentry, struct iattr * attr); 308extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
306extern void fat_truncate(struct inode *inode); 309extern int fat_setsize(struct inode *inode, loff_t offset);
310extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
307extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, 311extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
308 struct kstat *stat); 312 struct kstat *stat);
309extern int fat_file_fsync(struct file *file, struct dentry *dentry, 313extern int fat_file_fsync(struct file *file, int datasync);
310 int datasync);
311 314
312/* fat/inode.c */ 315/* fat/inode.c */
313extern void fat_attach(struct inode *inode, loff_t i_pos); 316extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
322extern int fat_flush_inodes(struct super_block *sb, struct inode *i1, 325extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
323 struct inode *i2); 326 struct inode *i2);
324/* fat/misc.c */ 327/* fat/misc.c */
325extern void fat_fs_error(struct super_block *s, const char *fmt, ...) 328extern void
326 __attribute__ ((format (printf, 2, 3))) __cold; 329__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
330 __attribute__ ((format (printf, 3, 4))) __cold;
331#define fat_fs_error(s, fmt, args...) \
332 __fat_fs_error(s, 1, fmt , ## args)
333#define fat_fs_error_ratelimit(s, fmt, args...) \
334 __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
327extern int fat_clusters_flush(struct super_block *sb); 335extern int fat_clusters_flush(struct super_block *sb);
328extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); 336extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
329extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, 337extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/compat.h>
11#include <linux/mount.h> 12#include <linux/mount.h>
12#include <linux/time.h> 13#include <linux/time.h>
13#include <linux/buffer_head.h> 14#include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
114 return err; 115 return err;
115} 116}
116 117
117int fat_generic_ioctl(struct inode *inode, struct file *filp, 118long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
118 unsigned int cmd, unsigned long arg)
119{ 119{
120 struct inode *inode = filp->f_path.dentry->d_inode;
120 u32 __user *user_attr = (u32 __user *)arg; 121 u32 __user *user_attr = (u32 __user *)arg;
121 122
122 switch (cmd) { 123 switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
129 } 130 }
130} 131}
131 132
133#ifdef CONFIG_COMPAT
134static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
135 unsigned long arg)
136
137{
138 return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
139}
140#endif
141
132static int fat_file_release(struct inode *inode, struct file *filp) 142static int fat_file_release(struct inode *inode, struct file *filp)
133{ 143{
134 if ((filp->f_mode & FMODE_WRITE) && 144 if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
139 return 0; 149 return 0;
140} 150}
141 151
142int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
143{ 153{
144 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
145 int res, err; 155 int res, err;
146 156
147 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
148 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
149 159
150 return res ? res : err; 160 return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
159 .aio_write = generic_file_aio_write, 169 .aio_write = generic_file_aio_write,
160 .mmap = generic_file_mmap, 170 .mmap = generic_file_mmap,
161 .release = fat_file_release, 171 .release = fat_file_release,
162 .ioctl = fat_generic_ioctl, 172 .unlocked_ioctl = fat_generic_ioctl,
173#ifdef CONFIG_COMPAT
174 .compat_ioctl = fat_generic_compat_ioctl,
175#endif
163 .fsync = fat_file_fsync, 176 .fsync = fat_file_fsync,
164 .splice_read = generic_file_splice_read, 177 .splice_read = generic_file_splice_read,
165}; 178};
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
270 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
271} 284}
272 285
273void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
274{ 287{
275 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
276 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
280 * This protects against truncating a file bigger than it was then 293 * This protects against truncating a file bigger than it was then
281 * trying to write into the hole. 294 * trying to write into the hole.
282 */ 295 */
283 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
284 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
285 298
286 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
287 300
288 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
289 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
351 return 0; 364 return 0;
352} 365}
353 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
354#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
355/* valid file mode bits */ 380/* valid file mode bits */
356#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
365 /* 390 /*
366 * Expand the file. Since inode_setattr() updates ->i_size 391 * Expand the file. Since inode_setattr() updates ->i_size
367 * before calling the ->truncate(), but FAT needs to fill the 392 * before calling the ->truncate(), but FAT needs to fill the
368 * hole before it. 393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
369 */ 395 */
370 if (attr->ia_valid & ATTR_SIZE) { 396 if (attr->ia_valid & ATTR_SIZE) {
371 if (attr->ia_size > inode->i_size) { 397 if (attr->ia_size > inode->i_size) {
@@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
414 attr->ia_valid &= ~ATTR_MODE; 440 attr->ia_valid &= ~ATTR_MODE;
415 } 441 }
416 442
417 if (attr->ia_valid) 443 if (attr->ia_valid & ATTR_SIZE) {
418 error = inode_setattr(inode, attr); 444 error = fat_setsize(inode, attr->ia_size);
445 if (error)
446 goto out;
447 }
448
449 generic_setattr(inode, attr);
450 mark_inode_dirty(inode);
419out: 451out:
420 return error; 452 return error;
421} 453}
422EXPORT_SYMBOL_GPL(fat_setattr); 454EXPORT_SYMBOL_GPL(fat_setattr);
423 455
424const struct inode_operations fat_file_inode_operations = { 456const struct inode_operations fat_file_inode_operations = {
425 .truncate = fat_truncate,
426 .setattr = fat_setattr, 457 .setattr = fat_setattr,
427 .getattr = fat_getattr, 458 .getattr = fat_getattr,
428}; 459};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
193 * FAT need to use the DIO_LOCKING for avoiding the race 212 * FAT need to use the DIO_LOCKING for avoiding the race
194 * condition of fat_get_block() and ->truncate(). 213 * condition of fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 inode->i_size = 0;
432 fat_truncate(inode); 455 fat_truncate_blocks(inode, 0);
433 clear_inode(inode); 456 clear_inode(inode);
434} 457}
435 458
@@ -1250,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
1250 sb->s_op = &fat_sops; 1273 sb->s_op = &fat_sops;
1251 sb->s_export_op = &fat_export_ops; 1274 sb->s_export_op = &fat_export_ops;
1252 sbi->dir_ops = fs_dir_inode_ops; 1275 sbi->dir_ops = fs_dir_inode_ops;
1276 ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
1277 DEFAULT_RATELIMIT_BURST);
1253 1278
1254 error = parse_options(data, isvfat, silent, &debug, &sbi->options); 1279 error = parse_options(data, isvfat, silent, &debug, &sbi->options);
1255 if (error) 1280 if (error)
@@ -1497,10 +1522,8 @@ out_fail:
1497 iput(fat_inode); 1522 iput(fat_inode);
1498 if (root_inode) 1523 if (root_inode)
1499 iput(root_inode); 1524 iput(root_inode);
1500 if (sbi->nls_io) 1525 unload_nls(sbi->nls_io);
1501 unload_nls(sbi->nls_io); 1526 unload_nls(sbi->nls_disk);
1502 if (sbi->nls_disk)
1503 unload_nls(sbi->nls_disk);
1504 if (sbi->options.iocharset != fat_default_iocharset) 1527 if (sbi->options.iocharset != fat_default_iocharset)
1505 kfree(sbi->options.iocharset); 1528 kfree(sbi->options.iocharset);
1506 sb->s_fs_info = NULL; 1529 sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
20 * In case the file system is remounted read-only, it can be made writable 20 * In case the file system is remounted read-only, it can be made writable
21 * again by remounting it. 21 * again by remounting it.
22 */ 22 */
23void fat_fs_error(struct super_block *s, const char *fmt, ...) 23void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
24{ 24{
25 struct fat_mount_options *opts = &MSDOS_SB(s)->options; 25 struct fat_mount_options *opts = &MSDOS_SB(s)->options;
26 va_list args; 26 va_list args;
27 27
28 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id); 28 if (report) {
29 printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
29 30
30 printk(KERN_ERR " "); 31 printk(KERN_ERR " ");
31 va_start(args, fmt); 32 va_start(args, fmt);
32 vprintk(fmt, args); 33 vprintk(fmt, args);
33 va_end(args); 34 va_end(args);
34 printk("\n"); 35 printk("\n");
36 }
35 37
36 if (opts->errors == FAT_ERRORS_PANIC) 38 if (opts->errors == FAT_ERRORS_PANIC)
37 panic(" FAT fs panic from previous error\n"); 39 panic("FAT: fs panic from previous error\n");
38 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) { 40 else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
39 s->s_flags |= MS_RDONLY; 41 s->s_flags |= MS_RDONLY;
40 printk(KERN_ERR " File system has been set read-only\n"); 42 printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
41 } 43 }
42} 44}
43EXPORT_SYMBOL_GPL(fat_fs_error); 45EXPORT_SYMBOL_GPL(__fat_fs_error);
44 46
45/* Flushes the number of free clusters on FAT32 */ 47/* Flushes the number of free clusters on FAT32 */
46/* XXX: Need to write one per FSINFO block. Currently only writes 1 */ 48/* XXX: Need to write one per FSINFO block. Currently only writes 1 */
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 0a140741b39e..51e11bf5708f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -14,6 +14,7 @@
14#include <linux/dnotify.h> 14#include <linux/dnotify.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/pipe_fs_i.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/ptrace.h> 19#include <linux/ptrace.h>
19#include <linux/signal.h> 20#include <linux/signal.h>
@@ -273,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
273 274
274 ret = copy_from_user(&owner, owner_p, sizeof(owner)); 275 ret = copy_from_user(&owner, owner_p, sizeof(owner));
275 if (ret) 276 if (ret)
276 return ret; 277 return -EFAULT;
277 278
278 switch (owner.type) { 279 switch (owner.type) {
279 case F_OWNER_TID: 280 case F_OWNER_TID:
@@ -331,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
331 } 332 }
332 read_unlock(&filp->f_owner.lock); 333 read_unlock(&filp->f_owner.lock);
333 334
334 if (!ret) 335 if (!ret) {
335 ret = copy_to_user(owner_p, &owner, sizeof(owner)); 336 ret = copy_to_user(owner_p, &owner, sizeof(owner));
337 if (ret)
338 ret = -EFAULT;
339 }
336 return ret; 340 return ret;
337} 341}
338 342
@@ -412,6 +416,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
412 case F_NOTIFY: 416 case F_NOTIFY:
413 err = fcntl_dirnotify(fd, filp, arg); 417 err = fcntl_dirnotify(fd, filp, arg);
414 break; 418 break;
419 case F_SETPIPE_SZ:
420 case F_GETPIPE_SZ:
421 err = pipe_fcntl(filp, cmd, arg);
422 break;
415 default: 423 default:
416 break; 424 break;
417 } 425 }
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 194}
195EXPORT_SYMBOL(alloc_file); 195EXPORT_SYMBOL(alloc_file);
196 196
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 197/**
206 * drop_file_write_access - give up ability to write to a file 198 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 199 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
227} 219}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 220EXPORT_SYMBOL_GPL(drop_file_write_access);
229 221
230/* __fput is called from task context when aio completion releases the last 222/* the real guts of fput() - releasing the last reference to file
231 * last use of a struct file *. Do not use otherwise.
232 */ 223 */
233void __fput(struct file *file) 224static void __fput(struct file *file)
234{ 225{
235 struct dentry *dentry = file->f_path.dentry; 226 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 227 struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
268 mntput(mnt); 259 mntput(mnt);
269} 260}
270 261
262void fput(struct file *file)
263{
264 if (atomic_long_dec_and_test(&file->f_count))
265 __fput(file);
266}
267
268EXPORT_SYMBOL(fput);
269
271struct file *fget(unsigned int fd) 270struct file *fget(unsigned int fd)
272{ 271{
273 struct file *file; 272 struct file *file;
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 4b37f7cea4dd..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -42,9 +42,9 @@ struct wb_writeback_args {
42 long nr_pages; 42 long nr_pages;
43 struct super_block *sb; 43 struct super_block *sb;
44 enum writeback_sync_modes sync_mode; 44 enum writeback_sync_modes sync_mode;
45 int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 int for_background:1; 47 unsigned int for_background:1;
48}; 48};
49 49
50/* 50/*
@@ -398,11 +398,11 @@ static void inode_wait_for_writeback(struct inode *inode)
398 wait_queue_head_t *wqh; 398 wait_queue_head_t *wqh;
399 399
400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC); 400 wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
401 do { 401 while (inode->i_state & I_SYNC) {
402 spin_unlock(&inode_lock); 402 spin_unlock(&inode_lock);
403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE); 403 __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
404 spin_lock(&inode_lock); 404 spin_lock(&inode_lock);
405 } while (inode->i_state & I_SYNC); 405 }
406} 406}
407 407
408/* 408/*
@@ -452,11 +452,9 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
452 452
453 BUG_ON(inode->i_state & I_SYNC); 453 BUG_ON(inode->i_state & I_SYNC);
454 454
455 /* Set I_SYNC, reset I_DIRTY */ 455 /* Set I_SYNC, reset I_DIRTY_PAGES */
456 dirty = inode->i_state & I_DIRTY;
457 inode->i_state |= I_SYNC; 456 inode->i_state |= I_SYNC;
458 inode->i_state &= ~I_DIRTY; 457 inode->i_state &= ~I_DIRTY_PAGES;
459
460 spin_unlock(&inode_lock); 458 spin_unlock(&inode_lock);
461 459
462 ret = do_writepages(mapping, wbc); 460 ret = do_writepages(mapping, wbc);
@@ -472,6 +470,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
472 ret = err; 470 ret = err;
473 } 471 }
474 472
473 /*
474 * Some filesystems may redirty the inode during the writeback
475 * due to delalloc, clear dirty metadata flags right before
476 * write_inode()
477 */
478 spin_lock(&inode_lock);
479 dirty = inode->i_state & I_DIRTY;
480 inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
481 spin_unlock(&inode_lock);
475 /* Don't write the inode if only I_DIRTY_PAGES was set */ 482 /* Don't write the inode if only I_DIRTY_PAGES was set */
476 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 483 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
477 int err = write_inode(inode, wbc); 484 int err = write_inode(inode, wbc);
@@ -852,6 +859,12 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
852 unsigned long expired; 859 unsigned long expired;
853 long nr_pages; 860 long nr_pages;
854 861
862 /*
863 * When set to zero, disable periodic writeback
864 */
865 if (!dirty_writeback_interval)
866 return 0;
867
855 expired = wb->last_old_flush + 868 expired = wb->last_old_flush +
856 msecs_to_jiffies(dirty_writeback_interval * 10); 869 msecs_to_jiffies(dirty_writeback_interval * 10);
857 if (time_before(jiffies, expired)) 870 if (time_before(jiffies, expired))
@@ -947,8 +960,17 @@ int bdi_writeback_task(struct bdi_writeback *wb)
947 break; 960 break;
948 } 961 }
949 962
950 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); 963 if (dirty_writeback_interval) {
951 schedule_timeout_interruptible(wait_jiffies); 964 wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
965 schedule_timeout_interruptible(wait_jiffies);
966 } else {
967 set_current_state(TASK_INTERRUPTIBLE);
968 if (list_empty_careful(&wb->bdi->work_list) &&
969 !kthread_should_stop())
970 schedule();
971 __set_current_state(TASK_RUNNING);
972 }
973
952 try_to_freeze(); 974 try_to_freeze();
953 } 975 }
954 976
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
103 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
104 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
105 if (pos == 0) 105 if (pos == 0)
106 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
107 if (pos < 3) 107 if (pos < 3)
108 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
109 109
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..723b889fd219 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
710 goto superseded; 710 goto superseded;
711 } 711 }
712 712
713 if (page) { 713 radix_tree_tag_set(&cookie->stores, page->index,
714 radix_tree_tag_set(&cookie->stores, page->index, 714 FSCACHE_COOKIE_STORING_TAG);
715 FSCACHE_COOKIE_STORING_TAG); 715 radix_tree_tag_clear(&cookie->stores, page->index,
716 radix_tree_tag_clear(&cookie->stores, page->index, 716 FSCACHE_COOKIE_PENDING_TAG);
717 FSCACHE_COOKIE_PENDING_TAG);
718 }
719 717
720 spin_unlock(&cookie->stores_lock); 718 spin_unlock(&cookie->stores_lock);
721 spin_unlock(&object->lock); 719 spin_unlock(&object->lock);
722 720
723 if (page) { 721 fscache_set_op_state(&op->op, "Store");
724 fscache_set_op_state(&op->op, "Store"); 722 fscache_stat(&fscache_n_store_pages);
725 fscache_stat(&fscache_n_store_pages); 723 fscache_stat(&fscache_n_cop_write_page);
726 fscache_stat(&fscache_n_cop_write_page); 724 ret = object->cache->ops->write_page(op, page);
727 ret = object->cache->ops->write_page(op, page); 725 fscache_stat_d(&fscache_n_cop_write_page);
728 fscache_stat_d(&fscache_n_cop_write_page); 726 fscache_set_op_state(&op->op, "EndWrite");
729 fscache_set_op_state(&op->op, "EndWrite"); 727 fscache_end_page_write(object, page);
730 fscache_end_page_write(object, page); 728 if (ret < 0) {
731 if (ret < 0) { 729 fscache_set_op_state(&op->op, "Abort");
732 fscache_set_op_state(&op->op, "Abort"); 730 fscache_abort_object(object);
733 fscache_abort_object(object); 731 } else {
734 } else { 732 fscache_enqueue_operation(&op->op);
735 fscache_enqueue_operation(&op->op);
736 }
737 } 733 }
738 734
739 _leave(""); 735 _leave("");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
24MODULE_ALIAS("devname:fuse");
21 25
22static struct kmem_cache *fuse_req_cachep; 26static struct kmem_cache *fuse_req_cachep;
23 27
@@ -498,6 +502,9 @@ struct fuse_copy_state {
498 int write; 502 int write;
499 struct fuse_req *req; 503 struct fuse_req *req;
500 const struct iovec *iov; 504 const struct iovec *iov;
505 struct pipe_buffer *pipebufs;
506 struct pipe_buffer *currbuf;
507 struct pipe_inode_info *pipe;
501 unsigned long nr_segs; 508 unsigned long nr_segs;
502 unsigned long seglen; 509 unsigned long seglen;
503 unsigned long addr; 510 unsigned long addr;
@@ -505,16 +512,16 @@ struct fuse_copy_state {
505 void *mapaddr; 512 void *mapaddr;
506 void *buf; 513 void *buf;
507 unsigned len; 514 unsigned len;
515 unsigned move_pages:1;
508}; 516};
509 517
510static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, 518static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
511 int write, struct fuse_req *req, 519 int write,
512 const struct iovec *iov, unsigned long nr_segs) 520 const struct iovec *iov, unsigned long nr_segs)
513{ 521{
514 memset(cs, 0, sizeof(*cs)); 522 memset(cs, 0, sizeof(*cs));
515 cs->fc = fc; 523 cs->fc = fc;
516 cs->write = write; 524 cs->write = write;
517 cs->req = req;
518 cs->iov = iov; 525 cs->iov = iov;
519 cs->nr_segs = nr_segs; 526 cs->nr_segs = nr_segs;
520} 527}
@@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
522/* Unmap and put previous page of userspace buffer */ 529/* Unmap and put previous page of userspace buffer */
523static void fuse_copy_finish(struct fuse_copy_state *cs) 530static void fuse_copy_finish(struct fuse_copy_state *cs)
524{ 531{
525 if (cs->mapaddr) { 532 if (cs->currbuf) {
533 struct pipe_buffer *buf = cs->currbuf;
534
535 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0);
539 buf->len = PAGE_SIZE - cs->len;
540 }
541 cs->currbuf = NULL;
542 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) {
526 kunmap_atomic(cs->mapaddr, KM_USER0); 544 kunmap_atomic(cs->mapaddr, KM_USER0);
527 if (cs->write) { 545 if (cs->write) {
528 flush_dcache_page(cs->pg); 546 flush_dcache_page(cs->pg);
@@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
544 562
545 unlock_request(cs->fc, cs->req); 563 unlock_request(cs->fc, cs->req);
546 fuse_copy_finish(cs); 564 fuse_copy_finish(cs);
547 if (!cs->seglen) { 565 if (cs->pipebufs) {
548 BUG_ON(!cs->nr_segs); 566 struct pipe_buffer *buf = cs->pipebufs;
549 cs->seglen = cs->iov[0].iov_len; 567
550 cs->addr = (unsigned long) cs->iov[0].iov_base; 568 if (!cs->write) {
551 cs->iov++; 569 err = buf->ops->confirm(cs->pipe, buf);
552 cs->nr_segs--; 570 if (err)
571 return err;
572
573 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
576 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++;
579 cs->nr_segs--;
580 } else {
581 struct page *page;
582
583 if (cs->nr_segs == cs->pipe->buffers)
584 return -EIO;
585
586 page = alloc_page(GFP_HIGHUSER);
587 if (!page)
588 return -ENOMEM;
589
590 buf->page = page;
591 buf->offset = 0;
592 buf->len = 0;
593
594 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0);
596 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE;
598 cs->pipebufs++;
599 cs->nr_segs++;
600 }
601 } else {
602 if (!cs->seglen) {
603 BUG_ON(!cs->nr_segs);
604 cs->seglen = cs->iov[0].iov_len;
605 cs->addr = (unsigned long) cs->iov[0].iov_base;
606 cs->iov++;
607 cs->nr_segs--;
608 }
609 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
610 if (err < 0)
611 return err;
612 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
615 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len;
618 cs->addr += cs->len;
553 } 619 }
554 down_read(&current->mm->mmap_sem);
555 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
556 &cs->pg, NULL);
557 up_read(&current->mm->mmap_sem);
558 if (err < 0)
559 return err;
560 BUG_ON(err != 1);
561 offset = cs->addr % PAGE_SIZE;
562 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
563 cs->buf = cs->mapaddr + offset;
564 cs->len = min(PAGE_SIZE - offset, cs->seglen);
565 cs->seglen -= cs->len;
566 cs->addr += cs->len;
567 620
568 return lock_request(cs->fc, cs->req); 621 return lock_request(cs->fc, cs->req);
569} 622}
@@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
585 return ncpy; 638 return ncpy;
586} 639}
587 640
641static int fuse_check_page(struct page *page)
642{
643 if (page_mapcount(page) ||
644 page->mapping != NULL ||
645 page_count(page) != 1 ||
646 (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
647 ~(1 << PG_locked |
648 1 << PG_referenced |
649 1 << PG_uptodate |
650 1 << PG_lru |
651 1 << PG_active |
652 1 << PG_reclaim))) {
653 printk(KERN_WARNING "fuse: trying to steal weird page\n");
654 printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
655 return 1;
656 }
657 return 0;
658}
659
660static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
661{
662 int err;
663 struct page *oldpage = *pagep;
664 struct page *newpage;
665 struct pipe_buffer *buf = cs->pipebufs;
666 struct address_space *mapping;
667 pgoff_t index;
668
669 unlock_request(cs->fc, cs->req);
670 fuse_copy_finish(cs);
671
672 err = buf->ops->confirm(cs->pipe, buf);
673 if (err)
674 return err;
675
676 BUG_ON(!cs->nr_segs);
677 cs->currbuf = buf;
678 cs->len = buf->len;
679 cs->pipebufs++;
680 cs->nr_segs--;
681
682 if (cs->len != PAGE_SIZE)
683 goto out_fallback;
684
685 if (buf->ops->steal(cs->pipe, buf) != 0)
686 goto out_fallback;
687
688 newpage = buf->page;
689
690 if (WARN_ON(!PageUptodate(newpage)))
691 return -EIO;
692
693 ClearPageMappedToDisk(newpage);
694
695 if (fuse_check_page(newpage) != 0)
696 goto out_fallback_unlock;
697
698 mapping = oldpage->mapping;
699 index = oldpage->index;
700
701 /*
702 * This is a new and locked page, it shouldn't be mapped or
703 * have any special flags on it
704 */
705 if (WARN_ON(page_mapped(oldpage)))
706 goto out_fallback_unlock;
707 if (WARN_ON(page_has_private(oldpage)))
708 goto out_fallback_unlock;
709 if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
710 goto out_fallback_unlock;
711 if (WARN_ON(PageMlocked(oldpage)))
712 goto out_fallback_unlock;
713
714 remove_from_page_cache(oldpage);
715 page_cache_release(oldpage);
716
717 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
718 if (err) {
719 printk(KERN_WARNING "fuse_try_move_page: failed to add page");
720 goto out_fallback_unlock;
721 }
722 page_cache_get(newpage);
723
724 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
725 lru_cache_add_file(newpage);
726
727 err = 0;
728 spin_lock(&cs->fc->lock);
729 if (cs->req->aborted)
730 err = -ENOENT;
731 else
732 *pagep = newpage;
733 spin_unlock(&cs->fc->lock);
734
735 if (err) {
736 unlock_page(newpage);
737 page_cache_release(newpage);
738 return err;
739 }
740
741 unlock_page(oldpage);
742 page_cache_release(oldpage);
743 cs->len = 0;
744
745 return 0;
746
747out_fallback_unlock:
748 unlock_page(newpage);
749out_fallback:
750 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
751 cs->buf = cs->mapaddr + buf->offset;
752
753 err = lock_request(cs->fc, cs->req);
754 if (err)
755 return err;
756
757 return 1;
758}
759
760static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
761 unsigned offset, unsigned count)
762{
763 struct pipe_buffer *buf;
764
765 if (cs->nr_segs == cs->pipe->buffers)
766 return -EIO;
767
768 unlock_request(cs->fc, cs->req);
769 fuse_copy_finish(cs);
770
771 buf = cs->pipebufs;
772 page_cache_get(page);
773 buf->page = page;
774 buf->offset = offset;
775 buf->len = count;
776
777 cs->pipebufs++;
778 cs->nr_segs++;
779 cs->len = 0;
780
781 return 0;
782}
783
588/* 784/*
589 * Copy a page in the request to/from the userspace buffer. Must be 785 * Copy a page in the request to/from the userspace buffer. Must be
590 * done atomically 786 * done atomically
591 */ 787 */
592static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 788static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
593 unsigned offset, unsigned count, int zeroing) 789 unsigned offset, unsigned count, int zeroing)
594{ 790{
791 int err;
792 struct page *page = *pagep;
793
595 if (page && zeroing && count < PAGE_SIZE) { 794 if (page && zeroing && count < PAGE_SIZE) {
596 void *mapaddr = kmap_atomic(page, KM_USER1); 795 void *mapaddr = kmap_atomic(page, KM_USER1);
597 memset(mapaddr, 0, PAGE_SIZE); 796 memset(mapaddr, 0, PAGE_SIZE);
598 kunmap_atomic(mapaddr, KM_USER1); 797 kunmap_atomic(mapaddr, KM_USER1);
599 } 798 }
600 while (count) { 799 while (count) {
601 if (!cs->len) { 800 if (cs->write && cs->pipebufs && page) {
602 int err = fuse_copy_fill(cs); 801 return fuse_ref_page(cs, page, offset, count);
603 if (err) 802 } else if (!cs->len) {
604 return err; 803 if (cs->move_pages && page &&
804 offset == 0 && count == PAGE_SIZE) {
805 err = fuse_try_move_page(cs, pagep);
806 if (err <= 0)
807 return err;
808 } else {
809 err = fuse_copy_fill(cs);
810 if (err)
811 return err;
812 }
605 } 813 }
606 if (page) { 814 if (page) {
607 void *mapaddr = kmap_atomic(page, KM_USER1); 815 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
626 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); 834 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
627 835
628 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 836 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
629 struct page *page = req->pages[i]; 837 int err;
630 int err = fuse_copy_page(cs, page, offset, count, zeroing); 838
839 err = fuse_copy_page(cs, &req->pages[i], offset, count,
840 zeroing);
631 if (err) 841 if (err)
632 return err; 842 return err;
633 843
@@ -704,11 +914,10 @@ __acquires(&fc->lock)
704 * 914 *
705 * Called with fc->lock held, releases it 915 * Called with fc->lock held, releases it
706 */ 916 */
707static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
708 const struct iovec *iov, unsigned long nr_segs) 918 size_t nbytes, struct fuse_req *req)
709__releases(&fc->lock) 919__releases(&fc->lock)
710{ 920{
711 struct fuse_copy_state cs;
712 struct fuse_in_header ih; 921 struct fuse_in_header ih;
713 struct fuse_interrupt_in arg; 922 struct fuse_interrupt_in arg;
714 unsigned reqsize = sizeof(ih) + sizeof(arg); 923 unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +933,13 @@ __releases(&fc->lock)
724 arg.unique = req->in.h.unique; 933 arg.unique = req->in.h.unique;
725 934
726 spin_unlock(&fc->lock); 935 spin_unlock(&fc->lock);
727 if (iov_length(iov, nr_segs) < reqsize) 936 if (nbytes < reqsize)
728 return -EINVAL; 937 return -EINVAL;
729 938
730 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); 939 err = fuse_copy_one(cs, &ih, sizeof(ih));
731 err = fuse_copy_one(&cs, &ih, sizeof(ih));
732 if (!err) 940 if (!err)
733 err = fuse_copy_one(&cs, &arg, sizeof(arg)); 941 err = fuse_copy_one(cs, &arg, sizeof(arg));
734 fuse_copy_finish(&cs); 942 fuse_copy_finish(cs);
735 943
736 return err ? err : reqsize; 944 return err ? err : reqsize;
737} 945}
@@ -745,18 +953,13 @@ __releases(&fc->lock)
745 * request_end(). Otherwise add it to the processing list, and set 953 * request_end(). Otherwise add it to the processing list, and set
746 * the 'sent' flag. 954 * the 'sent' flag.
747 */ 955 */
748static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 956static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
749 unsigned long nr_segs, loff_t pos) 957 struct fuse_copy_state *cs, size_t nbytes)
750{ 958{
751 int err; 959 int err;
752 struct fuse_req *req; 960 struct fuse_req *req;
753 struct fuse_in *in; 961 struct fuse_in *in;
754 struct fuse_copy_state cs;
755 unsigned reqsize; 962 unsigned reqsize;
756 struct file *file = iocb->ki_filp;
757 struct fuse_conn *fc = fuse_get_conn(file);
758 if (!fc)
759 return -EPERM;
760 963
761 restart: 964 restart:
762 spin_lock(&fc->lock); 965 spin_lock(&fc->lock);
@@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
776 if (!list_empty(&fc->interrupts)) { 979 if (!list_empty(&fc->interrupts)) {
777 req = list_entry(fc->interrupts.next, struct fuse_req, 980 req = list_entry(fc->interrupts.next, struct fuse_req,
778 intr_entry); 981 intr_entry);
779 return fuse_read_interrupt(fc, req, iov, nr_segs); 982 return fuse_read_interrupt(fc, cs, nbytes, req);
780 } 983 }
781 984
782 req = list_entry(fc->pending.next, struct fuse_req, list); 985 req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
786 in = &req->in; 989 in = &req->in;
787 reqsize = in->h.len; 990 reqsize = in->h.len;
788 /* If request is too large, reply with an error and restart the read */ 991 /* If request is too large, reply with an error and restart the read */
789 if (iov_length(iov, nr_segs) < reqsize) { 992 if (nbytes < reqsize) {
790 req->out.h.error = -EIO; 993 req->out.h.error = -EIO;
791 /* SETXATTR is special, since it may contain too large data */ 994 /* SETXATTR is special, since it may contain too large data */
792 if (in->h.opcode == FUSE_SETXATTR) 995 if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
795 goto restart; 998 goto restart;
796 } 999 }
797 spin_unlock(&fc->lock); 1000 spin_unlock(&fc->lock);
798 fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); 1001 cs->req = req;
799 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 1002 err = fuse_copy_one(cs, &in->h, sizeof(in->h));
800 if (!err) 1003 if (!err)
801 err = fuse_copy_args(&cs, in->numargs, in->argpages, 1004 err = fuse_copy_args(cs, in->numargs, in->argpages,
802 (struct fuse_arg *) in->args, 0); 1005 (struct fuse_arg *) in->args, 0);
803 fuse_copy_finish(&cs); 1006 fuse_copy_finish(cs);
804 spin_lock(&fc->lock); 1007 spin_lock(&fc->lock);
805 req->locked = 0; 1008 req->locked = 0;
806 if (req->aborted) { 1009 if (req->aborted) {
@@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
828 return err; 1031 return err;
829} 1032}
830 1033
1034static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1035 unsigned long nr_segs, loff_t pos)
1036{
1037 struct fuse_copy_state cs;
1038 struct file *file = iocb->ki_filp;
1039 struct fuse_conn *fc = fuse_get_conn(file);
1040 if (!fc)
1041 return -EPERM;
1042
1043 fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1044
1045 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1046}
1047
1048static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1049 struct pipe_buffer *buf)
1050{
1051 return 1;
1052}
1053
1054static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = generic_pipe_buf_release,
1060 .steal = fuse_dev_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1065 struct pipe_inode_info *pipe,
1066 size_t len, unsigned int flags)
1067{
1068 int ret;
1069 int page_nr = 0;
1070 int do_wakeup = 0;
1071 struct pipe_buffer *bufs;
1072 struct fuse_copy_state cs;
1073 struct fuse_conn *fc = fuse_get_conn(in);
1074 if (!fc)
1075 return -EPERM;
1076
1077 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1078 if (!bufs)
1079 return -ENOMEM;
1080
1081 fuse_copy_init(&cs, fc, 1, NULL, 0);
1082 cs.pipebufs = bufs;
1083 cs.pipe = pipe;
1084 ret = fuse_dev_do_read(fc, in, &cs, len);
1085 if (ret < 0)
1086 goto out;
1087
1088 ret = 0;
1089 pipe_lock(pipe);
1090
1091 if (!pipe->readers) {
1092 send_sig(SIGPIPE, current, 0);
1093 if (!ret)
1094 ret = -EPIPE;
1095 goto out_unlock;
1096 }
1097
1098 if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1099 ret = -EIO;
1100 goto out_unlock;
1101 }
1102
1103 while (page_nr < cs.nr_segs) {
1104 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1105 struct pipe_buffer *buf = pipe->bufs + newbuf;
1106
1107 buf->page = bufs[page_nr].page;
1108 buf->offset = bufs[page_nr].offset;
1109 buf->len = bufs[page_nr].len;
1110 buf->ops = &fuse_dev_pipe_buf_ops;
1111
1112 pipe->nrbufs++;
1113 page_nr++;
1114 ret += buf->len;
1115
1116 if (pipe->inode)
1117 do_wakeup = 1;
1118 }
1119
1120out_unlock:
1121 pipe_unlock(pipe);
1122
1123 if (do_wakeup) {
1124 smp_mb();
1125 if (waitqueue_active(&pipe->wait))
1126 wake_up_interruptible(&pipe->wait);
1127 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1128 }
1129
1130out:
1131 for (; page_nr < cs.nr_segs; page_nr++)
1132 page_cache_release(bufs[page_nr].page);
1133
1134 kfree(bufs);
1135 return ret;
1136}
1137
831static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, 1138static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
832 struct fuse_copy_state *cs) 1139 struct fuse_copy_state *cs)
833{ 1140{
@@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
987 * it from the list and copy the rest of the buffer to the request. 1294 * it from the list and copy the rest of the buffer to the request.
988 * The request is finished by calling request_end() 1295 * The request is finished by calling request_end()
989 */ 1296 */
990static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1297static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
991 unsigned long nr_segs, loff_t pos) 1298 struct fuse_copy_state *cs, size_t nbytes)
992{ 1299{
993 int err; 1300 int err;
994 size_t nbytes = iov_length(iov, nr_segs);
995 struct fuse_req *req; 1301 struct fuse_req *req;
996 struct fuse_out_header oh; 1302 struct fuse_out_header oh;
997 struct fuse_copy_state cs;
998 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
999 if (!fc)
1000 return -EPERM;
1001 1303
1002 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1003 if (nbytes < sizeof(struct fuse_out_header)) 1304 if (nbytes < sizeof(struct fuse_out_header))
1004 return -EINVAL; 1305 return -EINVAL;
1005 1306
1006 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1307 err = fuse_copy_one(cs, &oh, sizeof(oh));
1007 if (err) 1308 if (err)
1008 goto err_finish; 1309 goto err_finish;
1009 1310
@@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1016 * and error contains notification code. 1317 * and error contains notification code.
1017 */ 1318 */
1018 if (!oh.unique) { 1319 if (!oh.unique) {
1019 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1320 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1020 return err ? err : nbytes; 1321 return err ? err : nbytes;
1021 } 1322 }
1022 1323
@@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1035 1336
1036 if (req->aborted) { 1337 if (req->aborted) {
1037 spin_unlock(&fc->lock); 1338 spin_unlock(&fc->lock);
1038 fuse_copy_finish(&cs); 1339 fuse_copy_finish(cs);
1039 spin_lock(&fc->lock); 1340 spin_lock(&fc->lock);
1040 request_end(fc, req); 1341 request_end(fc, req);
1041 return -ENOENT; 1342 return -ENOENT;
@@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1052 queue_interrupt(fc, req); 1353 queue_interrupt(fc, req);
1053 1354
1054 spin_unlock(&fc->lock); 1355 spin_unlock(&fc->lock);
1055 fuse_copy_finish(&cs); 1356 fuse_copy_finish(cs);
1056 return nbytes; 1357 return nbytes;
1057 } 1358 }
1058 1359
@@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1060 list_move(&req->list, &fc->io); 1361 list_move(&req->list, &fc->io);
1061 req->out.h = oh; 1362 req->out.h = oh;
1062 req->locked = 1; 1363 req->locked = 1;
1063 cs.req = req; 1364 cs->req = req;
1365 if (!req->out.page_replace)
1366 cs->move_pages = 0;
1064 spin_unlock(&fc->lock); 1367 spin_unlock(&fc->lock);
1065 1368
1066 err = copy_out_args(&cs, &req->out, nbytes); 1369 err = copy_out_args(cs, &req->out, nbytes);
1067 fuse_copy_finish(&cs); 1370 fuse_copy_finish(cs);
1068 1371
1069 spin_lock(&fc->lock); 1372 spin_lock(&fc->lock);
1070 req->locked = 0; 1373 req->locked = 0;
@@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1080 err_unlock: 1383 err_unlock:
1081 spin_unlock(&fc->lock); 1384 spin_unlock(&fc->lock);
1082 err_finish: 1385 err_finish:
1083 fuse_copy_finish(&cs); 1386 fuse_copy_finish(cs);
1084 return err; 1387 return err;
1085} 1388}
1086 1389
1390static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1391 unsigned long nr_segs, loff_t pos)
1392{
1393 struct fuse_copy_state cs;
1394 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1395 if (!fc)
1396 return -EPERM;
1397
1398 fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1399
1400 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1401}
1402
1403static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1404 struct file *out, loff_t *ppos,
1405 size_t len, unsigned int flags)
1406{
1407 unsigned nbuf;
1408 unsigned idx;
1409 struct pipe_buffer *bufs;
1410 struct fuse_copy_state cs;
1411 struct fuse_conn *fc;
1412 size_t rem;
1413 ssize_t ret;
1414
1415 fc = fuse_get_conn(out);
1416 if (!fc)
1417 return -EPERM;
1418
1419 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1420 if (!bufs)
1421 return -ENOMEM;
1422
1423 pipe_lock(pipe);
1424 nbuf = 0;
1425 rem = 0;
1426 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1427 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1428
1429 ret = -EINVAL;
1430 if (rem < len) {
1431 pipe_unlock(pipe);
1432 goto out;
1433 }
1434
1435 rem = len;
1436 while (rem) {
1437 struct pipe_buffer *ibuf;
1438 struct pipe_buffer *obuf;
1439
1440 BUG_ON(nbuf >= pipe->buffers);
1441 BUG_ON(!pipe->nrbufs);
1442 ibuf = &pipe->bufs[pipe->curbuf];
1443 obuf = &bufs[nbuf];
1444
1445 if (rem >= ibuf->len) {
1446 *obuf = *ibuf;
1447 ibuf->ops = NULL;
1448 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1449 pipe->nrbufs--;
1450 } else {
1451 ibuf->ops->get(pipe, ibuf);
1452 *obuf = *ibuf;
1453 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1454 obuf->len = rem;
1455 ibuf->offset += obuf->len;
1456 ibuf->len -= obuf->len;
1457 }
1458 nbuf++;
1459 rem -= obuf->len;
1460 }
1461 pipe_unlock(pipe);
1462
1463 fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1464 cs.pipebufs = bufs;
1465 cs.pipe = pipe;
1466
1467 if (flags & SPLICE_F_MOVE)
1468 cs.move_pages = 1;
1469
1470 ret = fuse_dev_do_write(fc, &cs, len);
1471
1472 for (idx = 0; idx < nbuf; idx++) {
1473 struct pipe_buffer *buf = &bufs[idx];
1474 buf->ops->release(pipe, buf);
1475 }
1476out:
1477 kfree(bufs);
1478 return ret;
1479}
1480
1087static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1481static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1088{ 1482{
1089 unsigned mask = POLLOUT | POLLWRNORM; 1483 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
1225 .llseek = no_llseek, 1619 .llseek = no_llseek,
1226 .read = do_sync_read, 1620 .read = do_sync_read,
1227 .aio_read = fuse_dev_read, 1621 .aio_read = fuse_dev_read,
1622 .splice_read = fuse_dev_splice_read,
1228 .write = do_sync_write, 1623 .write = do_sync_write,
1229 .aio_write = fuse_dev_write, 1624 .aio_write = fuse_dev_write,
1625 .splice_write = fuse_dev_splice_write,
1230 .poll = fuse_dev_poll, 1626 .poll = fuse_dev_poll,
1231 .release = fuse_dev_release, 1627 .release = fuse_dev_release,
1232 .fasync = fuse_dev_fasync, 1628 .fasync = fuse_dev_fasync,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
1156 return 0; 1156 return 0;
1157} 1157}
1158 1158
1159static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1159static int fuse_dir_fsync(struct file *file, int datasync)
1160{ 1160{
1161 /* nfsd can call this with no file */ 1161 return fuse_fsync_common(file, datasync, 1);
1162 return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
1163} 1162}
1164 1163
1165static bool update_mtime(unsigned ivalid) 1164static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
351 fuse_release_nowrite(inode); 351 fuse_release_nowrite(inode);
352} 352}
353 353
354int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 354int fuse_fsync_common(struct file *file, int datasync, int isdir)
355 int isdir)
356{ 355{
357 struct inode *inode = de->d_inode; 356 struct inode *inode = file->f_mapping->host;
358 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
359 struct fuse_file *ff = file->private_data; 358 struct fuse_file *ff = file->private_data;
360 struct fuse_req *req; 359 struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
403 return err; 402 return err;
404} 403}
405 404
406static int fuse_fsync(struct file *file, struct dentry *de, int datasync) 405static int fuse_fsync(struct file *file, int datasync)
407{ 406{
408 return fuse_fsync_common(file, de, datasync, 0); 407 return fuse_fsync_common(file, datasync, 0);
409} 408}
410 409
411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 410void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
517 int i; 516 int i;
518 size_t count = req->misc.read.in.size; 517 size_t count = req->misc.read.in.size;
519 size_t num_read = req->out.args[0].size; 518 size_t num_read = req->out.args[0].size;
520 struct inode *inode = req->pages[0]->mapping->host; 519 struct address_space *mapping = NULL;
521 520
522 /* 521 for (i = 0; mapping == NULL && i < req->num_pages; i++)
523 * Short read means EOF. If file size is larger, truncate it 522 mapping = req->pages[i]->mapping;
524 */
525 if (!req->out.h.error && num_read < count) {
526 loff_t pos = page_offset(req->pages[0]) + num_read;
527 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
528 }
529 523
530 fuse_invalidate_attr(inode); /* atime changed */ 524 if (mapping) {
525 struct inode *inode = mapping->host;
526
527 /*
528 * Short read means EOF. If file size is larger, truncate it
529 */
530 if (!req->out.h.error && num_read < count) {
531 loff_t pos;
532
533 pos = page_offset(req->pages[0]) + num_read;
534 fuse_read_update_size(inode, pos,
535 req->misc.read.attr_ver);
536 }
537 fuse_invalidate_attr(inode); /* atime changed */
538 }
531 539
532 for (i = 0; i < req->num_pages; i++) { 540 for (i = 0; i < req->num_pages; i++) {
533 struct page *page = req->pages[i]; 541 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
536 else 544 else
537 SetPageError(page); 545 SetPageError(page);
538 unlock_page(page); 546 unlock_page(page);
547 page_cache_release(page);
539 } 548 }
540 if (req->ff) 549 if (req->ff)
541 fuse_file_put(req->ff); 550 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
550 559
551 req->out.argpages = 1; 560 req->out.argpages = 1;
552 req->out.page_zeroing = 1; 561 req->out.page_zeroing = 1;
562 req->out.page_replace = 1;
553 fuse_read_fill(req, file, pos, count, FUSE_READ); 563 fuse_read_fill(req, file, pos, count, FUSE_READ);
554 req->misc.read.attr_ver = fuse_get_attr_version(fc); 564 req->misc.read.attr_ver = fuse_get_attr_version(fc);
555 if (fc->async_read) { 565 if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
589 return PTR_ERR(req); 599 return PTR_ERR(req);
590 } 600 }
591 } 601 }
602 page_cache_get(page);
592 req->pages[req->num_pages] = page; 603 req->pages[req->num_pages] = page;
593 req->num_pages++; 604 req->num_pages++;
594 return 0; 605 return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
994 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1005 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
995 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1006 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
996 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1007 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
997 down_read(&current->mm->mmap_sem); 1008 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
998 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
999 0, req->pages, NULL);
1000 up_read(&current->mm->mmap_sem);
1001 if (npages < 0) 1009 if (npages < 0)
1002 return npages; 1010 return npages;
1003 1011
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1580 while (iov_iter_count(&ii)) { 1588 while (iov_iter_count(&ii)) {
1581 struct page *page = pages[page_idx++]; 1589 struct page *page = pages[page_idx++];
1582 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1590 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1583 void *kaddr, *map; 1591 void *kaddr;
1584 1592
1585 kaddr = map = kmap(page); 1593 kaddr = kmap(page);
1586 1594
1587 while (todo) { 1595 while (todo) {
1588 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1596 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
177 /** Zero partially or not copied pages */ 177 /** Zero partially or not copied pages */
178 unsigned page_zeroing:1; 178 unsigned page_zeroing:1;
179 179
180 /** Pages may be replaced with new ones */
181 unsigned page_replace:1;
182
180 /** Number or arguments */ 183 /** Number or arguments */
181 unsigned numargs; 184 unsigned numargs;
182 185
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
568/** 571/**
569 * Send FSYNC or FSYNCDIR request 572 * Send FSYNC or FSYNCDIR request
570 */ 573 */
571int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 574int fuse_fsync_common(struct file *file, int datasync, int isdir);
572 int isdir);
573 575
574/** 576/**
575 * Notify poll wakeup 577 * Notify poll wakeup
diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index fe5df5457656..99800e564157 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -201,7 +201,7 @@ generic_check_acl(struct inode *inode, int mask)
201 return -EAGAIN; 201 return -EAGAIN;
202} 202}
203 203
204struct xattr_handler generic_acl_access_handler = { 204const struct xattr_handler generic_acl_access_handler = {
205 .prefix = POSIX_ACL_XATTR_ACCESS, 205 .prefix = POSIX_ACL_XATTR_ACCESS,
206 .flags = ACL_TYPE_ACCESS, 206 .flags = ACL_TYPE_ACCESS,
207 .list = generic_acl_list, 207 .list = generic_acl_list,
@@ -209,7 +209,7 @@ struct xattr_handler generic_acl_access_handler = {
209 .set = generic_acl_set, 209 .set = generic_acl_set,
210}; 210};
211 211
212struct xattr_handler generic_acl_default_handler = { 212const struct xattr_handler generic_acl_default_handler = {
213 .prefix = POSIX_ACL_XATTR_DEFAULT, 213 .prefix = POSIX_ACL_XATTR_DEFAULT,
214 .flags = ACL_TYPE_DEFAULT, 214 .flags = ACL_TYPE_DEFAULT,
215 .list = generic_acl_list, 215 .list = generic_acl_list,
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 87ee309d4c24..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
236 void *buffer, size_t size, int xtype) 236 void *buffer, size_t size, int xtype)
237{ 237{
238 struct inode *inode = dentry->d_inode; 238 struct inode *inode = dentry->d_inode;
239 struct gfs2_sbd *sdp = GFS2_SB(inode);
239 struct posix_acl *acl; 240 struct posix_acl *acl;
240 int type; 241 int type;
241 int error; 242 int error;
242 243
244 if (!sdp->sd_args.ar_posix_acl)
245 return -EOPNOTSUPP;
246
243 type = gfs2_acl_type(name); 247 type = gfs2_acl_type(name);
244 if (type < 0) 248 if (type < 0)
245 return type; 249 return type;
@@ -335,7 +339,7 @@ out:
335 return error; 339 return error;
336} 340}
337 341
338struct xattr_handler gfs2_xattr_system_handler = { 342const struct xattr_handler gfs2_xattr_system_handler = {
339 .prefix = XATTR_SYSTEM_PREFIX, 343 .prefix = XATTR_SYSTEM_PREFIX,
340 .flags = GFS2_EATYPE_SYS, 344 .flags = GFS2_EATYPE_SYS,
341 .get = gfs2_xattr_system_get, 345 .get = gfs2_xattr_system_get,
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 9306a2e6620c..b522b0cb39ea 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -19,6 +19,6 @@
19extern int gfs2_check_acl(struct inode *inode, int mask); 19extern int gfs2_check_acl(struct inode *inode, int mask);
20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode); 20extern int gfs2_acl_create(struct gfs2_inode *dip, struct inode *inode);
21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr); 21extern int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr);
22extern struct xattr_handler gfs2_xattr_system_handler; 22extern const struct xattr_handler gfs2_xattr_system_handler;
23 23
24#endif /* __ACL_DOT_H__ */ 24#endif /* __ACL_DOT_H__ */
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a739a0a48067..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -700,8 +700,14 @@ out:
700 return 0; 700 return 0;
701 701
702 page_cache_release(page); 702 page_cache_release(page);
703
704 /*
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
703 if (pos + len > ip->i_inode.i_size) 709 if (pos + len > ip->i_inode.i_size)
704 vmtruncate(&ip->i_inode, ip->i_inode.i_size); 710 simple_setsize(&ip->i_inode, ip->i_inode.i_size);
705out_endtrans: 711out_endtrans:
706 gfs2_trans_end(sdp); 712 gfs2_trans_end(sdp);
707out_trans_fail: 713out_trans_fail:
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
218 if (error) 218 if (error)
219 goto out_drop_write; 219 goto out_drop_write;
220 220
221 error = -EACCES;
222 if (!is_owner_or_cap(inode))
223 goto out;
224
225 error = 0;
221 flags = ip->i_diskflags; 226 flags = ip->i_diskflags;
222 new_flags = (flags & ~mask) | (reqflags & mask); 227 new_flags = (flags & ~mask) | (reqflags & mask);
223 if ((new_flags ^ flags) == 0) 228 if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
275{ 280{
276 struct inode *inode = filp->f_path.dentry->d_inode; 281 struct inode *inode = filp->f_path.dentry->d_inode;
277 u32 fsflags, gfsflags; 282 u32 fsflags, gfsflags;
283
278 if (get_user(fsflags, ptr)) 284 if (get_user(fsflags, ptr))
279 return -EFAULT; 285 return -EFAULT;
286
280 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags); 287 gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
281 if (!S_ISDIR(inode->i_mode)) { 288 if (!S_ISDIR(inode->i_mode)) {
282 if (gfsflags & GFS2_DIF_INHERIT_JDATA) 289 if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
547 * Returns: errno 554 * Returns: errno
548 */ 555 */
549 556
550static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 557static int gfs2_fsync(struct file *file, int datasync)
551{ 558{
552 struct inode *inode = dentry->d_inode; 559 struct inode *inode = file->f_mapping->host;
553 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
554 int ret = 0; 561 int ret = 0;
555 562
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 51d8061fa07a..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -242,34 +242,38 @@ fail:
242} 242}
243 243
244/** 244/**
245 * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation 245 * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
246 * and try to reclaim it by doing iput.
247 *
248 * This function assumes no rgrp locks are currently held.
249 *
246 * @sb: The super block 250 * @sb: The super block
247 * no_addr: The inode number 251 * no_addr: The inode number
248 * @@inode: A pointer to the inode found, if any
249 * 252 *
250 * Returns: 0 and *inode if no errors occurred. If an error occurs,
251 * the resulting *inode may or may not be NULL.
252 */ 253 */
253 254
254int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, 255void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
255 struct inode **inode)
256{ 256{
257 struct gfs2_sbd *sdp; 257 struct gfs2_sbd *sdp;
258 struct gfs2_inode *ip; 258 struct gfs2_inode *ip;
259 struct gfs2_glock *io_gl; 259 struct gfs2_glock *io_gl;
260 int error; 260 int error;
261 struct gfs2_holder gh; 261 struct gfs2_holder gh;
262 struct inode *inode;
262 263
263 *inode = gfs2_iget_skip(sb, no_addr); 264 inode = gfs2_iget_skip(sb, no_addr);
264 265
265 if (!(*inode)) 266 if (!inode)
266 return -ENOBUFS; 267 return;
267 268
268 if (!((*inode)->i_state & I_NEW)) 269 /* If it's not a new inode, someone's using it, so leave it alone. */
269 return -ENOBUFS; 270 if (!(inode->i_state & I_NEW)) {
271 iput(inode);
272 return;
273 }
270 274
271 ip = GFS2_I(*inode); 275 ip = GFS2_I(inode);
272 sdp = GFS2_SB(*inode); 276 sdp = GFS2_SB(inode);
273 ip->i_no_formal_ino = -1; 277 ip->i_no_formal_ino = -1;
274 278
275 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); 279 error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
284 set_bit(GIF_INVALID, &ip->i_flags); 288 set_bit(GIF_INVALID, &ip->i_flags);
285 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT, 289 error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
286 &ip->i_iopen_gh); 290 &ip->i_iopen_gh);
287 if (unlikely(error)) { 291 if (unlikely(error))
288 if (error == GLR_TRYFAILED)
289 error = 0;
290 goto fail_iopen; 292 goto fail_iopen;
291 } 293
292 ip->i_iopen_gh.gh_gl->gl_object = ip; 294 ip->i_iopen_gh.gh_gl->gl_object = ip;
293 gfs2_glock_put(io_gl); 295 gfs2_glock_put(io_gl);
294 296
295 (*inode)->i_mode = DT2IF(DT_UNKNOWN); 297 inode->i_mode = DT2IF(DT_UNKNOWN);
296 298
297 /* 299 /*
298 * We must read the inode in order to work out its type in 300 * We must read the inode in order to work out its type in
@@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
303 */ 305 */
304 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY, 306 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
305 &gh); 307 &gh);
306 if (unlikely(error)) { 308 if (unlikely(error))
307 if (error == GLR_TRYFAILED)
308 error = 0;
309 goto fail_glock; 309 goto fail_glock;
310 } 310
311 /* Inode is now uptodate */ 311 /* Inode is now uptodate */
312 gfs2_glock_dq_uninit(&gh); 312 gfs2_glock_dq_uninit(&gh);
313 gfs2_set_iop(*inode); 313 gfs2_set_iop(inode);
314
315 /* The iput will cause it to be deleted. */
316 iput(inode);
317 return;
314 318
315 return 0;
316fail_glock: 319fail_glock:
317 gfs2_glock_dq(&ip->i_iopen_gh); 320 gfs2_glock_dq(&ip->i_iopen_gh);
318fail_iopen: 321fail_iopen:
@@ -321,7 +324,8 @@ fail_put:
321 ip->i_gl->gl_object = NULL; 324 ip->i_gl->gl_object = NULL;
322 gfs2_glock_put(ip->i_gl); 325 gfs2_glock_put(ip->i_gl);
323fail: 326fail:
324 return error; 327 iget_failed(inode);
328 return;
325} 329}
326 330
327static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) 331static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index e161461d4c57..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
84extern void gfs2_set_iop(struct inode *inode); 84extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
86 u64 no_addr, u64 no_formal_ino); 86 u64 no_addr, u64 no_formal_ino);
87extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr, 87extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
88 struct inode **inode);
89extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); 88extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
90 89
91extern int gfs2_inode_refresh(struct gfs2_inode *ip); 90extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b593f0e28f25..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
696 * 696 *
697 */ 697 */
698 698
699void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) 699void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
700{ 700{
701 struct gfs2_ail *ai; 701 struct gfs2_ail *ai;
702 702
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eb570b4ad443..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
47 sdp->sd_log_head = sdp->sd_log_tail = value; 47 sdp->sd_log_head = sdp->sd_log_tail = value;
48} 48}
49 49
50unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, 50extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
51 unsigned int ssize); 51 unsigned int ssize);
52 52
53int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); 53extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
54void gfs2_log_incr_head(struct gfs2_sbd *sdp); 54extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
55 55
56struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); 56extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
57struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, 57extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
58 struct buffer_head *real); 58 struct buffer_head *real);
59void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); 59extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
60extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
61extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
60 62
61static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl) 63extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
62{ 64extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
63 if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags)) 65extern int gfs2_logd(void *data);
64 __gfs2_log_flush(sbd, gl);
65}
66
67void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
68void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
69
70void gfs2_log_shutdown(struct gfs2_sbd *sdp);
71void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
72int gfs2_logd(void *data);
73 66
74#endif /* __LOG_DOT_H__ */ 67#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1071 return error;
1072} 1072}
1073 1073
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1074static int setattr_size(struct inode *inode, struct iattr *attr) 1077static int setattr_size(struct inode *inode, struct iattr *attr)
1075{ 1078{
1076 struct gfs2_inode *ip = GFS2_I(inode); 1079 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1081 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1082 if (error) 1085 if (error)
1083 return error; 1086 return error;
1084 error = vmtruncate(inode, attr->ia_size); 1087 error = simple_setsize(inode, attr->ia_size);
1085 gfs2_trans_end(sdp); 1088 gfs2_trans_end(sdp);
1086 if (error) 1089 if (error)
1087 return error; 1090 return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8bce73ed4d8e..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -854,7 +854,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
854 if ((start + nr_sects) != blk) { 854 if ((start + nr_sects) != blk) {
855 rv = blkdev_issue_discard(bdev, start, 855 rv = blkdev_issue_discard(bdev, start,
856 nr_sects, GFP_NOFS, 856 nr_sects, GFP_NOFS,
857 DISCARD_FL_BARRIER); 857 BLKDEV_IFL_WAIT |
858 BLKDEV_IFL_BARRIER);
858 if (rv) 859 if (rv)
859 goto fail; 860 goto fail;
860 nr_sects = 0; 861 nr_sects = 0;
@@ -869,7 +870,7 @@ start_new_extent:
869 } 870 }
870 if (nr_sects) { 871 if (nr_sects) {
871 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 872 rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
872 DISCARD_FL_BARRIER); 873 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
873 if (rv) 874 if (rv)
874 goto fail; 875 goto fail;
875 } 876 }
@@ -1191,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1191{ 1192{
1192 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1193 struct gfs2_alloc *al = ip->i_alloc; 1194 struct gfs2_alloc *al = ip->i_alloc;
1194 struct inode *inode;
1195 int error = 0; 1195 int error = 0;
1196 u64 last_unlinked = NO_BLOCK, unlinked; 1196 u64 last_unlinked = NO_BLOCK, unlinked;
1197 1197
@@ -1209,22 +1209,27 @@ try_again:
1209 if (error) 1209 if (error)
1210 return error; 1210 return error;
1211 1211
1212 /* Find an rgrp suitable for allocation. If it encounters any unlinked
1213 dinodes along the way, error will equal -EAGAIN and unlinked will
1214 contains it block address. We then need to look up that inode and
1215 try to free it, and try the allocation again. */
1212 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1213 if (error) { 1217 if (error) {
1214 if (ip != GFS2_I(sdp->sd_rindex)) 1218 if (ip != GFS2_I(sdp->sd_rindex))
1215 gfs2_glock_dq_uninit(&al->al_ri_gh); 1219 gfs2_glock_dq_uninit(&al->al_ri_gh);
1216 if (error != -EAGAIN) 1220 if (error != -EAGAIN)
1217 return error; 1221 return error;
1218 error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb, 1222
1219 unlinked, &inode); 1223 gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
1220 if (inode) 1224 /* regardless of whether or not gfs2_process_unlinked_inode
1221 iput(inode); 1225 was successful, we don't want to repeat it again. */
1226 last_unlinked = unlinked;
1222 gfs2_log_flush(sdp, NULL); 1227 gfs2_log_flush(sdp, NULL);
1223 if (error == GLR_TRYFAILED) 1228 error = 0;
1224 error = 0; 1229
1225 goto try_again; 1230 goto try_again;
1226 } 1231 }
1227 1232 /* no error, so we have the rgrp set in the inode's allocation. */
1228 al->al_file = file; 1233 al->al_file = file;
1229 al->al_line = line; 1234 al->al_line = line;
1230 1235
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 3df60f2d84e3..a0464680af0b 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -54,7 +54,7 @@ extern struct file_system_type gfs2meta_fs_type;
54extern const struct export_operations gfs2_export_ops; 54extern const struct export_operations gfs2_export_ops;
55extern const struct super_operations gfs2_super_ops; 55extern const struct super_operations gfs2_super_ops;
56extern const struct dentry_operations gfs2_dops; 56extern const struct dentry_operations gfs2_dops;
57extern struct xattr_handler *gfs2_xattr_handlers[]; 57extern const struct xattr_handler *gfs2_xattr_handlers[];
58 58
59#endif /* __SUPER_DOT_H__ */ 59#endif /* __SUPER_DOT_H__ */
60 60
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index c2ebdf2c01d4..82f93da00d1b 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1535,21 +1535,21 @@ out_alloc:
1535 return error; 1535 return error;
1536} 1536}
1537 1537
1538static struct xattr_handler gfs2_xattr_user_handler = { 1538static const struct xattr_handler gfs2_xattr_user_handler = {
1539 .prefix = XATTR_USER_PREFIX, 1539 .prefix = XATTR_USER_PREFIX,
1540 .flags = GFS2_EATYPE_USR, 1540 .flags = GFS2_EATYPE_USR,
1541 .get = gfs2_xattr_get, 1541 .get = gfs2_xattr_get,
1542 .set = gfs2_xattr_set, 1542 .set = gfs2_xattr_set,
1543}; 1543};
1544 1544
1545static struct xattr_handler gfs2_xattr_security_handler = { 1545static const struct xattr_handler gfs2_xattr_security_handler = {
1546 .prefix = XATTR_SECURITY_PREFIX, 1546 .prefix = XATTR_SECURITY_PREFIX,
1547 .flags = GFS2_EATYPE_SECURITY, 1547 .flags = GFS2_EATYPE_SECURITY,
1548 .get = gfs2_xattr_get, 1548 .get = gfs2_xattr_get,
1549 .set = gfs2_xattr_set, 1549 .set = gfs2_xattr_set,
1550}; 1550};
1551 1551
1552struct xattr_handler *gfs2_xattr_handlers[] = { 1552const struct xattr_handler *gfs2_xattr_handlers[] = {
1553 &gfs2_xattr_user_handler, 1553 &gfs2_xattr_user_handler,
1554 &gfs2_xattr_security_handler, 1554 &gfs2_xattr_security_handler,
1555 &gfs2_xattr_system_handler, 1555 &gfs2_xattr_system_handler,
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
494const struct file_operations hfsplus_dir_operations = { 494const struct file_operations hfsplus_dir_operations = {
495 .read = generic_read_dir, 495 .read = generic_read_dir,
496 .readdir = hfsplus_readdir, 496 .readdir = hfsplus_readdir,
497 .ioctl = hfsplus_ioctl, 497 .unlocked_ioctl = hfsplus_ioctl,
498 .llseek = generic_file_llseek, 498 .llseek = generic_file_llseek,
499 .release = hfsplus_dir_release, 499 .release = hfsplus_dir_release,
500}; 500};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
337void hfsplus_delete_inode(struct inode *); 337void hfsplus_delete_inode(struct inode *);
338 338
339/* ioctl.c */ 339/* ioctl.c */
340int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 340long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
341 unsigned long arg);
342int hfsplus_setxattr(struct dentry *dentry, const char *name, 341int hfsplus_setxattr(struct dentry *dentry, const char *name,
343 const void *value, size_t size, int flags); 342 const void *value, size_t size, int flags);
344ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, 343ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
285 .fsync = file_fsync, 285 .fsync = file_fsync,
286 .open = hfsplus_file_open, 286 .open = hfsplus_file_open,
287 .release = hfsplus_file_release, 287 .release = hfsplus_file_release,
288 .ioctl = hfsplus_ioctl, 288 .unlocked_ioctl = hfsplus_ioctl,
289}; 289};
290 290
291struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 291struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include "hfsplus_fs.h" 22#include "hfsplus_fs.h"
22 23
23int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 25{
26 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 27 unsigned int flags;
27 28
29 lock_kernel();
28 switch (cmd) { 30 switch (cmd) {
29 case HFSPLUS_IOC_EXT2_GETFLAGS: 31 case HFSPLUS_IOC_EXT2_GETFLAGS:
30 flags = 0; 32 flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
38 case HFSPLUS_IOC_EXT2_SETFLAGS: { 40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
39 int err = 0; 41 int err = 0;
40 err = mnt_want_write(filp->f_path.mnt); 42 err = mnt_want_write(filp->f_path.mnt);
41 if (err) 43 if (err) {
44 unlock_kernel();
42 return err; 45 return err;
46 }
43 47
44 if (!is_owner_or_cap(inode)) { 48 if (!is_owner_or_cap(inode)) {
45 err = -EACCES; 49 err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
85 mark_inode_dirty(inode); 89 mark_inode_dirty(inode);
86setflags_out: 90setflags_out:
87 mnt_drop_write(filp->f_path.mnt); 91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
88 return err; 93 return err;
89 } 94 }
90 default: 95 default:
96 unlock_kernel();
91 return -ENOTTY; 97 return -ENOTTY;
92 } 98 }
93} 99}
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
411 return 0; 411 return 0;
412} 412}
413 413
414int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 414int hostfs_fsync(struct file *file, int datasync)
415{ 415{
416 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 416 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
417} 417}
418 418
419static const struct file_operations hostfs_file_fops = { 419static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 587 return err;
588} 588}
589 589
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 590static int hppfs_fsync(struct file *file, int datasync)
591{ 591{
592 return 0; 592 return 0;
593} 593}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 688const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 689 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 693};
694 694
diff --git a/fs/inode.c b/fs/inode.c
index 258ec22bb298..2bee20ae3d65 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -286,11 +286,9 @@ static void init_once(void *foo)
286 */ 286 */
287void __iget(struct inode *inode) 287void __iget(struct inode *inode)
288{ 288{
289 if (atomic_read(&inode->i_count)) { 289 if (atomic_inc_return(&inode->i_count) != 1)
290 atomic_inc(&inode->i_count);
291 return; 290 return;
292 } 291
293 atomic_inc(&inode->i_count);
294 if (!(inode->i_state & (I_DIRTY|I_SYNC))) 292 if (!(inode->i_state & (I_DIRTY|I_SYNC)))
295 list_move(&inode->i_list, &inode_in_use); 293 list_move(&inode->i_list, &inode_in_use);
296 inodes_stat.nr_unused--; 294 inodes_stat.nr_unused--;
@@ -1608,3 +1606,23 @@ void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
1608 inode->i_ino); 1606 inode->i_ino);
1609} 1607}
1610EXPORT_SYMBOL(init_special_inode); 1608EXPORT_SYMBOL(init_special_inode);
1609
1610/**
1611 * Init uid,gid,mode for new inode according to posix standards
1612 * @inode: New inode
1613 * @dir: Directory inode
1614 * @mode: mode of the new inode
1615 */
1616void inode_init_owner(struct inode *inode, const struct inode *dir,
1617 mode_t mode)
1618{
1619 inode->i_uid = current_fsuid();
1620 if (dir && dir->i_mode & S_ISGID) {
1621 inode->i_gid = dir->i_gid;
1622 if (S_ISDIR(mode))
1623 mode |= S_ISGID;
1624 } else
1625 inode->i_gid = current_fsgid();
1626 inode->i_mode = mode;
1627}
1628EXPORT_SYMBOL(inode_init_owner);
diff --git a/fs/internal.h b/fs/internal.h
index 8a03a5447bdf..6b706bc60a66 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -87,6 +87,8 @@ extern struct file *get_empty_filp(void);
87 * super.c 87 * super.c
88 */ 88 */
89extern int do_remount_sb(struct super_block *, int, void *, int); 89extern int do_remount_sb(struct super_block *, int, void *, int);
90extern void __put_super(struct super_block *sb);
91extern void put_super(struct super_block *sb);
90 92
91/* 93/*
92 * open.c 94 * open.c
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 7faefb4da939..2d140a713861 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -525,15 +525,8 @@ static int ioctl_fsfreeze(struct file *filp)
525 if (sb->s_op->freeze_fs == NULL) 525 if (sb->s_op->freeze_fs == NULL)
526 return -EOPNOTSUPP; 526 return -EOPNOTSUPP;
527 527
528 /* If a blockdevice-backed filesystem isn't specified, return. */
529 if (sb->s_bdev == NULL)
530 return -EINVAL;
531
532 /* Freeze */ 528 /* Freeze */
533 sb = freeze_bdev(sb->s_bdev); 529 return freeze_super(sb);
534 if (IS_ERR(sb))
535 return PTR_ERR(sb);
536 return 0;
537} 530}
538 531
539static int ioctl_fsthaw(struct file *filp) 532static int ioctl_fsthaw(struct file *filp)
@@ -543,12 +536,8 @@ static int ioctl_fsthaw(struct file *filp)
543 if (!capable(CAP_SYS_ADMIN)) 536 if (!capable(CAP_SYS_ADMIN))
544 return -EPERM; 537 return -EPERM;
545 538
546 /* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
547 if (sb->s_bdev == NULL)
548 return -EINVAL;
549
550 /* Thaw */ 539 /* Thaw */
551 return thaw_bdev(sb->s_bdev, sb); 540 return thaw_super(sb);
552} 541}
553 542
554/* 543/*
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 30beb11ef928..076d1cc44f95 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -530,7 +530,8 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
530 */ 530 */
531 if ((journal->j_fs_dev != journal->j_dev) && 531 if ((journal->j_fs_dev != journal->j_dev) &&
532 (journal->j_flags & JBD2_BARRIER)) 532 (journal->j_flags & JBD2_BARRIER))
533 blkdev_issue_flush(journal->j_fs_dev, NULL); 533 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
534 BLKDEV_IFL_WAIT);
534 if (!(journal->j_flags & JBD2_ABORT)) 535 if (!(journal->j_flags & JBD2_ABORT))
535 jbd2_journal_update_superblock(journal, 1); 536 jbd2_journal_update_superblock(journal, 1);
536 return 0; 537 return 0;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 671da7fb7ffd..75716d3d2be0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -717,7 +717,8 @@ start_journal_io:
717 if (commit_transaction->t_flushed_data_blocks && 717 if (commit_transaction->t_flushed_data_blocks &&
718 (journal->j_fs_dev != journal->j_dev) && 718 (journal->j_fs_dev != journal->j_dev) &&
719 (journal->j_flags & JBD2_BARRIER)) 719 (journal->j_flags & JBD2_BARRIER))
720 blkdev_issue_flush(journal->j_fs_dev, NULL); 720 blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 BLKDEV_IFL_WAIT);
721 722
722 /* Done it all: now write the commit record asynchronously. */ 723 /* Done it all: now write the commit record asynchronously. */
723 if (JBD2_HAS_INCOMPAT_FEATURE(journal, 724 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -727,7 +728,8 @@ start_journal_io:
727 if (err) 728 if (err)
728 __jbd2_journal_abort_hard(journal); 729 __jbd2_journal_abort_hard(journal);
729 if (journal->j_flags & JBD2_BARRIER) 730 if (journal->j_flags & JBD2_BARRIER)
730 blkdev_issue_flush(journal->j_dev, NULL); 731 blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 BLKDEV_IFL_WAIT);
731 } 733 }
732 734
733 err = journal_finish_inode_data_buffers(journal, commit_transaction); 735 err = journal_finish_inode_data_buffers(journal, commit_transaction);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 7cdc3196476a..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
234 if (inode->i_mode != mode) { 234 if (inode->i_mode != mode) {
235 struct iattr attr; 235 struct iattr attr;
236 236
237 attr.ia_valid = ATTR_MODE; 237 attr.ia_valid = ATTR_MODE | ATTR_CTIME;
238 attr.ia_mode = mode; 238 attr.ia_mode = mode;
239 attr.ia_ctime = CURRENT_TIME_SEC;
239 rc = jffs2_do_setattr(inode, &attr); 240 rc = jffs2_do_setattr(inode, &attr);
240 if (rc < 0) 241 if (rc < 0)
241 return rc; 242 return rc;
@@ -419,7 +420,7 @@ static int jffs2_acl_setxattr(struct dentry *dentry, const char *name,
419 return rc; 420 return rc;
420} 421}
421 422
422struct xattr_handler jffs2_acl_access_xattr_handler = { 423const struct xattr_handler jffs2_acl_access_xattr_handler = {
423 .prefix = POSIX_ACL_XATTR_ACCESS, 424 .prefix = POSIX_ACL_XATTR_ACCESS,
424 .flags = ACL_TYPE_DEFAULT, 425 .flags = ACL_TYPE_DEFAULT,
425 .list = jffs2_acl_access_listxattr, 426 .list = jffs2_acl_access_listxattr,
@@ -427,7 +428,7 @@ struct xattr_handler jffs2_acl_access_xattr_handler = {
427 .set = jffs2_acl_setxattr, 428 .set = jffs2_acl_setxattr,
428}; 429};
429 430
430struct xattr_handler jffs2_acl_default_xattr_handler = { 431const struct xattr_handler jffs2_acl_default_xattr_handler = {
431 .prefix = POSIX_ACL_XATTR_DEFAULT, 432 .prefix = POSIX_ACL_XATTR_DEFAULT,
432 .flags = ACL_TYPE_DEFAULT, 433 .flags = ACL_TYPE_DEFAULT,
433 .list = jffs2_acl_default_listxattr, 434 .list = jffs2_acl_default_listxattr,
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h
index f0ba63e3c36b..5e42de8d9541 100644
--- a/fs/jffs2/acl.h
+++ b/fs/jffs2/acl.h
@@ -31,8 +31,8 @@ extern int jffs2_acl_chmod(struct inode *);
31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); 31extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *);
32extern int jffs2_init_acl_post(struct inode *); 32extern int jffs2_init_acl_post(struct inode *);
33 33
34extern struct xattr_handler jffs2_acl_access_xattr_handler; 34extern const struct xattr_handler jffs2_acl_access_xattr_handler;
35extern struct xattr_handler jffs2_acl_default_xattr_handler; 35extern const struct xattr_handler jffs2_acl_default_xattr_handler;
36 36
37#else 37#else
38 38
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..166062a68230 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
223 223
224 jffs2_free_raw_inode(ri); 224 jffs2_free_raw_inode(ri);
225 d_instantiate(dentry, inode);
226 225
227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", 226 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
228 inode->i_ino, inode->i_mode, inode->i_nlink, 227 inode->i_ino, inode->i_mode, inode->i_nlink,
229 f->inocache->pino_nlink, inode->i_mapping->nrpages)); 228 f->inocache->pino_nlink, inode->i_mapping->nrpages));
229
230 d_instantiate(dentry, inode);
231 unlock_new_inode(inode);
230 return 0; 232 return 0;
231 233
232 fail: 234 fail:
233 make_bad_inode(inode); 235 make_bad_inode(inode);
236 unlock_new_inode(inode);
234 iput(inode); 237 iput(inode);
235 jffs2_free_raw_inode(ri); 238 jffs2_free_raw_inode(ri);
236 return ret; 239 return ret;
@@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
360 /* Eeek. Wave bye bye */ 363 /* Eeek. Wave bye bye */
361 mutex_unlock(&f->sem); 364 mutex_unlock(&f->sem);
362 jffs2_complete_reservation(c); 365 jffs2_complete_reservation(c);
363 jffs2_clear_inode(inode); 366 ret = PTR_ERR(fn);
364 return PTR_ERR(fn); 367 goto fail;
365 } 368 }
366 369
367 /* We use f->target field to store the target path. */ 370 /* We use f->target field to store the target path. */
@@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
370 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 373 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
371 mutex_unlock(&f->sem); 374 mutex_unlock(&f->sem);
372 jffs2_complete_reservation(c); 375 jffs2_complete_reservation(c);
373 jffs2_clear_inode(inode); 376 ret = -ENOMEM;
374 return -ENOMEM; 377 goto fail;
375 } 378 }
376 379
377 memcpy(f->target, target, targetlen + 1); 380 memcpy(f->target, target, targetlen + 1);
@@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 jffs2_complete_reservation(c); 389 jffs2_complete_reservation(c);
387 390
388 ret = jffs2_init_security(inode, dir_i); 391 ret = jffs2_init_security(inode, dir_i);
389 if (ret) { 392 if (ret)
390 jffs2_clear_inode(inode); 393 goto fail;
391 return ret; 394
392 }
393 ret = jffs2_init_acl_post(inode); 395 ret = jffs2_init_acl_post(inode);
394 if (ret) { 396 if (ret)
395 jffs2_clear_inode(inode); 397 goto fail;
396 return ret;
397 }
398 398
399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
401 if (ret) { 401 if (ret)
402 /* Eep. */ 402 goto fail;
403 jffs2_clear_inode(inode);
404 return ret;
405 }
406 403
407 rd = jffs2_alloc_raw_dirent(); 404 rd = jffs2_alloc_raw_dirent();
408 if (!rd) { 405 if (!rd) {
409 /* Argh. Now we treat it like a normal delete */ 406 /* Argh. Now we treat it like a normal delete */
410 jffs2_complete_reservation(c); 407 jffs2_complete_reservation(c);
411 jffs2_clear_inode(inode); 408 ret = -ENOMEM;
412 return -ENOMEM; 409 goto fail;
413 } 410 }
414 411
415 dir_f = JFFS2_INODE_INFO(dir_i); 412 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
437 jffs2_complete_reservation(c); 434 jffs2_complete_reservation(c);
438 jffs2_free_raw_dirent(rd); 435 jffs2_free_raw_dirent(rd);
439 mutex_unlock(&dir_f->sem); 436 mutex_unlock(&dir_f->sem);
440 jffs2_clear_inode(inode); 437 ret = PTR_ERR(fd);
441 return PTR_ERR(fd); 438 goto fail;
442 } 439 }
443 440
444 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 441 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
453 jffs2_complete_reservation(c); 450 jffs2_complete_reservation(c);
454 451
455 d_instantiate(dentry, inode); 452 d_instantiate(dentry, inode);
453 unlock_new_inode(inode);
456 return 0; 454 return 0;
455
456 fail:
457 make_bad_inode(inode);
458 unlock_new_inode(inode);
459 iput(inode);
460 return ret;
457} 461}
458 462
459 463
@@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
519 /* Eeek. Wave bye bye */ 523 /* Eeek. Wave bye bye */
520 mutex_unlock(&f->sem); 524 mutex_unlock(&f->sem);
521 jffs2_complete_reservation(c); 525 jffs2_complete_reservation(c);
522 jffs2_clear_inode(inode); 526 ret = PTR_ERR(fn);
523 return PTR_ERR(fn); 527 goto fail;
524 } 528 }
525 /* No data here. Only a metadata node, which will be 529 /* No data here. Only a metadata node, which will be
526 obsoleted by the first data write 530 obsoleted by the first data write
@@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
531 jffs2_complete_reservation(c); 535 jffs2_complete_reservation(c);
532 536
533 ret = jffs2_init_security(inode, dir_i); 537 ret = jffs2_init_security(inode, dir_i);
534 if (ret) { 538 if (ret)
535 jffs2_clear_inode(inode); 539 goto fail;
536 return ret; 540
537 }
538 ret = jffs2_init_acl_post(inode); 541 ret = jffs2_init_acl_post(inode);
539 if (ret) { 542 if (ret)
540 jffs2_clear_inode(inode); 543 goto fail;
541 return ret;
542 }
543 544
544 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 545 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
545 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 546 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
546 if (ret) { 547 if (ret)
547 /* Eep. */ 548 goto fail;
548 jffs2_clear_inode(inode);
549 return ret;
550 }
551 549
552 rd = jffs2_alloc_raw_dirent(); 550 rd = jffs2_alloc_raw_dirent();
553 if (!rd) { 551 if (!rd) {
554 /* Argh. Now we treat it like a normal delete */ 552 /* Argh. Now we treat it like a normal delete */
555 jffs2_complete_reservation(c); 553 jffs2_complete_reservation(c);
556 jffs2_clear_inode(inode); 554 ret = -ENOMEM;
557 return -ENOMEM; 555 goto fail;
558 } 556 }
559 557
560 dir_f = JFFS2_INODE_INFO(dir_i); 558 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
582 jffs2_complete_reservation(c); 580 jffs2_complete_reservation(c);
583 jffs2_free_raw_dirent(rd); 581 jffs2_free_raw_dirent(rd);
584 mutex_unlock(&dir_f->sem); 582 mutex_unlock(&dir_f->sem);
585 jffs2_clear_inode(inode); 583 ret = PTR_ERR(fd);
586 return PTR_ERR(fd); 584 goto fail;
587 } 585 }
588 586
589 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 587 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
599 jffs2_complete_reservation(c); 597 jffs2_complete_reservation(c);
600 598
601 d_instantiate(dentry, inode); 599 d_instantiate(dentry, inode);
600 unlock_new_inode(inode);
602 return 0; 601 return 0;
602
603 fail:
604 make_bad_inode(inode);
605 unlock_new_inode(inode);
606 iput(inode);
607 return ret;
603} 608}
604 609
605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) 610static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
693 /* Eeek. Wave bye bye */ 698 /* Eeek. Wave bye bye */
694 mutex_unlock(&f->sem); 699 mutex_unlock(&f->sem);
695 jffs2_complete_reservation(c); 700 jffs2_complete_reservation(c);
696 jffs2_clear_inode(inode); 701 ret = PTR_ERR(fn);
697 return PTR_ERR(fn); 702 goto fail;
698 } 703 }
699 /* No data here. Only a metadata node, which will be 704 /* No data here. Only a metadata node, which will be
700 obsoleted by the first data write 705 obsoleted by the first data write
@@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
705 jffs2_complete_reservation(c); 710 jffs2_complete_reservation(c);
706 711
707 ret = jffs2_init_security(inode, dir_i); 712 ret = jffs2_init_security(inode, dir_i);
708 if (ret) { 713 if (ret)
709 jffs2_clear_inode(inode); 714 goto fail;
710 return ret; 715
711 }
712 ret = jffs2_init_acl_post(inode); 716 ret = jffs2_init_acl_post(inode);
713 if (ret) { 717 if (ret)
714 jffs2_clear_inode(inode); 718 goto fail;
715 return ret;
716 }
717 719
718 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 720 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
719 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 721 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
720 if (ret) { 722 if (ret)
721 /* Eep. */ 723 goto fail;
722 jffs2_clear_inode(inode);
723 return ret;
724 }
725 724
726 rd = jffs2_alloc_raw_dirent(); 725 rd = jffs2_alloc_raw_dirent();
727 if (!rd) { 726 if (!rd) {
728 /* Argh. Now we treat it like a normal delete */ 727 /* Argh. Now we treat it like a normal delete */
729 jffs2_complete_reservation(c); 728 jffs2_complete_reservation(c);
730 jffs2_clear_inode(inode); 729 ret = -ENOMEM;
731 return -ENOMEM; 730 goto fail;
732 } 731 }
733 732
734 dir_f = JFFS2_INODE_INFO(dir_i); 733 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
759 jffs2_complete_reservation(c); 758 jffs2_complete_reservation(c);
760 jffs2_free_raw_dirent(rd); 759 jffs2_free_raw_dirent(rd);
761 mutex_unlock(&dir_f->sem); 760 mutex_unlock(&dir_f->sem);
762 jffs2_clear_inode(inode); 761 ret = PTR_ERR(fd);
763 return PTR_ERR(fd); 762 goto fail;
764 } 763 }
765 764
766 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 765 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
775 jffs2_complete_reservation(c); 774 jffs2_complete_reservation(c);
776 775
777 d_instantiate(dentry, inode); 776 d_instantiate(dentry, inode);
778 777 unlock_new_inode(inode);
779 return 0; 778 return 0;
779
780 fail:
781 make_bad_inode(inode);
782 unlock_new_inode(inode);
783 iput(inode);
784 return ret;
780} 785}
781 786
782static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, 787static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
26 struct page **pagep, void **fsdata); 26 struct page **pagep, void **fsdata);
27static int jffs2_readpage (struct file *filp, struct page *pg); 27static int jffs2_readpage (struct file *filp, struct page *pg);
28 28
29int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 29int jffs2_fsync(struct file *filp, int datasync)
30{ 30{
31 struct inode *inode = dentry->d_inode; 31 struct inode *inode = filp->f_mapping->host;
32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
33 33
34 /* Trigger GC to flush any pending writes for this inode */ 34 /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821fc989..459d39d1ea0b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the vmtruncate() without f->sem held, since 172 /* We have to do the simple_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 178 simple_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
465 inode->i_blocks = 0; 465 inode->i_blocks = 0;
466 inode->i_size = 0; 466 inode->i_size = 0;
467 467
468 insert_inode_hash(inode); 468 if (insert_inode_locked(inode) < 0) {
469 make_bad_inode(inode);
470 unlock_new_inode(inode);
471 iput(inode);
472 return ERR_PTR(-EINVAL);
473 }
469 474
470 return inode; 475 return inode;
471} 476}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 035a767f958b..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
158extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
159extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
160extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
161int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
163 163
164/* ioctl.c */ 164/* ioctl.c */
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index eaccee058583..239f51216a68 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -77,7 +77,7 @@ static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
77 return retlen; 77 return retlen;
78} 78}
79 79
80struct xattr_handler jffs2_security_xattr_handler = { 80const struct xattr_handler jffs2_security_xattr_handler = {
81 .prefix = XATTR_SECURITY_PREFIX, 81 .prefix = XATTR_SECURITY_PREFIX,
82 .list = jffs2_security_listxattr, 82 .list = jffs2_security_listxattr,
83 .set = jffs2_security_setxattr, 83 .set = jffs2_security_setxattr,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 9e75c62c85d6..a2d58c96f1b4 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -904,7 +904,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) 904 * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags)
905 * is an implementation of setxattr handler on jffs2. 905 * is an implementation of setxattr handler on jffs2.
906 * -------------------------------------------------- */ 906 * -------------------------------------------------- */
907struct xattr_handler *jffs2_xattr_handlers[] = { 907const struct xattr_handler *jffs2_xattr_handlers[] = {
908 &jffs2_user_xattr_handler, 908 &jffs2_user_xattr_handler,
909#ifdef CONFIG_JFFS2_FS_SECURITY 909#ifdef CONFIG_JFFS2_FS_SECURITY
910 &jffs2_security_xattr_handler, 910 &jffs2_security_xattr_handler,
@@ -917,8 +917,8 @@ struct xattr_handler *jffs2_xattr_handlers[] = {
917 NULL 917 NULL
918}; 918};
919 919
920static struct xattr_handler *xprefix_to_handler(int xprefix) { 920static const struct xattr_handler *xprefix_to_handler(int xprefix) {
921 struct xattr_handler *ret; 921 const struct xattr_handler *ret;
922 922
923 switch (xprefix) { 923 switch (xprefix) {
924 case JFFS2_XPREFIX_USER: 924 case JFFS2_XPREFIX_USER:
@@ -955,7 +955,7 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
955 struct jffs2_inode_cache *ic = f->inocache; 955 struct jffs2_inode_cache *ic = f->inocache;
956 struct jffs2_xattr_ref *ref, **pref; 956 struct jffs2_xattr_ref *ref, **pref;
957 struct jffs2_xattr_datum *xd; 957 struct jffs2_xattr_datum *xd;
958 struct xattr_handler *xhandle; 958 const struct xattr_handler *xhandle;
959 ssize_t len, rc; 959 ssize_t len, rc;
960 int retry = 0; 960 int retry = 0;
961 961
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 6e3b5ddfb7ab..cf4f5759b42b 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -93,9 +93,9 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname
93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, 93extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname,
94 const char *buffer, size_t size, int flags); 94 const char *buffer, size_t size, int flags);
95 95
96extern struct xattr_handler *jffs2_xattr_handlers[]; 96extern const struct xattr_handler *jffs2_xattr_handlers[];
97extern struct xattr_handler jffs2_user_xattr_handler; 97extern const struct xattr_handler jffs2_user_xattr_handler;
98extern struct xattr_handler jffs2_trusted_xattr_handler; 98extern const struct xattr_handler jffs2_trusted_xattr_handler;
99 99
100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t); 100extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
101#define jffs2_getxattr generic_getxattr 101#define jffs2_getxattr generic_getxattr
@@ -122,7 +122,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
122 122
123#ifdef CONFIG_JFFS2_FS_SECURITY 123#ifdef CONFIG_JFFS2_FS_SECURITY
124extern int jffs2_init_security(struct inode *inode, struct inode *dir); 124extern int jffs2_init_security(struct inode *inode, struct inode *dir);
125extern struct xattr_handler jffs2_security_xattr_handler; 125extern const struct xattr_handler jffs2_security_xattr_handler;
126#else 126#else
127#define jffs2_init_security(inode,dir) (0) 127#define jffs2_init_security(inode,dir) (0)
128#endif /* CONFIG_JFFS2_FS_SECURITY */ 128#endif /* CONFIG_JFFS2_FS_SECURITY */
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 3e5a5e356e05..1c868194c504 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -47,7 +47,7 @@ static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_trusted_xattr_handler = { 50const struct xattr_handler jffs2_trusted_xattr_handler = {
51 .prefix = XATTR_TRUSTED_PREFIX, 51 .prefix = XATTR_TRUSTED_PREFIX,
52 .list = jffs2_trusted_listxattr, 52 .list = jffs2_trusted_listxattr,
53 .set = jffs2_trusted_setxattr, 53 .set = jffs2_trusted_setxattr,
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index 8544af67dffe..916b5c966039 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -47,7 +47,7 @@ static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
47 return retlen; 47 return retlen;
48} 48}
49 49
50struct xattr_handler jffs2_user_xattr_handler = { 50const struct xattr_handler jffs2_user_xattr_handler = {
51 .prefix = XATTR_USER_PREFIX, 51 .prefix = XATTR_USER_PREFIX,
52 .list = jffs2_user_listxattr, 52 .list = jffs2_user_listxattr,
53 .set = jffs2_user_setxattr, 53 .set = jffs2_user_setxattr,
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 85d9ec659225..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
27#include "jfs_acl.h" 27#include "jfs_acl.h"
28#include "jfs_debug.h" 28#include "jfs_debug.h"
29 29
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 30int jfs_fsync(struct file *file, int datasync)
31{ 31{
32 struct inode *inode = dentry->d_inode; 32 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 33 int rc = 0;
34 34
35 if (!(inode->i_state & I_DIRTY) || 35 if (!(inode->i_state & I_DIRTY) ||
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 829921b67765..2686531e235a 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -98,14 +98,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
98 goto fail_unlock; 98 goto fail_unlock;
99 } 99 }
100 100
101 inode->i_uid = current_fsuid(); 101 inode_init_owner(inode, parent, mode);
102 if (parent->i_mode & S_ISGID) {
103 inode->i_gid = parent->i_gid;
104 if (S_ISDIR(mode))
105 mode |= S_ISGID;
106 } else
107 inode->i_gid = current_fsgid();
108
109 /* 102 /*
110 * New inodes need to save sane values on disk when 103 * New inodes need to save sane values on disk when
111 * uid & gid mount options are used 104 * uid & gid mount options are used
@@ -121,7 +114,6 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
121 if (rc) 114 if (rc)
122 goto fail_drop; 115 goto fail_drop;
123 116
124 inode->i_mode = mode;
125 /* inherit flags from parent */ 117 /* inherit flags from parent */
126 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; 118 jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT;
127 119
@@ -134,7 +126,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
134 if (S_ISLNK(mode)) 126 if (S_ISLNK(mode))
135 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); 127 jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL);
136 } 128 }
137 jfs_inode->mode2 |= mode; 129 jfs_inode->mode2 |= inode->i_mode;
138 130
139 inode->i_blocks = 0; 131 inode->i_blocks = 0;
140 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 132 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
179 179
180 jfs_info("In jfs_put_super"); 180 jfs_info("In jfs_put_super");
181 181
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183
182 lock_kernel(); 184 lock_kernel();
183 185
184 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
@@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
396 398
397 JFS_SBI(sb)->flag = flag; 399 JFS_SBI(sb)->flag = flag;
398 ret = jfs_mount_rw(sb, 1); 400 ret = jfs_mount_rw(sb, 1);
401
402 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY;
404
399 unlock_kernel(); 405 unlock_kernel();
406 dquot_resume(sb, -1);
400 return ret; 407 return ret;
401 } 408 }
402 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1);
411 if (rc < 0) {
412 unlock_kernel();
413 return rc;
414 }
403 rc = jfs_umount_rw(sb); 415 rc = jfs_umount_rw(sb);
404 JFS_SBI(sb)->flag = flag; 416 JFS_SBI(sb)->flag = flag;
405 unlock_kernel(); 417 unlock_kernel();
@@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
469 */ 481 */
470 sb->s_op = &jfs_super_operations; 482 sb->s_op = &jfs_super_operations;
471 sb->s_export_op = &jfs_export_operations; 483 sb->s_export_op = &jfs_export_operations;
484#ifdef CONFIG_QUOTA
485 sb->dq_op = &dquot_operations;
486 sb->s_qcop = &dquot_quotactl_ops;
487#endif
472 488
473 /* 489 /*
474 * Initialize direct-mapping inode/address-space 490 * Initialize direct-mapping inode/address-space
diff --git a/fs/libfs.c b/fs/libfs.c
index 232bea425b09..dcaf972cbf1b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/quotaops.h>
11#include <linux/mutex.h> 12#include <linux/mutex.h>
12#include <linux/exportfs.h> 13#include <linux/exportfs.h>
13#include <linux/writeback.h> 14#include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
58 return NULL; 59 return NULL;
59} 60}
60 61
61int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
62{
63 return 0;
64}
65
66int dcache_dir_open(struct inode *inode, struct file *file) 62int dcache_dir_open(struct inode *inode, struct file *file)
67{ 63{
68 static struct qstr cursor_name = {.len = 1, .name = "."}; 64 static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
190 .llseek = dcache_dir_lseek, 186 .llseek = dcache_dir_lseek,
191 .read = generic_read_dir, 187 .read = generic_read_dir,
192 .readdir = dcache_readdir, 188 .readdir = dcache_readdir,
193 .fsync = simple_sync_file, 189 .fsync = noop_fsync,
194}; 190};
195 191
196const struct inode_operations simple_dir_inode_operations = { 192const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
330 return 0; 326 return 0;
331} 327}
332 328
329/**
330 * simple_setsize - handle core mm and vfs requirements for file size change
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will be typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed, this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks for example will need to free them in the case of truncate, in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry
375 * @iattr: iattr structure
376 *
377 * Returns 0 on success, -error on failure.
378 *
379 * simple_setattr implements setattr for an in-memory filesystem which
380 * does not store its own file data or metadata (eg. uses the page cache
381 * and inode cache as its data store).
382 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{
385 struct inode *inode = dentry->d_inode;
386 int error;
387
388 error = inode_change_ok(inode, iattr);
389 if (error)
390 return error;
391
392 if (iattr->ia_valid & ATTR_SIZE) {
393 error = simple_setsize(inode, iattr->ia_size);
394 if (error)
395 return error;
396 }
397
398 generic_setattr(inode, iattr);
399
400 return error;
401}
402EXPORT_SYMBOL(simple_setattr);
403
333int simple_readpage(struct file *file, struct page *page) 404int simple_readpage(struct file *file, struct page *page)
334{ 405{
335 clear_highpage(page); 406 clear_highpage(page);
@@ -418,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
418 * unique inode values later for this filesystem, then you must take care 489 * unique inode values later for this filesystem, then you must take care
419 * to pass it an appropriate max_reserved value to avoid collisions. 490 * to pass it an appropriate max_reserved value to avoid collisions.
420 */ 491 */
421int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) 492int simple_fill_super(struct super_block *s, unsigned long magic,
493 struct tree_descr *files)
422{ 494{
423 struct inode *inode; 495 struct inode *inode;
424 struct dentry *root; 496 struct dentry *root;
@@ -851,13 +923,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
851} 923}
852EXPORT_SYMBOL_GPL(generic_fh_to_parent); 924EXPORT_SYMBOL_GPL(generic_fh_to_parent);
853 925
854int simple_fsync(struct file *file, struct dentry *dentry, int datasync) 926/**
927 * generic_file_fsync - generic fsync implementation for simple filesystems
928 * @file: file to synchronize
929 * @datasync: only synchronize essential metadata if true
930 *
931 * This is a generic implementation of the fsync method for simple
932 * filesystems which track all non-inode metadata in the buffers list
933 * hanging off the address_space structure.
934 */
935int generic_file_fsync(struct file *file, int datasync)
855{ 936{
856 struct writeback_control wbc = { 937 struct writeback_control wbc = {
857 .sync_mode = WB_SYNC_ALL, 938 .sync_mode = WB_SYNC_ALL,
858 .nr_to_write = 0, /* metadata-only; caller takes care of data */ 939 .nr_to_write = 0, /* metadata-only; caller takes care of data */
859 }; 940 };
860 struct inode *inode = dentry->d_inode; 941 struct inode *inode = file->f_mapping->host;
861 int err; 942 int err;
862 int ret; 943 int ret;
863 944
@@ -872,7 +953,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
872 ret = err; 953 ret = err;
873 return ret; 954 return ret;
874} 955}
875EXPORT_SYMBOL(simple_fsync); 956EXPORT_SYMBOL(generic_file_fsync);
957
958/*
959 * No-op implementation of ->fsync for in-memory filesystems.
960 */
961int noop_fsync(struct file *file, int datasync)
962{
963 return 0;
964}
876 965
877EXPORT_SYMBOL(dcache_dir_close); 966EXPORT_SYMBOL(dcache_dir_close);
878EXPORT_SYMBOL(dcache_dir_lseek); 967EXPORT_SYMBOL(dcache_dir_lseek);
@@ -895,7 +984,7 @@ EXPORT_SYMBOL(simple_release_fs);
895EXPORT_SYMBOL(simple_rename); 984EXPORT_SYMBOL(simple_rename);
896EXPORT_SYMBOL(simple_rmdir); 985EXPORT_SYMBOL(simple_rmdir);
897EXPORT_SYMBOL(simple_statfs); 986EXPORT_SYMBOL(simple_statfs);
898EXPORT_SYMBOL(simple_sync_file); 987EXPORT_SYMBOL(noop_fsync);
899EXPORT_SYMBOL(simple_unlink); 988EXPORT_SYMBOL(simple_unlink);
900EXPORT_SYMBOL(simple_read_from_buffer); 989EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer); 990EXPORT_SYMBOL(simple_write_to_buffer);
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 0de524071870..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
219 } 219 }
220} 220}
221 221
222int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, int datasync)
223{ 223{
224 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = file->f_mapping->host->i_sb;
225 225
226 logfs_write_anchor(sb); 226 logfs_write_anchor(sb);
227 return 0; 227 return 0;
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 755a92e8daa7..f602e230e162 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -358,14 +358,7 @@ struct inode *logfs_new_inode(struct inode *dir, int mode)
358 inode->i_mode = mode; 358 inode->i_mode = mode;
359 logfs_set_ino_generation(sb, inode); 359 logfs_set_ino_generation(sb, inode);
360 360
361 inode->i_uid = current_fsuid(); 361 inode_init_owner(inode, dir, mode);
362 inode->i_gid = current_fsgid();
363 if (dir->i_mode & S_ISGID) {
364 inode->i_gid = dir->i_gid;
365 if (S_ISDIR(mode))
366 inode->i_mode |= S_ISGID;
367 }
368
369 logfs_inode_setops(inode); 362 logfs_inode_setops(inode);
370 insert_inode_hash(inode); 363 insert_inode_hash(inode);
371 364
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 1a9db84f8d8f..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
506int logfs_readpage(struct file *file, struct page *page); 506int logfs_readpage(struct file *file, struct page *page);
507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
508 unsigned long arg); 508 unsigned long arg);
509int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); 509int logfs_fsync(struct file *file, int datasync);
510 510
511/* gc.c */ 511/* gc.c */
512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); 512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c
index 6ac693faae49..482779fe4e7c 100644
--- a/fs/minix/bitmap.c
+++ b/fs/minix/bitmap.c
@@ -221,7 +221,7 @@ void minix_free_inode(struct inode * inode)
221 clear_inode(inode); /* clear in-memory copy */ 221 clear_inode(inode); /* clear in-memory copy */
222} 222}
223 223
224struct inode * minix_new_inode(const struct inode * dir, int * error) 224struct inode *minix_new_inode(const struct inode *dir, int mode, int *error)
225{ 225{
226 struct super_block *sb = dir->i_sb; 226 struct super_block *sb = dir->i_sb;
227 struct minix_sb_info *sbi = minix_sb(sb); 227 struct minix_sb_info *sbi = minix_sb(sb);
@@ -263,8 +263,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error)
263 iput(inode); 263 iput(inode);
264 return NULL; 264 return NULL;
265 } 265 }
266 inode->i_uid = current_fsuid(); 266 inode_init_owner(inode, dir, mode);
267 inode->i_gid = (dir->i_mode & S_ISGID) ? dir->i_gid : current_fsgid();
268 inode->i_ino = j; 267 inode->i_ino = j;
269 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 268 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
270 inode->i_blocks = 0; 269 inode->i_blocks = 0;
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..1dbf921ca44b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81
82fail:
83 dir_put_page(page);
84 return ERR_PTR(-EIO);
85} 78}
86 79
87static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) 80static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index 9dcf95b42116..111f34ee9e3b 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -46,7 +46,7 @@ struct minix_sb_info {
46extern struct inode *minix_iget(struct super_block *, unsigned long); 46extern struct inode *minix_iget(struct super_block *, unsigned long);
47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **); 47extern struct minix_inode * minix_V1_raw_inode(struct super_block *, ino_t, struct buffer_head **);
48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **); 48extern struct minix2_inode * minix_V2_raw_inode(struct super_block *, ino_t, struct buffer_head **);
49extern struct inode * minix_new_inode(const struct inode * dir, int * error); 49extern struct inode * minix_new_inode(const struct inode *, int, int *);
50extern void minix_free_inode(struct inode * inode); 50extern void minix_free_inode(struct inode * inode);
51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi); 51extern unsigned long minix_count_free_inodes(struct minix_sb_info *sbi);
52extern int minix_new_block(struct inode * inode); 52extern int minix_new_block(struct inode * inode);
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 32b131cd6121..e20ee85955d1 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -46,10 +46,9 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, int mode, dev_
46 if (!old_valid_dev(rdev)) 46 if (!old_valid_dev(rdev))
47 return -EINVAL; 47 return -EINVAL;
48 48
49 inode = minix_new_inode(dir, &error); 49 inode = minix_new_inode(dir, mode, &error);
50 50
51 if (inode) { 51 if (inode) {
52 inode->i_mode = mode;
53 minix_set_inode(inode, rdev); 52 minix_set_inode(inode, rdev);
54 mark_inode_dirty(inode); 53 mark_inode_dirty(inode);
55 error = add_nondir(dentry, inode); 54 error = add_nondir(dentry, inode);
@@ -73,11 +72,10 @@ static int minix_symlink(struct inode * dir, struct dentry *dentry,
73 if (i > dir->i_sb->s_blocksize) 72 if (i > dir->i_sb->s_blocksize)
74 goto out; 73 goto out;
75 74
76 inode = minix_new_inode(dir, &err); 75 inode = minix_new_inode(dir, S_IFLNK | 0777, &err);
77 if (!inode) 76 if (!inode)
78 goto out; 77 goto out;
79 78
80 inode->i_mode = S_IFLNK | 0777;
81 minix_set_inode(inode, 0); 79 minix_set_inode(inode, 0);
82 err = page_symlink(inode, symname, i); 80 err = page_symlink(inode, symname, i);
83 if (err) 81 if (err)
@@ -117,13 +115,10 @@ static int minix_mkdir(struct inode * dir, struct dentry *dentry, int mode)
117 115
118 inode_inc_link_count(dir); 116 inode_inc_link_count(dir);
119 117
120 inode = minix_new_inode(dir, &err); 118 inode = minix_new_inode(dir, mode, &err);
121 if (!inode) 119 if (!inode)
122 goto out_dir; 120 goto out_dir;
123 121
124 inode->i_mode = S_IFDIR | mode;
125 if (dir->i_mode & S_ISGID)
126 inode->i_mode |= S_ISGID;
127 minix_set_inode(inode, 0); 122 minix_set_inode(inode, 0);
128 123
129 inode_inc_link_count(inode); 124 inode_inc_link_count(inode);
diff --git a/fs/namei.c b/fs/namei.c
index b86b96fe1dc3..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -523,9 +523,10 @@ static void path_put_conditional(struct path *path, struct nameidata *nd)
523static inline void path_to_nameidata(struct path *path, struct nameidata *nd) 523static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
524{ 524{
525 dput(nd->path.dentry); 525 dput(nd->path.dentry);
526 if (nd->path.mnt != path->mnt) 526 if (nd->path.mnt != path->mnt) {
527 mntput(nd->path.mnt); 527 mntput(nd->path.mnt);
528 nd->path.mnt = path->mnt; 528 nd->path.mnt = path->mnt;
529 }
529 nd->path.dentry = path->dentry; 530 nd->path.dentry = path->dentry;
530} 531}
531 532
@@ -1620,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1620 case LAST_DOTDOT: 1621 case LAST_DOTDOT:
1621 follow_dotdot(nd); 1622 follow_dotdot(nd);
1622 dir = nd->path.dentry; 1623 dir = nd->path.dentry;
1624 case LAST_DOT:
1623 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1625 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1624 if (!dir->d_op->d_revalidate(dir, nd)) { 1626 if (!dir->d_op->d_revalidate(dir, nd)) {
1625 error = -ESTALE; 1627 error = -ESTALE;
@@ -1627,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1627 } 1629 }
1628 } 1630 }
1629 /* fallthrough */ 1631 /* fallthrough */
1630 case LAST_DOT:
1631 case LAST_ROOT: 1632 case LAST_ROOT:
1632 if (open_flag & O_CREAT) 1633 if (open_flag & O_CREAT)
1633 goto exit; 1634 goto exit;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
55#ifdef CONFIG_COMPAT 56#ifdef CONFIG_COMPAT
56 .compat_ioctl = ncp_compat_ioctl, 57 .compat_ioctl = ncp_compat_ioctl,
57#endif 58#endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
22#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
24 24
25static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
26{ 26{
27 return 0; 27 return 0;
28} 28}
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
295 .llseek = ncp_remote_llseek, 295 .llseek = ncp_remote_llseek,
296 .read = ncp_file_read, 296 .read = ncp_file_read,
297 .write = ncp_file_write, 297 .write = ncp_file_write,
298 .ioctl = ncp_ioctl, 298 .unlocked_ioctl = ncp_ioctl,
299#ifdef CONFIG_COMPAT 299#ifdef CONFIG_COMPAT
300 .compat_ioctl = ncp_compat_ioctl, 300 .compat_ioctl = ncp_compat_ioctl,
301#endif 301#endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
20#include <linux/smp_lock.h> 20#include <linux/smp_lock.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/smp_lock.h>
23 24
24#include <linux/ncp_fs.h> 25#include <linux/ncp_fs.h>
25 26
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 262}
262#endif /* CONFIG_NCPFS_NLS */ 263#endif /* CONFIG_NCPFS_NLS */
263 264
264static int __ncp_ioctl(struct inode *inode, struct file *filp, 265static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
265 unsigned int cmd, unsigned long arg)
266{ 266{
267 struct inode *inode = filp->f_dentry->d_inode;
267 struct ncp_server *server = NCP_SERVER(inode); 268 struct ncp_server *server = NCP_SERVER(inode);
268 int result; 269 int result;
269 struct ncp_ioctl_request request; 270 struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
841 } 842 }
842} 843}
843 844
844int ncp_ioctl(struct inode *inode, struct file *filp, 845long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
845 unsigned int cmd, unsigned long arg)
846{ 846{
847 int ret; 847 long ret;
848 848
849 lock_kernel();
849 if (ncp_ioctl_need_write(cmd)) { 850 if (ncp_ioctl_need_write(cmd)) {
850 /* 851 /*
851 * inside the ioctl(), any failures which 852 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
853 * -EACCESS, so it seems consistent to keep 854 * -EACCESS, so it seems consistent to keep
854 * that here. 855 * that here.
855 */ 856 */
856 if (mnt_want_write(filp->f_path.mnt)) 857 if (mnt_want_write(filp->f_path.mnt)) {
857 return -EACCES; 858 ret = -EACCES;
859 goto out;
860 }
858 } 861 }
859 ret = __ncp_ioctl(inode, filp, cmd, arg); 862 ret = __ncp_ioctl(filp, cmd, arg);
860 if (ncp_ioctl_need_write(cmd)) 863 if (ncp_ioctl_need_write(cmd))
861 mnt_drop_write(filp->f_path.mnt); 864 mnt_drop_write(filp->f_path.mnt);
865
866out:
867 unlock_kernel();
862 return ret; 868 return ret;
863} 869}
864 870
865#ifdef CONFIG_COMPAT 871#ifdef CONFIG_COMPAT
866long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 872long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
867{ 873{
868 struct inode *inode = file->f_path.dentry->d_inode; 874 long ret;
869 int ret;
870 875
871 lock_kernel(); 876 lock_kernel();
872 arg = (unsigned long) compat_ptr(arg); 877 arg = (unsigned long) compat_ptr(arg);
873 ret = ncp_ioctl(inode, file, cmd, arg); 878 ret = ncp_ioctl(file, cmd, arg);
874 unlock_kernel(); 879 unlock_kernel();
875 return ret; 880 return ret;
876} 881}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -641,8 +641,10 @@ out:
641 * All directory operations under NFS are synchronous, so fsync() 641 * All directory operations under NFS are synchronous, so fsync()
642 * is a dummy operation. 642 * is a dummy operation.
643 */ 643 */
644static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 644static int nfs_fsync_dir(struct file *filp, int datasync)
645{ 645{
646 struct dentry *dentry = filp->f_path.dentry;
647
646 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 648 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
647 dentry->d_parent->d_name.name, dentry->d_name.name, 649 dentry->d_parent->d_name.name, dentry->d_name.name,
648 datasync); 650 datasync);
@@ -1741,6 +1743,7 @@ remove_lru_entry:
1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1743 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit(); 1744 smp_mb__after_clear_bit();
1743 } 1745 }
1746 spin_unlock(&inode->i_lock);
1744 } 1747 }
1745 spin_unlock(&nfs_access_lru_lock); 1748 spin_unlock(&nfs_access_lru_lock);
1746 nfs_access_free_list(&head); 1749 nfs_access_free_list(&head);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cac96bcc91e4..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 56static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 57static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
322 * whether any write errors occurred for this process. 322 * whether any write errors occurred for this process.
323 */ 323 */
324static int 324static int
325nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 325nfs_file_fsync(struct file *file, int datasync)
326{ 326{
327 struct dentry *dentry = file->f_path.dentry;
327 struct nfs_open_context *ctx = nfs_file_open_context(file); 328 struct nfs_open_context *ctx = nfs_file_open_context(file);
328 struct inode *inode = dentry->d_inode; 329 struct inode *inode = dentry->d_inode;
329 330
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
1060 goto out_nomem; 1060 goto out_nomem;
1061 rc = strict_strtoul(string, 10, &option); 1061 rc = strict_strtoul(string, 10, &option);
1062 kfree(string); 1062 kfree(string);
1063 if (rc != 0 || option > USHORT_MAX) 1063 if (rc != 0 || option > USHRT_MAX)
1064 goto out_invalid_value; 1064 goto out_invalid_value;
1065 mnt->nfs_server.port = option; 1065 mnt->nfs_server.port = option;
1066 break; 1066 break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
1181 goto out_nomem; 1181 goto out_nomem;
1182 rc = strict_strtoul(string, 10, &option); 1182 rc = strict_strtoul(string, 10, &option);
1183 kfree(string); 1183 kfree(string);
1184 if (rc != 0 || option > USHORT_MAX) 1184 if (rc != 0 || option > USHRT_MAX)
1185 goto out_invalid_value; 1185 goto out_invalid_value;
1186 mnt->mount_server.port = option; 1186 mnt->mount_server.port = option;
1187 break; 1187 break;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
1386 int res = 0; 1386 int res = 0;
1387 1387
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1389 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1391 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1392 spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1399 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1400 TASK_KILLABLE);
1401 else
1402 goto out_mark_dirty;
1401 } else 1403 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1404 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1405 return res;
1406 /* Note: If we exit without ensuring that the commit is complete,
1407 * we must mark the inode as dirty. Otherwise, future calls to
1408 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1409 * that the data is on the disk.
1410 */
1411out_mark_dirty:
1412 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1413 return res;
1405} 1414}
1406 1415
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1518 };
1510 int ret; 1519 int ret;
1511 1520
1512 while(PagePrivate(page)) { 1521 for (;;) {
1513 wait_on_page_writeback(page); 1522 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1523 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1524 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1525 if (ret < 0)
1517 goto out_error; 1526 goto out_error;
1527 continue;
1518 } 1528 }
1519 ret = sync_inode(inode, &wbc); 1529 if (!PagePrivate(page))
1530 break;
1531 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1532 if (ret < 0)
1521 goto out_error; 1533 goto out_error;
1522 } 1534 }
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7a9ae3254a4b..7e26caab2a26 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -44,8 +44,7 @@
44#define NFSDDBG_FACILITY NFSDDBG_PROC 44#define NFSDDBG_FACILITY NFSDDBG_PROC
45 45
46/* Globals */ 46/* Globals */
47static struct path rec_dir; 47static struct file *rec_file;
48static int rec_dir_init = 0;
49 48
50static int 49static int
51nfs4_save_creds(const struct cred **original_creds) 50nfs4_save_creds(const struct cred **original_creds)
@@ -117,33 +116,28 @@ out_no_tfm:
117 return status; 116 return status;
118} 117}
119 118
120static void
121nfsd4_sync_rec_dir(void)
122{
123 vfs_fsync(NULL, rec_dir.dentry, 0);
124}
125
126int 119int
127nfsd4_create_clid_dir(struct nfs4_client *clp) 120nfsd4_create_clid_dir(struct nfs4_client *clp)
128{ 121{
129 const struct cred *original_cred; 122 const struct cred *original_cred;
130 char *dname = clp->cl_recdir; 123 char *dname = clp->cl_recdir;
131 struct dentry *dentry; 124 struct dentry *dir, *dentry;
132 int status; 125 int status;
133 126
134 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname); 127 dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
135 128
136 if (!rec_dir_init || clp->cl_firststate) 129 if (!rec_file || clp->cl_firststate)
137 return 0; 130 return 0;
138 131
139 status = nfs4_save_creds(&original_cred); 132 status = nfs4_save_creds(&original_cred);
140 if (status < 0) 133 if (status < 0)
141 return status; 134 return status;
142 135
136 dir = rec_file->f_path.dentry;
143 /* lock the parent */ 137 /* lock the parent */
144 mutex_lock(&rec_dir.dentry->d_inode->i_mutex); 138 mutex_lock(&dir->d_inode->i_mutex);
145 139
146 dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1); 140 dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
147 if (IS_ERR(dentry)) { 141 if (IS_ERR(dentry)) {
148 status = PTR_ERR(dentry); 142 status = PTR_ERR(dentry);
149 goto out_unlock; 143 goto out_unlock;
@@ -153,18 +147,18 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
153 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); 147 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
154 goto out_put; 148 goto out_put;
155 } 149 }
156 status = mnt_want_write(rec_dir.mnt); 150 status = mnt_want_write(rec_file->f_path.mnt);
157 if (status) 151 if (status)
158 goto out_put; 152 goto out_put;
159 status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU); 153 status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU);
160 mnt_drop_write(rec_dir.mnt); 154 mnt_drop_write(rec_file->f_path.mnt);
161out_put: 155out_put:
162 dput(dentry); 156 dput(dentry);
163out_unlock: 157out_unlock:
164 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 158 mutex_unlock(&dir->d_inode->i_mutex);
165 if (status == 0) { 159 if (status == 0) {
166 clp->cl_firststate = 1; 160 clp->cl_firststate = 1;
167 nfsd4_sync_rec_dir(); 161 vfs_fsync(rec_file, 0);
168 } 162 }
169 nfs4_reset_creds(original_cred); 163 nfs4_reset_creds(original_cred);
170 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); 164 dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -206,14 +200,14 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
206 struct dentry *dentry; 200 struct dentry *dentry;
207 int status; 201 int status;
208 202
209 if (!rec_dir_init) 203 if (!rec_file)
210 return 0; 204 return 0;
211 205
212 status = nfs4_save_creds(&original_cred); 206 status = nfs4_save_creds(&original_cred);
213 if (status < 0) 207 if (status < 0)
214 return status; 208 return status;
215 209
216 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 210 filp = dentry_open(dget(dir), mntget(rec_file->f_path.mnt), O_RDONLY,
217 current_cred()); 211 current_cred());
218 status = PTR_ERR(filp); 212 status = PTR_ERR(filp);
219 if (IS_ERR(filp)) 213 if (IS_ERR(filp))
@@ -250,13 +244,14 @@ out:
250static int 244static int
251nfsd4_unlink_clid_dir(char *name, int namlen) 245nfsd4_unlink_clid_dir(char *name, int namlen)
252{ 246{
253 struct dentry *dentry; 247 struct dentry *dir, *dentry;
254 int status; 248 int status;
255 249
256 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); 250 dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
257 251
258 mutex_lock_nested(&rec_dir.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 252 dir = rec_file->f_path.dentry;
259 dentry = lookup_one_len(name, rec_dir.dentry, namlen); 253 mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT);
254 dentry = lookup_one_len(name, dir, namlen);
260 if (IS_ERR(dentry)) { 255 if (IS_ERR(dentry)) {
261 status = PTR_ERR(dentry); 256 status = PTR_ERR(dentry);
262 goto out_unlock; 257 goto out_unlock;
@@ -264,11 +259,11 @@ nfsd4_unlink_clid_dir(char *name, int namlen)
264 status = -ENOENT; 259 status = -ENOENT;
265 if (!dentry->d_inode) 260 if (!dentry->d_inode)
266 goto out; 261 goto out;
267 status = vfs_rmdir(rec_dir.dentry->d_inode, dentry); 262 status = vfs_rmdir(dir->d_inode, dentry);
268out: 263out:
269 dput(dentry); 264 dput(dentry);
270out_unlock: 265out_unlock:
271 mutex_unlock(&rec_dir.dentry->d_inode->i_mutex); 266 mutex_unlock(&dir->d_inode->i_mutex);
272 return status; 267 return status;
273} 268}
274 269
@@ -278,10 +273,10 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
278 const struct cred *original_cred; 273 const struct cred *original_cred;
279 int status; 274 int status;
280 275
281 if (!rec_dir_init || !clp->cl_firststate) 276 if (!rec_file || !clp->cl_firststate)
282 return; 277 return;
283 278
284 status = mnt_want_write(rec_dir.mnt); 279 status = mnt_want_write(rec_file->f_path.mnt);
285 if (status) 280 if (status)
286 goto out; 281 goto out;
287 clp->cl_firststate = 0; 282 clp->cl_firststate = 0;
@@ -293,8 +288,8 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
293 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 288 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
294 nfs4_reset_creds(original_cred); 289 nfs4_reset_creds(original_cred);
295 if (status == 0) 290 if (status == 0)
296 nfsd4_sync_rec_dir(); 291 vfs_fsync(rec_file, 0);
297 mnt_drop_write(rec_dir.mnt); 292 mnt_drop_write(rec_file->f_path.mnt);
298out: 293out:
299 if (status) 294 if (status)
300 printk("NFSD: Failed to remove expired client state directory" 295 printk("NFSD: Failed to remove expired client state directory"
@@ -323,19 +318,19 @@ void
323nfsd4_recdir_purge_old(void) { 318nfsd4_recdir_purge_old(void) {
324 int status; 319 int status;
325 320
326 if (!rec_dir_init) 321 if (!rec_file)
327 return; 322 return;
328 status = mnt_want_write(rec_dir.mnt); 323 status = mnt_want_write(rec_file->f_path.mnt);
329 if (status) 324 if (status)
330 goto out; 325 goto out;
331 status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); 326 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, purge_old);
332 if (status == 0) 327 if (status == 0)
333 nfsd4_sync_rec_dir(); 328 vfs_fsync(rec_file, 0);
334 mnt_drop_write(rec_dir.mnt); 329 mnt_drop_write(rec_file->f_path.mnt);
335out: 330out:
336 if (status) 331 if (status)
337 printk("nfsd4: failed to purge old clients from recovery" 332 printk("nfsd4: failed to purge old clients from recovery"
338 " directory %s\n", rec_dir.dentry->d_name.name); 333 " directory %s\n", rec_file->f_path.dentry->d_name.name);
339} 334}
340 335
341static int 336static int
@@ -355,10 +350,13 @@ int
355nfsd4_recdir_load(void) { 350nfsd4_recdir_load(void) {
356 int status; 351 int status;
357 352
358 status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir); 353 if (!rec_file)
354 return 0;
355
356 status = nfsd4_list_rec_dir(rec_file->f_path.dentry, load_recdir);
359 if (status) 357 if (status)
360 printk("nfsd4: failed loading clients from recovery" 358 printk("nfsd4: failed loading clients from recovery"
361 " directory %s\n", rec_dir.dentry->d_name.name); 359 " directory %s\n", rec_file->f_path.dentry->d_name.name);
362 return status; 360 return status;
363} 361}
364 362
@@ -375,7 +373,7 @@ nfsd4_init_recdir(char *rec_dirname)
375 printk("NFSD: Using %s as the NFSv4 state recovery directory\n", 373 printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
376 rec_dirname); 374 rec_dirname);
377 375
378 BUG_ON(rec_dir_init); 376 BUG_ON(rec_file);
379 377
380 status = nfs4_save_creds(&original_cred); 378 status = nfs4_save_creds(&original_cred);
381 if (status < 0) { 379 if (status < 0) {
@@ -385,22 +383,21 @@ nfsd4_init_recdir(char *rec_dirname)
385 return; 383 return;
386 } 384 }
387 385
388 status = kern_path(rec_dirname, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 386 rec_file = filp_open(rec_dirname, O_RDONLY | O_DIRECTORY, 0);
389 &rec_dir); 387 if (IS_ERR(rec_file)) {
390 if (status)
391 printk("NFSD: unable to find recovery directory %s\n", 388 printk("NFSD: unable to find recovery directory %s\n",
392 rec_dirname); 389 rec_dirname);
390 rec_file = NULL;
391 }
393 392
394 if (!status)
395 rec_dir_init = 1;
396 nfs4_reset_creds(original_cred); 393 nfs4_reset_creds(original_cred);
397} 394}
398 395
399void 396void
400nfsd4_shutdown_recdir(void) 397nfsd4_shutdown_recdir(void)
401{ 398{
402 if (!rec_dir_init) 399 if (!rec_file)
403 return; 400 return;
404 rec_dir_init = 0; 401 fput(rec_file);
405 path_put(&rec_dir); 402 rec_file = NULL;
406} 403}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12f7109720c2..4a2734758778 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void)
4122 nfs4_lock_state(); 4122 nfs4_lock_state();
4123 nfs4_release_reclaim(); 4123 nfs4_release_reclaim();
4124 __nfs4_state_shutdown(); 4124 __nfs4_state_shutdown();
4125 nfsd4_destroy_callback_queue();
4126 nfs4_unlock_state(); 4125 nfs4_unlock_state();
4126 nfsd4_destroy_callback_queue();
4127} 4127}
4128 4128
4129/* 4129/*
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
998 if (sscanf(buf, "%15s %4u", transport, &port) != 2) 998 if (sscanf(buf, "%15s %4u", transport, &port) != 2)
999 return -EINVAL; 999 return -EINVAL;
1000 1000
1001 if (port < 1 || port > USHORT_MAX) 1001 if (port < 1 || port > USHRT_MAX)
1002 return -EINVAL; 1002 return -EINVAL;
1003 1003
1004 err = nfsd_create_serv(); 1004 err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2) 1040 if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
1041 return -EINVAL; 1041 return -EINVAL;
1042 1042
1043 if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL) 1043 if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
1044 return -EINVAL; 1044 return -EINVAL;
1045 1045
1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port); 1046 xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 23c06f77f4ca..3c111120b619 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
443 if (size_change) 443 if (size_change)
444 put_write_access(inode); 444 put_write_access(inode);
445 if (!err) 445 if (!err)
446 if (EX_ISSYNC(fhp->fh_export)) 446 commit_metadata(fhp);
447 write_inode_now(inode, 1);
448out: 447out:
449 return err; 448 return err;
450 449
@@ -999,7 +998,7 @@ static int wait_for_concurrent_writes(struct file *file)
999 998
1000 if (inode->i_state & I_DIRTY) { 999 if (inode->i_state & I_DIRTY) {
1001 dprintk("nfsd: write sync %d\n", task_pid_nr(current)); 1000 dprintk("nfsd: write sync %d\n", task_pid_nr(current));
1002 err = vfs_fsync(file, file->f_path.dentry, 0); 1001 err = vfs_fsync(file, 0);
1003 } 1002 }
1004 last_ino = inode->i_ino; 1003 last_ino = inode->i_ino;
1005 last_dev = inode->i_sb->s_dev; 1004 last_dev = inode->i_sb->s_dev;
@@ -1175,8 +1174,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
1175 if (err) 1174 if (err)
1176 goto out; 1175 goto out;
1177 if (EX_ISSYNC(fhp->fh_export)) { 1176 if (EX_ISSYNC(fhp->fh_export)) {
1178 int err2 = vfs_fsync_range(file, file->f_path.dentry, 1177 int err2 = vfs_fsync_range(file, offset, end, 0);
1179 offset, end, 0);
1180 1178
1181 if (err2 != -EINVAL) 1179 if (err2 != -EINVAL)
1182 err = nfserrno(err2); 1180 err = nfserrno(err2);
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index af638d59e3bf..43c8c5b541fd 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -75,8 +75,6 @@ struct nilfs_btree_path {
75 75
76extern struct kmem_cache *nilfs_btree_path_cache; 76extern struct kmem_cache *nilfs_btree_path_cache;
77 77
78int nilfs_btree_path_cache_init(void);
79void nilfs_btree_path_cache_destroy(void);
80int nilfs_btree_init(struct nilfs_bmap *); 78int nilfs_btree_init(struct nilfs_bmap *);
81int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, 79int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
82 const __u64 *, const __u64 *, int); 80 const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
27#include "nilfs.h" 27#include "nilfs.h"
28#include "segment.h" 28#include "segment.h"
29 29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 30int nilfs_sync_file(struct file *file, int datasync)
31{ 31{
32 /* 32 /*
33 * Called from fsync() system call 33 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
37 * This function should be implemented when the writeback function 37 * This function should be implemented when the writeback function
38 * will be implemented. 38 * will be implemented.
39 */ 39 */
40 struct inode *inode = dentry->d_inode; 40 struct inode *inode = file->f_mapping->host;
41 int err; 41 int err;
42 42
43 if (!nilfs_inode_dirty(inode)) 43 if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 5e226d4b41d3..39e038ac8fcb 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -280,16 +280,7 @@ struct inode *nilfs_new_inode(struct inode *dir, int mode)
280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */ 280 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
281 281
282 atomic_inc(&sbi->s_inodes_count); 282 atomic_inc(&sbi->s_inodes_count);
283 283 inode_init_owner(inode, dir, mode);
284 inode->i_uid = current_fsuid();
285 if (dir->i_mode & S_ISGID) {
286 inode->i_gid = dir->i_gid;
287 if (S_ISDIR(mode))
288 mode |= S_ISGID;
289 } else
290 inode->i_gid = current_fsgid();
291
292 inode->i_mode = mode;
293 inode->i_ino = ino; 284 inode->i_ino = ino;
294 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 285 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
295 286
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
228 struct page *, struct inode *); 228 struct page *, struct inode *);
229 229
230/* file.c */ 230/* file.c */
231extern int nilfs_sync_file(struct file *, struct dentry *, int); 231extern int nilfs_sync_file(struct file *, int);
232 232
233/* ioctl.c */ 233/* ioctl.c */
234long nilfs_ioctl(struct file *, unsigned int, unsigned long); 234long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fdf1c3b6d673..85fbb66455e2 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -127,8 +127,6 @@ struct nilfs_segment_buffer {
127 127
128extern struct kmem_cache *nilfs_segbuf_cachep; 128extern struct kmem_cache *nilfs_segbuf_cachep;
129 129
130int __init nilfs_init_segbuf_cache(void);
131void nilfs_destroy_segbuf_cache(void);
132struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); 130struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
133void nilfs_segbuf_free(struct nilfs_segment_buffer *); 131void nilfs_segbuf_free(struct nilfs_segment_buffer *);
134void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, 132void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index dca142361ccf..01e20dbb217d 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -221,8 +221,6 @@ enum {
221extern struct kmem_cache *nilfs_transaction_cachep; 221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void);
225extern void nilfs_destroy_transaction_cache(void);
226extern void nilfs_relax_pressure_in_lock(struct super_block *); 224extern void nilfs_relax_pressure_in_lock(struct super_block *);
227 225
228extern int nilfs_construct_segment(struct super_block *); 226extern int nilfs_construct_segment(struct super_block *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 03b34b738993..414ef68931cf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj)
1130 1130
1131static void nilfs_destroy_cachep(void) 1131static void nilfs_destroy_cachep(void)
1132{ 1132{
1133 if (nilfs_inode_cachep) 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep); 1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep) 1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep); 1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep) 1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep); 1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache) 1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache); 1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141} 1141}
1142 1142
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a756168a21c2..8c1097327abc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -674,7 +674,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
674 start * sects_per_block, 674 start * sects_per_block,
675 nblocks * sects_per_block, 675 nblocks * sects_per_block,
676 GFP_NOFS, 676 GFP_NOFS,
677 DISCARD_FL_BARRIER); 677 BLKDEV_IFL_BARRIER);
678 if (ret < 0) 678 if (ret < 0)
679 return ret; 679 return ret;
680 nblocks = 0; 680 nblocks = 0;
@@ -684,7 +684,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
684 ret = blkdev_issue_discard(nilfs->ns_bdev, 684 ret = blkdev_issue_discard(nilfs->ns_bdev,
685 start * sects_per_block, 685 start * sects_per_block,
686 nblocks * sects_per_block, 686 nblocks * sects_per_block,
687 GFP_NOFS, DISCARD_FL_BARRIER); 687 GFP_NOFS, BLKDEV_IFL_BARRIER);
688 return ret; 688 return ret;
689} 689}
690 690
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 40b1cf914ccb..27b75ebc7460 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -110,14 +110,10 @@ EXPORT_SYMBOL_GPL(get_inotify_watch);
110int pin_inotify_watch(struct inotify_watch *watch) 110int pin_inotify_watch(struct inotify_watch *watch)
111{ 111{
112 struct super_block *sb = watch->inode->i_sb; 112 struct super_block *sb = watch->inode->i_sb;
113 spin_lock(&sb_lock); 113 if (atomic_inc_not_zero(&sb->s_active)) {
114 if (sb->s_count >= S_BIAS) {
115 atomic_inc(&sb->s_active);
116 spin_unlock(&sb_lock);
117 atomic_inc(&watch->count); 114 atomic_inc(&watch->count);
118 return 1; 115 return 1;
119 } 116 }
120 spin_unlock(&sb_lock);
121 return 0; 117 return 0;
122} 118}
123 119
@@ -515,34 +511,8 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
515 * done. Cleanup is just deactivate_super(). However, that leaves a messy 511 * done. Cleanup is just deactivate_super(). However, that leaves a messy
516 * case - what if we *are* racing with umount() and active references to 512 * case - what if we *are* racing with umount() and active references to
517 * superblock can't be acquired anymore? We can bump ->s_count, grab 513 * superblock can't be acquired anymore? We can bump ->s_count, grab
518 * ->s_umount, which will almost certainly wait until the superblock is shut 514 * ->s_umount, which will wait until the superblock is shut down and the
519 * down and the watch in question is pining for fjords. That's fine, but 515 * watch in question is pining for fjords.
520 * there is a problem - we might have hit the window between ->s_active
521 * getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock
522 * is past the point of no return and is heading for shutdown) and the
523 * moment when deactivate_super() acquires ->s_umount. We could just do
524 * drop_super() yield() and retry, but that's rather antisocial and this
525 * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having
526 * found that we'd got there first (i.e. that ->s_root is non-NULL) we know
527 * that we won't race with inotify_umount_inodes(). So we could grab a
528 * reference to watch and do the rest as above, just with drop_super() instead
529 * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we
530 * could grab ->s_umount. So the watch could've been gone already.
531 *
532 * That still can be dealt with - we need to save watch->wd, do idr_find()
533 * and compare its result with our pointer. If they match, we either have
534 * the damn thing still alive or we'd lost not one but two races at once,
535 * the watch had been killed and a new one got created with the same ->wd
536 * at the same address. That couldn't have happened in inotify_destroy(),
537 * but inotify_rm_wd() could run into that. Still, "new one got created"
538 * is not a problem - we have every right to kill it or leave it alone,
539 * whatever's more convenient.
540 *
541 * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as
542 * "grab it and kill it" check. If it's been our original watch, we are
543 * fine, if it's a newcomer - nevermind, just pretend that we'd won the
544 * race and kill the fscker anyway; we are safe since we know that its
545 * superblock won't be going away.
546 * 516 *
547 * And yes, this is far beyond mere "not very pretty"; so's the entire 517 * And yes, this is far beyond mere "not very pretty"; so's the entire
548 * concept of inotify to start with. 518 * concept of inotify to start with.
@@ -556,57 +526,31 @@ EXPORT_SYMBOL_GPL(inotify_init_watch);
556 * Called with ih->mutex held, drops it. Possible return values: 526 * Called with ih->mutex held, drops it. Possible return values:
557 * 0 - nothing to do, it has died 527 * 0 - nothing to do, it has died
558 * 1 - remove it, drop the reference and deactivate_super() 528 * 1 - remove it, drop the reference and deactivate_super()
559 * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid
560 * that variant, since it involved a lot of PITA, but that's the best that
561 * could've been done.
562 */ 529 */
563static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) 530static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch)
564{ 531{
565 struct super_block *sb = watch->inode->i_sb; 532 struct super_block *sb = watch->inode->i_sb;
566 s32 wd = watch->wd;
567 533
568 spin_lock(&sb_lock); 534 if (atomic_inc_not_zero(&sb->s_active)) {
569 if (sb->s_count >= S_BIAS) {
570 atomic_inc(&sb->s_active);
571 spin_unlock(&sb_lock);
572 get_inotify_watch(watch); 535 get_inotify_watch(watch);
573 mutex_unlock(&ih->mutex); 536 mutex_unlock(&ih->mutex);
574 return 1; /* the best outcome */ 537 return 1; /* the best outcome */
575 } 538 }
539 spin_lock(&sb_lock);
576 sb->s_count++; 540 sb->s_count++;
577 spin_unlock(&sb_lock); 541 spin_unlock(&sb_lock);
578 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ 542 mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */
579 down_read(&sb->s_umount); 543 down_read(&sb->s_umount);
580 if (likely(!sb->s_root)) { 544 /* fs is already shut down; the watch is dead */
581 /* fs is already shut down; the watch is dead */ 545 drop_super(sb);
582 drop_super(sb); 546 return 0;
583 return 0;
584 }
585 /* raced with the final deactivate_super() */
586 mutex_lock(&ih->mutex);
587 if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) {
588 /* the watch is dead */
589 mutex_unlock(&ih->mutex);
590 drop_super(sb);
591 return 0;
592 }
593 /* still alive or freed and reused with the same sb and wd; kill */
594 get_inotify_watch(watch);
595 mutex_unlock(&ih->mutex);
596 return 2;
597} 547}
598 548
599static void unpin_and_kill(struct inotify_watch *watch, int how) 549static void unpin_and_kill(struct inotify_watch *watch)
600{ 550{
601 struct super_block *sb = watch->inode->i_sb; 551 struct super_block *sb = watch->inode->i_sb;
602 put_inotify_watch(watch); 552 put_inotify_watch(watch);
603 switch (how) { 553 deactivate_super(sb);
604 case 1:
605 deactivate_super(sb);
606 break;
607 case 2:
608 drop_super(sb);
609 }
610} 554}
611 555
612/** 556/**
@@ -628,7 +572,6 @@ void inotify_destroy(struct inotify_handle *ih)
628 struct list_head *watches; 572 struct list_head *watches;
629 struct super_block *sb; 573 struct super_block *sb;
630 struct inode *inode; 574 struct inode *inode;
631 int how;
632 575
633 mutex_lock(&ih->mutex); 576 mutex_lock(&ih->mutex);
634 watches = &ih->watches; 577 watches = &ih->watches;
@@ -638,8 +581,7 @@ void inotify_destroy(struct inotify_handle *ih)
638 } 581 }
639 watch = list_first_entry(watches, struct inotify_watch, h_list); 582 watch = list_first_entry(watches, struct inotify_watch, h_list);
640 sb = watch->inode->i_sb; 583 sb = watch->inode->i_sb;
641 how = pin_to_kill(ih, watch); 584 if (!pin_to_kill(ih, watch))
642 if (!how)
643 continue; 585 continue;
644 586
645 inode = watch->inode; 587 inode = watch->inode;
@@ -654,7 +596,7 @@ void inotify_destroy(struct inotify_handle *ih)
654 596
655 mutex_unlock(&ih->mutex); 597 mutex_unlock(&ih->mutex);
656 mutex_unlock(&inode->inotify_mutex); 598 mutex_unlock(&inode->inotify_mutex);
657 unpin_and_kill(watch, how); 599 unpin_and_kill(watch);
658 } 600 }
659 601
660 /* free this handle: the put matching the get in inotify_init() */ 602 /* free this handle: the put matching the get in inotify_init() */
@@ -857,7 +799,6 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
857 struct inotify_watch *watch; 799 struct inotify_watch *watch;
858 struct super_block *sb; 800 struct super_block *sb;
859 struct inode *inode; 801 struct inode *inode;
860 int how;
861 802
862 mutex_lock(&ih->mutex); 803 mutex_lock(&ih->mutex);
863 watch = idr_find(&ih->idr, wd); 804 watch = idr_find(&ih->idr, wd);
@@ -866,8 +807,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
866 return -EINVAL; 807 return -EINVAL;
867 } 808 }
868 sb = watch->inode->i_sb; 809 sb = watch->inode->i_sb;
869 how = pin_to_kill(ih, watch); 810 if (!pin_to_kill(ih, watch))
870 if (!how)
871 return 0; 811 return 0;
872 812
873 inode = watch->inode; 813 inode = watch->inode;
@@ -881,7 +821,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd)
881 821
882 mutex_unlock(&ih->mutex); 822 mutex_unlock(&ih->mutex);
883 mutex_unlock(&inode->inotify_mutex); 823 mutex_unlock(&inode->inotify_mutex);
884 unpin_and_kill(watch, how); 824 unpin_and_kill(watch);
885 825
886 return 0; 826 return 0;
887} 827}
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1527 * this problem for now. We do write the $BITMAP attribute if it is present 1527 * this problem for now. We do write the $BITMAP attribute if it is present
1528 * which is the important one for a directory so things are not too bad. 1528 * which is the important one for a directory so things are not too bad.
1529 */ 1529 */
1530static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1531 int datasync)
1532{ 1531{
1533 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1534 int err, ret; 1533 int err, ret;
1535 ntfs_attr na; 1534 ntfs_attr na;
1536 1535
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
98 * the page at all. For a more detailed explanation see ntfs_truncate() in 98 * the page at all. For a more detailed explanation see ntfs_truncate() in
99 * fs/ntfs/inode.c. 99 * fs/ntfs/inode.c.
100 * 100 *
101 * @cached_page and @lru_pvec are just optimizations for dealing with multiple
102 * pages.
103 *
104 * Return 0 on success and -errno on error. In the case that an error is 101 * Return 0 on success and -errno on error. In the case that an error is
105 * encountered it is possible that the initialized size will already have been 102 * encountered it is possible that the initialized size will already have been
106 * incremented some way towards @new_init_size but it is guaranteed that if 103 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
110 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be 107 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
111 * held by the caller. 108 * held by the caller.
112 */ 109 */
113static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size, 110static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
114 struct page **cached_page, struct pagevec *lru_pvec)
115{ 111{
116 s64 old_init_size; 112 s64 old_init_size;
117 loff_t old_i_size; 113 loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
403 * Obtain @nr_pages locked page cache pages from the mapping @mapping and 399 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
404 * starting at index @index. 400 * starting at index @index.
405 * 401 *
406 * If a page is newly created, increment its refcount and add it to the 402 * If a page is newly created, add it to lru list
407 * caller's lru-buffering pagevec @lru_pvec.
408 *
409 * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
410 * are obtained at once instead of just one page and that 0 is returned on
411 * success and -errno on error.
412 * 403 *
413 * Note, the page locks are obtained in ascending page index order. 404 * Note, the page locks are obtained in ascending page index order.
414 */ 405 */
415static inline int __ntfs_grab_cache_pages(struct address_space *mapping, 406static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
416 pgoff_t index, const unsigned nr_pages, struct page **pages, 407 pgoff_t index, const unsigned nr_pages, struct page **pages,
417 struct page **cached_page, struct pagevec *lru_pvec) 408 struct page **cached_page)
418{ 409{
419 int err, nr; 410 int err, nr;
420 411
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
430 goto err_out; 421 goto err_out;
431 } 422 }
432 } 423 }
433 err = add_to_page_cache(*cached_page, mapping, index, 424 err = add_to_page_cache_lru(*cached_page, mapping, index,
434 GFP_KERNEL); 425 GFP_KERNEL);
435 if (unlikely(err)) { 426 if (unlikely(err)) {
436 if (err == -EEXIST) 427 if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
438 goto err_out; 429 goto err_out;
439 } 430 }
440 pages[nr] = *cached_page; 431 pages[nr] = *cached_page;
441 page_cache_get(*cached_page);
442 if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
443 __pagevec_lru_add_file(lru_pvec);
444 *cached_page = NULL; 432 *cached_page = NULL;
445 } 433 }
446 index++; 434 index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1800 ssize_t status, written; 1788 ssize_t status, written;
1801 unsigned nr_pages; 1789 unsigned nr_pages;
1802 int err; 1790 int err;
1803 struct pagevec lru_pvec;
1804 1791
1805 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " 1792 ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
1806 "pos 0x%llx, count 0x%lx.", 1793 "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1912 } 1899 }
1913 } 1900 }
1914 } 1901 }
1915 pagevec_init(&lru_pvec, 0);
1916 written = 0; 1902 written = 0;
1917 /* 1903 /*
1918 * If the write starts beyond the initialized size, extend it up to the 1904 * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
1925 ll = ni->initialized_size; 1911 ll = ni->initialized_size;
1926 read_unlock_irqrestore(&ni->size_lock, flags); 1912 read_unlock_irqrestore(&ni->size_lock, flags);
1927 if (pos > ll) { 1913 if (pos > ll) {
1928 err = ntfs_attr_extend_initialized(ni, pos, &cached_page, 1914 err = ntfs_attr_extend_initialized(ni, pos);
1929 &lru_pvec);
1930 if (err < 0) { 1915 if (err < 0) {
1931 ntfs_error(vol->sb, "Cannot perform write to inode " 1916 ntfs_error(vol->sb, "Cannot perform write to inode "
1932 "0x%lx, attribute type 0x%x, because " 1917 "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
2012 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); 1997 ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
2013 /* Get and lock @do_pages starting at index @start_idx. */ 1998 /* Get and lock @do_pages starting at index @start_idx. */
2014 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, 1999 status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
2015 pages, &cached_page, &lru_pvec); 2000 pages, &cached_page);
2016 if (unlikely(status)) 2001 if (unlikely(status))
2017 break; 2002 break;
2018 /* 2003 /*
@@ -2077,7 +2062,6 @@ err_out:
2077 *ppos = pos; 2062 *ppos = pos;
2078 if (cached_page) 2063 if (cached_page)
2079 page_cache_release(cached_page); 2064 page_cache_release(cached_page);
2080 pagevec_lru_add_file(&lru_pvec);
2081 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", 2065 ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
2082 written ? "written" : "status", (unsigned long)written, 2066 written ? "written" : "status", (unsigned long)written,
2083 (long)status); 2067 (long)status);
@@ -2149,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2149/** 2133/**
2150 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2151 * @filp: file to be synced 2135 * @filp: file to be synced
2152 * @dentry: dentry describing the file to sync
2153 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2154 * 2137 *
2155 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2165,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2165 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2166 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2167 * 2150 *
2168 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2169 * anyway.
2170 *
2171 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2172 * 2152 *
2173 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2174 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2175 * this problem for now. 2155 * this problem for now.
2176 */ 2156 */
2177static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2178 int datasync)
2179{ 2158{
2180 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2181 int err, ret = 0; 2160 int err, ret = 0;
2182 2161
2183 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e13fc9e8fcdc..da702294d7e7 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -489,7 +489,7 @@ cleanup:
489 return ret; 489 return ret;
490} 490}
491 491
492struct xattr_handler ocfs2_xattr_acl_access_handler = { 492const struct xattr_handler ocfs2_xattr_acl_access_handler = {
493 .prefix = POSIX_ACL_XATTR_ACCESS, 493 .prefix = POSIX_ACL_XATTR_ACCESS,
494 .flags = ACL_TYPE_ACCESS, 494 .flags = ACL_TYPE_ACCESS,
495 .list = ocfs2_xattr_list_acl_access, 495 .list = ocfs2_xattr_list_acl_access,
@@ -497,7 +497,7 @@ struct xattr_handler ocfs2_xattr_acl_access_handler = {
497 .set = ocfs2_xattr_set_acl, 497 .set = ocfs2_xattr_set_acl,
498}; 498};
499 499
500struct xattr_handler ocfs2_xattr_acl_default_handler = { 500const struct xattr_handler ocfs2_xattr_acl_default_handler = {
501 .prefix = POSIX_ACL_XATTR_DEFAULT, 501 .prefix = POSIX_ACL_XATTR_DEFAULT,
502 .flags = ACL_TYPE_DEFAULT, 502 .flags = ACL_TYPE_DEFAULT,
503 .list = ocfs2_xattr_list_acl_default, 503 .list = ocfs2_xattr_list_acl_default,
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 403 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
404 * larger than 16 bits. 404 * larger than 16 bits.
405 */ 405 */
406 BUG_ON(ecc > USHORT_MAX); 406 BUG_ON(ecc > USHRT_MAX);
407 407
408 bc->bc_crc32e = cpu_to_le32(crc); 408 bc->bc_crc32e = cpu_to_le32(crc);
409 bc->bc_ecc = cpu_to_le16((u16)ecc); 409 bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no 508 * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
509 * larger than 16 bits. 509 * larger than 16 bits.
510 */ 510 */
511 BUG_ON(ecc > USHORT_MAX); 511 BUG_ON(ecc > USHRT_MAX);
512 512
513 bc->bc_crc32e = cpu_to_le32(crc); 513 bc->bc_crc32e = cpu_to_le32(crc);
514 bc->bc_ecc = cpu_to_le16((u16)ecc); 514 bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 97e54b9e654b..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 175 return 0;
176} 176}
177 177
178static int ocfs2_sync_file(struct file *file, 178static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 179{
182 int err = 0; 180 int err = 0;
183 journal_t *journal; 181 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 185
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -1053,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1053 } 1052 }
1054 1053
1055 /* 1054 /*
1056 * This will intentionally not wind up calling vmtruncate(), 1055 * This will intentionally not wind up calling simple_setsize(),
1057 * since all the work for a size change has been done above. 1056 * since all the work for a size change has been done above.
1058 * Otherwise, we could get into problems with truncate as 1057 * Otherwise, we could get into problems with truncate as
1059 * ip_alloc_sem is used there to protect against i_size 1058 * ip_alloc_sem is used there to protect against i_size
@@ -2119,9 +2118,13 @@ relock:
2119 * direct write may have instantiated a few 2118 * direct write may have instantiated a few
2120 * blocks outside i_size. Trim these off again. 2119 * blocks outside i_size. Trim these off again.
2121 * Don't need i_size_read because we hold i_mutex. 2120 * Don't need i_size_read because we hold i_mutex.
2121 *
2122 * XXX(hch): this looks buggy because ocfs2 did not
2123 * actually implement ->truncate. Take a look at
2124 * the new truncate sequence and update this accordingly
2122 */ 2125 */
2123 if (*ppos + count > inode->i_size) 2126 if (*ppos + count > inode->i_size)
2124 vmtruncate(inode, inode->i_size); 2127 simple_setsize(inode, inode->i_size);
2125 ret = written; 2128 ret = written;
2126 goto out_dio; 2129 goto out_dio;
2127 } 2130 }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index db5dd3ed4df4..f171b51a74f7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -204,14 +204,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
204 inode->i_nlink = 2; 204 inode->i_nlink = 2;
205 else 205 else
206 inode->i_nlink = 1; 206 inode->i_nlink = 1;
207 inode->i_uid = current_fsuid(); 207 inode_init_owner(inode, dir, mode);
208 if (dir->i_mode & S_ISGID) {
209 inode->i_gid = dir->i_gid;
210 if (S_ISDIR(mode))
211 mode |= S_ISGID;
212 } else
213 inode->i_gid = current_fsgid();
214 inode->i_mode = mode;
215 dquot_initialize(inode); 208 dquot_initialize(inode);
216 return inode; 209 return inode;
217} 210}
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 812f10233b10..03a799fdd740 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
879 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 879 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
880 continue; 880 continue;
881 if (unsuspend) 881 if (unsuspend)
882 status = vfs_quota_enable( 882 status = dquot_resume(sb, type);
883 sb_dqopt(sb)->files[type], 883 else {
884 type, QFMT_OCFS2, 884 struct ocfs2_mem_dqinfo *oinfo;
885 DQUOT_SUSPENDED); 885
886 else 886 /* Cancel periodic syncing before suspending */
887 status = vfs_quota_disable(sb, type, 887 oinfo = sb_dqinfo(sb, type)->dqi_priv;
888 DQUOT_SUSPENDED); 888 cancel_delayed_work_sync(&oinfo->dqi_sync_work);
889 status = dquot_suspend(sb, type);
890 }
889 if (status < 0) 891 if (status < 0)
890 break; 892 break;
891 } 893 }
@@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
916 status = -ENOENT; 918 status = -ENOENT;
917 goto out_quota_off; 919 goto out_quota_off;
918 } 920 }
919 status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, 921 status = dquot_enable(inode[type], type, QFMT_OCFS2,
920 DQUOT_USAGE_ENABLED); 922 DQUOT_USAGE_ENABLED);
921 if (status < 0) 923 if (status < 0)
922 goto out_quota_off; 924 goto out_quota_off;
923 } 925 }
@@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
952 /* Turn off quotas. This will remove all dquot structures from 954 /* Turn off quotas. This will remove all dquot structures from
953 * memory and so they will be automatically synced to global 955 * memory and so they will be automatically synced to global
954 * quota files */ 956 * quota files */
955 vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | 957 dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
956 DQUOT_LIMITS_ENABLED); 958 DQUOT_LIMITS_ENABLED);
957 if (!inode) 959 if (!inode)
958 continue; 960 continue;
959 iput(inode); 961 iput(inode);
@@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
962 964
963/* Handle quota on quotactl */ 965/* Handle quota on quotactl */
964static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, 966static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
965 char *path, int remount) 967 char *path)
966{ 968{
967 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, 969 unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
968 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; 970 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
970 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) 972 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
971 return -EINVAL; 973 return -EINVAL;
972 974
973 if (remount) 975 return dquot_enable(sb_dqopt(sb)->files[type], type,
974 return 0; /* Just ignore it has been handled in 976 format_id, DQUOT_LIMITS_ENABLED);
975 * ocfs2_remount() */
976 return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
977 format_id, DQUOT_LIMITS_ENABLED);
978} 977}
979 978
980/* Handle quota off quotactl */ 979/* Handle quota off quotactl */
981static int ocfs2_quota_off(struct super_block *sb, int type, int remount) 980static int ocfs2_quota_off(struct super_block *sb, int type)
982{ 981{
983 if (remount) 982 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
984 return 0; /* Ignore now and handle later in
985 * ocfs2_remount() */
986 return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
987} 983}
988 984
989static const struct quotactl_ops ocfs2_quotactl_ops = { 985static const struct quotactl_ops ocfs2_quotactl_ops = {
990 .quota_on = ocfs2_quota_on, 986 .quota_on = ocfs2_quota_on,
991 .quota_off = ocfs2_quota_off, 987 .quota_off = ocfs2_quota_off,
992 .quota_sync = vfs_quota_sync, 988 .quota_sync = dquot_quota_sync,
993 .get_info = vfs_get_dqinfo, 989 .get_info = dquot_get_dqinfo,
994 .set_info = vfs_set_dqinfo, 990 .set_info = dquot_set_dqinfo,
995 .get_dqblk = vfs_get_dqblk, 991 .get_dqblk = dquot_get_dqblk,
996 .set_dqblk = vfs_set_dqblk, 992 .set_dqblk = dquot_set_dqblk,
997}; 993};
998 994
999static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 995static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 98ee6c44102d..e97b34842cfe 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -97,7 +97,7 @@ static struct ocfs2_xattr_def_value_root def_xv = {
97 .xv.xr_list.l_count = cpu_to_le16(1), 97 .xv.xr_list.l_count = cpu_to_le16(1),
98}; 98};
99 99
100struct xattr_handler *ocfs2_xattr_handlers[] = { 100const struct xattr_handler *ocfs2_xattr_handlers[] = {
101 &ocfs2_xattr_user_handler, 101 &ocfs2_xattr_user_handler,
102 &ocfs2_xattr_acl_access_handler, 102 &ocfs2_xattr_acl_access_handler,
103 &ocfs2_xattr_acl_default_handler, 103 &ocfs2_xattr_acl_default_handler,
@@ -106,7 +106,7 @@ struct xattr_handler *ocfs2_xattr_handlers[] = {
106 NULL 106 NULL
107}; 107};
108 108
109static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { 109static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, 110 [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] 111 [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
112 = &ocfs2_xattr_acl_access_handler, 112 = &ocfs2_xattr_acl_access_handler,
@@ -540,7 +540,7 @@ static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
540 540
541static inline const char *ocfs2_xattr_prefix(int name_index) 541static inline const char *ocfs2_xattr_prefix(int name_index)
542{ 542{
543 struct xattr_handler *handler = NULL; 543 const struct xattr_handler *handler = NULL;
544 544
545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX) 545 if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
546 handler = ocfs2_xattr_handler_map[name_index]; 546 handler = ocfs2_xattr_handler_map[name_index];
@@ -7213,7 +7213,7 @@ int ocfs2_init_security_set(handle_t *handle,
7213 xattr_ac, data_ac); 7213 xattr_ac, data_ac);
7214} 7214}
7215 7215
7216struct xattr_handler ocfs2_xattr_security_handler = { 7216const struct xattr_handler ocfs2_xattr_security_handler = {
7217 .prefix = XATTR_SECURITY_PREFIX, 7217 .prefix = XATTR_SECURITY_PREFIX,
7218 .list = ocfs2_xattr_security_list, 7218 .list = ocfs2_xattr_security_list,
7219 .get = ocfs2_xattr_security_get, 7219 .get = ocfs2_xattr_security_get,
@@ -7257,7 +7257,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
7257 name, value, size, flags); 7257 name, value, size, flags);
7258} 7258}
7259 7259
7260struct xattr_handler ocfs2_xattr_trusted_handler = { 7260const struct xattr_handler ocfs2_xattr_trusted_handler = {
7261 .prefix = XATTR_TRUSTED_PREFIX, 7261 .prefix = XATTR_TRUSTED_PREFIX,
7262 .list = ocfs2_xattr_trusted_list, 7262 .list = ocfs2_xattr_trusted_list,
7263 .get = ocfs2_xattr_trusted_get, 7263 .get = ocfs2_xattr_trusted_get,
@@ -7313,7 +7313,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
7313 name, value, size, flags); 7313 name, value, size, flags);
7314} 7314}
7315 7315
7316struct xattr_handler ocfs2_xattr_user_handler = { 7316const struct xattr_handler ocfs2_xattr_user_handler = {
7317 .prefix = XATTR_USER_PREFIX, 7317 .prefix = XATTR_USER_PREFIX,
7318 .list = ocfs2_xattr_user_list, 7318 .list = ocfs2_xattr_user_list,
7319 .get = ocfs2_xattr_user_get, 7319 .get = ocfs2_xattr_user_get,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index abd72a47f520..aa64bb37a65b 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -37,12 +37,12 @@ struct ocfs2_security_xattr_info {
37 size_t value_len; 37 size_t value_len;
38}; 38};
39 39
40extern struct xattr_handler ocfs2_xattr_user_handler; 40extern const struct xattr_handler ocfs2_xattr_user_handler;
41extern struct xattr_handler ocfs2_xattr_trusted_handler; 41extern const struct xattr_handler ocfs2_xattr_trusted_handler;
42extern struct xattr_handler ocfs2_xattr_security_handler; 42extern const struct xattr_handler ocfs2_xattr_security_handler;
43extern struct xattr_handler ocfs2_xattr_acl_access_handler; 43extern const struct xattr_handler ocfs2_xattr_acl_access_handler;
44extern struct xattr_handler ocfs2_xattr_acl_default_handler; 44extern const struct xattr_handler ocfs2_xattr_acl_default_handler;
45extern struct xattr_handler *ocfs2_xattr_handlers[]; 45extern const struct xattr_handler *ocfs2_xattr_handlers[];
46 46
47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); 47ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, 48int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
329 .aio_read = generic_file_aio_read, 329 .aio_read = generic_file_aio_read,
330 .aio_write = generic_file_aio_write, 330 .aio_write = generic_file_aio_write,
331 .mmap = generic_file_mmap, 331 .mmap = generic_file_mmap,
332 .fsync = simple_fsync, 332 .fsync = generic_file_fsync,
333 .splice_read = generic_file_splice_read, 333 .splice_read = generic_file_splice_read,
334}; 334};
335 335
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index b44bb835e8ea..089839a6cc64 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -37,9 +37,7 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
37 goto fail; 37 goto fail;
38 38
39 inode->i_ino = new_block; 39 inode->i_ino = new_block;
40 inode->i_mode = mode; 40 inode_init_owner(inode, NULL, mode);
41 inode->i_uid = current_fsuid();
42 inode->i_gid = current_fsgid();
43 inode->i_mapping->a_ops = &omfs_aops; 41 inode->i_mapping->a_ops = &omfs_aops;
44 42
45 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 43 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index 74e5cd9f718e..5463266db9e6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -17,7 +17,6 @@
17#include <linux/securebits.h> 17#include <linux/securebits.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/vfs.h>
21#include <linux/fcntl.h> 20#include <linux/fcntl.h>
22#include <linux/slab.h> 21#include <linux/slab.h>
23#include <asm/uaccess.h> 22#include <asm/uaccess.h>
@@ -33,171 +32,6 @@
33 32
34#include "internal.h" 33#include "internal.h"
35 34
36int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
37{
38 int retval = -ENODEV;
39
40 if (dentry) {
41 retval = -ENOSYS;
42 if (dentry->d_sb->s_op->statfs) {
43 memset(buf, 0, sizeof(*buf));
44 retval = security_sb_statfs(dentry);
45 if (retval)
46 return retval;
47 retval = dentry->d_sb->s_op->statfs(dentry, buf);
48 if (retval == 0 && buf->f_frsize == 0)
49 buf->f_frsize = buf->f_bsize;
50 }
51 }
52 return retval;
53}
54
55EXPORT_SYMBOL(vfs_statfs);
56
57static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
58{
59 struct kstatfs st;
60 int retval;
61
62 retval = vfs_statfs(dentry, &st);
63 if (retval)
64 return retval;
65
66 if (sizeof(*buf) == sizeof(st))
67 memcpy(buf, &st, sizeof(st));
68 else {
69 if (sizeof buf->f_blocks == 4) {
70 if ((st.f_blocks | st.f_bfree | st.f_bavail |
71 st.f_bsize | st.f_frsize) &
72 0xffffffff00000000ULL)
73 return -EOVERFLOW;
74 /*
75 * f_files and f_ffree may be -1; it's okay to stuff
76 * that into 32 bits
77 */
78 if (st.f_files != -1 &&
79 (st.f_files & 0xffffffff00000000ULL))
80 return -EOVERFLOW;
81 if (st.f_ffree != -1 &&
82 (st.f_ffree & 0xffffffff00000000ULL))
83 return -EOVERFLOW;
84 }
85
86 buf->f_type = st.f_type;
87 buf->f_bsize = st.f_bsize;
88 buf->f_blocks = st.f_blocks;
89 buf->f_bfree = st.f_bfree;
90 buf->f_bavail = st.f_bavail;
91 buf->f_files = st.f_files;
92 buf->f_ffree = st.f_ffree;
93 buf->f_fsid = st.f_fsid;
94 buf->f_namelen = st.f_namelen;
95 buf->f_frsize = st.f_frsize;
96 memset(buf->f_spare, 0, sizeof(buf->f_spare));
97 }
98 return 0;
99}
100
101static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
102{
103 struct kstatfs st;
104 int retval;
105
106 retval = vfs_statfs(dentry, &st);
107 if (retval)
108 return retval;
109
110 if (sizeof(*buf) == sizeof(st))
111 memcpy(buf, &st, sizeof(st));
112 else {
113 buf->f_type = st.f_type;
114 buf->f_bsize = st.f_bsize;
115 buf->f_blocks = st.f_blocks;
116 buf->f_bfree = st.f_bfree;
117 buf->f_bavail = st.f_bavail;
118 buf->f_files = st.f_files;
119 buf->f_ffree = st.f_ffree;
120 buf->f_fsid = st.f_fsid;
121 buf->f_namelen = st.f_namelen;
122 buf->f_frsize = st.f_frsize;
123 memset(buf->f_spare, 0, sizeof(buf->f_spare));
124 }
125 return 0;
126}
127
128SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
129{
130 struct path path;
131 int error;
132
133 error = user_path(pathname, &path);
134 if (!error) {
135 struct statfs tmp;
136 error = vfs_statfs_native(path.dentry, &tmp);
137 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
138 error = -EFAULT;
139 path_put(&path);
140 }
141 return error;
142}
143
144SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
145{
146 struct path path;
147 long error;
148
149 if (sz != sizeof(*buf))
150 return -EINVAL;
151 error = user_path(pathname, &path);
152 if (!error) {
153 struct statfs64 tmp;
154 error = vfs_statfs64(path.dentry, &tmp);
155 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
156 error = -EFAULT;
157 path_put(&path);
158 }
159 return error;
160}
161
162SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
163{
164 struct file * file;
165 struct statfs tmp;
166 int error;
167
168 error = -EBADF;
169 file = fget(fd);
170 if (!file)
171 goto out;
172 error = vfs_statfs_native(file->f_path.dentry, &tmp);
173 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
174 error = -EFAULT;
175 fput(file);
176out:
177 return error;
178}
179
180SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
181{
182 struct file * file;
183 struct statfs64 tmp;
184 int error;
185
186 if (sz != sizeof(*buf))
187 return -EINVAL;
188
189 error = -EBADF;
190 file = fget(fd);
191 if (!file)
192 goto out;
193 error = vfs_statfs64(file->f_path.dentry, &tmp);
194 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
195 error = -EFAULT;
196 fput(file);
197out:
198 return error;
199}
200
201int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, 35int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
202 struct file *filp) 36 struct file *filp)
203{ 37{
diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c
index a97b477ac0fc..6921e7890be6 100644
--- a/fs/partitions/acorn.c
+++ b/fs/partitions/acorn.c
@@ -70,14 +70,14 @@ struct riscix_record {
70 70
71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 71#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
72 defined(CONFIG_ACORN_PARTITION_ADFS) 72 defined(CONFIG_ACORN_PARTITION_ADFS)
73static int 73static int riscix_partition(struct parsed_partitions *state,
74riscix_partition(struct parsed_partitions *state, struct block_device *bdev, 74 unsigned long first_sect, int slot,
75 unsigned long first_sect, int slot, unsigned long nr_sects) 75 unsigned long nr_sects)
76{ 76{
77 Sector sect; 77 Sector sect;
78 struct riscix_record *rr; 78 struct riscix_record *rr;
79 79
80 rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, &sect); 80 rr = read_part_sector(state, first_sect, &sect);
81 if (!rr) 81 if (!rr)
82 return -1; 82 return -1;
83 83
@@ -123,9 +123,9 @@ struct linux_part {
123 123
124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ 124#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
125 defined(CONFIG_ACORN_PARTITION_ADFS) 125 defined(CONFIG_ACORN_PARTITION_ADFS)
126static int 126static int linux_partition(struct parsed_partitions *state,
127linux_partition(struct parsed_partitions *state, struct block_device *bdev, 127 unsigned long first_sect, int slot,
128 unsigned long first_sect, int slot, unsigned long nr_sects) 128 unsigned long nr_sects)
129{ 129{
130 Sector sect; 130 Sector sect;
131 struct linux_part *linuxp; 131 struct linux_part *linuxp;
@@ -135,7 +135,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
135 135
136 put_partition(state, slot++, first_sect, size); 136 put_partition(state, slot++, first_sect, size);
137 137
138 linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, &sect); 138 linuxp = read_part_sector(state, first_sect, &sect);
139 if (!linuxp) 139 if (!linuxp)
140 return -1; 140 return -1;
141 141
@@ -157,8 +157,7 @@ linux_partition(struct parsed_partitions *state, struct block_device *bdev,
157#endif 157#endif
158 158
159#ifdef CONFIG_ACORN_PARTITION_CUMANA 159#ifdef CONFIG_ACORN_PARTITION_CUMANA
160int 160int adfspart_check_CUMANA(struct parsed_partitions *state)
161adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
162{ 161{
163 unsigned long first_sector = 0; 162 unsigned long first_sector = 0;
164 unsigned int start_blk = 0; 163 unsigned int start_blk = 0;
@@ -185,7 +184,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
185 struct adfs_discrecord *dr; 184 struct adfs_discrecord *dr;
186 unsigned int nr_sects; 185 unsigned int nr_sects;
187 186
188 data = read_dev_sector(bdev, start_blk * 2 + 6, &sect); 187 data = read_part_sector(state, start_blk * 2 + 6, &sect);
189 if (!data) 188 if (!data)
190 return -1; 189 return -1;
191 190
@@ -217,14 +216,14 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
217#ifdef CONFIG_ACORN_PARTITION_RISCIX 216#ifdef CONFIG_ACORN_PARTITION_RISCIX
218 case PARTITION_RISCIX_SCSI: 217 case PARTITION_RISCIX_SCSI:
219 /* RISCiX - we don't know how to find the next one. */ 218 /* RISCiX - we don't know how to find the next one. */
220 slot = riscix_partition(state, bdev, first_sector, 219 slot = riscix_partition(state, first_sector, slot,
221 slot, nr_sects); 220 nr_sects);
222 break; 221 break;
223#endif 222#endif
224 223
225 case PARTITION_LINUX: 224 case PARTITION_LINUX:
226 slot = linux_partition(state, bdev, first_sector, 225 slot = linux_partition(state, first_sector, slot,
227 slot, nr_sects); 226 nr_sects);
228 break; 227 break;
229 } 228 }
230 put_dev_sector(sect); 229 put_dev_sector(sect);
@@ -249,8 +248,7 @@ adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev
249 * hda1 = ADFS partition on first drive. 248 * hda1 = ADFS partition on first drive.
250 * hda2 = non-ADFS partition. 249 * hda2 = non-ADFS partition.
251 */ 250 */
252int 251int adfspart_check_ADFS(struct parsed_partitions *state)
253adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
254{ 252{
255 unsigned long start_sect, nr_sects, sectscyl, heads; 253 unsigned long start_sect, nr_sects, sectscyl, heads;
256 Sector sect; 254 Sector sect;
@@ -259,7 +257,7 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
259 unsigned char id; 257 unsigned char id;
260 int slot = 1; 258 int slot = 1;
261 259
262 data = read_dev_sector(bdev, 6, &sect); 260 data = read_part_sector(state, 6, &sect);
263 if (!data) 261 if (!data)
264 return -1; 262 return -1;
265 263
@@ -278,21 +276,21 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
278 /* 276 /*
279 * Work out start of non-adfs partition. 277 * Work out start of non-adfs partition.
280 */ 278 */
281 nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect; 279 nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
282 280
283 if (start_sect) { 281 if (start_sect) {
284 switch (id) { 282 switch (id) {
285#ifdef CONFIG_ACORN_PARTITION_RISCIX 283#ifdef CONFIG_ACORN_PARTITION_RISCIX
286 case PARTITION_RISCIX_SCSI: 284 case PARTITION_RISCIX_SCSI:
287 case PARTITION_RISCIX_MFM: 285 case PARTITION_RISCIX_MFM:
288 slot = riscix_partition(state, bdev, start_sect, 286 slot = riscix_partition(state, start_sect, slot,
289 slot, nr_sects); 287 nr_sects);
290 break; 288 break;
291#endif 289#endif
292 290
293 case PARTITION_LINUX: 291 case PARTITION_LINUX:
294 slot = linux_partition(state, bdev, start_sect, 292 slot = linux_partition(state, start_sect, slot,
295 slot, nr_sects); 293 nr_sects);
296 break; 294 break;
297 } 295 }
298 } 296 }
@@ -308,10 +306,11 @@ struct ics_part {
308 __le32 size; 306 __le32 size;
309}; 307};
310 308
311static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block) 309static int adfspart_check_ICSLinux(struct parsed_partitions *state,
310 unsigned long block)
312{ 311{
313 Sector sect; 312 Sector sect;
314 unsigned char *data = read_dev_sector(bdev, block, &sect); 313 unsigned char *data = read_part_sector(state, block, &sect);
315 int result = 0; 314 int result = 0;
316 315
317 if (data) { 316 if (data) {
@@ -349,8 +348,7 @@ static inline int valid_ics_sector(const unsigned char *data)
349 * hda2 = ADFS partition 1 on first drive. 348 * hda2 = ADFS partition 1 on first drive.
350 * ..etc.. 349 * ..etc..
351 */ 350 */
352int 351int adfspart_check_ICS(struct parsed_partitions *state)
353adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
354{ 352{
355 const unsigned char *data; 353 const unsigned char *data;
356 const struct ics_part *p; 354 const struct ics_part *p;
@@ -360,7 +358,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
360 /* 358 /*
361 * Try ICS style partitions - sector 0 contains partition info. 359 * Try ICS style partitions - sector 0 contains partition info.
362 */ 360 */
363 data = read_dev_sector(bdev, 0, &sect); 361 data = read_part_sector(state, 0, &sect);
364 if (!data) 362 if (!data)
365 return -1; 363 return -1;
366 364
@@ -392,7 +390,7 @@ adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
392 * partition is. We must not make this visible 390 * partition is. We must not make this visible
393 * to the filesystem. 391 * to the filesystem.
394 */ 392 */
395 if (size > 1 && adfspart_check_ICSLinux(bdev, start)) { 393 if (size > 1 && adfspart_check_ICSLinux(state, start)) {
396 start += 1; 394 start += 1;
397 size -= 1; 395 size -= 1;
398 } 396 }
@@ -446,8 +444,7 @@ static inline int valid_ptec_sector(const unsigned char *data)
446 * hda2 = ADFS partition 1 on first drive. 444 * hda2 = ADFS partition 1 on first drive.
447 * ..etc.. 445 * ..etc..
448 */ 446 */
449int 447int adfspart_check_POWERTEC(struct parsed_partitions *state)
450adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
451{ 448{
452 Sector sect; 449 Sector sect;
453 const unsigned char *data; 450 const unsigned char *data;
@@ -455,7 +452,7 @@ adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bd
455 int slot = 1; 452 int slot = 1;
456 int i; 453 int i;
457 454
458 data = read_dev_sector(bdev, 0, &sect); 455 data = read_part_sector(state, 0, &sect);
459 if (!data) 456 if (!data)
460 return -1; 457 return -1;
461 458
@@ -508,8 +505,7 @@ static const char eesox_name[] = {
508 * 1. The individual ADFS boot block entries that are placed on the disk. 505 * 1. The individual ADFS boot block entries that are placed on the disk.
509 * 2. The start address of the next entry. 506 * 2. The start address of the next entry.
510 */ 507 */
511int 508int adfspart_check_EESOX(struct parsed_partitions *state)
512adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
513{ 509{
514 Sector sect; 510 Sector sect;
515 const unsigned char *data; 511 const unsigned char *data;
@@ -518,7 +514,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
518 sector_t start = 0; 514 sector_t start = 0;
519 int i, slot = 1; 515 int i, slot = 1;
520 516
521 data = read_dev_sector(bdev, 7, &sect); 517 data = read_part_sector(state, 7, &sect);
522 if (!data) 518 if (!data)
523 return -1; 519 return -1;
524 520
@@ -545,7 +541,7 @@ adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
545 if (i != 0) { 541 if (i != 0) {
546 sector_t size; 542 sector_t size;
547 543
548 size = get_capacity(bdev->bd_disk); 544 size = get_capacity(state->bdev->bd_disk);
549 put_partition(state, slot++, start, size - start); 545 put_partition(state, slot++, start, size - start);
550 printk("\n"); 546 printk("\n");
551 } 547 }
diff --git a/fs/partitions/acorn.h b/fs/partitions/acorn.h
index 81fd50ecc080..ede828529692 100644
--- a/fs/partitions/acorn.h
+++ b/fs/partitions/acorn.h
@@ -7,8 +7,8 @@
7 * format, and everyone stick to it? 7 * format, and everyone stick to it?
8 */ 8 */
9 9
10int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev); 10int adfspart_check_CUMANA(struct parsed_partitions *state);
11int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev); 11int adfspart_check_ADFS(struct parsed_partitions *state);
12int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev); 12int adfspart_check_ICS(struct parsed_partitions *state);
13int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev); 13int adfspart_check_POWERTEC(struct parsed_partitions *state);
14int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev); 14int adfspart_check_EESOX(struct parsed_partitions *state);
diff --git a/fs/partitions/amiga.c b/fs/partitions/amiga.c
index 9917a8c360f2..ba443d4229f8 100644
--- a/fs/partitions/amiga.c
+++ b/fs/partitions/amiga.c
@@ -23,8 +23,7 @@ checksum_block(__be32 *m, int size)
23 return sum; 23 return sum;
24} 24}
25 25
26int 26int amiga_partition(struct parsed_partitions *state)
27amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
28{ 27{
29 Sector sect; 28 Sector sect;
30 unsigned char *data; 29 unsigned char *data;
@@ -38,11 +37,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
38 for (blk = 0; ; blk++, put_dev_sector(sect)) { 37 for (blk = 0; ; blk++, put_dev_sector(sect)) {
39 if (blk == RDB_ALLOCATION_LIMIT) 38 if (blk == RDB_ALLOCATION_LIMIT)
40 goto rdb_done; 39 goto rdb_done;
41 data = read_dev_sector(bdev, blk, &sect); 40 data = read_part_sector(state, blk, &sect);
42 if (!data) { 41 if (!data) {
43 if (warn_no_part) 42 if (warn_no_part)
44 printk("Dev %s: unable to read RDB block %d\n", 43 printk("Dev %s: unable to read RDB block %d\n",
45 bdevname(bdev, b), blk); 44 bdevname(state->bdev, b), blk);
46 res = -1; 45 res = -1;
47 goto rdb_done; 46 goto rdb_done;
48 } 47 }
@@ -64,7 +63,7 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
64 } 63 }
65 64
66 printk("Dev %s: RDB in block %d has bad checksum\n", 65 printk("Dev %s: RDB in block %d has bad checksum\n",
67 bdevname(bdev, b), blk); 66 bdevname(state->bdev, b), blk);
68 } 67 }
69 68
70 /* blksize is blocks per 512 byte standard block */ 69 /* blksize is blocks per 512 byte standard block */
@@ -75,11 +74,11 @@ amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
75 put_dev_sector(sect); 74 put_dev_sector(sect);
76 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { 75 for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
77 blk *= blksize; /* Read in terms partition table understands */ 76 blk *= blksize; /* Read in terms partition table understands */
78 data = read_dev_sector(bdev, blk, &sect); 77 data = read_part_sector(state, blk, &sect);
79 if (!data) { 78 if (!data) {
80 if (warn_no_part) 79 if (warn_no_part)
81 printk("Dev %s: unable to read partition block %d\n", 80 printk("Dev %s: unable to read partition block %d\n",
82 bdevname(bdev, b), blk); 81 bdevname(state->bdev, b), blk);
83 res = -1; 82 res = -1;
84 goto rdb_done; 83 goto rdb_done;
85 } 84 }
diff --git a/fs/partitions/amiga.h b/fs/partitions/amiga.h
index 2f3e9ce22d53..d094585cadaa 100644
--- a/fs/partitions/amiga.h
+++ b/fs/partitions/amiga.h
@@ -2,5 +2,5 @@
2 * fs/partitions/amiga.h 2 * fs/partitions/amiga.h
3 */ 3 */
4 4
5int amiga_partition(struct parsed_partitions *state, struct block_device *bdev); 5int amiga_partition(struct parsed_partitions *state);
6 6
diff --git a/fs/partitions/atari.c b/fs/partitions/atari.c
index 1f3572d5b755..4439ff1b6cec 100644
--- a/fs/partitions/atari.c
+++ b/fs/partitions/atari.c
@@ -30,7 +30,7 @@ static inline int OK_id(char *s)
30 memcmp (s, "RAW", 3) == 0 ; 30 memcmp (s, "RAW", 3) == 0 ;
31} 31}
32 32
33int atari_partition(struct parsed_partitions *state, struct block_device *bdev) 33int atari_partition(struct parsed_partitions *state)
34{ 34{
35 Sector sect; 35 Sector sect;
36 struct rootsector *rs; 36 struct rootsector *rs;
@@ -42,12 +42,12 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ 42 int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
43#endif 43#endif
44 44
45 rs = (struct rootsector *) read_dev_sector(bdev, 0, &sect); 45 rs = read_part_sector(state, 0, &sect);
46 if (!rs) 46 if (!rs)
47 return -1; 47 return -1;
48 48
49 /* Verify this is an Atari rootsector: */ 49 /* Verify this is an Atari rootsector: */
50 hd_size = bdev->bd_inode->i_size >> 9; 50 hd_size = state->bdev->bd_inode->i_size >> 9;
51 if (!VALID_PARTITION(&rs->part[0], hd_size) && 51 if (!VALID_PARTITION(&rs->part[0], hd_size) &&
52 !VALID_PARTITION(&rs->part[1], hd_size) && 52 !VALID_PARTITION(&rs->part[1], hd_size) &&
53 !VALID_PARTITION(&rs->part[2], hd_size) && 53 !VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -84,7 +84,7 @@ int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
84 printk(" XGM<"); 84 printk(" XGM<");
85 partsect = extensect = be32_to_cpu(pi->st); 85 partsect = extensect = be32_to_cpu(pi->st);
86 while (1) { 86 while (1) {
87 xrs = (struct rootsector *)read_dev_sector(bdev, partsect, &sect2); 87 xrs = read_part_sector(state, partsect, &sect2);
88 if (!xrs) { 88 if (!xrs) {
89 printk (" block %ld read failed\n", partsect); 89 printk (" block %ld read failed\n", partsect);
90 put_dev_sector(sect); 90 put_dev_sector(sect);
diff --git a/fs/partitions/atari.h b/fs/partitions/atari.h
index 63186b00e135..fe2d32a89f36 100644
--- a/fs/partitions/atari.h
+++ b/fs/partitions/atari.h
@@ -31,4 +31,4 @@ struct rootsector
31 u16 checksum; /* checksum for bootable disks */ 31 u16 checksum; /* checksum for bootable disks */
32} __attribute__((__packed__)); 32} __attribute__((__packed__));
33 33
34int atari_partition(struct parsed_partitions *state, struct block_device *bdev); 34int atari_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index e238ab23a9e7..5dcd4b0c5533 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -45,7 +45,7 @@ extern void md_autodetect_dev(dev_t dev);
45 45
46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ 46int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
47 47
48static int (*check_part[])(struct parsed_partitions *, struct block_device *) = { 48static int (*check_part[])(struct parsed_partitions *) = {
49 /* 49 /*
50 * Probe partition formats with tables at disk address 0 50 * Probe partition formats with tables at disk address 0
51 * that also have an ADFS boot block at 0xdc0. 51 * that also have an ADFS boot block at 0xdc0.
@@ -161,10 +161,11 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
161 struct parsed_partitions *state; 161 struct parsed_partitions *state;
162 int i, res, err; 162 int i, res, err;
163 163
164 state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL); 164 state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
165 if (!state) 165 if (!state)
166 return NULL; 166 return NULL;
167 167
168 state->bdev = bdev;
168 disk_name(hd, 0, state->name); 169 disk_name(hd, 0, state->name);
169 printk(KERN_INFO " %s:", state->name); 170 printk(KERN_INFO " %s:", state->name);
170 if (isdigit(state->name[strlen(state->name)-1])) 171 if (isdigit(state->name[strlen(state->name)-1]))
@@ -174,7 +175,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
174 i = res = err = 0; 175 i = res = err = 0;
175 while (!res && check_part[i]) { 176 while (!res && check_part[i]) {
176 memset(&state->parts, 0, sizeof(state->parts)); 177 memset(&state->parts, 0, sizeof(state->parts));
177 res = check_part[i++](state, bdev); 178 res = check_part[i++](state);
178 if (res < 0) { 179 if (res < 0) {
179 /* We have hit an I/O error which we don't report now. 180 /* We have hit an I/O error which we don't report now.
180 * But record it, and let the others do their job. 181 * But record it, and let the others do their job.
@@ -186,6 +187,8 @@ check_partition(struct gendisk *hd, struct block_device *bdev)
186 } 187 }
187 if (res > 0) 188 if (res > 0)
188 return state; 189 return state;
190 if (state->access_beyond_eod)
191 err = -ENOSPC;
189 if (err) 192 if (err)
190 /* The partition is unrecognized. So report I/O errors if there were any */ 193 /* The partition is unrecognized. So report I/O errors if there were any */
191 res = err; 194 res = err;
@@ -538,12 +541,33 @@ exit:
538 disk_part_iter_exit(&piter); 541 disk_part_iter_exit(&piter);
539} 542}
540 543
544static bool disk_unlock_native_capacity(struct gendisk *disk)
545{
546 const struct block_device_operations *bdops = disk->fops;
547
548 if (bdops->unlock_native_capacity &&
549 !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
550 printk(KERN_CONT "enabling native capacity\n");
551 bdops->unlock_native_capacity(disk);
552 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
553 return true;
554 } else {
555 printk(KERN_CONT "truncated\n");
556 return false;
557 }
558}
559
541int rescan_partitions(struct gendisk *disk, struct block_device *bdev) 560int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
542{ 561{
562 struct parsed_partitions *state = NULL;
543 struct disk_part_iter piter; 563 struct disk_part_iter piter;
544 struct hd_struct *part; 564 struct hd_struct *part;
545 struct parsed_partitions *state;
546 int p, highest, res; 565 int p, highest, res;
566rescan:
567 if (state && !IS_ERR(state)) {
568 kfree(state);
569 state = NULL;
570 }
547 571
548 if (bdev->bd_part_count) 572 if (bdev->bd_part_count)
549 return -EBUSY; 573 return -EBUSY;
@@ -562,8 +586,32 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
562 bdev->bd_invalidated = 0; 586 bdev->bd_invalidated = 0;
563 if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) 587 if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
564 return 0; 588 return 0;
565 if (IS_ERR(state)) /* I/O error reading the partition table */ 589 if (IS_ERR(state)) {
590 /*
591 * I/O error reading the partition table. If any
592 * partition code tried to read beyond EOD, retry
593 * after unlocking native capacity.
594 */
595 if (PTR_ERR(state) == -ENOSPC) {
596 printk(KERN_WARNING "%s: partition table beyond EOD, ",
597 disk->disk_name);
598 if (disk_unlock_native_capacity(disk))
599 goto rescan;
600 }
566 return -EIO; 601 return -EIO;
602 }
603 /*
604 * If any partition code tried to read beyond EOD, try
605 * unlocking native capacity even if partition table is
606 * sucessfully read as we could be missing some partitions.
607 */
608 if (state->access_beyond_eod) {
609 printk(KERN_WARNING
610 "%s: partition table partially beyond EOD, ",
611 disk->disk_name);
612 if (disk_unlock_native_capacity(disk))
613 goto rescan;
614 }
567 615
568 /* tell userspace that the media / partition table may have changed */ 616 /* tell userspace that the media / partition table may have changed */
569 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); 617 kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
@@ -581,7 +629,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
581 /* add partitions */ 629 /* add partitions */
582 for (p = 1; p < state->limit; p++) { 630 for (p = 1; p < state->limit; p++) {
583 sector_t size, from; 631 sector_t size, from;
584try_scan: 632
585 size = state->parts[p].size; 633 size = state->parts[p].size;
586 if (!size) 634 if (!size)
587 continue; 635 continue;
@@ -589,30 +637,21 @@ try_scan:
589 from = state->parts[p].from; 637 from = state->parts[p].from;
590 if (from >= get_capacity(disk)) { 638 if (from >= get_capacity(disk)) {
591 printk(KERN_WARNING 639 printk(KERN_WARNING
592 "%s: p%d ignored, start %llu is behind the end of the disk\n", 640 "%s: p%d start %llu is beyond EOD, ",
593 disk->disk_name, p, (unsigned long long) from); 641 disk->disk_name, p, (unsigned long long) from);
642 if (disk_unlock_native_capacity(disk))
643 goto rescan;
594 continue; 644 continue;
595 } 645 }
596 646
597 if (from + size > get_capacity(disk)) { 647 if (from + size > get_capacity(disk)) {
598 const struct block_device_operations *bdops = disk->fops;
599 unsigned long long capacity;
600
601 printk(KERN_WARNING 648 printk(KERN_WARNING
602 "%s: p%d size %llu exceeds device capacity, ", 649 "%s: p%d size %llu extends beyond EOD, ",
603 disk->disk_name, p, (unsigned long long) size); 650 disk->disk_name, p, (unsigned long long) size);
604 651
605 if (bdops->set_capacity && 652 if (disk_unlock_native_capacity(disk)) {
606 (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) { 653 /* free state and restart */
607 printk(KERN_CONT "enabling native capacity\n"); 654 goto rescan;
608 capacity = bdops->set_capacity(disk, ~0ULL);
609 disk->flags |= GENHD_FL_NATIVE_CAPACITY;
610 if (capacity > get_capacity(disk)) {
611 set_capacity(disk, capacity);
612 check_disk_size_change(disk, bdev);
613 bdev->bd_invalidated = 0;
614 }
615 goto try_scan;
616 } else { 655 } else {
617 /* 656 /*
618 * we can not ignore partitions of broken tables 657 * we can not ignore partitions of broken tables
@@ -620,7 +659,6 @@ try_scan:
620 * we limit them to the end of the disk to avoid 659 * we limit them to the end of the disk to avoid
621 * creating invalid block devices 660 * creating invalid block devices
622 */ 661 */
623 printk(KERN_CONT "limited to end of disk\n");
624 size = get_capacity(disk) - from; 662 size = get_capacity(disk) - from;
625 } 663 }
626 } 664 }
diff --git a/fs/partitions/check.h b/fs/partitions/check.h
index 98dbe1a84528..52f8bd399396 100644
--- a/fs/partitions/check.h
+++ b/fs/partitions/check.h
@@ -6,6 +6,7 @@
6 * description. 6 * description.
7 */ 7 */
8struct parsed_partitions { 8struct parsed_partitions {
9 struct block_device *bdev;
9 char name[BDEVNAME_SIZE]; 10 char name[BDEVNAME_SIZE];
10 struct { 11 struct {
11 sector_t from; 12 sector_t from;
@@ -14,8 +15,19 @@ struct parsed_partitions {
14 } parts[DISK_MAX_PARTS]; 15 } parts[DISK_MAX_PARTS];
15 int next; 16 int next;
16 int limit; 17 int limit;
18 bool access_beyond_eod;
17}; 19};
18 20
21static inline void *read_part_sector(struct parsed_partitions *state,
22 sector_t n, Sector *p)
23{
24 if (n >= get_capacity(state->bdev->bd_disk)) {
25 state->access_beyond_eod = true;
26 return NULL;
27 }
28 return read_dev_sector(state->bdev, n, p);
29}
30
19static inline void 31static inline void
20put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) 32put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
21{ 33{
diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c
index 91babdae7587..9efb2cfe2410 100644
--- a/fs/partitions/efi.c
+++ b/fs/partitions/efi.c
@@ -140,8 +140,7 @@ efi_crc32(const void *buf, unsigned long len)
140 * the part[0] entry for this disk, and is the number of 140 * the part[0] entry for this disk, and is the number of
141 * physical sectors available on the disk. 141 * physical sectors available on the disk.
142 */ 142 */
143static u64 143static u64 last_lba(struct block_device *bdev)
144last_lba(struct block_device *bdev)
145{ 144{
146 if (!bdev || !bdev->bd_inode) 145 if (!bdev || !bdev->bd_inode)
147 return 0; 146 return 0;
@@ -181,27 +180,28 @@ is_pmbr_valid(legacy_mbr *mbr)
181 180
182/** 181/**
183 * read_lba(): Read bytes from disk, starting at given LBA 182 * read_lba(): Read bytes from disk, starting at given LBA
184 * @bdev 183 * @state
185 * @lba 184 * @lba
186 * @buffer 185 * @buffer
187 * @size_t 186 * @size_t
188 * 187 *
189 * Description: Reads @count bytes from @bdev into @buffer. 188 * Description: Reads @count bytes from @state->bdev into @buffer.
190 * Returns number of bytes read on success, 0 on error. 189 * Returns number of bytes read on success, 0 on error.
191 */ 190 */
192static size_t 191static size_t read_lba(struct parsed_partitions *state,
193read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) 192 u64 lba, u8 *buffer, size_t count)
194{ 193{
195 size_t totalreadcount = 0; 194 size_t totalreadcount = 0;
195 struct block_device *bdev = state->bdev;
196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512); 196 sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
197 197
198 if (!bdev || !buffer || lba > last_lba(bdev)) 198 if (!buffer || lba > last_lba(bdev))
199 return 0; 199 return 0;
200 200
201 while (count) { 201 while (count) {
202 int copied = 512; 202 int copied = 512;
203 Sector sect; 203 Sector sect;
204 unsigned char *data = read_dev_sector(bdev, n++, &sect); 204 unsigned char *data = read_part_sector(state, n++, &sect);
205 if (!data) 205 if (!data)
206 break; 206 break;
207 if (copied > count) 207 if (copied > count)
@@ -217,19 +217,20 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
217 217
218/** 218/**
219 * alloc_read_gpt_entries(): reads partition entries from disk 219 * alloc_read_gpt_entries(): reads partition entries from disk
220 * @bdev 220 * @state
221 * @gpt - GPT header 221 * @gpt - GPT header
222 * 222 *
223 * Description: Returns ptes on success, NULL on error. 223 * Description: Returns ptes on success, NULL on error.
224 * Allocates space for PTEs based on information found in @gpt. 224 * Allocates space for PTEs based on information found in @gpt.
225 * Notes: remember to free pte when you're done! 225 * Notes: remember to free pte when you're done!
226 */ 226 */
227static gpt_entry * 227static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
228alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt) 228 gpt_header *gpt)
229{ 229{
230 size_t count; 230 size_t count;
231 gpt_entry *pte; 231 gpt_entry *pte;
232 if (!bdev || !gpt) 232
233 if (!gpt)
233 return NULL; 234 return NULL;
234 235
235 count = le32_to_cpu(gpt->num_partition_entries) * 236 count = le32_to_cpu(gpt->num_partition_entries) *
@@ -240,7 +241,7 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
240 if (!pte) 241 if (!pte)
241 return NULL; 242 return NULL;
242 243
243 if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba), 244 if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
244 (u8 *) pte, 245 (u8 *) pte,
245 count) < count) { 246 count) < count) {
246 kfree(pte); 247 kfree(pte);
@@ -252,27 +253,24 @@ alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
252 253
253/** 254/**
254 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk 255 * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
255 * @bdev 256 * @state
256 * @lba is the Logical Block Address of the partition table 257 * @lba is the Logical Block Address of the partition table
257 * 258 *
258 * Description: returns GPT header on success, NULL on error. Allocates 259 * Description: returns GPT header on success, NULL on error. Allocates
259 * and fills a GPT header starting at @ from @bdev. 260 * and fills a GPT header starting at @ from @state->bdev.
260 * Note: remember to free gpt when finished with it. 261 * Note: remember to free gpt when finished with it.
261 */ 262 */
262static gpt_header * 263static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
263alloc_read_gpt_header(struct block_device *bdev, u64 lba) 264 u64 lba)
264{ 265{
265 gpt_header *gpt; 266 gpt_header *gpt;
266 unsigned ssz = bdev_logical_block_size(bdev); 267 unsigned ssz = bdev_logical_block_size(state->bdev);
267
268 if (!bdev)
269 return NULL;
270 268
271 gpt = kzalloc(ssz, GFP_KERNEL); 269 gpt = kzalloc(ssz, GFP_KERNEL);
272 if (!gpt) 270 if (!gpt)
273 return NULL; 271 return NULL;
274 272
275 if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { 273 if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
276 kfree(gpt); 274 kfree(gpt);
277 gpt=NULL; 275 gpt=NULL;
278 return NULL; 276 return NULL;
@@ -283,7 +281,7 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
283 281
284/** 282/**
285 * is_gpt_valid() - tests one GPT header and PTEs for validity 283 * is_gpt_valid() - tests one GPT header and PTEs for validity
286 * @bdev 284 * @state
287 * @lba is the logical block address of the GPT header to test 285 * @lba is the logical block address of the GPT header to test
288 * @gpt is a GPT header ptr, filled on return. 286 * @gpt is a GPT header ptr, filled on return.
289 * @ptes is a PTEs ptr, filled on return. 287 * @ptes is a PTEs ptr, filled on return.
@@ -291,16 +289,15 @@ alloc_read_gpt_header(struct block_device *bdev, u64 lba)
291 * Description: returns 1 if valid, 0 on error. 289 * Description: returns 1 if valid, 0 on error.
292 * If valid, returns pointers to newly allocated GPT header and PTEs. 290 * If valid, returns pointers to newly allocated GPT header and PTEs.
293 */ 291 */
294static int 292static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
295is_gpt_valid(struct block_device *bdev, u64 lba, 293 gpt_header **gpt, gpt_entry **ptes)
296 gpt_header **gpt, gpt_entry **ptes)
297{ 294{
298 u32 crc, origcrc; 295 u32 crc, origcrc;
299 u64 lastlba; 296 u64 lastlba;
300 297
301 if (!bdev || !gpt || !ptes) 298 if (!ptes)
302 return 0; 299 return 0;
303 if (!(*gpt = alloc_read_gpt_header(bdev, lba))) 300 if (!(*gpt = alloc_read_gpt_header(state, lba)))
304 return 0; 301 return 0;
305 302
306 /* Check the GUID Partition Table signature */ 303 /* Check the GUID Partition Table signature */
@@ -336,7 +333,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
336 /* Check the first_usable_lba and last_usable_lba are 333 /* Check the first_usable_lba and last_usable_lba are
337 * within the disk. 334 * within the disk.
338 */ 335 */
339 lastlba = last_lba(bdev); 336 lastlba = last_lba(state->bdev);
340 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { 337 if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
341 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", 338 pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
342 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), 339 (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -350,7 +347,7 @@ is_gpt_valid(struct block_device *bdev, u64 lba,
350 goto fail; 347 goto fail;
351 } 348 }
352 349
353 if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt))) 350 if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
354 goto fail; 351 goto fail;
355 352
356 /* Check the GUID Partition Entry Array CRC */ 353 /* Check the GUID Partition Entry Array CRC */
@@ -495,7 +492,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
495 492
496/** 493/**
497 * find_valid_gpt() - Search disk for valid GPT headers and PTEs 494 * find_valid_gpt() - Search disk for valid GPT headers and PTEs
498 * @bdev 495 * @state
499 * @gpt is a GPT header ptr, filled on return. 496 * @gpt is a GPT header ptr, filled on return.
500 * @ptes is a PTEs ptr, filled on return. 497 * @ptes is a PTEs ptr, filled on return.
501 * Description: Returns 1 if valid, 0 on error. 498 * Description: Returns 1 if valid, 0 on error.
@@ -508,24 +505,25 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba)
508 * This protects against devices which misreport their size, and forces 505 * This protects against devices which misreport their size, and forces
509 * the user to decide to use the Alternate GPT. 506 * the user to decide to use the Alternate GPT.
510 */ 507 */
511static int 508static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
512find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes) 509 gpt_entry **ptes)
513{ 510{
514 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; 511 int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
515 gpt_header *pgpt = NULL, *agpt = NULL; 512 gpt_header *pgpt = NULL, *agpt = NULL;
516 gpt_entry *pptes = NULL, *aptes = NULL; 513 gpt_entry *pptes = NULL, *aptes = NULL;
517 legacy_mbr *legacymbr; 514 legacy_mbr *legacymbr;
518 u64 lastlba; 515 u64 lastlba;
519 if (!bdev || !gpt || !ptes) 516
517 if (!ptes)
520 return 0; 518 return 0;
521 519
522 lastlba = last_lba(bdev); 520 lastlba = last_lba(state->bdev);
523 if (!force_gpt) { 521 if (!force_gpt) {
524 /* This will be added to the EFI Spec. per Intel after v1.02. */ 522 /* This will be added to the EFI Spec. per Intel after v1.02. */
525 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); 523 legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
526 if (legacymbr) { 524 if (legacymbr) {
527 read_lba(bdev, 0, (u8 *) legacymbr, 525 read_lba(state, 0, (u8 *) legacymbr,
528 sizeof (*legacymbr)); 526 sizeof (*legacymbr));
529 good_pmbr = is_pmbr_valid(legacymbr); 527 good_pmbr = is_pmbr_valid(legacymbr);
530 kfree(legacymbr); 528 kfree(legacymbr);
531 } 529 }
@@ -533,15 +531,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
533 goto fail; 531 goto fail;
534 } 532 }
535 533
536 good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA, 534 good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
537 &pgpt, &pptes); 535 &pgpt, &pptes);
538 if (good_pgpt) 536 if (good_pgpt)
539 good_agpt = is_gpt_valid(bdev, 537 good_agpt = is_gpt_valid(state,
540 le64_to_cpu(pgpt->alternate_lba), 538 le64_to_cpu(pgpt->alternate_lba),
541 &agpt, &aptes); 539 &agpt, &aptes);
542 if (!good_agpt && force_gpt) 540 if (!good_agpt && force_gpt)
543 good_agpt = is_gpt_valid(bdev, lastlba, 541 good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
544 &agpt, &aptes);
545 542
546 /* The obviously unsuccessful case */ 543 /* The obviously unsuccessful case */
547 if (!good_pgpt && !good_agpt) 544 if (!good_pgpt && !good_agpt)
@@ -583,9 +580,8 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
583} 580}
584 581
585/** 582/**
586 * efi_partition(struct parsed_partitions *state, struct block_device *bdev) 583 * efi_partition(struct parsed_partitions *state)
587 * @state 584 * @state
588 * @bdev
589 * 585 *
590 * Description: called from check.c, if the disk contains GPT 586 * Description: called from check.c, if the disk contains GPT
591 * partitions, sets up partition entries in the kernel. 587 * partitions, sets up partition entries in the kernel.
@@ -602,15 +598,14 @@ find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
602 * 1 if successful 598 * 1 if successful
603 * 599 *
604 */ 600 */
605int 601int efi_partition(struct parsed_partitions *state)
606efi_partition(struct parsed_partitions *state, struct block_device *bdev)
607{ 602{
608 gpt_header *gpt = NULL; 603 gpt_header *gpt = NULL;
609 gpt_entry *ptes = NULL; 604 gpt_entry *ptes = NULL;
610 u32 i; 605 u32 i;
611 unsigned ssz = bdev_logical_block_size(bdev) / 512; 606 unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
612 607
613 if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { 608 if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
614 kfree(gpt); 609 kfree(gpt);
615 kfree(ptes); 610 kfree(ptes);
616 return 0; 611 return 0;
@@ -623,7 +618,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
623 u64 size = le64_to_cpu(ptes[i].ending_lba) - 618 u64 size = le64_to_cpu(ptes[i].ending_lba) -
624 le64_to_cpu(ptes[i].starting_lba) + 1ULL; 619 le64_to_cpu(ptes[i].starting_lba) + 1ULL;
625 620
626 if (!is_pte_valid(&ptes[i], last_lba(bdev))) 621 if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
627 continue; 622 continue;
628 623
629 put_partition(state, i+1, start * ssz, size * ssz); 624 put_partition(state, i+1, start * ssz, size * ssz);
@@ -631,7 +626,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev)
631 /* If this is a RAID volume, tell md */ 626 /* If this is a RAID volume, tell md */
632 if (!efi_guidcmp(ptes[i].partition_type_guid, 627 if (!efi_guidcmp(ptes[i].partition_type_guid,
633 PARTITION_LINUX_RAID_GUID)) 628 PARTITION_LINUX_RAID_GUID))
634 state->parts[i+1].flags = 1; 629 state->parts[i + 1].flags = ADDPART_FLAG_RAID;
635 } 630 }
636 kfree(ptes); 631 kfree(ptes);
637 kfree(gpt); 632 kfree(gpt);
diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h
index 6998b589abf9..b69ab729558f 100644
--- a/fs/partitions/efi.h
+++ b/fs/partitions/efi.h
@@ -110,7 +110,7 @@ typedef struct _legacy_mbr {
110} __attribute__ ((packed)) legacy_mbr; 110} __attribute__ ((packed)) legacy_mbr;
111 111
112/* Functions */ 112/* Functions */
113extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev); 113extern int efi_partition(struct parsed_partitions *state);
114 114
115#endif 115#endif
116 116
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index fc71aab08460..3e73de5967ff 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -58,9 +58,9 @@ cchhb2blk (struct vtoc_cchhb *ptr, struct hd_geometry *geo) {
58 58
59/* 59/*
60 */ 60 */
61int 61int ibm_partition(struct parsed_partitions *state)
62ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
63{ 62{
63 struct block_device *bdev = state->bdev;
64 int blocksize, res; 64 int blocksize, res;
65 loff_t i_size, offset, size, fmt_size; 65 loff_t i_size, offset, size, fmt_size;
66 dasd_information2_t *info; 66 dasd_information2_t *info;
@@ -100,7 +100,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
100 /* 100 /*
101 * Get volume label, extract name and type. 101 * Get volume label, extract name and type.
102 */ 102 */
103 data = read_dev_sector(bdev, info->label_block*(blocksize/512), &sect); 103 data = read_part_sector(state, info->label_block*(blocksize/512),
104 &sect);
104 if (data == NULL) 105 if (data == NULL)
105 goto out_readerr; 106 goto out_readerr;
106 107
@@ -193,8 +194,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
193 */ 194 */
194 blk = cchhb2blk(&label->vol.vtoc, geo) + 1; 195 blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
195 counter = 0; 196 counter = 0;
196 data = read_dev_sector(bdev, blk * (blocksize/512), 197 data = read_part_sector(state, blk * (blocksize/512),
197 &sect); 198 &sect);
198 while (data != NULL) { 199 while (data != NULL) {
199 struct vtoc_format1_label f1; 200 struct vtoc_format1_label f1;
200 201
@@ -208,9 +209,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
208 || f1.DS1FMTID == _ascebc['7'] 209 || f1.DS1FMTID == _ascebc['7']
209 || f1.DS1FMTID == _ascebc['9']) { 210 || f1.DS1FMTID == _ascebc['9']) {
210 blk++; 211 blk++;
211 data = read_dev_sector(bdev, blk * 212 data = read_part_sector(state,
212 (blocksize/512), 213 blk * (blocksize/512), &sect);
213 &sect);
214 continue; 214 continue;
215 } 215 }
216 216
@@ -230,9 +230,8 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
230 size * (blocksize >> 9)); 230 size * (blocksize >> 9));
231 counter++; 231 counter++;
232 blk++; 232 blk++;
233 data = read_dev_sector(bdev, 233 data = read_part_sector(state,
234 blk * (blocksize/512), 234 blk * (blocksize/512), &sect);
235 &sect);
236 } 235 }
237 236
238 if (!data) 237 if (!data)
diff --git a/fs/partitions/ibm.h b/fs/partitions/ibm.h
index 31f85a6ac459..08fb0804a812 100644
--- a/fs/partitions/ibm.h
+++ b/fs/partitions/ibm.h
@@ -1 +1 @@
int ibm_partition(struct parsed_partitions *, struct block_device *); int ibm_partition(struct parsed_partitions *);
diff --git a/fs/partitions/karma.c b/fs/partitions/karma.c
index 176d89bcf123..1cc928bb762f 100644
--- a/fs/partitions/karma.c
+++ b/fs/partitions/karma.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "karma.h" 10#include "karma.h"
11 11
12int karma_partition(struct parsed_partitions *state, struct block_device *bdev) 12int karma_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 int slot = 1; 15 int slot = 1;
@@ -29,7 +29,7 @@ int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
29 } __attribute__((packed)) *label; 29 } __attribute__((packed)) *label;
30 struct d_partition *p; 30 struct d_partition *p;
31 31
32 data = read_dev_sector(bdev, 0, &sect); 32 data = read_part_sector(state, 0, &sect);
33 if (!data) 33 if (!data)
34 return -1; 34 return -1;
35 35
diff --git a/fs/partitions/karma.h b/fs/partitions/karma.h
index ecf7d3f2a3d8..c764b2e9df21 100644
--- a/fs/partitions/karma.h
+++ b/fs/partitions/karma.h
@@ -4,5 +4,5 @@
4 4
5#define KARMA_LABEL_MAGIC 0xAB56 5#define KARMA_LABEL_MAGIC 0xAB56
6 6
7int karma_partition(struct parsed_partitions *state, struct block_device *bdev); 7int karma_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 8652fb99e962..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/pagemap.h> 27#include <linux/pagemap.h>
28#include <linux/stringify.h> 28#include <linux/stringify.h>
29#include <linux/kernel.h>
29#include "ldm.h" 30#include "ldm.h"
30#include "check.h" 31#include "check.h"
31#include "msdos.h" 32#include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
77 int h; 78 int h;
78 79
79 /* high part */ 80 /* high part */
80 if ((x = src[0] - '0') <= '9'-'0') h = x; 81 x = h = hex_to_bin(src[0]);
81 else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10; 82 if (h < 0)
82 else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10; 83 return -1;
83 else return -1;
84 h <<= 4;
85 84
86 /* low part */ 85 /* low part */
87 if ((x = src[1] - '0') <= '9'-'0') return h | x; 86 h = hex_to_bin(src[1]);
88 if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10); 87 if (h < 0)
89 if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10); 88 return -1;
90 return -1; 89
90 return (x << 4) + h;
91} 91}
92 92
93/** 93/**
@@ -309,7 +309,7 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
309 309
310/** 310/**
311 * ldm_validate_privheads - Compare the primary privhead with its backups 311 * ldm_validate_privheads - Compare the primary privhead with its backups
312 * @bdev: Device holding the LDM Database 312 * @state: Partition check state including device holding the LDM Database
313 * @ph1: Memory struct to fill with ph contents 313 * @ph1: Memory struct to fill with ph contents
314 * 314 *
315 * Read and compare all three privheads from disk. 315 * Read and compare all three privheads from disk.
@@ -321,8 +321,8 @@ static bool ldm_compare_tocblocks (const struct tocblock *toc1,
321 * Return: 'true' Success 321 * Return: 'true' Success
322 * 'false' Error 322 * 'false' Error
323 */ 323 */
324static bool ldm_validate_privheads (struct block_device *bdev, 324static bool ldm_validate_privheads(struct parsed_partitions *state,
325 struct privhead *ph1) 325 struct privhead *ph1)
326{ 326{
327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; 327 static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
328 struct privhead *ph[3] = { ph1 }; 328 struct privhead *ph[3] = { ph1 };
@@ -332,7 +332,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
332 long num_sects; 332 long num_sects;
333 int i; 333 int i;
334 334
335 BUG_ON (!bdev || !ph1); 335 BUG_ON (!state || !ph1);
336 336
337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); 337 ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); 338 ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
@@ -346,8 +346,8 @@ static bool ldm_validate_privheads (struct block_device *bdev,
346 346
347 /* Read and parse privheads */ 347 /* Read and parse privheads */
348 for (i = 0; i < 3; i++) { 348 for (i = 0; i < 3; i++) {
349 data = read_dev_sector (bdev, 349 data = read_part_sector(state, ph[0]->config_start + off[i],
350 ph[0]->config_start + off[i], &sect); 350 &sect);
351 if (!data) { 351 if (!data) {
352 ldm_crit ("Disk read failed."); 352 ldm_crit ("Disk read failed.");
353 goto out; 353 goto out;
@@ -363,7 +363,7 @@ static bool ldm_validate_privheads (struct block_device *bdev,
363 } 363 }
364 } 364 }
365 365
366 num_sects = bdev->bd_inode->i_size >> 9; 366 num_sects = state->bdev->bd_inode->i_size >> 9;
367 367
368 if ((ph[0]->config_start > num_sects) || 368 if ((ph[0]->config_start > num_sects) ||
369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { 369 ((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -397,20 +397,20 @@ out:
397 397
398/** 398/**
399 * ldm_validate_tocblocks - Validate the table of contents and its backups 399 * ldm_validate_tocblocks - Validate the table of contents and its backups
400 * @bdev: Device holding the LDM Database 400 * @state: Partition check state including device holding the LDM Database
401 * @base: Offset, into @bdev, of the database 401 * @base: Offset, into @state->bdev, of the database
402 * @ldb: Cache of the database structures 402 * @ldb: Cache of the database structures
403 * 403 *
404 * Find and compare the four tables of contents of the LDM Database stored on 404 * Find and compare the four tables of contents of the LDM Database stored on
405 * @bdev and return the parsed information into @toc1. 405 * @state->bdev and return the parsed information into @toc1.
406 * 406 *
407 * The offsets and sizes of the configs are range-checked against a privhead. 407 * The offsets and sizes of the configs are range-checked against a privhead.
408 * 408 *
409 * Return: 'true' @toc1 contains validated TOCBLOCK info 409 * Return: 'true' @toc1 contains validated TOCBLOCK info
410 * 'false' @toc1 contents are undefined 410 * 'false' @toc1 contents are undefined
411 */ 411 */
412static bool ldm_validate_tocblocks(struct block_device *bdev, 412static bool ldm_validate_tocblocks(struct parsed_partitions *state,
413 unsigned long base, struct ldmdb *ldb) 413 unsigned long base, struct ldmdb *ldb)
414{ 414{
415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; 415 static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
416 struct tocblock *tb[4]; 416 struct tocblock *tb[4];
@@ -420,7 +420,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
420 int i, nr_tbs; 420 int i, nr_tbs;
421 bool result = false; 421 bool result = false;
422 422
423 BUG_ON(!bdev || !ldb); 423 BUG_ON(!state || !ldb);
424 ph = &ldb->ph; 424 ph = &ldb->ph;
425 tb[0] = &ldb->toc; 425 tb[0] = &ldb->toc;
426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL); 426 tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
@@ -437,7 +437,7 @@ static bool ldm_validate_tocblocks(struct block_device *bdev,
437 * skip any that fail as long as we get at least one valid TOCBLOCK. 437 * skip any that fail as long as we get at least one valid TOCBLOCK.
438 */ 438 */
439 for (nr_tbs = i = 0; i < 4; i++) { 439 for (nr_tbs = i = 0; i < 4; i++) {
440 data = read_dev_sector(bdev, base + off[i], &sect); 440 data = read_part_sector(state, base + off[i], &sect);
441 if (!data) { 441 if (!data) {
442 ldm_error("Disk read failed for TOCBLOCK %d.", i); 442 ldm_error("Disk read failed for TOCBLOCK %d.", i);
443 continue; 443 continue;
@@ -473,7 +473,7 @@ err:
473 473
474/** 474/**
475 * ldm_validate_vmdb - Read the VMDB and validate it 475 * ldm_validate_vmdb - Read the VMDB and validate it
476 * @bdev: Device holding the LDM Database 476 * @state: Partition check state including device holding the LDM Database
477 * @base: Offset, into @bdev, of the database 477 * @base: Offset, into @bdev, of the database
478 * @ldb: Cache of the database structures 478 * @ldb: Cache of the database structures
479 * 479 *
@@ -483,8 +483,8 @@ err:
483 * Return: 'true' @ldb contains validated VBDB info 483 * Return: 'true' @ldb contains validated VBDB info
484 * 'false' @ldb contents are undefined 484 * 'false' @ldb contents are undefined
485 */ 485 */
486static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base, 486static bool ldm_validate_vmdb(struct parsed_partitions *state,
487 struct ldmdb *ldb) 487 unsigned long base, struct ldmdb *ldb)
488{ 488{
489 Sector sect; 489 Sector sect;
490 u8 *data; 490 u8 *data;
@@ -492,12 +492,12 @@ static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
492 struct vmdb *vm; 492 struct vmdb *vm;
493 struct tocblock *toc; 493 struct tocblock *toc;
494 494
495 BUG_ON (!bdev || !ldb); 495 BUG_ON (!state || !ldb);
496 496
497 vm = &ldb->vm; 497 vm = &ldb->vm;
498 toc = &ldb->toc; 498 toc = &ldb->toc;
499 499
500 data = read_dev_sector (bdev, base + OFF_VMDB, &sect); 500 data = read_part_sector(state, base + OFF_VMDB, &sect);
501 if (!data) { 501 if (!data) {
502 ldm_crit ("Disk read failed."); 502 ldm_crit ("Disk read failed.");
503 return false; 503 return false;
@@ -534,21 +534,21 @@ out:
534 534
535/** 535/**
536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk 536 * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
537 * @bdev: Device holding the LDM Database 537 * @state: Partition check state including device holding the LDM Database
538 * 538 *
539 * This function provides a weak test to decide whether the device is a dynamic 539 * This function provides a weak test to decide whether the device is a dynamic
540 * disk or not. It looks for an MS-DOS-style partition table containing at 540 * disk or not. It looks for an MS-DOS-style partition table containing at
541 * least one partition of type 0x42 (formerly SFS, now used by Windows for 541 * least one partition of type 0x42 (formerly SFS, now used by Windows for
542 * dynamic disks). 542 * dynamic disks).
543 * 543 *
544 * N.B. The only possible error can come from the read_dev_sector and that is 544 * N.B. The only possible error can come from the read_part_sector and that is
545 * only likely to happen if the underlying device is strange. If that IS 545 * only likely to happen if the underlying device is strange. If that IS
546 * the case we should return zero to let someone else try. 546 * the case we should return zero to let someone else try.
547 * 547 *
548 * Return: 'true' @bdev is a dynamic disk 548 * Return: 'true' @state->bdev is a dynamic disk
549 * 'false' @bdev is not a dynamic disk, or an error occurred 549 * 'false' @state->bdev is not a dynamic disk, or an error occurred
550 */ 550 */
551static bool ldm_validate_partition_table (struct block_device *bdev) 551static bool ldm_validate_partition_table(struct parsed_partitions *state)
552{ 552{
553 Sector sect; 553 Sector sect;
554 u8 *data; 554 u8 *data;
@@ -556,9 +556,9 @@ static bool ldm_validate_partition_table (struct block_device *bdev)
556 int i; 556 int i;
557 bool result = false; 557 bool result = false;
558 558
559 BUG_ON (!bdev); 559 BUG_ON(!state);
560 560
561 data = read_dev_sector (bdev, 0, &sect); 561 data = read_part_sector(state, 0, &sect);
562 if (!data) { 562 if (!data) {
563 ldm_crit ("Disk read failed."); 563 ldm_crit ("Disk read failed.");
564 return false; 564 return false;
@@ -1391,8 +1391,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1391 1391
1392/** 1392/**
1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory 1393 * ldm_get_vblks - Read the on-disk database of VBLKs into memory
1394 * @bdev: Device holding the LDM Database 1394 * @state: Partition check state including device holding the LDM Database
1395 * @base: Offset, into @bdev, of the database 1395 * @base: Offset, into @state->bdev, of the database
1396 * @ldb: Cache of the database structures 1396 * @ldb: Cache of the database structures
1397 * 1397 *
1398 * To use the information from the VBLKs, they need to be read from the disk, 1398 * To use the information from the VBLKs, they need to be read from the disk,
@@ -1401,8 +1401,8 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
1401 * Return: 'true' All the VBLKs were read successfully 1401 * Return: 'true' All the VBLKs were read successfully
1402 * 'false' An error occurred 1402 * 'false' An error occurred
1403 */ 1403 */
1404static bool ldm_get_vblks (struct block_device *bdev, unsigned long base, 1404static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
1405 struct ldmdb *ldb) 1405 struct ldmdb *ldb)
1406{ 1406{
1407 int size, perbuf, skip, finish, s, v, recs; 1407 int size, perbuf, skip, finish, s, v, recs;
1408 u8 *data = NULL; 1408 u8 *data = NULL;
@@ -1410,7 +1410,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1410 bool result = false; 1410 bool result = false;
1411 LIST_HEAD (frags); 1411 LIST_HEAD (frags);
1412 1412
1413 BUG_ON (!bdev || !ldb); 1413 BUG_ON(!state || !ldb);
1414 1414
1415 size = ldb->vm.vblk_size; 1415 size = ldb->vm.vblk_size;
1416 perbuf = 512 / size; 1416 perbuf = 512 / size;
@@ -1418,7 +1418,7 @@ static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
1418 finish = (size * ldb->vm.last_vblk_seq) >> 9; 1418 finish = (size * ldb->vm.last_vblk_seq) >> 9;
1419 1419
1420 for (s = skip; s < finish; s++) { /* For each sector */ 1420 for (s = skip; s < finish; s++) { /* For each sector */
1421 data = read_dev_sector (bdev, base + OFF_VMDB + s, &sect); 1421 data = read_part_sector(state, base + OFF_VMDB + s, &sect);
1422 if (!data) { 1422 if (!data) {
1423 ldm_crit ("Disk read failed."); 1423 ldm_crit ("Disk read failed.");
1424 goto out; 1424 goto out;
@@ -1474,8 +1474,7 @@ static void ldm_free_vblks (struct list_head *lh)
1474 1474
1475/** 1475/**
1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it 1476 * ldm_partition - Find out whether a device is a dynamic disk and handle it
1477 * @pp: List of the partitions parsed so far 1477 * @state: Partition check state including device holding the LDM Database
1478 * @bdev: Device holding the LDM Database
1479 * 1478 *
1480 * This determines whether the device @bdev is a dynamic disk and if so creates 1479 * This determines whether the device @bdev is a dynamic disk and if so creates
1481 * the partitions necessary in the gendisk structure pointed to by @hd. 1480 * the partitions necessary in the gendisk structure pointed to by @hd.
@@ -1485,21 +1484,21 @@ static void ldm_free_vblks (struct list_head *lh)
1485 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, 1484 * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
1486 * and so on: the actual data containing partitions. 1485 * and so on: the actual data containing partitions.
1487 * 1486 *
1488 * Return: 1 Success, @bdev is a dynamic disk and we handled it 1487 * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
1489 * 0 Success, @bdev is not a dynamic disk 1488 * 0 Success, @state->bdev is not a dynamic disk
1490 * -1 An error occurred before enough information had been read 1489 * -1 An error occurred before enough information had been read
1491 * Or @bdev is a dynamic disk, but it may be corrupted 1490 * Or @state->bdev is a dynamic disk, but it may be corrupted
1492 */ 1491 */
1493int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev) 1492int ldm_partition(struct parsed_partitions *state)
1494{ 1493{
1495 struct ldmdb *ldb; 1494 struct ldmdb *ldb;
1496 unsigned long base; 1495 unsigned long base;
1497 int result = -1; 1496 int result = -1;
1498 1497
1499 BUG_ON (!pp || !bdev); 1498 BUG_ON(!state);
1500 1499
1501 /* Look for signs of a Dynamic Disk */ 1500 /* Look for signs of a Dynamic Disk */
1502 if (!ldm_validate_partition_table (bdev)) 1501 if (!ldm_validate_partition_table(state))
1503 return 0; 1502 return 0;
1504 1503
1505 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); 1504 ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
@@ -1509,15 +1508,15 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1509 } 1508 }
1510 1509
1511 /* Parse and check privheads. */ 1510 /* Parse and check privheads. */
1512 if (!ldm_validate_privheads (bdev, &ldb->ph)) 1511 if (!ldm_validate_privheads(state, &ldb->ph))
1513 goto out; /* Already logged */ 1512 goto out; /* Already logged */
1514 1513
1515 /* All further references are relative to base (database start). */ 1514 /* All further references are relative to base (database start). */
1516 base = ldb->ph.config_start; 1515 base = ldb->ph.config_start;
1517 1516
1518 /* Parse and check tocs and vmdb. */ 1517 /* Parse and check tocs and vmdb. */
1519 if (!ldm_validate_tocblocks (bdev, base, ldb) || 1518 if (!ldm_validate_tocblocks(state, base, ldb) ||
1520 !ldm_validate_vmdb (bdev, base, ldb)) 1519 !ldm_validate_vmdb(state, base, ldb))
1521 goto out; /* Already logged */ 1520 goto out; /* Already logged */
1522 1521
1523 /* Initialize vblk lists in ldmdb struct */ 1522 /* Initialize vblk lists in ldmdb struct */
@@ -1527,13 +1526,13 @@ int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
1527 INIT_LIST_HEAD (&ldb->v_comp); 1526 INIT_LIST_HEAD (&ldb->v_comp);
1528 INIT_LIST_HEAD (&ldb->v_part); 1527 INIT_LIST_HEAD (&ldb->v_part);
1529 1528
1530 if (!ldm_get_vblks (bdev, base, ldb)) { 1529 if (!ldm_get_vblks(state, base, ldb)) {
1531 ldm_crit ("Failed to read the VBLKs from the database."); 1530 ldm_crit ("Failed to read the VBLKs from the database.");
1532 goto cleanup; 1531 goto cleanup;
1533 } 1532 }
1534 1533
1535 /* Finally, create the data partition devices. */ 1534 /* Finally, create the data partition devices. */
1536 if (ldm_create_data_partitions (pp, ldb)) { 1535 if (ldm_create_data_partitions(state, ldb)) {
1537 ldm_debug ("Parsed LDM database successfully."); 1536 ldm_debug ("Parsed LDM database successfully.");
1538 result = 1; 1537 result = 1;
1539 } 1538 }
diff --git a/fs/partitions/ldm.h b/fs/partitions/ldm.h
index 30e08e809c1d..d1fb50b28d86 100644
--- a/fs/partitions/ldm.h
+++ b/fs/partitions/ldm.h
@@ -209,7 +209,7 @@ struct ldmdb { /* Cache of the database */
209 struct list_head v_part; 209 struct list_head v_part;
210}; 210};
211 211
212int ldm_partition (struct parsed_partitions *state, struct block_device *bdev); 212int ldm_partition(struct parsed_partitions *state);
213 213
214#endif /* _FS_PT_LDM_H_ */ 214#endif /* _FS_PT_LDM_H_ */
215 215
diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c
index d4a0fad3563b..74465ff7c263 100644
--- a/fs/partitions/mac.c
+++ b/fs/partitions/mac.c
@@ -27,7 +27,7 @@ static inline void mac_fix_string(char *stg, int len)
27 stg[i] = 0; 27 stg[i] = 0;
28} 28}
29 29
30int mac_partition(struct parsed_partitions *state, struct block_device *bdev) 30int mac_partition(struct parsed_partitions *state)
31{ 31{
32 int slot = 1; 32 int slot = 1;
33 Sector sect; 33 Sector sect;
@@ -42,7 +42,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
42 struct mac_driver_desc *md; 42 struct mac_driver_desc *md;
43 43
44 /* Get 0th block and look at the first partition map entry. */ 44 /* Get 0th block and look at the first partition map entry. */
45 md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, &sect); 45 md = read_part_sector(state, 0, &sect);
46 if (!md) 46 if (!md)
47 return -1; 47 return -1;
48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { 48 if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
@@ -51,7 +51,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
51 } 51 }
52 secsize = be16_to_cpu(md->block_size); 52 secsize = be16_to_cpu(md->block_size);
53 put_dev_sector(sect); 53 put_dev_sector(sect);
54 data = read_dev_sector(bdev, secsize/512, &sect); 54 data = read_part_sector(state, secsize/512, &sect);
55 if (!data) 55 if (!data)
56 return -1; 56 return -1;
57 part = (struct mac_partition *) (data + secsize%512); 57 part = (struct mac_partition *) (data + secsize%512);
@@ -64,7 +64,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
64 for (blk = 1; blk <= blocks_in_map; ++blk) { 64 for (blk = 1; blk <= blocks_in_map; ++blk) {
65 int pos = blk * secsize; 65 int pos = blk * secsize;
66 put_dev_sector(sect); 66 put_dev_sector(sect);
67 data = read_dev_sector(bdev, pos/512, &sect); 67 data = read_part_sector(state, pos/512, &sect);
68 if (!data) 68 if (!data)
69 return -1; 69 return -1;
70 part = (struct mac_partition *) (data + pos%512); 70 part = (struct mac_partition *) (data + pos%512);
@@ -75,7 +75,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
75 be32_to_cpu(part->block_count) * (secsize/512)); 75 be32_to_cpu(part->block_count) * (secsize/512));
76 76
77 if (!strnicmp(part->type, "Linux_RAID", 10)) 77 if (!strnicmp(part->type, "Linux_RAID", 10))
78 state->parts[slot].flags = 1; 78 state->parts[slot].flags = ADDPART_FLAG_RAID;
79#ifdef CONFIG_PPC_PMAC 79#ifdef CONFIG_PPC_PMAC
80 /* 80 /*
81 * If this is the first bootable partition, tell the 81 * If this is the first bootable partition, tell the
@@ -123,7 +123,8 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
123 } 123 }
124#ifdef CONFIG_PPC_PMAC 124#ifdef CONFIG_PPC_PMAC
125 if (found_root_goodness) 125 if (found_root_goodness)
126 note_bootable_part(bdev->bd_dev, found_root, found_root_goodness); 126 note_bootable_part(state->bdev->bd_dev, found_root,
127 found_root_goodness);
127#endif 128#endif
128 129
129 put_dev_sector(sect); 130 put_dev_sector(sect);
diff --git a/fs/partitions/mac.h b/fs/partitions/mac.h
index bbf26e1386fa..3c7d98436380 100644
--- a/fs/partitions/mac.h
+++ b/fs/partitions/mac.h
@@ -41,4 +41,4 @@ struct mac_driver_desc {
41 /* ... more stuff */ 41 /* ... more stuff */
42}; 42};
43 43
44int mac_partition(struct parsed_partitions *state, struct block_device *bdev); 44int mac_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 90be97f1f5a8..15bfb7b1e044 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -64,7 +64,7 @@ msdos_magic_present(unsigned char *p)
64#define AIX_LABEL_MAGIC2 0xC2 64#define AIX_LABEL_MAGIC2 0xC2
65#define AIX_LABEL_MAGIC3 0xD4 65#define AIX_LABEL_MAGIC3 0xD4
66#define AIX_LABEL_MAGIC4 0xC1 66#define AIX_LABEL_MAGIC4 0xC1
67static int aix_magic_present(unsigned char *p, struct block_device *bdev) 67static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
68{ 68{
69 struct partition *pt = (struct partition *) (p + 0x1be); 69 struct partition *pt = (struct partition *) (p + 0x1be);
70 Sector sect; 70 Sector sect;
@@ -85,7 +85,7 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
85 is_extended_partition(pt)) 85 is_extended_partition(pt))
86 return 0; 86 return 0;
87 } 87 }
88 d = read_dev_sector(bdev, 7, &sect); 88 d = read_part_sector(state, 7, &sect);
89 if (d) { 89 if (d) {
90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') 90 if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
91 ret = 1; 91 ret = 1;
@@ -105,15 +105,14 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
105 * only for the actual data partitions. 105 * only for the actual data partitions.
106 */ 106 */
107 107
108static void 108static void parse_extended(struct parsed_partitions *state,
109parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109 sector_t first_sector, sector_t first_size)
110 sector_t first_sector, sector_t first_size)
111{ 110{
112 struct partition *p; 111 struct partition *p;
113 Sector sect; 112 Sector sect;
114 unsigned char *data; 113 unsigned char *data;
115 sector_t this_sector, this_size; 114 sector_t this_sector, this_size;
116 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 115 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
117 int loopct = 0; /* number of links followed 116 int loopct = 0; /* number of links followed
118 without finding a data partition */ 117 without finding a data partition */
119 int i; 118 int i;
@@ -126,7 +125,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
126 return; 125 return;
127 if (state->next == state->limit) 126 if (state->next == state->limit)
128 return; 127 return;
129 data = read_dev_sector(bdev, this_sector, &sect); 128 data = read_part_sector(state, this_sector, &sect);
130 if (!data) 129 if (!data)
131 return; 130 return;
132 131
@@ -198,9 +197,8 @@ done:
198/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also 197/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
199 indicates linux swap. Be careful before believing this is Solaris. */ 198 indicates linux swap. Be careful before believing this is Solaris. */
200 199
201static void 200static void parse_solaris_x86(struct parsed_partitions *state,
202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 201 sector_t offset, sector_t size, int origin)
203 sector_t offset, sector_t size, int origin)
204{ 202{
205#ifdef CONFIG_SOLARIS_X86_PARTITION 203#ifdef CONFIG_SOLARIS_X86_PARTITION
206 Sector sect; 204 Sector sect;
@@ -208,7 +206,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
208 int i; 206 int i;
209 short max_nparts; 207 short max_nparts;
210 208
211 v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, &sect); 209 v = read_part_sector(state, offset + 1, &sect);
212 if (!v) 210 if (!v)
213 return; 211 return;
214 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { 212 if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
@@ -245,16 +243,15 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
245 * Create devices for BSD partitions listed in a disklabel, under a 243 * Create devices for BSD partitions listed in a disklabel, under a
246 * dos-like partition. See parse_extended() for more information. 244 * dos-like partition. See parse_extended() for more information.
247 */ 245 */
248static void 246static void parse_bsd(struct parsed_partitions *state,
249parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 247 sector_t offset, sector_t size, int origin, char *flavour,
250 sector_t offset, sector_t size, int origin, char *flavour, 248 int max_partitions)
251 int max_partitions)
252{ 249{
253 Sector sect; 250 Sector sect;
254 struct bsd_disklabel *l; 251 struct bsd_disklabel *l;
255 struct bsd_partition *p; 252 struct bsd_partition *p;
256 253
257 l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, &sect); 254 l = read_part_sector(state, offset + 1, &sect);
258 if (!l) 255 if (!l)
259 return; 256 return;
260 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { 257 if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
@@ -291,33 +288,28 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
291} 288}
292#endif 289#endif
293 290
294static void 291static void parse_freebsd(struct parsed_partitions *state,
295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 292 sector_t offset, sector_t size, int origin)
296 sector_t offset, sector_t size, int origin)
297{ 293{
298#ifdef CONFIG_BSD_DISKLABEL 294#ifdef CONFIG_BSD_DISKLABEL
299 parse_bsd(state, bdev, offset, size, origin, 295 parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
300 "bsd", BSD_MAXPARTITIONS);
301#endif 296#endif
302} 297}
303 298
304static void 299static void parse_netbsd(struct parsed_partitions *state,
305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 300 sector_t offset, sector_t size, int origin)
306 sector_t offset, sector_t size, int origin)
307{ 301{
308#ifdef CONFIG_BSD_DISKLABEL 302#ifdef CONFIG_BSD_DISKLABEL
309 parse_bsd(state, bdev, offset, size, origin, 303 parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
310 "netbsd", BSD_MAXPARTITIONS);
311#endif 304#endif
312} 305}
313 306
314static void 307static void parse_openbsd(struct parsed_partitions *state,
315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 308 sector_t offset, sector_t size, int origin)
316 sector_t offset, sector_t size, int origin)
317{ 309{
318#ifdef CONFIG_BSD_DISKLABEL 310#ifdef CONFIG_BSD_DISKLABEL
319 parse_bsd(state, bdev, offset, size, origin, 311 parse_bsd(state, offset, size, origin, "openbsd",
320 "openbsd", OPENBSD_MAXPARTITIONS); 312 OPENBSD_MAXPARTITIONS);
321#endif 313#endif
322} 314}
323 315
@@ -325,16 +317,15 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
325 * Create devices for Unixware partitions listed in a disklabel, under a 317 * Create devices for Unixware partitions listed in a disklabel, under a
326 * dos-like partition. See parse_extended() for more information. 318 * dos-like partition. See parse_extended() for more information.
327 */ 319 */
328static void 320static void parse_unixware(struct parsed_partitions *state,
329parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 321 sector_t offset, sector_t size, int origin)
330 sector_t offset, sector_t size, int origin)
331{ 322{
332#ifdef CONFIG_UNIXWARE_DISKLABEL 323#ifdef CONFIG_UNIXWARE_DISKLABEL
333 Sector sect; 324 Sector sect;
334 struct unixware_disklabel *l; 325 struct unixware_disklabel *l;
335 struct unixware_slice *p; 326 struct unixware_slice *p;
336 327
337 l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, &sect); 328 l = read_part_sector(state, offset + 29, &sect);
338 if (!l) 329 if (!l)
339 return; 330 return;
340 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || 331 if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
@@ -365,9 +356,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
365 * Anand Krishnamurthy <anandk@wiproge.med.ge.com> 356 * Anand Krishnamurthy <anandk@wiproge.med.ge.com>
366 * Rajeev V. Pillai <rajeevvp@yahoo.com> 357 * Rajeev V. Pillai <rajeevvp@yahoo.com>
367 */ 358 */
368static void 359static void parse_minix(struct parsed_partitions *state,
369parse_minix(struct parsed_partitions *state, struct block_device *bdev, 360 sector_t offset, sector_t size, int origin)
370 sector_t offset, sector_t size, int origin)
371{ 361{
372#ifdef CONFIG_MINIX_SUBPARTITION 362#ifdef CONFIG_MINIX_SUBPARTITION
373 Sector sect; 363 Sector sect;
@@ -375,7 +365,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
375 struct partition *p; 365 struct partition *p;
376 int i; 366 int i;
377 367
378 data = read_dev_sector(bdev, offset, &sect); 368 data = read_part_sector(state, offset, &sect);
379 if (!data) 369 if (!data)
380 return; 370 return;
381 371
@@ -404,8 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
404 394
405static struct { 395static struct {
406 unsigned char id; 396 unsigned char id;
407 void (*parse)(struct parsed_partitions *, struct block_device *, 397 void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
408 sector_t, sector_t, int);
409} subtypes[] = { 398} subtypes[] = {
410 {FREEBSD_PARTITION, parse_freebsd}, 399 {FREEBSD_PARTITION, parse_freebsd},
411 {NETBSD_PARTITION, parse_netbsd}, 400 {NETBSD_PARTITION, parse_netbsd},
@@ -417,16 +406,16 @@ static struct {
417 {0, NULL}, 406 {0, NULL},
418}; 407};
419 408
420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 409int msdos_partition(struct parsed_partitions *state)
421{ 410{
422 sector_t sector_size = bdev_logical_block_size(bdev) / 512; 411 sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
423 Sector sect; 412 Sector sect;
424 unsigned char *data; 413 unsigned char *data;
425 struct partition *p; 414 struct partition *p;
426 struct fat_boot_sector *fb; 415 struct fat_boot_sector *fb;
427 int slot; 416 int slot;
428 417
429 data = read_dev_sector(bdev, 0, &sect); 418 data = read_part_sector(state, 0, &sect);
430 if (!data) 419 if (!data)
431 return -1; 420 return -1;
432 if (!msdos_magic_present(data + 510)) { 421 if (!msdos_magic_present(data + 510)) {
@@ -434,7 +423,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
434 return 0; 423 return 0;
435 } 424 }
436 425
437 if (aix_magic_present(data, bdev)) { 426 if (aix_magic_present(state, data)) {
438 put_dev_sector(sect); 427 put_dev_sector(sect);
439 printk( " [AIX]"); 428 printk( " [AIX]");
440 return 0; 429 return 0;
@@ -503,13 +492,13 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
503 put_partition(state, slot, start, n); 492 put_partition(state, slot, start, n);
504 493
505 printk(" <"); 494 printk(" <");
506 parse_extended(state, bdev, start, size); 495 parse_extended(state, start, size);
507 printk(" >"); 496 printk(" >");
508 continue; 497 continue;
509 } 498 }
510 put_partition(state, slot, start, size); 499 put_partition(state, slot, start, size);
511 if (SYS_IND(p) == LINUX_RAID_PARTITION) 500 if (SYS_IND(p) == LINUX_RAID_PARTITION)
512 state->parts[slot].flags = 1; 501 state->parts[slot].flags = ADDPART_FLAG_RAID;
513 if (SYS_IND(p) == DM6_PARTITION) 502 if (SYS_IND(p) == DM6_PARTITION)
514 printk("[DM]"); 503 printk("[DM]");
515 if (SYS_IND(p) == EZD_PARTITION) 504 if (SYS_IND(p) == EZD_PARTITION)
@@ -532,8 +521,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
532 521
533 if (!subtypes[n].parse) 522 if (!subtypes[n].parse)
534 continue; 523 continue;
535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size, 524 subtypes[n].parse(state, start_sect(p) * sector_size,
536 nr_sects(p)*sector_size, slot); 525 nr_sects(p) * sector_size, slot);
537 } 526 }
538 put_dev_sector(sect); 527 put_dev_sector(sect);
539 return 1; 528 return 1;
diff --git a/fs/partitions/msdos.h b/fs/partitions/msdos.h
index 01e5e0b6902d..38c781c490b3 100644
--- a/fs/partitions/msdos.h
+++ b/fs/partitions/msdos.h
@@ -4,5 +4,5 @@
4 4
5#define MSDOS_LABEL_MAGIC 0xAA55 5#define MSDOS_LABEL_MAGIC 0xAA55
6 6
7int msdos_partition(struct parsed_partitions *state, struct block_device *bdev); 7int msdos_partition(struct parsed_partitions *state);
8 8
diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index c05c17bc5df3..fc22b85d436a 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "osf.h" 11#include "osf.h"
12 12
13int osf_partition(struct parsed_partitions *state, struct block_device *bdev) 13int osf_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 int slot = 1; 16 int slot = 1;
@@ -49,7 +49,7 @@ int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
49 } * label; 49 } * label;
50 struct d_partition * partition; 50 struct d_partition * partition;
51 51
52 data = read_dev_sector(bdev, 0, &sect); 52 data = read_part_sector(state, 0, &sect);
53 if (!data) 53 if (!data)
54 return -1; 54 return -1;
55 55
diff --git a/fs/partitions/osf.h b/fs/partitions/osf.h
index 427b8eab314b..20ed2315ec16 100644
--- a/fs/partitions/osf.h
+++ b/fs/partitions/osf.h
@@ -4,4 +4,4 @@
4 4
5#define DISKLABELMAGIC (0x82564557UL) 5#define DISKLABELMAGIC (0x82564557UL)
6 6
7int osf_partition(struct parsed_partitions *state, struct block_device *bdev); 7int osf_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sgi.c b/fs/partitions/sgi.c
index ed5ac83fe83a..43b1df9aa16c 100644
--- a/fs/partitions/sgi.c
+++ b/fs/partitions/sgi.c
@@ -27,7 +27,7 @@ struct sgi_disklabel {
27 __be32 _unused1; /* Padding */ 27 __be32 _unused1; /* Padding */
28}; 28};
29 29
30int sgi_partition(struct parsed_partitions *state, struct block_device *bdev) 30int sgi_partition(struct parsed_partitions *state)
31{ 31{
32 int i, csum; 32 int i, csum;
33 __be32 magic; 33 __be32 magic;
@@ -39,7 +39,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
39 struct sgi_partition *p; 39 struct sgi_partition *p;
40 char b[BDEVNAME_SIZE]; 40 char b[BDEVNAME_SIZE];
41 41
42 label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, &sect); 42 label = read_part_sector(state, 0, &sect);
43 if (!label) 43 if (!label)
44 return -1; 44 return -1;
45 p = &label->partitions[0]; 45 p = &label->partitions[0];
@@ -57,7 +57,7 @@ int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
57 } 57 }
58 if(csum) { 58 if(csum) {
59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", 59 printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
60 bdevname(bdev, b)); 60 bdevname(state->bdev, b));
61 put_dev_sector(sect); 61 put_dev_sector(sect);
62 return 0; 62 return 0;
63 } 63 }
diff --git a/fs/partitions/sgi.h b/fs/partitions/sgi.h
index 5d5595c09928..b9553ebdd5a9 100644
--- a/fs/partitions/sgi.h
+++ b/fs/partitions/sgi.h
@@ -2,7 +2,7 @@
2 * fs/partitions/sgi.h 2 * fs/partitions/sgi.h
3 */ 3 */
4 4
5extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev); 5extern int sgi_partition(struct parsed_partitions *state);
6 6
7#define SGI_LABEL_MAGIC 0x0be5a941 7#define SGI_LABEL_MAGIC 0x0be5a941
8 8
diff --git a/fs/partitions/sun.c b/fs/partitions/sun.c
index c95e6a62c01d..a32660e25f7f 100644
--- a/fs/partitions/sun.c
+++ b/fs/partitions/sun.c
@@ -10,7 +10,7 @@
10#include "check.h" 10#include "check.h"
11#include "sun.h" 11#include "sun.h"
12 12
13int sun_partition(struct parsed_partitions *state, struct block_device *bdev) 13int sun_partition(struct parsed_partitions *state)
14{ 14{
15 int i; 15 int i;
16 __be16 csum; 16 __be16 csum;
@@ -61,7 +61,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
61 int use_vtoc; 61 int use_vtoc;
62 int nparts; 62 int nparts;
63 63
64 label = (struct sun_disklabel *)read_dev_sector(bdev, 0, &sect); 64 label = read_part_sector(state, 0, &sect);
65 if (!label) 65 if (!label)
66 return -1; 66 return -1;
67 67
@@ -78,7 +78,7 @@ int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
78 csum ^= *ush--; 78 csum ^= *ush--;
79 if (csum) { 79 if (csum) {
80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", 80 printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
81 bdevname(bdev, b)); 81 bdevname(state->bdev, b));
82 put_dev_sector(sect); 82 put_dev_sector(sect);
83 return 0; 83 return 0;
84 } 84 }
diff --git a/fs/partitions/sun.h b/fs/partitions/sun.h
index 7f864d1f86d4..2424baa8319f 100644
--- a/fs/partitions/sun.h
+++ b/fs/partitions/sun.h
@@ -5,4 +5,4 @@
5#define SUN_LABEL_MAGIC 0xDABE 5#define SUN_LABEL_MAGIC 0xDABE
6#define SUN_VTOC_SANITY 0x600DDEEE 6#define SUN_VTOC_SANITY 0x600DDEEE
7 7
8int sun_partition(struct parsed_partitions *state, struct block_device *bdev); 8int sun_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/sysv68.c b/fs/partitions/sysv68.c
index 4eba27b78643..9030c864428e 100644
--- a/fs/partitions/sysv68.c
+++ b/fs/partitions/sysv68.c
@@ -46,7 +46,7 @@ struct slice {
46}; 46};
47 47
48 48
49int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev) 49int sysv68_partition(struct parsed_partitions *state)
50{ 50{
51 int i, slices; 51 int i, slices;
52 int slot = 1; 52 int slot = 1;
@@ -55,7 +55,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
55 struct dkblk0 *b; 55 struct dkblk0 *b;
56 struct slice *slice; 56 struct slice *slice;
57 57
58 data = read_dev_sector(bdev, 0, &sect); 58 data = read_part_sector(state, 0, &sect);
59 if (!data) 59 if (!data)
60 return -1; 60 return -1;
61 61
@@ -68,7 +68,7 @@ int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
68 i = be32_to_cpu(b->dk_ios.ios_slcblk); 68 i = be32_to_cpu(b->dk_ios.ios_slcblk);
69 put_dev_sector(sect); 69 put_dev_sector(sect);
70 70
71 data = read_dev_sector(bdev, i, &sect); 71 data = read_part_sector(state, i, &sect);
72 if (!data) 72 if (!data)
73 return -1; 73 return -1;
74 74
diff --git a/fs/partitions/sysv68.h b/fs/partitions/sysv68.h
index fa733f68431b..bf2f5ffa97ac 100644
--- a/fs/partitions/sysv68.h
+++ b/fs/partitions/sysv68.h
@@ -1 +1 @@
extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev); extern int sysv68_partition(struct parsed_partitions *state);
diff --git a/fs/partitions/ultrix.c b/fs/partitions/ultrix.c
index ec852c11dce4..db9eef260364 100644
--- a/fs/partitions/ultrix.c
+++ b/fs/partitions/ultrix.c
@@ -9,7 +9,7 @@
9#include "check.h" 9#include "check.h"
10#include "ultrix.h" 10#include "ultrix.h"
11 11
12int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev) 12int ultrix_partition(struct parsed_partitions *state)
13{ 13{
14 int i; 14 int i;
15 Sector sect; 15 Sector sect;
@@ -26,7 +26,7 @@ int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
26#define PT_MAGIC 0x032957 /* Partition magic number */ 26#define PT_MAGIC 0x032957 /* Partition magic number */
27#define PT_VALID 1 /* Indicates if struct is valid */ 27#define PT_VALID 1 /* Indicates if struct is valid */
28 28
29 data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, &sect); 29 data = read_part_sector(state, (16384 - sizeof(*label))/512, &sect);
30 if (!data) 30 if (!data)
31 return -1; 31 return -1;
32 32
diff --git a/fs/partitions/ultrix.h b/fs/partitions/ultrix.h
index a74bf8e2d370..a3cc00b2bded 100644
--- a/fs/partitions/ultrix.h
+++ b/fs/partitions/ultrix.h
@@ -2,4 +2,4 @@
2 * fs/partitions/ultrix.h 2 * fs/partitions/ultrix.h
3 */ 3 */
4 4
5int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev); 5int ultrix_partition(struct parsed_partitions *state);
diff --git a/fs/pipe.c b/fs/pipe.c
index 37ba29ff3158..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -11,6 +11,7 @@
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/log2.h>
14#include <linux/mount.h> 15#include <linux/mount.h>
15#include <linux/pipe_fs_i.h> 16#include <linux/pipe_fs_i.h>
16#include <linux/uio.h> 17#include <linux/uio.h>
@@ -18,11 +19,23 @@
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/audit.h> 20#include <linux/audit.h>
20#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/fcntl.h>
21 23
22#include <asm/uaccess.h> 24#include <asm/uaccess.h>
23#include <asm/ioctls.h> 25#include <asm/ioctls.h>
24 26
25/* 27/*
28 * The max size that a non-root user is allowed to grow the pipe. Can
29 * be set by root in /proc/sys/fs/pipe-max-size
30 */
31unsigned int pipe_max_size = 1048576;
32
33/*
34 * Minimum pipe size, as required by POSIX
35 */
36unsigned int pipe_min_size = PAGE_SIZE;
37
38/*
26 * We use a start+len construction, which provides full use of the 39 * We use a start+len construction, which provides full use of the
27 * allocated memory. 40 * allocated memory.
28 * -- Florian Coosmann (FGC) 41 * -- Florian Coosmann (FGC)
@@ -222,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
222 235
223 return kmap(buf->page); 236 return kmap(buf->page);
224} 237}
238EXPORT_SYMBOL(generic_pipe_buf_map);
225 239
226/** 240/**
227 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer 241 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -241,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
241 } else 255 } else
242 kunmap(buf->page); 256 kunmap(buf->page);
243} 257}
258EXPORT_SYMBOL(generic_pipe_buf_unmap);
244 259
245/** 260/**
246 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 261 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -271,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
271 286
272 return 1; 287 return 1;
273} 288}
289EXPORT_SYMBOL(generic_pipe_buf_steal);
274 290
275/** 291/**
276 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 292 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -286,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
286{ 302{
287 page_cache_get(buf->page); 303 page_cache_get(buf->page);
288} 304}
305EXPORT_SYMBOL(generic_pipe_buf_get);
289 306
290/** 307/**
291 * generic_pipe_buf_confirm - verify contents of the pipe buffer 308 * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -301,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
301{ 318{
302 return 0; 319 return 0;
303} 320}
321EXPORT_SYMBOL(generic_pipe_buf_confirm);
304 322
305/** 323/**
306 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 324 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -315,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
315{ 333{
316 page_cache_release(buf->page); 334 page_cache_release(buf->page);
317} 335}
336EXPORT_SYMBOL(generic_pipe_buf_release);
318 337
319static const struct pipe_buf_operations anon_pipe_buf_ops = { 338static const struct pipe_buf_operations anon_pipe_buf_ops = {
320 .can_merge = 1, 339 .can_merge = 1,
@@ -390,7 +409,7 @@ redo:
390 if (!buf->len) { 409 if (!buf->len) {
391 buf->ops = NULL; 410 buf->ops = NULL;
392 ops->release(pipe, buf); 411 ops->release(pipe, buf);
393 curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); 412 curbuf = (curbuf + 1) & (pipe->buffers - 1);
394 pipe->curbuf = curbuf; 413 pipe->curbuf = curbuf;
395 pipe->nrbufs = --bufs; 414 pipe->nrbufs = --bufs;
396 do_wakeup = 1; 415 do_wakeup = 1;
@@ -472,7 +491,7 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov,
472 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 491 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
473 if (pipe->nrbufs && chars != 0) { 492 if (pipe->nrbufs && chars != 0) {
474 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 493 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
475 (PIPE_BUFFERS-1); 494 (pipe->buffers - 1);
476 struct pipe_buffer *buf = pipe->bufs + lastbuf; 495 struct pipe_buffer *buf = pipe->bufs + lastbuf;
477 const struct pipe_buf_operations *ops = buf->ops; 496 const struct pipe_buf_operations *ops = buf->ops;
478 int offset = buf->offset + buf->len; 497 int offset = buf->offset + buf->len;
@@ -518,8 +537,8 @@ redo1:
518 break; 537 break;
519 } 538 }
520 bufs = pipe->nrbufs; 539 bufs = pipe->nrbufs;
521 if (bufs < PIPE_BUFFERS) { 540 if (bufs < pipe->buffers) {
522 int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); 541 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
523 struct pipe_buffer *buf = pipe->bufs + newbuf; 542 struct pipe_buffer *buf = pipe->bufs + newbuf;
524 struct page *page = pipe->tmp_page; 543 struct page *page = pipe->tmp_page;
525 char *src; 544 char *src;
@@ -580,7 +599,7 @@ redo2:
580 if (!total_len) 599 if (!total_len)
581 break; 600 break;
582 } 601 }
583 if (bufs < PIPE_BUFFERS) 602 if (bufs < pipe->buffers)
584 continue; 603 continue;
585 if (filp->f_flags & O_NONBLOCK) { 604 if (filp->f_flags & O_NONBLOCK) {
586 if (!ret) 605 if (!ret)
@@ -640,7 +659,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
640 nrbufs = pipe->nrbufs; 659 nrbufs = pipe->nrbufs;
641 while (--nrbufs >= 0) { 660 while (--nrbufs >= 0) {
642 count += pipe->bufs[buf].len; 661 count += pipe->bufs[buf].len;
643 buf = (buf+1) & (PIPE_BUFFERS-1); 662 buf = (buf+1) & (pipe->buffers - 1);
644 } 663 }
645 mutex_unlock(&inode->i_mutex); 664 mutex_unlock(&inode->i_mutex);
646 665
@@ -671,7 +690,7 @@ pipe_poll(struct file *filp, poll_table *wait)
671 } 690 }
672 691
673 if (filp->f_mode & FMODE_WRITE) { 692 if (filp->f_mode & FMODE_WRITE) {
674 mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0; 693 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
675 /* 694 /*
676 * Most Unices do not set POLLERR for FIFOs but on Linux they 695 * Most Unices do not set POLLERR for FIFOs but on Linux they
677 * behave exactly like pipes for poll(). 696 * behave exactly like pipes for poll().
@@ -877,25 +896,32 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
877 896
878 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 897 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
879 if (pipe) { 898 if (pipe) {
880 init_waitqueue_head(&pipe->wait); 899 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
881 pipe->r_counter = pipe->w_counter = 1; 900 if (pipe->bufs) {
882 pipe->inode = inode; 901 init_waitqueue_head(&pipe->wait);
902 pipe->r_counter = pipe->w_counter = 1;
903 pipe->inode = inode;
904 pipe->buffers = PIPE_DEF_BUFFERS;
905 return pipe;
906 }
907 kfree(pipe);
883 } 908 }
884 909
885 return pipe; 910 return NULL;
886} 911}
887 912
888void __free_pipe_info(struct pipe_inode_info *pipe) 913void __free_pipe_info(struct pipe_inode_info *pipe)
889{ 914{
890 int i; 915 int i;
891 916
892 for (i = 0; i < PIPE_BUFFERS; i++) { 917 for (i = 0; i < pipe->buffers; i++) {
893 struct pipe_buffer *buf = pipe->bufs + i; 918 struct pipe_buffer *buf = pipe->bufs + i;
894 if (buf->ops) 919 if (buf->ops)
895 buf->ops->release(pipe, buf); 920 buf->ops->release(pipe, buf);
896 } 921 }
897 if (pipe->tmp_page) 922 if (pipe->tmp_page)
898 __free_page(pipe->tmp_page); 923 __free_page(pipe->tmp_page);
924 kfree(pipe->bufs);
899 kfree(pipe); 925 kfree(pipe);
900} 926}
901 927
@@ -1094,6 +1120,126 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
1094} 1120}
1095 1121
1096/* 1122/*
1123 * Allocate a new array of pipe buffers and copy the info over. Returns the
1124 * pipe size if successful, or return -ERROR on error.
1125 */
1126static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
1127{
1128 struct pipe_buffer *bufs;
1129
1130 /*
1131 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
1132 * expect a lot of shrink+grow operations, just free and allocate
1133 * again like we would do for growing. If the pipe currently
1134 * contains more buffers than arg, then return busy.
1135 */
1136 if (nr_pages < pipe->nrbufs)
1137 return -EBUSY;
1138
1139 bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
1140 if (unlikely(!bufs))
1141 return -ENOMEM;
1142
1143 /*
1144 * The pipe array wraps around, so just start the new one at zero
1145 * and adjust the indexes.
1146 */
1147 if (pipe->nrbufs) {
1148 unsigned int tail;
1149 unsigned int head;
1150
1151 tail = pipe->curbuf + pipe->nrbufs;
1152 if (tail < pipe->buffers)
1153 tail = 0;
1154 else
1155 tail &= (pipe->buffers - 1);
1156
1157 head = pipe->nrbufs - tail;
1158 if (head)
1159 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
1160 if (tail)
1161 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
1162 }
1163
1164 pipe->curbuf = 0;
1165 kfree(pipe->bufs);
1166 pipe->bufs = bufs;
1167 pipe->buffers = nr_pages;
1168 return nr_pages * PAGE_SIZE;
1169}
1170
1171/*
1172 * Currently we rely on the pipe array holding a power-of-2 number
1173 * of pages.
1174 */
1175static inline unsigned int round_pipe_size(unsigned int size)
1176{
1177 unsigned long nr_pages;
1178
1179 nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
1180 return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
1181}
1182
1183/*
1184 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
1185 * will return an error.
1186 */
1187int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
1188 size_t *lenp, loff_t *ppos)
1189{
1190 int ret;
1191
1192 ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
1193 if (ret < 0 || !write)
1194 return ret;
1195
1196 pipe_max_size = round_pipe_size(pipe_max_size);
1197 return ret;
1198}
1199
1200long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
1201{
1202 struct pipe_inode_info *pipe;
1203 long ret;
1204
1205 pipe = file->f_path.dentry->d_inode->i_pipe;
1206 if (!pipe)
1207 return -EBADF;
1208
1209 mutex_lock(&pipe->inode->i_mutex);
1210
1211 switch (cmd) {
1212 case F_SETPIPE_SZ: {
1213 unsigned int size, nr_pages;
1214
1215 size = round_pipe_size(arg);
1216 nr_pages = size >> PAGE_SHIFT;
1217
1218 ret = -EINVAL;
1219 if (!nr_pages)
1220 goto out;
1221
1222 if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
1223 ret = -EPERM;
1224 goto out;
1225 }
1226 ret = pipe_set_size(pipe, nr_pages);
1227 break;
1228 }
1229 case F_GETPIPE_SZ:
1230 ret = pipe->buffers * PAGE_SIZE;
1231 break;
1232 default:
1233 ret = -EINVAL;
1234 break;
1235 }
1236
1237out:
1238 mutex_unlock(&pipe->inode->i_mutex);
1239 return ret;
1240}
1241
1242/*
1097 * pipefs should _never_ be mounted by userland - too much of security hassle, 1243 * pipefs should _never_ be mounted by userland - too much of security hassle,
1098 * no real gain from having the whole whorehouse mounted. So we don't need 1244 * no real gain from having the whole whorehouse mounted. So we don't need
1099 * any operations on the root directory. However, we need a non-trivial 1245 * any operations on the root directory. However, we need a non-trivial
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
267 shpending = p->signal->shared_pending.signal; 267 shpending = p->signal->shared_pending.signal;
268 blocked = p->blocked; 268 blocked = p->blocked;
269 collect_sigign_sigcatch(p, &ignored, &caught); 269 collect_sigign_sigcatch(p, &ignored, &caught);
270 num_threads = atomic_read(&p->signal->count); 270 num_threads = get_nr_threads(p);
271 rcu_read_lock(); /* FIXME: is this correct? */ 271 rcu_read_lock(); /* FIXME: is this correct? */
272 qsize = atomic_read(&__task_cred(p)->user->sigpending); 272 qsize = atomic_read(&__task_cred(p)->user->sigpending);
273 rcu_read_unlock(); 273 rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
410 tty_nr = new_encode_dev(tty_devnum(sig->tty)); 410 tty_nr = new_encode_dev(tty_devnum(sig->tty));
411 } 411 }
412 412
413 num_threads = atomic_read(&sig->count); 413 num_threads = get_nr_threads(task);
414 collect_sigign_sigcatch(task, &sigign, &sigcatch); 414 collect_sigign_sigcatch(task, &sigign, &sigcatch);
415 415
416 cmin_flt = sig->cmin_flt; 416 cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
166 return result; 166 return result;
167} 167}
168 168
169static int get_nr_threads(struct task_struct *tsk)
170{
171 unsigned long flags;
172 int count = 0;
173
174 if (lock_task_sighand(tsk, &flags)) {
175 count = atomic_read(&tsk->signal->count);
176 unlock_task_sighand(tsk, &flags);
177 }
178 return count;
179}
180
181static int proc_cwd_link(struct inode *inode, struct path *path) 169static int proc_cwd_link(struct inode *inode, struct path *path)
182{ 170{
183 struct task_struct *task = get_proc_task(inode); 171 struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
2444 const struct pid_entry *p = ptr; 2432 const struct pid_entry *p = ptr;
2445 struct inode *inode; 2433 struct inode *inode;
2446 struct proc_inode *ei; 2434 struct proc_inode *ei;
2447 struct dentry *error = ERR_PTR(-EINVAL); 2435 struct dentry *error;
2448 2436
2449 /* Allocate the inode */ 2437 /* Allocate the inode */
2450 error = ERR_PTR(-ENOMEM); 2438 error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
2794 2782
2795struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) 2783struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2796{ 2784{
2797 struct dentry *result = ERR_PTR(-ENOENT); 2785 struct dentry *result;
2798 struct task_struct *task; 2786 struct task_struct *task;
2799 unsigned tgid; 2787 unsigned tgid;
2800 struct pid_namespace *ns; 2788 struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
343/* 343/*
344 * Return an inode number between PROC_DYNAMIC_FIRST and 344 * Return an inode number between PROC_DYNAMIC_FIRST and
345 * 0xffffffff, or zero on failure. 345 * 0xffffffff, or zero on failure.
346 *
347 * Current inode allocations in the proc-fs (hex-numbers):
348 *
349 * 00000000 reserved
350 * 00000001-00000fff static entries (goners)
351 * 001 root-ino
352 *
353 * 00001000-00001fff unused
354 * 0001xxxx-7fffxxxx pid-dir entries for pid 1-7fff
355 * 80000000-efffffff unused
356 * f0000000-ffffffff dynamic entries
357 *
358 * Goal:
359 * Once we split the thing into several virtual filesystems,
360 * we will get rid of magical ranges (and this comment, BTW).
361 */ 346 */
362static unsigned int get_inode_number(void) 347static unsigned int get_inode_number(void)
363{ 348{
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
588 */ 588 */
589static void __init proc_kcore_text_init(void) 589static void __init proc_kcore_text_init(void)
590{ 590{
591 kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT); 591 kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
592} 592}
593#else 593#else
594static void __init proc_kcore_text_init(void) 594static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
110 if (err) 110 if (err)
111 return; 111 return;
112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns); 112 proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
113 err = PTR_ERR(proc_mnt);
114 if (IS_ERR(proc_mnt)) { 113 if (IS_ERR(proc_mnt)) {
115 unregister_filesystem(&proc_fs_type); 114 unregister_filesystem(&proc_fs_type);
116 return; 115 return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
634 return err; 634 return err;
635} 635}
636 636
637#ifdef CONFIG_HUGETLB_PAGE
637static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) 638static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
638{ 639{
639 u64 pme = 0; 640 u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
664 665
665 return err; 666 return err;
666} 667}
668#endif /* HUGETLB_PAGE */
667 669
668/* 670/*
669 * /proc/pid/pagemap - an array mapping virtual pages to pfns 671 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
733 735
734 pagemap_walk.pmd_entry = pagemap_pte_range; 736 pagemap_walk.pmd_entry = pagemap_pte_range;
735 pagemap_walk.pte_hole = pagemap_pte_hole; 737 pagemap_walk.pte_hole = pagemap_pte_hole;
738#ifdef CONFIG_HUGETLB_PAGE
736 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 739 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
740#endif
737 pagemap_walk.mm = mm; 741 pagemap_walk.mm = mm;
738 pagemap_walk.private = &pm; 742 pagemap_walk.private = &pm;
739 743
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
77 77
78const struct file_operations qnx4_dir_operations = 78const struct file_operations qnx4_dir_operations =
79{ 79{
80 .llseek = generic_file_llseek,
80 .read = generic_read_dir, 81 .read = generic_read_dir,
81 .readdir = qnx4_readdir, 82 .readdir = qnx4_readdir,
82 .fsync = simple_fsync, 83 .fsync = generic_file_fsync,
83}; 84};
84 85
85const struct inode_operations qnx4_dir_inode_operations = 86const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash;
228 228
229struct dqstats dqstats; 229struct dqstats dqstats;
230EXPORT_SYMBOL(dqstats); 230EXPORT_SYMBOL(dqstats);
231#ifdef CONFIG_SMP
232struct dqstats *dqstats_pcpu;
233EXPORT_SYMBOL(dqstats_pcpu);
234#endif
235 231
236static qsize_t inode_get_rsv_space(struct inode *inode); 232static qsize_t inode_get_rsv_space(struct inode *inode);
237static void __dquot_initialize(struct inode *inode, int type); 233static void __dquot_initialize(struct inode *inode, int type);
@@ -584,7 +580,7 @@ out:
584} 580}
585EXPORT_SYMBOL(dquot_scan_active); 581EXPORT_SYMBOL(dquot_scan_active);
586 582
587int vfs_quota_sync(struct super_block *sb, int type, int wait) 583int dquot_quota_sync(struct super_block *sb, int type, int wait)
588{ 584{
589 struct list_head *dirty; 585 struct list_head *dirty;
590 struct dquot *dquot; 586 struct dquot *dquot;
@@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
656 652
657 return 0; 653 return 0;
658} 654}
659EXPORT_SYMBOL(vfs_quota_sync); 655EXPORT_SYMBOL(dquot_quota_sync);
660 656
661/* Free unused dquots from cache */ 657/* Free unused dquots from cache */
662static void prune_dqcache(int count) 658static void prune_dqcache(int count)
@@ -676,27 +672,10 @@ static void prune_dqcache(int count)
676 } 672 }
677} 673}
678 674
679static int dqstats_read(unsigned int type)
680{
681 int count = 0;
682#ifdef CONFIG_SMP
683 int cpu;
684 for_each_possible_cpu(cpu)
685 count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
686 /* Statistics reading is racy, but absolute accuracy isn't required */
687 if (count < 0)
688 count = 0;
689#else
690 count = dqstats.stat[type];
691#endif
692 return count;
693}
694
695/* 675/*
696 * This is called from kswapd when we think we need some 676 * This is called from kswapd when we think we need some
697 * more memory 677 * more memory
698 */ 678 */
699
700static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) 679static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
701{ 680{
702 if (nr) { 681 if (nr) {
@@ -704,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
704 prune_dqcache(nr); 683 prune_dqcache(nr);
705 spin_unlock(&dq_list_lock); 684 spin_unlock(&dq_list_lock);
706 } 685 }
707 return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure; 686 return ((unsigned)
687 percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
688 /100) * sysctl_vfs_cache_pressure;
708} 689}
709 690
710static struct shrinker dqcache_shrinker = { 691static struct shrinker dqcache_shrinker = {
@@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
1514/* 1495/*
1515 * This operation can block, but only after everything is updated 1496 * This operation can block, but only after everything is updated
1516 */ 1497 */
1517int __dquot_alloc_space(struct inode *inode, qsize_t number, 1498int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
1518 int warn, int reserve)
1519{ 1499{
1520 int cnt, ret = 0; 1500 int cnt, ret = 0;
1521 char warntype[MAXQUOTAS]; 1501 char warntype[MAXQUOTAS];
1502 int warn = flags & DQUOT_SPACE_WARN;
1503 int reserve = flags & DQUOT_SPACE_RESERVE;
1504 int nofail = flags & DQUOT_SPACE_NOFAIL;
1522 1505
1523 /* 1506 /*
1524 * First test before acquiring mutex - solves deadlocks when we 1507 * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
1539 continue; 1522 continue;
1540 ret = check_bdq(inode->i_dquot[cnt], number, !warn, 1523 ret = check_bdq(inode->i_dquot[cnt], number, !warn,
1541 warntype+cnt); 1524 warntype+cnt);
1542 if (ret) { 1525 if (ret && !nofail) {
1543 spin_unlock(&dq_data_lock); 1526 spin_unlock(&dq_data_lock);
1544 goto out_flush_warn; 1527 goto out_flush_warn;
1545 } 1528 }
@@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
1638/* 1621/*
1639 * This operation can block, but only after everything is updated 1622 * This operation can block, but only after everything is updated
1640 */ 1623 */
1641void __dquot_free_space(struct inode *inode, qsize_t number, int reserve) 1624void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
1642{ 1625{
1643 unsigned int cnt; 1626 unsigned int cnt;
1644 char warntype[MAXQUOTAS]; 1627 char warntype[MAXQUOTAS];
1628 int reserve = flags & DQUOT_SPACE_RESERVE;
1645 1629
1646 /* First test before acquiring mutex - solves deadlocks when we 1630 /* First test before acquiring mutex - solves deadlocks when we
1647 * re-enter the quota code and are already holding the mutex */ 1631 * re-enter the quota code and are already holding the mutex */
@@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
1812 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) 1796 if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
1813 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA); 1797 transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
1814 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid) 1798 if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
1815 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA); 1799 transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
1816 1800
1817 ret = __dquot_transfer(inode, transfer_to); 1801 ret = __dquot_transfer(inode, transfer_to);
1818 dqput_all(transfer_to); 1802 dqput_all(transfer_to);
@@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
1847 .alloc_dquot = dquot_alloc, 1831 .alloc_dquot = dquot_alloc,
1848 .destroy_dquot = dquot_destroy, 1832 .destroy_dquot = dquot_destroy,
1849}; 1833};
1834EXPORT_SYMBOL(dquot_operations);
1850 1835
1851/* 1836/*
1852 * Generic helper for ->open on filesystems supporting disk quotas. 1837 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
1865/* 1850/*
1866 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) 1851 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
1867 */ 1852 */
1868int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) 1853int dquot_disable(struct super_block *sb, int type, unsigned int flags)
1869{ 1854{
1870 int cnt, ret = 0; 1855 int cnt, ret = 0;
1871 struct quota_info *dqopt = sb_dqopt(sb); 1856 struct quota_info *dqopt = sb_dqopt(sb);
@@ -1995,14 +1980,15 @@ put_inodes:
1995 } 1980 }
1996 return ret; 1981 return ret;
1997} 1982}
1998EXPORT_SYMBOL(vfs_quota_disable); 1983EXPORT_SYMBOL(dquot_disable);
1999 1984
2000int vfs_quota_off(struct super_block *sb, int type, int remount) 1985int dquot_quota_off(struct super_block *sb, int type)
2001{ 1986{
2002 return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : 1987 return dquot_disable(sb, type,
2003 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); 1988 DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
2004} 1989}
2005EXPORT_SYMBOL(vfs_quota_off); 1990EXPORT_SYMBOL(dquot_quota_off);
1991
2006/* 1992/*
2007 * Turn quotas on on a device 1993 * Turn quotas on on a device
2008 */ 1994 */
@@ -2120,36 +2106,43 @@ out_fmt:
2120} 2106}
2121 2107
2122/* Reenable quotas on remount RW */ 2108/* Reenable quotas on remount RW */
2123static int vfs_quota_on_remount(struct super_block *sb, int type) 2109int dquot_resume(struct super_block *sb, int type)
2124{ 2110{
2125 struct quota_info *dqopt = sb_dqopt(sb); 2111 struct quota_info *dqopt = sb_dqopt(sb);
2126 struct inode *inode; 2112 struct inode *inode;
2127 int ret; 2113 int ret = 0, cnt;
2128 unsigned int flags; 2114 unsigned int flags;
2129 2115
2130 mutex_lock(&dqopt->dqonoff_mutex); 2116 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2131 if (!sb_has_quota_suspended(sb, type)) { 2117 if (type != -1 && cnt != type)
2118 continue;
2119
2120 mutex_lock(&dqopt->dqonoff_mutex);
2121 if (!sb_has_quota_suspended(sb, cnt)) {
2122 mutex_unlock(&dqopt->dqonoff_mutex);
2123 continue;
2124 }
2125 inode = dqopt->files[cnt];
2126 dqopt->files[cnt] = NULL;
2127 spin_lock(&dq_state_lock);
2128 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2129 DQUOT_LIMITS_ENABLED,
2130 cnt);
2131 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
2132 spin_unlock(&dq_state_lock);
2132 mutex_unlock(&dqopt->dqonoff_mutex); 2133 mutex_unlock(&dqopt->dqonoff_mutex);
2133 return 0;
2134 }
2135 inode = dqopt->files[type];
2136 dqopt->files[type] = NULL;
2137 spin_lock(&dq_state_lock);
2138 flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
2139 DQUOT_LIMITS_ENABLED, type);
2140 dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
2141 spin_unlock(&dq_state_lock);
2142 mutex_unlock(&dqopt->dqonoff_mutex);
2143 2134
2144 flags = dquot_generic_flag(flags, type); 2135 flags = dquot_generic_flag(flags, cnt);
2145 ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, 2136 ret = vfs_load_quota_inode(inode, cnt,
2146 flags); 2137 dqopt->info[cnt].dqi_fmt_id, flags);
2147 iput(inode); 2138 iput(inode);
2139 }
2148 2140
2149 return ret; 2141 return ret;
2150} 2142}
2143EXPORT_SYMBOL(dquot_resume);
2151 2144
2152int vfs_quota_on_path(struct super_block *sb, int type, int format_id, 2145int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
2153 struct path *path) 2146 struct path *path)
2154{ 2147{
2155 int error = security_quota_on(path->dentry); 2148 int error = security_quota_on(path->dentry);
@@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
2164 DQUOT_LIMITS_ENABLED); 2157 DQUOT_LIMITS_ENABLED);
2165 return error; 2158 return error;
2166} 2159}
2167EXPORT_SYMBOL(vfs_quota_on_path); 2160EXPORT_SYMBOL(dquot_quota_on_path);
2168 2161
2169int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, 2162int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
2170 int remount)
2171{ 2163{
2172 struct path path; 2164 struct path path;
2173 int error; 2165 int error;
2174 2166
2175 if (remount)
2176 return vfs_quota_on_remount(sb, type);
2177
2178 error = kern_path(name, LOOKUP_FOLLOW, &path); 2167 error = kern_path(name, LOOKUP_FOLLOW, &path);
2179 if (!error) { 2168 if (!error) {
2180 error = vfs_quota_on_path(sb, type, format_id, &path); 2169 error = dquot_quota_on_path(sb, type, format_id, &path);
2181 path_put(&path); 2170 path_put(&path);
2182 } 2171 }
2183 return error; 2172 return error;
2184} 2173}
2185EXPORT_SYMBOL(vfs_quota_on); 2174EXPORT_SYMBOL(dquot_quota_on);
2186 2175
2187/* 2176/*
2188 * More powerful function for turning on quotas allowing setting 2177 * More powerful function for turning on quotas allowing setting
2189 * of individual quota flags 2178 * of individual quota flags
2190 */ 2179 */
2191int vfs_quota_enable(struct inode *inode, int type, int format_id, 2180int dquot_enable(struct inode *inode, int type, int format_id,
2192 unsigned int flags) 2181 unsigned int flags)
2193{ 2182{
2194 int ret = 0; 2183 int ret = 0;
2195 struct super_block *sb = inode->i_sb; 2184 struct super_block *sb = inode->i_sb;
2196 struct quota_info *dqopt = sb_dqopt(sb); 2185 struct quota_info *dqopt = sb_dqopt(sb);
2197 2186
2198 /* Just unsuspend quotas? */ 2187 /* Just unsuspend quotas? */
2199 if (flags & DQUOT_SUSPENDED) 2188 BUG_ON(flags & DQUOT_SUSPENDED);
2200 return vfs_quota_on_remount(sb, type); 2189
2201 if (!flags) 2190 if (!flags)
2202 return 0; 2191 return 0;
2203 /* Just updating flags needed? */ 2192 /* Just updating flags needed? */
@@ -2229,13 +2218,13 @@ out_lock:
2229load_quota: 2218load_quota:
2230 return vfs_load_quota_inode(inode, type, format_id, flags); 2219 return vfs_load_quota_inode(inode, type, format_id, flags);
2231} 2220}
2232EXPORT_SYMBOL(vfs_quota_enable); 2221EXPORT_SYMBOL(dquot_enable);
2233 2222
2234/* 2223/*
2235 * This function is used when filesystem needs to initialize quotas 2224 * This function is used when filesystem needs to initialize quotas
2236 * during mount time. 2225 * during mount time.
2237 */ 2226 */
2238int vfs_quota_on_mount(struct super_block *sb, char *qf_name, 2227int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
2239 int format_id, int type) 2228 int format_id, int type)
2240{ 2229{
2241 struct dentry *dentry; 2230 struct dentry *dentry;
@@ -2261,24 +2250,7 @@ out:
2261 dput(dentry); 2250 dput(dentry);
2262 return error; 2251 return error;
2263} 2252}
2264EXPORT_SYMBOL(vfs_quota_on_mount); 2253EXPORT_SYMBOL(dquot_quota_on_mount);
2265
2266/* Wrapper to turn on quotas when remounting rw */
2267int vfs_dq_quota_on_remount(struct super_block *sb)
2268{
2269 int cnt;
2270 int ret = 0, err;
2271
2272 if (!sb->s_qcop || !sb->s_qcop->quota_on)
2273 return -ENOSYS;
2274 for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
2275 err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
2276 if (err < 0 && !ret)
2277 ret = err;
2278 }
2279 return ret;
2280}
2281EXPORT_SYMBOL(vfs_dq_quota_on_remount);
2282 2254
2283static inline qsize_t qbtos(qsize_t blocks) 2255static inline qsize_t qbtos(qsize_t blocks)
2284{ 2256{
@@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2313 spin_unlock(&dq_data_lock); 2285 spin_unlock(&dq_data_lock);
2314} 2286}
2315 2287
2316int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, 2288int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
2317 struct fs_disk_quota *di) 2289 struct fs_disk_quota *di)
2318{ 2290{
2319 struct dquot *dquot; 2291 struct dquot *dquot;
2320 2292
@@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
2326 2298
2327 return 0; 2299 return 0;
2328} 2300}
2329EXPORT_SYMBOL(vfs_get_dqblk); 2301EXPORT_SYMBOL(dquot_get_dqblk);
2330 2302
2331#define VFS_FS_DQ_MASK \ 2303#define VFS_FS_DQ_MASK \
2332 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ 2304 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
@@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2425 return 0; 2397 return 0;
2426} 2398}
2427 2399
2428int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, 2400int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
2429 struct fs_disk_quota *di) 2401 struct fs_disk_quota *di)
2430{ 2402{
2431 struct dquot *dquot; 2403 struct dquot *dquot;
@@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
2441out: 2413out:
2442 return rc; 2414 return rc;
2443} 2415}
2444EXPORT_SYMBOL(vfs_set_dqblk); 2416EXPORT_SYMBOL(dquot_set_dqblk);
2445 2417
2446/* Generic routine for getting common part of quota file information */ 2418/* Generic routine for getting common part of quota file information */
2447int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2419int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2448{ 2420{
2449 struct mem_dqinfo *mi; 2421 struct mem_dqinfo *mi;
2450 2422
@@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2463 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2435 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2464 return 0; 2436 return 0;
2465} 2437}
2466EXPORT_SYMBOL(vfs_get_dqinfo); 2438EXPORT_SYMBOL(dquot_get_dqinfo);
2467 2439
2468/* Generic routine for setting common part of quota file information */ 2440/* Generic routine for setting common part of quota file information */
2469int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) 2441int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2470{ 2442{
2471 struct mem_dqinfo *mi; 2443 struct mem_dqinfo *mi;
2472 int err = 0; 2444 int err = 0;
@@ -2493,27 +2465,27 @@ out:
2493 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); 2465 mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
2494 return err; 2466 return err;
2495} 2467}
2496EXPORT_SYMBOL(vfs_set_dqinfo); 2468EXPORT_SYMBOL(dquot_set_dqinfo);
2497 2469
2498const struct quotactl_ops vfs_quotactl_ops = { 2470const struct quotactl_ops dquot_quotactl_ops = {
2499 .quota_on = vfs_quota_on, 2471 .quota_on = dquot_quota_on,
2500 .quota_off = vfs_quota_off, 2472 .quota_off = dquot_quota_off,
2501 .quota_sync = vfs_quota_sync, 2473 .quota_sync = dquot_quota_sync,
2502 .get_info = vfs_get_dqinfo, 2474 .get_info = dquot_get_dqinfo,
2503 .set_info = vfs_set_dqinfo, 2475 .set_info = dquot_set_dqinfo,
2504 .get_dqblk = vfs_get_dqblk, 2476 .get_dqblk = dquot_get_dqblk,
2505 .set_dqblk = vfs_set_dqblk 2477 .set_dqblk = dquot_set_dqblk
2506}; 2478};
2507 2479EXPORT_SYMBOL(dquot_quotactl_ops);
2508 2480
2509static int do_proc_dqstats(struct ctl_table *table, int write, 2481static int do_proc_dqstats(struct ctl_table *table, int write,
2510 void __user *buffer, size_t *lenp, loff_t *ppos) 2482 void __user *buffer, size_t *lenp, loff_t *ppos)
2511{ 2483{
2512#ifdef CONFIG_SMP
2513 /* Update global table */
2514 unsigned int type = (int *)table->data - dqstats.stat; 2484 unsigned int type = (int *)table->data - dqstats.stat;
2515 dqstats.stat[type] = dqstats_read(type); 2485
2516#endif 2486 /* Update global table */
2487 dqstats.stat[type] =
2488 percpu_counter_sum_positive(&dqstats.counter[type]);
2517 return proc_dointvec(table, write, buffer, lenp, ppos); 2489 return proc_dointvec(table, write, buffer, lenp, ppos);
2518} 2490}
2519 2491
@@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = {
2606 2578
2607static int __init dquot_init(void) 2579static int __init dquot_init(void)
2608{ 2580{
2609 int i; 2581 int i, ret;
2610 unsigned long nr_hash, order; 2582 unsigned long nr_hash, order;
2611 2583
2612 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); 2584 printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2624,12 +2596,11 @@ static int __init dquot_init(void)
2624 if (!dquot_hash) 2596 if (!dquot_hash)
2625 panic("Cannot create dquot hash table"); 2597 panic("Cannot create dquot hash table");
2626 2598
2627#ifdef CONFIG_SMP 2599 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2628 dqstats_pcpu = alloc_percpu(struct dqstats); 2600 ret = percpu_counter_init(&dqstats.counter[i], 0);
2629 if (!dqstats_pcpu) 2601 if (ret)
2630 panic("Cannot create dquot stats table"); 2602 panic("Cannot create dquot stat counters");
2631#endif 2603 }
2632 memset(&dqstats, 0, sizeof(struct dqstats));
2633 2604
2634 /* Find power-of-two hlist_heads which can fit into allocation */ 2605 /* Find power-of-two hlist_heads which can fit into allocation */
2635 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head); 2606 nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index cfc78826da90..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -45,36 +45,22 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
45 return security_quotactl(cmd, type, id, sb); 45 return security_quotactl(cmd, type, id, sb);
46} 46}
47 47
48static void quota_sync_one(struct super_block *sb, void *arg)
49{
50 if (sb->s_qcop && sb->s_qcop->quota_sync)
51 sb->s_qcop->quota_sync(sb, *(int *)arg, 1);
52}
53
48static int quota_sync_all(int type) 54static int quota_sync_all(int type)
49{ 55{
50 struct super_block *sb;
51 int ret; 56 int ret;
52 57
53 if (type >= MAXQUOTAS) 58 if (type >= MAXQUOTAS)
54 return -EINVAL; 59 return -EINVAL;
55 ret = security_quotactl(Q_SYNC, type, 0, NULL); 60 ret = security_quotactl(Q_SYNC, type, 0, NULL);
56 if (ret) 61 if (!ret)
57 return ret; 62 iterate_supers(quota_sync_one, &type);
58 63 return ret;
59 spin_lock(&sb_lock);
60restart:
61 list_for_each_entry(sb, &super_blocks, s_list) {
62 if (!sb->s_qcop || !sb->s_qcop->quota_sync)
63 continue;
64
65 sb->s_count++;
66 spin_unlock(&sb_lock);
67 down_read(&sb->s_umount);
68 if (sb->s_root)
69 sb->s_qcop->quota_sync(sb, type, 1);
70 up_read(&sb->s_umount);
71 spin_lock(&sb_lock);
72 if (__put_super_and_need_restart(sb))
73 goto restart;
74 }
75 spin_unlock(&sb_lock);
76
77 return 0;
78} 64}
79 65
80static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 66static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
@@ -87,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
87 if (IS_ERR(pathname)) 73 if (IS_ERR(pathname))
88 return PTR_ERR(pathname); 74 return PTR_ERR(pathname);
89 if (sb->s_qcop->quota_on) 75 if (sb->s_qcop->quota_on)
90 ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); 76 ret = sb->s_qcop->quota_on(sb, type, id, pathname);
91 putname(pathname); 77 putname(pathname);
92 return ret; 78 return ret;
93} 79}
@@ -274,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
274 case Q_QUOTAOFF: 260 case Q_QUOTAOFF:
275 if (!sb->s_qcop->quota_off) 261 if (!sb->s_qcop->quota_off)
276 return -ENOSYS; 262 return -ENOSYS;
277 return sb->s_qcop->quota_off(sb, type, 0); 263 return sb->s_qcop->quota_off(sb, type);
278 case Q_GETFMT: 264 case Q_GETFMT:
279 return quota_getfmt(sb, type, addr); 265 return quota_getfmt(sb, type, addr);
280 case Q_GETINFO: 266 case Q_GETINFO:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .mmap = generic_file_mmap, 45 .mmap = generic_file_mmap,
46 .fsync = simple_sync_file, 46 .fsync = noop_fsync,
47 .splice_read = generic_file_splice_read, 47 .splice_read = generic_file_splice_read,
48 .splice_write = generic_file_splice_write, 48 .splice_write = generic_file_splice_write,
49 .llseek = generic_file_llseek, 49 .llseek = generic_file_llseek,
50}; 50};
51 51
52const struct inode_operations ramfs_file_inode_operations = { 52const struct inode_operations ramfs_file_inode_operations = {
53 .setattr = simple_setattr,
53 .getattr = simple_getattr, 54 .getattr = simple_getattr,
54}; 55};
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
46 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
47 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
146 return ret; 146 return ret;
147 } 147 }
148 148
149 ret = vmtruncate(inode, newsize); 149 ret = simple_setsize(inode, newsize);
150 150
151 return ret; 151 return ret;
152} 152}
@@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
169 169
170 /* pick out size-changing events */ 170 /* pick out size-changing events */
171 if (ia->ia_valid & ATTR_SIZE) { 171 if (ia->ia_valid & ATTR_SIZE) {
172 loff_t size = i_size_read(inode); 172 loff_t size = inode->i_size;
173
173 if (ia->ia_size != size) { 174 if (ia->ia_size != size) {
174 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 175 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
175 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 176 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
182 } 183 }
183 } 184 }
184 185
185 ret = inode_setattr(inode, ia); 186 generic_setattr(inode, ia);
186 out: 187 out:
187 ia->ia_valid = old_ia_valid; 188 ia->ia_valid = old_ia_valid;
188 return ret; 189 return ret;
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f47cd212dee1..a5ebae70dc6d 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -52,14 +52,13 @@ static struct backing_dev_info ramfs_backing_dev_info = {
52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, 52 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
53}; 53};
54 54
55struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) 55struct inode *ramfs_get_inode(struct super_block *sb,
56 const struct inode *dir, int mode, dev_t dev)
56{ 57{
57 struct inode * inode = new_inode(sb); 58 struct inode * inode = new_inode(sb);
58 59
59 if (inode) { 60 if (inode) {
60 inode->i_mode = mode; 61 inode_init_owner(inode, dir, mode);
61 inode->i_uid = current_fsuid();
62 inode->i_gid = current_fsgid();
63 inode->i_mapping->a_ops = &ramfs_aops; 62 inode->i_mapping->a_ops = &ramfs_aops;
64 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; 63 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
65 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 64 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
@@ -95,15 +94,10 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
95static int 94static int
96ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) 95ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
97{ 96{
98 struct inode * inode = ramfs_get_inode(dir->i_sb, mode, dev); 97 struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev);
99 int error = -ENOSPC; 98 int error = -ENOSPC;
100 99
101 if (inode) { 100 if (inode) {
102 if (dir->i_mode & S_ISGID) {
103 inode->i_gid = dir->i_gid;
104 if (S_ISDIR(mode))
105 inode->i_mode |= S_ISGID;
106 }
107 d_instantiate(dentry, inode); 101 d_instantiate(dentry, inode);
108 dget(dentry); /* Extra count - pin the dentry in core */ 102 dget(dentry); /* Extra count - pin the dentry in core */
109 error = 0; 103 error = 0;
@@ -130,13 +124,11 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char *
130 struct inode *inode; 124 struct inode *inode;
131 int error = -ENOSPC; 125 int error = -ENOSPC;
132 126
133 inode = ramfs_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0); 127 inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
134 if (inode) { 128 if (inode) {
135 int l = strlen(symname)+1; 129 int l = strlen(symname)+1;
136 error = page_symlink(inode, symname, l); 130 error = page_symlink(inode, symname, l);
137 if (!error) { 131 if (!error) {
138 if (dir->i_mode & S_ISGID)
139 inode->i_gid = dir->i_gid;
140 d_instantiate(dentry, inode); 132 d_instantiate(dentry, inode);
141 dget(dentry); 133 dget(dentry);
142 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 134 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -241,7 +233,7 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent)
241 sb->s_op = &ramfs_ops; 233 sb->s_op = &ramfs_ops;
242 sb->s_time_gran = 1; 234 sb->s_time_gran = 1;
243 235
244 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 236 inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0);
245 if (!inode) { 237 if (!inode) {
246 err = -ENOMEM; 238 err = -ENOMEM;
247 goto fail; 239 goto fail;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
97} 97}
98EXPORT_SYMBOL(generic_file_llseek); 98EXPORT_SYMBOL(generic_file_llseek);
99 99
100/**
101 * noop_llseek - No Operation Performed llseek implementation
102 * @file: file structure to seek on
103 * @offset: file offset to seek to
104 * @origin: type of seek
105 *
106 * This is an implementation of ->llseek useable for the rare special case when
107 * userspace expects the seek to succeed but the (device) file is actually not
108 * able to perform the seek. In this case you use noop_llseek() instead of
109 * falling back to the default implementation of ->llseek.
110 */
111loff_t noop_llseek(struct file *file, loff_t offset, int origin)
112{
113 return file->f_pos;
114}
115EXPORT_SYMBOL(noop_llseek);
116
100loff_t no_llseek(struct file *file, loff_t offset, int origin) 117loff_t no_llseek(struct file *file, loff_t offset, int origin)
101{ 118{
102 return -ESPIPE; 119 return -ESPIPE;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
14extern const struct reiserfs_key MIN_KEY; 14extern const struct reiserfs_key MIN_KEY;
15 15
16static int reiserfs_readdir(struct file *, void *, filldir_t); 16static int reiserfs_readdir(struct file *, void *, filldir_t);
17static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 17static int reiserfs_dir_fsync(struct file *filp, int datasync);
18 int datasync);
19 18
20const struct file_operations reiserfs_dir_operations = { 19const struct file_operations reiserfs_dir_operations = {
20 .llseek = generic_file_llseek,
21 .read = generic_read_dir, 21 .read = generic_read_dir,
22 .readdir = reiserfs_readdir, 22 .readdir = reiserfs_readdir,
23 .fsync = reiserfs_dir_fsync, 23 .fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
27#endif 27#endif
28}; 28};
29 29
30static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, 30static int reiserfs_dir_fsync(struct file *filp, int datasync)
31 int datasync)
32{ 31{
33 struct inode *inode = dentry->d_inode; 32 struct inode *inode = filp->f_mapping->host;
34 int err; 33 int err;
35 reiserfs_write_lock(inode->i_sb); 34 reiserfs_write_lock(inode->i_sb);
36 err = reiserfs_commit_for_inode(inode); 35 err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 1d9c12714c5c..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 134 * be removed...
135 */ 135 */
136 136
137static int reiserfs_sync_file(struct file *filp, 137static int reiserfs_sync_file(struct file *filp, int datasync)
138 struct dentry *dentry, int datasync)
139{ 138{
140 struct inode *inode = dentry->d_inode; 139 struct inode *inode = filp->f_mapping->host;
141 int err; 140 int err;
142 int barrier_done; 141 int barrier_done;
143 142
@@ -147,7 +146,8 @@ static int reiserfs_sync_file(struct file *filp,
147 barrier_done = reiserfs_commit_for_inode(inode); 146 barrier_done = reiserfs_commit_for_inode(inode);
148 reiserfs_write_unlock(inode->i_sb); 147 reiserfs_write_unlock(inode->i_sb);
149 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) 148 if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
150 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 149 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
150 BLKDEV_IFL_WAIT);
151 if (barrier_done < 0) 151 if (barrier_done < 0)
152 return barrier_done; 152 return barrier_done;
153 return (err < 0) ? -EIO : 0; 153 return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index d0c43cb99ffc..ee78d4a0086a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -561,23 +561,13 @@ static int drop_new_inode(struct inode *inode)
561*/ 561*/
562static int new_inode_init(struct inode *inode, struct inode *dir, int mode) 562static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
563{ 563{
564
565 /* the quota init calls have to know who to charge the quota to, so
566 ** we have to set uid and gid here
567 */
568 inode->i_uid = current_fsuid();
569 inode->i_mode = mode;
570 /* Make inode invalid - just in case we are going to drop it before 564 /* Make inode invalid - just in case we are going to drop it before
571 * the initialization happens */ 565 * the initialization happens */
572 INODE_PKEY(inode)->k_objectid = 0; 566 INODE_PKEY(inode)->k_objectid = 0;
573 567 /* the quota init calls have to know who to charge the quota to, so
574 if (dir->i_mode & S_ISGID) { 568 ** we have to set uid and gid here
575 inode->i_gid = dir->i_gid; 569 */
576 if (S_ISDIR(mode)) 570 inode_init_owner(inode, dir, mode);
577 inode->i_mode |= S_ISGID;
578 } else {
579 inode->i_gid = current_fsgid();
580 }
581 dquot_initialize(inode); 571 dquot_initialize(inode);
582 return 0; 572 return 0;
583} 573}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
158#ifdef CONFIG_QUOTA 158#ifdef CONFIG_QUOTA
159 int i; 159 int i;
160 int ms_active_set; 160 int ms_active_set;
161 int quota_enabled[MAXQUOTAS];
161#endif 162#endif
162 163
163 /* compose key to look for "save" links */ 164 /* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
179 } 180 }
180 /* Turn on quotas so that they are updated correctly */ 181 /* Turn on quotas so that they are updated correctly */
181 for (i = 0; i < MAXQUOTAS; i++) { 182 for (i = 0; i < MAXQUOTAS; i++) {
183 quota_enabled[i] = 1;
182 if (REISERFS_SB(s)->s_qf_names[i]) { 184 if (REISERFS_SB(s)->s_qf_names[i]) {
183 int ret = reiserfs_quota_on_mount(s, i); 185 int ret;
186
187 if (sb_has_quota_active(s, i)) {
188 quota_enabled[i] = 0;
189 continue;
190 }
191 ret = reiserfs_quota_on_mount(s, i);
184 if (ret < 0) 192 if (ret < 0)
185 reiserfs_warning(s, "reiserfs-2500", 193 reiserfs_warning(s, "reiserfs-2500",
186 "cannot turn on journaled " 194 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
304#ifdef CONFIG_QUOTA 312#ifdef CONFIG_QUOTA
305 /* Turn quotas off */ 313 /* Turn quotas off */
306 for (i = 0; i < MAXQUOTAS; i++) { 314 for (i = 0; i < MAXQUOTAS; i++) {
307 if (sb_dqopt(s)->files[i]) 315 if (sb_dqopt(s)->files[i] && quota_enabled[i])
308 vfs_quota_off(s, i, 0); 316 dquot_quota_off(s, i);
309 } 317 }
310 if (ms_active_set) 318 if (ms_active_set)
311 /* Restore the flag back */ 319 /* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
466 struct reiserfs_transaction_handle th; 474 struct reiserfs_transaction_handle th;
467 th.t_trans_id = 0; 475 th.t_trans_id = 0;
468 476
477 dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
478
469 reiserfs_write_lock(s); 479 reiserfs_write_lock(s);
470 480
471 if (s->s_dirt) 481 if (s->s_dirt)
@@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
620static int reiserfs_release_dquot(struct dquot *); 630static int reiserfs_release_dquot(struct dquot *);
621static int reiserfs_mark_dquot_dirty(struct dquot *); 631static int reiserfs_mark_dquot_dirty(struct dquot *);
622static int reiserfs_write_info(struct super_block *, int); 632static int reiserfs_write_info(struct super_block *, int);
623static int reiserfs_quota_on(struct super_block *, int, int, char *, int); 633static int reiserfs_quota_on(struct super_block *, int, int, char *);
624 634
625static const struct dquot_operations reiserfs_quota_operations = { 635static const struct dquot_operations reiserfs_quota_operations = {
626 .write_dquot = reiserfs_write_dquot, 636 .write_dquot = reiserfs_write_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
634 644
635static const struct quotactl_ops reiserfs_qctl_operations = { 645static const struct quotactl_ops reiserfs_qctl_operations = {
636 .quota_on = reiserfs_quota_on, 646 .quota_on = reiserfs_quota_on,
637 .quota_off = vfs_quota_off, 647 .quota_off = dquot_quota_off,
638 .quota_sync = vfs_quota_sync, 648 .quota_sync = dquot_quota_sync,
639 .get_info = vfs_get_dqinfo, 649 .get_info = dquot_get_dqinfo,
640 .set_info = vfs_set_dqinfo, 650 .set_info = dquot_set_dqinfo,
641 .get_dqblk = vfs_get_dqblk, 651 .get_dqblk = dquot_get_dqblk,
642 .set_dqblk = vfs_set_dqblk, 652 .set_dqblk = dquot_set_dqblk,
643}; 653};
644#endif 654#endif
645 655
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1242 if (s->s_flags & MS_RDONLY) 1252 if (s->s_flags & MS_RDONLY)
1243 /* it is read-only already */ 1253 /* it is read-only already */
1244 goto out_ok; 1254 goto out_ok;
1255
1256 err = dquot_suspend(s, -1);
1257 if (err < 0)
1258 goto out_err;
1259
1245 /* try to remount file system with read-only permissions */ 1260 /* try to remount file system with read-only permissions */
1246 if (sb_umount_state(rs) == REISERFS_VALID_FS 1261 if (sb_umount_state(rs) == REISERFS_VALID_FS
1247 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) { 1262 || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1295 s->s_dirt = 0; 1310 s->s_dirt = 0;
1296 1311
1297 if (!(*mount_flags & MS_RDONLY)) { 1312 if (!(*mount_flags & MS_RDONLY)) {
1313 dquot_resume(s, -1);
1298 finish_unfinished(s); 1314 finish_unfinished(s);
1299 reiserfs_xattr_init(s, *mount_flags); 1315 reiserfs_xattr_init(s, *mount_flags);
1300 } 1316 }
@@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
2022 */ 2038 */
2023static int reiserfs_quota_on_mount(struct super_block *sb, int type) 2039static int reiserfs_quota_on_mount(struct super_block *sb, int type)
2024{ 2040{
2025 return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type], 2041 return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
2026 REISERFS_SB(sb)->s_jquota_fmt, type); 2042 REISERFS_SB(sb)->s_jquota_fmt, type);
2027} 2043}
2028 2044
2029/* 2045/*
2030 * Standard function to be called on quota_on 2046 * Standard function to be called on quota_on
2031 */ 2047 */
2032static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, 2048static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2033 char *name, int remount) 2049 char *name)
2034{ 2050{
2035 int err; 2051 int err;
2036 struct path path; 2052 struct path path;
@@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2039 2055
2040 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) 2056 if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
2041 return -EINVAL; 2057 return -EINVAL;
2042 /* No more checks needed? Path and format_id are bogus anyway... */ 2058
2043 if (remount)
2044 return vfs_quota_on(sb, type, format_id, name, 1);
2045 err = kern_path(name, LOOKUP_FOLLOW, &path); 2059 err = kern_path(name, LOOKUP_FOLLOW, &path);
2046 if (err) 2060 if (err)
2047 return err; 2061 return err;
@@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
2085 if (err) 2099 if (err)
2086 goto out; 2100 goto out;
2087 } 2101 }
2088 err = vfs_quota_on_path(sb, type, format_id, &path); 2102 err = dquot_quota_on_path(sb, type, format_id, &path);
2089out: 2103out:
2090 path_put(&path); 2104 path_put(&path);
2091 return err; 2105 return err;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e7cc00e636dc..8c4cf273c672 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -723,11 +723,11 @@ out:
723 (handler) = *(handlers)++) 723 (handler) = *(handlers)++)
724 724
725/* This is the implementation for the xattr plugin infrastructure */ 725/* This is the implementation for the xattr plugin infrastructure */
726static inline struct xattr_handler * 726static inline const struct xattr_handler *
727find_xattr_handler_prefix(struct xattr_handler **handlers, 727find_xattr_handler_prefix(const struct xattr_handler **handlers,
728 const char *name) 728 const char *name)
729{ 729{
730 struct xattr_handler *xah; 730 const struct xattr_handler *xah;
731 731
732 if (!handlers) 732 if (!handlers)
733 return NULL; 733 return NULL;
@@ -748,7 +748,7 @@ ssize_t
748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, 748reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
749 size_t size) 749 size_t size)
750{ 750{
751 struct xattr_handler *handler; 751 const struct xattr_handler *handler;
752 752
753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 753 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
754 754
@@ -767,7 +767,7 @@ int
767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, 767reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
768 size_t size, int flags) 768 size_t size, int flags)
769{ 769{
770 struct xattr_handler *handler; 770 const struct xattr_handler *handler;
771 771
772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 772 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
773 773
@@ -784,7 +784,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
784 */ 784 */
785int reiserfs_removexattr(struct dentry *dentry, const char *name) 785int reiserfs_removexattr(struct dentry *dentry, const char *name)
786{ 786{
787 struct xattr_handler *handler; 787 const struct xattr_handler *handler;
788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); 788 handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name);
789 789
790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) 790 if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
@@ -807,7 +807,7 @@ static int listxattr_filler(void *buf, const char *name, int namelen,
807 size_t size; 807 size_t size;
808 if (name[0] != '.' || 808 if (name[0] != '.' ||
809 (namelen != 1 && (name[1] != '.' || namelen != 2))) { 809 (namelen != 1 && (name[1] != '.' || namelen != 2))) {
810 struct xattr_handler *handler; 810 const struct xattr_handler *handler;
811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, 811 handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
812 name); 812 name);
813 if (!handler) /* Unsupported xattr name */ 813 if (!handler) /* Unsupported xattr name */
@@ -920,7 +920,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
920#endif 920#endif
921 921
922/* Actual operations that are exported to VFS-land */ 922/* Actual operations that are exported to VFS-land */
923struct xattr_handler *reiserfs_xattr_handlers[] = { 923const struct xattr_handler *reiserfs_xattr_handlers[] = {
924#ifdef CONFIG_REISERFS_FS_XATTR 924#ifdef CONFIG_REISERFS_FS_XATTR
925 &reiserfs_xattr_user_handler, 925 &reiserfs_xattr_user_handler,
926 &reiserfs_xattr_trusted_handler, 926 &reiserfs_xattr_trusted_handler,
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 9cdb759645a9..536d697a8a28 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -500,7 +500,7 @@ static size_t posix_acl_access_list(struct dentry *dentry, char *list,
500 return size; 500 return size;
501} 501}
502 502
503struct xattr_handler reiserfs_posix_acl_access_handler = { 503const struct xattr_handler reiserfs_posix_acl_access_handler = {
504 .prefix = POSIX_ACL_XATTR_ACCESS, 504 .prefix = POSIX_ACL_XATTR_ACCESS,
505 .flags = ACL_TYPE_ACCESS, 505 .flags = ACL_TYPE_ACCESS,
506 .get = posix_acl_get, 506 .get = posix_acl_get,
@@ -520,7 +520,7 @@ static size_t posix_acl_default_list(struct dentry *dentry, char *list,
520 return size; 520 return size;
521} 521}
522 522
523struct xattr_handler reiserfs_posix_acl_default_handler = { 523const struct xattr_handler reiserfs_posix_acl_default_handler = {
524 .prefix = POSIX_ACL_XATTR_DEFAULT, 524 .prefix = POSIX_ACL_XATTR_DEFAULT,
525 .flags = ACL_TYPE_DEFAULT, 525 .flags = ACL_TYPE_DEFAULT,
526 .get = posix_acl_get, 526 .get = posix_acl_get,
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 7271a477c041..237c6928d3c6 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -111,7 +111,7 @@ void reiserfs_security_free(struct reiserfs_security_handle *sec)
111 sec->value = NULL; 111 sec->value = NULL;
112} 112}
113 113
114struct xattr_handler reiserfs_xattr_security_handler = { 114const struct xattr_handler reiserfs_xattr_security_handler = {
115 .prefix = XATTR_SECURITY_PREFIX, 115 .prefix = XATTR_SECURITY_PREFIX,
116 .get = security_get, 116 .get = security_get,
117 .set = security_set, 117 .set = security_set,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 5b08aaca3daf..9883736ce3ec 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -48,7 +48,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
48 return len; 48 return len;
49} 49}
50 50
51struct xattr_handler reiserfs_xattr_trusted_handler = { 51const struct xattr_handler reiserfs_xattr_trusted_handler = {
52 .prefix = XATTR_TRUSTED_PREFIX, 52 .prefix = XATTR_TRUSTED_PREFIX,
53 .get = trusted_get, 53 .get = trusted_get,
54 .set = trusted_set, 54 .set = trusted_set,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 75d59c49b911..45ae1a00013a 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -44,7 +44,7 @@ static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
44 return len; 44 return len;
45} 45}
46 46
47struct xattr_handler reiserfs_xattr_user_handler = { 47const struct xattr_handler reiserfs_xattr_user_handler = {
48 .prefix = XATTR_USER_PREFIX, 48 .prefix = XATTR_USER_PREFIX,
49 .get = user_get, 49 .get = user_get,
50 .set = user_set, 50 .set = user_set,
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
37 37
38const struct file_operations smb_dir_operations = 38const struct file_operations smb_dir_operations =
39{ 39{
40 .llseek = generic_file_llseek,
40 .read = generic_read_dir, 41 .read = generic_read_dir,
41 .readdir = smb_readdir, 42 .readdir = smb_readdir,
42 .ioctl = smb_ioctl, 43 .unlocked_ioctl = smb_ioctl,
43 .open = smb_dir_open, 44 .open = smb_dir_open,
44}; 45};
45 46
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
28#include "proto.h" 28#include "proto.h"
29 29
30static int 30static int
31smb_fsync(struct file *file, struct dentry * dentry, int datasync) 31smb_fsync(struct file *file, int datasync)
32{ 32{
33 struct dentry *dentry = file->f_path.dentry;
33 struct smb_sb_info *server = server_from_dentry(dentry); 34 struct smb_sb_info *server = server_from_dentry(dentry);
34 int result; 35 int result;
35 36
@@ -437,7 +438,7 @@ const struct file_operations smb_file_operations =
437 .aio_read = smb_file_aio_read, 438 .aio_read = smb_file_aio_read,
438 .write = do_sync_write, 439 .write = do_sync_write,
439 .aio_write = smb_file_aio_write, 440 .aio_write = smb_file_aio_write,
440 .ioctl = smb_ioctl, 441 .unlocked_ioctl = smb_ioctl,
441 .mmap = smb_file_mmap, 442 .mmap = smb_file_mmap,
442 .open = smb_file_open, 443 .open = smb_file_open,
443 .release = smb_file_release, 444 .release = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
714 error = server->ops->truncate(inode, attr->ia_size); 714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error) 715 if (error)
716 goto out; 716 goto out;
717 error = vmtruncate(inode, attr->ia_size); 717 error = simple_setsize(inode, attr->ia_size);
718 if (error) 718 if (error)
719 goto out; 719 goto out;
720 refresh = 1; 720 refresh = 1;
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highuid.h> 15#include <linux/highuid.h>
16#include <linux/smp_lock.h>
16#include <linux/net.h> 17#include <linux/net.h>
17 18
18#include <linux/smb_fs.h> 19#include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
22 23
23#include "proto.h" 24#include "proto.h"
24 25
25int 26long
26smb_ioctl(struct inode *inode, struct file *filp, 27smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
27 unsigned int cmd, unsigned long arg)
28{ 28{
29 struct smb_sb_info *server = server_from_inode(inode); 29 struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
30 struct smb_conn_opt opt; 30 struct smb_conn_opt opt;
31 int result = -EINVAL; 31 int result = -EINVAL;
32 32
33 lock_kernel();
33 switch (cmd) { 34 switch (cmd) {
34 uid16_t uid16; 35 uid16_t uid16;
35 uid_t uid32; 36 uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
62 default: 63 default:
63 break; 64 break;
64 } 65 }
66 unlock_kernel();
65 67
66 return result; 68 return result;
67} 69}
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
67extern const struct file_operations smb_file_operations; 67extern const struct file_operations smb_file_operations;
68extern const struct inode_operations smb_file_inode_operations; 68extern const struct inode_operations smb_file_inode_operations;
69/* ioctl.c */ 69/* ioctl.c */
70extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg); 70extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
71/* smbiod.c */ 71/* smbiod.c */
72extern void smbiod_wake_up(void); 72extern void smbiod_wake_up(void);
73extern int smbiod_register_server(struct smb_sb_info *server); 73extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/net.h> 16#include <linux/net.h>
17#include <linux/namei.h> 17#include <linux/namei.h>
18#include <linux/slab.h>
19 18
20#include <asm/uaccess.h> 19#include <asm/uaccess.h>
21#include <asm/system.h> 20#include <asm/system.h>
diff --git a/fs/splice.c b/fs/splice.c
index 9313b6124a2e..740e6b9faf7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -193,8 +193,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
193 break; 193 break;
194 } 194 }
195 195
196 if (pipe->nrbufs < PIPE_BUFFERS) { 196 if (pipe->nrbufs < pipe->buffers) {
197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); 197 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
198 struct pipe_buffer *buf = pipe->bufs + newbuf; 198 struct pipe_buffer *buf = pipe->bufs + newbuf;
199 199
200 buf->page = spd->pages[page_nr]; 200 buf->page = spd->pages[page_nr];
@@ -214,7 +214,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
214 214
215 if (!--spd->nr_pages) 215 if (!--spd->nr_pages)
216 break; 216 break;
217 if (pipe->nrbufs < PIPE_BUFFERS) 217 if (pipe->nrbufs < pipe->buffers)
218 continue; 218 continue;
219 219
220 break; 220 break;
@@ -265,6 +265,36 @@ static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
265 page_cache_release(spd->pages[i]); 265 page_cache_release(spd->pages[i]);
266} 266}
267 267
268/*
269 * Check if we need to grow the arrays holding pages and partial page
270 * descriptions.
271 */
272int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
273{
274 if (pipe->buffers <= PIPE_DEF_BUFFERS)
275 return 0;
276
277 spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
278 spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
279
280 if (spd->pages && spd->partial)
281 return 0;
282
283 kfree(spd->pages);
284 kfree(spd->partial);
285 return -ENOMEM;
286}
287
288void splice_shrink_spd(struct pipe_inode_info *pipe,
289 struct splice_pipe_desc *spd)
290{
291 if (pipe->buffers <= PIPE_DEF_BUFFERS)
292 return;
293
294 kfree(spd->pages);
295 kfree(spd->partial);
296}
297
268static int 298static int
269__generic_file_splice_read(struct file *in, loff_t *ppos, 299__generic_file_splice_read(struct file *in, loff_t *ppos,
270 struct pipe_inode_info *pipe, size_t len, 300 struct pipe_inode_info *pipe, size_t len,
@@ -272,8 +302,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
272{ 302{
273 struct address_space *mapping = in->f_mapping; 303 struct address_space *mapping = in->f_mapping;
274 unsigned int loff, nr_pages, req_pages; 304 unsigned int loff, nr_pages, req_pages;
275 struct page *pages[PIPE_BUFFERS]; 305 struct page *pages[PIPE_DEF_BUFFERS];
276 struct partial_page partial[PIPE_BUFFERS]; 306 struct partial_page partial[PIPE_DEF_BUFFERS];
277 struct page *page; 307 struct page *page;
278 pgoff_t index, end_index; 308 pgoff_t index, end_index;
279 loff_t isize; 309 loff_t isize;
@@ -286,15 +316,18 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
286 .spd_release = spd_release_page, 316 .spd_release = spd_release_page,
287 }; 317 };
288 318
319 if (splice_grow_spd(pipe, &spd))
320 return -ENOMEM;
321
289 index = *ppos >> PAGE_CACHE_SHIFT; 322 index = *ppos >> PAGE_CACHE_SHIFT;
290 loff = *ppos & ~PAGE_CACHE_MASK; 323 loff = *ppos & ~PAGE_CACHE_MASK;
291 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 324 req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
292 nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); 325 nr_pages = min(req_pages, pipe->buffers);
293 326
294 /* 327 /*
295 * Lookup the (hopefully) full range of pages we need. 328 * Lookup the (hopefully) full range of pages we need.
296 */ 329 */
297 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); 330 spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
298 index += spd.nr_pages; 331 index += spd.nr_pages;
299 332
300 /* 333 /*
@@ -321,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
321 break; 354 break;
322 355
323 error = add_to_page_cache_lru(page, mapping, index, 356 error = add_to_page_cache_lru(page, mapping, index,
324 mapping_gfp_mask(mapping)); 357 GFP_KERNEL);
325 if (unlikely(error)) { 358 if (unlikely(error)) {
326 page_cache_release(page); 359 page_cache_release(page);
327 if (error == -EEXIST) 360 if (error == -EEXIST)
@@ -335,7 +368,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
335 unlock_page(page); 368 unlock_page(page);
336 } 369 }
337 370
338 pages[spd.nr_pages++] = page; 371 spd.pages[spd.nr_pages++] = page;
339 index++; 372 index++;
340 } 373 }
341 374
@@ -356,7 +389,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
356 * this_len is the max we'll use from this page 389 * this_len is the max we'll use from this page
357 */ 390 */
358 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); 391 this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
359 page = pages[page_nr]; 392 page = spd.pages[page_nr];
360 393
361 if (PageReadahead(page)) 394 if (PageReadahead(page))
362 page_cache_async_readahead(mapping, &in->f_ra, in, 395 page_cache_async_readahead(mapping, &in->f_ra, in,
@@ -393,8 +426,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
393 error = -ENOMEM; 426 error = -ENOMEM;
394 break; 427 break;
395 } 428 }
396 page_cache_release(pages[page_nr]); 429 page_cache_release(spd.pages[page_nr]);
397 pages[page_nr] = page; 430 spd.pages[page_nr] = page;
398 } 431 }
399 /* 432 /*
400 * page was already under io and is now done, great 433 * page was already under io and is now done, great
@@ -451,8 +484,8 @@ fill_it:
451 len = this_len; 484 len = this_len;
452 } 485 }
453 486
454 partial[page_nr].offset = loff; 487 spd.partial[page_nr].offset = loff;
455 partial[page_nr].len = this_len; 488 spd.partial[page_nr].len = this_len;
456 len -= this_len; 489 len -= this_len;
457 loff = 0; 490 loff = 0;
458 spd.nr_pages++; 491 spd.nr_pages++;
@@ -464,12 +497,13 @@ fill_it:
464 * we got, 'nr_pages' is how many pages are in the map. 497 * we got, 'nr_pages' is how many pages are in the map.
465 */ 498 */
466 while (page_nr < nr_pages) 499 while (page_nr < nr_pages)
467 page_cache_release(pages[page_nr++]); 500 page_cache_release(spd.pages[page_nr++]);
468 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 501 in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
469 502
470 if (spd.nr_pages) 503 if (spd.nr_pages)
471 return splice_to_pipe(pipe, &spd); 504 error = splice_to_pipe(pipe, &spd);
472 505
506 splice_shrink_spd(pipe, &spd);
473 return error; 507 return error;
474} 508}
475 509
@@ -560,9 +594,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
560 unsigned int nr_pages; 594 unsigned int nr_pages;
561 unsigned int nr_freed; 595 unsigned int nr_freed;
562 size_t offset; 596 size_t offset;
563 struct page *pages[PIPE_BUFFERS]; 597 struct page *pages[PIPE_DEF_BUFFERS];
564 struct partial_page partial[PIPE_BUFFERS]; 598 struct partial_page partial[PIPE_DEF_BUFFERS];
565 struct iovec vec[PIPE_BUFFERS]; 599 struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
566 pgoff_t index; 600 pgoff_t index;
567 ssize_t res; 601 ssize_t res;
568 size_t this_len; 602 size_t this_len;
@@ -576,11 +610,22 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
576 .spd_release = spd_release_page, 610 .spd_release = spd_release_page,
577 }; 611 };
578 612
613 if (splice_grow_spd(pipe, &spd))
614 return -ENOMEM;
615
616 res = -ENOMEM;
617 vec = __vec;
618 if (pipe->buffers > PIPE_DEF_BUFFERS) {
619 vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
620 if (!vec)
621 goto shrink_ret;
622 }
623
579 index = *ppos >> PAGE_CACHE_SHIFT; 624 index = *ppos >> PAGE_CACHE_SHIFT;
580 offset = *ppos & ~PAGE_CACHE_MASK; 625 offset = *ppos & ~PAGE_CACHE_MASK;
581 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 626 nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
582 627
583 for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { 628 for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
584 struct page *page; 629 struct page *page;
585 630
586 page = alloc_page(GFP_USER); 631 page = alloc_page(GFP_USER);
@@ -591,7 +636,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
591 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); 636 this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
592 vec[i].iov_base = (void __user *) page_address(page); 637 vec[i].iov_base = (void __user *) page_address(page);
593 vec[i].iov_len = this_len; 638 vec[i].iov_len = this_len;
594 pages[i] = page; 639 spd.pages[i] = page;
595 spd.nr_pages++; 640 spd.nr_pages++;
596 len -= this_len; 641 len -= this_len;
597 offset = 0; 642 offset = 0;
@@ -610,11 +655,11 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
610 nr_freed = 0; 655 nr_freed = 0;
611 for (i = 0; i < spd.nr_pages; i++) { 656 for (i = 0; i < spd.nr_pages; i++) {
612 this_len = min_t(size_t, vec[i].iov_len, res); 657 this_len = min_t(size_t, vec[i].iov_len, res);
613 partial[i].offset = 0; 658 spd.partial[i].offset = 0;
614 partial[i].len = this_len; 659 spd.partial[i].len = this_len;
615 if (!this_len) { 660 if (!this_len) {
616 __free_page(pages[i]); 661 __free_page(spd.pages[i]);
617 pages[i] = NULL; 662 spd.pages[i] = NULL;
618 nr_freed++; 663 nr_freed++;
619 } 664 }
620 res -= this_len; 665 res -= this_len;
@@ -625,13 +670,18 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
625 if (res > 0) 670 if (res > 0)
626 *ppos += res; 671 *ppos += res;
627 672
673shrink_ret:
674 if (vec != __vec)
675 kfree(vec);
676 splice_shrink_spd(pipe, &spd);
628 return res; 677 return res;
629 678
630err: 679err:
631 for (i = 0; i < spd.nr_pages; i++) 680 for (i = 0; i < spd.nr_pages; i++)
632 __free_page(pages[i]); 681 __free_page(spd.pages[i]);
633 682
634 return error; 683 res = error;
684 goto shrink_ret;
635} 685}
636EXPORT_SYMBOL(default_file_splice_read); 686EXPORT_SYMBOL(default_file_splice_read);
637 687
@@ -784,7 +834,7 @@ int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
784 if (!buf->len) { 834 if (!buf->len) {
785 buf->ops = NULL; 835 buf->ops = NULL;
786 ops->release(pipe, buf); 836 ops->release(pipe, buf);
787 pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); 837 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
788 pipe->nrbufs--; 838 pipe->nrbufs--;
789 if (pipe->inode) 839 if (pipe->inode)
790 sd->need_wakeup = true; 840 sd->need_wakeup = true;
@@ -1211,7 +1261,7 @@ out_release:
1211 * If we did an incomplete transfer we must release 1261 * If we did an incomplete transfer we must release
1212 * the pipe buffers in question: 1262 * the pipe buffers in question:
1213 */ 1263 */
1214 for (i = 0; i < PIPE_BUFFERS; i++) { 1264 for (i = 0; i < pipe->buffers; i++) {
1215 struct pipe_buffer *buf = pipe->bufs + i; 1265 struct pipe_buffer *buf = pipe->bufs + i;
1216 1266
1217 if (buf->ops) { 1267 if (buf->ops) {
@@ -1371,7 +1421,8 @@ static long do_splice(struct file *in, loff_t __user *off_in,
1371 */ 1421 */
1372static int get_iovec_page_array(const struct iovec __user *iov, 1422static int get_iovec_page_array(const struct iovec __user *iov,
1373 unsigned int nr_vecs, struct page **pages, 1423 unsigned int nr_vecs, struct page **pages,
1374 struct partial_page *partial, int aligned) 1424 struct partial_page *partial, int aligned,
1425 unsigned int pipe_buffers)
1375{ 1426{
1376 int buffers = 0, error = 0; 1427 int buffers = 0, error = 0;
1377 1428
@@ -1414,8 +1465,8 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1414 break; 1465 break;
1415 1466
1416 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1467 npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1417 if (npages > PIPE_BUFFERS - buffers) 1468 if (npages > pipe_buffers - buffers)
1418 npages = PIPE_BUFFERS - buffers; 1469 npages = pipe_buffers - buffers;
1419 1470
1420 error = get_user_pages_fast((unsigned long)base, npages, 1471 error = get_user_pages_fast((unsigned long)base, npages,
1421 0, &pages[buffers]); 1472 0, &pages[buffers]);
@@ -1450,7 +1501,7 @@ static int get_iovec_page_array(const struct iovec __user *iov,
1450 * or if we mapped the max number of pages that we have 1501 * or if we mapped the max number of pages that we have
1451 * room for. 1502 * room for.
1452 */ 1503 */
1453 if (error < npages || buffers == PIPE_BUFFERS) 1504 if (error < npages || buffers == pipe_buffers)
1454 break; 1505 break;
1455 1506
1456 nr_vecs--; 1507 nr_vecs--;
@@ -1593,8 +1644,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1593 unsigned long nr_segs, unsigned int flags) 1644 unsigned long nr_segs, unsigned int flags)
1594{ 1645{
1595 struct pipe_inode_info *pipe; 1646 struct pipe_inode_info *pipe;
1596 struct page *pages[PIPE_BUFFERS]; 1647 struct page *pages[PIPE_DEF_BUFFERS];
1597 struct partial_page partial[PIPE_BUFFERS]; 1648 struct partial_page partial[PIPE_DEF_BUFFERS];
1598 struct splice_pipe_desc spd = { 1649 struct splice_pipe_desc spd = {
1599 .pages = pages, 1650 .pages = pages,
1600 .partial = partial, 1651 .partial = partial,
@@ -1602,17 +1653,25 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1602 .ops = &user_page_pipe_buf_ops, 1653 .ops = &user_page_pipe_buf_ops,
1603 .spd_release = spd_release_page, 1654 .spd_release = spd_release_page,
1604 }; 1655 };
1656 long ret;
1605 1657
1606 pipe = pipe_info(file->f_path.dentry->d_inode); 1658 pipe = pipe_info(file->f_path.dentry->d_inode);
1607 if (!pipe) 1659 if (!pipe)
1608 return -EBADF; 1660 return -EBADF;
1609 1661
1610 spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, 1662 if (splice_grow_spd(pipe, &spd))
1611 flags & SPLICE_F_GIFT); 1663 return -ENOMEM;
1664
1665 spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
1666 spd.partial, flags & SPLICE_F_GIFT,
1667 pipe->buffers);
1612 if (spd.nr_pages <= 0) 1668 if (spd.nr_pages <= 0)
1613 return spd.nr_pages; 1669 ret = spd.nr_pages;
1670 else
1671 ret = splice_to_pipe(pipe, &spd);
1614 1672
1615 return splice_to_pipe(pipe, &spd); 1673 splice_shrink_spd(pipe, &spd);
1674 return ret;
1616} 1675}
1617 1676
1618/* 1677/*
@@ -1738,13 +1797,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1738 * Check ->nrbufs without the inode lock first. This function 1797 * Check ->nrbufs without the inode lock first. This function
1739 * is speculative anyways, so missing one is ok. 1798 * is speculative anyways, so missing one is ok.
1740 */ 1799 */
1741 if (pipe->nrbufs < PIPE_BUFFERS) 1800 if (pipe->nrbufs < pipe->buffers)
1742 return 0; 1801 return 0;
1743 1802
1744 ret = 0; 1803 ret = 0;
1745 pipe_lock(pipe); 1804 pipe_lock(pipe);
1746 1805
1747 while (pipe->nrbufs >= PIPE_BUFFERS) { 1806 while (pipe->nrbufs >= pipe->buffers) {
1748 if (!pipe->readers) { 1807 if (!pipe->readers) {
1749 send_sig(SIGPIPE, current, 0); 1808 send_sig(SIGPIPE, current, 0);
1750 ret = -EPIPE; 1809 ret = -EPIPE;
@@ -1810,7 +1869,7 @@ retry:
1810 * Cannot make any progress, because either the input 1869 * Cannot make any progress, because either the input
1811 * pipe is empty or the output pipe is full. 1870 * pipe is empty or the output pipe is full.
1812 */ 1871 */
1813 if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { 1872 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1814 /* Already processed some buffers, break */ 1873 /* Already processed some buffers, break */
1815 if (ret) 1874 if (ret)
1816 break; 1875 break;
@@ -1831,7 +1890,7 @@ retry:
1831 } 1890 }
1832 1891
1833 ibuf = ipipe->bufs + ipipe->curbuf; 1892 ibuf = ipipe->bufs + ipipe->curbuf;
1834 nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; 1893 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1835 obuf = opipe->bufs + nbuf; 1894 obuf = opipe->bufs + nbuf;
1836 1895
1837 if (len >= ibuf->len) { 1896 if (len >= ibuf->len) {
@@ -1841,7 +1900,7 @@ retry:
1841 *obuf = *ibuf; 1900 *obuf = *ibuf;
1842 ibuf->ops = NULL; 1901 ibuf->ops = NULL;
1843 opipe->nrbufs++; 1902 opipe->nrbufs++;
1844 ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; 1903 ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1845 ipipe->nrbufs--; 1904 ipipe->nrbufs--;
1846 input_wakeup = true; 1905 input_wakeup = true;
1847 } else { 1906 } else {
@@ -1914,11 +1973,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
1914 * If we have iterated all input buffers or ran out of 1973 * If we have iterated all input buffers or ran out of
1915 * output room, break. 1974 * output room, break.
1916 */ 1975 */
1917 if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) 1976 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1918 break; 1977 break;
1919 1978
1920 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); 1979 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1921 nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); 1980 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1922 1981
1923 /* 1982 /*
1924 * Get a reference to this pipe buffer, 1983 * Get a reference to this pipe buffer,
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config SQUASHFS_XATTRS
30 bool "Squashfs XATTR support"
31 depends on SQUASHFS
32 default n
33 help
34 Saying Y here includes support for extended attributes (xattrs).
35 Xattrs are name:value pairs associated with inodes by
36 the kernel or by users (see the attr(5) manual page).
37
38 If unsure, say N.
39
29config SQUASHFS_EMBEDDED 40config SQUASHFS_EMBEDDED
30 41
31 bool "Additional option for memory-constrained systems" 42 bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o 7squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
9
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
91 /* no handler or insuffficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 xattr = le64_to_cpu(xattr);
172 start = SQUASHFS_XATTR_BLK(xattr) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(xattr);
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id look up table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
diff --git a/fs/statfs.c b/fs/statfs.c
new file mode 100644
index 000000000000..4ef021f3b612
--- /dev/null
+++ b/fs/statfs.c
@@ -0,0 +1,196 @@
1#include <linux/syscalls.h>
2#include <linux/module.h>
3#include <linux/fs.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/statfs.h>
7#include <linux/security.h>
8#include <linux/uaccess.h>
9
10int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
11{
12 int retval = -ENODEV;
13
14 if (dentry) {
15 retval = -ENOSYS;
16 if (dentry->d_sb->s_op->statfs) {
17 memset(buf, 0, sizeof(*buf));
18 retval = security_sb_statfs(dentry);
19 if (retval)
20 return retval;
21 retval = dentry->d_sb->s_op->statfs(dentry, buf);
22 if (retval == 0 && buf->f_frsize == 0)
23 buf->f_frsize = buf->f_bsize;
24 }
25 }
26 return retval;
27}
28
29EXPORT_SYMBOL(vfs_statfs);
30
31static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf)
32{
33 struct kstatfs st;
34 int retval;
35
36 retval = vfs_statfs(dentry, &st);
37 if (retval)
38 return retval;
39
40 if (sizeof(*buf) == sizeof(st))
41 memcpy(buf, &st, sizeof(st));
42 else {
43 if (sizeof buf->f_blocks == 4) {
44 if ((st.f_blocks | st.f_bfree | st.f_bavail |
45 st.f_bsize | st.f_frsize) &
46 0xffffffff00000000ULL)
47 return -EOVERFLOW;
48 /*
49 * f_files and f_ffree may be -1; it's okay to stuff
50 * that into 32 bits
51 */
52 if (st.f_files != -1 &&
53 (st.f_files & 0xffffffff00000000ULL))
54 return -EOVERFLOW;
55 if (st.f_ffree != -1 &&
56 (st.f_ffree & 0xffffffff00000000ULL))
57 return -EOVERFLOW;
58 }
59
60 buf->f_type = st.f_type;
61 buf->f_bsize = st.f_bsize;
62 buf->f_blocks = st.f_blocks;
63 buf->f_bfree = st.f_bfree;
64 buf->f_bavail = st.f_bavail;
65 buf->f_files = st.f_files;
66 buf->f_ffree = st.f_ffree;
67 buf->f_fsid = st.f_fsid;
68 buf->f_namelen = st.f_namelen;
69 buf->f_frsize = st.f_frsize;
70 memset(buf->f_spare, 0, sizeof(buf->f_spare));
71 }
72 return 0;
73}
74
75static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf)
76{
77 struct kstatfs st;
78 int retval;
79
80 retval = vfs_statfs(dentry, &st);
81 if (retval)
82 return retval;
83
84 if (sizeof(*buf) == sizeof(st))
85 memcpy(buf, &st, sizeof(st));
86 else {
87 buf->f_type = st.f_type;
88 buf->f_bsize = st.f_bsize;
89 buf->f_blocks = st.f_blocks;
90 buf->f_bfree = st.f_bfree;
91 buf->f_bavail = st.f_bavail;
92 buf->f_files = st.f_files;
93 buf->f_ffree = st.f_ffree;
94 buf->f_fsid = st.f_fsid;
95 buf->f_namelen = st.f_namelen;
96 buf->f_frsize = st.f_frsize;
97 memset(buf->f_spare, 0, sizeof(buf->f_spare));
98 }
99 return 0;
100}
101
102SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
103{
104 struct path path;
105 int error;
106
107 error = user_path(pathname, &path);
108 if (!error) {
109 struct statfs tmp;
110 error = vfs_statfs_native(path.dentry, &tmp);
111 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
112 error = -EFAULT;
113 path_put(&path);
114 }
115 return error;
116}
117
118SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
119{
120 struct path path;
121 long error;
122
123 if (sz != sizeof(*buf))
124 return -EINVAL;
125 error = user_path(pathname, &path);
126 if (!error) {
127 struct statfs64 tmp;
128 error = vfs_statfs64(path.dentry, &tmp);
129 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
130 error = -EFAULT;
131 path_put(&path);
132 }
133 return error;
134}
135
136SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
137{
138 struct file *file;
139 struct statfs tmp;
140 int error;
141
142 error = -EBADF;
143 file = fget(fd);
144 if (!file)
145 goto out;
146 error = vfs_statfs_native(file->f_path.dentry, &tmp);
147 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
148 error = -EFAULT;
149 fput(file);
150out:
151 return error;
152}
153
154SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
155{
156 struct file *file;
157 struct statfs64 tmp;
158 int error;
159
160 if (sz != sizeof(*buf))
161 return -EINVAL;
162
163 error = -EBADF;
164 file = fget(fd);
165 if (!file)
166 goto out;
167 error = vfs_statfs64(file->f_path.dentry, &tmp);
168 if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
169 error = -EFAULT;
170 fput(file);
171out:
172 return error;
173}
174
175SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
176{
177 struct super_block *s;
178 struct ustat tmp;
179 struct kstatfs sbuf;
180 int err;
181
182 s = user_get_super(new_decode_dev(dev));
183 if (!s)
184 return -EINVAL;
185
186 err = vfs_statfs(s->s_root, &sbuf);
187 drop_super(s);
188 if (err)
189 return err;
190
191 memset(&tmp,0,sizeof(struct ustat));
192 tmp.f_tfree = sbuf.f_bfree;
193 tmp.f_tinode = sbuf.f_ffree;
194
195 return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? -EFAULT : 0;
196}
diff --git a/fs/super.c b/fs/super.c
index 1527e6a0ee35..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -22,23 +22,14 @@
22 22
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/init.h>
26#include <linux/smp_lock.h>
27#include <linux/acct.h> 25#include <linux/acct.h>
28#include <linux/blkdev.h> 26#include <linux/blkdev.h>
29#include <linux/quotaops.h>
30#include <linux/namei.h>
31#include <linux/mount.h> 27#include <linux/mount.h>
32#include <linux/security.h> 28#include <linux/security.h>
33#include <linux/syscalls.h>
34#include <linux/vfs.h>
35#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
36#include <linux/idr.h> 30#include <linux/idr.h>
37#include <linux/kobject.h>
38#include <linux/mutex.h> 31#include <linux/mutex.h>
39#include <linux/file.h>
40#include <linux/backing-dev.h> 32#include <linux/backing-dev.h>
41#include <asm/uaccess.h>
42#include "internal.h" 33#include "internal.h"
43 34
44 35
@@ -93,16 +84,15 @@ static struct super_block *alloc_super(struct file_system_type *type)
93 * subclass. 84 * subclass.
94 */ 85 */
95 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); 86 down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);
96 s->s_count = S_BIAS; 87 s->s_count = 1;
97 atomic_set(&s->s_active, 1); 88 atomic_set(&s->s_active, 1);
98 mutex_init(&s->s_vfs_rename_mutex); 89 mutex_init(&s->s_vfs_rename_mutex);
90 lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
99 mutex_init(&s->s_dquot.dqio_mutex); 91 mutex_init(&s->s_dquot.dqio_mutex);
100 mutex_init(&s->s_dquot.dqonoff_mutex); 92 mutex_init(&s->s_dquot.dqonoff_mutex);
101 init_rwsem(&s->s_dquot.dqptr_sem); 93 init_rwsem(&s->s_dquot.dqptr_sem);
102 init_waitqueue_head(&s->s_wait_unfrozen); 94 init_waitqueue_head(&s->s_wait_unfrozen);
103 s->s_maxbytes = MAX_NON_LFS; 95 s->s_maxbytes = MAX_NON_LFS;
104 s->dq_op = sb_dquot_ops;
105 s->s_qcop = sb_quotactl_ops;
106 s->s_op = &default_op; 96 s->s_op = &default_op;
107 s->s_time_gran = 1000000000; 97 s->s_time_gran = 1000000000;
108 } 98 }
@@ -127,39 +117,14 @@ static inline void destroy_super(struct super_block *s)
127/* Superblock refcounting */ 117/* Superblock refcounting */
128 118
129/* 119/*
130 * Drop a superblock's refcount. Returns non-zero if the superblock was 120 * Drop a superblock's refcount. The caller must hold sb_lock.
131 * destroyed. The caller must hold sb_lock.
132 */ 121 */
133static int __put_super(struct super_block *sb) 122void __put_super(struct super_block *sb)
134{ 123{
135 int ret = 0;
136
137 if (!--sb->s_count) { 124 if (!--sb->s_count) {
125 list_del_init(&sb->s_list);
138 destroy_super(sb); 126 destroy_super(sb);
139 ret = 1;
140 }
141 return ret;
142}
143
144/*
145 * Drop a superblock's refcount.
146 * Returns non-zero if the superblock is about to be destroyed and
147 * at least is already removed from super_blocks list, so if we are
148 * making a loop through super blocks then we need to restart.
149 * The caller must hold sb_lock.
150 */
151int __put_super_and_need_restart(struct super_block *sb)
152{
153 /* check for race with generic_shutdown_super() */
154 if (list_empty(&sb->s_list)) {
155 /* super block is removed, need to restart... */
156 __put_super(sb);
157 return 1;
158 } 127 }
159 /* can't be the last, since s_list is still in use */
160 sb->s_count--;
161 BUG_ON(sb->s_count == 0);
162 return 0;
163} 128}
164 129
165/** 130/**
@@ -178,57 +143,47 @@ void put_super(struct super_block *sb)
178 143
179 144
180/** 145/**
181 * deactivate_super - drop an active reference to superblock 146 * deactivate_locked_super - drop an active reference to superblock
182 * @s: superblock to deactivate 147 * @s: superblock to deactivate
183 * 148 *
184 * Drops an active reference to superblock, acquiring a temprory one if 149 * Drops an active reference to superblock, converting it into a temprory
185 * there is no active references left. In that case we lock superblock, 150 * one if there is no other active references left. In that case we
186 * tell fs driver to shut it down and drop the temporary reference we 151 * tell fs driver to shut it down and drop the temporary reference we
187 * had just acquired. 152 * had just acquired.
153 *
154 * Caller holds exclusive lock on superblock; that lock is released.
188 */ 155 */
189void deactivate_super(struct super_block *s) 156void deactivate_locked_super(struct super_block *s)
190{ 157{
191 struct file_system_type *fs = s->s_type; 158 struct file_system_type *fs = s->s_type;
192 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 159 if (atomic_dec_and_test(&s->s_active)) {
193 s->s_count -= S_BIAS-1;
194 spin_unlock(&sb_lock);
195 vfs_dq_off(s, 0);
196 down_write(&s->s_umount);
197 fs->kill_sb(s); 160 fs->kill_sb(s);
198 put_filesystem(fs); 161 put_filesystem(fs);
199 put_super(s); 162 put_super(s);
163 } else {
164 up_write(&s->s_umount);
200 } 165 }
201} 166}
202 167
203EXPORT_SYMBOL(deactivate_super); 168EXPORT_SYMBOL(deactivate_locked_super);
204 169
205/** 170/**
206 * deactivate_locked_super - drop an active reference to superblock 171 * deactivate_super - drop an active reference to superblock
207 * @s: superblock to deactivate 172 * @s: superblock to deactivate
208 * 173 *
209 * Equivalent of up_write(&s->s_umount); deactivate_super(s);, except that 174 * Variant of deactivate_locked_super(), except that superblock is *not*
210 * it does not unlock it until it's all over. As the result, it's safe to 175 * locked by caller. If we are going to drop the final active reference,
211 * use to dispose of new superblock on ->get_sb() failure exits - nobody 176 * lock will be acquired prior to that.
212 * will see the sucker until it's all over. Equivalent using up_write +
213 * deactivate_super is safe for that purpose only if superblock is either
214 * safe to use or has NULL ->s_root when we unlock.
215 */ 177 */
216void deactivate_locked_super(struct super_block *s) 178void deactivate_super(struct super_block *s)
217{ 179{
218 struct file_system_type *fs = s->s_type; 180 if (!atomic_add_unless(&s->s_active, -1, 1)) {
219 if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { 181 down_write(&s->s_umount);
220 s->s_count -= S_BIAS-1; 182 deactivate_locked_super(s);
221 spin_unlock(&sb_lock);
222 vfs_dq_off(s, 0);
223 fs->kill_sb(s);
224 put_filesystem(fs);
225 put_super(s);
226 } else {
227 up_write(&s->s_umount);
228 } 183 }
229} 184}
230 185
231EXPORT_SYMBOL(deactivate_locked_super); 186EXPORT_SYMBOL(deactivate_super);
232 187
233/** 188/**
234 * grab_super - acquire an active reference 189 * grab_super - acquire an active reference
@@ -243,22 +198,17 @@ EXPORT_SYMBOL(deactivate_locked_super);
243 */ 198 */
244static int grab_super(struct super_block *s) __releases(sb_lock) 199static int grab_super(struct super_block *s) __releases(sb_lock)
245{ 200{
201 if (atomic_inc_not_zero(&s->s_active)) {
202 spin_unlock(&sb_lock);
203 return 1;
204 }
205 /* it's going away */
246 s->s_count++; 206 s->s_count++;
247 spin_unlock(&sb_lock); 207 spin_unlock(&sb_lock);
208 /* wait for it to die */
248 down_write(&s->s_umount); 209 down_write(&s->s_umount);
249 if (s->s_root) {
250 spin_lock(&sb_lock);
251 if (s->s_count > S_BIAS) {
252 atomic_inc(&s->s_active);
253 s->s_count--;
254 spin_unlock(&sb_lock);
255 return 1;
256 }
257 spin_unlock(&sb_lock);
258 }
259 up_write(&s->s_umount); 210 up_write(&s->s_umount);
260 put_super(s); 211 put_super(s);
261 yield();
262 return 0; 212 return 0;
263} 213}
264 214
@@ -321,8 +271,7 @@ void generic_shutdown_super(struct super_block *sb)
321 } 271 }
322 spin_lock(&sb_lock); 272 spin_lock(&sb_lock);
323 /* should be initialized for __put_super_and_need_restart() */ 273 /* should be initialized for __put_super_and_need_restart() */
324 list_del_init(&sb->s_list); 274 list_del_init(&sb->s_instances);
325 list_del(&sb->s_instances);
326 spin_unlock(&sb_lock); 275 spin_unlock(&sb_lock);
327 up_write(&sb->s_umount); 276 up_write(&sb->s_umount);
328} 277}
@@ -357,6 +306,7 @@ retry:
357 up_write(&s->s_umount); 306 up_write(&s->s_umount);
358 destroy_super(s); 307 destroy_super(s);
359 } 308 }
309 down_write(&old->s_umount);
360 return old; 310 return old;
361 } 311 }
362 } 312 }
@@ -408,11 +358,12 @@ EXPORT_SYMBOL(drop_super);
408 */ 358 */
409void sync_supers(void) 359void sync_supers(void)
410{ 360{
411 struct super_block *sb; 361 struct super_block *sb, *n;
412 362
413 spin_lock(&sb_lock); 363 spin_lock(&sb_lock);
414restart: 364 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
415 list_for_each_entry(sb, &super_blocks, s_list) { 365 if (list_empty(&sb->s_instances))
366 continue;
416 if (sb->s_op->write_super && sb->s_dirt) { 367 if (sb->s_op->write_super && sb->s_dirt) {
417 sb->s_count++; 368 sb->s_count++;
418 spin_unlock(&sb_lock); 369 spin_unlock(&sb_lock);
@@ -423,14 +374,43 @@ restart:
423 up_read(&sb->s_umount); 374 up_read(&sb->s_umount);
424 375
425 spin_lock(&sb_lock); 376 spin_lock(&sb_lock);
426 if (__put_super_and_need_restart(sb)) 377 __put_super(sb);
427 goto restart;
428 } 378 }
429 } 379 }
430 spin_unlock(&sb_lock); 380 spin_unlock(&sb_lock);
431} 381}
432 382
433/** 383/**
384 * iterate_supers - call function for all active superblocks
385 * @f: function to call
386 * @arg: argument to pass to it
387 *
388 * Scans the superblock list and calls given function, passing it
389 * locked superblock and given argument.
390 */
391void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
392{
393 struct super_block *sb, *n;
394
395 spin_lock(&sb_lock);
396 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
397 if (list_empty(&sb->s_instances))
398 continue;
399 sb->s_count++;
400 spin_unlock(&sb_lock);
401
402 down_read(&sb->s_umount);
403 if (sb->s_root)
404 f(sb, arg);
405 up_read(&sb->s_umount);
406
407 spin_lock(&sb_lock);
408 __put_super(sb);
409 }
410 spin_unlock(&sb_lock);
411}
412
413/**
434 * get_super - get the superblock of a device 414 * get_super - get the superblock of a device
435 * @bdev: device to get the superblock for 415 * @bdev: device to get the superblock for
436 * 416 *
@@ -438,7 +418,7 @@ restart:
438 * mounted on the device given. %NULL is returned if no match is found. 418 * mounted on the device given. %NULL is returned if no match is found.
439 */ 419 */
440 420
441struct super_block * get_super(struct block_device *bdev) 421struct super_block *get_super(struct block_device *bdev)
442{ 422{
443 struct super_block *sb; 423 struct super_block *sb;
444 424
@@ -448,17 +428,20 @@ struct super_block * get_super(struct block_device *bdev)
448 spin_lock(&sb_lock); 428 spin_lock(&sb_lock);
449rescan: 429rescan:
450 list_for_each_entry(sb, &super_blocks, s_list) { 430 list_for_each_entry(sb, &super_blocks, s_list) {
431 if (list_empty(&sb->s_instances))
432 continue;
451 if (sb->s_bdev == bdev) { 433 if (sb->s_bdev == bdev) {
452 sb->s_count++; 434 sb->s_count++;
453 spin_unlock(&sb_lock); 435 spin_unlock(&sb_lock);
454 down_read(&sb->s_umount); 436 down_read(&sb->s_umount);
437 /* still alive? */
455 if (sb->s_root) 438 if (sb->s_root)
456 return sb; 439 return sb;
457 up_read(&sb->s_umount); 440 up_read(&sb->s_umount);
458 /* restart only when sb is no longer on the list */ 441 /* nope, got unmounted */
459 spin_lock(&sb_lock); 442 spin_lock(&sb_lock);
460 if (__put_super_and_need_restart(sb)) 443 __put_super(sb);
461 goto rescan; 444 goto rescan;
462 } 445 }
463 } 446 }
464 spin_unlock(&sb_lock); 447 spin_unlock(&sb_lock);
@@ -473,7 +456,7 @@ EXPORT_SYMBOL(get_super);
473 * 456 *
474 * Scans the superblock list and finds the superblock of the file system 457 * Scans the superblock list and finds the superblock of the file system
475 * mounted on the device given. Returns the superblock with an active 458 * mounted on the device given. Returns the superblock with an active
476 * reference and s_umount held exclusively or %NULL if none was found. 459 * reference or %NULL if none was found.
477 */ 460 */
478struct super_block *get_active_super(struct block_device *bdev) 461struct super_block *get_active_super(struct block_device *bdev)
479{ 462{
@@ -482,81 +465,49 @@ struct super_block *get_active_super(struct block_device *bdev)
482 if (!bdev) 465 if (!bdev)
483 return NULL; 466 return NULL;
484 467
468restart:
485 spin_lock(&sb_lock); 469 spin_lock(&sb_lock);
486 list_for_each_entry(sb, &super_blocks, s_list) { 470 list_for_each_entry(sb, &super_blocks, s_list) {
487 if (sb->s_bdev != bdev) 471 if (list_empty(&sb->s_instances))
488 continue; 472 continue;
489 473 if (sb->s_bdev == bdev) {
490 sb->s_count++; 474 if (grab_super(sb)) /* drops sb_lock */
491 spin_unlock(&sb_lock);
492 down_write(&sb->s_umount);
493 if (sb->s_root) {
494 spin_lock(&sb_lock);
495 if (sb->s_count > S_BIAS) {
496 atomic_inc(&sb->s_active);
497 sb->s_count--;
498 spin_unlock(&sb_lock);
499 return sb; 475 return sb;
500 } 476 else
501 spin_unlock(&sb_lock); 477 goto restart;
502 } 478 }
503 up_write(&sb->s_umount);
504 put_super(sb);
505 yield();
506 spin_lock(&sb_lock);
507 } 479 }
508 spin_unlock(&sb_lock); 480 spin_unlock(&sb_lock);
509 return NULL; 481 return NULL;
510} 482}
511 483
512struct super_block * user_get_super(dev_t dev) 484struct super_block *user_get_super(dev_t dev)
513{ 485{
514 struct super_block *sb; 486 struct super_block *sb;
515 487
516 spin_lock(&sb_lock); 488 spin_lock(&sb_lock);
517rescan: 489rescan:
518 list_for_each_entry(sb, &super_blocks, s_list) { 490 list_for_each_entry(sb, &super_blocks, s_list) {
491 if (list_empty(&sb->s_instances))
492 continue;
519 if (sb->s_dev == dev) { 493 if (sb->s_dev == dev) {
520 sb->s_count++; 494 sb->s_count++;
521 spin_unlock(&sb_lock); 495 spin_unlock(&sb_lock);
522 down_read(&sb->s_umount); 496 down_read(&sb->s_umount);
497 /* still alive? */
523 if (sb->s_root) 498 if (sb->s_root)
524 return sb; 499 return sb;
525 up_read(&sb->s_umount); 500 up_read(&sb->s_umount);
526 /* restart only when sb is no longer on the list */ 501 /* nope, got unmounted */
527 spin_lock(&sb_lock); 502 spin_lock(&sb_lock);
528 if (__put_super_and_need_restart(sb)) 503 __put_super(sb);
529 goto rescan; 504 goto rescan;
530 } 505 }
531 } 506 }
532 spin_unlock(&sb_lock); 507 spin_unlock(&sb_lock);
533 return NULL; 508 return NULL;
534} 509}
535 510
536SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
537{
538 struct super_block *s;
539 struct ustat tmp;
540 struct kstatfs sbuf;
541 int err = -EINVAL;
542
543 s = user_get_super(new_decode_dev(dev));
544 if (s == NULL)
545 goto out;
546 err = vfs_statfs(s->s_root, &sbuf);
547 drop_super(s);
548 if (err)
549 goto out;
550
551 memset(&tmp,0,sizeof(struct ustat));
552 tmp.f_tfree = sbuf.f_bfree;
553 tmp.f_tinode = sbuf.f_ffree;
554
555 err = copy_to_user(ubuf,&tmp,sizeof(struct ustat)) ? -EFAULT : 0;
556out:
557 return err;
558}
559
560/** 511/**
561 * do_remount_sb - asks filesystem to change mount options. 512 * do_remount_sb - asks filesystem to change mount options.
562 * @sb: superblock in question 513 * @sb: superblock in question
@@ -569,7 +520,7 @@ out:
569int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 520int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
570{ 521{
571 int retval; 522 int retval;
572 int remount_rw, remount_ro; 523 int remount_ro;
573 524
574 if (sb->s_frozen != SB_UNFROZEN) 525 if (sb->s_frozen != SB_UNFROZEN)
575 return -EBUSY; 526 return -EBUSY;
@@ -585,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
585 sync_filesystem(sb); 536 sync_filesystem(sb);
586 537
587 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 538 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
588 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
589 539
590 /* If we are remounting RDONLY and current sb is read/write, 540 /* If we are remounting RDONLY and current sb is read/write,
591 make sure there are no rw files opened */ 541 make sure there are no rw files opened */
@@ -594,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
594 mark_files_ro(sb); 544 mark_files_ro(sb);
595 else if (!fs_may_remount_ro(sb)) 545 else if (!fs_may_remount_ro(sb))
596 return -EBUSY; 546 return -EBUSY;
597 retval = vfs_dq_off(sb, 1);
598 if (retval < 0 && retval != -ENOSYS)
599 return -EBUSY;
600 } 547 }
601 548
602 if (sb->s_op->remount_fs) { 549 if (sb->s_op->remount_fs) {
@@ -605,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
605 return retval; 552 return retval;
606 } 553 }
607 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 554 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
608 if (remount_rw) 555
609 vfs_dq_quota_on_remount(sb);
610 /* 556 /*
611 * Some filesystems modify their metadata via some other path than the 557 * Some filesystems modify their metadata via some other path than the
612 * bdev buffer cache (eg. use a private mapping, or directories in 558 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -622,24 +568,24 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
622 568
623static void do_emergency_remount(struct work_struct *work) 569static void do_emergency_remount(struct work_struct *work)
624{ 570{
625 struct super_block *sb; 571 struct super_block *sb, *n;
626 572
627 spin_lock(&sb_lock); 573 spin_lock(&sb_lock);
628 list_for_each_entry(sb, &super_blocks, s_list) { 574 list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
575 if (list_empty(&sb->s_instances))
576 continue;
629 sb->s_count++; 577 sb->s_count++;
630 spin_unlock(&sb_lock); 578 spin_unlock(&sb_lock);
631 down_write(&sb->s_umount); 579 down_write(&sb->s_umount);
632 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) { 580 if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
633 /* 581 /*
634 * ->remount_fs needs lock_kernel().
635 *
636 * What lock protects sb->s_flags?? 582 * What lock protects sb->s_flags??
637 */ 583 */
638 do_remount_sb(sb, MS_RDONLY, NULL, 1); 584 do_remount_sb(sb, MS_RDONLY, NULL, 1);
639 } 585 }
640 up_write(&sb->s_umount); 586 up_write(&sb->s_umount);
641 put_super(sb);
642 spin_lock(&sb_lock); 587 spin_lock(&sb_lock);
588 __put_super(sb);
643 } 589 }
644 spin_unlock(&sb_lock); 590 spin_unlock(&sb_lock);
645 kfree(work); 591 kfree(work);
@@ -990,6 +936,96 @@ out:
990 936
991EXPORT_SYMBOL_GPL(vfs_kern_mount); 937EXPORT_SYMBOL_GPL(vfs_kern_mount);
992 938
939/**
940 * freeze_super - lock the filesystem and force it into a consistent state
941 * @sb: the super to lock
942 *
943 * Syncs the super to make sure the filesystem is consistent and calls the fs's
944 * freeze_fs. Subsequent calls to this without first thawing the fs will return
945 * -EBUSY.
946 */
947int freeze_super(struct super_block *sb)
948{
949 int ret;
950
951 atomic_inc(&sb->s_active);
952 down_write(&sb->s_umount);
953 if (sb->s_frozen) {
954 deactivate_locked_super(sb);
955 return -EBUSY;
956 }
957
958 if (sb->s_flags & MS_RDONLY) {
959 sb->s_frozen = SB_FREEZE_TRANS;
960 smp_wmb();
961 up_write(&sb->s_umount);
962 return 0;
963 }
964
965 sb->s_frozen = SB_FREEZE_WRITE;
966 smp_wmb();
967
968 sync_filesystem(sb);
969
970 sb->s_frozen = SB_FREEZE_TRANS;
971 smp_wmb();
972
973 sync_blockdev(sb->s_bdev);
974 if (sb->s_op->freeze_fs) {
975 ret = sb->s_op->freeze_fs(sb);
976 if (ret) {
977 printk(KERN_ERR
978 "VFS:Filesystem freeze failed\n");
979 sb->s_frozen = SB_UNFROZEN;
980 deactivate_locked_super(sb);
981 return ret;
982 }
983 }
984 up_write(&sb->s_umount);
985 return 0;
986}
987EXPORT_SYMBOL(freeze_super);
988
989/**
990 * thaw_super -- unlock filesystem
991 * @sb: the super to thaw
992 *
993 * Unlocks the filesystem and marks it writeable again after freeze_super().
994 */
995int thaw_super(struct super_block *sb)
996{
997 int error;
998
999 down_write(&sb->s_umount);
1000 if (sb->s_frozen == SB_UNFROZEN) {
1001 up_write(&sb->s_umount);
1002 return -EINVAL;
1003 }
1004
1005 if (sb->s_flags & MS_RDONLY)
1006 goto out;
1007
1008 if (sb->s_op->unfreeze_fs) {
1009 error = sb->s_op->unfreeze_fs(sb);
1010 if (error) {
1011 printk(KERN_ERR
1012 "VFS:Filesystem thaw failed\n");
1013 sb->s_frozen = SB_FREEZE_TRANS;
1014 up_write(&sb->s_umount);
1015 return error;
1016 }
1017 }
1018
1019out:
1020 sb->s_frozen = SB_UNFROZEN;
1021 smp_wmb();
1022 wake_up(&sb->s_wait_unfrozen);
1023 deactivate_locked_super(sb);
1024
1025 return 0;
1026}
1027EXPORT_SYMBOL(thaw_super);
1028
993static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype) 1029static struct vfsmount *fs_set_subtype(struct vfsmount *mnt, const char *fstype)
994{ 1030{
995 int err; 1031 int err;
diff --git a/fs/sync.c b/fs/sync.c
index 92b228176f7c..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -77,50 +77,18 @@ int sync_filesystem(struct super_block *sb)
77} 77}
78EXPORT_SYMBOL_GPL(sync_filesystem); 78EXPORT_SYMBOL_GPL(sync_filesystem);
79 79
80static void sync_one_sb(struct super_block *sb, void *arg)
81{
82 if (!(sb->s_flags & MS_RDONLY) && sb->s_bdi)
83 __sync_filesystem(sb, *(int *)arg);
84}
80/* 85/*
81 * Sync all the data for all the filesystems (called by sys_sync() and 86 * Sync all the data for all the filesystems (called by sys_sync() and
82 * emergency sync) 87 * emergency sync)
83 *
84 * This operation is careful to avoid the livelock which could easily happen
85 * if two or more filesystems are being continuously dirtied. s_need_sync
86 * is used only here. We set it against all filesystems and then clear it as
87 * we sync them. So redirtied filesystems are skipped.
88 *
89 * But if process A is currently running sync_filesystems and then process B
90 * calls sync_filesystems as well, process B will set all the s_need_sync
91 * flags again, which will cause process A to resync everything. Fix that with
92 * a local mutex.
93 */ 88 */
94static void sync_filesystems(int wait) 89static void sync_filesystems(int wait)
95{ 90{
96 struct super_block *sb; 91 iterate_supers(sync_one_sb, &wait);
97 static DEFINE_MUTEX(mutex);
98
99 mutex_lock(&mutex); /* Could be down_interruptible */
100 spin_lock(&sb_lock);
101 list_for_each_entry(sb, &super_blocks, s_list)
102 sb->s_need_sync = 1;
103
104restart:
105 list_for_each_entry(sb, &super_blocks, s_list) {
106 if (!sb->s_need_sync)
107 continue;
108 sb->s_need_sync = 0;
109 sb->s_count++;
110 spin_unlock(&sb_lock);
111
112 down_read(&sb->s_umount);
113 if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
114 __sync_filesystem(sb, wait);
115 up_read(&sb->s_umount);
116
117 /* restart only when sb is no longer on the list */
118 spin_lock(&sb_lock);
119 if (__put_super_and_need_restart(sb))
120 goto restart;
121 }
122 spin_unlock(&sb_lock);
123 mutex_unlock(&mutex);
124} 92}
125 93
126/* 94/*
@@ -162,12 +130,10 @@ void emergency_sync(void)
162 130
163/* 131/*
164 * Generic function to fsync a file. 132 * Generic function to fsync a file.
165 *
166 * filp may be NULL if called via the msync of a vma.
167 */ 133 */
168int file_fsync(struct file *filp, struct dentry *dentry, int datasync) 134int file_fsync(struct file *filp, int datasync)
169{ 135{
170 struct inode * inode = dentry->d_inode; 136 struct inode *inode = filp->f_mapping->host;
171 struct super_block * sb; 137 struct super_block * sb;
172 int ret, err; 138 int ret, err;
173 139
@@ -190,7 +156,6 @@ EXPORT_SYMBOL(file_fsync);
190/** 156/**
191 * vfs_fsync_range - helper to sync a range of data & metadata to disk 157 * vfs_fsync_range - helper to sync a range of data & metadata to disk
192 * @file: file to sync 158 * @file: file to sync
193 * @dentry: dentry of @file
194 * @start: offset in bytes of the beginning of data range to sync 159 * @start: offset in bytes of the beginning of data range to sync
195 * @end: offset in bytes of the end of data range (inclusive) 160 * @end: offset in bytes of the end of data range (inclusive)
196 * @datasync: perform only datasync 161 * @datasync: perform only datasync
@@ -198,32 +163,13 @@ EXPORT_SYMBOL(file_fsync);
198 * Write back data in range @start..@end and metadata for @file to disk. If 163 * Write back data in range @start..@end and metadata for @file to disk. If
199 * @datasync is set only metadata needed to access modified file data is 164 * @datasync is set only metadata needed to access modified file data is
200 * written. 165 * written.
201 *
202 * In case this function is called from nfsd @file may be %NULL and
203 * only @dentry is set. This can only happen when the filesystem
204 * implements the export_operations API.
205 */ 166 */
206int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, 167int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
207 loff_t end, int datasync)
208{ 168{
209 const struct file_operations *fop; 169 struct address_space *mapping = file->f_mapping;
210 struct address_space *mapping;
211 int err, ret; 170 int err, ret;
212 171
213 /* 172 if (!file->f_op || !file->f_op->fsync) {
214 * Get mapping and operations from the file in case we have
215 * as file, or get the default values for them in case we
216 * don't have a struct file available. Damn nfsd..
217 */
218 if (file) {
219 mapping = file->f_mapping;
220 fop = file->f_op;
221 } else {
222 mapping = dentry->d_inode->i_mapping;
223 fop = dentry->d_inode->i_fop;
224 }
225
226 if (!fop || !fop->fsync) {
227 ret = -EINVAL; 173 ret = -EINVAL;
228 goto out; 174 goto out;
229 } 175 }
@@ -235,7 +181,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start,
235 * livelocks in fsync_buffers_list(). 181 * livelocks in fsync_buffers_list().
236 */ 182 */
237 mutex_lock(&mapping->host->i_mutex); 183 mutex_lock(&mapping->host->i_mutex);
238 err = fop->fsync(file, dentry, datasync); 184 err = file->f_op->fsync(file, datasync);
239 if (!ret) 185 if (!ret)
240 ret = err; 186 ret = err;
241 mutex_unlock(&mapping->host->i_mutex); 187 mutex_unlock(&mapping->host->i_mutex);
@@ -248,19 +194,14 @@ EXPORT_SYMBOL(vfs_fsync_range);
248/** 194/**
249 * vfs_fsync - perform a fsync or fdatasync on a file 195 * vfs_fsync - perform a fsync or fdatasync on a file
250 * @file: file to sync 196 * @file: file to sync
251 * @dentry: dentry of @file
252 * @datasync: only perform a fdatasync operation 197 * @datasync: only perform a fdatasync operation
253 * 198 *
254 * Write back data and metadata for @file to disk. If @datasync is 199 * Write back data and metadata for @file to disk. If @datasync is
255 * set only metadata needed to access modified file data is written. 200 * set only metadata needed to access modified file data is written.
256 *
257 * In case this function is called from nfsd @file may be %NULL and
258 * only @dentry is set. This can only happen when the filesystem
259 * implements the export_operations API.
260 */ 201 */
261int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) 202int vfs_fsync(struct file *file, int datasync)
262{ 203{
263 return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); 204 return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
264} 205}
265EXPORT_SYMBOL(vfs_fsync); 206EXPORT_SYMBOL(vfs_fsync);
266 207
@@ -271,7 +212,7 @@ static int do_fsync(unsigned int fd, int datasync)
271 212
272 file = fget(fd); 213 file = fget(fd);
273 if (file) { 214 if (file) {
274 ret = vfs_fsync(file, file->f_path.dentry, datasync); 215 ret = vfs_fsync(file, datasync);
275 fput(file); 216 fput(file);
276 } 217 }
277 return ret; 218 return ret;
@@ -299,8 +240,7 @@ int generic_write_sync(struct file *file, loff_t pos, loff_t count)
299{ 240{
300 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) 241 if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
301 return 0; 242 return 0;
302 return vfs_fsync_range(file, file->f_path.dentry, pos, 243 return vfs_fsync_range(file, pos, pos + count - 1,
303 pos + count - 1,
304 (file->f_flags & __O_SYNC) ? 0 : 1); 244 (file->f_flags & __O_SYNC) ? 0 : 1);
305} 245}
306EXPORT_SYMBOL(generic_write_sync); 246EXPORT_SYMBOL(generic_write_sync);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bbd77e95cf7f..0835a3b70e03 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 error = sysfs_sd_setattr(sd, iattr);
121
122 error = inode_setattr(inode, iattr);
123 if (error) 121 if (error)
124 goto out; 122 goto out;
125 123
126 error = sysfs_sd_setattr(sd, iattr); 124 /* this ignores size changes */
125 generic_setattr(inode, iattr);
126
127out: 127out:
128 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
129 return error; 129 return error;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c
index 241e9765cfad..bbd69bdb0fa8 100644
--- a/fs/sysv/ialloc.c
+++ b/fs/sysv/ialloc.c
@@ -159,15 +159,7 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); 159 *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count);
160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); 160 fs16_add(sbi, sbi->s_sb_total_free_inodes, -1);
161 dirty_sb(sb); 161 dirty_sb(sb);
162 162 inode_init_owner(inode, dir, mode);
163 if (dir->i_mode & S_ISGID) {
164 inode->i_gid = dir->i_gid;
165 if (S_ISDIR(mode))
166 mode |= S_ISGID;
167 } else
168 inode->i_gid = current_fsgid();
169
170 inode->i_uid = current_fsuid();
171 inode->i_ino = fs16_to_cpu(sbi, ino); 163 inode->i_ino = fs16_to_cpu(sbi, ino);
172 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 164 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
173 inode->i_blocks = 0; 165 inode->i_blocks = 0;
@@ -176,7 +168,6 @@ struct inode * sysv_new_inode(const struct inode * dir, mode_t mode)
176 insert_inode_hash(inode); 168 insert_inode_hash(inode);
177 mark_inode_dirty(inode); 169 mark_inode_dirty(inode);
178 170
179 inode->i_mode = mode; /* for sysv_write_inode() */
180 sysv_write_inode(inode, 0); /* ensure inode not allocated again */ 171 sysv_write_inode(inode, 0); /* ensure inode not allocated again */
181 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ 172 mark_inode_dirty(inode); /* cleared by sysv_write_inode() */
182 /* That's it. */ 173 /* That's it. */
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 401e503d44a1..87ebcce72213 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -104,14 +104,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
104 */ 104 */
105 inode->i_flags |= (S_NOCMTIME); 105 inode->i_flags |= (S_NOCMTIME);
106 106
107 inode->i_uid = current_fsuid(); 107 inode_init_owner(inode, dir, mode);
108 if (dir->i_mode & S_ISGID) {
109 inode->i_gid = dir->i_gid;
110 if (S_ISDIR(mode))
111 mode |= S_ISGID;
112 } else
113 inode->i_gid = current_fsgid();
114 inode->i_mode = mode;
115 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
116 ubifs_current_time(inode); 109 ubifs_current_time(inode);
117 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len)
967 * the page locked, and it locks @ui_mutex. However, write-back does take inode 967 * the page locked, and it locks @ui_mutex. However, write-back does take inode
968 * @i_mutex, which means other VFS operations may be run on this inode at the 968 * @i_mutex, which means other VFS operations may be run on this inode at the
969 * same time. And the problematic one is truncation to smaller size, from where 969 * same time. And the problematic one is truncation to smaller size, from where
970 * we have to call 'vmtruncate()', which first changes @inode->i_size, then 970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then
971 * drops the truncated pages. And while dropping the pages, it takes the page 971 * drops the truncated pages. And while dropping the pages, it takes the page
972 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with 972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
974 * means that @inode->i_size is changed while @ui_mutex is unlocked. 974 * means that @inode->i_size is changed while @ui_mutex is unlocked.
975 * 975 *
976 * XXX: with the new truncate the above is not true anymore, the simple_setsize
977 * calls can be replaced with the individual components.
978 *
976 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
977 * inode size. How do we do this if @inode->i_size may became smaller while we 980 * inode size. How do we do this if @inode->i_size may became smaller while we
978 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 981 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1125 budgeted = 0; 1128 budgeted = 0;
1126 } 1129 }
1127 1130
1128 err = vmtruncate(inode, new_size); 1131 err = simple_setsize(inode, new_size);
1129 if (err) 1132 if (err)
1130 goto out_budg; 1133 goto out_budg;
1131 1134
@@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1214 1217
1215 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1216 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1217 err = vmtruncate(inode, new_size); 1220 err = simple_setsize(inode, new_size);
1218 if (err) 1221 if (err)
1219 goto out; 1222 goto out;
1220 } 1223 }
@@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1223 if (attr->ia_valid & ATTR_SIZE) { 1226 if (attr->ia_valid & ATTR_SIZE) {
1224 /* Truncation changes inode [mc]time */ 1227 /* Truncation changes inode [mc]time */
1225 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1226 /* 'vmtruncate()' changed @i_size, update @ui_size */ 1229 /* 'simple_setsize()' changed @i_size, update @ui_size */
1227 ui->ui_size = inode->i_size; 1230 ui->ui_size = inode->i_size;
1228 } 1231 }
1229 1232
@@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
1304 return NULL; 1307 return NULL;
1305} 1308}
1306 1309
1307int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1310int ubifs_fsync(struct file *file, int datasync)
1308{ 1311{
1309 struct inode *inode = dentry->d_inode; 1312 struct inode *inode = file->f_mapping->host;
1310 struct ubifs_info *c = inode->i_sb->s_fs_info; 1313 struct ubifs_info *c = inode->i_sb->s_fs_info;
1311 int err; 1314 int err;
1312 1315
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
381 * make sure @inode->i_size is always changed under @ui_mutex, because it 381 * make sure @inode->i_size is always changed under @ui_mutex, because it
382 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock 382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
385 * could consider to rework locking and base it on "shadow" fields. 385 * could consider to rework locking and base it on "shadow" fields.
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1678int ubifs_calc_dark(const struct ubifs_info *c, int spc); 1678int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1679
1680/* file.c */ 1680/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1681int ubifs_fsync(struct file *file, int datasync);
1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr); 1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1683 1683
1684/* dir.c */ 1684/* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23 23
24#include <linux/quotaops.h>
25#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
26#include <linux/bitops.h> 25#include <linux/bitops.h>
27 26
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
159 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
160 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
161 } else { 160 } else {
162 if (inode)
163 dquot_free_block(inode, 1);
164 udf_add_free_space(sb, sbi->s_partition, 1); 161 udf_add_free_space(sb, sbi->s_partition, 1);
165 } 162 }
166 } 163 }
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
210 bit = block % (sb->s_blocksize << 3); 207 bit = block % (sb->s_blocksize << 3);
211 208
212 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 209 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
213 if (!udf_test_bit(bit, bh->b_data)) 210 if (!udf_clear_bit(bit, bh->b_data))
214 goto out; 211 goto out;
215 else if (dquot_prealloc_block(inode, 1))
216 goto out;
217 else if (!udf_clear_bit(bit, bh->b_data)) {
218 udf_debug("bit already cleared for block %d\n", bit);
219 dquot_free_block(inode, 1);
220 goto out;
221 }
222 block_count--; 212 block_count--;
223 alloc_count++; 213 alloc_count++;
224 bit++; 214 bit++;
@@ -338,20 +328,6 @@ search_back:
338 } 328 }
339 329
340got_block: 330got_block:
341
342 /*
343 * Check quota for allocation of this block.
344 */
345 if (inode) {
346 int ret = dquot_alloc_block(inode, 1);
347
348 if (ret) {
349 mutex_unlock(&sbi->s_alloc_mutex);
350 *err = ret;
351 return 0;
352 }
353 }
354
355 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 331 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
356 (sizeof(struct spaceBitmapDesc) << 3); 332 (sizeof(struct spaceBitmapDesc) << 3);
357 333
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
401 } 377 }
402 378
403 iinfo = UDF_I(table); 379 iinfo = UDF_I(table);
404 /* We do this up front - There are some error conditions that
405 could occure, but.. oh well */
406 if (inode)
407 dquot_free_block(inode, count);
408 udf_add_free_space(sb, sbi->s_partition, count); 380 udf_add_free_space(sb, sbi->s_partition, count);
409 381
410 start = bloc->logicalBlockNum + offset; 382 start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
649 epos.offset -= adsize; 621 epos.offset -= adsize;
650 622
651 alloc_count = (elen >> sb->s_blocksize_bits); 623 alloc_count = (elen >> sb->s_blocksize_bits);
652 if (inode && dquot_prealloc_block(inode, 624 if (alloc_count > block_count) {
653 alloc_count > block_count ? block_count : alloc_count))
654 alloc_count = 0;
655 else if (alloc_count > block_count) {
656 alloc_count = block_count; 625 alloc_count = block_count;
657 eloc.logicalBlockNum += alloc_count; 626 eloc.logicalBlockNum += alloc_count;
658 elen -= (alloc_count << sb->s_blocksize_bits); 627 elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
752 newblock = goal_eloc.logicalBlockNum; 721 newblock = goal_eloc.logicalBlockNum;
753 goal_eloc.logicalBlockNum++; 722 goal_eloc.logicalBlockNum++;
754 goal_elen -= sb->s_blocksize; 723 goal_elen -= sb->s_blocksize;
755 if (inode) {
756 *err = dquot_alloc_block(inode, 1);
757 if (*err) {
758 brelse(goal_epos.bh);
759 mutex_unlock(&sbi->s_alloc_mutex);
760 return 0;
761 }
762 }
763 724
764 if (goal_elen) 725 if (goal_elen)
765 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); 726 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .unlocked_ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index baae3a723946..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/aio.h> 38#include <linux/aio.h>
40#include <linux/smp_lock.h> 39#include <linux/smp_lock.h>
@@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = {
219 .read = do_sync_read, 218 .read = do_sync_read,
220 .aio_read = generic_file_aio_read, 219 .aio_read = generic_file_aio_read,
221 .unlocked_ioctl = udf_ioctl, 220 .unlocked_ioctl = udf_ioctl,
222 .open = dquot_file_open, 221 .open = generic_file_open,
223 .mmap = generic_file_mmap, 222 .mmap = generic_file_mmap,
224 .write = do_sync_write, 223 .write = do_sync_write,
225 .aio_write = udf_file_aio_write, 224 .aio_write = udf_file_aio_write,
226 .release = udf_release_file, 225 .release = udf_release_file,
227 .fsync = simple_fsync, 226 .fsync = generic_file_fsync,
228 .splice_read = generic_file_splice_read, 227 .splice_read = generic_file_splice_read,
229 .llseek = generic_file_llseek, 228 .llseek = generic_file_llseek,
230}; 229};
231 230
232int udf_setattr(struct dentry *dentry, struct iattr *iattr)
233{
234 struct inode *inode = dentry->d_inode;
235 int error;
236
237 error = inode_change_ok(inode, iattr);
238 if (error)
239 return error;
240
241 if (is_quota_modification(inode, iattr))
242 dquot_initialize(inode);
243
244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
245 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
246 error = dquot_transfer(inode, iattr);
247 if (error)
248 return error;
249 }
250
251 return inode_setattr(inode, iattr);
252}
253
254const struct inode_operations udf_file_inode_operations = { 231const struct inode_operations udf_file_inode_operations = {
255 .truncate = udf_truncate, 232 .truncate = udf_truncate,
256 .setattr = udf_setattr,
257}; 233};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index fb68c9cd0c3e..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
20 20
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26 25
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
32 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
33 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
34 33
35 /*
36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well.
38 */
39 dquot_free_inode(inode);
40 dquot_drop(inode);
41
42 clear_inode(inode); 34 clear_inode(inode);
43 35
44 mutex_lock(&sbi->s_alloc_mutex); 36 mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 53 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 55 struct inode *inode;
64 int block, ret; 56 int block;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 57 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 58 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 59 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -124,15 +116,8 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
124 udf_updated_lvid(sb); 116 udf_updated_lvid(sb);
125 } 117 }
126 mutex_unlock(&sbi->s_alloc_mutex); 118 mutex_unlock(&sbi->s_alloc_mutex);
127 inode->i_mode = mode; 119
128 inode->i_uid = current_fsuid(); 120 inode_init_owner(inode, dir, mode);
129 if (dir->i_mode & S_ISGID) {
130 inode->i_gid = dir->i_gid;
131 if (S_ISDIR(mode))
132 mode |= S_ISGID;
133 } else {
134 inode->i_gid = current_fsgid();
135 }
136 121
137 iinfo->i_location.logicalBlockNum = block; 122 iinfo->i_location.logicalBlockNum = block;
138 iinfo->i_location.partitionReferenceNum = 123 iinfo->i_location.partitionReferenceNum =
@@ -153,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
153 insert_inode_hash(inode); 138 insert_inode_hash(inode);
154 mark_inode_dirty(inode); 139 mark_inode_dirty(inode);
155 140
156 dquot_initialize(inode);
157 ret = dquot_alloc_inode(inode);
158 if (ret) {
159 dquot_drop(inode);
160 inode->i_flags |= S_NOQUOTA;
161 inode->i_nlink = 0;
162 iput(inode);
163 *err = ret;
164 return NULL;
165 }
166
167 *err = 0; 141 *err = 0;
168 return inode; 142 return inode;
169} 143}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
40#include <linux/slab.h> 39#include <linux/slab.h>
41#include <linux/crc-itu-t.h> 40#include <linux/crc-itu-t.h>
42 41
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
71 70
72void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
73{ 72{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
77 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
78 74
79 if (is_bad_inode(inode)) 75 if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
113 (unsigned long long)iinfo->i_lenExtents); 109 (unsigned long long)iinfo->i_lenExtents);
114 } 110 }
115 111
116 dquot_drop(inode);
117 kfree(iinfo->i_ext.i_data); 112 kfree(iinfo->i_ext.i_data);
118 iinfo->i_ext.i_data = NULL; 113 iinfo->i_ext.i_data = NULL;
119} 114}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 75816025f95f..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/quotaops.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 562 int err;
564 struct udf_inode_info *iinfo; 563 struct udf_inode_info *iinfo;
565 564
566 dquot_initialize(dir);
567
568 lock_kernel(); 565 lock_kernel();
569 inode = udf_new_inode(dir, mode, &err); 566 inode = udf_new_inode(dir, mode, &err);
570 if (!inode) { 567 if (!inode) {
@@ -579,7 +576,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
579 inode->i_data.a_ops = &udf_aops; 576 inode->i_data.a_ops = &udf_aops;
580 inode->i_op = &udf_file_inode_operations; 577 inode->i_op = &udf_file_inode_operations;
581 inode->i_fop = &udf_file_operations; 578 inode->i_fop = &udf_file_operations;
582 inode->i_mode = mode;
583 mark_inode_dirty(inode); 579 mark_inode_dirty(inode);
584 580
585 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 581 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -618,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
618 if (!old_valid_dev(rdev)) 614 if (!old_valid_dev(rdev))
619 return -EINVAL; 615 return -EINVAL;
620 616
621 dquot_initialize(dir);
622
623 lock_kernel(); 617 lock_kernel();
624 err = -EIO; 618 err = -EIO;
625 inode = udf_new_inode(dir, mode, &err); 619 inode = udf_new_inode(dir, mode, &err);
@@ -627,7 +621,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
627 goto out; 621 goto out;
628 622
629 iinfo = UDF_I(inode); 623 iinfo = UDF_I(inode);
630 inode->i_uid = current_fsuid();
631 init_special_inode(inode, mode, rdev); 624 init_special_inode(inode, mode, rdev);
632 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 625 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
633 if (!fi) { 626 if (!fi) {
@@ -666,15 +659,13 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
666 struct udf_inode_info *dinfo = UDF_I(dir); 659 struct udf_inode_info *dinfo = UDF_I(dir);
667 struct udf_inode_info *iinfo; 660 struct udf_inode_info *iinfo;
668 661
669 dquot_initialize(dir);
670
671 lock_kernel(); 662 lock_kernel();
672 err = -EMLINK; 663 err = -EMLINK;
673 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
674 goto out; 665 goto out;
675 666
676 err = -EIO; 667 err = -EIO;
677 inode = udf_new_inode(dir, S_IFDIR, &err); 668 inode = udf_new_inode(dir, S_IFDIR | mode, &err);
678 if (!inode) 669 if (!inode)
679 goto out; 670 goto out;
680 671
@@ -697,9 +688,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
697 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT; 688 FID_FILE_CHAR_DIRECTORY | FID_FILE_CHAR_PARENT;
698 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL); 689 udf_write_fi(inode, &cfi, fi, &fibh, NULL, NULL);
699 brelse(fibh.sbh); 690 brelse(fibh.sbh);
700 inode->i_mode = S_IFDIR | mode;
701 if (dir->i_mode & S_ISGID)
702 inode->i_mode |= S_ISGID;
703 mark_inode_dirty(inode); 691 mark_inode_dirty(inode);
704 692
705 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); 693 fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
@@ -805,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
805 struct fileIdentDesc *fi, cfi; 793 struct fileIdentDesc *fi, cfi;
806 struct kernel_lb_addr tloc; 794 struct kernel_lb_addr tloc;
807 795
808 dquot_initialize(dir);
809
810 retval = -ENOENT; 796 retval = -ENOENT;
811 lock_kernel(); 797 lock_kernel();
812 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -853,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
853 struct fileIdentDesc cfi; 839 struct fileIdentDesc cfi;
854 struct kernel_lb_addr tloc; 840 struct kernel_lb_addr tloc;
855 841
856 dquot_initialize(dir);
857
858 retval = -ENOENT; 842 retval = -ENOENT;
859 lock_kernel(); 843 lock_kernel();
860 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -909,10 +893,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
909 struct buffer_head *bh; 893 struct buffer_head *bh;
910 struct udf_inode_info *iinfo; 894 struct udf_inode_info *iinfo;
911 895
912 dquot_initialize(dir);
913
914 lock_kernel(); 896 lock_kernel();
915 inode = udf_new_inode(dir, S_IFLNK, &err); 897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
916 if (!inode) 898 if (!inode)
917 goto out; 899 goto out;
918 900
@@ -923,7 +905,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
923 } 905 }
924 906
925 iinfo = UDF_I(inode); 907 iinfo = UDF_I(inode);
926 inode->i_mode = S_IFLNK | S_IRWXUGO;
927 inode->i_data.a_ops = &udf_symlink_aops; 908 inode->i_data.a_ops = &udf_symlink_aops;
928 inode->i_op = &udf_symlink_inode_operations; 909 inode->i_op = &udf_symlink_inode_operations;
929 910
@@ -1081,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1081 int err; 1062 int err;
1082 struct buffer_head *bh; 1063 struct buffer_head *bh;
1083 1064
1084 dquot_initialize(dir);
1085
1086 lock_kernel(); 1065 lock_kernel();
1087 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1088 unlock_kernel(); 1067 unlock_kernel();
@@ -1145,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1145 struct kernel_lb_addr tloc; 1124 struct kernel_lb_addr tloc;
1146 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1147 1126
1148 dquot_initialize(old_dir);
1149 dquot_initialize(new_dir);
1150
1151 lock_kernel(); 1127 lock_kernel();
1152 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1153 if (ofi) { 1129 if (ofi) {
@@ -1393,7 +1369,6 @@ const struct export_operations udf_export_ops = {
1393const struct inode_operations udf_dir_inode_operations = { 1369const struct inode_operations udf_dir_inode_operations = {
1394 .lookup = udf_lookup, 1370 .lookup = udf_lookup,
1395 .create = udf_create, 1371 .create = udf_create,
1396 .setattr = udf_setattr,
1397 .link = udf_link, 1372 .link = udf_link,
1398 .unlink = udf_unlink, 1373 .unlink = udf_unlink,
1399 .symlink = udf_symlink, 1374 .symlink = udf_symlink,
@@ -1406,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
1406 .readlink = generic_readlink, 1381 .readlink = generic_readlink,
1407 .follow_link = page_follow_link_light, 1382 .follow_link = page_follow_link_light,
1408 .put_link = page_put_link, 1383 .put_link = page_put_link,
1409 .setattr = udf_setattr,
1410}; 1384};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
557{ 557{
558 struct udf_options uopt; 558 struct udf_options uopt;
559 struct udf_sb_info *sbi = UDF_SB(sb); 559 struct udf_sb_info *sbi = UDF_SB(sb);
560 int error = 0;
560 561
561 uopt.flags = sbi->s_flags; 562 uopt.flags = sbi->s_flags;
562 uopt.uid = sbi->s_uid; 563 uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
582 *flags |= MS_RDONLY; 583 *flags |= MS_RDONLY;
583 } 584 }
584 585
585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 586 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
586 unlock_kernel(); 587 goto out_unlock;
587 return 0; 588
588 }
589 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
590 udf_close_lvid(sb); 590 udf_close_lvid(sb);
591 else 591 else
592 udf_open_lvid(sb); 592 udf_open_lvid(sb);
593 593
594out_unlock:
594 unlock_kernel(); 595 unlock_kernel();
595 return 0; 596 return error;
596} 597}
597 598
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 599/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1939 /* Fill in the rest of the superblock */ 1940 /* Fill in the rest of the superblock */
1940 sb->s_op = &udf_sb_ops; 1941 sb->s_op = &udf_sb_ops;
1941 sb->s_export_op = &udf_export_ops; 1942 sb->s_export_op = &udf_export_ops;
1942 sb->dq_op = NULL; 1943
1943 sb->s_dirt = 0; 1944 sb->s_dirt = 0;
1944 sb->s_magic = UDF_SUPER_MAGIC; 1945 sb->s_magic = UDF_SUPER_MAGIC;
1945 sb->s_time_gran = 1000; 1946 sb->s_time_gran = 1000;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9079ff7d6255..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131 131
132/* file.c */ 132/* file.c */
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 134/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 136extern int udf_sync_inode(struct inode *);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 191 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 192 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 193 ufs_clusteracct (sb, ucpi, blkno, 1);
198 dquot_free_block(inode, uspi->s_fpb);
199 194
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 195 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 196 uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 506 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 507 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 508 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
515 509
516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 510 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
517 (unsigned long long)fragment, oldcount, newcount); 511 (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 551 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
558 for (i = oldcount; i < newcount; i++) 552 for (i = oldcount; i < newcount; i++)
559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 553 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
560 ret = dquot_alloc_block(inode, count);
561 if (ret) {
562 *err = ret;
563 return 0;
564 }
565 554
566 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 555 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
567 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 556 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
598 struct ufs_cylinder_group * ucg; 587 struct ufs_cylinder_group * ucg;
599 unsigned oldcg, i, j, k, allocsize; 588 unsigned oldcg, i, j, k, allocsize;
600 u64 result; 589 u64 result;
601 int ret;
602 590
603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 591 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
604 inode->i_ino, cgno, (unsigned long long)goal, count); 592 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
667 for (i = count; i < uspi->s_fpb; i++) 655 for (i = count; i < uspi->s_fpb; i++)
668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 656 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
669 i = uspi->s_fpb - count; 657 i = uspi->s_fpb - count;
670 dquot_free_block(inode, i);
671 658
672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 659 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
673 uspi->cs_total.cs_nffree += i; 660 uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 666 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
680 if (result == INVBLOCK) 667 if (result == INVBLOCK)
681 return 0; 668 return 0;
682 ret = dquot_alloc_block(inode, count);
683 if (ret) {
684 *err = ret;
685 return 0;
686 }
687 for (i = 0; i < count; i++) 669 for (i = 0; i < count; i++)
688 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); 670 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
689 671
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
718 struct ufs_super_block_first * usb1; 700 struct ufs_super_block_first * usb1;
719 struct ufs_cylinder_group * ucg; 701 struct ufs_cylinder_group * ucg;
720 u64 result, blkno; 702 u64 result, blkno;
721 int ret;
722 703
723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 704 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
724 705
@@ -752,11 +733,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 733 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 734 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 735 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 736
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 737 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 738 uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 669 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 230ecf608026..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode); 97 clear_inode (inode);
102 98
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -303,15 +299,7 @@ cg_found:
303 sb->s_dirt = 1; 299 sb->s_dirt = 1;
304 300
305 inode->i_ino = cg * uspi->s_ipg + bit; 301 inode->i_ino = cg * uspi->s_ipg + bit;
306 inode->i_mode = mode; 302 inode_init_owner(inode, dir, mode);
307 inode->i_uid = current_fsuid();
308 if (dir->i_mode & S_ISGID) {
309 inode->i_gid = dir->i_gid;
310 if (S_ISDIR(mode))
311 inode->i_mode |= S_ISGID;
312 } else
313 inode->i_gid = current_fsgid();
314
315 inode->i_blocks = 0; 303 inode->i_blocks = 0;
316 inode->i_generation = 0; 304 inode->i_generation = 0;
317 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 305 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
@@ -355,21 +343,12 @@ cg_found:
355 343
356 unlock_super (sb); 344 unlock_super (sb);
357 345
358 dquot_initialize(inode);
359 err = dquot_alloc_inode(inode);
360 if (err) {
361 dquot_drop(inode);
362 goto fail_without_unlock;
363 }
364
365 UFSD("allocating inode %lu\n", inode->i_ino); 346 UFSD("allocating inode %lu\n", inode->i_ino);
366 UFSD("EXIT\n"); 347 UFSD("EXIT\n");
367 return inode; 348 return inode;
368 349
369fail_remove_inode: 350fail_remove_inode:
370 unlock_super(sb); 351 unlock_super(sb);
371fail_without_unlock:
372 inode->i_flags |= S_NOQUOTA;
373 inode->i_nlink = 0; 352 inode->i_nlink = 0;
374 iput(inode); 353 iput(inode);
375 UFSD("EXIT (FAILED): err %d\n", err); 354 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index cffa756f1047..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
910{ 909{
911 loff_t old_i_size; 910 loff_t old_i_size;
912 911
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
916 truncate_inode_pages(&inode->i_data, 0); 912 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 913 if (is_bad_inode(inode))
918 goto no_delete; 914 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index eabc02eb1294..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and noone else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f294c44577dc..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -501,12 +500,10 @@ out:
501 return err; 500 return err;
502} 501}
503 502
504
505/* 503/*
506 * We don't define our `inode->i_op->truncate', and call it here, 504 * TODO:
507 * because of: 505 * - truncate case should use proper ordering instead of using
508 * - there is no way to know old size 506 * simple_setsize
509 * - there is no way inform user about error, if it happens in `truncate'
510 */ 507 */
511int ufs_setattr(struct dentry *dentry, struct iattr *attr) 508int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 509{
@@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 515 if (error)
519 return error; 516 return error;
520 517
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
526 error = dquot_transfer(inode, attr);
527 if (error)
528 return error;
529 }
530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { 518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
531 loff_t old_i_size = inode->i_size; 519 loff_t old_i_size = inode->i_size;
532 520
533 error = vmtruncate(inode, attr->ia_size); 521 error = simple_setsize(inode, attr->ia_size);
534 if (error) 522 if (error)
535 return error; 523 return error;
536 error = ufs_truncate(inode, old_i_size); 524 error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
diff --git a/fs/xattr.c b/fs/xattr.c
index 46f87e828b48..01bb8135e14a 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -590,10 +590,10 @@ strcmp_prefix(const char *a, const char *a_prefix)
590/* 590/*
591 * Find the xattr_handler with the matching prefix. 591 * Find the xattr_handler with the matching prefix.
592 */ 592 */
593static struct xattr_handler * 593static const struct xattr_handler *
594xattr_resolve_name(struct xattr_handler **handlers, const char **name) 594xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
595{ 595{
596 struct xattr_handler *handler; 596 const struct xattr_handler *handler;
597 597
598 if (!*name) 598 if (!*name)
599 return NULL; 599 return NULL;
@@ -614,7 +614,7 @@ xattr_resolve_name(struct xattr_handler **handlers, const char **name)
614ssize_t 614ssize_t
615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) 615generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size)
616{ 616{
617 struct xattr_handler *handler; 617 const struct xattr_handler *handler;
618 618
619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 619 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
620 if (!handler) 620 if (!handler)
@@ -629,7 +629,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
629ssize_t 629ssize_t
630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) 630generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
631{ 631{
632 struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; 632 const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr;
633 unsigned int size = 0; 633 unsigned int size = 0;
634 634
635 if (!buffer) { 635 if (!buffer) {
@@ -659,7 +659,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
659int 659int
660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) 660generic_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags)
661{ 661{
662 struct xattr_handler *handler; 662 const struct xattr_handler *handler;
663 663
664 if (size == 0) 664 if (size == 0)
665 value = ""; /* empty EA, do not remove */ 665 value = ""; /* empty EA, do not remove */
@@ -676,7 +676,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
676int 676int
677generic_removexattr(struct dentry *dentry, const char *name) 677generic_removexattr(struct dentry *dentry, const char *name)
678{ 678{
679 struct xattr_handler *handler; 679 const struct xattr_handler *handler;
680 680
681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); 681 handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
682 if (!handler) 682 if (!handler)
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y += xfs_alloc.o \
77 xfs_itable.o \ 77 xfs_itable.o \
78 xfs_dfrag.o \ 78 xfs_dfrag.o \
79 xfs_log.o \ 79 xfs_log.o \
80 xfs_log_cil.o \
80 xfs_log_recover.o \ 81 xfs_log_recover.o \
81 xfs_mount.o \ 82 xfs_mount.o \
82 xfs_mru_cache.o \ 83 xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
index a7bc925c4d60..9f769b5b38fc 100644
--- a/fs/xfs/linux-2.6/xfs_acl.c
+++ b/fs/xfs/linux-2.6/xfs_acl.c
@@ -440,14 +440,14 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
440 return error; 440 return error;
441} 441}
442 442
443struct xattr_handler xfs_xattr_acl_access_handler = { 443const struct xattr_handler xfs_xattr_acl_access_handler = {
444 .prefix = POSIX_ACL_XATTR_ACCESS, 444 .prefix = POSIX_ACL_XATTR_ACCESS,
445 .flags = ACL_TYPE_ACCESS, 445 .flags = ACL_TYPE_ACCESS,
446 .get = xfs_xattr_acl_get, 446 .get = xfs_xattr_acl_get,
447 .set = xfs_xattr_acl_set, 447 .set = xfs_xattr_acl_set,
448}; 448};
449 449
450struct xattr_handler xfs_xattr_acl_default_handler = { 450const struct xattr_handler xfs_xattr_acl_default_handler = {
451 .prefix = POSIX_ACL_XATTR_DEFAULT, 451 .prefix = POSIX_ACL_XATTR_DEFAULT,
452 .flags = ACL_TYPE_DEFAULT, 452 .flags = ACL_TYPE_DEFAULT,
453 .get = xfs_xattr_acl_get, 453 .get = xfs_xattr_acl_get,
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b4..34640d6dbdcb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
1333 trace_xfs_writepage(inode, page, 0); 1333 trace_xfs_writepage(inode, page, 0);
1334 1334
1335 /* 1335 /*
1336 * Refuse to write the page out if we are called from reclaim context.
1337 *
1338 * This is primarily to avoid stack overflows when called from deep
1339 * used stacks in random callers for direct reclaim, but disabling
1340 * reclaim for kswap is a nice side-effect as kswapd causes rather
1341 * suboptimal I/O patters, too.
1342 *
1343 * This should really be done by the core VM, but until that happens
1344 * filesystems like XFS, btrfs and ext4 have to take care of this
1345 * by themselves.
1346 */
1347 if (current->flags & PF_MEMALLOC)
1348 goto out_fail;
1349
1350 /*
1336 * We need a transaction if: 1351 * We need a transaction if:
1337 * 1. There are delalloc buffers on the page 1352 * 1. There are delalloc buffers on the page
1338 * 2. The page is uptodate and we have unmapped buffers 1353 * 2. The page is uptodate and we have unmapped buffers
@@ -1366,14 +1381,6 @@ xfs_vm_writepage(
1366 if (!page_has_buffers(page)) 1381 if (!page_has_buffers(page))
1367 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 1382 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1368 1383
1369
1370 /*
1371 * VM calculation for nr_to_write seems off. Bump it way
1372 * up, this gets simple streaming writes zippy again.
1373 * To be reviewed again after Jens' writeback changes.
1374 */
1375 wbc->nr_to_write *= 4;
1376
1377 /* 1384 /*
1378 * Convert delayed allocate, unwritten or unmapped space 1385 * Convert delayed allocate, unwritten or unmapped space
1379 * to real space and flush out to disk. 1386 * to real space and flush out to disk.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c43..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
37 37
38#include "xfs_sb.h" 38#include "xfs_sb.h"
39#include "xfs_inum.h" 39#include "xfs_inum.h"
40#include "xfs_log.h"
40#include "xfs_ag.h" 41#include "xfs_ag.h"
41#include "xfs_dmapi.h" 42#include "xfs_dmapi.h"
42#include "xfs_mount.h" 43#include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
850 * Note that this in no way locks the underlying pages, so it is only 851 * Note that this in no way locks the underlying pages, so it is only
851 * useful for synchronizing concurrent use of buffer objects, not for 852 * useful for synchronizing concurrent use of buffer objects, not for
852 * synchronizing independent access to the underlying pages. 853 * synchronizing independent access to the underlying pages.
854 *
855 * If we come across a stale, pinned, locked buffer, we know that we
856 * are being asked to lock a buffer that has been reallocated. Because
857 * it is pinned, we know that the log has not been pushed to disk and
858 * hence it will still be locked. Rather than sleeping until someone
859 * else pushes the log, push it ourselves before trying to get the lock.
853 */ 860 */
854void 861void
855xfs_buf_lock( 862xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
857{ 864{
858 trace_xfs_buf_lock(bp, _RET_IP_); 865 trace_xfs_buf_lock(bp, _RET_IP_);
859 866
867 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
868 xfs_log_force(bp->b_mount, 0);
860 if (atomic_read(&bp->b_io_remaining)) 869 if (atomic_read(&bp->b_io_remaining))
861 blk_run_address_space(bp->b_target->bt_mapping); 870 blk_run_address_space(bp->b_target->bt_mapping);
862 down(&bp->b_sema); 871 down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb5..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
100STATIC int 100STATIC int
101xfs_file_fsync( 101xfs_file_fsync(
102 struct file *file, 102 struct file *file,
103 struct dentry *dentry,
104 int datasync) 103 int datasync)
105{ 104{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 105 struct inode *inode = file->f_mapping->host;
106 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 107 struct xfs_trans *tp;
108 int error = 0; 108 int error = 0;
109 int log_flushed = 0; 109 int log_flushed = 0;
@@ -140,8 +140,8 @@ xfs_file_fsync(
140 * might gets cleared when the inode gets written out via the AIL 140 * might gets cleared when the inode gets written out via the AIL
141 * or xfs_iflush_cluster. 141 * or xfs_iflush_cluster.
142 */ 142 */
143 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 143 if (((inode->i_state & I_DIRTY_DATASYNC) ||
144 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 144 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
145 ip->i_update_core) { 145 ip->i_update_core) {
146 /* 146 /*
147 * Kick off a transaction to log the inode core to get the 147 * Kick off a transaction to log the inode core to get the
@@ -868,7 +868,7 @@ write_retry:
868 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
869 xfs_ilock(ip, iolock); 869 xfs_ilock(ip, iolock);
870 870
871 error2 = -xfs_file_fsync(file, file->f_path.dentry, 871 error2 = -xfs_file_fsync(file,
872 (file->f_flags & __O_SYNC) ? 0 : 1); 872 (file->f_flags & __O_SYNC) ? 0 : 1);
873 if (!error) 873 if (!error)
874 error = error2; 874 error = error2;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c92..44f0b2de153e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
585 bf.l_len = len; 585 bf.l_len = len;
586 586
587 xfs_ilock(ip, XFS_IOLOCK_EXCL); 587 xfs_ilock(ip, XFS_IOLOCK_EXCL);
588
589 /* check the new inode size is valid before allocating */
590 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
591 offset + len > i_size_read(inode)) {
592 new_size = offset + len;
593 error = inode_newsize_ok(inode, new_size);
594 if (error)
595 goto out_unlock;
596 }
597
588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 598 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
589 0, XFS_ATTR_NOLOCK); 599 0, XFS_ATTR_NOLOCK);
590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 600 if (error)
591 offset + len > i_size_read(inode)) 601 goto out_unlock;
592 new_size = offset + len;
593 602
594 /* Change file size if needed */ 603 /* Change file size if needed */
595 if (new_size) { 604 if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 609 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
601 } 610 }
602 611
612out_unlock:
603 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 613 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
604out_error: 614out_error:
605 return error; 615 return error;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d3..067cafbfc635 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,10 +19,10 @@
19#include "xfs_dmapi.h" 19#include "xfs_dmapi.h"
20#include "xfs_sb.h" 20#include "xfs_sb.h"
21#include "xfs_inum.h" 21#include "xfs_inum.h"
22#include "xfs_log.h"
22#include "xfs_ag.h" 23#include "xfs_ag.h"
23#include "xfs_mount.h" 24#include "xfs_mount.h"
24#include "xfs_quota.h" 25#include "xfs_quota.h"
25#include "xfs_log.h"
26#include "xfs_trans.h" 26#include "xfs_trans.h"
27#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
28#include "xfs_inode.h" 28#include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index e9002513e08f..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */ 119#define MNTOPT_DMAPI "dmapi" /* DMI enabled (DMAPI / XDSM) */
120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */ 120#define MNTOPT_XDSM "xdsm" /* DMI enabled (DMAPI / XDSM) */
121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */ 121#define MNTOPT_DMI "dmi" /* DMI enabled (DMAPI / XDSM) */
122#define MNTOPT_DELAYLOG "delaylog" /* Delayed loging enabled */
123#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed loging disabled */
122 124
123/* 125/*
124 * Table driven mount option parser. 126 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
374 mp->m_flags |= XFS_MOUNT_DMAPI; 376 mp->m_flags |= XFS_MOUNT_DMAPI;
375 } else if (!strcmp(this_char, MNTOPT_DMI)) { 377 } else if (!strcmp(this_char, MNTOPT_DMI)) {
376 mp->m_flags |= XFS_MOUNT_DMAPI; 378 mp->m_flags |= XFS_MOUNT_DMAPI;
379 } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
380 mp->m_flags |= XFS_MOUNT_DELAYLOG;
381 cmn_err(CE_WARN,
382 "Enabling EXPERIMENTAL delayed logging feature "
383 "- use at your own risk.\n");
384 } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
385 mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
377 } else if (!strcmp(this_char, "ihashsize")) { 386 } else if (!strcmp(this_char, "ihashsize")) {
378 cmn_err(CE_WARN, 387 cmn_err(CE_WARN,
379 "XFS: ihashsize no longer used, option is deprecated."); 388 "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
535 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM }, 544 { XFS_MOUNT_FILESTREAMS, "," MNTOPT_FILESTREAM },
536 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI }, 545 { XFS_MOUNT_DMAPI, "," MNTOPT_DMAPI },
537 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, 546 { XFS_MOUNT_GRPID, "," MNTOPT_GRPID },
547 { XFS_MOUNT_DELAYLOG, "," MNTOPT_DELAYLOG },
538 { 0, NULL } 548 { 0, NULL }
539 }; 549 };
540 static struct proc_xfs_info xfs_info_unset[] = { 550 static struct proc_xfs_info xfs_info_unset[] = {
@@ -725,7 +735,8 @@ void
725xfs_blkdev_issue_flush( 735xfs_blkdev_issue_flush(
726 xfs_buftarg_t *buftarg) 736 xfs_buftarg_t *buftarg)
727{ 737{
728 blkdev_issue_flush(buftarg->bt_bdev, NULL); 738 blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
739 BLKDEV_IFL_WAIT);
729} 740}
730 741
731STATIC void 742STATIC void
@@ -1754,7 +1765,7 @@ xfs_init_zones(void)
1754 * but it is much faster. 1765 * but it is much faster.
1755 */ 1766 */
1756 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) + 1767 xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
1757 (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) / 1768 (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
1758 NBWORD) * sizeof(int))), "xfs_buf_item"); 1769 NBWORD) * sizeof(int))), "xfs_buf_item");
1759 if (!xfs_buf_item_zone) 1770 if (!xfs_buf_item_zone)
1760 goto out_destroy_trans_zone; 1771 goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 233d4b9881b1..519618e9279e 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -85,7 +85,7 @@ extern __uint64_t xfs_max_file_offset(unsigned int);
85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); 85extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
86 86
87extern const struct export_operations xfs_export_operations; 87extern const struct export_operations xfs_export_operations;
88extern struct xattr_handler *xfs_xattr_handlers[]; 88extern const struct xattr_handler *xfs_xattr_handlers[];
89extern const struct quotactl_ops xfs_quotactl_operations; 89extern const struct quotactl_ops xfs_quotactl_operations;
90 90
91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) 91#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14e..ef7f0218bccb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
164 struct xfs_perag *pag; 164 struct xfs_perag *pag;
165 165
166 pag = xfs_perag_get(mp, ag); 166 pag = xfs_perag_get(mp, ag);
167 if (!pag->pag_ici_init) {
168 xfs_perag_put(pag);
169 continue;
170 }
171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
172 exclusive, &nr); 168 exclusive, &nr);
173 xfs_perag_put(pag); 169 xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
867 down_read(&xfs_mount_list_lock); 863 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) { 864 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 865 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag); 866 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable; 867 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag); 868 xfs_perag_put(pag);
878 } 869 }
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63ae..d12be8470cba 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
50#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
51#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h" 52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h" 53#include "xfs_inode_item.h"
55 54
56/* 55/*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd2901..73d5aa117384 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
82 ) 82 )
83) 83)
84 84
85#define DEFINE_PERAG_REF_EVENT(name) \
86TRACE_EVENT(name, \
87 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
88 unsigned long caller_ip), \
89 TP_ARGS(mp, agno, refcount, caller_ip), \
90 TP_STRUCT__entry( \
91 __field(dev_t, dev) \
92 __field(xfs_agnumber_t, agno) \
93 __field(int, refcount) \
94 __field(unsigned long, caller_ip) \
95 ), \
96 TP_fast_assign( \
97 __entry->dev = mp->m_super->s_dev; \
98 __entry->agno = agno; \
99 __entry->refcount = refcount; \
100 __entry->caller_ip = caller_ip; \
101 ), \
102 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
103 MAJOR(__entry->dev), MINOR(__entry->dev), \
104 __entry->agno, \
105 __entry->refcount, \
106 (char *)__entry->caller_ip) \
107);
108
109DEFINE_PERAG_REF_EVENT(xfs_perag_get)
110DEFINE_PERAG_REF_EVENT(xfs_perag_put)
111
112#define DEFINE_ATTR_LIST_EVENT(name) \ 85#define DEFINE_ATTR_LIST_EVENT(name) \
113DEFINE_EVENT(xfs_attr_list_class, name, \ 86DEFINE_EVENT(xfs_attr_list_class, name, \
114 TP_PROTO(struct xfs_attr_list_context *ctx), \ 87 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
122DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 95DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
123DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
124 97
98DECLARE_EVENT_CLASS(xfs_perag_class,
99 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
100 unsigned long caller_ip),
101 TP_ARGS(mp, agno, refcount, caller_ip),
102 TP_STRUCT__entry(
103 __field(dev_t, dev)
104 __field(xfs_agnumber_t, agno)
105 __field(int, refcount)
106 __field(unsigned long, caller_ip)
107 ),
108 TP_fast_assign(
109 __entry->dev = mp->m_super->s_dev;
110 __entry->agno = agno;
111 __entry->refcount = refcount;
112 __entry->caller_ip = caller_ip;
113 ),
114 TP_printk("dev %d:%d agno %u refcount %d caller %pf",
115 MAJOR(__entry->dev), MINOR(__entry->dev),
116 __entry->agno,
117 __entry->refcount,
118 (char *)__entry->caller_ip)
119);
120
121#define DEFINE_PERAG_REF_EVENT(name) \
122DEFINE_EVENT(xfs_perag_class, name, \
123 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_put);
128
125TRACE_EVENT(xfs_attr_list_node_descend, 129TRACE_EVENT(xfs_attr_list_node_descend,
126 TP_PROTO(struct xfs_attr_list_context *ctx, 130 TP_PROTO(struct xfs_attr_list_context *ctx,
127 struct xfs_da_node_entry *btree), 131 struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
775DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); 779DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
776DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); 780DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
777 781
778#define DEFINE_RW_EVENT(name) \ 782DECLARE_EVENT_CLASS(xfs_file_class,
779TRACE_EVENT(name, \ 783 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
780 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 784 TP_ARGS(ip, count, offset, flags),
781 TP_ARGS(ip, count, offset, flags), \ 785 TP_STRUCT__entry(
782 TP_STRUCT__entry( \ 786 __field(dev_t, dev)
783 __field(dev_t, dev) \ 787 __field(xfs_ino_t, ino)
784 __field(xfs_ino_t, ino) \ 788 __field(xfs_fsize_t, size)
785 __field(xfs_fsize_t, size) \ 789 __field(xfs_fsize_t, new_size)
786 __field(xfs_fsize_t, new_size) \ 790 __field(loff_t, offset)
787 __field(loff_t, offset) \ 791 __field(size_t, count)
788 __field(size_t, count) \ 792 __field(int, flags)
789 __field(int, flags) \ 793 ),
790 ), \ 794 TP_fast_assign(
791 TP_fast_assign( \ 795 __entry->dev = VFS_I(ip)->i_sb->s_dev;
792 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 796 __entry->ino = ip->i_ino;
793 __entry->ino = ip->i_ino; \ 797 __entry->size = ip->i_d.di_size;
794 __entry->size = ip->i_d.di_size; \ 798 __entry->new_size = ip->i_new_size;
795 __entry->new_size = ip->i_new_size; \ 799 __entry->offset = offset;
796 __entry->offset = offset; \ 800 __entry->count = count;
797 __entry->count = count; \ 801 __entry->flags = flags;
798 __entry->flags = flags; \ 802 ),
799 ), \ 803 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
800 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 804 "offset 0x%llx count 0x%zx ioflags %s",
801 "offset 0x%llx count 0x%zx ioflags %s", \ 805 MAJOR(__entry->dev), MINOR(__entry->dev),
802 MAJOR(__entry->dev), MINOR(__entry->dev), \ 806 __entry->ino,
803 __entry->ino, \ 807 __entry->size,
804 __entry->size, \ 808 __entry->new_size,
805 __entry->new_size, \ 809 __entry->offset,
806 __entry->offset, \ 810 __entry->count,
807 __entry->count, \ 811 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
808 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
809) 812)
813
814#define DEFINE_RW_EVENT(name) \
815DEFINE_EVENT(xfs_file_class, name, \
816 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
817 TP_ARGS(ip, count, offset, flags))
810DEFINE_RW_EVENT(xfs_file_read); 818DEFINE_RW_EVENT(xfs_file_read);
811DEFINE_RW_EVENT(xfs_file_buffered_write); 819DEFINE_RW_EVENT(xfs_file_buffered_write);
812DEFINE_RW_EVENT(xfs_file_direct_write); 820DEFINE_RW_EVENT(xfs_file_direct_write);
813DEFINE_RW_EVENT(xfs_file_splice_read); 821DEFINE_RW_EVENT(xfs_file_splice_read);
814DEFINE_RW_EVENT(xfs_file_splice_write); 822DEFINE_RW_EVENT(xfs_file_splice_write);
815 823
816 824DECLARE_EVENT_CLASS(xfs_page_class,
817#define DEFINE_PAGE_EVENT(name) \ 825 TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
818TRACE_EVENT(name, \ 826 TP_ARGS(inode, page, off),
819 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 827 TP_STRUCT__entry(
820 TP_ARGS(inode, page, off), \ 828 __field(dev_t, dev)
821 TP_STRUCT__entry( \ 829 __field(xfs_ino_t, ino)
822 __field(dev_t, dev) \ 830 __field(pgoff_t, pgoff)
823 __field(xfs_ino_t, ino) \ 831 __field(loff_t, size)
824 __field(pgoff_t, pgoff) \ 832 __field(unsigned long, offset)
825 __field(loff_t, size) \ 833 __field(int, delalloc)
826 __field(unsigned long, offset) \ 834 __field(int, unmapped)
827 __field(int, delalloc) \ 835 __field(int, unwritten)
828 __field(int, unmapped) \ 836 ),
829 __field(int, unwritten) \ 837 TP_fast_assign(
830 ), \ 838 int delalloc = -1, unmapped = -1, unwritten = -1;
831 TP_fast_assign( \ 839
832 int delalloc = -1, unmapped = -1, unwritten = -1; \ 840 if (page_has_buffers(page))
833 \ 841 xfs_count_page_state(page, &delalloc,
834 if (page_has_buffers(page)) \ 842 &unmapped, &unwritten);
835 xfs_count_page_state(page, &delalloc, \ 843 __entry->dev = inode->i_sb->s_dev;
836 &unmapped, &unwritten); \ 844 __entry->ino = XFS_I(inode)->i_ino;
837 __entry->dev = inode->i_sb->s_dev; \ 845 __entry->pgoff = page_offset(page);
838 __entry->ino = XFS_I(inode)->i_ino; \ 846 __entry->size = i_size_read(inode);
839 __entry->pgoff = page_offset(page); \ 847 __entry->offset = off;
840 __entry->size = i_size_read(inode); \ 848 __entry->delalloc = delalloc;
841 __entry->offset = off; \ 849 __entry->unmapped = unmapped;
842 __entry->delalloc = delalloc; \ 850 __entry->unwritten = unwritten;
843 __entry->unmapped = unmapped; \ 851 ),
844 __entry->unwritten = unwritten; \ 852 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
845 ), \ 853 "delalloc %d unmapped %d unwritten %d",
846 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ 854 MAJOR(__entry->dev), MINOR(__entry->dev),
847 "delalloc %d unmapped %d unwritten %d", \ 855 __entry->ino,
848 MAJOR(__entry->dev), MINOR(__entry->dev), \ 856 __entry->pgoff,
849 __entry->ino, \ 857 __entry->size,
850 __entry->pgoff, \ 858 __entry->offset,
851 __entry->size, \ 859 __entry->delalloc,
852 __entry->offset, \ 860 __entry->unmapped,
853 __entry->delalloc, \ 861 __entry->unwritten)
854 __entry->unmapped, \
855 __entry->unwritten) \
856) 862)
863
864#define DEFINE_PAGE_EVENT(name) \
865DEFINE_EVENT(xfs_page_class, name, \
866 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
867 TP_ARGS(inode, page, off))
857DEFINE_PAGE_EVENT(xfs_writepage); 868DEFINE_PAGE_EVENT(xfs_writepage);
858DEFINE_PAGE_EVENT(xfs_releasepage); 869DEFINE_PAGE_EVENT(xfs_releasepage);
859DEFINE_PAGE_EVENT(xfs_invalidatepage); 870DEFINE_PAGE_EVENT(xfs_invalidatepage);
860 871
861#define DEFINE_IOMAP_EVENT(name) \ 872DECLARE_EVENT_CLASS(xfs_iomap_class,
862TRACE_EVENT(name, \ 873 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
863 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 874 int flags, struct xfs_bmbt_irec *irec),
864 int flags, struct xfs_bmbt_irec *irec), \ 875 TP_ARGS(ip, offset, count, flags, irec),
865 TP_ARGS(ip, offset, count, flags, irec), \ 876 TP_STRUCT__entry(
866 TP_STRUCT__entry( \ 877 __field(dev_t, dev)
867 __field(dev_t, dev) \ 878 __field(xfs_ino_t, ino)
868 __field(xfs_ino_t, ino) \ 879 __field(loff_t, size)
869 __field(loff_t, size) \ 880 __field(loff_t, new_size)
870 __field(loff_t, new_size) \ 881 __field(loff_t, offset)
871 __field(loff_t, offset) \ 882 __field(size_t, count)
872 __field(size_t, count) \ 883 __field(int, flags)
873 __field(int, flags) \ 884 __field(xfs_fileoff_t, startoff)
874 __field(xfs_fileoff_t, startoff) \ 885 __field(xfs_fsblock_t, startblock)
875 __field(xfs_fsblock_t, startblock) \ 886 __field(xfs_filblks_t, blockcount)
876 __field(xfs_filblks_t, blockcount) \ 887 ),
877 ), \ 888 TP_fast_assign(
878 TP_fast_assign( \ 889 __entry->dev = VFS_I(ip)->i_sb->s_dev;
879 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 890 __entry->ino = ip->i_ino;
880 __entry->ino = ip->i_ino; \ 891 __entry->size = ip->i_d.di_size;
881 __entry->size = ip->i_d.di_size; \ 892 __entry->new_size = ip->i_new_size;
882 __entry->new_size = ip->i_new_size; \ 893 __entry->offset = offset;
883 __entry->offset = offset; \ 894 __entry->count = count;
884 __entry->count = count; \ 895 __entry->flags = flags;
885 __entry->flags = flags; \ 896 __entry->startoff = irec ? irec->br_startoff : 0;
886 __entry->startoff = irec ? irec->br_startoff : 0; \ 897 __entry->startblock = irec ? irec->br_startblock : 0;
887 __entry->startblock = irec ? irec->br_startblock : 0; \ 898 __entry->blockcount = irec ? irec->br_blockcount : 0;
888 __entry->blockcount = irec ? irec->br_blockcount : 0; \ 899 ),
889 ), \ 900 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
890 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 901 "offset 0x%llx count %zd flags %s "
891 "offset 0x%llx count %zd flags %s " \ 902 "startoff 0x%llx startblock %lld blockcount 0x%llx",
892 "startoff 0x%llx startblock %lld blockcount 0x%llx", \ 903 MAJOR(__entry->dev), MINOR(__entry->dev),
893 MAJOR(__entry->dev), MINOR(__entry->dev), \ 904 __entry->ino,
894 __entry->ino, \ 905 __entry->size,
895 __entry->size, \ 906 __entry->new_size,
896 __entry->new_size, \ 907 __entry->offset,
897 __entry->offset, \ 908 __entry->count,
898 __entry->count, \ 909 __print_flags(__entry->flags, "|", BMAPI_FLAGS),
899 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 910 __entry->startoff,
900 __entry->startoff, \ 911 (__int64_t)__entry->startblock,
901 (__int64_t)__entry->startblock, \ 912 __entry->blockcount)
902 __entry->blockcount) \
903) 913)
914
915#define DEFINE_IOMAP_EVENT(name) \
916DEFINE_EVENT(xfs_iomap_class, name, \
917 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
918 int flags, struct xfs_bmbt_irec *irec), \
919 TP_ARGS(ip, offset, count, flags, irec))
904DEFINE_IOMAP_EVENT(xfs_iomap_enter); 920DEFINE_IOMAP_EVENT(xfs_iomap_enter);
905DEFINE_IOMAP_EVENT(xfs_iomap_found); 921DEFINE_IOMAP_EVENT(xfs_iomap_found);
906DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 922DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
907 923
908#define DEFINE_SIMPLE_IO_EVENT(name) \ 924DECLARE_EVENT_CLASS(xfs_simple_io_class,
909TRACE_EVENT(name, \ 925 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
910 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ 926 TP_ARGS(ip, offset, count),
911 TP_ARGS(ip, offset, count), \ 927 TP_STRUCT__entry(
912 TP_STRUCT__entry( \ 928 __field(dev_t, dev)
913 __field(dev_t, dev) \ 929 __field(xfs_ino_t, ino)
914 __field(xfs_ino_t, ino) \ 930 __field(loff_t, size)
915 __field(loff_t, size) \ 931 __field(loff_t, new_size)
916 __field(loff_t, new_size) \ 932 __field(loff_t, offset)
917 __field(loff_t, offset) \ 933 __field(size_t, count)
918 __field(size_t, count) \ 934 ),
919 ), \ 935 TP_fast_assign(
920 TP_fast_assign( \ 936 __entry->dev = VFS_I(ip)->i_sb->s_dev;
921 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 937 __entry->ino = ip->i_ino;
922 __entry->ino = ip->i_ino; \ 938 __entry->size = ip->i_d.di_size;
923 __entry->size = ip->i_d.di_size; \ 939 __entry->new_size = ip->i_new_size;
924 __entry->new_size = ip->i_new_size; \ 940 __entry->offset = offset;
925 __entry->offset = offset; \ 941 __entry->count = count;
926 __entry->count = count; \ 942 ),
927 ), \ 943 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
928 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 944 "offset 0x%llx count %zd",
929 "offset 0x%llx count %zd", \ 945 MAJOR(__entry->dev), MINOR(__entry->dev),
930 MAJOR(__entry->dev), MINOR(__entry->dev), \ 946 __entry->ino,
931 __entry->ino, \ 947 __entry->size,
932 __entry->size, \ 948 __entry->new_size,
933 __entry->new_size, \ 949 __entry->offset,
934 __entry->offset, \ 950 __entry->count)
935 __entry->count) \
936); 951);
952
953#define DEFINE_SIMPLE_IO_EVENT(name) \
954DEFINE_EVENT(xfs_simple_io_class, name, \
955 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
956 TP_ARGS(ip, offset, count))
937DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 957DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
938DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 958DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
939 959
@@ -1059,83 +1079,112 @@ TRACE_EVENT(xfs_bunmap,
1059 1079
1060); 1080);
1061 1081
1082#define XFS_BUSY_SYNC \
1083 { 0, "async" }, \
1084 { 1, "sync" }
1085
1062TRACE_EVENT(xfs_alloc_busy, 1086TRACE_EVENT(xfs_alloc_busy,
1063 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1087 TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
1064 xfs_extlen_t len, int slot), 1088 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
1065 TP_ARGS(mp, agno, agbno, len, slot), 1089 TP_ARGS(trans, agno, agbno, len, sync),
1066 TP_STRUCT__entry( 1090 TP_STRUCT__entry(
1067 __field(dev_t, dev) 1091 __field(dev_t, dev)
1092 __field(struct xfs_trans *, tp)
1093 __field(int, tid)
1068 __field(xfs_agnumber_t, agno) 1094 __field(xfs_agnumber_t, agno)
1069 __field(xfs_agblock_t, agbno) 1095 __field(xfs_agblock_t, agbno)
1070 __field(xfs_extlen_t, len) 1096 __field(xfs_extlen_t, len)
1071 __field(int, slot) 1097 __field(int, sync)
1072 ), 1098 ),
1073 TP_fast_assign( 1099 TP_fast_assign(
1074 __entry->dev = mp->m_super->s_dev; 1100 __entry->dev = trans->t_mountp->m_super->s_dev;
1101 __entry->tp = trans;
1102 __entry->tid = trans->t_ticket->t_tid;
1075 __entry->agno = agno; 1103 __entry->agno = agno;
1076 __entry->agbno = agbno; 1104 __entry->agbno = agbno;
1077 __entry->len = len; 1105 __entry->len = len;
1078 __entry->slot = slot; 1106 __entry->sync = sync;
1079 ), 1107 ),
1080 TP_printk("dev %d:%d agno %u agbno %u len %u slot %d", 1108 TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
1081 MAJOR(__entry->dev), MINOR(__entry->dev), 1109 MAJOR(__entry->dev), MINOR(__entry->dev),
1110 __entry->tp,
1111 __entry->tid,
1082 __entry->agno, 1112 __entry->agno,
1083 __entry->agbno, 1113 __entry->agbno,
1084 __entry->len, 1114 __entry->len,
1085 __entry->slot) 1115 __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
1086 1116
1087); 1117);
1088 1118
1089#define XFS_BUSY_STATES \
1090 { 0, "found" }, \
1091 { 1, "missing" }
1092
1093TRACE_EVENT(xfs_alloc_unbusy, 1119TRACE_EVENT(xfs_alloc_unbusy,
1094 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, 1120 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1095 int slot, int found), 1121 xfs_agblock_t agbno, xfs_extlen_t len),
1096 TP_ARGS(mp, agno, slot, found), 1122 TP_ARGS(mp, agno, agbno, len),
1097 TP_STRUCT__entry( 1123 TP_STRUCT__entry(
1098 __field(dev_t, dev) 1124 __field(dev_t, dev)
1099 __field(xfs_agnumber_t, agno) 1125 __field(xfs_agnumber_t, agno)
1100 __field(int, slot) 1126 __field(xfs_agblock_t, agbno)
1101 __field(int, found) 1127 __field(xfs_extlen_t, len)
1102 ), 1128 ),
1103 TP_fast_assign( 1129 TP_fast_assign(
1104 __entry->dev = mp->m_super->s_dev; 1130 __entry->dev = mp->m_super->s_dev;
1105 __entry->agno = agno; 1131 __entry->agno = agno;
1106 __entry->slot = slot; 1132 __entry->agbno = agbno;
1107 __entry->found = found; 1133 __entry->len = len;
1108 ), 1134 ),
1109 TP_printk("dev %d:%d agno %u slot %d %s", 1135 TP_printk("dev %d:%d agno %u agbno %u len %u",
1110 MAJOR(__entry->dev), MINOR(__entry->dev), 1136 MAJOR(__entry->dev), MINOR(__entry->dev),
1111 __entry->agno, 1137 __entry->agno,
1112 __entry->slot, 1138 __entry->agbno,
1113 __print_symbolic(__entry->found, XFS_BUSY_STATES)) 1139 __entry->len)
1114); 1140);
1115 1141
1142#define XFS_BUSY_STATES \
1143 { 0, "missing" }, \
1144 { 1, "found" }
1145
1116TRACE_EVENT(xfs_alloc_busysearch, 1146TRACE_EVENT(xfs_alloc_busysearch,
1117 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, 1147 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
1118 xfs_extlen_t len, xfs_lsn_t lsn), 1148 xfs_agblock_t agbno, xfs_extlen_t len, int found),
1119 TP_ARGS(mp, agno, agbno, len, lsn), 1149 TP_ARGS(mp, agno, agbno, len, found),
1120 TP_STRUCT__entry( 1150 TP_STRUCT__entry(
1121 __field(dev_t, dev) 1151 __field(dev_t, dev)
1122 __field(xfs_agnumber_t, agno) 1152 __field(xfs_agnumber_t, agno)
1123 __field(xfs_agblock_t, agbno) 1153 __field(xfs_agblock_t, agbno)
1124 __field(xfs_extlen_t, len) 1154 __field(xfs_extlen_t, len)
1125 __field(xfs_lsn_t, lsn) 1155 __field(int, found)
1126 ), 1156 ),
1127 TP_fast_assign( 1157 TP_fast_assign(
1128 __entry->dev = mp->m_super->s_dev; 1158 __entry->dev = mp->m_super->s_dev;
1129 __entry->agno = agno; 1159 __entry->agno = agno;
1130 __entry->agbno = agbno; 1160 __entry->agbno = agbno;
1131 __entry->len = len; 1161 __entry->len = len;
1132 __entry->lsn = lsn; 1162 __entry->found = found;
1133 ), 1163 ),
1134 TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", 1164 TP_printk("dev %d:%d agno %u agbno %u len %u %s",
1135 MAJOR(__entry->dev), MINOR(__entry->dev), 1165 MAJOR(__entry->dev), MINOR(__entry->dev),
1136 __entry->agno, 1166 __entry->agno,
1137 __entry->agbno, 1167 __entry->agbno,
1138 __entry->len, 1168 __entry->len,
1169 __print_symbolic(__entry->found, XFS_BUSY_STATES))
1170);
1171
1172TRACE_EVENT(xfs_trans_commit_lsn,
1173 TP_PROTO(struct xfs_trans *trans),
1174 TP_ARGS(trans),
1175 TP_STRUCT__entry(
1176 __field(dev_t, dev)
1177 __field(struct xfs_trans *, tp)
1178 __field(xfs_lsn_t, lsn)
1179 ),
1180 TP_fast_assign(
1181 __entry->dev = trans->t_mountp->m_super->s_dev;
1182 __entry->tp = trans;
1183 __entry->lsn = trans->t_commit_lsn;
1184 ),
1185 TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
1186 MAJOR(__entry->dev), MINOR(__entry->dev),
1187 __entry->tp,
1139 __entry->lsn) 1188 __entry->lsn)
1140); 1189);
1141 1190
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
index fa01b9daba6b..87d3e03878c8 100644
--- a/fs/xfs/linux-2.6/xfs_xattr.c
+++ b/fs/xfs/linux-2.6/xfs_xattr.c
@@ -72,28 +72,28 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
72 (void *)value, size, xflags); 72 (void *)value, size, xflags);
73} 73}
74 74
75static struct xattr_handler xfs_xattr_user_handler = { 75static const struct xattr_handler xfs_xattr_user_handler = {
76 .prefix = XATTR_USER_PREFIX, 76 .prefix = XATTR_USER_PREFIX,
77 .flags = 0, /* no flags implies user namespace */ 77 .flags = 0, /* no flags implies user namespace */
78 .get = xfs_xattr_get, 78 .get = xfs_xattr_get,
79 .set = xfs_xattr_set, 79 .set = xfs_xattr_set,
80}; 80};
81 81
82static struct xattr_handler xfs_xattr_trusted_handler = { 82static const struct xattr_handler xfs_xattr_trusted_handler = {
83 .prefix = XATTR_TRUSTED_PREFIX, 83 .prefix = XATTR_TRUSTED_PREFIX,
84 .flags = ATTR_ROOT, 84 .flags = ATTR_ROOT,
85 .get = xfs_xattr_get, 85 .get = xfs_xattr_get,
86 .set = xfs_xattr_set, 86 .set = xfs_xattr_set,
87}; 87};
88 88
89static struct xattr_handler xfs_xattr_security_handler = { 89static const struct xattr_handler xfs_xattr_security_handler = {
90 .prefix = XATTR_SECURITY_PREFIX, 90 .prefix = XATTR_SECURITY_PREFIX,
91 .flags = ATTR_SECURE, 91 .flags = ATTR_SECURE,
92 .get = xfs_xattr_get, 92 .get = xfs_xattr_get,
93 .set = xfs_xattr_set, 93 .set = xfs_xattr_set,
94}; 94};
95 95
96struct xattr_handler *xfs_xattr_handlers[] = { 96const struct xattr_handler *xfs_xattr_handlers[] = {
97 &xfs_xattr_user_handler, 97 &xfs_xattr_user_handler,
98 &xfs_xattr_trusted_handler, 98 &xfs_xattr_trusted_handler,
99 &xfs_xattr_security_handler, 99 &xfs_xattr_security_handler,
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df0129..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++) 344 for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
345 xfs_qm_dqinit_core(curid, type, d); 345 xfs_qm_dqinit_core(curid, type, d);
346 xfs_trans_dquot_buf(tp, bp, 346 xfs_trans_dquot_buf(tp, bp,
347 (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF : 347 (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
348 ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF : 348 ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
349 XFS_BLI_GDQUOT_BUF))); 349 XFS_BLF_GDQUOT_BUF)));
350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); 350 xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
351} 351}
352 352
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e764146644..2d8b7bc792c9 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
249 249
250 if (!xfs_Gqm) { 250 if (!xfs_Gqm) {
251 xfs_Gqm = xfs_Gqm_init(); 251 xfs_Gqm = xfs_Gqm_init();
252 if (!xfs_Gqm) 252 if (!xfs_Gqm) {
253 mutex_unlock(&xfs_Gqm_lock);
253 return ENOMEM; 254 return ENOMEM;
255 }
254 } 256 }
255 257
256 /* 258 /*
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index d13eeba2c8f8..0135e2a669d7 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -49,8 +49,8 @@ extern int xfs_acl_chmod(struct inode *inode);
49extern int posix_acl_access_exists(struct inode *inode); 49extern int posix_acl_access_exists(struct inode *inode);
50extern int posix_acl_default_exists(struct inode *inode); 50extern int posix_acl_default_exists(struct inode *inode);
51 51
52extern struct xattr_handler xfs_xattr_acl_access_handler; 52extern const struct xattr_handler xfs_xattr_acl_access_handler;
53extern struct xattr_handler xfs_xattr_acl_default_handler; 53extern const struct xattr_handler xfs_xattr_acl_default_handler;
54#else 54#else
55# define xfs_check_acl NULL 55# define xfs_check_acl NULL
56# define xfs_get_acl(inode, type) NULL 56# define xfs_get_acl(inode, type) NULL
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
175} xfs_agfl_t; 175} xfs_agfl_t;
176 176
177/* 177/*
178 * Busy block/extent entry. Used in perag to mark blocks that have been freed 178 * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
179 * but whose transactions aren't committed to disk yet. 179 * have been freed but whose transactions aren't committed to disk yet.
180 *
181 * Note that we use the transaction ID to record the transaction, not the
182 * transaction structure itself. See xfs_alloc_busy_insert() for details.
180 */ 183 */
181typedef struct xfs_perag_busy { 184struct xfs_busy_extent {
182 xfs_agblock_t busy_start; 185 struct rb_node rb_node; /* ag by-bno indexed search tree */
183 xfs_extlen_t busy_length; 186 struct list_head list; /* transaction busy extent list */
184 struct xfs_trans *busy_tp; /* transaction that did the free */ 187 xfs_agnumber_t agno;
185} xfs_perag_busy_t; 188 xfs_agblock_t bno;
189 xfs_extlen_t length;
190 xlog_tid_t tid; /* transaction that created this */
191};
186 192
187/* 193/*
188 * Per-ag incore structure, copies of information in agf and agi, 194 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,17 +222,16 @@ typedef struct xfs_perag {
216 xfs_agino_t pagl_leftrec; 222 xfs_agino_t pagl_leftrec;
217 xfs_agino_t pagl_rightrec; 223 xfs_agino_t pagl_rightrec;
218#ifdef __KERNEL__ 224#ifdef __KERNEL__
219 spinlock_t pagb_lock; /* lock for pagb_list */ 225 spinlock_t pagb_lock; /* lock for pagb_tree */
226 struct rb_root pagb_tree; /* ordered tree of busy extents */
220 227
221 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
222 229
223 int pag_ici_init; /* incore inode cache initialised */
224 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
225 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
226 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
227#endif 233#endif
228 int pagb_count; /* pagb slots in use */ 234 int pagb_count; /* pagb slots in use */
229 xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
230} xfs_perag_t; 235} xfs_perag_t;
231 236
232/* 237/*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
46#define XFSA_FIXUP_BNO_OK 1 46#define XFSA_FIXUP_BNO_OK 1
47#define XFSA_FIXUP_CNT_OK 2 47#define XFSA_FIXUP_CNT_OK 2
48 48
49STATIC void 49static int
50xfs_alloc_search_busy(xfs_trans_t *tp, 50xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
51 xfs_agnumber_t agno, 51 xfs_agblock_t bno, xfs_extlen_t len);
52 xfs_agblock_t bno,
53 xfs_extlen_t len);
54 52
55/* 53/*
56 * Prototypes for per-ag allocation routines 54 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
540 be32_to_cpu(agf->agf_length)); 538 be32_to_cpu(agf->agf_length));
541 xfs_alloc_log_agf(args->tp, args->agbp, 539 xfs_alloc_log_agf(args->tp, args->agbp,
542 XFS_AGF_FREEBLKS); 540 XFS_AGF_FREEBLKS);
543 /* search the busylist for these blocks */ 541 /*
544 xfs_alloc_search_busy(args->tp, args->agno, 542 * Search the busylist for these blocks and mark the
545 args->agbno, args->len); 543 * transaction as synchronous if blocks are found. This
544 * avoids the need to block due to a synchronous log
545 * force to ensure correct ordering as the synchronous
546 * transaction will guarantee that for us.
547 */
548 if (xfs_alloc_busy_search(args->mp, args->agno,
549 args->agbno, args->len))
550 xfs_trans_set_sync(args->tp);
546 } 551 }
547 if (!args->isfl) 552 if (!args->isfl)
548 xfs_trans_mod_sb(args->tp, 553 xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
1693 * when the iclog commits to disk. If a busy block is allocated, 1698 * when the iclog commits to disk. If a busy block is allocated,
1694 * the iclog is pushed up to the LSN that freed the block. 1699 * the iclog is pushed up to the LSN that freed the block.
1695 */ 1700 */
1696 xfs_alloc_mark_busy(tp, agno, bno, len); 1701 xfs_alloc_busy_insert(tp, agno, bno, len);
1697 return 0; 1702 return 0;
1698 1703
1699 error0: 1704 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
1989 *bnop = bno; 1994 *bnop = bno;
1990 1995
1991 /* 1996 /*
1992 * As blocks are freed, they are added to the per-ag busy list 1997 * As blocks are freed, they are added to the per-ag busy list and
1993 * and remain there until the freeing transaction is committed to 1998 * remain there until the freeing transaction is committed to disk.
1994 * disk. Now that we have allocated blocks, this list must be 1999 * Now that we have allocated blocks, this list must be searched to see
1995 * searched to see if a block is being reused. If one is, then 2000 * if a block is being reused. If one is, then the freeing transaction
1996 * the freeing transaction must be pushed to disk NOW by forcing 2001 * must be pushed to disk before this transaction.
1997 * to disk all iclogs up that transaction's LSN. 2002 *
2003 * We do this by setting the current transaction to a sync transaction
2004 * which guarantees that the freeing transaction is on disk before this
2005 * transaction. This is done instead of a synchronous log force here so
2006 * that we don't sit and wait with the AGF locked in the transaction
2007 * during the log force.
1998 */ 2008 */
1999 xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1); 2009 if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
2010 xfs_trans_set_sync(tp);
2000 return 0; 2011 return 0;
2001} 2012}
2002 2013
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
2201 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 2212 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
2202 spin_lock_init(&pag->pagb_lock); 2213 spin_lock_init(&pag->pagb_lock);
2203 pag->pagb_count = 0; 2214 pag->pagb_count = 0;
2204 memset(pag->pagb_list, 0, sizeof(pag->pagb_list)); 2215 pag->pagb_tree = RB_ROOT;
2205 pag->pagf_init = 1; 2216 pag->pagf_init = 1;
2206 } 2217 }
2207#ifdef DEBUG 2218#ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
2479 * list is reused, the transaction that freed it must be forced to disk 2490 * list is reused, the transaction that freed it must be forced to disk
2480 * before continuing to use the block. 2491 * before continuing to use the block.
2481 * 2492 *
2482 * xfs_alloc_mark_busy - add to the per-ag busy list 2493 * xfs_alloc_busy_insert - add to the per-ag busy list
2483 * xfs_alloc_clear_busy - remove an item from the per-ag busy list 2494 * xfs_alloc_busy_clear - remove an item from the per-ag busy list
2495 * xfs_alloc_busy_search - search for a busy extent
2496 */
2497
2498/*
2499 * Insert a new extent into the busy tree.
2500 *
2501 * The busy extent tree is indexed by the start block of the busy extent.
2502 * there can be multiple overlapping ranges in the busy extent tree but only
2503 * ever one entry at a given start block. The reason for this is that
2504 * multi-block extents can be freed, then smaller chunks of that extent
2505 * allocated and freed again before the first transaction commit is on disk.
2506 * If the exact same start block is freed a second time, we have to wait for
2507 * that busy extent to pass out of the tree before the new extent is inserted.
2508 * There are two main cases we have to handle here.
2509 *
2510 * The first case is a transaction that triggers a "free - allocate - free"
2511 * cycle. This can occur during btree manipulations as a btree block is freed
2512 * to the freelist, then allocated from the free list, then freed again. In
2513 * this case, the second extxpnet free is what triggers the duplicate and as
2514 * such the transaction IDs should match. Because the extent was allocated in
2515 * this transaction, the transaction must be marked as synchronous. This is
2516 * true for all cases where the free/alloc/free occurs in the one transaction,
2517 * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
2518 * This serves to catch violations of the second case quite effectively.
2519 *
2520 * The second case is where the free/alloc/free occur in different
2521 * transactions. In this case, the thread freeing the extent the second time
2522 * can't mark the extent busy immediately because it is already tracked in a
2523 * transaction that may be committing. When the log commit for the existing
2524 * busy extent completes, the busy extent will be removed from the tree. If we
2525 * allow the second busy insert to continue using that busy extent structure,
2526 * it can be freed before this transaction is safely in the log. Hence our
2527 * only option in this case is to force the log to remove the existing busy
2528 * extent from the list before we insert the new one with the current
2529 * transaction ID.
2530 *
2531 * The problem we are trying to avoid in the free-alloc-free in separate
2532 * transactions is most easily described with a timeline:
2533 *
2534 * Thread 1 Thread 2 Thread 3 xfslogd
2535 * xact alloc
2536 * free X
2537 * mark busy
2538 * commit xact
2539 * free xact
2540 * xact alloc
2541 * alloc X
2542 * busy search
2543 * mark xact sync
2544 * commit xact
2545 * free xact
2546 * force log
2547 * checkpoint starts
2548 * ....
2549 * xact alloc
2550 * free X
2551 * mark busy
2552 * finds match
2553 * *** KABOOM! ***
2554 * ....
2555 * log IO completes
2556 * unbusy X
2557 * checkpoint completes
2558 *
2559 * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
2560 * the checkpoint completes, and the busy extent it matched will have been
2561 * removed from the tree when it is woken. Hence it can then continue safely.
2562 *
2563 * However, to ensure this matching process is robust, we need to use the
2564 * transaction ID for identifying transaction, as delayed logging results in
2565 * the busy extent and transaction lifecycles being different. i.e. the busy
2566 * extent is active for a lot longer than the transaction. Hence the
2567 * transaction structure can be freed and reallocated, then mark the same
2568 * extent busy again in the new transaction. In this case the new transaction
2569 * will have a different tid but can have the same address, and hence we need
2570 * to check against the tid.
2571 *
2572 * Future: for delayed logging, we could avoid the log force if the extent was
2573 * first freed in the current checkpoint sequence. This, however, requires the
2574 * ability to pin the current checkpoint in memory until this transaction
2575 * commits to ensure that both the original free and the current one combine
2576 * logically into the one checkpoint. If the checkpoint sequences are
2577 * different, however, we still need to wait on a log force.
2484 */ 2578 */
2485void 2579void
2486xfs_alloc_mark_busy(xfs_trans_t *tp, 2580xfs_alloc_busy_insert(
2487 xfs_agnumber_t agno, 2581 struct xfs_trans *tp,
2488 xfs_agblock_t bno, 2582 xfs_agnumber_t agno,
2489 xfs_extlen_t len) 2583 xfs_agblock_t bno,
2584 xfs_extlen_t len)
2490{ 2585{
2491 xfs_perag_busy_t *bsy; 2586 struct xfs_busy_extent *new;
2587 struct xfs_busy_extent *busyp;
2492 struct xfs_perag *pag; 2588 struct xfs_perag *pag;
2493 int n; 2589 struct rb_node **rbp;
2590 struct rb_node *parent;
2591 int match;
2494 2592
2495 pag = xfs_perag_get(tp->t_mountp, agno);
2496 spin_lock(&pag->pagb_lock);
2497 2593
2498 /* search pagb_list for an open slot */ 2594 new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
2499 for (bsy = pag->pagb_list, n = 0; 2595 if (!new) {
2500 n < XFS_PAGB_NUM_SLOTS; 2596 /*
2501 bsy++, n++) { 2597 * No Memory! Since it is now not possible to track the free
2502 if (bsy->busy_tp == NULL) { 2598 * block, make this a synchronous transaction to insure that
2503 break; 2599 * the block is not reused before this transaction commits.
2504 } 2600 */
2601 trace_xfs_alloc_busy(tp, agno, bno, len, 1);
2602 xfs_trans_set_sync(tp);
2603 return;
2505 } 2604 }
2506 2605
2507 trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n); 2606 new->agno = agno;
2607 new->bno = bno;
2608 new->length = len;
2609 new->tid = xfs_log_get_trans_ident(tp);
2508 2610
2509 if (n < XFS_PAGB_NUM_SLOTS) { 2611 INIT_LIST_HEAD(&new->list);
2510 bsy = &pag->pagb_list[n]; 2612
2511 pag->pagb_count++; 2613 /* trace before insert to be able to see failed inserts */
2512 bsy->busy_start = bno; 2614 trace_xfs_alloc_busy(tp, agno, bno, len, 0);
2513 bsy->busy_length = len; 2615
2514 bsy->busy_tp = tp; 2616 pag = xfs_perag_get(tp->t_mountp, new->agno);
2515 xfs_trans_add_busy(tp, agno, n); 2617restart:
2516 } else { 2618 spin_lock(&pag->pagb_lock);
2619 rbp = &pag->pagb_tree.rb_node;
2620 parent = NULL;
2621 busyp = NULL;
2622 match = 0;
2623 while (*rbp && match >= 0) {
2624 parent = *rbp;
2625 busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
2626
2627 if (new->bno < busyp->bno) {
2628 /* may overlap, but exact start block is lower */
2629 rbp = &(*rbp)->rb_left;
2630 if (new->bno + new->length > busyp->bno)
2631 match = busyp->tid == new->tid ? 1 : -1;
2632 } else if (new->bno > busyp->bno) {
2633 /* may overlap, but exact start block is higher */
2634 rbp = &(*rbp)->rb_right;
2635 if (bno < busyp->bno + busyp->length)
2636 match = busyp->tid == new->tid ? 1 : -1;
2637 } else {
2638 match = busyp->tid == new->tid ? 1 : -1;
2639 break;
2640 }
2641 }
2642 if (match < 0) {
2643 /* overlap marked busy in different transaction */
2644 spin_unlock(&pag->pagb_lock);
2645 xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
2646 goto restart;
2647 }
2648 if (match > 0) {
2517 /* 2649 /*
2518 * The busy list is full! Since it is now not possible to 2650 * overlap marked busy in same transaction. Update if exact
2519 * track the free block, make this a synchronous transaction 2651 * start block match, otherwise combine the busy extents into
2520 * to insure that the block is not reused before this 2652 * a single range.
2521 * transaction commits.
2522 */ 2653 */
2523 xfs_trans_set_sync(tp); 2654 if (busyp->bno == new->bno) {
2524 } 2655 busyp->length = max(busyp->length, new->length);
2656 spin_unlock(&pag->pagb_lock);
2657 ASSERT(tp->t_flags & XFS_TRANS_SYNC);
2658 xfs_perag_put(pag);
2659 kmem_free(new);
2660 return;
2661 }
2662 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2663 new->length = max(busyp->bno + busyp->length,
2664 new->bno + new->length) -
2665 min(busyp->bno, new->bno);
2666 new->bno = min(busyp->bno, new->bno);
2667 } else
2668 busyp = NULL;
2525 2669
2670 rb_link_node(&new->rb_node, parent, rbp);
2671 rb_insert_color(&new->rb_node, &pag->pagb_tree);
2672
2673 list_add(&new->list, &tp->t_busy);
2526 spin_unlock(&pag->pagb_lock); 2674 spin_unlock(&pag->pagb_lock);
2527 xfs_perag_put(pag); 2675 xfs_perag_put(pag);
2676 kmem_free(busyp);
2528} 2677}
2529 2678
2530void 2679/*
2531xfs_alloc_clear_busy(xfs_trans_t *tp, 2680 * Search for a busy extent within the range of the extent we are about to
2532 xfs_agnumber_t agno, 2681 * allocate. You need to be holding the busy extent tree lock when calling
2533 int idx) 2682 * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
2683 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
2684 * match. This is done so that a non-zero return indicates an overlap that
2685 * will require a synchronous transaction, but it can still be
2686 * used to distinguish between a partial or exact match.
2687 */
2688static int
2689xfs_alloc_busy_search(
2690 struct xfs_mount *mp,
2691 xfs_agnumber_t agno,
2692 xfs_agblock_t bno,
2693 xfs_extlen_t len)
2534{ 2694{
2535 struct xfs_perag *pag; 2695 struct xfs_perag *pag;
2536 xfs_perag_busy_t *list; 2696 struct rb_node *rbp;
2697 struct xfs_busy_extent *busyp;
2698 int match = 0;
2537 2699
2538 ASSERT(idx < XFS_PAGB_NUM_SLOTS); 2700 pag = xfs_perag_get(mp, agno);
2539 pag = xfs_perag_get(tp->t_mountp, agno);
2540 spin_lock(&pag->pagb_lock); 2701 spin_lock(&pag->pagb_lock);
2541 list = pag->pagb_list;
2542 2702
2543 trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp); 2703 rbp = pag->pagb_tree.rb_node;
2544 2704
2545 if (list[idx].busy_tp == tp) { 2705 /* find closest start bno overlap */
2546 list[idx].busy_tp = NULL; 2706 while (rbp) {
2547 pag->pagb_count--; 2707 busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
2708 if (bno < busyp->bno) {
2709 /* may overlap, but exact start block is lower */
2710 if (bno + len > busyp->bno)
2711 match = -1;
2712 rbp = rbp->rb_left;
2713 } else if (bno > busyp->bno) {
2714 /* may overlap, but exact start block is higher */
2715 if (bno < busyp->bno + busyp->length)
2716 match = -1;
2717 rbp = rbp->rb_right;
2718 } else {
2719 /* bno matches busyp, length determines exact match */
2720 match = (busyp->length == len) ? 1 : -1;
2721 break;
2722 }
2548 } 2723 }
2549
2550 spin_unlock(&pag->pagb_lock); 2724 spin_unlock(&pag->pagb_lock);
2725 trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
2551 xfs_perag_put(pag); 2726 xfs_perag_put(pag);
2727 return match;
2552} 2728}
2553 2729
2554 2730void
2555/* 2731xfs_alloc_busy_clear(
2556 * If we find the extent in the busy list, force the log out to get the 2732 struct xfs_mount *mp,
2557 * extent out of the busy list so the caller can use it straight away. 2733 struct xfs_busy_extent *busyp)
2558 */
2559STATIC void
2560xfs_alloc_search_busy(xfs_trans_t *tp,
2561 xfs_agnumber_t agno,
2562 xfs_agblock_t bno,
2563 xfs_extlen_t len)
2564{ 2734{
2565 struct xfs_perag *pag; 2735 struct xfs_perag *pag;
2566 xfs_perag_busy_t *bsy;
2567 xfs_agblock_t uend, bend;
2568 xfs_lsn_t lsn = 0;
2569 int cnt;
2570 2736
2571 pag = xfs_perag_get(tp->t_mountp, agno); 2737 trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
2572 spin_lock(&pag->pagb_lock); 2738 busyp->length);
2573 cnt = pag->pagb_count;
2574 2739
2575 /* 2740 ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
2576 * search pagb_list for this slot, skipping open slots. We have to 2741 busyp->length) == 1);
2577 * search the entire array as there may be multiple overlaps and
2578 * we have to get the most recent LSN for the log force to push out
2579 * all the transactions that span the range.
2580 */
2581 uend = bno + len - 1;
2582 for (cnt = 0; cnt < pag->pagb_count; cnt++) {
2583 bsy = &pag->pagb_list[cnt];
2584 if (!bsy->busy_tp)
2585 continue;
2586 2742
2587 bend = bsy->busy_start + bsy->busy_length - 1; 2743 list_del_init(&busyp->list);
2588 if (bno > bend || uend < bsy->busy_start)
2589 continue;
2590 2744
2591 /* (start1,length1) within (start2, length2) */ 2745 pag = xfs_perag_get(mp, busyp->agno);
2592 if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) 2746 spin_lock(&pag->pagb_lock);
2593 lsn = bsy->busy_tp->t_commit_lsn; 2747 rb_erase(&busyp->rb_node, &pag->pagb_tree);
2594 }
2595 spin_unlock(&pag->pagb_lock); 2748 spin_unlock(&pag->pagb_lock);
2596 xfs_perag_put(pag); 2749 xfs_perag_put(pag);
2597 trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
2598 2750
2599 /* 2751 kmem_free(busyp);
2600 * If a block was found, force the log through the LSN of the
2601 * transaction that freed the block
2602 */
2603 if (lsn)
2604 xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
2605} 2752}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
22struct xfs_mount; 22struct xfs_mount;
23struct xfs_perag; 23struct xfs_perag;
24struct xfs_trans; 24struct xfs_trans;
25struct xfs_busy_extent;
25 26
26/* 27/*
27 * Freespace allocation types. Argument to xfs_alloc_[v]extent. 28 * Freespace allocation types. Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
119#ifdef __KERNEL__ 120#ifdef __KERNEL__
120 121
121void 122void
122xfs_alloc_mark_busy(xfs_trans_t *tp, 123xfs_alloc_busy_insert(xfs_trans_t *tp,
123 xfs_agnumber_t agno, 124 xfs_agnumber_t agno,
124 xfs_agblock_t bno, 125 xfs_agblock_t bno,
125 xfs_extlen_t len); 126 xfs_extlen_t len);
126 127
127void 128void
128xfs_alloc_clear_busy(xfs_trans_t *tp, 129xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
129 xfs_agnumber_t ag,
130 int idx);
131 130
132#endif /* __KERNEL__ */ 131#endif /* __KERNEL__ */
133 132
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
134 * disk. If a busy block is allocated, the iclog is pushed up to the 134 * disk. If a busy block is allocated, the iclog is pushed up to the
135 * LSN that freed the block. 135 * LSN that freed the block.
136 */ 136 */
137 xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1); 137 xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
138 xfs_trans_agbtree_delta(cur->bc_tp, -1); 138 xfs_trans_agbtree_delta(cur->bc_tp, -1);
139 return 0; 139 return 0;
140} 140}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727b..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
64 nbytes = last - first + 1; 64 nbytes = last - first + 1;
65 bfset(bip->bli_logged, first, nbytes); 65 bfset(bip->bli_logged, first, nbytes);
66 for (x = 0; x < nbytes; x++) { 66 for (x = 0; x < nbytes; x++) {
67 chunk_num = byte >> XFS_BLI_SHIFT; 67 chunk_num = byte >> XFS_BLF_SHIFT;
68 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 68 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
69 bit_num = chunk_num & (NBWORD - 1); 69 bit_num = chunk_num & (NBWORD - 1);
70 wordp = &(bip->bli_format.blf_data_map[word_num]); 70 wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
166 * cancel flag in it. 166 * cancel flag in it.
167 */ 167 */
168 trace_xfs_buf_item_size_stale(bip); 168 trace_xfs_buf_item_size_stale(bip);
169 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 169 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
170 return 1; 170 return 1;
171 } 171 }
172 172
@@ -197,9 +197,9 @@ xfs_buf_item_size(
197 } else if (next_bit != last_bit + 1) { 197 } else if (next_bit != last_bit + 1) {
198 last_bit = next_bit; 198 last_bit = next_bit;
199 nvecs++; 199 nvecs++;
200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 200 } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
201 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 201 (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
202 XFS_BLI_CHUNK)) { 202 XFS_BLF_CHUNK)) {
203 last_bit = next_bit; 203 last_bit = next_bit;
204 nvecs++; 204 nvecs++;
205 } else { 205 } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
254 vecp++; 254 vecp++;
255 nvecs = 1; 255 nvecs = 1;
256 256
257 /*
258 * If it is an inode buffer, transfer the in-memory state to the
259 * format flags and clear the in-memory state. We do not transfer
260 * this state if the inode buffer allocation has not yet been committed
261 * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
262 * correct replay of the inode allocation.
263 */
264 if (bip->bli_flags & XFS_BLI_INODE_BUF) {
265 if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
266 xfs_log_item_in_current_chkpt(&bip->bli_item)))
267 bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
268 bip->bli_flags &= ~XFS_BLI_INODE_BUF;
269 }
270
257 if (bip->bli_flags & XFS_BLI_STALE) { 271 if (bip->bli_flags & XFS_BLI_STALE) {
258 /* 272 /*
259 * The buffer is stale, so all we need to log 273 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
261 * cancel flag in it. 275 * cancel flag in it.
262 */ 276 */
263 trace_xfs_buf_item_format_stale(bip); 277 trace_xfs_buf_item_format_stale(bip);
264 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 278 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
265 bip->bli_format.blf_size = nvecs; 279 bip->bli_format.blf_size = nvecs;
266 return; 280 return;
267 } 281 }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
294 * keep counting and scanning. 308 * keep counting and scanning.
295 */ 309 */
296 if (next_bit == -1) { 310 if (next_bit == -1) {
297 buffer_offset = first_bit * XFS_BLI_CHUNK; 311 buffer_offset = first_bit * XFS_BLF_CHUNK;
298 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 312 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
299 vecp->i_len = nbits * XFS_BLI_CHUNK; 313 vecp->i_len = nbits * XFS_BLF_CHUNK;
300 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 314 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
301 nvecs++; 315 nvecs++;
302 break; 316 break;
303 } else if (next_bit != last_bit + 1) { 317 } else if (next_bit != last_bit + 1) {
304 buffer_offset = first_bit * XFS_BLI_CHUNK; 318 buffer_offset = first_bit * XFS_BLF_CHUNK;
305 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 319 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
306 vecp->i_len = nbits * XFS_BLI_CHUNK; 320 vecp->i_len = nbits * XFS_BLF_CHUNK;
307 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 321 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
308 nvecs++; 322 nvecs++;
309 vecp++; 323 vecp++;
310 first_bit = next_bit; 324 first_bit = next_bit;
311 last_bit = next_bit; 325 last_bit = next_bit;
312 nbits = 1; 326 nbits = 1;
313 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 327 } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
314 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 328 (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
315 XFS_BLI_CHUNK)) { 329 XFS_BLF_CHUNK)) {
316 buffer_offset = first_bit * XFS_BLI_CHUNK; 330 buffer_offset = first_bit * XFS_BLF_CHUNK;
317 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 331 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
318 vecp->i_len = nbits * XFS_BLI_CHUNK; 332 vecp->i_len = nbits * XFS_BLF_CHUNK;
319 vecp->i_type = XLOG_REG_TYPE_BCHUNK; 333 vecp->i_type = XLOG_REG_TYPE_BCHUNK;
320/* You would think we need to bump the nvecs here too, but we do not 334/* You would think we need to bump the nvecs here too, but we do not
321 * this number is used by recovery, and it gets confused by the boundary 335 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
341} 355}
342 356
343/* 357/*
344 * This is called to pin the buffer associated with the buf log 358 * This is called to pin the buffer associated with the buf log item in memory
345 * item in memory so it cannot be written out. Simply call bpin() 359 * so it cannot be written out. Simply call bpin() on the buffer to do this.
346 * on the buffer to do this. 360 *
361 * We also always take a reference to the buffer log item here so that the bli
362 * is held while the item is pinned in memory. This means that we can
363 * unconditionally drop the reference count a transaction holds when the
364 * transaction is completed.
347 */ 365 */
366
348STATIC void 367STATIC void
349xfs_buf_item_pin( 368xfs_buf_item_pin(
350 xfs_buf_log_item_t *bip) 369 xfs_buf_log_item_t *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
356 ASSERT(atomic_read(&bip->bli_refcount) > 0); 375 ASSERT(atomic_read(&bip->bli_refcount) > 0);
357 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 376 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
358 (bip->bli_flags & XFS_BLI_STALE)); 377 (bip->bli_flags & XFS_BLI_STALE));
378 atomic_inc(&bip->bli_refcount);
359 trace_xfs_buf_item_pin(bip); 379 trace_xfs_buf_item_pin(bip);
360 xfs_bpin(bp); 380 xfs_bpin(bp);
361} 381}
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
393 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 413 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
394 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 414 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
395 ASSERT(XFS_BUF_ISSTALE(bp)); 415 ASSERT(XFS_BUF_ISSTALE(bp));
396 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 416 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
397 trace_xfs_buf_item_unpin_stale(bip); 417 trace_xfs_buf_item_unpin_stale(bip);
398 418
399 /* 419 /*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
489} 509}
490 510
491/* 511/*
492 * Release the buffer associated with the buf log item. 512 * Release the buffer associated with the buf log item. If there is no dirty
493 * If there is no dirty logged data associated with the 513 * logged data associated with the buffer recorded in the buf log item, then
494 * buffer recorded in the buf log item, then free the 514 * free the buf log item and remove the reference to it in the buffer.
495 * buf log item and remove the reference to it in the 515 *
496 * buffer. 516 * This call ignores the recursion count. It is only called when the buffer
517 * should REALLY be unlocked, regardless of the recursion count.
497 * 518 *
498 * This call ignores the recursion count. It is only called 519 * We unconditionally drop the transaction's reference to the log item. If the
499 * when the buffer should REALLY be unlocked, regardless 520 * item was logged, then another reference was taken when it was pinned, so we
500 * of the recursion count. 521 * can safely drop the transaction reference now. This also allows us to avoid
522 * potential races with the unpin code freeing the bli by not referencing the
523 * bli after we've dropped the reference count.
501 * 524 *
502 * If the XFS_BLI_HOLD flag is set in the buf log item, then 525 * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
503 * free the log item if necessary but do not unlock the buffer. 526 * if necessary but do not unlock the buffer. This is for support of
504 * This is for support of xfs_trans_bhold(). Make sure the 527 * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
505 * XFS_BLI_HOLD field is cleared if we don't free the item. 528 * free the item.
506 */ 529 */
507STATIC void 530STATIC void
508xfs_buf_item_unlock( 531xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
514 537
515 bp = bip->bli_buf; 538 bp = bip->bli_buf;
516 539
517 /* 540 /* Clear the buffer's association with this transaction. */
518 * Clear the buffer's association with this transaction.
519 */
520 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 541 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
521 542
522 /* 543 /*
523 * If this is a transaction abort, don't return early. 544 * If this is a transaction abort, don't return early. Instead, allow
524 * Instead, allow the brelse to happen. 545 * the brelse to happen. Normally it would be done for stale
525 * Normally it would be done for stale (cancelled) buffers 546 * (cancelled) buffers at unpin time, but we'll never go through the
526 * at unpin time, but we'll never go through the pin/unpin 547 * pin/unpin cycle if we abort inside commit.
527 * cycle if we abort inside commit.
528 */ 548 */
529 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 549 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
530 550
531 /* 551 /*
532 * If the buf item is marked stale, then don't do anything. 552 * Before possibly freeing the buf item, determine if we should
533 * We'll unlock the buffer and free the buf item when the 553 * release the buffer at the end of this routine.
534 * buffer is unpinned for the last time.
535 */ 554 */
536 if (bip->bli_flags & XFS_BLI_STALE) { 555 hold = bip->bli_flags & XFS_BLI_HOLD;
537 bip->bli_flags &= ~XFS_BLI_LOGGED; 556
538 trace_xfs_buf_item_unlock_stale(bip); 557 /* Clear the per transaction state. */
539 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 558 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
540 if (!aborted)
541 return;
542 }
543 559
544 /* 560 /*
545 * Drop the transaction's reference to the log item if 561 * If the buf item is marked stale, then don't do anything. We'll
546 * it was not logged as part of the transaction. Otherwise 562 * unlock the buffer and free the buf item when the buffer is unpinned
547 * we'll drop the reference in xfs_buf_item_unpin() when 563 * for the last time.
548 * the transaction is really through with the buffer.
549 */ 564 */
550 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 565 if (bip->bli_flags & XFS_BLI_STALE) {
551 atomic_dec(&bip->bli_refcount); 566 trace_xfs_buf_item_unlock_stale(bip);
552 } else { 567 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
553 /* 568 if (!aborted) {
554 * Clear the logged flag since this is per 569 atomic_dec(&bip->bli_refcount);
555 * transaction state. 570 return;
556 */ 571 }
557 bip->bli_flags &= ~XFS_BLI_LOGGED;
558 } 572 }
559 573
560 /*
561 * Before possibly freeing the buf item, determine if we should
562 * release the buffer at the end of this routine.
563 */
564 hold = bip->bli_flags & XFS_BLI_HOLD;
565 trace_xfs_buf_item_unlock(bip); 574 trace_xfs_buf_item_unlock(bip);
566 575
567 /* 576 /*
568 * If the buf item isn't tracking any data, free it. 577 * If the buf item isn't tracking any data, free it, otherwise drop the
569 * Otherwise, if XFS_BLI_HOLD is set clear it. 578 * reference we hold to it.
570 */ 579 */
571 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 580 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
572 bip->bli_format.blf_map_size)) { 581 bip->bli_format.blf_map_size))
573 xfs_buf_item_relse(bp); 582 xfs_buf_item_relse(bp);
574 } else if (hold) { 583 else
575 bip->bli_flags &= ~XFS_BLI_HOLD; 584 atomic_dec(&bip->bli_refcount);
576 }
577 585
578 /* 586 if (!hold)
579 * Release the buffer if XFS_BLI_HOLD was not set.
580 */
581 if (!hold) {
582 xfs_buf_relse(bp); 587 xfs_buf_relse(bp);
583 }
584} 588}
585 589
586/* 590/*
@@ -717,12 +721,12 @@ xfs_buf_item_init(
717 } 721 }
718 722
719 /* 723 /*
720 * chunks is the number of XFS_BLI_CHUNK size pieces 724 * chunks is the number of XFS_BLF_CHUNK size pieces
721 * the buffer can be divided into. Make sure not to 725 * the buffer can be divided into. Make sure not to
722 * truncate any pieces. map_size is the size of the 726 * truncate any pieces. map_size is the size of the
723 * bitmap needed to describe the chunks of the buffer. 727 * bitmap needed to describe the chunks of the buffer.
724 */ 728 */
725 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 729 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
726 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 730 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
727 731
728 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 732 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
790 /* 794 /*
791 * Convert byte offsets to bit numbers. 795 * Convert byte offsets to bit numbers.
792 */ 796 */
793 first_bit = first >> XFS_BLI_SHIFT; 797 first_bit = first >> XFS_BLF_SHIFT;
794 last_bit = last >> XFS_BLI_SHIFT; 798 last_bit = last >> XFS_BLF_SHIFT;
795 799
796 /* 800 /*
797 * Calculate the total number of bits to be set. 801 * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f73..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
41 * This flag indicates that the buffer contains on disk inodes 41 * This flag indicates that the buffer contains on disk inodes
42 * and requires special recovery handling. 42 * and requires special recovery handling.
43 */ 43 */
44#define XFS_BLI_INODE_BUF 0x1 44#define XFS_BLF_INODE_BUF 0x1
45/* 45/*
46 * This flag indicates that the buffer should not be replayed 46 * This flag indicates that the buffer should not be replayed
47 * during recovery because its blocks are being freed. 47 * during recovery because its blocks are being freed.
48 */ 48 */
49#define XFS_BLI_CANCEL 0x2 49#define XFS_BLF_CANCEL 0x2
50/* 50/*
51 * This flag indicates that the buffer contains on disk 51 * This flag indicates that the buffer contains on disk
52 * user or group dquots and may require special recovery handling. 52 * user or group dquots and may require special recovery handling.
53 */ 53 */
54#define XFS_BLI_UDQUOT_BUF 0x4 54#define XFS_BLF_UDQUOT_BUF 0x4
55#define XFS_BLI_PDQUOT_BUF 0x8 55#define XFS_BLF_PDQUOT_BUF 0x8
56#define XFS_BLI_GDQUOT_BUF 0x10 56#define XFS_BLF_GDQUOT_BUF 0x10
57 57
58#define XFS_BLI_CHUNK 128 58#define XFS_BLF_CHUNK 128
59#define XFS_BLI_SHIFT 7 59#define XFS_BLF_SHIFT 7
60#define BIT_TO_WORD_SHIFT 5 60#define BIT_TO_WORD_SHIFT 5
61#define NBWORD (NBBY * sizeof(unsigned int)) 61#define NBWORD (NBBY * sizeof(unsigned int))
62 62
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
69#define XFS_BLI_LOGGED 0x08 69#define XFS_BLI_LOGGED 0x08
70#define XFS_BLI_INODE_ALLOC_BUF 0x10 70#define XFS_BLI_INODE_ALLOC_BUF 0x10
71#define XFS_BLI_STALE_INODE 0x20 71#define XFS_BLI_STALE_INODE 0x20
72#define XFS_BLI_INODE_BUF 0x40
72 73
73#define XFS_BLI_FLAGS \ 74#define XFS_BLI_FLAGS \
74 { XFS_BLI_HOLD, "HOLD" }, \ 75 { XFS_BLI_HOLD, "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
76 { XFS_BLI_STALE, "STALE" }, \ 77 { XFS_BLI_STALE, "STALE" }, \
77 { XFS_BLI_LOGGED, "LOGGED" }, \ 78 { XFS_BLI_LOGGED, "LOGGED" }, \
78 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \ 79 { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
79 { XFS_BLI_STALE_INODE, "STALE_INODE" } 80 { XFS_BLI_STALE_INODE, "STALE_INODE" }, \
81 { XFS_BLI_INODE_BUF, "INODE_BUF" }
80 82
81 83
82#ifdef __KERNEL__ 84#ifdef __KERNEL__
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c0744..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
170 va_list ap; 170 va_list ap;
171 171
172#ifdef DEBUG 172#ifdef DEBUG
173 xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT; 173 xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
174#endif 174#endif
175 175
176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag) 176 if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..75df75f43d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
382 382
383 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
385 if (!pag->pagi_inodeok)
386 return EINVAL;
387 ASSERT(pag->pag_ici_init);
388 agino = XFS_INO_TO_AGINO(mp, ino); 385 agino = XFS_INO_TO_AGINO(mp, ino);
389 386
390again: 387again:
@@ -744,30 +741,24 @@ xfs_ilock_demote(
744} 741}
745 742
746#ifdef DEBUG 743#ifdef DEBUG
747/*
748 * Debug-only routine, without additional rw_semaphore APIs, we can
749 * now only answer requests regarding whether we hold the lock for write
750 * (reader state is outside our visibility, we only track writer state).
751 *
752 * Note: this means !xfs_isilocked would give false positives, so don't do that.
753 */
754int 744int
755xfs_isilocked( 745xfs_isilocked(
756 xfs_inode_t *ip, 746 xfs_inode_t *ip,
757 uint lock_flags) 747 uint lock_flags)
758{ 748{
759 if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == 749 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
760 XFS_ILOCK_EXCL) { 750 if (!(lock_flags & XFS_ILOCK_SHARED))
761 if (!ip->i_lock.mr_writer) 751 return !!ip->i_lock.mr_writer;
762 return 0; 752 return rwsem_is_locked(&ip->i_lock.mr_lock);
763 } 753 }
764 754
765 if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == 755 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
766 XFS_IOLOCK_EXCL) { 756 if (!(lock_flags & XFS_IOLOCK_SHARED))
767 if (!ip->i_iolock.mr_writer) 757 return !!ip->i_iolock.mr_writer;
768 return 0; 758 return rwsem_is_locked(&ip->i_iolock.mr_lock);
769 } 759 }
770 760
771 return 1; 761 ASSERT(0);
762 return 0;
772} 763}
773#endif 764#endif
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9c..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1940 int blks_per_cluster;
1941 int nbufs; 1941 int nbufs;
1942 int ninodes; 1942 int ninodes;
1943 int i, j, found, pre_flushed; 1943 int i, j;
1944 xfs_daddr_t blkno; 1944 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1945 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1949 struct xfs_perag *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1961 }
1962 1962
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1963 for (j = 0; j < nbufs; j++, inum += ninodes) {
1964 int found = 0;
1965
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1967 XFS_INO_TO_AGBNO(mp, inum));
1968 1968
1969 /*
1970 * We obtain and lock the backing buffer first in the process
1971 * here, as we have to ensure that any dirty inode that we
1972 * can't get the flush lock on is attached to the buffer.
1973 * If we scan the in-memory inodes first, then buffer IO can
1974 * complete before we get a lock on it, and hence we may fail
1975 * to mark all the active inodes on the buffer stale.
1976 */
1977 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1978 mp->m_bsize * blks_per_cluster,
1979 XBF_LOCK);
1980
1981 /*
1982 * Walk the inodes already attached to the buffer and mark them
1983 * stale. These will all have the flush locks held, so an
1984 * in-memory inode walk can't lock them.
1985 */
1986 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1987 while (lip) {
1988 if (lip->li_type == XFS_LI_INODE) {
1989 iip = (xfs_inode_log_item_t *)lip;
1990 ASSERT(iip->ili_logged == 1);
1991 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
1992 xfs_trans_ail_copy_lsn(mp->m_ail,
1993 &iip->ili_flush_lsn,
1994 &iip->ili_item.li_lsn);
1995 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1996 found++;
1997 }
1998 lip = lip->li_bio_list;
1999 }
1969 2000
1970 /* 2001 /*
1971 * Look for each inode in memory and attempt to lock it, 2002 * For each inode in memory attempt to add it to the inode
1972 * we can be racing with flush and tail pushing here. 2003 * buffer and set it up for being staled on buffer IO
1973 * any inode we get the locks on, add to an array of 2004 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 2005 * and flushing by locking the buffer.
1975 * 2006 *
1976 * The get the buffer lock, we could beat a flush 2007 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 2008 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 2009 * even trying to lock them.
1979 * and fail, we need some other form of interlock
1980 * here.
1981 */ 2010 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2011 for (i = 0; i < ninodes; i++) {
1984 read_lock(&pag->pag_ici_lock); 2012 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2013 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2014 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2015
1988 /* Inode not in memory or we found it already, 2016 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2017 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2018 read_unlock(&pag->pag_ici_lock);
1993 continue; 2019 continue;
1994 } 2020 }
1995 2021
1996 if (xfs_inode_clean(ip)) { 2022 /* don't try to lock/unlock the current inode */
1997 read_unlock(&pag->pag_ici_lock); 2023 if (ip != free_ip &&
1998 continue; 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1999 }
2000
2001 /* If we can get the locks then add it to the
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, lets
2008 * keep it that way.
2009 */
2010
2011 if (ip == free_ip) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2021 continue; 2026 continue;
2022 } 2027 }
2028 read_unlock(&pag->pag_ici_lock);
2023 2029
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2030 if (!xfs_iflock_nowait(ip)) {
2025 if (xfs_iflock_nowait(ip)) { 2031 if (ip != free_ip)
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 } 2033 continue;
2037 } 2034 }
2038 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2035
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2036 xfs_iflags_set(ip, XFS_ISTALE);
2042 mp->m_bsize * blks_per_cluster, 2037 if (xfs_inode_clean(ip)) {
2043 XBF_LOCK); 2038 ASSERT(ip != free_ip);
2044 2039 xfs_ifunlock(ip);
2045 pre_flushed = 0; 2040 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2041 continue;
2047 while (lip) {
2048 if (lip->li_type == XFS_LI_INODE) {
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 } 2042 }
2058 lip = lip->li_bio_list;
2059 }
2060 2043
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2044 iip = ip->i_itemp;
2064
2065 if (!iip) { 2045 if (!iip) {
2046 /* inode with unlogged changes only */
2047 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2048 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2049 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2050 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2069 continue; 2051 continue;
2070 } 2052 }
2053 found++;
2071 2054
2072 iip->ili_last_fields = iip->ili_format.ilf_fields; 2055 iip->ili_last_fields = iip->ili_format.ilf_fields;
2073 iip->ili_format.ilf_fields = 0; 2056 iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
2078 xfs_buf_attach_iodone(bp, 2061 xfs_buf_attach_iodone(bp,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2062 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2063 xfs_istale_done, (xfs_log_item_t *)iip);
2081 if (ip != free_ip) { 2064
2065 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2066 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2067 }
2085 2068
2086 if (found || pre_flushed) 2069 if (found)
2087 xfs_trans_stale_inode_buf(tp, bp); 2070 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2071 xfs_trans_binval(tp, bp);
2089 } 2072 }
2090 2073
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2074 xfs_perag_put(pag);
2093} 2075}
2094 2076
@@ -2649,8 +2631,6 @@ xfs_iflush_cluster(
2649 int i; 2631 int i;
2650 2632
2651 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2633 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2652 ASSERT(pag->pagi_inodeok);
2653 ASSERT(pag->pag_ici_init);
2654 2634
2655 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2635 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2656 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2636 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72a..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp,
54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); 54STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes);
55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); 55STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
56STATIC void xlog_dealloc_log(xlog_t *log); 56STATIC void xlog_dealloc_log(xlog_t *log);
57STATIC int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
58 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
59 xlog_in_core_t **commit_iclog, uint flags);
60 57
61/* local state machine functions */ 58/* local state machine functions */
62STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int); 59STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t *log,
86STATIC void xlog_ungrant_log_space(xlog_t *log, 83STATIC void xlog_ungrant_log_space(xlog_t *log,
87 xlog_ticket_t *ticket); 84 xlog_ticket_t *ticket);
88 85
89
90/* local ticket functions */
91STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
92 int unit_bytes,
93 int count,
94 char clientid,
95 uint flags);
96
97#if defined(DEBUG) 86#if defined(DEBUG)
98STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr); 87STATIC void xlog_verify_dest_ptr(xlog_t *log, char *ptr);
99STATIC void xlog_verify_grant_head(xlog_t *log, int equals); 88STATIC void xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
360 ASSERT(flags & XFS_LOG_PERM_RESERV); 349 ASSERT(flags & XFS_LOG_PERM_RESERV);
361 internal_ticket = *ticket; 350 internal_ticket = *ticket;
362 351
352 /*
353 * this is a new transaction on the ticket, so we need to
354 * change the transaction ID so that the next transaction has a
355 * different TID in the log. Just add one to the existing tid
356 * so that we can see chains of rolling transactions in the log
357 * easily.
358 */
359 internal_ticket->t_tid++;
360
363 trace_xfs_log_reserve(log, internal_ticket); 361 trace_xfs_log_reserve(log, internal_ticket);
364 362
365 xlog_grant_push_ail(mp, internal_ticket->t_unit_res); 363 xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -367,7 +365,8 @@ xfs_log_reserve(
367 } else { 365 } else {
368 /* may sleep if need to allocate more tickets */ 366 /* may sleep if need to allocate more tickets */
369 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt, 367 internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
370 client, flags); 368 client, flags,
369 KM_SLEEP|KM_MAYFAIL);
371 if (!internal_ticket) 370 if (!internal_ticket)
372 return XFS_ERROR(ENOMEM); 371 return XFS_ERROR(ENOMEM);
373 internal_ticket->t_trans_type = t_type; 372 internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
452 /* Normal transactions can now occur */ 451 /* Normal transactions can now occur */
453 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 452 mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
454 453
454 /*
455 * Now the log has been fully initialised and we know were our
456 * space grant counters are, we can initialise the permanent ticket
457 * needed for delayed logging to work.
458 */
459 xlog_cil_init_post_recovery(mp->m_log);
460
455 return 0; 461 return 0;
456 462
457out_destroy_ail: 463out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
658 item->li_ailp = mp->m_ail; 664 item->li_ailp = mp->m_ail;
659 item->li_type = type; 665 item->li_type = type;
660 item->li_ops = ops; 666 item->li_ops = ops;
667 item->li_lv = NULL;
668
669 INIT_LIST_HEAD(&item->li_ail);
670 INIT_LIST_HEAD(&item->li_cil);
661} 671}
662 672
663/* 673/*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1168 *iclogp = log->l_iclog; /* complete ring */ 1178 *iclogp = log->l_iclog; /* complete ring */
1169 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */ 1179 log->l_iclog->ic_prev = prev_iclog; /* re-write 1st prev ptr */
1170 1180
1181 error = xlog_cil_init(log);
1182 if (error)
1183 goto out_free_iclog;
1171 return log; 1184 return log;
1172 1185
1173out_free_iclog: 1186out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
1494 xlog_in_core_t *iclog, *next_iclog; 1507 xlog_in_core_t *iclog, *next_iclog;
1495 int i; 1508 int i;
1496 1509
1510 xlog_cil_destroy(log);
1511
1497 iclog = log->l_iclog; 1512 iclog = log->l_iclog;
1498 for (i=0; i<log->l_iclog_bufs; i++) { 1513 for (i=0; i<log->l_iclog_bufs; i++) {
1499 sv_destroy(&iclog->ic_force_wait); 1514 sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t *log,
1536 * print out info relating to regions written which consume 1551 * print out info relating to regions written which consume
1537 * the reservation 1552 * the reservation
1538 */ 1553 */
1539STATIC void 1554void
1540xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket) 1555xlog_print_tic_res(
1556 struct xfs_mount *mp,
1557 struct xlog_ticket *ticket)
1541{ 1558{
1542 uint i; 1559 uint i;
1543 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t); 1560 uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
1637 "bad-rtype" : res_type_str[r_type-1]), 1654 "bad-rtype" : res_type_str[r_type-1]),
1638 ticket->t_res_arr[i].r_len); 1655 ticket->t_res_arr[i].r_len);
1639 } 1656 }
1657
1658 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
1659 "xfs_log_write: reservation ran out. Need to up reservation");
1660 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
1640} 1661}
1641 1662
1642/* 1663/*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
1865 * we don't update ic_offset until the end when we know exactly how many 1886 * we don't update ic_offset until the end when we know exactly how many
1866 * bytes have been written out. 1887 * bytes have been written out.
1867 */ 1888 */
1868STATIC int 1889int
1869xlog_write( 1890xlog_write(
1870 struct log *log, 1891 struct log *log,
1871 struct xfs_log_vec *log_vector, 1892 struct xfs_log_vec *log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
1889 *start_lsn = 0; 1910 *start_lsn = 0;
1890 1911
1891 len = xlog_write_calc_vec_length(ticket, log_vector); 1912 len = xlog_write_calc_vec_length(ticket, log_vector);
1892 if (ticket->t_curr_res < len) { 1913 if (log->l_cilp) {
1893 xlog_print_tic_res(log->l_mp, ticket); 1914 /*
1894#ifdef DEBUG 1915 * Region headers and bytes are already accounted for.
1895 xlog_panic( 1916 * We only need to take into account start records and
1896 "xfs_log_write: reservation ran out. Need to up reservation"); 1917 * split regions in this function.
1897#else 1918 */
1898 /* Customer configurable panic */ 1919 if (ticket->t_flags & XLOG_TIC_INITED)
1899 xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp, 1920 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1900 "xfs_log_write: reservation ran out. Need to up reservation");
1901 1921
1902 /* If we did not panic, shutdown the filesystem */ 1922 /*
1903 xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE); 1923 * Commit record headers need to be accounted for. These
1904#endif 1924 * come in as separate writes so are easy to detect.
1905 } 1925 */
1926 if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
1927 ticket->t_curr_res -= sizeof(xlog_op_header_t);
1928 } else
1929 ticket->t_curr_res -= len;
1906 1930
1907 ticket->t_curr_res -= len; 1931 if (ticket->t_curr_res < 0)
1932 xlog_print_tic_res(log->l_mp, ticket);
1908 1933
1909 index = 0; 1934 index = 0;
1910 lv = log_vector; 1935 lv = log_vector;
@@ -3000,6 +3025,8 @@ _xfs_log_force(
3000 3025
3001 XFS_STATS_INC(xs_log_force); 3026 XFS_STATS_INC(xs_log_force);
3002 3027
3028 xlog_cil_push(log, 1);
3029
3003 spin_lock(&log->l_icloglock); 3030 spin_lock(&log->l_icloglock);
3004 3031
3005 iclog = log->l_iclog; 3032 iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
3149 3176
3150 XFS_STATS_INC(xs_log_force); 3177 XFS_STATS_INC(xs_log_force);
3151 3178
3179 if (log->l_cilp) {
3180 lsn = xlog_cil_push_lsn(log, lsn);
3181 if (lsn == NULLCOMMITLSN)
3182 return 0;
3183 }
3184
3152try_again: 3185try_again:
3153 spin_lock(&log->l_icloglock); 3186 spin_lock(&log->l_icloglock);
3154 iclog = log->l_iclog; 3187 iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
3313 return ticket; 3346 return ticket;
3314} 3347}
3315 3348
3349xlog_tid_t
3350xfs_log_get_trans_ident(
3351 struct xfs_trans *tp)
3352{
3353 return tp->t_ticket->t_tid;
3354}
3355
3316/* 3356/*
3317 * Allocate and initialise a new log ticket. 3357 * Allocate and initialise a new log ticket.
3318 */ 3358 */
3319STATIC xlog_ticket_t * 3359xlog_ticket_t *
3320xlog_ticket_alloc( 3360xlog_ticket_alloc(
3321 struct log *log, 3361 struct log *log,
3322 int unit_bytes, 3362 int unit_bytes,
3323 int cnt, 3363 int cnt,
3324 char client, 3364 char client,
3325 uint xflags) 3365 uint xflags,
3366 int alloc_flags)
3326{ 3367{
3327 struct xlog_ticket *tic; 3368 struct xlog_ticket *tic;
3328 uint num_headers; 3369 uint num_headers;
3329 int iclog_space; 3370 int iclog_space;
3330 3371
3331 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL); 3372 tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
3332 if (!tic) 3373 if (!tic)
3333 return NULL; 3374 return NULL;
3334 3375
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
3647 * c. nothing new gets queued up after (a) and (b) are done. 3688 * c. nothing new gets queued up after (a) and (b) are done.
3648 * d. if !logerror, flush the iclogs to disk, then seal them off 3689 * d. if !logerror, flush the iclogs to disk, then seal them off
3649 * for business. 3690 * for business.
3691 *
3692 * Note: for delayed logging the !logerror case needs to flush the regions
3693 * held in memory out to the iclogs before flushing them to disk. This needs
3694 * to be done before the log is marked as shutdown, otherwise the flush to the
3695 * iclogs will fail.
3650 */ 3696 */
3651int 3697int
3652xfs_log_force_umount( 3698xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
3680 return 1; 3726 return 1;
3681 } 3727 }
3682 retval = 0; 3728 retval = 0;
3729
3730 /*
3731 * Flush the in memory commit item list before marking the log as
3732 * being shut down. We need to do it in this order to ensure all the
3733 * completed transactions are flushed to disk with the xfs_log_force()
3734 * call below.
3735 */
3736 if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
3737 xlog_cil_push(log, 1);
3738
3683 /* 3739 /*
3684 * We must hold both the GRANT lock and the LOG lock, 3740 * We must hold both the GRANT lock and the LOG lock,
3685 * before we mark the filesystem SHUTDOWN and wake 3741 * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9a..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
19#define __XFS_LOG_H__ 19#define __XFS_LOG_H__
20 20
21/* get lsn fields */ 21/* get lsn fields */
22
23#define CYCLE_LSN(lsn) ((uint)((lsn)>>32)) 22#define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
24#define BLOCK_LSN(lsn) ((uint)(lsn)) 23#define BLOCK_LSN(lsn) ((uint)(lsn))
25 24
@@ -114,6 +113,9 @@ struct xfs_log_vec {
114 struct xfs_log_vec *lv_next; /* next lv in build list */ 113 struct xfs_log_vec *lv_next; /* next lv in build list */
115 int lv_niovecs; /* number of iovecs in lv */ 114 int lv_niovecs; /* number of iovecs in lv */
116 struct xfs_log_iovec *lv_iovecp; /* iovec array */ 115 struct xfs_log_iovec *lv_iovecp; /* iovec array */
116 struct xfs_log_item *lv_item; /* owner */
117 char *lv_buf; /* formatted buffer */
118 int lv_buf_len; /* size of formatted buffer */
117}; 119};
118 120
119/* 121/*
@@ -134,6 +136,7 @@ struct xlog_in_core;
134struct xlog_ticket; 136struct xlog_ticket;
135struct xfs_log_item; 137struct xfs_log_item;
136struct xfs_item_ops; 138struct xfs_item_ops;
139struct xfs_trans;
137 140
138void xfs_log_item_init(struct xfs_mount *mp, 141void xfs_log_item_init(struct xfs_mount *mp,
139 struct xfs_log_item *item, 142 struct xfs_log_item *item,
@@ -187,9 +190,16 @@ int xfs_log_need_covered(struct xfs_mount *mp);
187 190
188void xlog_iodone(struct xfs_buf *); 191void xlog_iodone(struct xfs_buf *);
189 192
190struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket); 193struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
191void xfs_log_ticket_put(struct xlog_ticket *ticket); 194void xfs_log_ticket_put(struct xlog_ticket *ticket);
192 195
196xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
197
198int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
199 struct xfs_log_vec *log_vector,
200 xfs_lsn_t *commit_lsn, int flags);
201bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
202
193#endif 203#endif
194 204
195 205
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
1/*
2 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write the Free Software Foundation,
15 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18#include "xfs.h"
19#include "xfs_fs.h"
20#include "xfs_types.h"
21#include "xfs_bit.h"
22#include "xfs_log.h"
23#include "xfs_inum.h"
24#include "xfs_trans.h"
25#include "xfs_trans_priv.h"
26#include "xfs_log_priv.h"
27#include "xfs_sb.h"
28#include "xfs_ag.h"
29#include "xfs_dir2.h"
30#include "xfs_dmapi.h"
31#include "xfs_mount.h"
32#include "xfs_error.h"
33#include "xfs_alloc.h"
34
35/*
36 * Perform initial CIL structure initialisation. If the CIL is not
37 * enabled in this filesystem, ensure the log->l_cilp is null so
38 * we can check this conditional to determine if we are doing delayed
39 * logging or not.
40 */
41int
42xlog_cil_init(
43 struct log *log)
44{
45 struct xfs_cil *cil;
46 struct xfs_cil_ctx *ctx;
47
48 log->l_cilp = NULL;
49 if (!(log->l_mp->m_flags & XFS_MOUNT_DELAYLOG))
50 return 0;
51
52 cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
53 if (!cil)
54 return ENOMEM;
55
56 ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
57 if (!ctx) {
58 kmem_free(cil);
59 return ENOMEM;
60 }
61
62 INIT_LIST_HEAD(&cil->xc_cil);
63 INIT_LIST_HEAD(&cil->xc_committing);
64 spin_lock_init(&cil->xc_cil_lock);
65 init_rwsem(&cil->xc_ctx_lock);
66 sv_init(&cil->xc_commit_wait, SV_DEFAULT, "cilwait");
67
68 INIT_LIST_HEAD(&ctx->committing);
69 INIT_LIST_HEAD(&ctx->busy_extents);
70 ctx->sequence = 1;
71 ctx->cil = cil;
72 cil->xc_ctx = ctx;
73
74 cil->xc_log = log;
75 log->l_cilp = cil;
76 return 0;
77}
78
79void
80xlog_cil_destroy(
81 struct log *log)
82{
83 if (!log->l_cilp)
84 return;
85
86 if (log->l_cilp->xc_ctx) {
87 if (log->l_cilp->xc_ctx->ticket)
88 xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
89 kmem_free(log->l_cilp->xc_ctx);
90 }
91
92 ASSERT(list_empty(&log->l_cilp->xc_cil));
93 kmem_free(log->l_cilp);
94}
95
96/*
97 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
98 * recover, so we don't allow failure here. Also, we allocate in a context that
99 * we don't want to be issuing transactions from, so we need to tell the
100 * allocation code this as well.
101 *
102 * We don't reserve any space for the ticket - we are going to steal whatever
103 * space we require from transactions as they commit. To ensure we reserve all
104 * the space required, we need to set the current reservation of the ticket to
105 * zero so that we know to steal the initial transaction overhead from the
106 * first transaction commit.
107 */
108static struct xlog_ticket *
109xlog_cil_ticket_alloc(
110 struct log *log)
111{
112 struct xlog_ticket *tic;
113
114 tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
115 KM_SLEEP|KM_NOFS);
116 tic->t_trans_type = XFS_TRANS_CHECKPOINT;
117
118 /*
119 * set the current reservation to zero so we know to steal the basic
120 * transaction overhead reservation from the first transaction commit.
121 */
122 tic->t_curr_res = 0;
123 return tic;
124}
125
126/*
127 * After the first stage of log recovery is done, we know where the head and
128 * tail of the log are. We need this log initialisation done before we can
129 * initialise the first CIL checkpoint context.
130 *
131 * Here we allocate a log ticket to track space usage during a CIL push. This
132 * ticket is passed to xlog_write() directly so that we don't slowly leak log
133 * space by failing to account for space used by log headers and additional
134 * region headers for split regions.
135 */
136void
137xlog_cil_init_post_recovery(
138 struct log *log)
139{
140 if (!log->l_cilp)
141 return;
142
143 log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
144 log->l_cilp->xc_ctx->sequence = 1;
145 log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
146 log->l_curr_block);
147}
148
149/*
150 * Insert the log item into the CIL and calculate the difference in space
151 * consumed by the item. Add the space to the checkpoint ticket and calculate
152 * if the change requires additional log metadata. If it does, take that space
153 * as well. Remove the amount of space we addded to the checkpoint ticket from
154 * the current transaction ticket so that the accounting works out correctly.
155 *
156 * If this is the first time the item is being placed into the CIL in this
157 * context, pin it so it can't be written to disk until the CIL is flushed to
158 * the iclog and the iclog written to disk.
159 */
160static void
161xlog_cil_insert(
162 struct log *log,
163 struct xlog_ticket *ticket,
164 struct xfs_log_item *item,
165 struct xfs_log_vec *lv)
166{
167 struct xfs_cil *cil = log->l_cilp;
168 struct xfs_log_vec *old = lv->lv_item->li_lv;
169 struct xfs_cil_ctx *ctx = cil->xc_ctx;
170 int len;
171 int diff_iovecs;
172 int iclog_space;
173
174 if (old) {
175 /* existing lv on log item, space used is a delta */
176 ASSERT(!list_empty(&item->li_cil));
177 ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
178
179 len = lv->lv_buf_len - old->lv_buf_len;
180 diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
181 kmem_free(old->lv_buf);
182 kmem_free(old);
183 } else {
184 /* new lv, must pin the log item */
185 ASSERT(!lv->lv_item->li_lv);
186 ASSERT(list_empty(&item->li_cil));
187
188 len = lv->lv_buf_len;
189 diff_iovecs = lv->lv_niovecs;
190 IOP_PIN(lv->lv_item);
191
192 }
193 len += diff_iovecs * sizeof(xlog_op_header_t);
194
195 /* attach new log vector to log item */
196 lv->lv_item->li_lv = lv;
197
198 spin_lock(&cil->xc_cil_lock);
199 list_move_tail(&item->li_cil, &cil->xc_cil);
200 ctx->nvecs += diff_iovecs;
201
202 /*
203 * If this is the first time the item is being committed to the CIL,
204 * store the sequence number on the log item so we can tell
205 * in future commits whether this is the first checkpoint the item is
206 * being committed into.
207 */
208 if (!item->li_seq)
209 item->li_seq = ctx->sequence;
210
211 /*
212 * Now transfer enough transaction reservation to the context ticket
213 * for the checkpoint. The context ticket is special - the unit
214 * reservation has to grow as well as the current reservation as we
215 * steal from tickets so we can correctly determine the space used
216 * during the transaction commit.
217 */
218 if (ctx->ticket->t_curr_res == 0) {
219 /* first commit in checkpoint, steal the header reservation */
220 ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
221 ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
222 ticket->t_curr_res -= ctx->ticket->t_unit_res;
223 }
224
225 /* do we need space for more log record headers? */
226 iclog_space = log->l_iclog_size - log->l_iclog_hsize;
227 if (len > 0 && (ctx->space_used / iclog_space !=
228 (ctx->space_used + len) / iclog_space)) {
229 int hdrs;
230
231 hdrs = (len + iclog_space - 1) / iclog_space;
232 /* need to take into account split region headers, too */
233 hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
234 ctx->ticket->t_unit_res += hdrs;
235 ctx->ticket->t_curr_res += hdrs;
236 ticket->t_curr_res -= hdrs;
237 ASSERT(ticket->t_curr_res >= len);
238 }
239 ticket->t_curr_res -= len;
240 ctx->space_used += len;
241
242 spin_unlock(&cil->xc_cil_lock);
243}
244
245/*
246 * Format log item into a flat buffers
247 *
248 * For delayed logging, we need to hold a formatted buffer containing all the
249 * changes on the log item. This enables us to relog the item in memory and
250 * write it out asynchronously without needing to relock the object that was
251 * modified at the time it gets written into the iclog.
252 *
253 * This function builds a vector for the changes in each log item in the
254 * transaction. It then works out the length of the buffer needed for each log
255 * item, allocates them and formats the vector for the item into the buffer.
256 * The buffer is then attached to the log item are then inserted into the
257 * Committed Item List for tracking until the next checkpoint is written out.
258 *
259 * We don't set up region headers during this process; we simply copy the
260 * regions into the flat buffer. We can do this because we still have to do a
261 * formatting step to write the regions into the iclog buffer. Writing the
262 * ophdrs during the iclog write means that we can support splitting large
263 * regions across iclog boundares without needing a change in the format of the
264 * item/region encapsulation.
265 *
266 * Hence what we need to do now is change the rewrite the vector array to point
267 * to the copied region inside the buffer we just allocated. This allows us to
268 * format the regions into the iclog as though they are being formatted
269 * directly out of the objects themselves.
270 */
271static void
272xlog_cil_format_items(
273 struct log *log,
274 struct xfs_log_vec *log_vector,
275 struct xlog_ticket *ticket,
276 xfs_lsn_t *start_lsn)
277{
278 struct xfs_log_vec *lv;
279
280 if (start_lsn)
281 *start_lsn = log->l_cilp->xc_ctx->sequence;
282
283 ASSERT(log_vector);
284 for (lv = log_vector; lv; lv = lv->lv_next) {
285 void *ptr;
286 int index;
287 int len = 0;
288
289 /* build the vector array and calculate it's length */
290 IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
291 for (index = 0; index < lv->lv_niovecs; index++)
292 len += lv->lv_iovecp[index].i_len;
293
294 lv->lv_buf_len = len;
295 lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
296 ptr = lv->lv_buf;
297
298 for (index = 0; index < lv->lv_niovecs; index++) {
299 struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
300
301 memcpy(ptr, vec->i_addr, vec->i_len);
302 vec->i_addr = ptr;
303 ptr += vec->i_len;
304 }
305 ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
306
307 xlog_cil_insert(log, ticket, lv->lv_item, lv);
308 }
309}
310
311static void
312xlog_cil_free_logvec(
313 struct xfs_log_vec *log_vector)
314{
315 struct xfs_log_vec *lv;
316
317 for (lv = log_vector; lv; ) {
318 struct xfs_log_vec *next = lv->lv_next;
319 kmem_free(lv->lv_buf);
320 kmem_free(lv);
321 lv = next;
322 }
323}
324
325/*
326 * Commit a transaction with the given vector to the Committed Item List.
327 *
328 * To do this, we need to format the item, pin it in memory if required and
329 * account for the space used by the transaction. Once we have done that we
330 * need to release the unused reservation for the transaction, attach the
331 * transaction to the checkpoint context so we carry the busy extents through
332 * to checkpoint completion, and then unlock all the items in the transaction.
333 *
334 * For more specific information about the order of operations in
335 * xfs_log_commit_cil() please refer to the comments in
336 * xfs_trans_commit_iclog().
337 *
338 * Called with the context lock already held in read mode to lock out
339 * background commit, returns without it held once background commits are
340 * allowed again.
341 */
342int
343xfs_log_commit_cil(
344 struct xfs_mount *mp,
345 struct xfs_trans *tp,
346 struct xfs_log_vec *log_vector,
347 xfs_lsn_t *commit_lsn,
348 int flags)
349{
350 struct log *log = mp->m_log;
351 int log_flags = 0;
352 int push = 0;
353
354 if (flags & XFS_TRANS_RELEASE_LOG_RES)
355 log_flags = XFS_LOG_REL_PERM_RESERV;
356
357 if (XLOG_FORCED_SHUTDOWN(log)) {
358 xlog_cil_free_logvec(log_vector);
359 return XFS_ERROR(EIO);
360 }
361
362 /* lock out background commit */
363 down_read(&log->l_cilp->xc_ctx_lock);
364 xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
365
366 /* check we didn't blow the reservation */
367 if (tp->t_ticket->t_curr_res < 0)
368 xlog_print_tic_res(log->l_mp, tp->t_ticket);
369
370 /* attach the transaction to the CIL if it has any busy extents */
371 if (!list_empty(&tp->t_busy)) {
372 spin_lock(&log->l_cilp->xc_cil_lock);
373 list_splice_init(&tp->t_busy,
374 &log->l_cilp->xc_ctx->busy_extents);
375 spin_unlock(&log->l_cilp->xc_cil_lock);
376 }
377
378 tp->t_commit_lsn = *commit_lsn;
379 xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
380 xfs_trans_unreserve_and_mod_sb(tp);
381
382 /* check for background commit before unlock */
383 if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
384 push = 1;
385 up_read(&log->l_cilp->xc_ctx_lock);
386
387 /*
388 * We need to push CIL every so often so we don't cache more than we
389 * can fit in the log. The limit really is that a checkpoint can't be
390 * more than half the log (the current checkpoint is not allowed to
391 * overwrite the previous checkpoint), but commit latency and memory
392 * usage limit this to a smaller size in most cases.
393 */
394 if (push)
395 xlog_cil_push(log, 0);
396 return 0;
397}
398
399/*
400 * Mark all items committed and clear busy extents. We free the log vector
401 * chains in a separate pass so that we unpin the log items as quickly as
402 * possible.
403 */
404static void
405xlog_cil_committed(
406 void *args,
407 int abort)
408{
409 struct xfs_cil_ctx *ctx = args;
410 struct xfs_log_vec *lv;
411 int abortflag = abort ? XFS_LI_ABORTED : 0;
412 struct xfs_busy_extent *busyp, *n;
413
414 /* unpin all the log items */
415 for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
416 xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
417 abortflag);
418 }
419
420 list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
421 xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
422
423 spin_lock(&ctx->cil->xc_cil_lock);
424 list_del(&ctx->committing);
425 spin_unlock(&ctx->cil->xc_cil_lock);
426
427 xlog_cil_free_logvec(ctx->lv_chain);
428 kmem_free(ctx);
429}
430
431/*
432 * Push the Committed Item List to the log. If the push_now flag is not set,
433 * then it is a background flush and so we can chose to ignore it.
434 */
435int
436xlog_cil_push(
437 struct log *log,
438 int push_now)
439{
440 struct xfs_cil *cil = log->l_cilp;
441 struct xfs_log_vec *lv;
442 struct xfs_cil_ctx *ctx;
443 struct xfs_cil_ctx *new_ctx;
444 struct xlog_in_core *commit_iclog;
445 struct xlog_ticket *tic;
446 int num_lv;
447 int num_iovecs;
448 int len;
449 int error = 0;
450 struct xfs_trans_header thdr;
451 struct xfs_log_iovec lhdr;
452 struct xfs_log_vec lvhdr = { NULL };
453 xfs_lsn_t commit_lsn;
454
455 if (!cil)
456 return 0;
457
458 new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
459 new_ctx->ticket = xlog_cil_ticket_alloc(log);
460
461 /* lock out transaction commit, but don't block on background push */
462 if (!down_write_trylock(&cil->xc_ctx_lock)) {
463 if (!push_now)
464 goto out_free_ticket;
465 down_write(&cil->xc_ctx_lock);
466 }
467 ctx = cil->xc_ctx;
468
469 /* check if we've anything to push */
470 if (list_empty(&cil->xc_cil))
471 goto out_skip;
472
473 /* check for spurious background flush */
474 if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
475 goto out_skip;
476
477 /*
478 * pull all the log vectors off the items in the CIL, and
479 * remove the items from the CIL. We don't need the CIL lock
480 * here because it's only needed on the transaction commit
481 * side which is currently locked out by the flush lock.
482 */
483 lv = NULL;
484 num_lv = 0;
485 num_iovecs = 0;
486 len = 0;
487 while (!list_empty(&cil->xc_cil)) {
488 struct xfs_log_item *item;
489 int i;
490
491 item = list_first_entry(&cil->xc_cil,
492 struct xfs_log_item, li_cil);
493 list_del_init(&item->li_cil);
494 if (!ctx->lv_chain)
495 ctx->lv_chain = item->li_lv;
496 else
497 lv->lv_next = item->li_lv;
498 lv = item->li_lv;
499 item->li_lv = NULL;
500
501 num_lv++;
502 num_iovecs += lv->lv_niovecs;
503 for (i = 0; i < lv->lv_niovecs; i++)
504 len += lv->lv_iovecp[i].i_len;
505 }
506
507 /*
508 * initialise the new context and attach it to the CIL. Then attach
509 * the current context to the CIL committing lsit so it can be found
510 * during log forces to extract the commit lsn of the sequence that
511 * needs to be forced.
512 */
513 INIT_LIST_HEAD(&new_ctx->committing);
514 INIT_LIST_HEAD(&new_ctx->busy_extents);
515 new_ctx->sequence = ctx->sequence + 1;
516 new_ctx->cil = cil;
517 cil->xc_ctx = new_ctx;
518
519 /*
520 * The switch is now done, so we can drop the context lock and move out
521 * of a shared context. We can't just go straight to the commit record,
522 * though - we need to synchronise with previous and future commits so
523 * that the commit records are correctly ordered in the log to ensure
524 * that we process items during log IO completion in the correct order.
525 *
526 * For example, if we get an EFI in one checkpoint and the EFD in the
527 * next (e.g. due to log forces), we do not want the checkpoint with
528 * the EFD to be committed before the checkpoint with the EFI. Hence
529 * we must strictly order the commit records of the checkpoints so
530 * that: a) the checkpoint callbacks are attached to the iclogs in the
531 * correct order; and b) the checkpoints are replayed in correct order
532 * in log recovery.
533 *
534 * Hence we need to add this context to the committing context list so
535 * that higher sequences will wait for us to write out a commit record
536 * before they do.
537 */
538 spin_lock(&cil->xc_cil_lock);
539 list_add(&ctx->committing, &cil->xc_committing);
540 spin_unlock(&cil->xc_cil_lock);
541 up_write(&cil->xc_ctx_lock);
542
543 /*
544 * Build a checkpoint transaction header and write it to the log to
545 * begin the transaction. We need to account for the space used by the
546 * transaction header here as it is not accounted for in xlog_write().
547 *
548 * The LSN we need to pass to the log items on transaction commit is
549 * the LSN reported by the first log vector write. If we use the commit
550 * record lsn then we can move the tail beyond the grant write head.
551 */
552 tic = ctx->ticket;
553 thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
554 thdr.th_type = XFS_TRANS_CHECKPOINT;
555 thdr.th_tid = tic->t_tid;
556 thdr.th_num_items = num_iovecs;
557 lhdr.i_addr = (xfs_caddr_t)&thdr;
558 lhdr.i_len = sizeof(xfs_trans_header_t);
559 lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
560 tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
561
562 lvhdr.lv_niovecs = 1;
563 lvhdr.lv_iovecp = &lhdr;
564 lvhdr.lv_next = ctx->lv_chain;
565
566 error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
567 if (error)
568 goto out_abort;
569
570 /*
571 * now that we've written the checkpoint into the log, strictly
572 * order the commit records so replay will get them in the right order.
573 */
574restart:
575 spin_lock(&cil->xc_cil_lock);
576 list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
577 /*
578 * Higher sequences will wait for this one so skip them.
579 * Don't wait for own own sequence, either.
580 */
581 if (new_ctx->sequence >= ctx->sequence)
582 continue;
583 if (!new_ctx->commit_lsn) {
584 /*
585 * It is still being pushed! Wait for the push to
586 * complete, then start again from the beginning.
587 */
588 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
589 goto restart;
590 }
591 }
592 spin_unlock(&cil->xc_cil_lock);
593
594 commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
595 if (error || commit_lsn == -1)
596 goto out_abort;
597
598 /* attach all the transactions w/ busy extents to iclog */
599 ctx->log_cb.cb_func = xlog_cil_committed;
600 ctx->log_cb.cb_arg = ctx;
601 error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
602 if (error)
603 goto out_abort;
604
605 /*
606 * now the checkpoint commit is complete and we've attached the
607 * callbacks to the iclog we can assign the commit LSN to the context
608 * and wake up anyone who is waiting for the commit to complete.
609 */
610 spin_lock(&cil->xc_cil_lock);
611 ctx->commit_lsn = commit_lsn;
612 sv_broadcast(&cil->xc_commit_wait);
613 spin_unlock(&cil->xc_cil_lock);
614
615 /* release the hounds! */
616 return xfs_log_release_iclog(log->l_mp, commit_iclog);
617
618out_skip:
619 up_write(&cil->xc_ctx_lock);
620out_free_ticket:
621 xfs_log_ticket_put(new_ctx->ticket);
622 kmem_free(new_ctx);
623 return 0;
624
625out_abort:
626 xlog_cil_committed(ctx, XFS_LI_ABORTED);
627 return XFS_ERROR(EIO);
628}
629
630/*
631 * Conditionally push the CIL based on the sequence passed in.
632 *
633 * We only need to push if we haven't already pushed the sequence
634 * number given. Hence the only time we will trigger a push here is
635 * if the push sequence is the same as the current context.
636 *
637 * We return the current commit lsn to allow the callers to determine if a
638 * iclog flush is necessary following this call.
639 *
640 * XXX: Initially, just push the CIL unconditionally and return whatever
641 * commit lsn is there. It'll be empty, so this is broken for now.
642 */
643xfs_lsn_t
644xlog_cil_push_lsn(
645 struct log *log,
646 xfs_lsn_t push_seq)
647{
648 struct xfs_cil *cil = log->l_cilp;
649 struct xfs_cil_ctx *ctx;
650 xfs_lsn_t commit_lsn = NULLCOMMITLSN;
651
652restart:
653 down_write(&cil->xc_ctx_lock);
654 ASSERT(push_seq <= cil->xc_ctx->sequence);
655
656 /* check to see if we need to force out the current context */
657 if (push_seq == cil->xc_ctx->sequence) {
658 up_write(&cil->xc_ctx_lock);
659 xlog_cil_push(log, 1);
660 goto restart;
661 }
662
663 /*
664 * See if we can find a previous sequence still committing.
665 * We can drop the flush lock as soon as we have the cil lock
666 * because we are now only comparing contexts protected by
667 * the cil lock.
668 *
669 * We need to wait for all previous sequence commits to complete
670 * before allowing the force of push_seq to go ahead. Hence block
671 * on commits for those as well.
672 */
673 spin_lock(&cil->xc_cil_lock);
674 up_write(&cil->xc_ctx_lock);
675 list_for_each_entry(ctx, &cil->xc_committing, committing) {
676 if (ctx->sequence > push_seq)
677 continue;
678 if (!ctx->commit_lsn) {
679 /*
680 * It is still being pushed! Wait for the push to
681 * complete, then start again from the beginning.
682 */
683 sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
684 goto restart;
685 }
686 if (ctx->sequence != push_seq)
687 continue;
688 /* found it! */
689 commit_lsn = ctx->commit_lsn;
690 }
691 spin_unlock(&cil->xc_cil_lock);
692 return commit_lsn;
693}
694
695/*
696 * Check if the current log item was first committed in this sequence.
697 * We can't rely on just the log item being in the CIL, we have to check
698 * the recorded commit sequence number.
699 *
700 * Note: for this to be used in a non-racy manner, it has to be called with
701 * CIL flushing locked out. As a result, it should only be used during the
702 * transaction commit process when deciding what to format into the item.
703 */
704bool
705xfs_log_item_in_current_chkpt(
706 struct xfs_log_item *lip)
707{
708 struct xfs_cil_ctx *ctx;
709
710 if (!(lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG))
711 return false;
712 if (list_empty(&lip->li_cil))
713 return false;
714
715 ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
716
717 /*
718 * li_seq is written on the first commit of a log item to record the
719 * first checkpoint it is written to. Hence if it is different to the
720 * current sequence, we're in a new checkpoint.
721 */
722 if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
723 return false;
724 return true;
725}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf695154451..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ 152#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being 153#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
154 shutdown */ 154 shutdown */
155typedef __uint32_t xlog_tid_t;
156
157 155
158#ifdef __KERNEL__ 156#ifdef __KERNEL__
159/* 157/*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
379} xlog_in_core_t; 377} xlog_in_core_t;
380 378
381/* 379/*
380 * The CIL context is used to aggregate per-transaction details as well be
381 * passed to the iclog for checkpoint post-commit processing. After being
382 * passed to the iclog, another context needs to be allocated for tracking the
383 * next set of transactions to be aggregated into a checkpoint.
384 */
385struct xfs_cil;
386
387struct xfs_cil_ctx {
388 struct xfs_cil *cil;
389 xfs_lsn_t sequence; /* chkpt sequence # */
390 xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
391 xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
392 struct xlog_ticket *ticket; /* chkpt ticket */
393 int nvecs; /* number of regions */
394 int space_used; /* aggregate size of regions */
395 struct list_head busy_extents; /* busy extents in chkpt */
396 struct xfs_log_vec *lv_chain; /* logvecs being pushed */
397 xfs_log_callback_t log_cb; /* completion callback hook. */
398 struct list_head committing; /* ctx committing list */
399};
400
401/*
402 * Committed Item List structure
403 *
404 * This structure is used to track log items that have been committed but not
405 * yet written into the log. It is used only when the delayed logging mount
406 * option is enabled.
407 *
408 * This structure tracks the list of committing checkpoint contexts so
409 * we can avoid the problem of having to hold out new transactions during a
410 * flush until we have a the commit record LSN of the checkpoint. We can
411 * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
412 * sequence match and extract the commit LSN directly from there. If the
413 * checkpoint is still in the process of committing, we can block waiting for
414 * the commit LSN to be determined as well. This should make synchronous
415 * operations almost as efficient as the old logging methods.
416 */
417struct xfs_cil {
418 struct log *xc_log;
419 struct list_head xc_cil;
420 spinlock_t xc_cil_lock;
421 struct xfs_cil_ctx *xc_ctx;
422 struct rw_semaphore xc_ctx_lock;
423 struct list_head xc_committing;
424 sv_t xc_commit_wait;
425};
426
427/*
428 * The amount of log space we should the CIL to aggregate is difficult to size.
429 * Whatever we chose we have to make we can get a reservation for the log space
430 * effectively, that it is large enough to capture sufficient relogging to
431 * reduce log buffer IO significantly, but it is not too large for the log or
432 * induces too much latency when writing out through the iclogs. We track both
433 * space consumed and the number of vectors in the checkpoint context, so we
434 * need to decide which to use for limiting.
435 *
436 * Every log buffer we write out during a push needs a header reserved, which
437 * is at least one sector and more for v2 logs. Hence we need a reservation of
438 * at least 512 bytes per 32k of log space just for the LR headers. That means
439 * 16KB of reservation per megabyte of delayed logging space we will consume,
440 * plus various headers. The number of headers will vary based on the num of
441 * io vectors, so limiting on a specific number of vectors is going to result
442 * in transactions of varying size. IOWs, it is more consistent to track and
443 * limit space consumed in the log rather than by the number of objects being
444 * logged in order to prevent checkpoint ticket overruns.
445 *
446 * Further, use of static reservations through the log grant mechanism is
447 * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
448 * grant) and a significant deadlock potential because regranting write space
449 * can block on log pushes. Hence if we have to regrant log space during a log
450 * push, we can deadlock.
451 *
452 * However, we can avoid this by use of a dynamic "reservation stealing"
453 * technique during transaction commit whereby unused reservation space in the
454 * transaction ticket is transferred to the CIL ctx commit ticket to cover the
455 * space needed by the checkpoint transaction. This means that we never need to
456 * specifically reserve space for the CIL checkpoint transaction, nor do we
457 * need to regrant space once the checkpoint completes. This also means the
458 * checkpoint transaction ticket is specific to the checkpoint context, rather
459 * than the CIL itself.
460 *
461 * With dynamic reservations, we can basically make up arbitrary limits for the
462 * checkpoint size so long as they don't violate any other size rules. Hence
463 * the initial maximum size for the checkpoint transaction will be set to a
464 * quarter of the log or 8MB, which ever is smaller. 8MB is an arbitrary limit
465 * right now based on the latency of writing out a large amount of data through
466 * the circular iclog buffers.
467 */
468
469#define XLOG_CIL_SPACE_LIMIT(log) \
470 (min((log->l_logsize >> 2), (8 * 1024 * 1024)))
471
472/*
382 * The reservation head lsn is not made up of a cycle number and block number. 473 * The reservation head lsn is not made up of a cycle number and block number.
383 * Instead, it uses a cycle number and byte number. Logs don't expect to 474 * Instead, it uses a cycle number and byte number. Logs don't expect to
384 * overflow 31 bits worth of byte offset, so using a byte number will mean 475 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
388 /* The following fields don't need locking */ 479 /* The following fields don't need locking */
389 struct xfs_mount *l_mp; /* mount point */ 480 struct xfs_mount *l_mp; /* mount point */
390 struct xfs_ail *l_ailp; /* AIL log is working with */ 481 struct xfs_ail *l_ailp; /* AIL log is working with */
482 struct xfs_cil *l_cilp; /* CIL log is working with */
391 struct xfs_buf *l_xbuf; /* extra buffer for log 483 struct xfs_buf *l_xbuf; /* extra buffer for log
392 * wrapping */ 484 * wrapping */
393 struct xfs_buftarg *l_targ; /* buftarg of log */ 485 struct xfs_buftarg *l_targ; /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
438 530
439#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 531#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
440 532
441
442/* common routines */ 533/* common routines */
443extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp); 534extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
444extern int xlog_recover(xlog_t *log); 535extern int xlog_recover(xlog_t *log);
445extern int xlog_recover_finish(xlog_t *log); 536extern int xlog_recover_finish(xlog_t *log);
446extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); 537extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
447 538
448extern kmem_zone_t *xfs_log_ticket_zone; 539extern kmem_zone_t *xfs_log_ticket_zone;
540struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
541 int count, char client, uint xflags,
542 int alloc_flags);
543
449 544
450static inline void 545static inline void
451xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) 546xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
455 *off += bytes; 550 *off += bytes;
456} 551}
457 552
553void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
554int xlog_write(struct log *log, struct xfs_log_vec *log_vector,
555 struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
556 xlog_in_core_t **commit_iclog, uint flags);
557
558/*
559 * Committed Item List interfaces
560 */
561int xlog_cil_init(struct log *log);
562void xlog_cil_init_post_recovery(struct log *log);
563void xlog_cil_destroy(struct log *log);
564
565int xlog_cil_push(struct log *log, int push_now);
566xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
567
458/* 568/*
459 * Unmount record type is used as a pseudo transaction type for the ticket. 569 * Unmount record type is used as a pseudo transaction type for the ticket.
460 * It's value must be outside the range of XFS_TRANS_* values. 570 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e366315..ed0684cc50ee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
132 int nbblks, 132 int nbblks,
133 xfs_buf_t *bp) 133 xfs_buf_t *bp)
134{ 134{
135 xfs_daddr_t offset; 135 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
136 xfs_caddr_t ptr;
137 136
138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); 137 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset); 138 return XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
142
143 return ptr;
144} 139}
145 140
146 141
@@ -1576,7 +1571,7 @@ xlog_recover_reorder_trans(
1576 1571
1577 switch (ITEM_TYPE(item)) { 1572 switch (ITEM_TYPE(item)) {
1578 case XFS_LI_BUF: 1573 case XFS_LI_BUF:
1579 if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) { 1574 if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1580 trace_xfs_log_recover_item_reorder_head(log, 1575 trace_xfs_log_recover_item_reorder_head(log,
1581 trans, item, pass); 1576 trans, item, pass);
1582 list_move(&item->ri_list, &trans->r_itemq); 1577 list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1633,7 @@ xlog_recover_do_buffer_pass1(
1638 /* 1633 /*
1639 * If this isn't a cancel buffer item, then just return. 1634 * If this isn't a cancel buffer item, then just return.
1640 */ 1635 */
1641 if (!(flags & XFS_BLI_CANCEL)) { 1636 if (!(flags & XFS_BLF_CANCEL)) {
1642 trace_xfs_log_recover_buf_not_cancel(log, buf_f); 1637 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1643 return; 1638 return;
1644 } 1639 }
@@ -1696,7 +1691,7 @@ xlog_recover_do_buffer_pass1(
1696 * Check to see whether the buffer being recovered has a corresponding 1691 * Check to see whether the buffer being recovered has a corresponding
1697 * entry in the buffer cancel record table. If it does then return 1 1692 * entry in the buffer cancel record table. If it does then return 1
1698 * so that it will be cancelled, otherwise return 0. If the buffer is 1693 * so that it will be cancelled, otherwise return 0. If the buffer is
1699 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1694 * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1700 * the refcount on the entry in the table and remove it from the table 1695 * the refcount on the entry in the table and remove it from the table
1701 * if this is the last reference. 1696 * if this is the last reference.
1702 * 1697 *
@@ -1721,7 +1716,7 @@ xlog_check_buffer_cancelled(
1721 * There is nothing in the table built in pass one, 1716 * There is nothing in the table built in pass one,
1722 * so this buffer must not be cancelled. 1717 * so this buffer must not be cancelled.
1723 */ 1718 */
1724 ASSERT(!(flags & XFS_BLI_CANCEL)); 1719 ASSERT(!(flags & XFS_BLF_CANCEL));
1725 return 0; 1720 return 0;
1726 } 1721 }
1727 1722
@@ -1733,7 +1728,7 @@ xlog_check_buffer_cancelled(
1733 * There is no corresponding entry in the table built 1728 * There is no corresponding entry in the table built
1734 * in pass one, so this buffer has not been cancelled. 1729 * in pass one, so this buffer has not been cancelled.
1735 */ 1730 */
1736 ASSERT(!(flags & XFS_BLI_CANCEL)); 1731 ASSERT(!(flags & XFS_BLF_CANCEL));
1737 return 0; 1732 return 0;
1738 } 1733 }
1739 1734
@@ -1752,7 +1747,7 @@ xlog_check_buffer_cancelled(
1752 * one in the table and remove it if this is the 1747 * one in the table and remove it if this is the
1753 * last reference. 1748 * last reference.
1754 */ 1749 */
1755 if (flags & XFS_BLI_CANCEL) { 1750 if (flags & XFS_BLF_CANCEL) {
1756 bcp->bc_refcount--; 1751 bcp->bc_refcount--;
1757 if (bcp->bc_refcount == 0) { 1752 if (bcp->bc_refcount == 0) {
1758 if (prevp == NULL) { 1753 if (prevp == NULL) {
@@ -1772,7 +1767,7 @@ xlog_check_buffer_cancelled(
1772 * We didn't find a corresponding entry in the table, so 1767 * We didn't find a corresponding entry in the table, so
1773 * return 0 so that the buffer is NOT cancelled. 1768 * return 0 so that the buffer is NOT cancelled.
1774 */ 1769 */
1775 ASSERT(!(flags & XFS_BLI_CANCEL)); 1770 ASSERT(!(flags & XFS_BLF_CANCEL));
1776 return 0; 1771 return 0;
1777} 1772}
1778 1773
@@ -1874,8 +1869,8 @@ xlog_recover_do_inode_buffer(
1874 nbits = xfs_contig_bits(data_map, map_size, 1869 nbits = xfs_contig_bits(data_map, map_size,
1875 bit); 1870 bit);
1876 ASSERT(nbits > 0); 1871 ASSERT(nbits > 0);
1877 reg_buf_offset = bit << XFS_BLI_SHIFT; 1872 reg_buf_offset = bit << XFS_BLF_SHIFT;
1878 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1873 reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1879 item_index++; 1874 item_index++;
1880 } 1875 }
1881 1876
@@ -1889,7 +1884,7 @@ xlog_recover_do_inode_buffer(
1889 } 1884 }
1890 1885
1891 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1886 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1892 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1887 ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1893 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1888 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1894 1889
1895 /* 1890 /*
@@ -1955,9 +1950,9 @@ xlog_recover_do_reg_buffer(
1955 nbits = xfs_contig_bits(data_map, map_size, bit); 1950 nbits = xfs_contig_bits(data_map, map_size, bit);
1956 ASSERT(nbits > 0); 1951 ASSERT(nbits > 0);
1957 ASSERT(item->ri_buf[i].i_addr != NULL); 1952 ASSERT(item->ri_buf[i].i_addr != NULL);
1958 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1953 ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1959 ASSERT(XFS_BUF_COUNT(bp) >= 1954 ASSERT(XFS_BUF_COUNT(bp) >=
1960 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1955 ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1961 1956
1962 /* 1957 /*
1963 * Do a sanity check if this is a dquot buffer. Just checking 1958 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1961,7 @@ xlog_recover_do_reg_buffer(
1966 */ 1961 */
1967 error = 0; 1962 error = 0;
1968 if (buf_f->blf_flags & 1963 if (buf_f->blf_flags &
1969 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1964 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1970 if (item->ri_buf[i].i_addr == NULL) { 1965 if (item->ri_buf[i].i_addr == NULL) {
1971 cmn_err(CE_ALERT, 1966 cmn_err(CE_ALERT,
1972 "XFS: NULL dquot in %s.", __func__); 1967 "XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1982,9 @@ xlog_recover_do_reg_buffer(
1987 } 1982 }
1988 1983
1989 memcpy(xfs_buf_offset(bp, 1984 memcpy(xfs_buf_offset(bp,
1990 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1985 (uint)bit << XFS_BLF_SHIFT), /* dest */
1991 item->ri_buf[i].i_addr, /* source */ 1986 item->ri_buf[i].i_addr, /* source */
1992 nbits<<XFS_BLI_SHIFT); /* length */ 1987 nbits<<XFS_BLF_SHIFT); /* length */
1993 next: 1988 next:
1994 i++; 1989 i++;
1995 bit += nbits; 1990 bit += nbits;
@@ -2148,11 +2143,11 @@ xlog_recover_do_dquot_buffer(
2148 } 2143 }
2149 2144
2150 type = 0; 2145 type = 0;
2151 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2146 if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2152 type |= XFS_DQ_USER; 2147 type |= XFS_DQ_USER;
2153 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2148 if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2154 type |= XFS_DQ_PROJ; 2149 type |= XFS_DQ_PROJ;
2155 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2150 if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2156 type |= XFS_DQ_GROUP; 2151 type |= XFS_DQ_GROUP;
2157 /* 2152 /*
2158 * This type of quotas was turned off, so ignore this buffer 2153 * This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2168,7 @@ xlog_recover_do_dquot_buffer(
2173 * here which overlaps that may be stale. 2168 * here which overlaps that may be stale.
2174 * 2169 *
2175 * When meta-data buffers are freed at run time we log a buffer item 2170 * When meta-data buffers are freed at run time we log a buffer item
2176 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2171 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2177 * of the buffer in the log should not be replayed at recovery time. 2172 * of the buffer in the log should not be replayed at recovery time.
2178 * This is so that if the blocks covered by the buffer are reused for 2173 * This is so that if the blocks covered by the buffer are reused for
2179 * file data before we crash we don't end up replaying old, freed 2174 * file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2202,7 @@ xlog_recover_do_buffer_trans(
2207 if (pass == XLOG_RECOVER_PASS1) { 2202 if (pass == XLOG_RECOVER_PASS1) {
2208 /* 2203 /*
2209 * In this pass we're only looking for buf items 2204 * In this pass we're only looking for buf items
2210 * with the XFS_BLI_CANCEL bit set. 2205 * with the XFS_BLF_CANCEL bit set.
2211 */ 2206 */
2212 xlog_recover_do_buffer_pass1(log, buf_f); 2207 xlog_recover_do_buffer_pass1(log, buf_f);
2213 return 0; 2208 return 0;
@@ -2244,7 +2239,7 @@ xlog_recover_do_buffer_trans(
2244 2239
2245 mp = log->l_mp; 2240 mp = log->l_mp;
2246 buf_flags = XBF_LOCK; 2241 buf_flags = XBF_LOCK;
2247 if (!(flags & XFS_BLI_INODE_BUF)) 2242 if (!(flags & XFS_BLF_INODE_BUF))
2248 buf_flags |= XBF_MAPPED; 2243 buf_flags |= XBF_MAPPED;
2249 2244
2250 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); 2245 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2252,10 @@ xlog_recover_do_buffer_trans(
2257 } 2252 }
2258 2253
2259 error = 0; 2254 error = 0;
2260 if (flags & XFS_BLI_INODE_BUF) { 2255 if (flags & XFS_BLF_INODE_BUF) {
2261 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2256 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2262 } else if (flags & 2257 } else if (flags &
2263 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2258 (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2264 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2259 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2265 } else { 2260 } else {
2266 xlog_recover_do_reg_buffer(mp, item, bp, buf_f); 2261 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
28#define XLOG_RHASH(tid) \ 28#define XLOG_RHASH(tid) \
29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1)) 29 ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
30 30
31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1) 31#define XLOG_MAX_REGIONS_IN_ITEM (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
32 32
33 33
34/* 34/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1c..d59f4e8bedcf 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
268 268
269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
271 return E2BIG; 271 return EFBIG;
272#else /* Limited by UINT_MAX of sectors */ 272#else /* Limited by UINT_MAX of sectors */
273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) 273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
274 return E2BIG; 274 return EFBIG;
275#endif 275#endif
276 return 0; 276 return 0;
277} 277}
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
394 xfs_fs_mount_cmn_err(flags, 394 xfs_fs_mount_cmn_err(flags,
395 "file system too large to be mounted on this system."); 395 "file system too large to be mounted on this system.");
396 return XFS_ERROR(E2BIG); 396 return XFS_ERROR(EFBIG);
397 } 397 }
398 398
399 if (unlikely(sbp->sb_inprogress)) { 399 if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
413 return 0; 413 return 0;
414} 414}
415 415
416STATIC void
417xfs_initialize_perag_icache(
418 xfs_perag_t *pag)
419{
420 if (!pag->pag_ici_init) {
421 rwlock_init(&pag->pag_ici_lock);
422 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
423 pag->pag_ici_init = 1;
424 }
425}
426
427int 416int
428xfs_initialize_perag( 417xfs_initialize_perag(
429 xfs_mount_t *mp, 418 xfs_mount_t *mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
436 xfs_agino_t agino; 425 xfs_agino_t agino;
437 xfs_ino_t ino; 426 xfs_ino_t ino;
438 xfs_sb_t *sbp = &mp->m_sb; 427 xfs_sb_t *sbp = &mp->m_sb;
439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM; 428 int error = -ENOMEM;
441 429
442 /* Check to see if the filesystem can overflow 32 bit inodes */
443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
445
446 /* 430 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs 431 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the 432 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
456 } 440 }
457 if (!first_initialised) 441 if (!first_initialised)
458 first_initialised = index; 442 first_initialised = index;
443
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 444 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag) 445 if (!pag)
461 goto out_unwind; 446 goto out_unwind;
447 pag->pag_agno = index;
448 pag->pag_mount = mp;
449 rwlock_init(&pag->pag_ici_lock);
450 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
451
462 if (radix_tree_preload(GFP_NOFS)) 452 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind; 453 goto out_unwind;
454
464 spin_lock(&mp->m_perag_lock); 455 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 456 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG(); 457 BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
469 error = -EEXIST; 460 error = -EEXIST;
470 goto out_unwind; 461 goto out_unwind;
471 } 462 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock); 463 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end(); 464 radix_tree_preload_end();
476 } 465 }
477 466
478 /* Clear the mount flag if no inode can overflow 32 bits 467 /*
479 * on this filesystem, or if specifically requested.. 468 * If we mount with the inode64 option, or no inode overflows
469 * the legacy 32-bit address space clear the inode32 option.
480 */ 470 */
481 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { 471 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
472 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
473
474 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
482 mp->m_flags |= XFS_MOUNT_32BITINODES; 475 mp->m_flags |= XFS_MOUNT_32BITINODES;
483 } else { 476 else
484 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 477 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
485 }
486 478
487 /* If we can overflow then setup the ag headers accordingly */
488 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 479 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
489 /* Calculate how much should be reserved for inodes to 480 /*
490 * meet the max inode percentage. 481 * Calculate how much should be reserved for inodes to meet
482 * the max inode percentage.
491 */ 483 */
492 if (mp->m_maxicount) { 484 if (mp->m_maxicount) {
493 __uint64_t icount; 485 __uint64_t icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
500 } else { 492 } else {
501 max_metadata = agcount; 493 max_metadata = agcount;
502 } 494 }
495
503 for (index = 0; index < agcount; index++) { 496 for (index = 0; index < agcount; index++) {
504 ino = XFS_AGINO_TO_INO(mp, index, agino); 497 ino = XFS_AGINO_TO_INO(mp, index, agino);
505 if (ino > max_inum) { 498 if (ino > XFS_MAXINUMBER_32) {
506 index++; 499 index++;
507 break; 500 break;
508 } 501 }
509 502
510 /* This ag is preferred for inodes */
511 pag = xfs_perag_get(mp, index); 503 pag = xfs_perag_get(mp, index);
512 pag->pagi_inodeok = 1; 504 pag->pagi_inodeok = 1;
513 if (index < max_metadata) 505 if (index < max_metadata)
514 pag->pagf_metadata = 1; 506 pag->pagf_metadata = 1;
515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag); 507 xfs_perag_put(pag);
517 } 508 }
518 } else { 509 } else {
519 /* Setup default behavior for smaller filesystems */
520 for (index = 0; index < agcount; index++) { 510 for (index = 0; index < agcount; index++) {
521 pag = xfs_perag_get(mp, index); 511 pag = xfs_perag_get(mp, index);
522 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag); 513 xfs_perag_put(pag);
525 } 514 }
526 } 515 }
516
527 if (maxagi) 517 if (maxagi)
528 *maxagi = index; 518 *maxagi = index;
529 return 0; 519 return 0;
@@ -1009,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 999 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1000 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1001 cmn_err(CE_WARN, "XFS: size check 1 failed");
1012 return XFS_ERROR(E2BIG); 1002 return XFS_ERROR(EFBIG);
1013 } 1003 }
1014 error = xfs_read_buf(mp, mp->m_ddev_targp, 1004 error = xfs_read_buf(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1005 d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1019 } else { 1009 } else {
1020 cmn_err(CE_WARN, "XFS: size check 2 failed"); 1010 cmn_err(CE_WARN, "XFS: size check 2 failed");
1021 if (error == ENOSPC) 1011 if (error == ENOSPC)
1022 error = XFS_ERROR(E2BIG); 1012 error = XFS_ERROR(EFBIG);
1023 return error; 1013 return error;
1024 } 1014 }
1025 1015
@@ -1027,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1017 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1018 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1029 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1019 cmn_err(CE_WARN, "XFS: size check 3 failed");
1030 return XFS_ERROR(E2BIG); 1020 return XFS_ERROR(EFBIG);
1031 } 1021 }
1032 error = xfs_read_buf(mp, mp->m_logdev_targp, 1022 error = xfs_read_buf(mp, mp->m_logdev_targp,
1033 d - XFS_FSB_TO_BB(mp, 1), 1023 d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1037 } else { 1027 } else {
1038 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1028 cmn_err(CE_WARN, "XFS: size check 3 failed");
1039 if (error == ENOSPC) 1029 if (error == ENOSPC)
1040 error = XFS_ERROR(E2BIG); 1030 error = XFS_ERROR(EFBIG);
1041 return error; 1031 return error;
1042 } 1032 }
1043 } 1033 }
@@ -1254,7 +1244,7 @@ xfs_mountfs(
1254 * Allocate and initialize the per-ag data. 1244 * Allocate and initialize the per-ag data.
1255 */ 1245 */
1256 spin_lock_init(&mp->m_perag_lock); 1246 spin_lock_init(&mp->m_perag_lock);
1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); 1247 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1248 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1259 if (error) { 1249 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1250 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops 268#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops
269 must be synchronous except 269 must be synchronous except
270 for space allocations */ 270 for space allocations */
271#define XFS_MOUNT_DELAYLOG (1ULL << 1) /* delayed logging is enabled */
271#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */ 272#define XFS_MOUNT_DMAPI (1ULL << 2) /* dmapi is enabled */
272#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) 273#define XFS_MOUNT_WAS_CLEAN (1ULL << 3)
273#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem 274#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..16445518506d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2248 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2248 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2249 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(E2BIG); 2250 return XFS_ERROR(EFBIG);
2251 } 2251 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2252 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2253 d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
2256 cmn_err(CE_WARN, 2256 cmn_err(CE_WARN,
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2258 if (error == ENOSPC) 2258 if (error == ENOSPC)
2259 return XFS_ERROR(E2BIG); 2259 return XFS_ERROR(EFBIG);
2260 return error; 2260 return error;
2261 } 2261 }
2262 xfs_buf_relse(bp); 2262 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
147# define xfs_rtfree_extent(t,b,l) (ENOSYS) 147# define xfs_rtfree_extent(t,b,l) (ENOSYS)
148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
149# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150static inline int /* error */
151xfs_rtmount_init(
152 xfs_mount_t *mp) /* file system mount structure */
153{
154 if (mp->m_sb.sb_rblocks == 0)
155 return 0;
156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
158 return ENOSYS;
159}
151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m) 161# define xfs_rtunmount_inodes(m)
153#endif /* CONFIG_XFS_RT */ 162#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af2..28547dfce037 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,137 +44,493 @@
44#include "xfs_trans_priv.h" 44#include "xfs_trans_priv.h"
45#include "xfs_trans_space.h" 45#include "xfs_trans_space.h"
46#include "xfs_inode_item.h" 46#include "xfs_inode_item.h"
47#include "xfs_trace.h"
47 48
48kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
49 50
51
52/*
53 * Various log reservation values.
54 *
55 * These are based on the size of the file system block because that is what
56 * most transactions manipulate. Each adds in an additional 128 bytes per
57 * item logged to try to account for the overhead of the transaction mechanism.
58 *
59 * Note: Most of the reservations underestimate the number of allocation
60 * groups into which they could free extents in the xfs_bmap_finish() call.
61 * This is because the number in the worst case is quite high and quite
62 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
63 * extents in only a single AG at a time. This will require changes to the
64 * EFI code as well, however, so that the EFI for the extents not freed is
65 * logged again in each transaction. See SGI PV #261917.
66 *
67 * Reservation functions here avoid a huge stack in xfs_trans_init due to
68 * register overflow from temporaries in the calculations.
69 */
70
71
50/* 72/*
51 * Reservation functions here avoid a huge stack in xfs_trans_init 73 * In a write transaction we can allocate a maximum of 2
52 * due to register overflow from temporaries in the calculations. 74 * extents. This gives:
75 * the inode getting the new extents: inode size
76 * the inode's bmap btree: max depth * block size
77 * the agfs of the ags from which the extents are allocated: 2 * sector
78 * the superblock free block counter: sector size
79 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
80 * And the bmap_finish transaction can free bmap blocks in a join:
81 * the agfs of the ags containing the blocks: 2 * sector size
82 * the agfls of the ags containing the blocks: 2 * sector size
83 * the super block free block counter: sector size
84 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
53 */ 85 */
54STATIC uint 86STATIC uint
55xfs_calc_write_reservation(xfs_mount_t *mp) 87xfs_calc_write_reservation(
88 struct xfs_mount *mp)
56{ 89{
57 return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 90 return XFS_DQUOT_LOGRES(mp) +
91 MAX((mp->m_sb.sb_inodesize +
92 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
93 2 * mp->m_sb.sb_sectsize +
94 mp->m_sb.sb_sectsize +
95 XFS_ALLOCFREE_LOG_RES(mp, 2) +
96 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
97 XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
98 (2 * mp->m_sb.sb_sectsize +
99 2 * mp->m_sb.sb_sectsize +
100 mp->m_sb.sb_sectsize +
101 XFS_ALLOCFREE_LOG_RES(mp, 2) +
102 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
58} 103}
59 104
105/*
106 * In truncating a file we free up to two extents at once. We can modify:
107 * the inode being truncated: inode size
108 * the inode's bmap btree: (max depth + 1) * block size
109 * And the bmap_finish transaction can free the blocks and bmap blocks:
110 * the agf for each of the ags: 4 * sector size
111 * the agfl for each of the ags: 4 * sector size
112 * the super block to reflect the freed blocks: sector size
113 * worst case split in allocation btrees per extent assuming 4 extents:
114 * 4 exts * 2 trees * (2 * max depth - 1) * block size
115 * the inode btree: max depth * blocksize
116 * the allocation btrees: 2 trees * (max depth - 1) * block size
117 */
60STATIC uint 118STATIC uint
61xfs_calc_itruncate_reservation(xfs_mount_t *mp) 119xfs_calc_itruncate_reservation(
120 struct xfs_mount *mp)
62{ 121{
63 return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 122 return XFS_DQUOT_LOGRES(mp) +
123 MAX((mp->m_sb.sb_inodesize +
124 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
125 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
126 (4 * mp->m_sb.sb_sectsize +
127 4 * mp->m_sb.sb_sectsize +
128 mp->m_sb.sb_sectsize +
129 XFS_ALLOCFREE_LOG_RES(mp, 4) +
130 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
131 128 * 5 +
132 XFS_ALLOCFREE_LOG_RES(mp, 1) +
133 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
134 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
64} 135}
65 136
137/*
138 * In renaming a files we can modify:
139 * the four inodes involved: 4 * inode size
140 * the two directory btrees: 2 * (max depth + v2) * dir block size
141 * the two directory bmap btrees: 2 * max depth * block size
142 * And the bmap_finish transaction can free dir and bmap blocks (two sets
143 * of bmap blocks) giving:
144 * the agf for the ags in which the blocks live: 3 * sector size
145 * the agfl for the ags in which the blocks live: 3 * sector size
146 * the superblock for the free block count: sector size
147 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
148 */
66STATIC uint 149STATIC uint
67xfs_calc_rename_reservation(xfs_mount_t *mp) 150xfs_calc_rename_reservation(
151 struct xfs_mount *mp)
68{ 152{
69 return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 153 return XFS_DQUOT_LOGRES(mp) +
154 MAX((4 * mp->m_sb.sb_inodesize +
155 2 * XFS_DIROP_LOG_RES(mp) +
156 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
157 (3 * mp->m_sb.sb_sectsize +
158 3 * mp->m_sb.sb_sectsize +
159 mp->m_sb.sb_sectsize +
160 XFS_ALLOCFREE_LOG_RES(mp, 3) +
161 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
70} 162}
71 163
164/*
165 * For creating a link to an inode:
166 * the parent directory inode: inode size
167 * the linked inode: inode size
168 * the directory btree could split: (max depth + v2) * dir block size
169 * the directory bmap btree could join or split: (max depth + v2) * blocksize
170 * And the bmap_finish transaction can free some bmap blocks giving:
171 * the agf for the ag in which the blocks live: sector size
172 * the agfl for the ag in which the blocks live: sector size
173 * the superblock for the free block count: sector size
174 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
175 */
72STATIC uint 176STATIC uint
73xfs_calc_link_reservation(xfs_mount_t *mp) 177xfs_calc_link_reservation(
178 struct xfs_mount *mp)
74{ 179{
75 return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 180 return XFS_DQUOT_LOGRES(mp) +
181 MAX((mp->m_sb.sb_inodesize +
182 mp->m_sb.sb_inodesize +
183 XFS_DIROP_LOG_RES(mp) +
184 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
185 (mp->m_sb.sb_sectsize +
186 mp->m_sb.sb_sectsize +
187 mp->m_sb.sb_sectsize +
188 XFS_ALLOCFREE_LOG_RES(mp, 1) +
189 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
76} 190}
77 191
192/*
193 * For removing a directory entry we can modify:
194 * the parent directory inode: inode size
195 * the removed inode: inode size
196 * the directory btree could join: (max depth + v2) * dir block size
197 * the directory bmap btree could join or split: (max depth + v2) * blocksize
198 * And the bmap_finish transaction can free the dir and bmap blocks giving:
199 * the agf for the ag in which the blocks live: 2 * sector size
200 * the agfl for the ag in which the blocks live: 2 * sector size
201 * the superblock for the free block count: sector size
202 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
203 */
78STATIC uint 204STATIC uint
79xfs_calc_remove_reservation(xfs_mount_t *mp) 205xfs_calc_remove_reservation(
206 struct xfs_mount *mp)
80{ 207{
81 return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 208 return XFS_DQUOT_LOGRES(mp) +
209 MAX((mp->m_sb.sb_inodesize +
210 mp->m_sb.sb_inodesize +
211 XFS_DIROP_LOG_RES(mp) +
212 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
213 (2 * mp->m_sb.sb_sectsize +
214 2 * mp->m_sb.sb_sectsize +
215 mp->m_sb.sb_sectsize +
216 XFS_ALLOCFREE_LOG_RES(mp, 2) +
217 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
82} 218}
83 219
220/*
221 * For symlink we can modify:
222 * the parent directory inode: inode size
223 * the new inode: inode size
224 * the inode btree entry: 1 block
225 * the directory btree: (max depth + v2) * dir block size
226 * the directory inode's bmap btree: (max depth + v2) * block size
227 * the blocks for the symlink: 1 kB
228 * Or in the first xact we allocate some inodes giving:
229 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
230 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
231 * the inode btree: max depth * blocksize
232 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
233 */
84STATIC uint 234STATIC uint
85xfs_calc_symlink_reservation(xfs_mount_t *mp) 235xfs_calc_symlink_reservation(
236 struct xfs_mount *mp)
86{ 237{
87 return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 238 return XFS_DQUOT_LOGRES(mp) +
239 MAX((mp->m_sb.sb_inodesize +
240 mp->m_sb.sb_inodesize +
241 XFS_FSB_TO_B(mp, 1) +
242 XFS_DIROP_LOG_RES(mp) +
243 1024 +
244 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
245 (2 * mp->m_sb.sb_sectsize +
246 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
247 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
248 XFS_ALLOCFREE_LOG_RES(mp, 1) +
249 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
250 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
88} 251}
89 252
253/*
254 * For create we can modify:
255 * the parent directory inode: inode size
256 * the new inode: inode size
257 * the inode btree entry: block size
258 * the superblock for the nlink flag: sector size
259 * the directory btree: (max depth + v2) * dir block size
260 * the directory inode's bmap btree: (max depth + v2) * block size
261 * Or in the first xact we allocate some inodes giving:
262 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
263 * the superblock for the nlink flag: sector size
264 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
265 * the inode btree: max depth * blocksize
266 * the allocation btrees: 2 trees * (max depth - 1) * block size
267 */
90STATIC uint 268STATIC uint
91xfs_calc_create_reservation(xfs_mount_t *mp) 269xfs_calc_create_reservation(
270 struct xfs_mount *mp)
92{ 271{
93 return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 272 return XFS_DQUOT_LOGRES(mp) +
273 MAX((mp->m_sb.sb_inodesize +
274 mp->m_sb.sb_inodesize +
275 mp->m_sb.sb_sectsize +
276 XFS_FSB_TO_B(mp, 1) +
277 XFS_DIROP_LOG_RES(mp) +
278 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
279 (3 * mp->m_sb.sb_sectsize +
280 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
281 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
282 XFS_ALLOCFREE_LOG_RES(mp, 1) +
283 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
284 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
94} 285}
95 286
287/*
288 * Making a new directory is the same as creating a new file.
289 */
96STATIC uint 290STATIC uint
97xfs_calc_mkdir_reservation(xfs_mount_t *mp) 291xfs_calc_mkdir_reservation(
292 struct xfs_mount *mp)
98{ 293{
99 return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 294 return xfs_calc_create_reservation(mp);
100} 295}
101 296
297/*
298 * In freeing an inode we can modify:
299 * the inode being freed: inode size
300 * the super block free inode counter: sector size
301 * the agi hash list and counters: sector size
302 * the inode btree entry: block size
303 * the on disk inode before ours in the agi hash list: inode cluster size
304 * the inode btree: max depth * blocksize
305 * the allocation btrees: 2 trees * (max depth - 1) * block size
306 */
102STATIC uint 307STATIC uint
103xfs_calc_ifree_reservation(xfs_mount_t *mp) 308xfs_calc_ifree_reservation(
309 struct xfs_mount *mp)
104{ 310{
105 return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 311 return XFS_DQUOT_LOGRES(mp) +
312 mp->m_sb.sb_inodesize +
313 mp->m_sb.sb_sectsize +
314 mp->m_sb.sb_sectsize +
315 XFS_FSB_TO_B(mp, 1) +
316 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
317 XFS_INODE_CLUSTER_SIZE(mp)) +
318 128 * 5 +
319 XFS_ALLOCFREE_LOG_RES(mp, 1) +
320 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
321 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
106} 322}
107 323
324/*
325 * When only changing the inode we log the inode and possibly the superblock
326 * We also add a bit of slop for the transaction stuff.
327 */
108STATIC uint 328STATIC uint
109xfs_calc_ichange_reservation(xfs_mount_t *mp) 329xfs_calc_ichange_reservation(
330 struct xfs_mount *mp)
110{ 331{
111 return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 332 return XFS_DQUOT_LOGRES(mp) +
333 mp->m_sb.sb_inodesize +
334 mp->m_sb.sb_sectsize +
335 512;
336
112} 337}
113 338
339/*
340 * Growing the data section of the filesystem.
341 * superblock
342 * agi and agf
343 * allocation btrees
344 */
114STATIC uint 345STATIC uint
115xfs_calc_growdata_reservation(xfs_mount_t *mp) 346xfs_calc_growdata_reservation(
347 struct xfs_mount *mp)
116{ 348{
117 return XFS_CALC_GROWDATA_LOG_RES(mp); 349 return mp->m_sb.sb_sectsize * 3 +
350 XFS_ALLOCFREE_LOG_RES(mp, 1) +
351 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
118} 352}
119 353
354/*
355 * Growing the rt section of the filesystem.
356 * In the first set of transactions (ALLOC) we allocate space to the
357 * bitmap or summary files.
358 * superblock: sector size
359 * agf of the ag from which the extent is allocated: sector size
360 * bmap btree for bitmap/summary inode: max depth * blocksize
361 * bitmap/summary inode: inode size
362 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
363 */
120STATIC uint 364STATIC uint
121xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) 365xfs_calc_growrtalloc_reservation(
366 struct xfs_mount *mp)
122{ 367{
123 return XFS_CALC_GROWRTALLOC_LOG_RES(mp); 368 return 2 * mp->m_sb.sb_sectsize +
369 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
370 mp->m_sb.sb_inodesize +
371 XFS_ALLOCFREE_LOG_RES(mp, 1) +
372 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
373 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
124} 374}
125 375
376/*
377 * Growing the rt section of the filesystem.
378 * In the second set of transactions (ZERO) we zero the new metadata blocks.
379 * one bitmap/summary block: blocksize
380 */
126STATIC uint 381STATIC uint
127xfs_calc_growrtzero_reservation(xfs_mount_t *mp) 382xfs_calc_growrtzero_reservation(
383 struct xfs_mount *mp)
128{ 384{
129 return XFS_CALC_GROWRTZERO_LOG_RES(mp); 385 return mp->m_sb.sb_blocksize + 128;
130} 386}
131 387
388/*
389 * Growing the rt section of the filesystem.
390 * In the third set of transactions (FREE) we update metadata without
391 * allocating any new blocks.
392 * superblock: sector size
393 * bitmap inode: inode size
394 * summary inode: inode size
395 * one bitmap block: blocksize
396 * summary blocks: new summary size
397 */
132STATIC uint 398STATIC uint
133xfs_calc_growrtfree_reservation(xfs_mount_t *mp) 399xfs_calc_growrtfree_reservation(
400 struct xfs_mount *mp)
134{ 401{
135 return XFS_CALC_GROWRTFREE_LOG_RES(mp); 402 return mp->m_sb.sb_sectsize +
403 2 * mp->m_sb.sb_inodesize +
404 mp->m_sb.sb_blocksize +
405 mp->m_rsumsize +
406 128 * 5;
136} 407}
137 408
409/*
410 * Logging the inode modification timestamp on a synchronous write.
411 * inode
412 */
138STATIC uint 413STATIC uint
139xfs_calc_swrite_reservation(xfs_mount_t *mp) 414xfs_calc_swrite_reservation(
415 struct xfs_mount *mp)
140{ 416{
141 return XFS_CALC_SWRITE_LOG_RES(mp); 417 return mp->m_sb.sb_inodesize + 128;
142} 418}
143 419
420/*
421 * Logging the inode mode bits when writing a setuid/setgid file
422 * inode
423 */
144STATIC uint 424STATIC uint
145xfs_calc_writeid_reservation(xfs_mount_t *mp) 425xfs_calc_writeid_reservation(xfs_mount_t *mp)
146{ 426{
147 return XFS_CALC_WRITEID_LOG_RES(mp); 427 return mp->m_sb.sb_inodesize + 128;
148} 428}
149 429
430/*
431 * Converting the inode from non-attributed to attributed.
432 * the inode being converted: inode size
433 * agf block and superblock (for block allocation)
434 * the new block (directory sized)
435 * bmap blocks for the new directory block
436 * allocation btrees
437 */
150STATIC uint 438STATIC uint
151xfs_calc_addafork_reservation(xfs_mount_t *mp) 439xfs_calc_addafork_reservation(
440 struct xfs_mount *mp)
152{ 441{
153 return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 442 return XFS_DQUOT_LOGRES(mp) +
443 mp->m_sb.sb_inodesize +
444 mp->m_sb.sb_sectsize * 2 +
445 mp->m_dirblksize +
446 XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
447 XFS_ALLOCFREE_LOG_RES(mp, 1) +
448 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
449 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
154} 450}
155 451
452/*
453 * Removing the attribute fork of a file
454 * the inode being truncated: inode size
455 * the inode's bmap btree: max depth * block size
456 * And the bmap_finish transaction can free the blocks and bmap blocks:
457 * the agf for each of the ags: 4 * sector size
458 * the agfl for each of the ags: 4 * sector size
459 * the super block to reflect the freed blocks: sector size
460 * worst case split in allocation btrees per extent assuming 4 extents:
461 * 4 exts * 2 trees * (2 * max depth - 1) * block size
462 */
156STATIC uint 463STATIC uint
157xfs_calc_attrinval_reservation(xfs_mount_t *mp) 464xfs_calc_attrinval_reservation(
465 struct xfs_mount *mp)
158{ 466{
159 return XFS_CALC_ATTRINVAL_LOG_RES(mp); 467 return MAX((mp->m_sb.sb_inodesize +
468 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
469 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
470 (4 * mp->m_sb.sb_sectsize +
471 4 * mp->m_sb.sb_sectsize +
472 mp->m_sb.sb_sectsize +
473 XFS_ALLOCFREE_LOG_RES(mp, 4) +
474 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
160} 475}
161 476
477/*
478 * Setting an attribute.
479 * the inode getting the attribute
480 * the superblock for allocations
481 * the agfs extents are allocated from
482 * the attribute btree * max depth
483 * the inode allocation btree
484 * Since attribute transaction space is dependent on the size of the attribute,
485 * the calculation is done partially at mount time and partially at runtime.
486 */
162STATIC uint 487STATIC uint
163xfs_calc_attrset_reservation(xfs_mount_t *mp) 488xfs_calc_attrset_reservation(
489 struct xfs_mount *mp)
164{ 490{
165 return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 491 return XFS_DQUOT_LOGRES(mp) +
492 mp->m_sb.sb_inodesize +
493 mp->m_sb.sb_sectsize +
494 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
495 128 * (2 + XFS_DA_NODE_MAXDEPTH);
166} 496}
167 497
498/*
499 * Removing an attribute.
500 * the inode: inode size
501 * the attribute btree could join: max depth * block size
502 * the inode bmap btree could join or split: max depth * block size
503 * And the bmap_finish transaction can free the attr blocks freed giving:
504 * the agf for the ag in which the blocks live: 2 * sector size
505 * the agfl for the ag in which the blocks live: 2 * sector size
506 * the superblock for the free block count: sector size
507 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
508 */
168STATIC uint 509STATIC uint
169xfs_calc_attrrm_reservation(xfs_mount_t *mp) 510xfs_calc_attrrm_reservation(
511 struct xfs_mount *mp)
170{ 512{
171 return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 513 return XFS_DQUOT_LOGRES(mp) +
514 MAX((mp->m_sb.sb_inodesize +
515 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
516 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
517 128 * (1 + XFS_DA_NODE_MAXDEPTH +
518 XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
519 (2 * mp->m_sb.sb_sectsize +
520 2 * mp->m_sb.sb_sectsize +
521 mp->m_sb.sb_sectsize +
522 XFS_ALLOCFREE_LOG_RES(mp, 2) +
523 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
172} 524}
173 525
526/*
527 * Clearing a bad agino number in an agi hash bucket.
528 */
174STATIC uint 529STATIC uint
175xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) 530xfs_calc_clear_agi_bucket_reservation(
531 struct xfs_mount *mp)
176{ 532{
177 return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); 533 return mp->m_sb.sb_sectsize + 128;
178} 534}
179 535
180/* 536/*
@@ -183,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
183 */ 539 */
184void 540void
185xfs_trans_init( 541xfs_trans_init(
186 xfs_mount_t *mp) 542 struct xfs_mount *mp)
187{ 543{
188 xfs_trans_reservations_t *resp; 544 struct xfs_trans_reservations *resp = &mp->m_reservations;
189 545
190 resp = &(mp->m_reservations);
191 resp->tr_write = xfs_calc_write_reservation(mp); 546 resp->tr_write = xfs_calc_write_reservation(mp);
192 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); 547 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
193 resp->tr_rename = xfs_calc_rename_reservation(mp); 548 resp->tr_rename = xfs_calc_rename_reservation(mp);
@@ -243,9 +598,8 @@ _xfs_trans_alloc(
243 tp->t_type = type; 598 tp->t_type = type;
244 tp->t_mountp = mp; 599 tp->t_mountp = mp;
245 tp->t_items_free = XFS_LIC_NUM_SLOTS; 600 tp->t_items_free = XFS_LIC_NUM_SLOTS;
246 tp->t_busy_free = XFS_LBC_NUM_SLOTS;
247 xfs_lic_init(&(tp->t_items)); 601 xfs_lic_init(&(tp->t_items));
248 XFS_LBC_INIT(&(tp->t_busy)); 602 INIT_LIST_HEAD(&tp->t_busy);
249 return tp; 603 return tp;
250} 604}
251 605
@@ -255,8 +609,13 @@ _xfs_trans_alloc(
255 */ 609 */
256STATIC void 610STATIC void
257xfs_trans_free( 611xfs_trans_free(
258 xfs_trans_t *tp) 612 struct xfs_trans *tp)
259{ 613{
614 struct xfs_busy_extent *busyp, *n;
615
616 list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
617 xfs_alloc_busy_clear(tp->t_mountp, busyp);
618
260 atomic_dec(&tp->t_mountp->m_active_trans); 619 atomic_dec(&tp->t_mountp->m_active_trans);
261 xfs_trans_free_dqinfo(tp); 620 xfs_trans_free_dqinfo(tp);
262 kmem_zone_free(xfs_trans_zone, tp); 621 kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +644,8 @@ xfs_trans_dup(
285 ntp->t_type = tp->t_type; 644 ntp->t_type = tp->t_type;
286 ntp->t_mountp = tp->t_mountp; 645 ntp->t_mountp = tp->t_mountp;
287 ntp->t_items_free = XFS_LIC_NUM_SLOTS; 646 ntp->t_items_free = XFS_LIC_NUM_SLOTS;
288 ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
289 xfs_lic_init(&(ntp->t_items)); 647 xfs_lic_init(&(ntp->t_items));
290 XFS_LBC_INIT(&(ntp->t_busy)); 648 INIT_LIST_HEAD(&ntp->t_busy);
291 649
292 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 650 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
293 ASSERT(tp->t_ticket != NULL); 651 ASSERT(tp->t_ticket != NULL);
@@ -423,7 +781,6 @@ undo_blocks:
423 return error; 781 return error;
424} 782}
425 783
426
427/* 784/*
428 * Record the indicated change to the given field for application 785 * Record the indicated change to the given field for application
429 * to the file system's superblock when the transaction commits. 786 * to the file system's superblock when the transaction commits.
@@ -652,7 +1009,7 @@ xfs_trans_apply_sb_deltas(
652 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we 1009 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
653 * still need to update the incore superblock with the changes. 1010 * still need to update the incore superblock with the changes.
654 */ 1011 */
655STATIC void 1012void
656xfs_trans_unreserve_and_mod_sb( 1013xfs_trans_unreserve_and_mod_sb(
657 xfs_trans_t *tp) 1014 xfs_trans_t *tp)
658{ 1015{
@@ -880,7 +1237,7 @@ xfs_trans_fill_vecs(
880 * they could be immediately flushed and we'd have to race with the flusher 1237 * they could be immediately flushed and we'd have to race with the flusher
881 * trying to pull the item from the AIL as we add it. 1238 * trying to pull the item from the AIL as we add it.
882 */ 1239 */
883static void 1240void
884xfs_trans_item_committed( 1241xfs_trans_item_committed(
885 struct xfs_log_item *lip, 1242 struct xfs_log_item *lip,
886 xfs_lsn_t commit_lsn, 1243 xfs_lsn_t commit_lsn,
@@ -930,26 +1287,6 @@ xfs_trans_item_committed(
930 IOP_UNPIN(lip); 1287 IOP_UNPIN(lip);
931} 1288}
932 1289
933/* Clear all the per-AG busy list items listed in this transaction */
934static void
935xfs_trans_clear_busy_extents(
936 struct xfs_trans *tp)
937{
938 xfs_log_busy_chunk_t *lbcp;
939 xfs_log_busy_slot_t *lbsp;
940 int i;
941
942 for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
943 i = 0;
944 for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
945 if (XFS_LBC_ISFREE(lbcp, i))
946 continue;
947 xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
948 }
949 }
950 xfs_trans_free_busy(tp);
951}
952
953/* 1290/*
954 * This is typically called by the LM when a transaction has been fully 1291 * This is typically called by the LM when a transaction has been fully
955 * committed to disk. It needs to unpin the items which have 1292 * committed to disk. It needs to unpin the items which have
@@ -984,7 +1321,6 @@ xfs_trans_committed(
984 kmem_free(licp); 1321 kmem_free(licp);
985 } 1322 }
986 1323
987 xfs_trans_clear_busy_extents(tp);
988 xfs_trans_free(tp); 1324 xfs_trans_free(tp);
989} 1325}
990 1326
@@ -1012,8 +1348,7 @@ xfs_trans_uncommit(
1012 xfs_trans_unreserve_and_mod_sb(tp); 1348 xfs_trans_unreserve_and_mod_sb(tp);
1013 xfs_trans_unreserve_and_mod_dquots(tp); 1349 xfs_trans_unreserve_and_mod_dquots(tp);
1014 1350
1015 xfs_trans_free_items(tp, flags); 1351 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1016 xfs_trans_free_busy(tp);
1017 xfs_trans_free(tp); 1352 xfs_trans_free(tp);
1018} 1353}
1019 1354
@@ -1075,6 +1410,8 @@ xfs_trans_commit_iclog(
1075 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags); 1410 *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
1076 1411
1077 tp->t_commit_lsn = *commit_lsn; 1412 tp->t_commit_lsn = *commit_lsn;
1413 trace_xfs_trans_commit_lsn(tp);
1414
1078 if (nvec > XFS_TRANS_LOGVEC_COUNT) 1415 if (nvec > XFS_TRANS_LOGVEC_COUNT)
1079 kmem_free(log_vector); 1416 kmem_free(log_vector);
1080 1417
@@ -1161,6 +1498,93 @@ xfs_trans_commit_iclog(
1161 return xfs_log_release_iclog(mp, commit_iclog); 1498 return xfs_log_release_iclog(mp, commit_iclog);
1162} 1499}
1163 1500
1501/*
1502 * Walk the log items and allocate log vector structures for
1503 * each item large enough to fit all the vectors they require.
1504 * Note that this format differs from the old log vector format in
1505 * that there is no transaction header in these log vectors.
1506 */
1507STATIC struct xfs_log_vec *
1508xfs_trans_alloc_log_vecs(
1509 xfs_trans_t *tp)
1510{
1511 xfs_log_item_desc_t *lidp;
1512 struct xfs_log_vec *lv = NULL;
1513 struct xfs_log_vec *ret_lv = NULL;
1514
1515 lidp = xfs_trans_first_item(tp);
1516
1517 /* Bail out if we didn't find a log item. */
1518 if (!lidp) {
1519 ASSERT(0);
1520 return NULL;
1521 }
1522
1523 while (lidp != NULL) {
1524 struct xfs_log_vec *new_lv;
1525
1526 /* Skip items which aren't dirty in this transaction. */
1527 if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
1528 lidp = xfs_trans_next_item(tp, lidp);
1529 continue;
1530 }
1531
1532 /* Skip items that do not have any vectors for writing */
1533 lidp->lid_size = IOP_SIZE(lidp->lid_item);
1534 if (!lidp->lid_size) {
1535 lidp = xfs_trans_next_item(tp, lidp);
1536 continue;
1537 }
1538
1539 new_lv = kmem_zalloc(sizeof(*new_lv) +
1540 lidp->lid_size * sizeof(struct xfs_log_iovec),
1541 KM_SLEEP);
1542
1543 /* The allocated iovec region lies beyond the log vector. */
1544 new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
1545 new_lv->lv_niovecs = lidp->lid_size;
1546 new_lv->lv_item = lidp->lid_item;
1547 if (!ret_lv)
1548 ret_lv = new_lv;
1549 else
1550 lv->lv_next = new_lv;
1551 lv = new_lv;
1552 lidp = xfs_trans_next_item(tp, lidp);
1553 }
1554
1555 return ret_lv;
1556}
1557
1558static int
1559xfs_trans_commit_cil(
1560 struct xfs_mount *mp,
1561 struct xfs_trans *tp,
1562 xfs_lsn_t *commit_lsn,
1563 int flags)
1564{
1565 struct xfs_log_vec *log_vector;
1566 int error;
1567
1568 /*
1569 * Get each log item to allocate a vector structure for
1570 * the log item to to pass to the log write code. The
1571 * CIL commit code will format the vector and save it away.
1572 */
1573 log_vector = xfs_trans_alloc_log_vecs(tp);
1574 if (!log_vector)
1575 return ENOMEM;
1576
1577 error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
1578 if (error)
1579 return error;
1580
1581 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1582
1583 /* xfs_trans_free_items() unlocks them first */
1584 xfs_trans_free_items(tp, *commit_lsn, 0);
1585 xfs_trans_free(tp);
1586 return 0;
1587}
1164 1588
1165/* 1589/*
1166 * xfs_trans_commit 1590 * xfs_trans_commit
@@ -1221,7 +1645,11 @@ _xfs_trans_commit(
1221 xfs_trans_apply_sb_deltas(tp); 1645 xfs_trans_apply_sb_deltas(tp);
1222 xfs_trans_apply_dquot_deltas(tp); 1646 xfs_trans_apply_dquot_deltas(tp);
1223 1647
1224 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags); 1648 if (mp->m_flags & XFS_MOUNT_DELAYLOG)
1649 error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
1650 else
1651 error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
1652
1225 if (error == ENOMEM) { 1653 if (error == ENOMEM) {
1226 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR); 1654 xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
1227 error = XFS_ERROR(EIO); 1655 error = XFS_ERROR(EIO);
@@ -1259,8 +1687,7 @@ out_unreserve:
1259 error = XFS_ERROR(EIO); 1687 error = XFS_ERROR(EIO);
1260 } 1688 }
1261 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1689 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1262 xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0); 1690 xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
1263 xfs_trans_free_busy(tp);
1264 xfs_trans_free(tp); 1691 xfs_trans_free(tp);
1265 1692
1266 XFS_STATS_INC(xs_trans_empty); 1693 XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1765,7 @@ xfs_trans_cancel(
1338 /* mark this thread as no longer being in a transaction */ 1765 /* mark this thread as no longer being in a transaction */
1339 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); 1766 current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
1340 1767
1341 xfs_trans_free_items(tp, flags); 1768 xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
1342 xfs_trans_free_busy(tp);
1343 xfs_trans_free(tp); 1769 xfs_trans_free(tp);
1344} 1770}
1345 1771
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921e..e639e8e9a2a9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
106#define XFS_TRANS_GROWFSRT_FREE 39 106#define XFS_TRANS_GROWFSRT_FREE 39
107#define XFS_TRANS_SWAPEXT 40 107#define XFS_TRANS_SWAPEXT 40
108#define XFS_TRANS_SB_COUNT 41 108#define XFS_TRANS_SB_COUNT 41
109#define XFS_TRANS_TYPE_MAX 41 109#define XFS_TRANS_CHECKPOINT 42
110#define XFS_TRANS_TYPE_MAX 42
110/* new transaction types need to be reflected in xfs_logprint(8) */ 111/* new transaction types need to be reflected in xfs_logprint(8) */
111 112
112#define XFS_TRANS_TYPES \ 113#define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
148 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \ 149 { XFS_TRANS_GROWFSRT_FREE, "GROWFSRT_FREE" }, \
149 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \ 150 { XFS_TRANS_SWAPEXT, "SWAPEXT" }, \
150 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \ 151 { XFS_TRANS_SB_COUNT, "SB_COUNT" }, \
152 { XFS_TRANS_CHECKPOINT, "CHECKPOINT" }, \
151 { XFS_TRANS_DUMMY1, "DUMMY1" }, \ 153 { XFS_TRANS_DUMMY1, "DUMMY1" }, \
152 { XFS_TRANS_DUMMY2, "DUMMY2" }, \ 154 { XFS_TRANS_DUMMY2, "DUMMY2" }, \
153 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" } 155 { XLOG_UNMOUNT_REC_TYPE, "UNMOUNT" }
@@ -298,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
298 300
299 301
300/* 302/*
301 * Various log reservation values.
302 * These are based on the size of the file system block
303 * because that is what most transactions manipulate.
304 * Each adds in an additional 128 bytes per item logged to
305 * try to account for the overhead of the transaction mechanism.
306 *
307 * Note:
308 * Most of the reservations underestimate the number of allocation
309 * groups into which they could free extents in the xfs_bmap_finish()
310 * call. This is because the number in the worst case is quite high
311 * and quite unusual. In order to fix this we need to change
312 * xfs_bmap_finish() to free extents in only a single AG at a time.
313 * This will require changes to the EFI code as well, however, so that
314 * the EFI for the extents not freed is logged again in each transaction.
315 * See bug 261917.
316 */
317
318/*
319 * Per-extent log reservation for the allocation btree changes 303 * Per-extent log reservation for the allocation btree changes
320 * involved in freeing or allocating an extent. 304 * involved in freeing or allocating an extent.
321 * 2 trees * (2 blocks/level * max depth - 1) * block size 305 * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -339,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
339 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ 323 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
340 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) 324 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
341 325
342/*
343 * In a write transaction we can allocate a maximum of 2
344 * extents. This gives:
345 * the inode getting the new extents: inode size
346 * the inode's bmap btree: max depth * block size
347 * the agfs of the ags from which the extents are allocated: 2 * sector
348 * the superblock free block counter: sector size
349 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
350 * And the bmap_finish transaction can free bmap blocks in a join:
351 * the agfs of the ags containing the blocks: 2 * sector size
352 * the agfls of the ags containing the blocks: 2 * sector size
353 * the super block free block counter: sector size
354 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
355 */
356#define XFS_CALC_WRITE_LOG_RES(mp) \
357 (MAX( \
358 ((mp)->m_sb.sb_inodesize + \
359 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
360 (2 * (mp)->m_sb.sb_sectsize) + \
361 (mp)->m_sb.sb_sectsize + \
362 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
363 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
364 ((2 * (mp)->m_sb.sb_sectsize) + \
365 (2 * (mp)->m_sb.sb_sectsize) + \
366 (mp)->m_sb.sb_sectsize + \
367 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
368 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
369 326
370#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) 327#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
371
372/*
373 * In truncating a file we free up to two extents at once. We can modify:
374 * the inode being truncated: inode size
375 * the inode's bmap btree: (max depth + 1) * block size
376 * And the bmap_finish transaction can free the blocks and bmap blocks:
377 * the agf for each of the ags: 4 * sector size
378 * the agfl for each of the ags: 4 * sector size
379 * the super block to reflect the freed blocks: sector size
380 * worst case split in allocation btrees per extent assuming 4 extents:
381 * 4 exts * 2 trees * (2 * max depth - 1) * block size
382 * the inode btree: max depth * blocksize
383 * the allocation btrees: 2 trees * (max depth - 1) * block size
384 */
385#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
386 (MAX( \
387 ((mp)->m_sb.sb_inodesize + \
388 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
389 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
390 ((4 * (mp)->m_sb.sb_sectsize) + \
391 (4 * (mp)->m_sb.sb_sectsize) + \
392 (mp)->m_sb.sb_sectsize + \
393 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
394 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
395 (128 * 5) + \
396 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
397 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
398 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
399
400#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 328#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
401
402/*
403 * In renaming a files we can modify:
404 * the four inodes involved: 4 * inode size
405 * the two directory btrees: 2 * (max depth + v2) * dir block size
406 * the two directory bmap btrees: 2 * max depth * block size
407 * And the bmap_finish transaction can free dir and bmap blocks (two sets
408 * of bmap blocks) giving:
409 * the agf for the ags in which the blocks live: 3 * sector size
410 * the agfl for the ags in which the blocks live: 3 * sector size
411 * the superblock for the free block count: sector size
412 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
413 */
414#define XFS_CALC_RENAME_LOG_RES(mp) \
415 (MAX( \
416 ((4 * (mp)->m_sb.sb_inodesize) + \
417 (2 * XFS_DIROP_LOG_RES(mp)) + \
418 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
419 ((3 * (mp)->m_sb.sb_sectsize) + \
420 (3 * (mp)->m_sb.sb_sectsize) + \
421 (mp)->m_sb.sb_sectsize + \
422 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
423 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
424
425#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) 329#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
426
427/*
428 * For creating a link to an inode:
429 * the parent directory inode: inode size
430 * the linked inode: inode size
431 * the directory btree could split: (max depth + v2) * dir block size
432 * the directory bmap btree could join or split: (max depth + v2) * blocksize
433 * And the bmap_finish transaction can free some bmap blocks giving:
434 * the agf for the ag in which the blocks live: sector size
435 * the agfl for the ag in which the blocks live: sector size
436 * the superblock for the free block count: sector size
437 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
438 */
439#define XFS_CALC_LINK_LOG_RES(mp) \
440 (MAX( \
441 ((mp)->m_sb.sb_inodesize + \
442 (mp)->m_sb.sb_inodesize + \
443 XFS_DIROP_LOG_RES(mp) + \
444 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
445 ((mp)->m_sb.sb_sectsize + \
446 (mp)->m_sb.sb_sectsize + \
447 (mp)->m_sb.sb_sectsize + \
448 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
449 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
450
451#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) 330#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
452
453/*
454 * For removing a directory entry we can modify:
455 * the parent directory inode: inode size
456 * the removed inode: inode size
457 * the directory btree could join: (max depth + v2) * dir block size
458 * the directory bmap btree could join or split: (max depth + v2) * blocksize
459 * And the bmap_finish transaction can free the dir and bmap blocks giving:
460 * the agf for the ag in which the blocks live: 2 * sector size
461 * the agfl for the ag in which the blocks live: 2 * sector size
462 * the superblock for the free block count: sector size
463 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
464 */
465#define XFS_CALC_REMOVE_LOG_RES(mp) \
466 (MAX( \
467 ((mp)->m_sb.sb_inodesize + \
468 (mp)->m_sb.sb_inodesize + \
469 XFS_DIROP_LOG_RES(mp) + \
470 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
471 ((2 * (mp)->m_sb.sb_sectsize) + \
472 (2 * (mp)->m_sb.sb_sectsize) + \
473 (mp)->m_sb.sb_sectsize + \
474 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
475 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
476
477#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) 331#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
478
479/*
480 * For symlink we can modify:
481 * the parent directory inode: inode size
482 * the new inode: inode size
483 * the inode btree entry: 1 block
484 * the directory btree: (max depth + v2) * dir block size
485 * the directory inode's bmap btree: (max depth + v2) * block size
486 * the blocks for the symlink: 1 kB
487 * Or in the first xact we allocate some inodes giving:
488 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
489 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
490 * the inode btree: max depth * blocksize
491 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
492 */
493#define XFS_CALC_SYMLINK_LOG_RES(mp) \
494 (MAX( \
495 ((mp)->m_sb.sb_inodesize + \
496 (mp)->m_sb.sb_inodesize + \
497 XFS_FSB_TO_B(mp, 1) + \
498 XFS_DIROP_LOG_RES(mp) + \
499 1024 + \
500 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
501 (2 * (mp)->m_sb.sb_sectsize + \
502 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
503 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
504 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
505 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
506 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
507
508#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 332#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
509
510/*
511 * For create we can modify:
512 * the parent directory inode: inode size
513 * the new inode: inode size
514 * the inode btree entry: block size
515 * the superblock for the nlink flag: sector size
516 * the directory btree: (max depth + v2) * dir block size
517 * the directory inode's bmap btree: (max depth + v2) * block size
518 * Or in the first xact we allocate some inodes giving:
519 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
520 * the superblock for the nlink flag: sector size
521 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
522 * the inode btree: max depth * blocksize
523 * the allocation btrees: 2 trees * (max depth - 1) * block size
524 */
525#define XFS_CALC_CREATE_LOG_RES(mp) \
526 (MAX( \
527 ((mp)->m_sb.sb_inodesize + \
528 (mp)->m_sb.sb_inodesize + \
529 (mp)->m_sb.sb_sectsize + \
530 XFS_FSB_TO_B(mp, 1) + \
531 XFS_DIROP_LOG_RES(mp) + \
532 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
533 (3 * (mp)->m_sb.sb_sectsize + \
534 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
535 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
536 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
537 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
538 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
539
540#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) 333#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
541
542/*
543 * Making a new directory is the same as creating a new file.
544 */
545#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
546
547#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) 334#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
548
549/*
550 * In freeing an inode we can modify:
551 * the inode being freed: inode size
552 * the super block free inode counter: sector size
553 * the agi hash list and counters: sector size
554 * the inode btree entry: block size
555 * the on disk inode before ours in the agi hash list: inode cluster size
556 * the inode btree: max depth * blocksize
557 * the allocation btrees: 2 trees * (max depth - 1) * block size
558 */
559#define XFS_CALC_IFREE_LOG_RES(mp) \
560 ((mp)->m_sb.sb_inodesize + \
561 (mp)->m_sb.sb_sectsize + \
562 (mp)->m_sb.sb_sectsize + \
563 XFS_FSB_TO_B((mp), 1) + \
564 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
565 (128 * 5) + \
566 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
567 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
568 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
569
570
571#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) 335#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
572
573/*
574 * When only changing the inode we log the inode and possibly the superblock
575 * We also add a bit of slop for the transaction stuff.
576 */
577#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
578 (mp)->m_sb.sb_sectsize + 512)
579
580#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) 336#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
581
582/*
583 * Growing the data section of the filesystem.
584 * superblock
585 * agi and agf
586 * allocation btrees
587 */
588#define XFS_CALC_GROWDATA_LOG_RES(mp) \
589 ((mp)->m_sb.sb_sectsize * 3 + \
590 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
591 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
592
593#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) 337#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
594
595/*
596 * Growing the rt section of the filesystem.
597 * In the first set of transactions (ALLOC) we allocate space to the
598 * bitmap or summary files.
599 * superblock: sector size
600 * agf of the ag from which the extent is allocated: sector size
601 * bmap btree for bitmap/summary inode: max depth * blocksize
602 * bitmap/summary inode: inode size
603 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
604 */
605#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
606 (2 * (mp)->m_sb.sb_sectsize + \
607 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
608 (mp)->m_sb.sb_inodesize + \
609 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
610 (128 * \
611 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
612 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
613
614#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) 338#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
615
616/*
617 * Growing the rt section of the filesystem.
618 * In the second set of transactions (ZERO) we zero the new metadata blocks.
619 * one bitmap/summary block: blocksize
620 */
621#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
622 ((mp)->m_sb.sb_blocksize + 128)
623
624#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) 339#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
625
626/*
627 * Growing the rt section of the filesystem.
628 * In the third set of transactions (FREE) we update metadata without
629 * allocating any new blocks.
630 * superblock: sector size
631 * bitmap inode: inode size
632 * summary inode: inode size
633 * one bitmap block: blocksize
634 * summary blocks: new summary size
635 */
636#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
637 ((mp)->m_sb.sb_sectsize + \
638 2 * (mp)->m_sb.sb_inodesize + \
639 (mp)->m_sb.sb_blocksize + \
640 (mp)->m_rsumsize + \
641 (128 * 5))
642
643#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) 340#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
644
645/*
646 * Logging the inode modification timestamp on a synchronous write.
647 * inode
648 */
649#define XFS_CALC_SWRITE_LOG_RES(mp) \
650 ((mp)->m_sb.sb_inodesize + 128)
651
652#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 341#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
653
654/* 342/*
655 * Logging the inode timestamps on an fsync -- same as SWRITE 343 * Logging the inode timestamps on an fsync -- same as SWRITE
656 * as long as SWRITE logs the entire inode core 344 * as long as SWRITE logs the entire inode core
657 */ 345 */
658#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 346#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
659
660/*
661 * Logging the inode mode bits when writing a setuid/setgid file
662 * inode
663 */
664#define XFS_CALC_WRITEID_LOG_RES(mp) \
665 ((mp)->m_sb.sb_inodesize + 128)
666
667#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 347#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
668
669/*
670 * Converting the inode from non-attributed to attributed.
671 * the inode being converted: inode size
672 * agf block and superblock (for block allocation)
673 * the new block (directory sized)
674 * bmap blocks for the new directory block
675 * allocation btrees
676 */
677#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
678 ((mp)->m_sb.sb_inodesize + \
679 (mp)->m_sb.sb_sectsize * 2 + \
680 (mp)->m_dirblksize + \
681 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
682 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
683 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
684 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
685
686#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) 348#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
687
688/*
689 * Removing the attribute fork of a file
690 * the inode being truncated: inode size
691 * the inode's bmap btree: max depth * block size
692 * And the bmap_finish transaction can free the blocks and bmap blocks:
693 * the agf for each of the ags: 4 * sector size
694 * the agfl for each of the ags: 4 * sector size
695 * the super block to reflect the freed blocks: sector size
696 * worst case split in allocation btrees per extent assuming 4 extents:
697 * 4 exts * 2 trees * (2 * max depth - 1) * block size
698 */
699#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
700 (MAX( \
701 ((mp)->m_sb.sb_inodesize + \
702 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
703 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
704 ((4 * (mp)->m_sb.sb_sectsize) + \
705 (4 * (mp)->m_sb.sb_sectsize) + \
706 (mp)->m_sb.sb_sectsize + \
707 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
708 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
709
710#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) 349#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
711
712/*
713 * Setting an attribute.
714 * the inode getting the attribute
715 * the superblock for allocations
716 * the agfs extents are allocated from
717 * the attribute btree * max depth
718 * the inode allocation btree
719 * Since attribute transaction space is dependent on the size of the attribute,
720 * the calculation is done partially at mount time and partially at runtime.
721 */
722#define XFS_CALC_ATTRSET_LOG_RES(mp) \
723 ((mp)->m_sb.sb_inodesize + \
724 (mp)->m_sb.sb_sectsize + \
725 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
726 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
727
728#define XFS_ATTRSET_LOG_RES(mp, ext) \ 350#define XFS_ATTRSET_LOG_RES(mp, ext) \
729 ((mp)->m_reservations.tr_attrset + \ 351 ((mp)->m_reservations.tr_attrset + \
730 (ext * (mp)->m_sb.sb_sectsize) + \ 352 (ext * (mp)->m_sb.sb_sectsize) + \
731 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ 353 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
732 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) 354 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
733
734/*
735 * Removing an attribute.
736 * the inode: inode size
737 * the attribute btree could join: max depth * block size
738 * the inode bmap btree could join or split: max depth * block size
739 * And the bmap_finish transaction can free the attr blocks freed giving:
740 * the agf for the ag in which the blocks live: 2 * sector size
741 * the agfl for the ag in which the blocks live: 2 * sector size
742 * the superblock for the free block count: sector size
743 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
744 */
745#define XFS_CALC_ATTRRM_LOG_RES(mp) \
746 (MAX( \
747 ((mp)->m_sb.sb_inodesize + \
748 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
749 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
750 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
751 ((2 * (mp)->m_sb.sb_sectsize) + \
752 (2 * (mp)->m_sb.sb_sectsize) + \
753 (mp)->m_sb.sb_sectsize + \
754 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
755 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
756
757#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) 355#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
758
759/*
760 * Clearing a bad agino number in an agi hash bucket.
761 */
762#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
763 ((mp)->m_sb.sb_sectsize + 128)
764
765#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) 356#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
766 357
767 358
@@ -813,6 +404,7 @@ struct xfs_log_item_desc;
813struct xfs_mount; 404struct xfs_mount;
814struct xfs_trans; 405struct xfs_trans;
815struct xfs_dquot_acct; 406struct xfs_dquot_acct;
407struct xfs_busy_extent;
816 408
817typedef struct xfs_log_item { 409typedef struct xfs_log_item {
818 struct list_head li_ail; /* AIL pointers */ 410 struct list_head li_ail; /* AIL pointers */
@@ -828,6 +420,11 @@ typedef struct xfs_log_item {
828 /* buffer item iodone */ 420 /* buffer item iodone */
829 /* callback func */ 421 /* callback func */
830 struct xfs_item_ops *li_ops; /* function list */ 422 struct xfs_item_ops *li_ops; /* function list */
423
424 /* delayed logging */
425 struct list_head li_cil; /* CIL pointers */
426 struct xfs_log_vec *li_lv; /* active log vector */
427 xfs_lsn_t li_seq; /* CIL commit seq */
831} xfs_log_item_t; 428} xfs_log_item_t;
832 429
833#define XFS_LI_IN_AIL 0x1 430#define XFS_LI_IN_AIL 0x1
@@ -872,34 +469,6 @@ typedef struct xfs_item_ops {
872#define XFS_ITEM_PUSHBUF 3 469#define XFS_ITEM_PUSHBUF 3
873 470
874/* 471/*
875 * This structure is used to maintain a list of block ranges that have been
876 * freed in the transaction. The ranges are listed in the perag[] busy list
877 * between when they're freed and the transaction is committed to disk.
878 */
879
880typedef struct xfs_log_busy_slot {
881 xfs_agnumber_t lbc_ag;
882 ushort lbc_idx; /* index in perag.busy[] */
883} xfs_log_busy_slot_t;
884
885#define XFS_LBC_NUM_SLOTS 31
886typedef struct xfs_log_busy_chunk {
887 struct xfs_log_busy_chunk *lbc_next;
888 uint lbc_free; /* free slots bitmask */
889 ushort lbc_unused; /* first unused */
890 xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
891} xfs_log_busy_chunk_t;
892
893#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
894#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
895
896#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
897#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
898#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
899#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
900#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
901
902/*
903 * This is the type of function which can be given to xfs_trans_callback() 472 * This is the type of function which can be given to xfs_trans_callback()
904 * to be called upon the transaction's commit to disk. 473 * to be called upon the transaction's commit to disk.
905 */ 474 */
@@ -950,8 +519,7 @@ typedef struct xfs_trans {
950 unsigned int t_items_free; /* log item descs free */ 519 unsigned int t_items_free; /* log item descs free */
951 xfs_log_item_chunk_t t_items; /* first log item desc chunk */ 520 xfs_log_item_chunk_t t_items; /* first log item desc chunk */
952 xfs_trans_header_t t_header; /* header for in-log trans */ 521 xfs_trans_header_t t_header; /* header for in-log trans */
953 unsigned int t_busy_free; /* busy descs free */ 522 struct list_head t_busy; /* list of busy extents */
954 xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
955 unsigned long t_pflags; /* saved process flags state */ 523 unsigned long t_pflags; /* saved process flags state */
956} xfs_trans_t; 524} xfs_trans_t;
957 525
@@ -1025,9 +593,6 @@ int _xfs_trans_commit(xfs_trans_t *,
1025void xfs_trans_cancel(xfs_trans_t *, int); 593void xfs_trans_cancel(xfs_trans_t *, int);
1026int xfs_trans_ail_init(struct xfs_mount *); 594int xfs_trans_ail_init(struct xfs_mount *);
1027void xfs_trans_ail_destroy(struct xfs_mount *); 595void xfs_trans_ail_destroy(struct xfs_mount *);
1028xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
1029 xfs_agnumber_t ag,
1030 xfs_extlen_t idx);
1031 596
1032extern kmem_zone_t *xfs_trans_zone; 597extern kmem_zone_t *xfs_trans_zone;
1033 598
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3a..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
114 xfs_buf_item_init(bp, tp->t_mountp); 114 xfs_buf_item_init(bp, tp->t_mountp);
115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 115 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 116 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 117 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); 118 ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
119 if (reset_recur) 119 if (reset_recur)
120 bip->bli_recur = 0; 120 bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 511 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF); 512 ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 513 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 514 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
515 ASSERT(atomic_read(&bip->bli_refcount) > 0); 515 ASSERT(atomic_read(&bip->bli_refcount) > 0);
516 516
517 /* 517 /*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t *tp,
619 619
620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 620 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 621 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 622 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
623 ASSERT(atomic_read(&bip->bli_refcount) > 0); 623 ASSERT(atomic_read(&bip->bli_refcount) > 0);
624 bip->bli_flags |= XFS_BLI_HOLD; 624 bip->bli_flags |= XFS_BLI_HOLD;
625 trace_xfs_trans_bhold(bip); 625 trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp,
641 641
642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 642 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 643 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL)); 644 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
645 ASSERT(atomic_read(&bip->bli_refcount) > 0); 645 ASSERT(atomic_read(&bip->bli_refcount) > 0);
646 ASSERT(bip->bli_flags & XFS_BLI_HOLD); 646 ASSERT(bip->bli_flags & XFS_BLI_HOLD);
647 bip->bli_flags &= ~XFS_BLI_HOLD; 647 bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t *tp,
704 bip->bli_flags &= ~XFS_BLI_STALE; 704 bip->bli_flags &= ~XFS_BLI_STALE;
705 ASSERT(XFS_BUF_ISSTALE(bp)); 705 ASSERT(XFS_BUF_ISSTALE(bp));
706 XFS_BUF_UNSTALE(bp); 706 XFS_BUF_UNSTALE(bp);
707 bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL; 707 bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
708 } 708 }
709 709
710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip); 710 lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 762 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
763 ASSERT(XFS_BUF_ISSTALE(bp)); 763 ASSERT(XFS_BUF_ISSTALE(bp));
764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 764 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF)); 765 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
766 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 766 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY); 767 ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY); 768 ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
769 return; 769 return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
774 * in the buf log item. The STALE flag will be used in 774 * in the buf log item. The STALE flag will be used in
775 * xfs_buf_item_unpin() to determine if it should clean up 775 * xfs_buf_item_unpin() to determine if it should clean up
776 * when the last reference to the buf item is given up. 776 * when the last reference to the buf item is given up.
777 * We set the XFS_BLI_CANCEL flag in the buf log format structure 777 * We set the XFS_BLF_CANCEL flag in the buf log format structure
778 * and log the buf item. This will be used at recovery time 778 * and log the buf item. This will be used at recovery time
779 * to determine that copies of the buffer in the log before 779 * to determine that copies of the buffer in the log before
780 * this should not be replayed. 780 * this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
792 XFS_BUF_UNDELAYWRITE(bp); 792 XFS_BUF_UNDELAYWRITE(bp);
793 XFS_BUF_STALE(bp); 793 XFS_BUF_STALE(bp);
794 bip->bli_flags |= XFS_BLI_STALE; 794 bip->bli_flags |= XFS_BLI_STALE;
795 bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY); 795 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
796 bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF; 796 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
797 bip->bli_format.blf_flags |= XFS_BLI_CANCEL; 797 bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
798 memset((char *)(bip->bli_format.blf_data_map), 0, 798 memset((char *)(bip->bli_format.blf_data_map), 0,
799 (bip->bli_format.blf_map_size * sizeof(uint))); 799 (bip->bli_format.blf_map_size * sizeof(uint)));
800 lidp->lid_flags |= XFS_LID_DIRTY; 800 lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
802} 802}
803 803
804/* 804/*
805 * This call is used to indicate that the buffer contains on-disk 805 * This call is used to indicate that the buffer contains on-disk inodes which
806 * inodes which must be handled specially during recovery. They 806 * must be handled specially during recovery. They require special handling
807 * require special handling because only the di_next_unlinked from 807 * because only the di_next_unlinked from the inodes in the buffer should be
808 * the inodes in the buffer should be recovered. The rest of the 808 * recovered. The rest of the data in the buffer is logged via the inodes
809 * data in the buffer is logged via the inodes themselves. 809 * themselves.
810 * 810 *
811 * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log 811 * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be
812 * format structure so that we'll know what to do at recovery time. 812 * transferred to the buffer's log format structure so that we'll know what to
813 * do at recovery time.
813 */ 814 */
814/* ARGSUSED */
815void 815void
816xfs_trans_inode_buf( 816xfs_trans_inode_buf(
817 xfs_trans_t *tp, 817 xfs_trans_t *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 826 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
827 ASSERT(atomic_read(&bip->bli_refcount) > 0); 827 ASSERT(atomic_read(&bip->bli_refcount) > 0);
828 828
829 bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; 829 bip->bli_flags |= XFS_BLI_INODE_BUF;
830} 830}
831 831
832/* 832/*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
908 ASSERT(XFS_BUF_ISBUSY(bp)); 908 ASSERT(XFS_BUF_ISBUSY(bp));
909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); 909 ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 910 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
911 ASSERT(type == XFS_BLI_UDQUOT_BUF || 911 ASSERT(type == XFS_BLF_UDQUOT_BUF ||
912 type == XFS_BLI_PDQUOT_BUF || 912 type == XFS_BLF_PDQUOT_BUF ||
913 type == XFS_BLI_GDQUOT_BUF); 913 type == XFS_BLF_GDQUOT_BUF);
914 914
915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); 915 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
916 ASSERT(atomic_read(&bip->bli_refcount) > 0); 916 ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
299void 299void
300xfs_trans_free_items( 300xfs_trans_free_items(
301 xfs_trans_t *tp, 301 xfs_trans_t *tp,
302 xfs_lsn_t commit_lsn,
302 int flags) 303 int flags)
303{ 304{
304 xfs_log_item_chunk_t *licp; 305 xfs_log_item_chunk_t *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
311 * Special case the embedded chunk so we don't free it below. 312 * Special case the embedded chunk so we don't free it below.
312 */ 313 */
313 if (!xfs_lic_are_all_free(licp)) { 314 if (!xfs_lic_are_all_free(licp)) {
314 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 315 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
315 xfs_lic_all_free(licp); 316 xfs_lic_all_free(licp);
316 licp->lic_unused = 0; 317 licp->lic_unused = 0;
317 } 318 }
@@ -322,7 +323,7 @@ xfs_trans_free_items(
322 */ 323 */
323 while (licp != NULL) { 324 while (licp != NULL) {
324 ASSERT(!xfs_lic_are_all_free(licp)); 325 ASSERT(!xfs_lic_are_all_free(licp));
325 (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN); 326 (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
326 next_licp = licp->lic_next; 327 next_licp = licp->lic_next;
327 kmem_free(licp); 328 kmem_free(licp);
328 licp = next_licp; 329 licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
438 439
439 return freed; 440 return freed;
440} 441}
441
442
443/*
444 * This is called to add the given busy item to the transaction's
445 * list of busy items. It must find a free busy item descriptor
446 * or allocate a new one and add the item to that descriptor.
447 * The function returns a pointer to busy descriptor used to point
448 * to the new busy entry. The log busy entry will now point to its new
449 * descriptor with its ???? field.
450 */
451xfs_log_busy_slot_t *
452xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
453{
454 xfs_log_busy_chunk_t *lbcp;
455 xfs_log_busy_slot_t *lbsp;
456 int i=0;
457
458 /*
459 * If there are no free descriptors, allocate a new chunk
460 * of them and put it at the front of the chunk list.
461 */
462 if (tp->t_busy_free == 0) {
463 lbcp = (xfs_log_busy_chunk_t*)
464 kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
465 ASSERT(lbcp != NULL);
466 /*
467 * Initialize the chunk, and then
468 * claim the first slot in the newly allocated chunk.
469 */
470 XFS_LBC_INIT(lbcp);
471 XFS_LBC_CLAIM(lbcp, 0);
472 lbcp->lbc_unused = 1;
473 lbsp = XFS_LBC_SLOT(lbcp, 0);
474
475 /*
476 * Link in the new chunk and update the free count.
477 */
478 lbcp->lbc_next = tp->t_busy.lbc_next;
479 tp->t_busy.lbc_next = lbcp;
480 tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
481
482 /*
483 * Initialize the descriptor and the generic portion
484 * of the log item.
485 *
486 * Point the new slot at this item and return it.
487 * Also point the log item at its currently active
488 * descriptor and set the item's mount pointer.
489 */
490 lbsp->lbc_ag = ag;
491 lbsp->lbc_idx = idx;
492 return lbsp;
493 }
494
495 /*
496 * Find the free descriptor. It is somewhere in the chunklist
497 * of descriptors.
498 */
499 lbcp = &tp->t_busy;
500 while (lbcp != NULL) {
501 if (XFS_LBC_VACANCY(lbcp)) {
502 if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
503 i = lbcp->lbc_unused;
504 break;
505 } else {
506 /* out-of-order vacancy */
507 cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
508 ASSERT(0);
509 }
510 }
511 lbcp = lbcp->lbc_next;
512 }
513 ASSERT(lbcp != NULL);
514 /*
515 * If we find a free descriptor, claim it,
516 * initialize it, and return it.
517 */
518 XFS_LBC_CLAIM(lbcp, i);
519 if (lbcp->lbc_unused <= i) {
520 lbcp->lbc_unused = i + 1;
521 }
522 lbsp = XFS_LBC_SLOT(lbcp, i);
523 tp->t_busy_free--;
524 lbsp->lbc_ag = ag;
525 lbsp->lbc_idx = idx;
526 return lbsp;
527}
528
529
530/*
531 * xfs_trans_free_busy
532 * Free all of the busy lists from a transaction
533 */
534void
535xfs_trans_free_busy(xfs_trans_t *tp)
536{
537 xfs_log_busy_chunk_t *lbcp;
538 xfs_log_busy_chunk_t *lbcq;
539
540 lbcp = tp->t_busy.lbc_next;
541 while (lbcp != NULL) {
542 lbcq = lbcp->lbc_next;
543 kmem_free(lbcp);
544 lbcp = lbcq;
545 }
546
547 XFS_LBC_INIT(&tp->t_busy);
548 tp->t_busy.lbc_unused = 0;
549}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc *xfs_trans_find_item(struct xfs_trans *,
35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *); 35struct xfs_log_item_desc *xfs_trans_first_item(struct xfs_trans *);
36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *, 36struct xfs_log_item_desc *xfs_trans_next_item(struct xfs_trans *,
37 struct xfs_log_item_desc *); 37 struct xfs_log_item_desc *);
38void xfs_trans_free_items(struct xfs_trans *, int); 38
39void xfs_trans_unlock_items(struct xfs_trans *, 39void xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
40 xfs_lsn_t); 40void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
41void xfs_trans_free_busy(xfs_trans_t *tp); 41 int flags);
42xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp, 42
43 xfs_agnumber_t ag, 43void xfs_trans_item_committed(struct xfs_log_item *lip,
44 xfs_extlen_t idx); 44 xfs_lsn_t commit_lsn, int aborted);
45void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
45 46
46/* 47/*
47 * AIL traversal cursor. 48 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef __uint32_t xfs_dahash_t; /* dir/attr hash value */
75 75
76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */ 76typedef __uint16_t xfs_prid_t; /* prid_t truncated to 16bits in XFS */
77 77
78typedef __uint32_t xlog_tid_t; /* transaction ID type */
79
78/* 80/*
79 * These types are 64 bits on disk but are either 32 or 64 bits in memory. 81 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
80 * Disk based types: 82 * Disk based types:
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..a06bd62504fc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
267 if (code) { 267 if (code) {
268 ASSERT(tp == NULL); 268 ASSERT(tp == NULL);
269 lock_flags &= ~XFS_ILOCK_EXCL; 269 lock_flags &= ~XFS_ILOCK_EXCL;
270 ASSERT(lock_flags == XFS_IOLOCK_EXCL); 270 ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
271 goto error_return; 271 goto error_return;
272 } 272 }
273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);